From 345121425182f7eaa17b6ebd1d22186d5f83d90d Mon Sep 17 00:00:00 2001 From: Lilith River Date: Mon, 30 Sep 2024 01:36:18 -0600 Subject: [PATCH] Check img src too, allow disabling external url checking --- README.md | 22 ++++++++++++----- check-links.js | 48 +++++++++++++++++++++---------------- index.js | 3 ++- tests/integration.test.js | 3 +++ tests/public/exists.jpg | 0 tests/src/pages/about.astro | 3 +++ 6 files changed, 52 insertions(+), 27 deletions(-) create mode 100644 tests/public/exists.jpg diff --git a/README.md b/README.md index f6b2660..c6d81f5 100644 --- a/README.md +++ b/README.md @@ -2,15 +2,21 @@ An Astro integration that checks for broken links in your website during static build. It logs any broken links to the console and writes them to a file, grouping them by the document in which they occur. -## Features +## Goals - **Checks Internal and External Links**: Validates all `` links found in your HTML pages. - **Logs Broken Links**: Outputs broken link information to both the console and a log file. -- **Grouped by Document**: Broken links are grouped by the document in which they occur, making it easier to identify and fix issues. -- **Caching Mechanism**: Avoids redundant checks by caching the results of previously checked links. -- **Parallel Processing**: Checks links in parallel to improve performance. -- **Development Mode Middleware**: Checks links on each page load during development. -- **Post-Build Validation**: Scans all generated HTML files after building your site. +- **Grouped by broken URL**: To allow for quick search and replacement, a list of all pages containing the broken URL is logged. +- **Caching Mechanism**: Avoids redundant checks by caching the results of previously checked links, both internal and external, whether they are valid or not. +- **Parallel Processing**: Checks links and does IO and network operations in parallel to improve performance. 
We first collect all links from all pages, then only check each once, first loading the TSV cache, then saving it again when we are done. All HTTP requests happen in parallel. +- **Local redirect awareness**: If a link is redirected in astro.config.mjs, it will be followed. +- **Timeouts and retries**: To avoid false positives, links that fail to load with ECONNRESET are retried 3 times with exponential backoff. Timeouts are set to 3 seconds max including retries. +- **Link text preservation**: The contents of "href" are only normalized to a domain-relative path (like /foo/bar/) if they are "../relative" or "./relative" or "relative" etc. It is otherwise preserved for reporting purposes. +- **Cross-platform compatibility**: The physical paths of the html files are normalized to domain relative paths. +- **Disk caching of remote links**: To speed up subsequent builds, a tab-delimited text file is optionally written to disk containing the URLs of all remote links checked and the status code returned by the server, in the form `URL<tab>ok/failed<tab>status code<tab>ISO-8601-formatted timestamp`. 
+ + + ## Installation @@ -28,6 +34,10 @@ export default defineConfig({ integrations: [ astroBrokenLinksChecker({ logFilePath: 'broken-links.log', // Optional: specify the log file path + remoteLinksCacheFilePath: 'remote-links-cache.tsv', // Optional: specify the path to a tab-separated file to cache remote links + maxConcurrency: 10, // Optional: specify the maximum number of concurrent link checks + timeout: 3000, // Optional: specify the maximum time in milliseconds for a link check to complete + cacheExpiryMinutes: 30, // Optional: specify the number of minutes after which a cached externallink should be re-checked }), ], }); diff --git a/check-links.js b/check-links.js index 1a14c2d..0c9f7b2 100644 --- a/check-links.js +++ b/check-links.js @@ -13,13 +13,18 @@ export async function checkLinksInHtml( checkedLinks = new Map(), distPath = '', astroConfigRedirects = {}, - logger + logger, + checkExternalLinks = true ) { const root = parse(htmlContent); const linkElements = root.querySelectorAll('a[href]'); const links = linkElements.map((el) => el.getAttribute('href')); + // add img src + const imgElements = root.querySelectorAll('img[src]'); + const imgLinks = imgElements.map((el) => el.getAttribute('src')); + links.push(...imgLinks); - const limit = pLimit(10); // Limit to 10 concurrent link checks + const limit = pLimit(50); // Limit to 10 concurrent link checks const checkLinkPromises = links.map((link) => limit(async () => { @@ -89,27 +94,30 @@ export async function checkLinksInHtml( } } else { // External link, check via HTTP request. 
Retry 3 times if ECONNRESET - let retries = 0; - while (retries < 3) { - try { - const response = await fetch(fetchLink, { method: 'GET' }); - isBroken = !response.ok; - if (isBroken) { - logger.error(`${response.status} Error fetching ${fetchLink}`); + if (checkExternalLinks) { + let retries = 0; + while (retries < 3) { + try { + const response = await fetch(fetchLink, { method: 'GET' }); + isBroken = !response.ok; + if (isBroken) { + logger.error(`${response.status} Error fetching ${fetchLink}`); + } + break; + } catch (error) { + isBroken = true; + let statusCodeNumber = error.errno == 'ENOTFOUND' ? 404 : (error.errno); + logger.error(`${statusCodeNumber} error fetching ${fetchLink}`); + if (error.errno === 'ECONNRESET') { + retries++; + continue; + } + break; + } } - break; - } catch (error) { - isBroken = true; - let statusCodeNumber = error.errno == 'ENOTFOUND' ? 404 : (error.errno); - logger.error(`${statusCodeNumber} error fetching ${fetchLink}`); - if (error.errno === 'ECONNRESET') { - retries++; - continue; - } - break; } } - } + // Cache the link's validity checkedLinks.set(fetchLink, !isBroken); diff --git a/index.js b/index.js index f62d625..d68891b 100644 --- a/index.js +++ b/index.js @@ -40,7 +40,8 @@ export default function astroBrokenLinksChecker(options = {}) { checkedLinks, distPath, astroConfigRedirects, - logger + logger, + options.checkExternalLinks ); }); await Promise.all(checkHtmlPromises); diff --git a/tests/integration.test.js b/tests/integration.test.js index ca2892c..7861173 100644 --- a/tests/integration.test.js +++ b/tests/integration.test.js @@ -43,6 +43,8 @@ describe('Astro Broken Links Checker Integration', () => { expect(logContent).toContain('../path/changing/relative-broken-link'); expect(logContent).toContain('https://non-existent-page.com/page'); expect(logContent).toContain('https://non-existent-page.com/page?query=string#fragment'); + expect(logContent).toContain('https://non-existent-page.com/image.jpg'); + 
expect(logContent).toContain('/missing.jpg'); expect(logContent).toContain('Found in'); expect(logContent).toContain('/'); @@ -56,5 +58,6 @@ describe('Astro Broken Links Checker Integration', () => { expect(logContent).not.toContain('Broken link: /\n'); // Expect '/about' to not be reported as broken expect(logContent).not.toContain('Broken link: https://microsoft.com'); // Expect 'https://microsoft.com' to not be reported as broken expect(logContent).not.toContain('Broken link: /redirected'); // Expect '/redirected' to not be reported as broken + expect(logContent).not.toContain('Broken link: /exists.jpg'); // Expect '/exists.jpg' to not be reported as broken }); }); diff --git a/tests/public/exists.jpg b/tests/public/exists.jpg new file mode 100644 index 0000000..e69de29 diff --git a/tests/src/pages/about.astro b/tests/src/pages/about.astro index ca0c081..6a76a0d 100644 --- a/tests/src/pages/about.astro +++ b/tests/src/pages/about.astro @@ -8,3 +8,6 @@ Broken link with query and fragment Non Existent Page Non Existent Page with query and fragment +Non Existent Image +Non Existent Local Image +Real Local Image \ No newline at end of file