Skip to content

Commit

Permalink
Check img src too, allow disabling external url checking
Browse files Browse the repository at this point in the history
  • Loading branch information
lilith committed Sep 30, 2024
1 parent e1238d7 commit 3451214
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 27 deletions.
22 changes: 16 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,21 @@

An Astro integration that checks for broken links in your website during static build. It logs any broken links to the console and writes them to a file, grouping them by the document in which they occur.

## Features
## Goals

- **Checks Internal and External Links**: Validates all `<a href="...">` links found in your HTML pages.
- **Logs Broken Links**: Outputs broken link information to both the console and a log file.
- **Grouped by Document**: Broken links are grouped by the document in which they occur, making it easier to identify and fix issues.
- **Caching Mechanism**: Avoids redundant checks by caching the results of previously checked links.
- **Parallel Processing**: Checks links in parallel to improve performance.
- **Development Mode Middleware**: Checks links on each page load during development.
- **Post-Build Validation**: Scans all generated HTML files after building your site.
- **Grouped by broken URL**: To allow for quick search and replacement, a list of all pages containing the broken URL is logged.
- **Caching Mechanism**: Avoids redundant checks by caching the results of previously checked links, both internal and external, whether they are valid or not.
- **Parallel Processing**: Checks links and does IO and network operations in parallel to improve performance. We first collect all links from all pages, then only check each once, first loading the tsv cache, then saving it again when we are done. All http requests happen in parallel.
- **Local redirect awareness**: If a link is redirected in astro.config.mjs, it will be followed.
- **Timeouts and retries**: To avoid false positives, links that fail to load with ECONNRESET are retried 3 times with exponential backoff. Timeouts are set to a maximum of 3 seconds, including retries.
- **Link text preservation**: The contents of "href" are only normalized to a domain-relative path (like /foo/bar/) if they are "../relative" or "./relative" or "relative" etc. It is otherwise preserved for reporting purposes.
- **Cross-platform compatibility**: The physical paths of the html files are normalized to domain relative paths.
- **Disk caching of remote links**: To speed up subsequent builds, a tab-delimited text file is optionally written to disk containing the contents of all remote links checked and the status code returned by the server, in the form URL<tab>ok/failed<tab>status code<tab>ISO-8601-formatted timestamp.




## Installation

Expand All @@ -28,6 +34,10 @@ export default defineConfig({
integrations: [
astroBrokenLinksChecker({
logFilePath: 'broken-links.log', // Optional: specify the log file path
remoteLinksCacheFilePath: 'remote-links-cache.tsv', // Optional: specify the path to a tab-separated file to cache remote links
maxConcurrency: 10, // Optional: specify the maximum number of concurrent link checks
timeout: 3000, // Optional: specify the maximum time in milliseconds for a link check to complete
cacheExpiryMinutes: 30, // Optional: specify the number of minutes after which a cached external link should be re-checked
}),
],
});
Expand Down
48 changes: 28 additions & 20 deletions check-links.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,18 @@ export async function checkLinksInHtml(
checkedLinks = new Map(),
distPath = '',
astroConfigRedirects = {},
logger
logger,
checkExternalLinks = true
) {
const root = parse(htmlContent);
const linkElements = root.querySelectorAll('a[href]');
const links = linkElements.map((el) => el.getAttribute('href'));
// add img src
const imgElements = root.querySelectorAll('img[src]');
const imgLinks = imgElements.map((el) => el.getAttribute('src'));
links.push(...imgLinks);

const limit = pLimit(10); // Limit to 10 concurrent link checks
const limit = pLimit(50); // Limit to 50 concurrent link checks

const checkLinkPromises = links.map((link) =>
limit(async () => {
Expand Down Expand Up @@ -89,27 +94,30 @@ export async function checkLinksInHtml(
}
} else {
// External link, check via HTTP request. Retry 3 times if ECONNRESET
let retries = 0;
while (retries < 3) {
try {
const response = await fetch(fetchLink, { method: 'GET' });
isBroken = !response.ok;
if (isBroken) {
logger.error(`${response.status} Error fetching ${fetchLink}`);
if (checkExternalLinks) {
let retries = 0;
while (retries < 3) {
try {
const response = await fetch(fetchLink, { method: 'GET' });
isBroken = !response.ok;
if (isBroken) {
logger.error(`${response.status} Error fetching ${fetchLink}`);
}
break;
} catch (error) {
isBroken = true;
let statusCodeNumber = error.errno == 'ENOTFOUND' ? 404 : (error.errno);
logger.error(`${statusCodeNumber} error fetching ${fetchLink}`);
if (error.errno === 'ECONNRESET') {
retries++;
continue;
}
break;
}
}
break;
} catch (error) {
isBroken = true;
let statusCodeNumber = error.errno == 'ENOTFOUND' ? 404 : (error.errno);
logger.error(`${statusCodeNumber} error fetching ${fetchLink}`);
if (error.errno === 'ECONNRESET') {
retries++;
continue;
}
break;
}
}
}


// Cache the link's validity
checkedLinks.set(fetchLink, !isBroken);
Expand Down
3 changes: 2 additions & 1 deletion index.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ export default function astroBrokenLinksChecker(options = {}) {
checkedLinks,
distPath,
astroConfigRedirects,
logger
logger,
options.checkExternalLinks
);
});
await Promise.all(checkHtmlPromises);
Expand Down
3 changes: 3 additions & 0 deletions tests/integration.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ describe('Astro Broken Links Checker Integration', () => {
expect(logContent).toContain('../path/changing/relative-broken-link');
expect(logContent).toContain('https://non-existent-page.com/page');
expect(logContent).toContain('https://non-existent-page.com/page?query=string#fragment');
expect(logContent).toContain('https://non-existent-page.com/image.jpg');
expect(logContent).toContain('/missing.jpg');

expect(logContent).toContain('Found in');
expect(logContent).toContain('/');
Expand All @@ -56,5 +58,6 @@ describe('Astro Broken Links Checker Integration', () => {
expect(logContent).not.toContain('Broken link: /\n'); // Expect '/about' to not be reported as broken
expect(logContent).not.toContain('Broken link: https://microsoft.com'); // Expect 'https://microsoft.com' to not be reported as broken
expect(logContent).not.toContain('Broken link: /redirected'); // Expect '/redirected' to not be reported as broken
expect(logContent).not.toContain('Broken link: /exists.jpg'); // Expect '/exists.jpg' to not be reported as broken
});
});
Empty file added tests/public/exists.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 3 additions & 0 deletions tests/src/pages/about.astro
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,6 @@
<a href="/broken/with?query=string#fragment">Broken link with query and fragment</a>
<a href="https://non-existent-page.com/page">Non Existent Page</a>
<a href="https://non-existent-page.com/page?query=string#fragment">Non Existent Page with query and fragment</a>
<img src="https://non-existent-page.com/image.jpg" alt="Non Existent Image">
<img src="/missing.jpg" alt="Non Existent Local Image">
<img src="/exists.jpg" alt="Real Local Image">

0 comments on commit 3451214

Please sign in to comment.