Skip to content

Commit 5a5cc54

Browse files
committed Sep 5, 2021
Add tool for fixing broken links
1 parent 038cb16 commit 5a5cc54

File tree

6 files changed

+257
-0
lines changed

6 files changed

+257
-0
lines changed
 

‎.gitignore

+2
Original file line number | Diff line number | Diff line change
@@ -1 +1,3 @@
1+
*.un~
12
book
3+
target/

‎Cargo.lock

+54
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎Cargo.toml

+4
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
[workspace]
2+
members = [
3+
"tools/fixlinks",
4+
]

‎tools/fixlinks/.gitignore

+1
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
/target

‎tools/fixlinks/Cargo.toml

+10
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,10 @@
1+
[package]
2+
name = "fixlinks"
3+
version = "0.1.0"
4+
authors = ["Tyler Mandry <tmandry@gmail.com>"]
5+
edition = "2018"
6+
7+
[dependencies]
8+
camino = "1.0"
9+
regex = "1.5"
10+
pathdiff = "0.2"

‎tools/fixlinks/src/main.rs

+186
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,186 @@
1+
//! Searches for broken local paths in markdown files and attempts to repair them.
2+
//!
3+
//! Usage: cargo run --bin=fixlinks ./**/*.md
4+
5+
use std::{collections::{BTreeSet, HashMap}, fs, ops::Range, path::Component};
6+
use camino::*;
7+
use pathdiff::diff_paths;
8+
use regex::{Match, Regex};
9+
10+
#[allow(unused)]
11+
struct FilenameEntry {
12+
file: Utf8PathBuf,
13+
reported: bool,
14+
}
15+
16+
fn main() {
17+
let files: Vec<Utf8PathBuf> = std::env::args().skip(1).map(Into::into).collect();
18+
let mut filenames = HashMap::<String, Vec<Utf8PathBuf>>::new();
19+
for file in &files {
20+
let name = file.file_name().unwrap();
21+
filenames.entry(name.to_owned()).or_default().push(file.clone());
22+
}
23+
24+
for (name, paths) in &filenames {
25+
if paths.len() > 1 {
26+
eprintln!("Note: Duplicate filename: {}", name);
27+
for path in paths {
28+
eprintln!("- {}", path);
29+
}
30+
}
31+
}
32+
33+
let mut cx = Context::new(filenames);
34+
35+
for file in &files {
36+
let contents = fs::read_to_string(&file).unwrap();
37+
cx.check_file(file, &contents);
38+
}
39+
40+
eprintln!("{:#?}", cx.count);
41+
}
42+
43+
struct Context {
44+
filenames: HashMap::<String, Vec<Utf8PathBuf>>,
45+
count: Counts,
46+
}
47+
48+
/// Counters describing what the link checker encountered.
#[derive(Default, Debug)]
struct Counts {
    /// Every link target that was examined.
    urls: usize,
    /// Targets that are neither `http:`/`https:` URLs nor pure `#` anchors.
    paths: usize,
    /// Local paths that do not exist on disk.
    missing: usize,
    /// Missing paths whose file name is present in the file-name index.
    matches: usize,
    /// Matches for which more than one candidate file shares the name.
    ambiguous: usize,
    /// Rankings in which the best score was shared by several candidates.
    ties: usize,
}
57+
58+
impl Context {
59+
fn new(filenames: HashMap::<String, Vec<Utf8PathBuf>>) -> Self {
60+
Context {
61+
filenames,
62+
count: Default::default(),
63+
}
64+
}
65+
66+
fn check_file<'c>(&mut self, file: &Utf8Path, contents: &'c str) {
67+
let re_linkref = Regex::new(r"^\[(.*?)\]: (.*)").unwrap();
68+
let re_inline = Regex::new(r"\[([^\[\]]*)\]\((.*?)\)").unwrap();
69+
70+
struct UrlMatch {
71+
line_no: usize,
72+
source_range: Range<usize>,
73+
}
74+
let mut matches = vec![];
75+
let mut add_match = |line: &'c str, line_no, mat: Match| {
76+
// Safety: line and contents are from the same allocation.
77+
let line_start_idx = unsafe { line.as_ptr().offset_from(contents.as_ptr()) } as usize;
78+
let source_range = (line_start_idx + mat.start())..(line_start_idx + mat.end());
79+
matches.push(UrlMatch { line_no, source_range });
80+
};
81+
82+
// Match against linkrefs, then inline links.
83+
let mut linkrefs = BTreeSet::new();
84+
for (idx, line) in contents.lines().enumerate() {
85+
let line_no = idx + 1;
86+
for cap in re_linkref.captures_iter(line) {
87+
add_match(line, line_no, cap.get(2).unwrap());
88+
linkrefs.insert(cap.get(1).unwrap().as_str());
89+
}
90+
}
91+
for (idx, line) in contents.lines().enumerate() {
92+
let line_no = idx + 1;
93+
for cap in re_inline.captures_iter(line) {
94+
if let Some(url) = cap.get(2) {
95+
if !linkrefs.contains(url.as_str()) {
96+
add_match(line, line_no, url);
97+
}
98+
}
99+
}
100+
}
101+
102+
// We need to process through matches in order so we can build the
103+
// modified contents sequentially.
104+
matches.sort_by_key(|um| um.source_range.start);
105+
106+
let mut modified = String::new();
107+
let mut source_bytes_written = 0;
108+
for UrlMatch { line_no, source_range } in matches {
109+
if let Some(replacement) = self.check_url(file, line_no, &contents[source_range.clone()]) {
110+
modified.push_str(&contents[source_bytes_written..source_range.start]);
111+
modified.push_str(&replacement);
112+
source_bytes_written = source_range.end;
113+
}
114+
};
115+
116+
if !modified.is_empty() {
117+
modified.push_str(&contents[source_bytes_written..]);
118+
fs::write(file, modified).unwrap();
119+
}
120+
}
121+
122+
fn check_url(&mut self, file: &Utf8Path, line_no: usize, path: &str) -> Option<String> {
123+
self.count.urls += 1;
124+
if path.starts_with("http:") || path.starts_with("https:") || path.starts_with("#") {
125+
return None;
126+
}
127+
let path = path.split('#').next().unwrap();
128+
129+
self.count.paths += 1;
130+
131+
let resolved = file.parent().unwrap().join(Utf8Path::new(path));
132+
if !resolved.exists() {
133+
self.count.missing += 1;
134+
// println!("{}:{}: {}", file, line_no, path);
135+
136+
let filenames = &self.filenames;
137+
if let Some(names) = resolved.file_name().and_then(|f| filenames.get(f)) {
138+
self.count.matches += 1;
139+
if names.len() > 1 {
140+
self.count.ambiguous += 1;
141+
}
142+
if let Some(mut replacement) = Self::rank_names(file, names, &mut self.count) {
143+
// mdBook doesn't seem to like "raw" filenames.
144+
if !replacement.starts_with("../") {
145+
replacement.insert_str(0, "./");
146+
}
147+
// println!("- Replacing with: {}", replacement);
148+
return Some(replacement);
149+
}
150+
} else {
151+
eprintln!("Warning: Unable to resolve at {}:{}: {}", file, line_no, path);
152+
}
153+
}
154+
None
155+
}
156+
157+
fn rank_names(file: &Utf8Path, names: &[Utf8PathBuf], count: &mut Counts) -> Option<String> {
158+
let parent_dir = file.parent().unwrap();
159+
let mut best_score = usize::MAX;
160+
let mut best_path = None;
161+
let mut ties = 0;
162+
for candidate in names {
163+
// println!("- Could be: {}", candidate);
164+
165+
// Candidates are scored by how many differing path components they have.
166+
if let Some(relpath) = diff_paths(candidate, parent_dir) {
167+
let score = relpath.components().count() +
168+
// Triple count ../ components (prefer candidates in the same directory).
169+
relpath.components().take_while(|c| c == &Component::ParentDir).count() * 2;
170+
if score < best_score {
171+
best_score = score;
172+
best_path = Some(relpath.into_os_string().into_string().unwrap());
173+
ties = 0;
174+
} else if score == best_score {
175+
ties += 1;
176+
}
177+
};
178+
}
179+
180+
if best_path.is_some() && ties > 0 {
181+
count.ties += 1;
182+
}
183+
184+
best_path
185+
}
186+
}

0 commit comments

Comments
 (0)
Please sign in to comment.