Skip to content

Commit

Permalink
NUTCH-3087 BasicURLNormalizer to keep userinfo for protocols which might require it
Browse files Browse the repository at this point in the history

- strip the userinfo from the authority only for HTTP and HTTPS
  • Loading branch information
sebastian-nagel committed Dec 4, 2024
1 parent 86b893a commit df115cb
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -242,18 +242,37 @@ public String normalize(String urlString, String scope)
if (normalizePath) {
// check for unnecessary use of "/../", "/./", and "//"
if (changed) {
url = new URL(protocol, host, port, file);
URL u = new URL(protocol, host, port, file);
file2 = getFileWithNormalizedPath(u);
} else {
file2 = getFileWithNormalizedPath(url);
}
file2 = getFileWithNormalizedPath(url);
if (!file.equals(file2)) {
changed = true;
file = file2;
}
}

if (changed) {
url = new URL(protocol, host, port, file);
urlString = url.toString();
if (protocol.equals("http") || protocol.equals("https")
|| url.getUserInfo() == null) {
url = new URL(protocol, host, port, file);
urlString = url.toString();
} else {
/*
* NUTCH-3087 - userinfo is required for protocols with frequent
* authentication. Note: need to build the URL string directly, because
* there is no URL constructor which takes the userinfo as parameter.
*/
StringBuilder sb = new StringBuilder();
sb.append(protocol).append("://").append(url.getUserInfo()).append('@')
.append(host);
if (port != -1) {
sb.append(':').append(port);
}
sb.append(file);
urlString = sb.toString();
}
}

return urlString;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,32 @@ public void testNormalizer() throws Exception {
normalizeTest("file:/var/www/html/////./bar/index.html",
"file:/var/www/html/bar/index.html");
}

/**
 * Regression test for NUTCH-3087: the userinfo part of the URL authority
 * ({@code user@} or {@code user:password@}) is expected to survive
 * normalization for non-HTTP protocols, and to be stripped for HTTP and HTTPS.
 *
 * <p>
 * Each {@code normalizeTest} call passes the URL to normalize first and the
 * expected normalized form second.
 * </p>
 *
 * @throws Exception
 *           if normalization of one of the test URLs fails
 */
@Test
public void testNUTCH3087() throws Exception {
  // NUTCH-3087 userinfo to be kept in URLs with protocols usually requiring
  // authentication
  normalizeTest("ftp://user@ftp.example.org/path/file.txt",
      "ftp://user@ftp.example.org/path/file.txt");
  normalizeTest("ftp://john.doe@ftp.example.org/",
      "ftp://john.doe@ftp.example.org/");
  normalizeTest("ftp://user:password@ftp.example.org/path/file.txt",
      "ftp://user:password@ftp.example.org/path/file.txt");

  // But for HTTP(S) the userinfo should be removed. The fragment ("#top") is
  // expected to be dropped as well, port and query are kept.
  // (example from https://en.wikipedia.org/wiki/Uniform_Resource_Identifier)
  normalizeTest(
      "https://john.doe@www.example.com:1234/forum/questions/?tag=networking&order=newest#top",
      "https://www.example.com:1234/forum/questions/?tag=networking&order=newest");

  // URLs with IPv6 address: the brackets around the address must be preserved
  // both when the userinfo is kept (FTP, combined with path normalization) ...
  normalizeTest("ftp://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/../path/file.txt",
      "ftp://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/path/file.txt");
  // ... and when it is stripped (HTTPS)
  normalizeTest("https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/",
      "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/");
  // the explicit HTTPS default port 443 is expected to be removed too
  normalizeTest("https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]:443/",
      "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/");
  // userinfo removal combined with "/../" path normalization
  normalizeTest("https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/path/../to/index.html",
      "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/to/index.html");
}

@Test
public void testCurlyBraces() throws Exception {
Expand Down

0 comments on commit df115cb

Please sign in to comment.