Skip to content

Commit a29a0b5

Browse files
committed
Fix doublefetching of query messages
1 parent d7c315d commit a29a0b5

File tree

3 files changed

+69
-8
lines changed

3 files changed

+69
-8
lines changed

modules/web-discovery-project/sources/content-extractor.es

+4-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ export class ContentExtractor {
9797
this.urlAnalyzer = new UrlAnalyzer(this.patterns);
9898
}
9999

100-
run(pageContent, url) {
100+
run(pageContent, url, addStrictQuery) {
101101
function discard(reason = "") {
102102
logger.debug("No messages found for query:", query, "Reason:", reason);
103103
return {
@@ -109,6 +109,9 @@ export class ContentExtractor {
109109
const { found, type, query } = this.urlAnalyzer.parseLinks(url);
110110
if (!found) return discard("No content found.");
111111

112+
if (addStrictQuery)
113+
this.wdp.addStrictQueries(url, query);
114+
112115
const messages = this.extractMessages(pageContent, type, query, url);
113116
if (messages.length === 0) {
114117
return discard("No content found.");

modules/web-discovery-project/sources/url-analyzer.es

+19
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
* License, v. 2.0. If a copy of the MPL was not distributed with this
33
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
44

5+
import { extractHostname } from "../core/tlds";
56
import { parse } from "../core/url";
67
import logger from "./logger";
78

@@ -135,4 +136,22 @@ export default class UrlAnalyzer {
135136
const parsedUrl = parse(url);
136137
return isBraveSearch && parsedUrl.searchParams.get("q");
137138
}
139+
140+
checkAnonSearchURL(url, query) {
141+
const { found, type } = this.parseLinks(url);
142+
if (!found) return { isSearchEngineUrl: false, queryUrl: null };
143+
const isSearchEngineUrl = SEARCH_ENGINE_TYPES.has(type);
144+
const urlPattern = URL_PATTERNS.find((p) => p.type == type);
145+
const queryPrefix = urlPattern.prefix;
146+
if (!queryPrefix) {
147+
logger.debug(
148+
`URL pattern with type '${urlPattern.type}' has no query prefix`
149+
);
150+
return { isSearchEngineUrl: false, queryUrl: null };
151+
}
152+
const encodedQuery = encodeURIComponent(query).replace(/%20/g, "+");
153+
const hostname = extractHostname(url);
154+
const queryUrl = `https://${hostname}/${queryPrefix}${encodedQuery}`;
155+
return { isSearchEngineUrl, queryUrl };
156+
}
138157
}

modules/web-discovery-project/sources/web-discovery-project.es

+46-7
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,12 @@ function _log(...msg) {
3939
}
4040
}
4141

42+
/**
 * Returns a random integer N such that ceil(min) <= N <= floor(max).
 *
 * Assumes `random()` yields a float in [0, 1) -- TODO confirm against
 * the helper's definition elsewhere in this file.
 *
 * @param {number} min - Lower bound; rounded up to the nearest integer.
 * @param {number} max - Upper bound; rounded down to the nearest integer.
 * @returns {number} An integer within the rounded, inclusive bounds.
 */
function getRandomIntInclusive(min, max) {
  const lower = Math.ceil(min);
  const upper = Math.floor(max);
  // Offset by the rounded lower bound (not the raw `min`), otherwise a
  // fractional `min` produces non-integer, out-of-range results.
  return Math.floor(random() * (upper - lower + 1)) + lower;
}
47+
4248
function cleanFinalUrl(domain, href) {
4349
/*
4450
We need to get the final domain, there are 2 elements that we try to capture.
@@ -1597,7 +1603,7 @@ const WebDiscoveryProject = {
15971603
WebDiscoveryProject.patterns.update(rules);
15981604
WebDiscoveryProject._patternsLastUpdated = new Date();
15991605
logger.info(
1600-
"WebDiscoveryProject patterns successfully updated at ${this._patternsLastUpdated}"
1606+
`WebDiscoveryProject patterns successfully updated at ${WebDiscoveryProject._patternsLastUpdated}`
16011607
);
16021608
} catch (e) {
16031609
logger.warn("Failed to apply new WebDiscoveryProject patterns", e);
@@ -3382,7 +3388,7 @@ const WebDiscoveryProject = {
33823388
}
33833389
getContentDocument(originalURL)
33843390
.then((doc) => {
3385-
WebDiscoveryProject.checkURL(doc, url);
3391+
WebDiscoveryProject.checkURL(doc, url, true);
33863392
})
33873393
.catch((e) => {
33883394
logger.info(
@@ -3518,7 +3524,7 @@ const WebDiscoveryProject = {
35183524
)
35193525
) {
35203526
try {
3521-
WebDiscoveryProject.checkURL(cd, currURL);
3527+
WebDiscoveryProject.checkURL(cd, currURL, false);
35223528
} catch (e) {}
35233529
//Check active usage...
35243530
// WebDiscoveryProject.activeUsage += 1;
@@ -3619,7 +3625,7 @@ const WebDiscoveryProject = {
36193625
}
36203626
})
36213627
.catch((e) => {
3622-
_log("Error fetching fetching the currentURL: " + e);
3628+
_log("Error fetching the currentURL: " + e);
36233629
});
36243630

36253631
WebDiscoveryProject.counter += 4;
@@ -4701,10 +4707,11 @@ const WebDiscoveryProject = {
47014707
else return null;
47024708
},
47034709

4704-
checkURL(pageContent, url) {
4710+
checkURL(pageContent, url, addStrictQuery) {
47054711
const { messages } = WebDiscoveryProject.contentExtractor.run(
47064712
pageContent,
4707-
url
4713+
url,
4714+
addStrictQuery
47084715
);
47094716
for (const message of messages)
47104717
WebDiscoveryProject.telemetry({
@@ -5043,7 +5050,7 @@ const WebDiscoveryProject = {
50435050
e.qurl,
50445051
function (url, page_data, ourl, x) {
50455052
let cd = WebDiscoveryProject.docCache[url]["doc"];
5046-
WebDiscoveryProject.checkURL(cd, url);
5053+
WebDiscoveryProject.checkURL(cd, url, false);
50475054
},
50485055
function (a, b, c, d) {
50495056
_log("Error aux>>>> " + d);
@@ -5714,6 +5721,38 @@ const WebDiscoveryProject = {
57145721
}
57155722
});
57165723
},
5724+
addStrictQueries(url, query) {
5725+
// In some cases, we get query undefined.
5726+
if (!query) {
5727+
_log(">> Got an undefined query >>> " + url);
5728+
return;
5729+
}
5730+
5731+
if (WebDiscoveryProject.isSuspiciousQuery(query)) {
5732+
_log("Dropping suspicious query before double-fetch:", query);
5733+
return;
5734+
}
5735+
5736+
const { isSearchEngineUrl, queryUrl } =
5737+
WebDiscoveryProject.contentExtractor.urlAnalyzer.checkAnonSearchURL(
5738+
url,
5739+
query
5740+
);
5741+
if (isSearchEngineUrl) {
5742+
try {
5743+
const qObj = {
5744+
qurl: queryUrl,
5745+
ts: Date.now(),
5746+
tDiff: getRandomIntInclusive(1, 20),
5747+
};
5748+
logger.debug("PCN: pushed to strictQueries:", queryUrl);
5749+
WebDiscoveryProject.strictQueries.push(qObj);
5750+
WebDiscoveryProject.saveStrictQueries();
5751+
} catch (ee) {
5752+
logger.error("Failed to add query:", ee);
5753+
}
5754+
}
5755+
},
57175756
};
57185757
WebDiscoveryProject.contentExtractor = new ContentExtractor(
57195758
WebDiscoveryProject.patterns,

0 commit comments

Comments
 (0)