Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move to the new DSL #384

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ $ npm run start:brave # start Brave with extension loaded
### Patterns

There are prod and test versions of the patterns file. Test patterns are used for tests only. Prod patterns are fetched from
CDN (https://patterns.hpn.brave.com/patterns.gz). If you have to change patterns during development you need to:
CDN (https://patterns.wdp.brave.com/patterns.gz). If you have to change patterns during development you need to:
1. Serve a gzipped patterns file locally using an HTTP server.
2. Update the patterns URL for your environment in [the config file](./configs/common/urls.js) to point to your locally served file.
3. Disable the signature verification of a patterns file by setting `WDP_PATTERNS_SIGNING` option to `true` in the config file for your environment. For `sandbox` environment such file is [/configs/sandbox.js](./configs/sandbox.js).
3. Disable the signature verification of a patterns file by setting `WDP_PATTERNS_SIGNING` option to `false` in the config file for your environment. For `sandbox` environment such file is [/configs/sandbox.js](./configs/sandbox.js).

## Useful commands

Expand Down
318 changes: 318 additions & 0 deletions modules/core/sources/sanitizer.es
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */

import logger from "./logger";

/**
 * Returns true if the first UTF-16 code unit of `char` is an ASCII digit.
 * An empty string yields false (its char code is NaN).
 */
function isCharNumber(char) {
  const code = char.charCodeAt(0);
  return code >= 48 && code <= 57; // ASCII range for 0-9
}

/**
 * Converts an ASCII digit character to its numeric value.
 *
 * precondition: isCharNumber(char) === true
 */
function uncheckedCharToNumber(char) {
  return char.charCodeAt(0) - 48; // 48 == ASCII '0'
}

/**
 * Checks whether `ean` is a well-formed EAN-13 code: exactly 13 ASCII
 * digits where the last digit is the correct check digit for the
 * first twelve.
 *
 * See https://en.wikipedia.org/wiki/International_Article_Number
 *
 * @param ean string to validate
 * @returns true if the format and the check digit are both valid
 */
export function isValidEAN13(ean) {
  if (ean.length !== 13 || ![...ean].every(isCharNumber)) {
    return false;
  }

  // The first twelve digits are weighted alternately with 1 and 3.
  let sum = 0;
  for (let i = 0; i < 12; i += 1) {
    const factor = i % 2 === 0 ? 1 : 3;
    sum += factor * uncheckedCharToNumber(ean[i]);
  }

  // The check digit brings the total up to the next multiple of 10.
  // The outer modulo is required: if the sum is already a multiple
  // of 10, the check digit is 0 (not 10).
  const checksum = (10 - (sum % 10)) % 10;
  return checksum === uncheckedCharToNumber(ean[12]);
}

/**
 * Checks whether `issn` is a valid ISSN: eight characters (optionally
 * written as two groups of four separated by one hyphen), where the
 * last one is a check character (a digit, or "X"/"x" standing for 10).
 *
 * See https://en.wikipedia.org/wiki/International_Standard_Serial_Number
 *
 * @param issn string to validate
 * @returns true if the shape and the check character are both valid
 */
export function isValidISSN(issn) {
  const shape = /^[0-9]{4}-?[0-9]{3}[0-9xX]$/;
  if (!shape.test(issn)) {
    return false;
  }

  // The shape check guarantees at most one hyphen and 8 remaining characters.
  const compact = issn.replace("-", "");
  const asciiZero = 48;

  // The first seven digits carry the weights 8, 7, ..., 2.
  let weighted = 0;
  for (let pos = 0, weight = 8; pos < 7; pos += 1, weight -= 1) {
    weighted += (compact.charCodeAt(pos) - asciiZero) * weight;
  }

  // The check character has weight 1.
  const last = compact[7];
  if (last === "x" || last === "X") {
    weighted += 10;
  } else {
    weighted += compact.charCodeAt(7) - asciiZero;
  }

  return weighted % 11 === 0;
}

/**
 * Heuristic test for text that resembles an email address anywhere
 * inside the given string. Besides a literal "@", the URL-encoded
 * forms "%40" and (repeatedly encoded) "%2540", "%252540", ... are
 * recognized as the separator.
 *
 * The check deliberately errs on the side of matching too much:
 * false positives are accepted in order to avoid false negatives.
 */
function checkForEmail(str) {
  const emailLike = /[a-z0-9\-_@]+(@|%40|%(25)+40)[a-z0-9\-_]+\.[a-z0-9\-_]/i;
  return emailLike.test(str);
}

/**
 * Intended to filter out potentially problematic numbers while keeping
 * the number of false positives low for common product IDs (ISSN, EAN-13),
 * which show up in searches but carry no personal information.
 *
 * How it works:
 * - At most one space-separated token that is a valid ISSN is removed
 *   up front.
 * - All non-alphanumeric characters (including whitespace) are stripped,
 *   so digits that were only separated by punctuation or spaces are
 *   merged into a single run (e.g. "5555 3235" becomes "55553235").
 *   Letters still separate runs. Runs of one or two digits are ignored.
 * - If exactly one run remains, it has 13 digits, and it appears
 *   verbatim in the text, the outcome depends solely on whether it is
 *   a valid EAN-13.
 * - Otherwise, any run longer than 7 digits counts as a long number.
 *
 * Note:
 * - as a consequence, anything that contains a full date is flagged
 *   (e.g. "2023/05/17", "17.05.2023").
 *   (TODO: perhaps this restriction should be reconsidered to allow a search
 *   like "What happened on 24.12.1914?")
 *
 * @returns true if the text should be treated as containing a long number
 */
function hasLongNumber(str) {
  let text = str;

  // allow one ISSN number
  const issnToken = text.split(" ").find((token) => isValidISSN(token));
  if (issnToken) {
    text = text.replace(issnToken, " ");
  }

  const alphanumericOnly = text.replace(/[^A-Za-z0-9]/g, "");
  const digitRuns = alphanumericOnly.replace(/[^0-9]+/g, " ").trim().split(" ");
  const candidates = digitRuns.filter((run) => run.length > 2);

  // A lone 13-digit number that was written contiguously may be an EAN.
  if (candidates.length === 1) {
    const [only] = candidates;
    if (only.length === 13 && text.includes(only)) {
      return !isValidEAN13(only);
    }
  }

  for (const run of candidates) {
    if (run.length > 7) {
      return true;
    }
  }
  return false;
}

/**
 * Returns true if the first code point of `char` falls into one of the
 * Unicode ranges that this module treats as "logogram-like" scripts.
 */
function isLogogramChar(char) {
  const codePoint = char.codePointAt(0);

  // Inclusive [first, last] code point ranges.
  const ranges = [
    [0x4e00, 0x9fff], // Chinese: common Chinese characters
    [0x3040, 0x30ff], // Japanese: Hiragana and Katakana characters
    [0xac00, 0xd7af], // Korean: Hangul syllables
    [0x0e00, 0x0e7f], // Thai: Thai characters
  ];

  return ranges.some(([first, last]) => codePoint >= first && codePoint <= last);
}

/**
 * Tells whether the text contains at least one logogram-like character.
 *
 * In most languages a word is made up of several characters of an
 * alphabet. In others (e.g. Chinese) a single character can be
 * equivalent to a whole word. Heuristics that count characters or
 * words ("words" being characters not separated by whitespace) need to
 * be adjusted for such texts.
 *
 * Note: texts in Arabic or European languages should not trigger this check.
 */
function hasLogograms(str) {
  for (const char of str) {
    if (isLogogramChar(char)) {
      return true;
    }
  }
  return false;
}

/**
 * Runs a series of heuristics over a search query to decide whether it
 * may contain sensitive information. The checks run in a fixed order
 * and the first one that fails determines the reported reason.
 *
 * @param query the raw query text
 * @returns `{ accept: true }` if no heuristic triggered, otherwise
 *   `{ accept: false, reason }` with a human-readable reason
 */
export function checkSuspiciousQuery(query) {
  const accept = () => ({ accept: true });
  const discard = (reason) => ({ accept: false, reason });

  // Collapse every run of whitespace into a single space.
  //
  // Note: leading or trailing whitespace is intentionally preserved
  // rather than trimmed. Trimming would make little difference in
  // practice, but it would lose information.
  const text = query.replace(/\s+/g, " ");

  // Length limits (stricter if logograms are present)
  if (text.length > 120) {
    return discard("too long (120 character limit)");
  }
  if (text.length > 50 && hasLogograms(text)) {
    return discard("too long (50 characters and logograms are present)");
  }

  // Word count limits
  const words = text.split(" ");
  if (words.length > 9) {
    const longerWords = words.filter((word) => word.length >= 4);
    if (longerWords.length > 16) {
      return discard("too many words");
    }
    if (hasLogograms(text)) {
      return discard("too many words (smaller limit but logograms are present");
    }
  }

  if (hasLongNumber(text)) {
    return discard("long number detected");
  }

  // Anything that could be an email is rejected,
  // even if the email is not well formed
  if (checkForEmail(text)) {
    return discard("looks like an email");
  }

  // Shape of credentials embedded in a URL ("user:password@")
  if (/[^:]+:[^@]+@/.test(text)) {
    return discard("looks like an http password");
  }

  // Long words are common in some languages (e.g. German), so words
  // between 21 and 45 characters are tolerated if they have a common
  // shape: letters only, with uppercase allowed only in first position.
  const commonWordShape = /^[a-zA-ZäöüéÄÖÜ][a-zäöüéß]+$/;
  for (const word of words) {
    if (word.length > 45) {
      return discard("found long word");
    }
    if (word.length > 20 && !commonWordShape.test(word)) {
      return discard("found long word (smaller limit but uncommon shape)");
    }
  }

  return accept();
}

/**
 * Parses `url` with the URL constructor.
 *
 * @returns the parsed URL object, or null if the constructor throws
 */
function tryParseUrl(url) {
  let parsed = null;
  try {
    parsed = new URL(url);
  } catch (e) {
    // not parseable: fall through and report null
  }
  return parsed;
}

/**
 * Returns true if the hostname is one of the known loopback names.
 * The comparison is exact (case-sensitive, no normalization).
 */
function checkForInternalIp(hostname) {
  // TODO: this could be extended to detect more cases
  const loopbackHosts = ["localhost", "127.0.0.1"];
  return loopbackHosts.includes(hostname);
}

/**
 * Detects URLs that point into a browser extension.
 *
 * Such URLs are not expected to show up, but if they do, they must
 * never be sent to the backend. "moz-extension" is especially
 * problematic, since it includes an id that is unique per user and
 * could therefore be used to link messages.
 */
function urlLeaksExtensionId(url) {
  const extensionPrefixes = ["moz-extension://", "chrome-extension://"];
  return extensionPrefixes.some((prefix) => url.startsWith(prefix));
}

/**
 * Sanity checks to protect against accidentally sending sensitive URLs.
 *
 * There are three possible outcomes:
 * 1) "safe": URL can be accepted as is
 * 2) "truncated": URL may have sensitive parts but can be truncated
 *    (keep protocol and hostname, but remove the rest)
 * 3) "dropped": URL is corrupted or unsafe
 *
 * Expectations: this function should be seen as an additional layer of
 * defence, but do not expect it to detect all situations. Instead, make
 * sure to extract only URLs where the context is safe. Otherwise, you
 * are expecting too much from this static classifier.
 *
 * When changing rules here, it is OK to be conservative. Since
 * classification errors are expected, rather err on the side of
 * dropping (or truncating) too much.
 *
 * @param url the URL (as a string) to classify
 * @returns an object `{ result, safeUrl, reason }`:
 *   - "safe": `safeUrl` is the unmodified input (no `reason`)
 *   - "truncated": `safeUrl` is "<protocol>//<hostname>/ (PROTECTED)"
 *   - "dropped": `safeUrl` is null
 */
export function sanitizeUrl(url) {
  const accept = () => ({ result: "safe", safeUrl: url });
  const drop = (reason) => ({ result: "dropped", safeUrl: null, reason });

  // first run some sanity check on the structure of the URL
  const parsedUrl = tryParseUrl(url);
  if (!parsedUrl) {
    return drop("invalid URL");
  }
  if (parsedUrl.username) {
    return drop("URL sets username");
  }
  if (parsedUrl.password) {
    return drop("URL sets password");
  }
  if (parsedUrl.port && parsedUrl.port !== "80" && parsedUrl.port !== "443") {
    return drop("URL has uncommon port");
  }
  if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
    return drop("URL has uncommon protocol");
  }
  if (checkForInternalIp(parsedUrl.hostname)) {
    return drop("URL is not public");
  }
  if (urlLeaksExtensionId(url)) {
    return drop("URL leaks extension ID");
  }

  try {
    // At this point, the most problematic URLs should be gone;
    // now we can also decide to truncate by limiting it to the hostname.
    //
    // Often, that is a good compromise, as it still provides value
    // but the risk that it contains sensitive information is limited.
    // Note that even on https, the hostname will be shared in plaintext,
    // so it is less likely that sites include secrets or personal
    // identifiers in the hostname.
    const truncate = (reason) => {
      const safeUrl = `${parsedUrl.protocol}//${parsedUrl.hostname}/ (PROTECTED)`;
      logger.debug("sanitizeUrl truncated URL:", url, "->", safeUrl);
      return {
        result: "truncated",
        safeUrl,
        reason,
      };
    };

    // TODO: these rules could use some polishing

    // The length must be taken from the parsed hostname: `url` is the raw
    // string and has no "hostname" property, so comparing `url.hostname`
    // against a number would never be true.
    if (parsedUrl.hostname.length > 50) {
      return drop("hostname too long");
    }
    if (url.length > 800) {
      return truncate("url too long");
    }

    // decodeURIComponent throws on malformed percent-escapes; that case
    // is handled by the surrounding try/catch (the URL gets dropped).
    const decodedUrl = decodeURIComponent(url);
    if (checkForEmail(url) || checkForEmail(decodedUrl)) {
      return truncate("potential email found");
    }

    // TODO: check each path and query parameter and truncate if there
    // are fields that could be tokens, secrets, names or logins.

    return accept();
  } catch (e) {
    logger.warn(`Unexpected error in sanitizeUrl. Skipping url=${url}`, e);
    return drop("Unexpected error");
  }
}
49 changes: 47 additions & 2 deletions modules/core/sources/url.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,10 @@ export function isPrivateIP(ip: string): boolean {
if (ip === "::1") {
return true;
}
if (ip.toLowerCase().startsWith("fc00:") ||
ip.toLowerCase().startsWith("fe80:")) {
if (
ip.toLowerCase().startsWith("fc00:") ||
ip.toLowerCase().startsWith("fe80:")
) {
return true;
}
const ipParts = ip.split(":");
Expand Down Expand Up @@ -206,3 +208,46 @@ export function isUrlShortener(url: URL | null): boolean {

return SHORTENERS.has(url.hostname);
}

/**
 * Returns the part of `str` that precedes the first occurrence of `on`,
 * or the whole string if `on` does not occur.
 *
 * For a non-empty separator: split0(str, on) === str.split(on)[0]
 */
function split0(str: string, on: string) {
  const pos = str.indexOf(on);
  return pos < 0 ? str : str.slice(0, pos);
}

/**
 * Given a URL and a list of query parameters, it returns an
 * equivalent URL, but with those query parameters removed.
 *
 * Only the query component is considered: a "?" that appears after the
 * first "#" is part of the fragment and is left untouched.
 *
 * Note: this function will not do any decoding. Instead, it will try
 * to preserve the original URL as best as it can (e.g. the invalid URL
 * "https://example.test?q=x y" will not be normalized to the valid URL
 * "https://example.test/?q=x%20y"). Parameter names are compared
 * verbatim (the text before the first "=" of each "&"-separated part).
 *
 * @param url the URL as a string
 * @param queryParams names of the query parameters to remove
 * @returns the URL without the listed parameters; if no parameter is
 *   left, the "?" is removed as well (the fragment is always kept)
 */
export function removeQueryParams(url: string, queryParams: string[]): string {
  const searchStart = url.indexOf("?");
  if (searchStart === -1) {
    return url;
  }

  // The fragment starts at the first "#". If that comes before the
  // first "?", then the URL has no query component at all.
  const hashStart = url.indexOf("#");
  if (hashStart !== -1 && hashStart < searchStart) {
    return url;
  }

  // From here on, hashStart is either -1 or located after the "?".
  const search =
    hashStart === -1
      ? url.slice(searchStart + 1)
      : url.slice(searchStart + 1, hashStart);
  if (!search) {
    return url;
  }

  const parts = search
    .split("&")
    .filter((x) => !queryParams.includes(split0(x, "=")));
  const beforeSearch = url.slice(0, searchStart);
  const hash = hashStart === -1 ? "" : url.slice(hashStart);

  if (parts.length === 0) {
    return beforeSearch + hash;
  }
  return `${beforeSearch}?${parts.join("&")}${hash}`;
}
Loading
Loading