diff --git a/README.md b/README.md index b61e760a..d8fc292c 100644 --- a/README.md +++ b/README.md @@ -51,10 +51,10 @@ $ npm run start:brave # start Brave with extension loaded ### Patterns There are prod and test versions of the patterns file. Test patterns are used for tests only. Prod patterns are fetched from -CDN (https://patterns.hpn.brave.com/patterns.gz). If you have to change patterns during development you need to: +CDN (https://patterns.wdp.brave.com/patterns.gz). If you have to change patterns during development you need to: 1. Serve a gzipped patterns file locally using an HTTP server. 2. Update patterns URL for your environment in [the config file](./configs/common/urls.js) to point to your locally served file. -3. Disable the signature verification of a patterns file by setting `WDP_PATTERNS_SIGNING` option to `true` in the config file for your environment. For `sandbox` environment such file is [/configs/sandbox.js](./configs/sandbox.js). +3. Disable the signature verification of a patterns file by setting `WDP_PATTERNS_SIGNING` option to `false` in the config file for your environment. For `sandbox` environment such file is [/configs/sandbox.js](./configs/sandbox.js). ## Useful commands diff --git a/modules/core/sources/sanitizer.es b/modules/core/sources/sanitizer.es new file mode 100644 index 00000000..15e225da --- /dev/null +++ b/modules/core/sources/sanitizer.es @@ -0,0 +1,318 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. 
*/
+
+import logger from "./logger";
+
+/**
+ * Returns true if the first UTF-16 code unit of `char` is an ASCII digit.
+ */
+function isCharNumber(char) {
+  const code = char.charCodeAt(0);
+  return code >= 48 && code <= 57; // ASCII range for 0-9
+}
+
+/**
+ * Converts an ASCII digit character to its numeric value.
+ *
+ * precondition: isCharNumber(char) === true
+ */
+function uncheckedCharToNumber(char) {
+  return char.charCodeAt(0) - 48; // 48 == ASCII '0'
+}
+
+/**
+ * Returns true if `ean` consists of exactly 13 ASCII digits and its last
+ * digit equals the check digit computed from the first 12 digits
+ * (weights alternate 1, 3, 1, 3, ... starting with the first digit).
+ *
+ * https://en.wikipedia.org/wiki/International_Article_Number
+ * In the US, also known as GTIN or UPTC.
+ */
+export function isValidEAN13(ean) {
+  if (ean.length !== 13 || ![...ean].every(isCharNumber)) {
+    return false;
+  }
+  let sum = 0;
+  for (let i = 0; i < 12; i += 1) {
+    const factor = i % 2 === 0 ? 1 : 3;
+    sum += factor * uncheckedCharToNumber(ean[i]);
+  }
+  // The check digit is the amount that rounds the weighted sum up to the
+  // next multiple of 10. The trailing "% 10" maps the case where the sum
+  // already is a multiple of 10 to the digit 0; without it the result would
+  // be 10, which can never match a single digit.
+  const checksum = (10 - (sum % 10)) % 10;
+  return checksum === uncheckedCharToNumber(ean[12]);
+}
+
+/**
+ * Returns true if `issn` has the shape "NNNN-NNNC" (the hyphen is optional,
+ * C is a digit or "x"/"X" standing for 10) and its weighted checksum
+ * (weights 8 down to 2 for the first seven digits, plus the check
+ * character) is divisible by 11.
+ *
+ * https://en.wikipedia.org/wiki/International_Standard_Serial_Number
+ */
+export function isValidISSN(issn) {
+  if (!/^[0-9]{4}-?[0-9]{3}[0-9xX]$/.test(issn)) {
+    return false;
+  }
+  issn = issn.replace("-", "");
+
+  let checksum = 0;
+  for (let i = 0; i < 7; i++) {
+    checksum += uncheckedCharToNumber(issn[i]) * (8 - i);
+  }
+  const endsWithX = issn[7] === "x" || issn[7] === "X";
+  checksum += endsWithX ? 10 : uncheckedCharToNumber(issn[7]);
+
+  return checksum % 11 === 0;
+}
+
+/**
+ * Returns true if the given string contains any text that looks
+ * like an email address. The check is conservative, that means
+ * false positives are expected, but false negatives are not.
+ *
+ * Besides a literal "@", the URL-encoded form "%40" and repeatedly
+ * encoded forms ("%2540", "%252540", ...) are recognized.
+ */
+function checkForEmail(str) {
+  return /[a-z0-9\-_@]+(@|%40|%(25)+40)[a-z0-9\-_]+\.[a-z0-9\-_]/i.test(str);
+}
+
+/**
+ * Intended to filter out potentially problematic numbers.
+ * Tries to reduce the number of false-positives by detecting certain common
+ * product IDs (EAN, ISSN), which are common in search, but don't have personal
+ * information.
+ *
+ * Otherwise, it discards queries that contain numbers longer than 7 digits.
+ * So, 123456 is still allowed, but phone numbers like (090)90-2 or 5555 3235
+ * will be dropped.
+ *
+ * Note:
+ * - the current implementation discards anything that contains full dates
+ *   (e.g. "2023/05/17", "17.05.2023").
+ *   (TODO: perhaps this restriction should be reconsidered to allow a search
+ *   like "What happened on 24.12.1914?")
+ */
+function hasLongNumber(str) {
+  // A single valid ISSN is tolerated: blank it out before looking at digits.
+  const issn = str.split(" ").find((token) => isValidISSN(token));
+  const text = issn ? str.replace(issn, " ") : str;
+
+  // Drop everything that is neither an ASCII letter nor a digit (digits that
+  // were only separated by punctuation or spaces get glued together), then
+  // collect the remaining digit runs that are longer than two characters.
+  const alphanumeric = text.replace(/[^A-Za-z0-9]/g, "");
+  const digitRuns = (alphanumeric.match(/[0-9]+/g) || []).filter(
+    (run) => run.length > 2,
+  );
+
+  // Exactly one run, 13 digits long, that also shows up verbatim in the
+  // text: the verdict only depends on whether it is a valid EAN-13.
+  const [candidate] = digitRuns;
+  if (
+    digitRuns.length === 1 &&
+    candidate.length === 13 &&
+    text.includes(candidate)
+  ) {
+    return !isValidEAN13(candidate);
+  }
+
+  return digitRuns.some((run) => run.length > 7);
+}
+
+// Inclusive code point ranges that isLogogramChar checks against
+const LOGOGRAM_RANGES = [
+  [0x4e00, 0x9fff], // Chinese: common Chinese characters
+  [0x3040, 0x30ff], // Japanese: Hiragana and Katakana characters
+  [0xac00, 0xd7af], // Korean: Hangul syllables
+  [0x0e00, 0x0e7f], // Thai: Thai characters
+];
+
+/**
+ * Returns true if the first code point of `char` falls into one of the
+ * ranges listed in LOGOGRAM_RANGES.
+ */
+function isLogogramChar(char) {
+  const codePoint = char.codePointAt(0);
+  return LOGOGRAM_RANGES.some(
+    ([first, last]) => codePoint >= first && codePoint <= last,
+  );
+}
+
+/**
+ * Most languages have an alphabet where a word consists of multiple characters.
+ * But other languages (e.g. Chinese) use logograms, where a single character
+ * is equivalent to a word. Thus, heuristics need to be adjusted if they count
+ * the number of characters or words ("words" being defined as characters not
+ * separated by whitespace).
+ *
+ * Note: texts in Arabic or European languages should not trigger this check.
+ */
+function hasLogograms(str) {
+  // Spreading iterates the string by code points (not UTF-16 code units)
+  return [...str].some(isLogogramChar);
+}
+
+/**
+ * Runs a fixed sequence of heuristics against a search query and decides
+ * whether to accept or discard it. The checks run in the order written
+ * below; the first one that matches determines the reported reason.
+ *
+ * @param {string} query - the search query to check
+ * @returns {{accept: boolean, reason?: string}} `{ accept: true }` if no
+ *   heuristic matched; otherwise `{ accept: false, reason }`.
+ */
+export function checkSuspiciousQuery(query) {
+  function accept() {
+    return {
+      accept: true,
+    };
+  }
+
+  function discard(reason) {
+    return {
+      accept: false,
+      reason,
+    };
+  }
+
+  // First, normalize white spaces
+  //
+  // Note: this code doesn't trim but preserves a leading or trailing
+  // whitespace. We could trim (and the expected differences would be minimal).
+  // Yet there is little benefit in trimming and it would lose information.
+  query = query.replace(/\s+/g, " ");
+
+  // Remove the msg if the query is too long
+  if (query.length > 120) {
+    return discard("too long (120 character limit)");
+  }
+  if (query.length > 50 && hasLogograms(query)) {
+    return discard("too long (50 characters and logograms are present)");
+  }
+
+  // Word-count limits only apply to queries with more than 9 words
+  // (words being the parts between single spaces)
+  const words = query.split(" ");
+  if (words.length > 9) {
+    if (words.filter((x) => x.length >= 4).length > 16) {
+      return discard("too many words");
+    }
+    if (hasLogograms(query)) {
+      return discard("too many words (smaller limit but logograms are present");
+    }
+  }
+
+  if (hasLongNumber(query)) {
+    return discard("long number detected");
+  }
+
+  // Remove if it contains text that could be an email,
+  // even if the email is not well formed
+  if (checkForEmail(query)) {
+    return discard("looks like an email");
+  }
+
+  // Matches the shape "<something>:<something>@" (e.g. "user:secret@host")
+  if (/[^:]+:[^@]+@/.test(query)) {
+    return discard("looks like an http password");
+  }
+
+  for (let i = 0; i < words.length; i += 1) {
+    if (words[i].length > 45) {
+      return discard("found long word");
+    }
+
+    // Long words are common in some languages (e.g. German)
+    // Words longer than 20 characters are only tolerated if they consist of
+    // letters only (including the listed umlauts, "é" and "ß"), with
+    // uppercase allowed only in the first position.
+    if (
+      words[i].length > 20 &&
+      !/^[a-zA-ZäöüéÄÖÜ][a-zäöüéß]+$/.test(words[i])
+    ) {
+      return discard("found long word (smaller limit but uncommon shape)");
+    }
+  }
+
+  return accept();
+}
+
+/**
+ * Parses `url` with the URL constructor. Returns the URL object, or null
+ * (instead of throwing) if the constructor rejects the input.
+ */
+function tryParseUrl(url) {
+  try {
+    return new URL(url);
+  } catch (e) {
+    return null;
+  }
+}
+
+/**
+ * Returns true if `hostname` is exactly "localhost" or "127.0.0.1".
+ * Other loopback or private addresses are not detected (see TODO).
+ */
+function checkForInternalIp(hostname) {
+  // TODO: this could be extended to detect more cases
+  return hostname === "localhost" || hostname === "127.0.0.1";
+}
+
+/**
+ * There should be no reason for these URLs to show up, but if they do
+ * we should never send them to the backend. Especially, "moz-extension"
+ * is problematic, as it includes an id that is unique per user and
+ * can be used to link messages.
+ */
+function urlLeaksExtensionId(url) {
+  return (
+    url.startsWith("moz-extension://") || url.startsWith("chrome-extension://")
+  );
+}
+
+/**
+ * Sanity checks to protect against accidentally sending sensitive URLs.
+ *
+ * There are three possible outcomes:
+ * 1) "safe": URL can be accepted as is
+ * 2) "truncated": URL may have sensitive parts but can be truncated
+ *    (i.e. include the hostname but remove the rest)
+ * 3) "dropped": URL is corrupted or unsafe
+ *
+ * Expectations: this function should be seen as an additional layer of defence,
+ * but do not expect it to detect all situations. Instead, make sure to extract
+ * only URLs where the context is safe. Otherwise, you are expecting too
+ * much from this static classifier.
+ *
+ * When changing rules here, it is OK to be conservative. Since
+ * classification errors are expected, rather err on the side of
+ * dropping (or truncating) too much.
+ */
+export function sanitizeUrl(url) {
+  // Result builders. Every result has the shape { result, safeUrl[, reason] }.
+  const accept = () => ({ result: "safe", safeUrl: url });
+  const drop = (reason) => ({ result: "dropped", safeUrl: null, reason });
+
+  // first run some sanity check on the structure of the URL
+  const parsedUrl = tryParseUrl(url);
+  if (!parsedUrl) {
+    return drop("invalid URL");
+  }
+  if (parsedUrl.username) {
+    return drop("URL sets username");
+  }
+  if (parsedUrl.password) {
+    return drop("URL sets password");
+  }
+  if (parsedUrl.port && parsedUrl.port !== "80" && parsedUrl.port !== "443") {
+    return drop("URL has uncommon port");
+  }
+  if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
+    return drop("URL has uncommon protocol");
+  }
+  if (checkForInternalIp(parsedUrl.hostname)) {
+    return drop("URL is not public");
+  }
+  // Defence in depth: extension URLs should normally already have been
+  // rejected by the protocol check above.
+  if (urlLeaksExtensionId(url)) {
+    return drop("URL leaks extension ID");
+  }
+
+  try {
+    // At this point, the most problematic URLs should be gone;
+    // now we can also decide to truncate by limiting it to the hostname.
+    //
+    // Often, that is a good compromise, as it still provides value
+    // but the risk that it contains sensitive information is limited.
+    // Note that even on https, the hostname will be shared in plaintext,
+    // so it is less likely that sites include secrets or personal
+    // identifiers in the hostname.
+    const truncate = (reason) => {
+      const safeUrl = `${parsedUrl.protocol}//${parsedUrl.hostname}/ (PROTECTED)`;
+      logger.debug("sanitizeUrl truncated URL:", url, "->", safeUrl);
+      return {
+        result: "truncated",
+        safeUrl,
+        reason,
+      };
+    };
+
+    // TODO: these rules could use some polishing
+    //
+    // The length check has to be made on the parsed URL: "url" is the raw
+    // input string and has no "hostname" property, so comparing
+    // "url.hostname" against a number is always false.
+    if (parsedUrl.hostname.length > 50) {
+      return drop("hostname too long");
+    }
+    if (url.length > 800) {
+      return truncate("url too long");
+    }
+
+    // decodeURIComponent throws on malformed escape sequences; that case is
+    // handled by the catch block below (the URL gets dropped).
+    const decodedUrl = decodeURIComponent(url);
+    if (checkForEmail(url) || checkForEmail(decodedUrl)) {
+      return truncate("potential email found");
+    }
+
+    // TODO: check each path and query parameter and truncate if there
+    // are fields that could be tokens, secrets, names or logins.
+
+    return accept();
+  } catch (e) {
+    logger.warn(`Unexpected error in sanitizeUrl. Skipping url=${url}`, e);
+    return drop("Unexpected error");
+  }
+}
diff --git a/modules/core/sources/url.ts b/modules/core/sources/url.ts
index c58cd283..111b4d54 100644
--- a/modules/core/sources/url.ts
+++ b/modules/core/sources/url.ts
@@ -79,8 +79,10 @@ export function isPrivateIP(ip: string): boolean {
   if (ip === "::1") {
     return true;
   }
-  if (ip.toLowerCase().startsWith("fc00:") ||
-      ip.toLowerCase().startsWith("fe80:")) {
+  if (
+    ip.toLowerCase().startsWith("fc00:") ||
+    ip.toLowerCase().startsWith("fe80:")
+  ) {
     return true;
   }
   const ipParts = ip.split(":");
@@ -206,3 +208,46 @@ export function isUrlShortener(url: URL | null): boolean {
 
   return SHORTENERS.has(url.hostname);
 }
+
+/**
+ * split0(str, on) === str.split(on)[0]
+ *
+ * In other words: the part of "str" before the first occurrence of "on",
+ * or all of "str" if "on" does not occur.
+ */
+function split0(str: string, on: string) {
+  const pos = str.indexOf(on);
+  return pos < 0 ? str : str.slice(0, pos);
+}
+
+/**
+ * Given a URL and a list of query parameters, it returns an
+ * equivalent URL, but with those query parameters removed.
+ *
+ * Note: this function will not do any decoding. Instead, it will try
+ * to preserve the original URL as best as it can (e.g.
the invalid URL
+ * "https://example.test?q=x y" will not be normalized to the valid URL
+ * "https://example.test/?q=x%20y").
+ */
+export function removeQueryParams(url: string, queryParams: string[]) {
+  // Locate the query: it starts after the first "?" and runs up to the
+  // next "#" (or to the end of the string if there is none).
+  const queryMark = url.indexOf("?");
+  if (queryMark < 0) {
+    return url;
+  }
+  const hashMark = url.indexOf("#", queryMark + 1);
+  const hasHash = hashMark !== -1;
+  const rawQuery = hasHash
+    ? url.slice(queryMark + 1, hashMark)
+    : url.slice(queryMark + 1);
+  if (rawQuery === "") {
+    return url;
+  }
+
+  // Keep every "&"-separated part whose name (the text before the
+  // first "=") is not listed in queryParams.
+  const kept: string[] = [];
+  for (const part of rawQuery.split("&")) {
+    if (!queryParams.includes(split0(part, "="))) {
+      kept.push(part);
+    }
+  }
+
+  // Reassemble; the "?" is omitted if no parameter is left.
+  const head = url.slice(0, queryMark);
+  const tail = hasHash ? url.slice(hashMark) : "";
+  return kept.length === 0 ? head + tail : `${head}?${kept.join("&")}${tail}`;
+}
diff --git a/modules/web-discovery-project/sources/content-extractor.es b/modules/web-discovery-project/sources/content-extractor.es
index c2c3f208..7d1223d5 100644
--- a/modules/web-discovery-project/sources/content-extractor.es
+++ b/modules/web-discovery-project/sources/content-extractor.es
@@ -2,11 +2,11 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/ -import { extractHostname } from "../core/tlds"; -import { parse } from "../core/url"; +/* eslint-disable no-continue */ import logger from "./logger"; -import { parseURL } from "./network"; +import { lookupBuiltinTransform } from "./patterns"; +import UrlAnalyzer from "./url-analyzer"; export function parseQueryString(query) { if (query.length === 0) { @@ -29,651 +29,229 @@ export function parseQueryString(query) { return q; } -// helper function to implement the "splitF" function -// (part of the DSL used in the patterns description) -function refineSplitFunc(splitString, splitON, arrPos) { - const result = splitString.split(splitON)[arrPos]; - try { - if (result) { - return decodeURIComponent(result); +function runSelector(item, selector, attr, baseURI) { + const elem = selector ? item.querySelector(selector) : item; + if (elem) { + if (attr === "textContent") { + return elem.textContent; } - return decodeURIComponent(splitString); - } catch (e) { - logger.warn("Unable to decode URI", result || splitString); - return null; - } -} - -// helper function to implement the "parseU" function -// (part of the DSL used in the patterns description) -function refineParseURIFunc(url, extractType, keyName) { - const urlParts = parseURL(url); - if (urlParts && urlParts.query_string) { - const parseResult = parseQueryString(urlParts.query_string); - if (extractType === "qs") { - if (parseResult[keyName]) { - return decodeURIComponent(parseResult[keyName][0]); - } - return url; - } - // For now, leave the old semantic. - // TODO: Probably, we should return "url" here. - return undefined; - } - - return url; -} - -/** - * Helper to implement the "json" function (part of the DSL used - * in the patterns description). - * - * Takes a JSON string object, parses it and extract the data under the - * given path. By default, it will only extract safe types (strings, - * numbers, booleans), mostly to prevent accidentally extracting - * more than intended. 
- * - * (exported for tests only) - */ -export function _jsonPath(json, path, extractObjects = false) { - try { - let obj = JSON.parse(json); - for (const field of path.split(".")) { - obj = obj[field]; - } - if (typeof obj === "string") { - return obj; - } - if (typeof obj === "number" || typeof obj === "boolean") { - return obj.toString(); + if (attr === "href") { + // Going throw the attribute "href" avoids some of the problems of using + // directly "elem.href". For relative links the DOMParser cannot + // accidentally fill in the extension ID as the base. Another advantage + // (also for absolute links) is that it avoids a double-encoding problem + // in certain DOM parser (doesn't seem to affect Firefox, but linkedom). + // + // Since this part may dependent on the DOMParse implementation, two + // notes about the intended semantic: + // * links should be as close to the original page as possible + // * extensions IDs must not leak into the output + const rawLink = elem.getAttribute("href"); + return rawLink ? new URL(rawLink, baseURI).href : null; } - if (extractObjects && obj) { - return JSON.stringify(obj); + if (elem.hasAttribute(attr)) { + return elem.getAttribute(attr); } - // prevent uncontrolled text extraction - return ""; - } catch (e) { - return ""; } + return null; } -// (exported for tests only) -export function _mergeArr(arrS) { - const messageList = []; - const allKeys = Object.keys(arrS); - - // assumption: all keys have the same number of elements - const elemsPerKey = arrS[allKeys[0]].length; - for (let i = 0; i < elemsPerKey; i += 1) { - const innerDict = {}; - for (const e of allKeys) { - innerDict[e] = arrS[e][i]; - } - messageList.push(innerDict); +function runTransforms(value, transformSteps = []) { + if (!Array.isArray(transformSteps)) { + throw new Error("Transform definitions must be array."); } - return messageList; -} - -/** - * Verifies that the given payload has all expected fields set. 
- * - * @param payload the payload of the message to be checked - * @param expectedFields array with entries - * { key: name of the field; type: 'object'|'array' }. - * - * (exported for tests only) - */ -export function _allMandatoryFieldsSet(payload, expectedFields) { - function isDefined(value) { - return value !== null && value !== undefined && value !== ""; + if (value === undefined || value === null) { + return null; } - - function isArrayLikeWithAtLeastOneTruthyEntry(arrayValue) { - // Not an array according to JavaScript, but what we expect - // is a JSON mapping like the following: - // - // { "0": , "1": , ... } - // - // The message will be rejected only if there is no entry at all, - // or if all its entries consists only of undefined mappings: - // - // { "0": { "t": null, "u": null } } // false - // { "0": { "t": "", "u": "" } } // false - // { "0": { "t": "foo", "u": "" } } // true (partial matches are OK) - // - return Object.values(arrayValue).some((innerValue) => - Object.values(innerValue).some(isDefined), - ); + let tmpValue = value; + for (const step of transformSteps) { + const [name, ...args] = step; + const transform = lookupBuiltinTransform(name); + tmpValue = transform(tmpValue, ...args); } + return tmpValue ?? null; +} - for (const { key, type } of expectedFields) { - const value = payload[key]; - if (!isDefined(value)) { - return false; - } - - // Perform additional checks for aggregated fields (e.g., result lists). - if (type === "array" && !isArrayLikeWithAtLeastOneTruthyEntry(value)) { - return false; +function findFirstMatch(rootItem, selectorDef, baseURI) { + // special case: allows to define multiple rules (first matching rule wins) + if (selectorDef.firstMatch) { + for (const { select, attr, transform = [] } of selectorDef.firstMatch) { + const match = runSelector(rootItem, select, attr, baseURI) ?? 
null; + if (match !== null) { + return runTransforms(match, transform); + } } + return null; } - return true; + + // default case: only one rule + return ( + runSelector(rootItem, selectorDef.select, selectorDef.attr, baseURI) ?? null + ); } -/** - * This class is responsible for most of the data collection - * from HTML pages. - * - * Internally, it has a list of patterns that describe how to - * find the information, and what type and fields the resulting message - * should have. The pattern definition itself is fetched from the server. - * - * Be careful: code and rules are released separately. That is intended - * as we need to be able to quickly react on changes to the HTML structure - * by pushing new patterns (without waiting for releases). - * - * That has two consequences: - * - * 1) client-side: - * The code in the extension should not make too strong assumptions about - * the rules themself. Unnecessary assumptions can make it later more - * difficult to push new rules. Also be prepared to handle errors - * gracefully. - * - * 2) server-side: - * When you release new rules, it should not break core functionality - * of older clients. - * - * Note: In general, that should be not an issue, as long as the clients - * handle errors properly. In the worst case, you will then break data - * collection, but that it is more of a trade-off. - * Breaking data collections in old clients is acceptable, as long as - * the percentage of affected clients is low enough. - * - * As the outside world is not static, rules that worked the day before might - * suddenly break. Expect that the structure of HTML pages will change. - * - * To detect breakage, you can monitor the collected events that we - * receive on the server. Typically, if the structure of a page changes, - * there is a sudden drop of incoming messages for that specific type. 
- */ export class ContentExtractor { - /** - * Note: In production, _WebDiscoveryProject will be the global WebDiscoveryProject object. - */ - constructor(_WebDiscoveryProject) { - this._WebDiscoveryProject = _WebDiscoveryProject; - this.msgType = _WebDiscoveryProject.msgType; - - // Patterns for content extraction. - // Will be initialized by the ContentExtractionPatternsLoader, - // which polls for configuration changes from the backend. - // - // Even before the rules are loaded, you can immediately use the class. - // There will be no errors, but as the rules are empty, it will not - // be able to collect any data. - // - this.patterns = { - normal: { - searchEngines: [], - rArray: [], - extractRules: {}, - payloads: {}, - queryTemplate: {}, - }, - strict: { - searchEngines: [], - rArray: [], - extractRules: {}, - payloads: {}, - queryTemplate: {}, - }, - }; - - // TODO: maskURL depends heavily on WebDiscoveryProject functionality. - // Could also be extracted to another file, but that is a - // non-trivial change. For now leave the dependence to WebDiscoveryProject. - this.refineFuncMappings = { - splitF: refineSplitFunc, - parseU: refineParseURIFunc, - maskU: (url) => this._WebDiscoveryProject.maskURL(url), - json: _jsonPath, - }; - - // TODO: can this state be avoided?! - this._messageTemplate = {}; + constructor(patterns, wdp) { + this.wdp = wdp; + this.patterns = patterns; + this.urlAnalyzer = new UrlAnalyzer(this.patterns); } - /** - * Update content extraction patterns with the latest one from the backend. 
- */ - updatePatterns(patternConfig, ruleset) { - this.patterns[ruleset] = { - searchEngines: patternConfig.searchEngines, - extractRules: patternConfig.scrape, - payloads: patternConfig.payloads, - idMappings: patternConfig.idMapping, - rArray: patternConfig.urlPatterns.map((x) => new RegExp(x)), - queryTemplate: patternConfig.queryTemplate || {}, - }; - this._patternsLastUpdated = new Date(); - logger.debug( - `Successfully updated "${ruleset}" patterns at ${this._patternsLastUpdated}`, - ); + extractQuery(url) { + const { found, query } = this.urlAnalyzer.parseLinks(url); + if (!found) return; + return query; } - checkURL(pageContent, url, ruleset) { - const patterns = this.patterns[ruleset]; - const rArray = patterns.rArray; - const searchEngines = patterns.searchEngines; - - logger.debug("PCN: checking URL", url, "against ruleset", ruleset); - for (let i = 0; i < rArray.length; i += 1) { - if (rArray[i].test(url)) { - logger.debug("PCN: URL", url, `matches #${i}`); - const baseURI = parse(url).origin; - this._extractContent(i, pageContent, url, baseURI, ruleset); - - // Do not want to continue after search engines... - if (searchEngines.indexOf(String(i)) !== -1) { - return; - } - if (this._WebDiscoveryProject.debug) { - logger.debug("Continue further after search engines "); - } - } + run(pageContent, url) { + function discard(reason = "") { + logger.debug("No messages found for query:", query, "Reason:", reason); + return { + messages: [], + reason, + }; } - } - /** - * True if the url matches one of the known search engines - * (as defined by the loaded patterns). 
- */ - isSearchEngineUrl(url) { - return this._checkSearchURL(url, "normal") !== -1; - } - - // construct the doublefetch URL for the given query - _createAnonSearchQuery(hostname, query, searchEngineIdx) { - const template = - this.patterns.strict.queryTemplate[String(searchEngineIdx)] || {}; + const { found, type, query } = this.urlAnalyzer.parseLinks(url); + if (!found) return discard("No content found."); - let queryPrefix = template.prefix; - if (queryPrefix === undefined) { - // fallback for old patterns - queryPrefix = "search?q="; + const messages = this.extractMessages(pageContent, type, query, url); + if (messages.length === 0) { + return discard("No content found."); } - // Workaround for an encoding issue (source: https://stackoverflow.com/a/24417399/783510). - // Reason: we want to preserve the original search term. In other words, searches - // for "abc def" and "abc+def" should be distinguishable. That is why we need to - // avoid the ambigious '+' character and use explicit white space encoding. 
- const encodedQuery = encodeURIComponent(query).replace(/%20/g, "+"); - return `https://${hostname}/${queryPrefix}${encodedQuery}`; + logger.debug(messages.length, "messages found for query:", query); + return { messages }; } - checkAnonSearchURL(url, query) { - const index = this._checkSearchURL(url, "strict"); - if (index === -1) { - return { isSearchEngineUrl: false }; - } - - const hostname = extractHostname(url); - const queryUrl = this._createAnonSearchQuery(hostname, query, index); - - if (extractHostname(queryUrl) !== hostname) { - // paranoid check: should not be possible to reach - throw new Error("refusing to make a request to another host"); + extractMessages(doc, type, query, url) { + const rules = this.patterns.getRulesSnapshot(); + if (!rules[type]) { + return []; } - return { isSearchEngineUrl: true, queryUrl }; - } - _checkSearchURL(url, ruleset) { - const patterns = this.patterns[ruleset]; - const searchEngines = patterns.searchEngines; - const rArray = patterns.rArray; - - logger.debug("_checkSearchURL", url, ruleset, patterns); - for (let i = 0; i < rArray.length; i += 1) { - if (rArray[i].test(url)) { - if (searchEngines.indexOf(String(i)) !== -1) { - return i; - } - - if (this._WebDiscoveryProject.debug) { - logger.debug( - `Not search engine >>> url=${url}, i=${i}, searchEngines=${searchEngines}, ruleset=${ruleset}`, - ); - } - return -1; - } - } - return -1; - } - - _extractContent(ind, cd, url, baseURI, ruleset) { - logger.debug("_extractContent", url); - const scrapeResults = {}; - - const patterns = this.patterns[ruleset]; - const rules = patterns.extractRules[ind]; - const payloadRules = patterns.payloads[ind]; - const idMappings = patterns.idMappings[ind]; - - let urlArray = []; - for (const key of Object.keys(rules)) { - const innerDict = {}; - for (const eachKey of Object.keys(rules[key])) { - if (rules[key][eachKey].type === "standard") { - // Depending on etype, currently only supporting url. Maybe ctry too. 
- if (rules[key][eachKey].etype === "url") { - let qurl = url; - const functionsApplied = - rules[key][eachKey].functionsApplied || null; - // Check if the value needs to be refined or not. - if (functionsApplied) { - qurl = functionsApplied.reduce((attribVal, e) => { - if ( - Object.prototype.hasOwnProperty.call( - this.refineFuncMappings, - e[0], - ) - ) { - return this.refineFuncMappings[e[0]](attribVal, e[1], e[2]); - } - return attribVal; - }, qurl); - } - innerDict[eachKey] = [qurl]; - } - - if (rules[key][eachKey].etype === "ctry") { - innerDict[eachKey] = [this._WebDiscoveryProject.getCountryCode()]; + const found = {}; + const baseURI = url; + + const { input = {}, output = {} } = rules[type]; + for (const [selector, selectorDef] of Object.entries(input)) { + found[selector] = found[selector] || {}; + if (selectorDef.first) { + const item = doc.querySelector(selector); + if (item) { + for (const [key, def] of Object.entries(selectorDef.first)) { + const value = findFirstMatch(item, def, baseURI); + found[selector][key] = runTransforms(value, def.transform); } - } else if ( - rules[key][eachKey].type === "searchQuery" || - rules[key][eachKey].type === "widgetTitle" - ) { - urlArray = this._getAttribute( - cd, - key, - rules[key][eachKey].item, - rules[key][eachKey].etype, - rules[key][eachKey].keyName, - rules[key][eachKey].functionsApplied || null, - baseURI, - ); - innerDict[eachKey] = urlArray; - if (ruleset === "normal") { - const query = urlArray[0]; - if (query) { - logger.debug( - "Populating query Cache <<<< ", - url, - " >>>> ", - query, - ); - this._WebDiscoveryProject.addStrictQueries(url, query); - this._WebDiscoveryProject.queryCache[url] = { - d: 0, - q: query, - t: idMappings, - }; + } + } else if (selectorDef.all) { + const rootItems = doc.querySelectorAll(selector); + if (rootItems) { + found[selector] = found[selector] || {}; + for (const [key, def] of Object.entries(selectorDef.all)) { + found[selector][key] = []; + for (const rootItem of 
rootItems) { + const item = findFirstMatch(rootItem, def, baseURI); + found[selector][key].push(runTransforms(item, def.transform)); } } - } else { - urlArray = this._getAttribute( - cd, - key, - rules[key][eachKey].item, - rules[key][eachKey].etype, - rules[key][eachKey].keyName, - rules[key][eachKey].functionsApplied || null, - baseURI, - ); - innerDict[eachKey] = urlArray; } - } - - if (this._messageTemplate[ind]) { - this._messageTemplate[ind][key] = innerDict; } else { - this._messageTemplate[ind] = {}; - this._messageTemplate[ind][key] = innerDict; - } - - // Check if array has values. - const merged = _mergeArr(this._messageTemplate[ind][key]); - if (merged.length > 0) { - scrapeResults[key] = merged; + throw new Error( + 'Internal error: bad selector (expected "first" or "all")', + ); } } - for (const rule of Object.keys(payloadRules || {})) { - this._processExtractedData(scrapeResults, ind, rule, ruleset); - } - } + // meta fields, which are provided instead of being extracted + const context = { + q: query ?? null, + qurl: url, + ctry: this.wdp.getCountryCode(), + }; + const isPresent = (x) => x !== null && x !== undefined && x !== ""; - _getAttribute( - cd, - parentItem, - item, - attrib, - keyName, - functionsApplied, - baseURI, - ) { - const arr = []; - const rootElement = Array.prototype.slice.call( - cd.querySelectorAll(parentItem), - ); - for (let i = 0; i < rootElement.length; i += 1) { - const val = item ? rootElement[i].querySelector(item) : rootElement[i]; - if (val) { - // Check if the value needs to be refined or not. - let attribVal; - if (attrib === "href") { - // Unless there is a tag, DOMParser will use the extension id - // as the implicit for all relative links. - try { - const rawLink = val.getAttribute(attrib); - attribVal = rawLink ? new URL(rawLink, baseURI).href : null; - } catch (e) { - attribVal = null; + // Now combine the results to build the messages as specified + // in the "output" section of the patterns. 
+ // + // Message payload + // --------------- + // There are three origins of the data: + // 1) a single keys + // (extracted from an input with a "first" section) + // 2) array entries that need to be merged + // (extracted from an input with an "all" section) + // 3) special entries provided in the context + // + // Filtering: + // ---------- + // By default, all keys of a message have to be present (where empty arrays + // and empty strings are considered to absent). The default behaviour can be + // overwritten by setting the "optional" property of a field. Also, the merging + // of arrays can allow entries with missing values by overwriting the + // "requiredKeys" property. If not specified, all keys of the array entry need + // to be present; otherwise, the entry will be skipped. + const messages = []; + // eslint-disable-line no-labels, no-restricted-syntax + nextaction: for (const [action, schema] of Object.entries(output)) { + const payload = {}; + for (const { + key, + source, + requiredKeys, + optional = false, + } of schema.fields) { + if (source) { + if (!input[source]) { + throw new Error( + `Output rule for action=${action} references invalid input source=${source}`, + ); } - } else { - attribVal = val[attrib] || val.getAttribute(attrib); - } - if (functionsApplied) { - attribVal = functionsApplied.reduce((accum, e) => { - if ( - Object.prototype.hasOwnProperty.call( - this.refineFuncMappings, - e[0], - ) - ) { - return this.refineFuncMappings[e[0]](accum, e[1], e[2]); + if (input[source].first) { + // case 1: single extracted value + if (!optional && !isPresent(found[source][key])) { + continue nextaction; // eslint-disable-line no-labels } - return accum; - }, attribVal); - } - arr.push(attribVal); - } else { - arr.push(val); - } - } - return arr; - } - - /** - * The structure of the final messages is defined in the "payloads" - * section in "fields". Messages can have multiple fields. 
- * Each field consists of two or three entries: - * - * [] - * - * The first () is the key to the data that we just extracted. - * The second () defines the name of the key in the final message - * (in other words, what the server expects). - * - * The third is optional and defines how data should be aggregated. - * Currently, the only aggregation rule is "join", which puts all - * matches in one list. For example, "query" messages use it to - * aggregate the list of results. - * - * When data for all keys is available, the message is ready to - * be sent. - * - * To preserve anonymity, there are two additional steps, - * which are out of scope of this function: - * - * - To prevent leaking sensitive information, the message - * will have to pass the wdp sanitizer heuristics - * (i.e., parts of the message could be omitted or the - * message could be dropped completely.) - * - For network anonymity, sending is done through HPN. - */ - _processExtractedData(scrapeResults, idx, key, ruleset) { - let payloadRules; - try { - const patterns = this.patterns[ruleset]; - payloadRules = patterns.payloads[idx][key]; - - if (payloadRules.type === "single" && payloadRules.results === "single") { - scrapeResults[key].forEach((e) => { - e.ctry = this._WebDiscoveryProject.getCountryCode(); - this._sendMessageIfAllFieldsAreSet(payloadRules, e); - }); - } else if ( - payloadRules.type === "single" && - payloadRules.results === "custom" - ) { - // Note: currently, only used the "maliciousUrl" action. - const payload = {}; - payloadRules.fields.forEach((e) => { - try { - payload[e[1]] = scrapeResults[e[0]][0][e[1]]; - } catch (ee) { - // TODO: When does this happen? Is it a problem? 
- } - this._sendMessageIfAllFieldsAreSet(payloadRules, payload); - }); - } else if ( - payloadRules.type === "query" && - payloadRules.results === "clustered" - ) { - const payload = {}; - payloadRules.fields.forEach((e) => { - const extractedContent = scrapeResults[e[0]]; - if (extractedContent !== undefined) { - if (e.length > 2) { - if (e[2] === "join") { - // Aggregate all results into one array-like map - // ({ "0" => , "1": , ... }). - // - // Skip entries where all values are empty, but keep - // entries when there is at least one value, for instance, - // '{ "t": "foo", "u": null }' would still be added: - // - // * Filtering values without any match is useful because - // it allows to throw away false positives (i.e., when a - // css selector matched unrelated parts). - // - // * Partial matches, on the other hand, should not be dropped. - // In general, they are not false positives, but rules that - // used to work before but are now partly broken because of - // recent layout changes. In that case, it is still better - // to send the message with the partial results then dropping - // it completely. - // - const joinArr = {}; - let counter = 0; - for (let i = 0; i < extractedContent.length; i += 1) { - if (Object.values(extractedContent[i]).some((x) => x)) { - joinArr[String(counter)] = extractedContent[i]; - counter += 1; - } - } - payload[e[1]] = joinArr; - } else { - // Currently unreachable by the published patterns. - logger.warn("Ignoring rule with unexpected aggregator:", e); - } - } else { - payload[e[1]] = extractedContent[0][e[1]]; + payload[key] = found[source][key] ?? null; + } else if (input[source].all) { + // case 2: merge the fields from an array of previously extracted values + const results = []; + const innerKeys = Object.keys(input[source].all); + for (const innerKey of innerKeys) { + found[source][innerKey].forEach((value, idx) => { + results[idx] = results[idx] || {}; + results[idx][innerKey] = value ?? 
null; + }); } - } - }); - this._sendMessageIfAllFieldsAreSet(payloadRules, payload); - } else if ( - payloadRules.type === "query" && - payloadRules.results === "scattered" - ) { - // Note: currently not used (TODO: remove or leave?) - const payload = {}; - payloadRules.fields.forEach((e) => { - if (e.length > 2) { - const joinArr = {}; - let counter = 0; - e[0].forEach((eachPattern) => { - for (let i = 0; i < scrapeResults[eachPattern].length; i += 1) { - joinArr[String(counter)] = scrapeResults[eachPattern][i]; - counter += 1; - } - }); - if (Object.keys(joinArr).length > 0) { - payload[e[1]] = joinArr; + + // check if all required data was found + // (by default, all keys in the fields need to be present) + const required = requiredKeys || innerKeys; + const allFieldsPresent = (entry) => + required.every((x) => isPresent(entry[x])); + const cleanedResults = results.filter(allFieldsPresent); + if (cleanedResults.length === 0 && !optional) { + continue nextaction; // eslint-disable-line no-labels } + payload[key] = { ...cleanedResults }; } else { - payload[e[1]] = scrapeResults[e[0]][0][e[1]]; + throw new Error( + `Output rule for action=${action} does not match input key=${key}`, + ); } - }); - this._sendMessageIfAllFieldsAreSet(payloadRules, payload); - } - } catch (ee) { - if (this._WebDiscoveryProject.debug) { - logger.warn( - "_processExtractedData failed (scrapeResults:", - scrapeResults, - ", key:", - key, - ", ruleset:", - ruleset, - ", payloadRules:", - payloadRules, - ", error:", - ee, - ")", - ); - } else { - logger.warn(`_processExtractedData failed: ${ee}`); + } else { + // case 3: access special keys from the context + if (!optional && !isPresent(context[key])) { + continue; + } + payload[key] = context[key] ?? 
null; + } } - } - } - _sendMessageIfAllFieldsAreSet(payloadRules, payload) { - const expectedFields = payloadRules.fields.map(([, key, aggregator]) => { - // Note: currently, 'join' is the only aggregator and thus - // it is the only possibility to have arrays in the output. - const type = aggregator === "join" ? "array" : "object"; - return { key, type }; - }); - if (_allMandatoryFieldsSet(payload, expectedFields)) { - logger.debug("PCN: created message:", this.msgType, payload); - this._WebDiscoveryProject.telemetry({ - type: this.msgType, - action: payloadRules.action, - payload, - }); + const body = { action, payload }; + messages.push(body); } - this._messageTemplate = {}; - } - - tryExtractBraveSerpQuery(url) { - const isBraveSearch = - url.startsWith("https://search.brave.com/search?") || - url.startsWith("https://bravesearch.com/search?") || - url.startsWith("https://search.brave.software/search?"); - return isBraveSearch && parse(url).searchParams.get("q"); + logger.debug("Found the following messages:", messages); + return messages; } } diff --git a/modules/web-discovery-project/sources/patterns.es b/modules/web-discovery-project/sources/patterns.es new file mode 100644 index 00000000..ec173234 --- /dev/null +++ b/modules/web-discovery-project/sources/patterns.es @@ -0,0 +1,314 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. 
*/ + +import logger from "./logger"; +import { sanitizeUrl } from "../core/sanitizer"; +import { removeQueryParams } from "../core/url"; + +function expectString(arg) { + if (typeof arg !== "string") { + throw new Error(`Expected string argument but got: ${arg}`); + } +} + +function expectInteger(arg) { + if (typeof arg !== "number" || arg % 1 !== 0) { + throw new Error(`Expected integer argument but got: ${arg}`); + } +} + +function expectBoolean(arg) { + if (arg !== true && arg !== false) { + throw new Error(`Expected boolean argument but got: ${arg}`); + } +} + +function expectArrayOfStrings(arg) { + if (!Array.isArray(arg)) { + throw new Error(`Parameter should be an array of strings, but got: ${arg}`); + } + arg.forEach((x, idx) => { + if (typeof x !== "string") { + throw new Error( + `Parameter should be an array of string, but got: ${arg} (stopped at pos #${idx}: ${x})`, + ); + } + }); + return arg; +} + +/** + * A list of predefined string transformations that can be specified + * in the DSL in the "transforms" definition. + * + * Notes: + * - All transformations are stateless and must be free of side-effects. + * - If a single steps return "null", the following steps will + * not be executed. + * - The first argument is the current value (the accumulator), + * but extra parameters can be defined in the DSL; these will be + * passed to the function as additional arguments. + * + * Preventing remote code execution + * -------------------------------- + * + * The predefined functions need to be carefully checked. To illustrate + * the threat model, let us look at a constructed example first: + * + * badIdea: (x, param) => eval(param) + * + * Now, if an attacker compromises the servers and gets control to push + * malicious pattern updates, the function could be exploited: + * + * ["badIdea", ""]. + * + * Be careful not to introduce a function that allows an attack + * like that. That is why it is so important to keep the function free + * of side-effects! 
+ * + * ---------------------------------------------------------------------- + * + * Additional warnings: + * + * 1) Do not allow DoS (be careful when looping; if possible avoid any loops): + * + * As long as the functions are free of side-effects, the worst possible + * attack would be denial-of-service (in other words, someone could push a + * rule that results in an infinite loop). So, also be careful when using + * explicit loops - there should be no need for them anyway. + * Best keep the transformations simple. + * + * 2) Do not trust the parameters: + * + * Note that an attacker will be able to control the arguments passed + * into the function: + * - extra parameters are under direct control (as they are taken + * from the rule definitions) + * - the first parameter (the accumulator) is more difficult to + * control, but it is prudent to assume that it can + * be controlled as well (e.g., if a user can be tricked into visiting + * any website where the attacker can control text) + * + * As long as you avoid side-effects and loops, critical exploits + * are not possible, but again there are DoS-type attacks. + * + * For instance, if you are writing a rule with a parameter that will + * be used as a regular expression, be careful. What will happen if the + * attacker pushes a rule with a long regular expression that may lead + * to exponential backtracking? Think about these kinds of attacks and + * about mitigations (e.g. reject overly long parameters). + * Again, it is best to keep the functions simple to avoid any surprises. + * + * ---------------------------------------------------------------------- + * + * Error handling: + * + * 1) Throwing an exception is supported. In that case, expect the whole + * rule to be skipped (no message will be sent). In other words, reserve + * it for unexpected cases. + * 2) Returning "null"/"undefined" has the semantics of stopping the + * execution without an error.
It is still possible that a + * message will be sent, but with a missing value. + */ +const TRANSFORMS = new Map( + Object.entries({ + /** + * Extracts a given query parameter and decodes it. + * + * Example ["queryParam", "foo"]: + * - "https://example.test/path?foo=bar+baz" -> "bar baz" + * - "/example.test/path?foo=bar+baz" -> "bar baz" + * - "/example.test/path" -> null + * - "This is a string but not an URL" -> null + */ + queryParam: (x, queryParam) => { + expectString(x); + expectString(queryParam); + try { + // we only need the query parameter, but to handle relative + // URLs we have to pass a base URL (any domain will work) + return new URL(x, "http://x").searchParams.get(queryParam); + } catch (e) { + return null; + } + }, + + /** + * Given a URL and a list of query parameters, it returns an equivalent + * URL, but with those query parameters removed. + * + * Notes: + * - If the parameter occurs multiple times, all of them will be removed. + * - If the URL is invalid, null is returned. + * + * Example ["removeParams", ["foo"]]: + * - "https://example.test/path?foo=remove&bar=keep" -> "https://example.test/path?bar=keep" + * - "This is a string but not an URL" -> null + * - "/example.test/path" -> null (relative URLs are not supported) + * + * Example ["removeParams", ["foo", "bar"]]: + * - "https://example.test/path?foo=1&bar=2" -> "https://example.test/path" + * + * @since: 1 + */ + removeParams: (url, queryParams) => { + expectString(url); + expectArrayOfStrings(queryParams); + if (URL.canParse(url)) { + return removeQueryParams(url, queryParams); + } else { + return null; + } + }, + + /** + * Given text, it will verify that it is a well-formed URL; + * otherwise, it will end the processing by "nulling" it out. + * + * @since: 1 + */ + requireURL: (url) => { + expectString(url); + return URL.canParse(url) ? 
url : null; + }, + + /** + * Validates if the given value is in a predefined list of allowed + * values; otherwise, it will end the processing by "nulling" it out. + * + * @since: 2 + */ + filterExact: (text, allowedStrings) => { + expectString(text); + expectArrayOfStrings(allowedStrings); + return allowedStrings.includes(text) ? text : null; + }, + + /** + * Given a URL, it runs a set of extra checks to filter out + * parts that may be sensitive (i.e. keeping only the hostname), + * or even drop it completely. + */ + maskU: (x) => { + expectString(x); + try { + return sanitizeUrl(x).safeUrl; + } catch (e) { + return null; + } + }, + + split: (x, splitON, arrPos) => { + expectString(x); + expectString(splitON); + expectInteger(arrPos); + + const parts = x.split(splitON); + if (parts.length === 1) { + return null; + } + return parts[arrPos] ?? null; + }, + + trySplit: (x, splitON, arrPos) => { + expectString(x); + expectString(splitON); + expectInteger(arrPos); + + return x.split(splitON)[arrPos] || x; + }, + + decodeURIComponent: (x) => { + expectString(x); + try { + return decodeURIComponent(x); + } catch (e) { + return null; + } + }, + + tryDecodeURIComponent: (x) => { + expectString(x); + try { + return decodeURIComponent(x); + } catch (e) { + return x; + } + }, + + /** + * Takes a JSON string object, parses it and extract the data under the + * given path. By default, it will only extract safe types (strings, + * numbers, booleans), mostly to prevent accidentally extracting + * more than intended. 
+ */ + json: (x, path, extractObjects = false) => { + expectString(x); + expectString(path); + expectBoolean(extractObjects); + try { + let obj = JSON.parse(x); + for (const field of path.split(".")) { + obj = obj[field]; + } + if (typeof obj === "string") { + return obj; + } + if (typeof obj === "number" || typeof obj === "boolean") { + return obj.toString(); + } + if (extractObjects && obj) { + return JSON.stringify(obj); + } + // prevent uncontrolled text extraction + return ""; + } catch (e) { + return ""; + } + }, + }), +); + +export function lookupBuiltinTransform(name) { + const transform = TRANSFORMS.get(name); + if (transform) { + return transform; + } + throw new Error(`Unknown transformation: "${name}"`); +} + +/** + * Represents the currently active rules. + * + * It is updated by the PatternsUpdater, which polls + * the server for updates. + */ +export default class Patterns { + constructor() { + this._rules = {}; + } + + update(rules) { + logger.debug("Loaded patterns:", rules); + this._rules = rules; + } + + /** + * Grants access to the active patterns. It is guaranteed that the + * returned object will not be modified. + * + * If you plan to perform multiple operations, it is recommended + * to call this function one and then operate on this snapshot. + * Even though it is unlikely, patterns can change at any point + * in time. As long as you operate on the snapshot, you do not have + * to worry about it. + */ + getRulesSnapshot() { + return this._rules; + } + + typeExists(type) { + return type in this.getRulesSnapshot(); + } +} diff --git a/modules/web-discovery-project/sources/url-analyzer.es b/modules/web-discovery-project/sources/url-analyzer.es new file mode 100644 index 00000000..dd02de6d --- /dev/null +++ b/modules/web-discovery-project/sources/url-analyzer.es @@ -0,0 +1,157 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ + +import { extractHostname } from "../core/tlds"; +import { parse } from "../core/url"; +import logger from "./logger"; + +const URL_PATTERNS = [ + { + type: "search-goi", + regexp: + /^https:[/][/][^/]*[.]google[.].*?[#?&;]((q=[^&]+&([^&]+&)*tbm=isch)|(tbm=isch&([^&]+&)*q=[^&]+))/, + prefix: "search?tbm=isch&gbv=1&q=", + }, + { + type: "search-gov", + regexp: + /^https:[/][/][^/]*[.]google[.].*?[#?&;]((q=[^&]+&([^&]+&)*tbm=vid)|(tbm=vid&([^&]+&)*q=[^&]+))/, + prefix: "search?tbm=vid&gbv=1&q=", + }, + { + type: "search-go", + regexp: /^https:[/][/][^/]*[.]google[.].*?[#?&;]/, + prefix: "search?q=", + }, + { + type: "search-ya", + regexp: /^https:[/][/][^/]*[.]search[.]yahoo[.].*?[#?&;][pq]=[^$&]+/, + prefix: "search?q=", + queryFinder(parsedUrl) { + return parsedUrl.searchParams.get("q") || parsedUrl.searchParams.get("p"); + }, + }, + { + type: "search-bii", + regexp: /^https:[/][/][^/]*[.]bing[.][^/]+[/]images[/]search[?]q=[^$&]+/, + prefix: "images/search?q=", + }, + { + type: "search-bi", + regexp: /^https:[/][/][^/]*[.]bing[.].*?[#?&;]q=[^$&]+/, + prefix: "search?q=", + }, + { + type: "search-am", + regexp: + /^https:[/][/][^/]*[.]amazon[.][^/]+[/](s[?]k=[^$&]+|.*[?&]field-keywords=[^$&]+)/, + prefix: "s/?field-keywords=", + queryFinder(parsedUrl) { + return ( + parsedUrl.searchParams.get("field-keywords") || + parsedUrl.searchParams.get("k") + ); + }, + }, + { + type: "amp", + regexp: + /^https:[/][/][^/]*[.]amazon[.][^/]+[/]([/]dp[/]|[/]gp[/]product[/])/, + queryFinder(parsedUrl) { + return parsedUrl.searchParams.get("keywords"); + }, + }, + { + type: "search-dd", + regexp: + /^https:[/][/]duckduckgo.com[/](?:html$|.*[?&]q=[^&]+.*&ia=web|[?]q=[^&]+$)/, + prefix: "?q=", + }, + { + type: "li", + regexp: /^https:[/][/][^/]*linkedin[.][^/]+[/]pub[/]dir+/, + }, +]; +const SEARCH_ENGINE_TYPES = new Set([ + "search-goi", + "search-gov", + "search-go", 
+ "search-ya", + "search-bii", + "search-bi", + "search-dd", +]); + +export default class UrlAnalyzer { + constructor(patterns) { + this.patterns = patterns; + this._urlPatterns = URL_PATTERNS; + } + + parseLinks(url) { + for (const { + type, + regexp, + queryFinder = (parsedUrl) => parsedUrl.searchParams.get("q"), + } of this._urlPatterns) { + if (regexp.test(url)) { + // Workaround for an encoding issue (source: https://stackoverflow.com/a/24417399/783510). + // Reason: we want to preserve the original search term. In other words, searches + // for "abc def" and "abc+def" should be distinguishable. That is why we need to + // avoid the ambigious '+' character and use explicit white space encoding. + const url_ = url.replaceAll("+", "%20"); + const parsedUrl = parse(url_); + + const query = queryFinder(parsedUrl); + if (!query) { + return { found: false }; + } + if (!this.patterns.typeExists(type)) { + logger.debug( + "Matching rule for", + url, + "skipped (no matching server side rules exist)", + ); + return { found: false }; + } + return { found: true, type, query }; + } + } + + return { found: false }; + } + + isSearchEngineUrl(url) { + const { found, type } = this.parseLinks(url); + if (!found) return false; + return SEARCH_ENGINE_TYPES.has(type); + } + + tryExtractBraveSerpQuery(url) { + const isBraveSearch = + url.startsWith("https://search.brave.com/search?") || + url.startsWith("https://bravesearch.com/search?") || + url.startsWith("https://search.brave.software/search?"); + const parsedUrl = parse(url); + return isBraveSearch && parsedUrl.searchParams.get("q"); + } + + checkAnonSearchURL(url, query) { + const { found, type } = this.parseLinks(url); + if (!found) return { isSearchEngineUrl: false, queryUrl: null }; + const isSearchEngineUrl = SEARCH_ENGINE_TYPES.has(type); + const urlPattern = URL_PATTERNS.find((p) => p.type == type); + const queryPrefix = urlPattern.prefix; + if (!queryPrefix) { + logger.debug( + `URL pattern with type 
'${urlPattern.type}' has no query prefix`, + ); + return { isSearchEngineUrl: false, queryUrl: null }; + } + const encodedQuery = encodeURIComponent(query).replace(/%20/g, "+"); + const hostname = extractHostname(url); + const queryUrl = `https://${hostname}/${queryPrefix}${encodedQuery}`; + return { isSearchEngineUrl, queryUrl }; + } +} diff --git a/modules/web-discovery-project/sources/web-discovery-project.es b/modules/web-discovery-project/sources/web-discovery-project.es index 1993b538..94e6bb51 100644 --- a/modules/web-discovery-project/sources/web-discovery-project.es +++ b/modules/web-discovery-project/sources/web-discovery-project.es @@ -23,6 +23,7 @@ import { parseURL, Network } from "./network"; import prefs from "../core/prefs"; import pacemaker from "../core/services/pacemaker"; import SafebrowsingEndpoint from "./safebrowsing-endpoint"; +import Patterns from "./patterns"; /* Configuration for Bloomfilter @@ -32,12 +33,6 @@ const bloomFilterSize = 500001; // false-positive 0.01, hashes 7 const bloomFilterNHashes = 7; const allowedCountryCodes = config.settings.ALLOWED_COUNTRY_CODES; -function _log(...msg) { - if (WebDiscoveryProject.debug) { - logger.log(WebDiscoveryProject.LOG_KEY, ...msg); - } -} - function getRandomIntInclusive(min, max) { const _min = Math.ceil(min); const _max = Math.floor(max); @@ -81,7 +76,6 @@ const WebDiscoveryProject = { WAIT_TIME: 2000, PAGE_WAIT_TIME: 5000, LOG_KEY: "wdp", - debug: false, utility_regression_tests: false, httpCache: {}, httpCache401: {}, @@ -96,6 +90,8 @@ const WebDiscoveryProject = { deadFiveMts: 5, deadTwentyMts: 20, msgType: "wdp", + patterns: new Patterns(), + _patternsLastUpdated: null, probHashLogM: [ [ -1.839225984234144, -1.8009413231413045, -2.5864601561900273, @@ -1595,11 +1591,13 @@ const WebDiscoveryProject = { config.settings.ENDPOINT_PATTERNS, (content) => { try { - const { normal, strict } = JSON.parse(content); - logger.debug("Got new patterns", { normal, strict }); - 
WebDiscoveryProject.contentExtractor.updatePatterns(normal, "normal"); - WebDiscoveryProject.contentExtractor.updatePatterns(strict, "strict"); - logger.info("WebDiscoveryProject patterns successfully updated"); + const rules = JSON.parse(content); + logger.debug("Got new patterns", rules); + WebDiscoveryProject.patterns.update(rules); + WebDiscoveryProject._patternsLastUpdated = new Date(); + logger.info( + `WebDiscoveryProject patterns successfully updated at ${WebDiscoveryProject._patternsLastUpdated}`, + ); } catch (e) { logger.warn("Failed to apply new WebDiscoveryProject patterns", e); } @@ -1675,7 +1673,7 @@ const WebDiscoveryProject = { }, maskURL(url) { if (WebDiscoveryProject.urlLeaksExtensionId(url)) { - _log("Dropping URL with extension id:", url); + logger.debug("Dropping URL with extension id:", url); return ""; } var url_parts = null; @@ -1708,7 +1706,7 @@ const WebDiscoveryProject = { }, maskURLStrict(url) { if (WebDiscoveryProject.urlLeaksExtensionId(url)) { - _log("Dropping URL with extension id:", url); + logger.debug("Dropping URL with extension id:", url); return ""; } var url_parts = null; @@ -1786,7 +1784,7 @@ const WebDiscoveryProject = { if (!url_parts) return true; - _log(JSON.stringify(url_parts)); + logger.debug(JSON.stringify(url_parts)); if (aURI.indexOf("about:") == 0) return true; if (isIpAddress(url_parts.hostname)) { @@ -1817,10 +1815,12 @@ const WebDiscoveryProject = { if (pos_hash_char > -1) { if ( - !WebDiscoveryProject.contentExtractor.isSearchEngineUrl(aURI) && + !WebDiscoveryProject.contentExtractor.urlAnalyzer.isSearchEngineUrl( + aURI, + ) && aURI.length - pos_hash_char >= 10 ) { - _log("Dropped because of # in url: " + decodeURIComponent(aURI)); + logger.debug("Dropped because of # in url: " + decodeURIComponent(aURI)); return true; } } @@ -1841,12 +1841,12 @@ const WebDiscoveryProject = { return true; } - _log("Sanitize: URL is ok: " + aURI); + logger.debug("Sanitize: URL is ok: " + aURI); return false; } catch (ee) { // 
if there were any exception, we return true for safety - _log("Exception in isSuspiciousURL: " + ee); + logger.debug("Exception in isSuspiciousURL: " + ee); return true; } }, @@ -1882,7 +1882,7 @@ const WebDiscoveryProject = { return strict_value; }, dropLongURL: function (url, options) { - _log("DLU called with arguments:", url, options); + logger.debug("DLU called with arguments:", url, options); try { if (options == null) options = { @@ -1930,7 +1930,7 @@ const WebDiscoveryProject = { url_parts.query_string && url_parts.query_string.length > WebDiscoveryProject.qs_len ) { - _log("DLU failed: length of query string is longer than qs_len"); + logger.debug("DLU failed: length of query string is longer than qs_len"); return true; } @@ -1938,7 +1938,7 @@ const WebDiscoveryProject = { var v = url_parts.query_string.split(/[&;]/); if (v.length > 4) { // that means that there is a least one &; hence 5 params - _log("DLU failed: there are more than 4 parameters"); + logger.debug("DLU failed: there are more than 4 parameters"); return true; } if ( @@ -1948,7 +1948,7 @@ const WebDiscoveryProject = { 12, ) != null ) { - _log( + logger.debug( "DLU failed: long number in the query string: ", url_parts.query_string, ); @@ -1960,7 +1960,7 @@ const WebDiscoveryProject = { !options.allowlisted && WebDiscoveryProject.checkForLongNumber(url_parts.path, 12) != null ) { - _log("DLU failed: long number in path: ", url_parts.path); + logger.debug("DLU failed: long number in path: ", url_parts.path); return true; } } @@ -1977,7 +1977,7 @@ const WebDiscoveryProject = { return true; } else { if (vpath[i].length > 12 && WebDiscoveryProject.isHash(vpath[i])) { - _log("DLU failed: hash in the URL ", vpath[i]); + logger.debug("DLU failed: hash in the URL ", vpath[i]); return true; } } @@ -1990,7 +1990,7 @@ const WebDiscoveryProject = { if (options.strict == true) mult = 0.5; if (cstr.length > WebDiscoveryProject.rel_segment_len * mult) { if (WebDiscoveryProject.isHash(cstr)) { - _log("DLU 
failed: hash in the path ", cstr); + logger.debug("DLU failed: hash in the path ", cstr); return true; } } @@ -2057,7 +2057,7 @@ const WebDiscoveryProject = { if (url_parts.query_string && url_parts.query_string.length > 0) { for (var i = 0; i < v.length; i++) if (v[i].test("?" + url_parts.query_string)) { - _log("Prohibited keyword found: ", url_parts.query_string); + logger.debug("Prohibited keyword found: ", url_parts.query_string); return true; } } @@ -2065,7 +2065,7 @@ const WebDiscoveryProject = { if (path_query_string && path_query_string.length > 0) { for (var i = 0; i < v.length; i++) if (v[i].test(path_query_string)) { - _log("Prohibited keyword found: ", path_query_string); + logger.debug("Prohibited keyword found: ", path_query_string); return true; } } @@ -2074,7 +2074,7 @@ const WebDiscoveryProject = { return false; } catch (ee) { // if there were any exception, we return true for safety - _log("Exception in dropLongURL: " + ee); + logger.debug("Exception in dropLongURL: " + ee); return true; } }, @@ -2192,7 +2192,7 @@ const WebDiscoveryProject = { }; } } catch (e) { - _log(e); + logger.debug(e); } }, }, @@ -2259,7 +2259,7 @@ const WebDiscoveryProject = { } } } catch (ee) { - _log(">>>> REDIRECT ERROR >>> " + ee); + logger.debug(">>>> REDIRECT ERROR >>> " + ee); } return res; }, @@ -2308,7 +2308,7 @@ const WebDiscoveryProject = { onsuccess(url, page_data, original_url, x); }) .catch((error_message) => { - _log(`Error on doublefetch: ${error_message}`); + logger.debug(`Error on doublefetch: ${error_message}`); onerror(url, page_data, original_url, error_message); }); }, @@ -2343,25 +2343,25 @@ const WebDiscoveryProject = { // compares the structure of the page when rendered in the browser with the structure of // the page after. 
- _log("xbef: " + JSON.stringify(struct_bef)); - _log("xaft: " + JSON.stringify(struct_aft)); + logger.debug("xbef: " + JSON.stringify(struct_bef)); + logger.debug("xaft: " + JSON.stringify(struct_aft)); // Check if struct_bef or struct_aft is not null, in case anyone is then we mark it as private. // if any of the titles is null (false), then decline (discard) if (!(struct_bef && struct_aft)) { - _log("fovalidDoubleFetch: found an empty structure"); + logger.debug("fovalidDoubleFetch: found an empty structure"); return false; } if (!(struct_bef["t"] && struct_aft["t"])) { - _log("fovalidDoubleFetch: found an empty title"); + logger.debug("fovalidDoubleFetch: found an empty title"); return false; } // if any of the two struct has a iall to false decline if (!(struct_bef["iall"] && struct_aft["iall"])) { - _log("fovalidDoubleFetch: found a noindex"); + logger.debug("fovalidDoubleFetch: found a noindex"); return false; } @@ -2374,12 +2374,12 @@ const WebDiscoveryProject = { /* Adding key to check how many pages will we loose if frame check is turned on if (struct_bef['nfsh']==null || struct_aft['nfsh']==null || struct_bef['nfsh']!=struct_aft['nfsh']) { - _log("fovalidDoubleFetch: number of internal frames does not match"); + logger.debug("fovalidDoubleFetch: number of internal frames does not match"); return false; } if (struct_bef['nifsh']==null || struct_aft['nifsh']==null || struct_bef['nifsh']!=struct_aft['nifsh']) { - _log("fovalidDoubleFetch: number of internal iframes does not match"); + logger.debug("fovalidDoubleFetch: number of internal iframes does not match"); return false; } */ @@ -2412,7 +2412,7 @@ const WebDiscoveryProject = { (struct_bef["lh"] || 0) / ((struct_bef["lh"] || 0) + (struct_aft["lh"] || 0)); if (ratio_lh < 0.1 || ratio_lh > 0.9) { - _log("fovalidDoubleFetch: lh is not balanced"); + logger.debug("fovalidDoubleFetch: lh is not balanced"); length_html_ok = false; } } @@ -2426,7 +2426,7 @@ const WebDiscoveryProject = { (struct_bef["nl"] || 0) 
/ ((struct_bef["nl"] || 0) + (struct_aft["nl"] || 0)); if (ratio_nl < 0.1 || ratio_nl > 0.9) { - _log("fovalidDoubleFetch: nl is not balanced"); + logger.debug("fovalidDoubleFetch: nl is not balanced"); length_text_ok = false; } } @@ -2445,7 +2445,7 @@ const WebDiscoveryProject = { struct_aft["nip"] == null || (struct_bef["nip"] == 0 && struct_aft["nip"] > 0) ) { - _log("validDoubleFetch: fail nip"); + logger.debug("validDoubleFetch: fail nip"); return false; } @@ -2455,7 +2455,7 @@ const WebDiscoveryProject = { struct_aft["nf"] == null || (struct_bef["nf"] == 0 && struct_aft["nf"] > 0) ) { - _log("validDoubleFetch: fail text nf"); + logger.debug("validDoubleFetch: fail text nf"); return false; } } @@ -2481,7 +2481,7 @@ const WebDiscoveryProject = { // the longest titles is 4 tokens long, the, we are a bit flexible on title differences if (jc >= 0.5) return true; else { - _log("short title fail title overlap"); + logger.debug("short title fail title overlap"); return false; } } else { @@ -2511,11 +2511,11 @@ const WebDiscoveryProject = { WebDiscoveryProject.auxUnion(vtt1, vtt2).length; // we are more demanding on the title overlap now if (jc <= 0.8) { - _log("validDoubleFetch: fail title overlap after ascii"); + logger.debug("validDoubleFetch: fail title overlap after ascii"); return false; } } else { - _log("validDoubleFetch: fail title overlap"); + logger.debug("validDoubleFetch: fail title overlap"); return false; } } @@ -2530,7 +2530,7 @@ const WebDiscoveryProject = { struct_aft["nip"] == null || (struct_bef["nip"] == 0 && struct_aft["nip"] > 0) ) { - _log("validDoubleFetch: fail nip"); + logger.debug("validDoubleFetch: fail nip"); return false; } @@ -2540,7 +2540,7 @@ const WebDiscoveryProject = { struct_aft["nf"] == null || (struct_bef["nf"] == 0 && struct_aft["nf"] > 0) ) { - _log("validDoubleFetch: fail text nf"); + logger.debug("validDoubleFetch: fail text nf"); return false; } @@ -2551,7 +2551,7 @@ const WebDiscoveryProject = { return true; } - 
_log("validDoubleFetch: default option"); + logger.debug("validDoubleFetch: default option"); return false; }, @@ -2600,7 +2600,7 @@ const WebDiscoveryProject = { } else return url; }, fetchReferral: function (referral_url, callback) { - _log("PPP in fetchReferral: " + referral_url); + logger.debug("PPP in fetchReferral: " + referral_url); if (referral_url && referral_url != "") { if (WebDiscoveryProject.docCache[referral_url] == null) { @@ -2609,20 +2609,20 @@ const WebDiscoveryProject = { null, null, function (referral_url) { - _log( + logger.debug( "PPP in fetchReferral success auxGetPageData: " + referral_url, ); callback(); }, function (referral_url) { - _log( + logger.debug( "PPP in fetchReferral failure auxGetPageData: " + referral_url, ); callback(); }, ); } else { - _log("PPP in fetchReferral already in docCache: " + referral_url); + logger.debug("PPP in fetchReferral already in docCache: " + referral_url); callback(); } } else callback(); @@ -2645,7 +2645,7 @@ const WebDiscoveryProject = { if (page_doc == null || page_doc["x"] == null) { // this should not happen, but it does. 
Need to debug why the 'x' field gets lost // right now, let's set is a private to avoid any risk - _log("page_doc.x missing for url:", url); + logger.debug("page_doc.x missing for url:", url); return discard("page_doc.x missing"); } @@ -2666,7 +2666,7 @@ const WebDiscoveryProject = { if (cUrl) { if (!allowlisted && WebDiscoveryProject.dropLongURL(cUrl)) { // oops, the canonical is also bad, therefore mark as private - _log(`both URL=${url} and canonical_url=${cUrl} are too long`); + logger.debug(`both URL=${url} and canonical_url=${cUrl} are too long`); return discard(`both URL and canonical_url are too long`); } // proceed, as we are in the good scenario in which the canonical @@ -2695,14 +2695,14 @@ const WebDiscoveryProject = { await WebDiscoveryProject.telemetry(result.msgCandidate); } } catch (e) { - _log("Unexpected error during doublefetch", e); + logger.debug("Unexpected error during doublefetch", e); } }, _doubleFetch(url, page_doc) { return new Promise((resolve, _) => { function privateUrlFound(url, explanation) { - _log( + logger.debug( "The URL", url, "failed one of the doublefetch heuristics. 
Details:", @@ -2715,7 +2715,7 @@ const WebDiscoveryProject = { }); } function publicUrlFound(url, page_doc) { - _log("The URL", url, "passed all doublefetch heuristics"); + logger.debug("The URL", url, "passed all doublefetch heuristics"); resolve({ url, isPrivate: false, @@ -2741,7 +2741,7 @@ const WebDiscoveryProject = { return; } - _log("going to double-fetch:", url); + logger.debug("going to double-fetch:", url); WebDiscoveryProject.auxGetPageData( url, page_doc, @@ -2749,7 +2749,7 @@ const WebDiscoveryProject = { function (url, page_doc, original_url, data) { // data contains the public data of the url double-fetch, - _log("success on doubleFetch, need further validation", url); + logger.debug("success on doubleFetch, need further validation", url); if ( WebDiscoveryProject.validDoubleFetch(page_doc["x"], data, { @@ -2796,11 +2796,11 @@ const WebDiscoveryProject = { if (page_doc["ref"] && page_doc["ref"] != "") { // the page has a referral - _log( + logger.debug( "PPP: page has a referral, " + url + " < " + page_doc["ref"], ); var hasurl = WebDiscoveryProject.hasURL(page_doc["ref"], url); - _log( + logger.debug( "PPP: page has a referral, " + url + " < " + @@ -2810,7 +2810,7 @@ const WebDiscoveryProject = { ); // overwrite strict value because the link exists on a public fetchable page - _log( + logger.debug( "Strictness values:", url_strict_value, structure_strict_value, @@ -2821,13 +2821,13 @@ const WebDiscoveryProject = { } } else { // page has no referral - _log("PPP: page has NO referral,", url); + logger.debug("PPP: page has NO referral,", url); // we do not know the origin of the page, run the dropLongURL strict version, if // there is no canonical or if there is canonical and is the same as the url, } - _log( + logger.debug( "strict URL:", url, "> struct:", @@ -2885,7 +2885,7 @@ const WebDiscoveryProject = { ); return; } - _log("success on doubleFetch, need further validation"); + logger.debug("success on doubleFetch, need further validation"); // // 
we need to modify the 'x' field of page_doc to substitute any structural information about @@ -2960,14 +2960,14 @@ const WebDiscoveryProject = { // because it would be cleaner hence safer // - _log("going to clean_url double-fetch:", clean_url); + logger.debug("going to clean_url double-fetch:", clean_url); WebDiscoveryProject.auxGetPageData( clean_url, page_doc, first_url_double_fetched, function (url, page_doc, original_url, data) { - _log( + logger.debug( "success on clean_url doubleFetch, need further validation", ); @@ -2992,10 +2992,8 @@ const WebDiscoveryProject = { // in such a case, it's safer to assume that the fragments cleaned were identifiying a user, and the // website is redirecting to the login page, in such a case, we should not send the page at all, in fact, we // should mark it as private just to be sure, - if (WebDiscoveryProject.debug) { - _log("checking clean_url, page_doc:", page_doc); - _log("checking clean_url, data: ", data); - } + logger.debug("checking clean_url, page_doc:", page_doc); + logger.debug("checking clean_url, data: ", data); if (page_doc["x"]["nip"] < data["nip"]) { // the page with url_clean have more input password fields or more forms, this is dangerous, @@ -3012,7 +3010,7 @@ const WebDiscoveryProject = { function (url, page_doc, original_url, error_message) { // there was a failure, the clean_url does not go to the same place, therefore it's better // not to replace - _log( + logger.debug( "failure on clean_url doubleFetch! structure did not match", ); publicUrlFound(original_url, page_doc); @@ -3030,7 +3028,7 @@ const WebDiscoveryProject = { } }, function (url, page_doc, original_url, error_message) { - _log("failure on doubleFetch!", error_message); + logger.debug("failure on doubleFetch!", error_message); privateUrlFound( url, `rejected as doublefetch failed with an error ${error_message}`, @@ -3048,7 +3046,7 @@ const WebDiscoveryProject = { if (!cd) { // fetch the content of the source_url, // - _log("hasURL no CD!!! 
"); + logger.debug("hasURL no CD!!! "); return false; } } else return false; @@ -3095,7 +3093,7 @@ const WebDiscoveryProject = { return found; } catch (ee) { - _log("Error on hasURL: " + ee); + logger.debug("Error on hasURL: " + ee); return false; } }, @@ -3233,7 +3231,7 @@ const WebDiscoveryProject = { } } } catch (ee) { - _log("no-index check failed " + ee); + logger.debug("no-index check failed " + ee); } // extract the canonical url if available @@ -3360,7 +3358,7 @@ const WebDiscoveryProject = { if (activeURL.indexOf("about:") != 0) { if (WebDiscoveryProject.state["v"][activeURL] == null) { const braveQuery = - WebDiscoveryProject.contentExtractor.tryExtractBraveSerpQuery( + WebDiscoveryProject.contentExtractor.urlAnalyzer.tryExtractBraveSerpQuery( activeURL, ); logger.debug("[onLocationChange] isBraveQuery", braveQuery); @@ -3371,27 +3369,21 @@ const WebDiscoveryProject = { t: "br", }; } else if ( - WebDiscoveryProject.contentExtractor.isSearchEngineUrl(activeURL) + WebDiscoveryProject.contentExtractor.urlAnalyzer.isSearchEngineUrl( + activeURL, + ) ) { logger.debug("[onLocationChange] isSearchEngineUrl", activeURL); pacemaker.setTimeout( - function (url, originalURL) { + function (url) { if (!WebDiscoveryProject) { return; } - getContentDocument(originalURL) - .then((doc) => { - WebDiscoveryProject.checkURL(doc, url, "normal"); - }) - .catch((e) => { - logger.info( - `Failed to get content for originalURL=${originalURL} (internalURL=${url}, details=${e})`, - ); - }); + const query = WebDiscoveryProject.contentExtractor.extractQuery(url); + if (query) WebDiscoveryProject.addStrictQueries(url, query) }, WebDiscoveryProject.WAIT_TIME, activeURL, - originalURL, ); } @@ -3512,14 +3504,11 @@ const WebDiscoveryProject = { .then( function (cd) { if ( - !WebDiscoveryProject.contentExtractor.isSearchEngineUrl( + !WebDiscoveryProject.contentExtractor.urlAnalyzer.isSearchEngineUrl( currURL, ) ) { - try { - WebDiscoveryProject.checkURL(cd, currURL, "normal"); - } catch 
(e) {} - //Check active usage... + // Check active usage... // WebDiscoveryProject.activeUsage += 1; WebDiscoveryProject.incrActiveUsage(); } @@ -3552,13 +3541,11 @@ const WebDiscoveryProject = { } }, function () { - if (WebDiscoveryProject.debug) { - _log("CANNOT GET THE CONTENT OF : " + currURL); - } + logger.debug("CANNOT GET THE CONTENT OF : " + currURL); }, ) .catch((ee) => { - _log( + logger.debug( "Error fetching title and length of page: " + ee + " : " + @@ -3618,7 +3605,7 @@ const WebDiscoveryProject = { } }) .catch((e) => { - _log("Error fetching fetching the currentURL: " + e); + logger.debug("Error fetching the currentURL: " + e); }); WebDiscoveryProject.counter += 4; @@ -3681,15 +3668,15 @@ const WebDiscoveryProject = { ); delete WebDiscoveryProject.state["v"][url]; delete WebDiscoveryProject.queryCache[url]; - //_log("Deleted: moved to dead pages after 20 mts."); - //_log("Deleted: moved to dead pages after 20 mts: " + WebDiscoveryProject.state['m'].length); + //logger.debug("Deleted: moved to dead pages after 20 mts."); + //logger.debug("Deleted: moved to dead pages after 20 mts: " + WebDiscoveryProject.state['m'].length); } } } } }) .catch((ee) => { - _log(ee); + logger.debug(ee); }); }, { timeout: 5000 }, @@ -3786,9 +3773,7 @@ const WebDiscoveryProject = { tasks.push( pacemaker.register( function checkActiveUsage() { - if (WebDiscoveryProject.debug) { - _log("Check if alive"); - } + logger.debug("Check if alive"); WebDiscoveryProject.checkActiveUsage(); }, { timeout: 20 * 60 * 1000 }, @@ -3849,9 +3834,7 @@ const WebDiscoveryProject = { (WebDiscoveryProject.lastEv["keypresspage"] | 0) > 1 * WebDiscoveryProject.tmult ) { - if (WebDiscoveryProject.debug) { - //_log('captureKeyPressPage'); - } + //logger.debug('captureKeyPressPage'); WebDiscoveryProject.lastEv["keypresspage"] = WebDiscoveryProject.counter; WebDiscoveryProject.lastActive = WebDiscoveryProject.counter; var activeURL = WebDiscoveryProject.cleanCurrentUrl(ev.target.baseURI); @@ -3870,9 
+3853,7 @@ const WebDiscoveryProject = { (WebDiscoveryProject.lastEv["mousemovepage"] | 0) > 1 * WebDiscoveryProject.tmult ) { - if (WebDiscoveryProject.debug) { - _log("captureMouseMovePage"); - } + logger.debug("captureMouseMovePage"); WebDiscoveryProject.lastEv["mousemovepage"] = WebDiscoveryProject.counter; WebDiscoveryProject.lastActive = WebDiscoveryProject.counter; var activeURL = WebDiscoveryProject.cleanCurrentUrl(ev.target.baseURI); @@ -3886,7 +3867,7 @@ const WebDiscoveryProject = { } }, getURLFromEvent: function (ev) { - _log(">>>> Get url from event >>> " + ev.target.href); + logger.debug(">>>> Get url from event >>> " + ev.target.href); try { if (ev.target.href != null || ev.target.href != undefined) { return decodeURIComponent("" + ev.target.href); @@ -3899,9 +3880,7 @@ const WebDiscoveryProject = { } } } catch (ee) { - if (WebDiscoveryProject.debug) { - _log("Error in getURLFromEvent: " + ee); - } + logger.debug("Error in getURLFromEvent: " + ee); } return null; }, @@ -3913,7 +3892,7 @@ const WebDiscoveryProject = { var targetURL = WebDiscoveryProject.getURLFromEvent(ev) || href; - _log("captureMouseClickPage>> " + targetURL); + logger.debug("captureMouseClickPage>> " + targetURL); if (contextHTML) { WebDiscoveryProject.contextFromEvent = { html: contextHTML, @@ -3929,25 +3908,23 @@ const WebDiscoveryProject = { // Need to improve. 
var activeURL = WebDiscoveryProject.cleanCurrentUrl(ev.target.baseURI); - if (WebDiscoveryProject.debug) { - _log( - "captureMouseClickPage>> " + - WebDiscoveryProject.counter + - " " + - targetURL + - " : " + - " active: " + - activeURL + - " " + - (WebDiscoveryProject.state["v"][activeURL] != null) + - " " + - ev.target + - " :: " + - ev.target.value + - " >>" + - JSON.stringify(WebDiscoveryProject.lastEv), - ); - } + logger.debug( + "captureMouseClickPage>> " + + WebDiscoveryProject.counter + + " " + + targetURL + + " : " + + " active: " + + activeURL + + " " + + (WebDiscoveryProject.state["v"][activeURL] != null) + + " " + + ev.target + + " :: " + + ev.target.value + + " >>" + + JSON.stringify(WebDiscoveryProject.lastEv), + ); if (WebDiscoveryProject.state["v"][activeURL] != null) { WebDiscoveryProject.linkCache[targetURL] = { @@ -3993,9 +3970,7 @@ const WebDiscoveryProject = { (WebDiscoveryProject.lastEv["mouseclickpage"] | 0) > 1 * WebDiscoveryProject.tmult ) { - if (WebDiscoveryProject.debug) { - _log("captureMouseClickPage"); - } + logger.debug("captureMouseClickPage"); WebDiscoveryProject.lastEv["mouseclickpage"] = WebDiscoveryProject.counter; WebDiscoveryProject.lastActive = WebDiscoveryProject.counter; @@ -4015,9 +3990,7 @@ const WebDiscoveryProject = { (WebDiscoveryProject.lastEv["scrollpage"] | 0) > 1 * WebDiscoveryProject.tmult ) { - if (WebDiscoveryProject.debug) { - _log("captureScrollPage "); - } + logger.debug("captureScrollPage "); WebDiscoveryProject.lastEv["scrollpage"] = WebDiscoveryProject.counter; WebDiscoveryProject.lastActive = WebDiscoveryProject.counter; @@ -4037,9 +4010,7 @@ const WebDiscoveryProject = { (WebDiscoveryProject.lastEv["copypage"] | 0) > 1 * WebDiscoveryProject.tmult ) { - if (WebDiscoveryProject.debug) { - _log("captureCopyPage"); - } + logger.debug("captureCopyPage"); WebDiscoveryProject.lastEv["copypage"] = WebDiscoveryProject.counter; WebDiscoveryProject.lastActive = WebDiscoveryProject.counter; var activeURL = 
WebDiscoveryProject.cleanCurrentUrl(ev.target.baseURI); @@ -4070,19 +4041,19 @@ const WebDiscoveryProject = { init: function () { return Promise.resolve().then(() => { - _log("Init function called:"); - WebDiscoveryProject.log = _log; + logger.debug("Init function called:"); + WebDiscoveryProject.log = logger.debug; return Promise.resolve() .then(() => { if (WebDiscoveryProject.db) { - _log("Closing database connections..."); + logger.debug("Closing database connections..."); return WebDiscoveryProject.db .asyncClose() .then(() => { WebDiscoveryProject.db = undefined; - _log("Closing database connections...done"); + logger.debug("Closing database connections...done"); }) - .catch((e) => _log(e)); + .catch((e) => logger.debug(e)); } else { return Promise.resolve(); } @@ -4091,7 +4062,7 @@ const WebDiscoveryProject = { const db = new Storage(WebDiscoveryProject); return db.init().then(() => { WebDiscoveryProject.db = db; - _log("Successfully connected to database"); + logger.debug("Successfully connected to database"); }); }) .then(() => { @@ -4136,7 +4107,7 @@ const WebDiscoveryProject = { // Means we have never sent the signal. WebDiscoveryProject.saveActiveUsageTime(); } else { - _log(`Active usage last sent from db as ${data}`); + logger.debug(`Active usage last sent from db as ${data}`); WebDiscoveryProject.activeUsageLastSent = parseInt(data); } }, @@ -4176,7 +4147,7 @@ const WebDiscoveryProject = { if (msg.action == "page") { if (msg.payload.tend && msg.payload.tin) { var duration = msg.payload.tend - msg.payload.tin; - _log( + logger.debug( "Duration spent: " + msg.payload.tend + " : " + @@ -4186,7 +4157,7 @@ const WebDiscoveryProject = { ); } else { var duration = null; - _log( + logger.debug( "Duration spent: " + msg.payload.tend + " : " + @@ -4227,11 +4198,11 @@ const WebDiscoveryProject = { // Check for title. 
if (msg.payload.x.t) { if (WebDiscoveryProject.isSuspiciousTitle(msg.payload.x.t)) { - _log("Suspicious Title: " + msg.payload.x.t); + logger.debug("Suspicious Title: " + msg.payload.x.t); return null; } } else { - _log("Missing Title: " + msg.payload.x.t); + logger.debug("Missing Title: " + msg.payload.x.t); return null; } @@ -4252,7 +4223,7 @@ const WebDiscoveryProject = { // the canonical exists and is ok msg.payload.url = canonical_url; } else { - _log("Suspicious url with no/bad canonical: " + msg.payload.url); + logger.debug("Suspicious url with no/bad canonical: " + msg.payload.url); return null; } } else { @@ -4307,10 +4278,10 @@ const WebDiscoveryProject = { // Check for canonical seen or not. if (msg.payload["x"]["canonical_url"]) { if (msg.payload["url"] == msg.payload["x"]["canonical_url"]) { - _log("Canoncial is same: "); + logger.debug("Canoncial is same: "); // canonicalSeen = WebDiscoveryProject.canoincalUrlSeen(msg.payload['x']['canonical_url']); if (msg.payload["csb"] && msg.payload["ft"]) { - _log("Canoncial seen before: "); + logger.debug("Canoncial seen before: "); delete msg.payload.csb; delete msg.payload.ft; } @@ -4397,7 +4368,7 @@ const WebDiscoveryProject = { // there are billions of results, only few of them are on the first page. // That it where we currently set the threshold. 
if (cleanR.length < 4) { - _log( + logger.debug( `Dropping message for query ${msg.payload.q}, as there are too few search results.`, ); return null; @@ -4406,8 +4377,8 @@ const WebDiscoveryProject = { newR[idx] = each; }); - _log("Original: " + JSON.stringify(msg.payload.r)); - _log("New: " + JSON.stringify(newR)); + logger.debug("Original: " + JSON.stringify(msg.payload.r)); + logger.debug("New: " + JSON.stringify(newR)); msg.payload.r = newR; } } @@ -4430,7 +4401,7 @@ const WebDiscoveryProject = { payload: payload, }); } else { - _log("Not a valid object, not sent to notification"); + logger.debug("Not a valid object, not sent to notification"); } }, @@ -4440,7 +4411,7 @@ const WebDiscoveryProject = { await WebDiscoveryProject.runAllMessageSanitizers(msg); if (accepted) { - _log( + logger.debug( "all checks passed: telemetry message added to the send queue:", msg, ); @@ -4466,7 +4437,7 @@ const WebDiscoveryProject = { ); }); } else { - _log("telemetry message has been discarded:", rejectDetails, msg); + logger.debug("telemetry message has been discarded:", rejectDetails, msg); } }, @@ -4507,7 +4478,7 @@ const WebDiscoveryProject = { try { msg = WebDiscoveryProject.sanitizePageMessageUrls(msg); } catch (e) { - _log("Error while sanitizing urls of page message", e, msg); + logger.debug("Error while sanitizing urls of page message", e, msg); return discard("failed safe quorum check for other urls"); } @@ -4631,7 +4602,11 @@ const WebDiscoveryProject = { let state; let comment; if (isPrivate) { - if (WebDiscoveryProject.contentExtractor.isSearchEngineUrl(url)) { + if ( + WebDiscoveryProject.contentExtractor.urlAnalyzer.isSearchEngineUrl( + url, + ) + ) { state = "search"; comment = 'search pages never generate "page" messages'; } else { @@ -4696,42 +4671,41 @@ const WebDiscoveryProject = { else return null; }, - checkURL(pageContent, url, ruleset) { - return WebDiscoveryProject.contentExtractor.checkURL( + checkURL(pageContent, url) { + const { messages } = 
WebDiscoveryProject.contentExtractor.run( pageContent, url, - ruleset, ); + for (const message of messages) + WebDiscoveryProject.telemetry({ + type: WebDiscoveryProject.msgType, + action: message.action, + payload: message.payload, + }); }, /** * Used in context-search module - * - * TODO: A safer option would be to hard-code the list of - * search engines. Otherwise, updating the patterns can potentially - * change the search results that we show. */ isSearchEngineUrl(url) { - return WebDiscoveryProject.contentExtractor.isSearchEngineUrl(url); + return WebDiscoveryProject.contentExtractor.urlAnalyzer.isSearchEngineUrl( + url, + ); }, aggregateMetrics: function (metricsBefore, metricsAfter) { var aggregates = { cp: 0, mm: 0, kp: 0, sc: 0, md: 0 }; - if (WebDiscoveryProject.debug) { - _log( - "aggregates: " + - JSON.stringify(metricsBefore) + - JSON.stringify(metricsAfter), - ); - } + logger.debug( + "aggregates: " + + JSON.stringify(metricsBefore) + + JSON.stringify(metricsAfter), + ); var _keys = Object.keys(aggregates); for (var i = 0; i < _keys.length; i++) { aggregates[_keys[i]] = metricsBefore[_keys[i]] + metricsAfter[_keys[i]]; } - if (WebDiscoveryProject.debug) { - _log("aggregates: " + JSON.stringify(aggregates)); - } + logger.debug("aggregates: " + JSON.stringify(aggregates)); return aggregates; }, @@ -4893,7 +4867,7 @@ const WebDiscoveryProject = { "activeUsage", JSON.stringify(WebDiscoveryProject.activeUsage), (result) => { - _log("Active usage stats saved:", result); + logger.debug("Active usage stats saved:", result); }, ); }, @@ -4937,7 +4911,7 @@ const WebDiscoveryProject = { "activeUsage", JSON.stringify(WebDiscoveryProject.activeUsage), (result) => { - _log("Active usage stats saved:", result); + logger.debug("Active usage stats saved:", result); }, ); WebDiscoveryProject.saveActiveUsageTime(); @@ -4952,7 +4926,7 @@ const WebDiscoveryProject = { ctry: WebDiscoveryProject.getCountryCode(), // Need to fix this. 
}; - _log( + logger.debug( `Sending alive message for the hour: ${h} , ${JSON.stringify(payload)}`, ); @@ -4969,17 +4943,17 @@ const WebDiscoveryProject = { t, (result) => { WebDiscoveryProject.activeUsageLastSent = t; - _log(`Active usage last sent as ${t}`); + logger.debug(`Active usage last sent as ${t}`); }, ); }, saveStrictQueries: function () { - _log("Saving local table"); + logger.debug("Saving local table"); WebDiscoveryProject.db.saveRecordTelemetry( "localStrictQueries", JSON.stringify(WebDiscoveryProject.strictQueries), (result) => { - _log("localStrictQueries saved:", result); + logger.debug("localStrictQueries saved:", result); }, ); }, @@ -4990,7 +4964,7 @@ const WebDiscoveryProject = { "bf", bf.join("|"), (result) => { - _log("bloom filter saved:", result); + logger.debug("bloom filter saved:", result); }, ); } @@ -4998,7 +4972,7 @@ const WebDiscoveryProject = { loadBloomFilter: function () { WebDiscoveryProject.db.loadRecordTelemetry("bf", function (data) { if (data == null) { - _log("There was no data on WebDiscoveryProject.bf"); + logger.debug("There was no data on WebDiscoveryProject.bf"); WebDiscoveryProject.bloomFilter = new BloomFilter( Array(bloomFilterSize).join("0"), bloomFilterNHashes, @@ -5017,7 +4991,7 @@ const WebDiscoveryProject = { "localStrictQueries", function (data) { if (data == null || data.length == 0) { - _log("There was no data on WebDiscoveryProject.bf"); + logger.debug("There was no data on WebDiscoveryProject.bf"); WebDiscoveryProject.strictQueries = []; } else { WebDiscoveryProject.strictQueries = JSON.parse(data); @@ -5035,10 +5009,10 @@ const WebDiscoveryProject = { e.qurl, function (url, page_data, ourl, x) { let cd = WebDiscoveryProject.docCache[url]["doc"]; - WebDiscoveryProject.checkURL(cd, url, "strict"); + WebDiscoveryProject.checkURL(cd, url); }, function (a, b, c, d) { - _log("Error aux>>>> " + d); + logger.debug("Error aux>>>> " + d); }, ); WebDiscoveryProject.strictQueries.splice(idx, 1); @@ -5103,13 +5077,13 
@@ const WebDiscoveryProject = { // Check if there is a query. if (!query || query.length == 0) { - _log("No Query"); + logger.debug("No Query"); return Promise.reject("No Query"); } // If suspicious query. if (WebDiscoveryProject.isSuspiciousQuery(query)) { - _log("Query is suspicious"); + logger.debug("Query is suspicious"); sanitisedQuery = "(PROTECTED)"; } @@ -5130,7 +5104,7 @@ const WebDiscoveryProject = { (WebDiscoveryProject.isSuspiciousURL(query) || WebDiscoveryProject.dropLongURL(query)) ) { - _log("Query is dangerous"); + logger.debug("Query is dangerous"); sanitisedQuery = "(PROTECTED)"; } @@ -5145,20 +5119,24 @@ const WebDiscoveryProject = { } const urlPrivate = WebDiscoveryProject.bloomFilter.testSingle(md5(url)); if (urlPrivate) { - _log("Url is already marked private"); + logger.debug("Url is already marked private"); return Promise.reject("Url is already marked private"); } // Check URL is suspicious if (WebDiscoveryProject.isSuspiciousURL(url)) { - _log("Url is suspicious"); + logger.debug("Url is suspicious"); url = "(PROTECTED)"; } // Check URL is dangerous, with strict DROPLONGURL. if (WebDiscoveryProject.dropLongURL(url, { strict: true })) { // If it's Google / Yahoo / Bing. Then mask and send them. 
- if (WebDiscoveryProject.contentExtractor.isSearchEngineUrl(url)) { + if ( + WebDiscoveryProject.contentExtractor.urlAnalyzer.isSearchEngineUrl( + url, + ) + ) { url = WebDiscoveryProject.maskURL(url); } else { url = "(PROTECTED)"; @@ -5207,7 +5185,7 @@ const WebDiscoveryProject = { .isHostNamePrivate(query) .then((res) => { if (res) { - _log("Private Domain"); + logger.debug("Private Domain"); sanitisedQuery = "(PROTECTED)"; } if (sanitisedQuery) { @@ -5256,7 +5234,7 @@ const WebDiscoveryProject = { struct_aft.nifsh == null || struct_bef.nifsh != struct_aft.nifsh ) { - _log("fovalidDoubleFetch: number of internal iframes does not match"); + logger.debug("fovalidDoubleFetch: number of internal iframes does not match"); return false; } @@ -5274,7 +5252,7 @@ const WebDiscoveryProject = { struct_aft.nfsh == null || struct_bef.nfsh != struct_aft.nfsh ) { - _log("fovalidDoubleFetch: number of internal frameset does not match"); + logger.debug("fovalidDoubleFetch: number of internal frameset does not match"); return false; } @@ -5301,7 +5279,7 @@ const WebDiscoveryProject = { WebDiscoveryProject.location = json.location; } } catch (e) { - _log("Error loading config.", e); + logger.debug("Error loading config.", e); } }, getCountryCode: function () { @@ -5396,21 +5374,21 @@ const WebDiscoveryProject = { // engine. 
if (msg.action === "page") { const urls = WebDiscoveryProject.getUrlsToSanitizeFromPageMessage(msg); - _log("All urls in the message:" + JSON.stringify(urls)); + logger.debug("All urls in the message:" + JSON.stringify(urls)); for (const original of urls) { if (original.t === "canonical") { msg.payload.x.canonical_url = WebDiscoveryProject.maskURLStrict( original.url, ); - _log( + logger.debug( `Sanitized 'canonical': ${original.url} -> ${msg.payload.x.canonical_url}`, ); } if (original.t === "ref") { msg.payload.ref = WebDiscoveryProject.maskURLStrict(original.url); - _log(`Sanitized 'ref': ${original.url} -> ${msg.payload.ref}`); + logger.debug(`Sanitized 'ref': ${original.url} -> ${msg.payload.ref}`); } if (original.t.startsWith("red")) { @@ -5418,12 +5396,12 @@ const WebDiscoveryProject = { msg.payload.red[redPos] = WebDiscoveryProject.maskURLStrict( original.url, ); - _log( + logger.debug( `Sanitized 'ref++${redPos}': ${original.url} -> ${msg.payload.red[redPos]}`, ); } } - _log("All urls in the message:" + JSON.stringify(msg)); + logger.debug("All urls in the message:" + JSON.stringify(msg)); } }, addURLtoDB(url, ref, paylobj) { @@ -5450,7 +5428,9 @@ const WebDiscoveryProject = { */ var tt = new Date().getTime(); - if (WebDiscoveryProject.contentExtractor.isSearchEngineUrl(url)) { + if ( + WebDiscoveryProject.contentExtractor.urlAnalyzer.isSearchEngineUrl(url) + ) { return; } @@ -5482,7 +5462,7 @@ const WebDiscoveryProject = { WebDiscoveryProject.db.getURL(url, function (obj) { // If the url is already not in the DB or marked private, then we need save it. 
- _log(">>>>> Add url to dbobj" + obj.length + privateHash); + logger.debug(">>>>> Add url to dbobj" + obj.length + privateHash); if (!privateHash && obj.length === 0) { // does not exist var setPrivate = false; @@ -5499,40 +5479,38 @@ const WebDiscoveryProject = { // page data structure is empty, so no need to double fetch, is private let reason = "empty page data"; setPrivate = true; - _log("Setting private because empty page data"); + logger.debug("Setting private because empty page data"); } else if (WebDiscoveryProject.isSuspiciousURL(url)) { // if the url looks private already add it already as checked and private let reason = "susp. url"; setPrivate = true; - _log("Setting private because suspiciousURL"); + logger.debug("Setting private because suspiciousURL"); } else { if (WebDiscoveryProject.httpCache401[url]) { let reason = "401"; setPrivate = true; - _log("Setting private because of 401"); + logger.debug("Setting private because of 401"); } else { let reason = ""; setPrivate = false; } } - _log(">>>>> lets save >>> " + JSON.stringify(newObj)); + logger.debug(">>>>> lets save >>> " + JSON.stringify(newObj)); // This needs to simplified, if it needs to set Private, why insert it in the first place. // Possibly because else the remove url would break in setAsPrivate. WebDiscoveryProject.db.saveURL(url, newObj, function () { - if (WebDiscoveryProject.debug) { - _log("Insertion success add urltoDB"); - } + logger.debug("Insertion success add urltoDB"); if (setPrivate) WebDiscoveryProject.setAsPrivate(url); }); } else if (obj.length === 1) { - _log(">>>>> Add url to dbobj found record" + JSON.stringify(obj)); + logger.debug(">>>>> Add url to dbobj found record" + JSON.stringify(obj)); let record = obj[0]; // Looks like the URL is already there, we just need to update the stats. //Need to aggregate the engagement metrics. 
- _log(record); + logger.debug(record); let metricsBefore; if (typeof record.payload === "string") { // (possibly only reachable on Bootstrapped extensions) @@ -5553,7 +5531,7 @@ const WebDiscoveryProject = { cloneObj.payload = paylobj || {}; WebDiscoveryProject.db.updateURL(url, cloneObj, function () { - _log("Record updated"); + logger.debug("Record updated"); }); paylobj["e"] = { cp: 0, mm: 0, kp: 0, sc: 0, md: 0 }; @@ -5566,7 +5544,7 @@ const WebDiscoveryProject = { } WebDiscoveryProject.db.removeUnsafe(url, (result) => { - _log(`Deleting ${url} : ${result}`); + logger.debug(`Deleting ${url} : ${result}`); }); if (WebDiscoveryProject.state["v"][url]) { @@ -5576,7 +5554,7 @@ const WebDiscoveryProject = { }, setAsPublic: function (url) { WebDiscoveryProject.db.removeUnsafe(url, (result) => { - _log(`Deleting ${url} : ${result}`); + logger.debug(`Deleting ${url} : ${result}`); }); if (WebDiscoveryProject.state["v"][url]) { @@ -5594,7 +5572,7 @@ const WebDiscoveryProject = { ); }, processUnchecks: function (listOfUncheckedUrls) { - _log(">>> URLS UNPROCESSED >>> " + JSON.stringify(listOfUncheckedUrls)); + logger.debug(">>> URLS UNPROCESSED >>> " + JSON.stringify(listOfUncheckedUrls)); var url_pagedocPair = {}; for (var i = 0; i < listOfUncheckedUrls.length; i++) { @@ -5635,12 +5613,12 @@ const WebDiscoveryProject = { } } } - _log(">>>>> DOUBLE FETCH COUNT >>> " + JSON.stringify(obj)); + logger.debug(">>>>> DOUBLE FETCH COUNT >>> " + JSON.stringify(obj)); WebDiscoveryProject.db.saveRecordTelemetry( "last-double-fetch", JSON.stringify(obj), (result) => { - _log("last-double-fetch saved:", result); + logger.debug("last-double-fetch saved:", result); }, ); @@ -5658,7 +5636,7 @@ const WebDiscoveryProject = { // google and aclk? in it. 
if (targetURL.includes("google") && targetURL.includes("aclk?")) { const clickedU = normalizeAclkUrl(targetURL); - _log("ad-ctr: targetURL:", targetURL, "normalized to", clickedU); + logger.debug("ad-ctr: targetURL:", targetURL, "normalized to", clickedU); if (WebDiscoveryProject.adDetails[clickedU]) { let query = WebDiscoveryProject.adDetails[clickedU].query; @@ -5682,7 +5660,7 @@ const WebDiscoveryProject = { }, }; - _log("ad-ctr payload:", payload); + logger.debug("ad-ctr payload:", payload); WebDiscoveryProject.telemetry(payload); } } @@ -5703,17 +5681,20 @@ const WebDiscoveryProject = { addStrictQueries(url, query) { // In some cases, we get query undefined. if (!query) { - _log(">> Got an undefined query >>> " + url); + logger.debug(">> Got an undefined query >>> " + url); return; } if (WebDiscoveryProject.isSuspiciousQuery(query)) { - _log("Dropping suspicious query before double-fetch:", query); + logger.debug("Dropping suspicious query before double-fetch:", query); return; } const { isSearchEngineUrl, queryUrl } = - WebDiscoveryProject.contentExtractor.checkAnonSearchURL(url, query); + WebDiscoveryProject.contentExtractor.urlAnalyzer.checkAnonSearchURL( + url, + query, + ); if (isSearchEngineUrl) { try { const qObj = { @@ -5731,6 +5712,7 @@ const WebDiscoveryProject = { }, }; WebDiscoveryProject.contentExtractor = new ContentExtractor( + WebDiscoveryProject.patterns, WebDiscoveryProject, ); diff --git a/modules/web-discovery-project/tests/unit/content-extractor-test.es b/modules/web-discovery-project/tests/unit/content-extractor-test.es index 43f3be77..ac593ecf 100644 --- a/modules/web-discovery-project/tests/unit/content-extractor-test.es +++ b/modules/web-discovery-project/tests/unit/content-extractor-test.es @@ -9,8 +9,6 @@ const fs = require("fs"); const path = require("path"); const zlib = require("zlib"); -const jsdom = require("jsdom"); -const { JSDOM } = jsdom; const expect = chai.expect; const R = require("ramda"); @@ -61,14 +59,9 @@ function 
findAllFixtures() { * If they deviate too much from production, the tests will have less * value in catching bugs. */ -const DEFAULT_PATTERNS = { - normal: jsonParse( - fs.readFileSync(`${FIXTURES_BASE_PATH}/patterns.json`, "utf8"), - ), - strict: jsonParse( - fs.readFileSync(`${FIXTURES_BASE_PATH}/patterns-anon.json`, "utf8"), - ), -}; +const DEFAULT_PATTERNS = jsonParse( + fs.readFileSync(`${FIXTURES_BASE_PATH}/rules.json`, "utf8"), +); const enableLogging = true; @@ -101,25 +94,14 @@ export default describeModule( this.timeout(20000); let ContentExtractor; - let WebDiscoveryProject; - let uut; - let mockWindow; + let WDP; let document; let fixture; - const setupDocument = function (html) { - mockWindow = new JSDOM(`

Test DOM

`).window; - - document = mockWindow.document; - document.open(); - document.write(html); - document.close(); - }; - const initFixture = function (_path) { try { fixture = readFixtureFromDisk(_path); - setupDocument(fixture.html); + document = WDP.parseHtml(fixture.html); } catch (e) { throw new Error(`Failed to load test fixture "${_path}": ${e}`, e); } @@ -136,9 +118,7 @@ export default describeModule( )(sinonSpy.args); } - const messages = groupTelemetryCallsByAction( - WebDiscoveryProject.telemetry, - ); + const messages = groupTelemetryCallsByAction(WDP.telemetry); // uncomment to export expectations: // fs.writeFileSync('/tmp/failing-test-expected-messages.json', JSON.stringify(messages)); if (fixture.mustContain) { @@ -176,51 +156,69 @@ export default describeModule( }; const oldURL = global.URL; - beforeEach(function () { + beforeEach(async function () { /* eslint-disable-next-line global-require */ global.URL = global.URL || require("url").URL; + const Patterns = ( + await this.system.import("web-discovery-project/patterns") + ).default; + const parseHtml = ( + await this.system.import("web-discovery-project/html-helpers") + ).parseHtml; + ContentExtractor = this.module().ContentExtractor; - WebDiscoveryProject = { + WDP = { debug: enableLogging, msgType: "wdp", getCountryCode() { return "de"; }, - maskURL(url) { return url; }, - // args: msg, instantPush telemetry: sinon.fake(), - // args: url, query addStrictQueries: sinon.fake(), - queryCache: {}, + patterns: new Patterns(), + checkURL: (doc, url) => { + const { messages } = WDP.contentExtractor.run(doc, url); + for (const message of messages) + WDP.telemetry({ + type: WDP.msgType, + action: message.action, + payload: message.payload, + }); + }, }; - uut = new ContentExtractor(WebDiscoveryProject); + WDP.contentExtractor = new ContentExtractor(WDP.patterns, WDP); + WDP.parseHtml = parseHtml; }); afterEach(function () { document = null; fixture = null; global.URL = oldURL; - - if (mockWindow) { - 
mockWindow = null; - } }); describe("with an empty ruleset", function () { describe("#isSearchEngineUrl", function () { it("should not match any URL", function () { - expect(uut.isSearchEngineUrl("about:blank")).to.be.false; - expect(uut.isSearchEngineUrl("http://www.example.com/")).to.be - .false; - expect(uut.isSearchEngineUrl("https://www.google.de/search?q=test")) - .to.be.false; + expect( + WDP.contentExtractor.urlAnalyzer.isSearchEngineUrl("about:blank"), + ).to.be.false; + expect( + WDP.contentExtractor.urlAnalyzer.isSearchEngineUrl( + "http://www.example.com/", + ), + ).to.be.false; + expect( + WDP.contentExtractor.urlAnalyzer.isSearchEngineUrl( + "https://www.google.de/search?q=test", + ), + ).to.be.false; }); }); @@ -229,38 +227,36 @@ export default describeModule( initFixture("go/angela-merkel-2023-10-10"); }); - it('should not find any data (ruleset: "normal")', function () { - uut.checkURL(document, fixture.url, "normal"); - expect(WebDiscoveryProject.addStrictQueries.notCalled); - expect(WebDiscoveryProject.telemetry.notCalled); - }); - - it('should not find any data (ruleset: "strict")', function () { - uut.checkURL(document, fixture.url, "strict"); - expect(WebDiscoveryProject.addStrictQueries.notCalled); - expect(WebDiscoveryProject.telemetry.notCalled); + it("should not find any data", function () { + WDP.checkURL(document, fixture.url); + expect(WDP.addStrictQueries.notCalled); + expect(WDP.telemetry.notCalled); }); }); }); describe("with a realistic ruleset", function () { beforeEach(function () { - uut.updatePatterns(DEFAULT_PATTERNS.normal, "normal"); - uut.updatePatterns(DEFAULT_PATTERNS.strict, "strict"); + WDP.patterns.update(DEFAULT_PATTERNS); }); describe("#isSearchEngineUrl", function () { it("matches the configured search engines", function () { // no match: - expect(uut.isSearchEngineUrl("about:blank")).to.be.false; - expect(uut.isSearchEngineUrl("http://www.example.com/")).to.be - .false; + expect( + 
WDP.contentExtractor.urlAnalyzer.isSearchEngineUrl("about:blank"), + ).to.be.false; + expect( + WDP.contentExtractor.urlAnalyzer.isSearchEngineUrl( + "http://www.example.com/", + ), + ).to.be.false; // should match: - expect(uut.isSearchEngineUrl("https://www.google.de/search?q=test")) - .to.be.true; expect( - uut.isSearchEngineUrl("https://www.google.com/search?q=test"), + WDP.contentExtractor.urlAnalyzer.isSearchEngineUrl( + "https://www.google.de/search?q=test", + ), ).to.be.true; }); }); @@ -271,10 +267,9 @@ export default describeModule( }); it("should not find any data", function () { - uut.checkURL(document, fixture.url, "normal"); - uut.checkURL(document, fixture.url, "strict"); - expect(WebDiscoveryProject.addStrictQueries.notCalled); - expect(WebDiscoveryProject.telemetry.notCalled); + WDP.checkURL(document, fixture.url); + expect(WDP.addStrictQueries.notCalled); + expect(WDP.telemetry.notCalled); }); }); @@ -283,16 +278,10 @@ export default describeModule( initFixture("go/angela-merkel-2023-10-10"); }); - it('should find search results (ruleset: "normal")', function () { - uut.checkURL(document, fixture.url, "normal"); - expect(WebDiscoveryProject.addStrictQueries.called); - expect(WebDiscoveryProject.telemetry.notCalled); - }); - - it('should find search results (ruleset: "strict")', function () { - uut.checkURL(document, fixture.url, "strict"); - expect(WebDiscoveryProject.addStrictQueries.notCalled); - expect(WebDiscoveryProject.telemetry.called); + it("should find search results", function () { + WDP.checkURL(document, fixture.url); + expect(WDP.addStrictQueries.called); + expect(WDP.telemetry.called); }); }); }); @@ -300,17 +289,16 @@ export default describeModule( findAllFixtures().forEach((fixtureDir) => { describe(`in scenario: ${fixtureDir}`, function () { beforeEach(function () { - uut.updatePatterns(DEFAULT_PATTERNS.normal, "normal"); - uut.updatePatterns(DEFAULT_PATTERNS.strict, "strict"); + WDP.patterns.update(DEFAULT_PATTERNS); }); 
it("should pass the fixture's expections", function () { // Given initFixture(fixtureDir); - WebDiscoveryProject.telemetry = sinon.spy(); + WDP.telemetry = sinon.spy(); // When - uut.checkURL(document, fixture.url, "strict"); + WDP.checkURL(document, fixture.url); // Then verifyFixtureExpectations(); @@ -320,20 +308,20 @@ export default describeModule( describe("#tryExtractBraveSerpQuery", function () { const expectNotFound = (url) => { - if (uut.tryExtractBraveSerpQuery(url)) { + if (WDP.contentExtractor.urlAnalyzer.tryExtractBraveSerpQuery(url)) { chai.assert.fail(`Expected not to find a query on url=${url}`); } }; it("should find search terms on search.brave.software", function () { expect( - uut.tryExtractBraveSerpQuery( + WDP.contentExtractor.urlAnalyzer.tryExtractBraveSerpQuery( "https://search.brave.software/search?lang=en&country=us&safe_search=on&q=harzer%20k%C3%A4se", ), ).to.equal("harzer käse"); expect( - uut.tryExtractBraveSerpQuery( + WDP.contentExtractor.urlAnalyzer.tryExtractBraveSerpQuery( "https://search.brave.software/search?q=m%C3%BCnchen&lang=en&country=de", ), ).to.equal("münchen"); @@ -341,13 +329,13 @@ export default describeModule( it("should find search terms on search.brave.com", function () { expect( - uut.tryExtractBraveSerpQuery( + WDP.contentExtractor.urlAnalyzer.tryExtractBraveSerpQuery( "https://search.brave.com/search?lang=en&country=us&safe_search=on&q=harzer%20k%C3%A4se", ), ).to.equal("harzer käse"); expect( - uut.tryExtractBraveSerpQuery( + WDP.contentExtractor.urlAnalyzer.tryExtractBraveSerpQuery( "https://search.brave.com/search?q=m%C3%BCnchen&lang=en&country=de", ), ).to.equal("münchen"); @@ -406,177 +394,5 @@ export default describeModule( }); }); }); - - describe("#_jsonPath", function () { - let _jsonPath; - - beforeEach(function () { - _jsonPath = this.module()._jsonPath; - }); - - it("should extract fields from JSON", function () { - expect(_jsonPath('{"a":1}', "a")).to.equal("1"); - expect(_jsonPath('{"a":1, "b":"2"}', 
"b")).to.equal("2"); - }); - - it("should extract nested fields from JSON", function () { - expect(_jsonPath('{ "a": { "nested": true } }', "a.nested")).to.equal( - "true", - ); - expect(_jsonPath('{ "a": { "b": { "c": "3" } } }', "a.b.c")).to.equal( - "3", - ); - }); - - it("should reject unexpected normal text", function () { - expect(_jsonPath("Some example text", "")).to.equal(""); - expect(_jsonPath("Some example text", "key")).to.equal(""); - expect(_jsonPath('Some example text {"key":"1"}', "key")).to.equal(""); - }); - - it("should by default not extract non-trivial objects", function () { - expect(_jsonPath('{"a":[1,2,3]}', "a")).to.equal(""); - expect(_jsonPath('{"a":{"b":1}"}', "a")).to.equal(""); - }); - - it("should extract non-trivial objects when enabled", function () { - expect(JSON.parse(_jsonPath('{"a":[1,2,3]}', "a", true))).to.deep.equal( - [1, 2, 3], - ); - expect(JSON.parse(_jsonPath('{"a":[1,2,3]}', "a", true))).to.deep.equal( - [1, 2, 3], - ); - expect(JSON.parse(_jsonPath('{"a":{"b":1}}', "a", true))).to.deep.equal( - { b: 1 }, - ); - }); - - it("should ignore incorrect JSON", function () { - expect(_jsonPath("", "a")).to.equal(""); - expect(_jsonPath("][", "a")).to.equal(""); - expect(_jsonPath("a:3", "a")).to.equal(""); - expect(_jsonPath("a:3}", "a")).to.equal(""); - }); - }); - - describe("#_mergeArr", function () { - let _mergeArr; - - beforeEach(function () { - _mergeArr = this.module()._mergeArr; - }); - - it("should pass regression tests", function () { - expect( - _mergeArr({ x: [1, 2, 3], y: [4, 5, 6], z: [7, 8, 9] }), - ).to.deep.equal([ - { x: 1, y: 4, z: 7 }, - { x: 2, y: 5, z: 8 }, - { x: 3, y: 6, z: 9 }, - ]); - }); - }); - - describe("#_allMandatoryFieldsSet", function () { - let _allMandatoryFieldsSet; - - beforeEach(function () { - _allMandatoryFieldsSet = this.module()._allMandatoryFieldsSet; - }); - - it("should accept a message where all mandatory fields are set", function () { - // this is similar in structure to a 
search query - const payload = { - r: { - 0: { - t: "title: foo", - u: "https://example.test/foo", - }, - }, - q: "some query", - qurl: "https://example.test/some/query", - ctry: "de", - }; - const expectedFields = [ - { key: "r", type: "array" }, - { key: "q", type: "object" }, - { key: "qurl", type: "object" }, - { key: "ctry", type: "object" }, - ]; - - expect(_allMandatoryFieldsSet(payload, expectedFields)).to.be.true; - }); - - it("should accept an array where all inner entries are filled", function () { - const payload = { - r: { - 0: { - t: "title: foo", - u: "https://example.test/foo", - }, - 1: { - t: "title: bar", - u: "https://example.test/bar", - }, - }, - }; - const expectedFields = [{ key: "r", type: "array" }]; - - expect(_allMandatoryFieldsSet(payload, expectedFields)).to.be.true; - }); - - it("should accept an array where at least one inner entry is filled", function () { - const payload = { - r: { - 0: { - t: null, - u: "https://example.test/foo", - }, - 1: { - t: null, - u: null, - }, - }, - }; - const expectedFields = [{ key: "r", type: "array" }]; - - expect(_allMandatoryFieldsSet(payload, expectedFields)).to.be.true; - }); - - describe("should reject an array where all inner entries are missing:", function () { - it("when not found by css selectors", function () { - const payload = { - r: { - 0: { t: null, u: null }, - 1: { t: null, u: null }, - }, - }; - const expectedFields = [{ key: "r", type: "array" }]; - - expect(_allMandatoryFieldsSet(payload, expectedFields)).to.be.false; - }); - - it("when all values are falsy", function () { - const payload = { - r: { - 0: { t: null, u: undefined }, - 1: { t: "" }, - }, - }; - const expectedFields = [{ key: "r", type: "array" }]; - - expect(_allMandatoryFieldsSet(payload, expectedFields)).to.be.false; - }); - - it("when the array itself is empty", function () { - const payload = { - r: {}, - }; - const expectedFields = [{ key: "r", type: "array" }]; - - expect(_allMandatoryFieldsSet(payload, 
expectedFields)).to.be.false; - }); - }); - }); }, ); diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/fussballschuh-2023-10-10/page.html.gz b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/fussballschuh-2023-10-10/page.html.gz index bb1896ac..e05cbdf3 100644 Binary files a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/fussballschuh-2023-10-10/page.html.gz and b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/fussballschuh-2023-10-10/page.html.gz differ diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/fussballschuh-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/fussballschuh-2023-10-10/scenario.json index 7bb94270..70b3db1d 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/fussballschuh-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/fussballschuh-2023-10-10/scenario.json @@ -1,6 +1,5 @@ { - "url": "https://www.amazon.de/s/ref=nb_sb_noss_2?__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&url=search-alias%3Daps&field-keywords=fussballschuh&rh=i%3Aaps%2Ck%3Afussballschuh", - + "url": "https://www.amazon.de/s?k=fussballschuh", "mustContain": [ { "type": "wdp", @@ -8,313 +7,310 @@ "payload": { "r": { "0": { - "t": "sudorun Football Boots Children's Artificial Grass Football Boots For Boys Girls FG AG Training Shoes", - "a": null, - "u": "/sudorun-Football-Childrens-Artificial-Training/dp/B0CKBH7S3L" + "t": "WOWEI Fußballschuhe Herren High Top Spike Cleats Outdoor Athletics Fußballschuhe Kinder Professionelle Trainingsschuhe Sport Fußball Stiefel Wettkampf", + "a": "Gesponsert", + "u": "/WOWEI-Fußballschuhe-Professionelle-Fussballschuhe-Trainingsschuhe/dp/B0D9Y3VF7C" }, "1": { - "t": "AMLCWZ Football Boots Spike Shoes for Men High Top Football Boots Sporty Competition Training Shoes for Outdoor 
Indoor Football Shoes Boys Youth Sporty Shoes Unisex", - "a": null, - "u": "/Football-Competition-Training-Outdoor-2celadon/dp/B0CBXM9X33" + "t": "Kmrlofiy Fußballschuhe Herren Professionelle Fußballschuhe High Top Spike Cleats Fussballschuhe Outdoor Sport Athletics Herren Junge Sportschuhe Trainingsschuhe", + "a": "Gesponsert", + "u": "/Kmrlofiy-Fußballschuhe-Professionelle-Fussballschuhe-Trainingsschuhe/dp/B0DD6V1DWR" }, "2": { - "t": "Xinghuanhua Men's Outdoor Turf Trainer High Top Lightweight Football Boots Slip Resistant", - "a": null, - "u": "/dp/B0CKNM2WQF" + "t": "DASHANGYAN Fußballschuhe Herren High Top Spikes Fußballschuhe Outdoor-Fußballtraining Jugendliche Atmungsaktive Turnschuhe Kicking Athletic Training Schuhe Turnschuhe", + "a": "Gesponsert", + "u": "/DASHANGYAN-Fußballschuhe-Outdoor-Fußballtraining-Jugendliche-Atmungsaktive/dp/B0DB1ZP4PR" }, "3": { - "t": "BINBINNIAO Men's Football Boots for Boys Children Football Shoes Stud Shoes Outdoor Professional Training Shoes Trainers Lightweight and Non-Slip", - "a": null, - "u": "/BINBINNIAO-Football-Children-Professional-Lightweight/dp/B0CJ7DRDZG" + "t": "WEJIESS Herren Fußballschuhe High Top Spikes Jugend Outdoor-Training Fußballschuhe Professionelle sportliche Sportschuhe Turf Trainer", + "a": "Gesponsert", + "u": "/WEJIESS-Fußballschuhe-Outdoor-Training-Professionelle-Sportschuhe/dp/B0DBVV7S3L" }, "4": { - "t": "PUMA Men's Future Play Fg/AG Football Boots", + "t": "adidas Unisex F50 Club Football Boots Fg/Mg Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Future-Football-Boots-Black-Silver/dp/B0BLHTW5YJ" + "u": "https://www.amazon.de/adidas-Unisex-Fußballschuhe-Sneaker-Yellow/dp/B0CYN2FHWP" }, "5": { - "t": "Puma Men's Future Match FG/AG Football Boots", + "t": "Nike Herren Zoom Vapor 16 Academy FG/MG Sneaker, Glacier Blue/Blue Orbit, 39 EU", "a": null, - "u": "https://www.amazon.de/-/en/Puma-Future-Match-Football-Boots/dp/B0BLHTW3BJ" + "u": 
"https://www.amazon.de/Nike-Herren-Academy-Sneaker-Glacier/dp/B0D9DF3Z8T" }, "6": { - "t": "adidas Unisex Predator Accuracy.3 Laceless Firm Ground Football Boots (Firm Surface)", + "t": "PUMA Unisex King Top Fg/Ag Soccer Shoe", "a": null, - "u": "https://www.amazon.de/-/en/Predator-Accuracy-3-Laceless-Football-Surface/dp/B0C5PQF4SN" + "u": "https://www.amazon.de/PUMA-Unisex-Fussballschuh-Black-White/dp/B0C2V9QWB3" }, "7": { - "t": "Nike Men's Vapor Jersey", + "t": "PUMA Unisex Future 7 Pro Fg/Ag Soccer Shoe", "a": null, - "u": "https://www.amazon.de/-/en/Vapor-Jersey-Black-Chrome-Hyper/dp/B0C4LL7J4R" + "u": "https://www.amazon.de/PUMA-Unisex-Future-Fussballschuh-White-Fizzy/dp/B0CKJ93XP2" }, "8": { - "t": "adidas Unisex X Crazyfast.3 Firm Ground Fußballschuhe (Fester Untergrund)", + "t": "PUMA Unisex Ultra 5 Play Fg/Ag Soccer Shoe", "a": null, - "u": "https://www.amazon.de/-/en/adidas-Unisex-Crazyfast-3-Football-Ground/dp/B0C3WJP1L2" + "u": "https://www.amazon.de/PUMA-Unisex-Ultra-Fussballschuh-Black/dp/B0CKJ9TGW4" }, "9": { - "t": "adidas Copa Pure.3 Men's Football Boots Multiground Football Boots (Multi Ground)", + "t": "adidas Unisex F50 League Football Boots Fg/Mg Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/adidas-Unisex-Pure-3-Football-Ground/dp/B0BZ7WKM45" + "u": "https://www.amazon.de/adidas-Unisex-League-Fußballschuhe-Sneaker/dp/B0CYNFDCND" }, "10": { - "t": "PUMA Men's Ultra Play Fg/AG Football Boots", - "a": null, - "u": "https://www.amazon.de/-/en/Ultra-Football-Boots-Black-Asphalt/dp/B0BLHSMFX9" + "t": "Adoff Fußballschuhe Herren Stollen Fußballschuhe High Top Spikes Outdoor Training Jugend Atmungsaktive Sportschuhe Fußballschuhe Anti-Rutsch", + "a": "Gesponsert", + "u": "/Adoff-Fußballschuhe-Atmungsaktive-Sportschuhe-Anti-Rutsch/dp/B0D4RDJRWM" }, "11": { - "t": "adidas Unisex Copa Pure.3 Firm Ground Sneaker", - "a": null, - "u": "https://www.amazon.de/-/en/adidas-Unisex-Pure-3-Ground-Trainers/dp/B0BPF79ZCT" + "t": "MINGSIBO 
Fußballschuhe Herren Kinder Fußballtraining,Training, professionelle Fußballschuhe, Fußballschuhe", + "a": "Gesponsert", + "u": "/MINGSIBO-Fußballschuhe-Fußballtraining-professionelle-Fußballschuhe-Beige/dp/B0DDLPJL51" }, "12": { - "t": "Football Shoes Unisex Professional Football Shoes Men Outdoor Boys High Band for Artificial Grass Suitable AG Competition Training Shoes Children", - "a": null, - "u": "/Football-Professional-Suitable-Artificial-Competition/dp/B0C7D34BJP" + "t": "AMLCWZ Herren Fußballschuhe Stiefel Jugendliche im Freien Spikes High Top Fußballschuhe Kicking Training Turnschuhe Professionelle Spiel Fußballschuhe Obere Nahtlose EIN Stück Fußballschuhe", + "a": "Gesponsert", + "u": "/AMLCWZ-Fußballschuhe-Jugendliche-Turnschuhe-Professionelle/dp/B0D9R5XVZM" }, "13": { - "t": "WEJIESS Men's Football Boots, High Top Spikes Youth Outdoor Training Football Shoes, Professional Sporty Sports Shoes, Turf Trainer", - "a": null, - "u": "/Wejiess-Football-Outdoor-Training-Professional/dp/B0BP13RGPD" + "t": "Meidiastra Fussballschuhe Herren Professionelle Fussballschuhe Unisex Outdoor Kinder High Top Spikes Kicking Training Turnschuhe AG Turf Trainingsschuhe", + "a": "Gesponsert", + "u": "/Meidiastra-Fussballschuhe-Professionelle-Turnschuhe-Trainingsschuhe/dp/B0DFXQX1JD" }, "14": { - "t": "WEJIESS Football Boots, Men's Non-Slip Spikes, Professional Spikes, Football Game Shoes, Boys Football Trainers, Laces", + "t": "PUMA Unisex Future 8 Play Fg/Ag Fussballschuh", "a": null, - "u": "/WEJIESS-Football-Non-Slip-Professional-Training/dp/B0BLH53HDP" + "u": "https://www.amazon.de/PUMA-Unisex-Future-Fussballschuh-White-Glowing/dp/B0D1YL73FJ" }, "15": { - "t": "BLBK Fußballschuhe Unisex Professionelle fußballschuhe Herren Outdoor Jungen hohe Bande für Kunstrasen geeignet AG Wettkampf- Trainingsschuhe Kinder", + "t": "PUMA Unisex Future 7 Play Fg/Ag Soccer Shoe", "a": null, - "u": "/Football-Professional-Suitable-Artificial-Competition/dp/B0C6NGF1MJ" + "u": 
"https://www.amazon.de/PUMA-Unisex-Fussballschuh-HYPERLINK-Blue-Mint/dp/B0CKJ8S89T" }, "16": { - "t": "adidas Herren Predator Accuracy.3 Fg Sneaker", + "t": "adidas Unisex Copa Pure Ii Club Football Boots Flexible Ground Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/adidas-Predator-Accuracy-3-Football-Surface/dp/B0BPF4ZBQS" + "u": "https://www.amazon.de/adidas-Unisex-Football-Flexible-Fußballschuhe/dp/B0CKY16GCY" }, "17": { - "t": "Nike Herren Phantom Gx Academy Fg/Mg Sneaker", + "t": "adidas Unisex F50 Club Football Boots Indoor Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Phantom-Academy-Trainers-Bright-Crimson/dp/B0C4LPVL1N" + "u": "https://www.amazon.de/adidas-Unisex-Fußballschuhe-Indoor-Sneaker/dp/B0CYNLC367" }, "18": { - "t": "PUMA Men's Future Play IT Football Boots", - "a": null, - "u": "https://www.amazon.de/-/en/Future-Football-Boots-White-Orchid/dp/B0BLHVD167" + "t": "MINGSIBO Fussballschuhe Cleats Herren-fußballschuhe Kinder Outdoor AG Turf Jugendliche Erwachsene Professionelle Trainingsschuhe", + "a": "Gesponsert", + "u": "/MINGSIBO-Fussballschuhe-Herren-fußballschuhe-Professionelle-Trainingsschuhe/dp/B0CDQ3NJR5" }, "19": { - "t": "PUMA Unisex Ultra Ultimate Fg/Ag Football Boots", - "a": null, - "u": "https://www.amazon.de/-/en/Unisex-Ultimate-Football-Orange-Glimmer/dp/B0BSLGSB5M" + "t": "AMLCWZ Fussballschuhe Herren Outdoor Professionelle Fußballschuhe Neutrale Kick Sport Training Schuhe Teenager Fußballschuhe Kinder Fußballschuhe Für Kunstrasen AG Professionelle Spiel Schuhe", + "a": "Gesponsert", + "u": "/AMLCWZ-Fussballschuhe-Professionelle-Fußballschuhe-Kunstrasen/dp/B0DD8G8DSP" }, "20": { - "t": "BLBK Fußballschuhe Unisex Professionelle fußballschuhe Herren Outdoor Jungen hohe Bande für Kunstrasen geeignet AG Wettkampf- Trainingsschuhe Kinder", - "a": null, - "u": "/Football-Professional-Suitable-Artificial-Competition/dp/B0C6NJJSV8" + "t": "Adoff Fußballschuhe, Herren Stollen, Fußballschuhe, Low Top Stollen, 
Outdoor Training, atmungsaktive Sportschuhe für Jugendliche, Fußballschuhe, Anti Rutsch", + "a": "Gesponsert", + "u": "/Adoff-Fußballschuhe-atmungsaktive-Sportschuhe-Jugendliche/dp/B0D7SV1631" }, "21": { - "t": "Xinghuanhua Boys' Football Boots with Gold Soles, Studs, Football Boots for Students, Grass Training Boots, Outdoor Football Boots", - "a": null, - "u": "/Xinghuanhua-Football-Students-Training-Outdoor/dp/B09NJDQH5P" + "t": "Fussballschuhe Herren Professionelle Fussballschuhe Unisex Outdoor Kinder hohe Bande für Kunstrasen geeignet AG Wettkampf- Trainingsschuhe", + "a": "Gesponsert", + "u": "/Fussballschuhe-Professionelle-Kunstrasen-Wettkampf-Trainingsschuhe/dp/B0D9VHXXFQ" }, "22": { - "t": "BINBINNIAO Football Boots Men's Sports High Top Non-Slip Men's Football Boots Football Team Turf Shoes Unisex Football Shoes for Men", + "t": "Nike Herren Vapor Trikot", "a": null, - "u": "/BINBINNIAO-Football-Sports-Non-Slip-Moonlight/dp/B0CJ9PJLHC" + "u": "https://www.amazon.de/Nike-Herren-Vapor-Fussballschuh-Chrome-Hyper/dp/B0C4LG4375" }, "23": { - "t": "BLBK Football shoes, men's training, sports shoes, children's football boots, studs, men's and boys' football shoes", + "t": "PUMA Herren Future 7 Play Fg/Ag Soccer Shoe", "a": null, - "u": "/Football-Training-Sports-Childrens-platinum/dp/B0BHF79VD9" + "u": "https://www.amazon.de/Future-Soccer-Shoes-Black-Puma-White/dp/B0C341GQWS" }, "24": { - "t": "Nike Men's Legend 10 Football Boots", + "t": "adidas Unisex X Crazyfast.3 Firm Ground Boots Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Legend-Football-Boots-Bright-Crimson/dp/B0C4LTQB9F" + "u": "https://www.amazon.de/adidas-Unisex-Crazyfast-3-Football-Ground/dp/B0C3WHWX6B" }, "25": { - "t": "PUMA Unisex Children's Future Ultimate Fg/Ag Football Boots", + "t": "adidas Unisex Copa Pure Ii Club Football Boots Turf Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Future-Ultimate-Football-Boots-Orchid/dp/B0C92T18M2" + "u": 
"https://www.amazon.de/adidas-Unisex-Football-Boots-Turf-Fußballschuhe/dp/B0CKXZJTYC" }, "26": { - "t": "adidas Unisex X Crazyfast.3 Soft Ground Fußballschuhe (weicher Boden)", + "t": "adidas Unisex X Crazyfast.3 Sneaker", "a": null, - "u": "https://www.amazon.de/-/en/adidas-Unisex-Crazyfast-3-Ground-Football/dp/B0C5QHZH3D" + "u": "https://www.amazon.de/adidas-Unisex-Crazyfast-3-Sneaker-Schwarz/dp/B0C3WH5BRY" }, "27": { - "t": "PUMA Men's Future Pro Fg/Ag Football Boots, Puma Black Puma White", + "t": "adidas Unisex Predator Club Turf Football Boots Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Future-Football-Boots-White-Orchid/dp/B0BLHRHPMM" + "u": "https://www.amazon.de/adidas-Unisex-Predator-Fußballschuhe-Sneaker/dp/B0CYNBR44M" }, "28": { - "t": "PUMA Kids Ultra Play FG/AG JR Football Boots", + "t": "adidas Unisex Copa Pure Ii League Football Boots Firm Ground Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Ultra-Football-Boots-White-Orchid/dp/B0BLHR7L3P" + "u": "https://www.amazon.de/adidas-Unisex-League-Football-Fußballschuhe/dp/B0CKY1658L" }, "29": { - "t": "adidas Unisex X Crazyfast.3 Fußballschuhe (Multi Ground)", + "t": "adidas Unisex Predator Club Firm Multi Ground Football Boots Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/adidas-Unisex-Crazyfast-3-Football-Ground/dp/B0BZ7T2WM8" + "u": "https://www.amazon.de/adidas-Unisex-Predator-Fußballschuhe-Sneaker/dp/B0CYN9KY8G" }, "30": { - "t": "adidas Unisex Copa Pure Ii.3 Firm Ground Fußballschuhe (Fester Untergrund)", + "t": "PUMA Herren Attacanto Fg/Ag Soccer Shoe", "a": null, - "u": "https://www.amazon.de/-/en/adidas-Unisex-Ground-Football-Surface/dp/B0C5Q1V3QB" + "u": "https://www.amazon.de/PUMA-Herren-ATTACANTO-Fussballschuh-White-Glowing/dp/B0D1YK2TD7" }, "31": { - "t": "adidas X CRAZYFAST.3 FG J", + "t": "adidas Unisex Copa Pure Iii Club Indoor Football Boots Fußballschuhe", "a": null, - "u": 
"https://www.amazon.de/-/en/adidas-Unisex-Childrens-Crazyfast-3-Football/dp/B0C5QH5BD9" + "u": "https://www.amazon.de/adidas-Unisex-Hallenfußballschuhe-Sneaker-White/dp/B0CYNH165N" }, "32": { - "t": "PUMA Men's Ultra Pro Fg/Ag Football Boots", + "t": "adidas Unisex Predator League Laceless Football Boots Firm Ground Fußballschuhe für festen Boden", "a": null, - "u": "https://www.amazon.de/-/en/Ultra-Football-Boots-White-Orchid/dp/B0BLHT9ZHX" + "u": "https://www.amazon.de/adidas-Predator24-Laceless-Football-Fußballschuhe/dp/B0CKY1B38Y" }, "33": { - "t": "PUMA Men's Future Pro Mxsg Football Boots", + "t": "PUMA Herren Future Match Fg/Ag Fussballschuh", "a": null, - "u": "https://www.amazon.de/-/en/Future-Football-Boots-White-Orchid/dp/B0BLHTL1P7" + "u": "https://www.amazon.de/PUMA-Herren-Future-Fussballschuh-Schwarz/dp/B09YVW2RQP" }, "34": { - "t": "PUMA Herren Zukünftiges Spiel+ Ll Fg/Ag Fussballschuh", + "t": "PUMA Unisex King 21 It Soccer Shoe", "a": null, - "u": "https://www.amazon.de/-/en/Future-Match-Football-Boots-Orchid/dp/B0BLHVH2XV" + "u": "https://www.amazon.de/Unisex-Adults-Soccer-White-Puma-Black-Gum/dp/B09L5KJ143" }, "35": { - "t": "PUMA Men's Future Game MG Football Boots", + "t": "adidas Unisex F50 League Football Boots Soft Ground Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Future-Match-Football-Boots-Orchid/dp/B0BLHS1WJR" + "u": "https://www.amazon.de/adidas-Unisex-League-Fußballschuhe-Sneaker/dp/B0CYN2FWNG" }, "36": { - "t": "adidas Men's Predator Accuracy.3 Firm Ground Football Boots (Firm Surface)", + "t": "adidas Unisex Copa Pure Iii Club Turf Football Boots Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/adidas-Predator-Accuracy-3-Football-Surface/dp/B0C5Q2WGG6" + "u": "https://www.amazon.de/adidas-Unisex-Fußballschuhe-Sneaker-Carbon/dp/B0CYN87R2L" }, "37": { - "t": "PUMA Men's Ultra Play MG Football Boots", + "t": "PUMA Unisex Ultra 5 Match Fg/Ag Soccer Shoe", "a": null, - "u": 
"https://www.amazon.de/-/en/Ultra-Football-Boots-White-Orchid/dp/B0BLHTGHTS" + "u": "https://www.amazon.de/PUMA-Unisex-Ultra-Fussballschuh-White-BLUEMAZING/dp/B0CKJ7QW88" }, "38": { - "t": "PUMA Men's Future Match+ Ll Fg/Ag Football Boots", + "t": "Nike Zoom Vapor 15 Academy Sneaker für Herren", "a": null, - "u": "https://www.amazon.de/-/en/Future-Match-Football-Boots-Orchid/dp/B0BLHW4F5M" + "u": "https://www.amazon.de/Nike-Herren-Academy-Fußballschuh-Lemonade/dp/B0CSJVDR4D" }, "39": { - "t": "Nike Herren Academy Fussballschuh", + "t": "PUMA Unisex Kinder Future 7 Play Fg/Ag Jr Soccer Shoe", "a": null, - "u": "https://www.amazon.de/-/en/Academy-Football-Boots-Bright-Crimson/dp/B0C4LJ8J39" + "u": "https://www.amazon.de/PUMA-Future-Fussballschuh-Skies-Elektro-Purple-Fizzy/dp/B0CKJ7D4HC" }, "40": { - "t": "PUMA Unisex King Top FG Football Boots, black", + "t": "PUMA Unisex Future 8 Match Low Fg/Ag Fussballschuh", "a": null, - "u": "https://www.amazon.de/-/en/Unisex-Football-Boots-Black-White/dp/B07KFZQWQK" + "u": "https://www.amazon.de/PUMA-Unisex-Future-Fussballschuh-White-Glowing/dp/B0D1YK81RK" }, "41": { - "t": "adidas - Kaiser 5, Herren Fußballschuhe", + "t": "adidas Unisex Predator Club Football Boots Flexible Ground Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/adidas-Performance-Mundial-Football-Boots/dp/B000OWL7CO" + "u": "https://www.amazon.de/adidas-Predator-Football-Flexible-Fußballschuhe/dp/B0CKY26KHN" }, "42": { - "t": "PUMA Men's King Top Tt Botas de Fútbol", + "t": "Puma KING ULTIMATE FG/AG", "a": null, - "u": "https://www.amazon.de/-/en/PUMA-Botas-fútbol-Black-White/dp/B07KG4G136" + "u": "https://www.amazon.de/Puma-Ultimate-FG-AG-Fussballschuhe/dp/B0D8TLFZGK" }, "43": { - "t": "PUMA Unisex Children's Ultra Match Ll Fg/Ag Jr Football Boots", + "t": "adidas Unisex F50 League Laceless Football Boots Fg/Mg Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Unisex-Childrens-Ultra-Football-Asphalt/dp/B0BLHRQXTD" + "u": 
"https://www.amazon.de/adidas-Unisex-Schnürlose-Fußballschuhe-Sneaker/dp/B0CYN2C743" }, "44": { - "t": "Nike Men's Superfly 9 Football Boots, Bright Crimson White Black", + "t": "adidas Unisex Copa Pure Iii Club Fg/Mg Football Boots Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Superfly-Football-Boots-Bright-Crimson/dp/B0C4LJZ8L5" + "u": "https://www.amazon.de/adidas-Unisex-Fußballschuhe-Sneaker-White/dp/B0CYNHSQ4W" }, "45": { - "t": "PUMA Herren Future Match Mxsg Fussballschuh", + "t": "adidas Unisex Predator League J Football Boots Firm Ground Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Future-Match-Football-Boots-Orchid/dp/B0BLHRTXNV" + "u": "https://www.amazon.de/adidas-Predator-League-Football-Fußballschuhe/dp/B0CKXSGYWF" }, "46": { - "t": "PUMA Herren Attacanto It Fussballschuh", + "t": "adidas Unisex Predator League Fold-Over Tongue Firm Multi Ground Football Boots Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Attacanto-Football-Boots-Black-Orchid/dp/B0BLHVYLN6" + "u": "https://www.amazon.de/adidas-Unisex-Predator-Fußballschuhe-Sneaker/dp/B0CYN8CWWM" }, "47": { - "t": "PUMA Men's Ultra Pro Fg/Ag Football Boots", + "t": "PUMA Herren Future 7 Match Fg/Ag Soccer Shoe", "a": null, - "u": "https://www.amazon.de/-/en/Ultra-Football-Boots-Black-White/dp/B09YHYFB48" + "u": "https://www.amazon.de/Future-Match-Soccer-Shoes-Black-Puma/dp/B0C33Y5TMS" }, "48": { - "t": "PUMA Unisex King Match Fg/Ag Football Boots", + "t": "adidas Unisex F50 League Football Boots Multi Ground Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Unisex-Match-Football-Boots-Black/dp/B0C2VBVSGM" + "u": "https://www.amazon.de/adidas-Unisex-League-Fußballschuhe-Sneaker/dp/B0CYN2FWND" }, "49": { - "t": "PUMA Unisex King Pro Fg/Ag Football Boots", + "t": "PUMA Unisex Future 7 Play Mg Soccer Shoe", "a": null, - "u": "https://www.amazon.de/-/en/Unisex-Football-Boots-Black-White/dp/B0C2VCKZ6W" + "u": 
"https://www.amazon.de/PUMA-Unisex-Fussballschuh-Skies-Elektro-Purple-Fizzy/dp/B0CKJ8P8QY" }, "50": { - "t": "adidas Unisex Copa Pure.3 Turf Football Boots (Turf)", + "t": "adidas Unisex Copa Pure Ii League Football Boots Firm Ground Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/adidas-Unisex-Pure-3-Football-Boots/dp/B0BZ7WT9P5" + "u": "https://www.amazon.de/adidas-Unisex-League-Football-Fußballschuhe/dp/B0CKXYW5S3" }, "51": { - "t": "PUMA Men's Ultra Match+ Ll Fg/Ag Football Boots", + "t": "PUMA Unisex Future 8 Match Mxsg Fussballschuh", "a": null, - "u": "https://www.amazon.de/-/en/Ultra-Match-Football-Boots-Asphalt/dp/B0BLHT351V" + "u": "https://www.amazon.de/PUMA-Unisex-Future-Fussballschuh-Silver-Fluo/dp/B0D1YK3RG6" }, "52": { - "t": "Nike Mercurial Lite Shinguard", + "t": "adidas Predator League FT FG JB CBLACK/Goldmt/CBLACK - 9/43", "a": null, - "u": "https://www.amazon.de/-/en/Unisex-Shinguard-Mercurial-White-DN3611-100/dp/B0BKJYPZWG" + "u": "https://www.amazon.de/adidas-Predator-League-CBLACK-Goldmt/dp/B0DJLQDXK8" }, "53": { - "t": "PUMA Unisex Children's Future Play Fg/Ag Jr Football Boots", + "t": "Nike Unisex Superfly 10 Club Fg/Mg Sneaker", "a": null, - "u": "https://www.amazon.de/-/en/Unisex-Childrens-Future-Football-Orchid/dp/B0BLPC385C" + "u": "https://www.amazon.de/Nike-Herren-Superfly-Sneaker-Jungle/dp/B0D4TWQD4V" }, "54": { - "t": "PUMA Herren Future Match It Fussballschuh", + "t": "adidas Unisex Predator League Fold-Over Tongue Soft Ground Football Boots Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Future-Match-Football-Boots-Orchid/dp/B0BLHTQ4B4" + "u": "https://www.amazon.de/adidas-Unisex-Predator-Fußballschuhe-Sneaker/dp/B0CYMWPS2G" }, "55": { - "t": "Nike Men's Zoom Vapor Football Boots", + "t": "adidas Unisex X Crazyfast.3 Fg Messi Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Nike-Mens-Vapor-Football-Boots/dp/B0B51YLXXT" + "u": 
"https://www.amazon.de/adidas-Unisex-Crazyfast-3-Sneaker-Dunkelblau-Silber-met/dp/B0C3WHS1KL" }, "56": { - "t": "PUMA Unisex Children's Ultra Pro Fg/Ag Jr Football Boots", + "t": "adidas Unisex Predator League Football Boots Firm Ground Fußballschuhe", "a": null, - "u": "https://www.amazon.de/-/en/Unisex-Childrens-Ultra-Football-Orchid/dp/B0BLHW24W8" + "u": "https://www.amazon.de/adidas-Unisex-Predator-Football-Fußballschuhe/dp/B0CKXZJ1HK" }, "57": { - "t": "PUMA Herren Future Play Tt Fussballschuh", + "t": "PUMA Erwachsene Future 7 PRO FG/AG Fußballschuhe ┃Fußballschuhe Herren", "a": null, - "u": "https://www.amazon.de/-/en/Future-Football-Boots-White-Orchid/dp/B0BLHTVTSQ" + "u": "https://www.amazon.de/PUMA-Erwachsene-PRO-Fußballschuhe-┃Fußballschuhe/dp/B0D6SX5G9V" }, "58": { - "t": "PUMA Men's Future Match Fg/AG Football Boots", + "t": "Nike Herren Zm Superfly 10 Acad Fg/Mg Sneaker", "a": null, - "u": "https://www.amazon.de/-/en/Future-Match-Football-Persian-White-PRO/dp/B0BLHXTPPH" + "u": "https://www.amazon.de/Nike-Unisex-Superfly-Sneaker-Aurora/dp/B0D4TVDD69" }, "59": { - "t": "PUMA Unisex King Pro 21 Tt Fussballschuh", + "t": "PUMA Unisex Future 8 Match Fg/Ag Fussballschuh", "a": null, - "u": "https://www.amazon.de/-/en/Unisex-Football-Boots-Black-White/dp/B08MV3JPGD" + "u": "https://www.amazon.de/PUMA-Unisex-Future-Fussballschuh-White-Glowing/dp/B0D1ZCYL1F" } }, "q": "fussballschuh", - "ctry": "de", - - // this is a limitation of the current test setup: - // in production, it should be the doublefetch URL, which would be: "https://www.amazon.de/s/?field-keywords=fussballschuh" - "qurl": "https://www.amazon.de/s/ref=nb_sb_noss_2?__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&url=search-alias%3Daps&field-keywords=fussballschuh&rh=i%3Aaps%2Ck%3Afussballschuh" + "qurl": "https://www.amazon.de/s?k=fussballschuh", + "ctry": "de" } } ] -} +} \ No newline at end of file diff --git 
a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/gardening-2023-10-10/page.html.gz b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/gardening-2023-10-10/page.html.gz index 13b9543e..645c92fb 100644 Binary files a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/gardening-2023-10-10/page.html.gz and b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/gardening-2023-10-10/page.html.gz differ diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/gardening-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/gardening-2023-10-10/scenario.json index a1024066..151f4b2f 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/gardening-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/gardening-2023-10-10/scenario.json @@ -1,6 +1,5 @@ { - "url": "https://www.amazon.com/s?k=gardening&ref=nb_sb_noss_1", - + "url": "https://www.amazon.de/s?k=gardening", "mustContain": [ { "type": "wdp", @@ -8,313 +7,310 @@ "payload": { "r": { "0": { - "t": "Lazy Lawn Care: Proven Lawn Care Tips for Saving Time and Effort on Landscaping, Weed Control, Lawn Watering and Gardening", - "a": null, - "u": "/Lazy-Lawn-Care-Landscaping-Gardening-ebook/dp/B0CCJYYVFG" + "t": "Gartensitzkniebank Kniebank mit weiches Kniekissen für Gartenarbeit - Garten-Kniebank klappbar Gartenhocker bis 170 kg mit 2 Werkzeugtaschen, Gartenstuhl zum Knien & Sitzen | KNEEBENCH", + "a": "Gesponsert", + "u": "/Gartensitzkniebank-Kniebank-weiches-Kniekissen-Gartenarbeit/dp/B0CLDP9P2H" }, "1": { - "t": "Gorilla Carts GOR400-COM Steel Garden Cart, Steel Mesh Removable Sides, 3 cu ft, 400 lb Capacity, Green", - "a": null, - "u": "https://www.amazon.com/Gorilla-Carts-GOR400-COM-Removable-400-lbs/dp/B01BECQBZ0" + "t": "Gefüllte Begonie Rot | 3 Stück | Knolle | Topfpflanze | Geeignet für 
Kübel | Rot | 100% Blütegarantie | QFB Gardening", + "a": "Gesponsert", + "u": "/Gefüllte-Topfpflanze-Blütegarantie-QFB-Gardening/dp/B0DSQ4YRJV" }, "2": { - "t": "The 2024 Old Farmer’s Almanac Gardening Calendar", - "a": null, - "u": "https://www.amazon.com/2024-Farmers-Almanac-Gardening-Calendar/dp/1571989617" + "t": "Gefüllte Begonie orange | 3 Stück | Topfpflanze | Geeignet für Kübel | Orange | 100% Blütegarantie | QFB Gardening", + "a": "Gesponsert", + "u": "/Gefüllte-Topfpflanze-Blütegarantie-QFB-Gardening/dp/B0DSQ8Y2B8" }, "3": { - "t": "Fiskars Multi-purpose Garden Shears", - "a": null, - "u": "https://www.amazon.com/Fiskars-356922-1009-Multi-Purpose-Scissor-Titanium/dp/B07QDMMNBJ" + "t": "KOMFOTTEU Kniebank, Faltbarer Gartenhocker mit Schaummatte und 2 Werkzeugtasche, Gartenbank für Knieschutz, 58cm x 28cm x 13/49cm, für Gartenarbeit (Mit 2 großen Werkzeugtaschen)", + "a": "Gesponsert", + "u": "/KOMFOTTEU-Gartenhocker-Werkzeugtasche-Gartenarbeit-Werkzeugtaschen/dp/B0CGZGKR8M" }, "4": { - "t": "Gardening Log Book and Planner: Monthly Garden Planting Journal and Organizer for Gardeners", + "t": "The Complete Gardener: A Practical, Imaginative Guide to Every Aspect of Gardening", "a": null, - "u": "https://www.amazon.com/Gardening-Log-Book-Planner-Organizer/dp/B0CDFS5YSZ" + "u": "https://www.amazon.de/Complete-Gardener-practical-imaginative-gardening/dp/0241424305" }, "5": { - "t": "TORCHSTAR LED Indoor Herb Garden, CRI 95+, Herb Grower Light with Timer, Indoor Gardening System, Full Spectrum Light, 24V Garden Kit for Plant Grow Novice, White, 4000K, Pots & Plants Not Included", + "t": "The Healing Garden: Cultivating and Handcrafting Herbal Remedies", "a": null, - "u": "https://www.amazon.com/TORCHSTAR-Spectrum-Sun-Like-Succulent-Vegetable/dp/B075GJ93PP" + "u": "https://www.amazon.de/Healing-Garden-Cultivating-Handcrafting-Remedies/dp/0358313384" }, "6": { - "t": "Gardening Log Book: Plant Organizer for Avid Gardeners | Gardening Notebook to Better Track 
Plants, Details and Care System | Perfect for Garden Lovers | 8.5 x 11", + "t": "Die neue Gartenschule: Planen, pflanzen, pflegen, ernten", "a": null, - "u": "https://www.amazon.com/Gardening-Log-Book-Organizer-Gardeners/dp/B09X2M5DBP" + "u": "https://www.amazon.de/Die-neue-Gartenschule-pflanzen-pflegen/dp/3831046204" }, "7": { - "t": "Xtreme Gardening 4401, 1-Pound Mykos Granular Nutrient, 1 lb", + "t": "The Gardening Year", "a": null, - "u": "https://www.amazon.com/Xtreme-Gardening-4401-Granular-Nutrient/dp/B004KPKAWO" + "u": "https://www.amazon.de/Gardening-Year-Readers-Digest/dp/0276001354" }, "8": { - "t": "Gardening: Perennial Vegetables - Plant Once and Harvest Year After Year (3rd Edition) (botanical, home garden, horticulture, garden, landscape, plants, gardening)", + "t": "Relaxdays, grün Kniebank, weiches Kniekissen, 2 Taschen, klappbar, Kniehilfe Gartenarbeit, bis 150 kg, HBT 50x60x27,5cm, EVA, Gartenbank", "a": null, - "u": "https://www.amazon.com/Gardening-Perennial-Vegetables-botanical-horticulture-ebook/dp/B00SZ0SI1U" + "u": "https://www.amazon.de/Relaxdays-Kniebank-Kniekissen-Kniehilfe-Gartenarbeit/dp/B07TF9X42H" }, "9": { - "t": "Fiskars 4-Claw Stand Up Weeder - Gardening Hand Weeding Tool with 39\" Long Ergonomic Handle - Black", + "t": "Homefarming: Selbstversorgung ohne grünen Daumen (GU Selbstversorgung)", "a": null, - "u": "https://www.amazon.com/Fiskars-Deluxe-Stand-up-Weeder-4-claw/dp/B0030MIHAU" + "u": "https://www.amazon.de/Homefarming-Selbstversorgung-grünen-Daumen-Garten/dp/3833877839" }, "10": { - "t": "Homegrown Vegetables, Fruits, and Herbs: A Bountiful, Healthful Garden for Lean Times (Creative Homeowner) Expert Gardening Advice: Reduce Costs, Save Time, & Grow Safe, Delicious Food for Your Family", - "a": null, - "u": "https://www.amazon.com/Homegrown-Vegetables-Fruits-Herbs-Bountiful/dp/1580114717" + "t": "NTG Schaumstoff-Knieschutz 40 mm Dick Kniekissen, Knieschoner & Knieunterlage Ideal für Garten, Werkstatt & Haus kniematte 
kniekissen kniebrett Kneepad 47 * 28 * 4 cm (1 Stück)", + "a": "Gesponsert", + "u": "/NTG-Schaumstoff-Knieschutz-Kniekissen-Knieschoner-Knieunterlage/dp/B0DWFW25Q4" }, "11": { - "t": "Radius Garden 10211, Ergonomic Aluminum Hand Weeder, Red", - "a": null, - "u": "https://www.amazon.com/Radius-Garden-10211-Ergonomic-Aluminum/dp/B0711JH63Z" + "t": "Wiltec Gartenwagen mit Ablagefläche bis 150kg Gartenhocker mit Reifen Gartentrolley für Gartenarbeit Werkstattwagen Rollwagen Arbeitssitz Gartensitz fahrbar", + "a": "Gesponsert", + "u": "/Werkstattwagen-belastbar-Gartenwagen-Arbeitssitz-Gartensitz/dp/B07NWTXBLP" }, "12": { - "t": "Womens Youre Never Too Old to Play in The Dirt Tshirt Funny Gardening Tee", - "a": null, - "u": "https://www.amazon.com/Gardening-Crazy-Dog-T-Shirts-Comfortable/dp/B087HN38FW" + "t": "Gimisgu Garten Kniebank Gartenhocker mit Eva Schaummatte, Tragbare Klappbar Arbeitstasche Gartenbank mit Werkzeugtasche, Garten Hocker für Gartenarbeit bis zu 150 kg", + "a": "Gesponsert", + "u": "/Gimisgu-Gartenhocker-Arbeitstasche-Werkzeugtasche-Gartenarbeit/dp/B0B4BB22LD" }, "13": { - "t": "Gardening (Merit Badge Series)", - "a": null, - "u": "https://www.amazon.com/Gardening-Merit-Badge-Series/dp/B00NC9XFQ2" + "t": "KESSER® Pflanzsack 40 Liter 5 Stück | Pflanzsäcke aus Vlies-Stoff mit Namensschildern | Kartoffel-Pflanztasche mit Fenster & Griffen | Pflanzbeutel für Gemüse Blumen Früchte Schwarz", + "a": "Gesponsert", + "u": "/Pflanzsäcke-Vlies-Stoff-Namensschildern-Kartoffel-Pflanztasche-Pflanzbeutel/dp/B0CZPF1CVH" }, "14": { - "t": "Garden Planner & Journal: Gardening Gifts / Calendar / Diary [ Paperback Notebook * 1 Year - Start any time * Large - 8.5 x 11 inch * Decorative Black ... 
* van Gogh ] (Gifts & Presents for Gardeners)", + "t": "RHS How To Garden When You're New To Gardening: The Basics For Absolute Beginners", "a": null, - "u": "https://www.amazon.com/Garden-Planner-Journal-Gardening-Decorative/dp/1545290555" + "u": "https://www.amazon.de/RHS-Garden-When-Youre-Gardening/dp/0241336651" }, "15": { - "t": "A.M. Leonard Complete Aluminum Gardening Tool Set (5 Tools)", + "t": "Gartenjahr für Einsteiger: Schritt für Schritt zum grünen Paradies", "a": null, - "u": "https://www.amazon.com/M-Leonard-Complete-Aluminum-Gardening/dp/B0034035A0" + "u": "https://www.amazon.de/Gartenjahr-für-Einsteiger-Schritt-Paradies/dp/3833839465" }, "16": { - "t": "OUHO Multi-Functional Outdoor Garden Cleaning Shovel Steel Flat Shovel ice Shovel Weeding Planting Farm Weeding", + "t": "Field Guide to Urban Gardening: How to Grow Plants, No Matter Where You Live: Raised Beds - Vertical Gardening - Indoor Edibles - Balconies and ... • Balconies and Rooftops • Hydroponics", "a": null, - "u": "https://www.amazon.com/OUHO-Multi-Functional-Outdoor-Cleaning-Planting/dp/B09V7PKQLW" + "u": "https://www.amazon.de/Field-Guide-Urban-Gardening-Balconies/dp/076036396X" }, "17": { - "t": "Happyyami 1Pc Garden Tool Bag Garden Bucket Bag Storage Tote Foldable Tote Storage Bag Adjustable Gardening Carrier Storage Basket Household Barrel Bucket Rack Toolbox Tote Bag", + "t": "Anstiftung zum gärtnerischen Ungehorsam: Bekenntnisse einer Guerillagärtnerin: Gebt Insekten ein Zuhause! 
-", "a": null, - "u": "https://www.amazon.com/Happyyami-Foldable-Adjustable-Gardening-Household/dp/B0CJ8C9WD9" + "u": "https://www.amazon.de/Anstiftung-zum-gärtnerischen-Ungehorsam-Guerillagärtnerin/dp/3453605470" }, "18": { - "t": "HANTOP Shovel Round Digging Shovel Garden Shovel Heavy Duty Shovel for Digging Trenching Gardening Camping, 43 inches", - "a": null, - "u": "https://www.amazon.com/HANTOP-Digging-Trenching-Gardening-Camping/dp/B0C3VDDB5F" + "t": "Sekey Höhenverstellbare Kniebank für Gartenarbeit Belastbarkeit 160KG, Gartenhocker Klappbar mit Werkzeugtaschen, Kniehilfe für Gartenarbeiten 60 * 50 * 28 cm, Grün", + "a": "Gesponsert", + "u": "/Sekey-Gartenarbeit-Gartenhocker-Höhenverstellbar-Werkzeugtasche/dp/B082KHWVTP" }, "19": { - "t": "HANTOP Garden Fork 4-Tine Digging Fork Heavy Duty Spading Fork Pitch Fork for Gardening, 43 inches", - "a": null, - "u": "https://www.amazon.com/HANTOP-Garden-Digging-Spading-Gardening/dp/B0C1BMMD7Y" + "t": "GARDEBRUK® Gartenabfallsack 6x280 Liter 50 kg Belastbarkeit doppelte Nähte 3 stabile Griffe robust abwaschbar Garten Rasensack Gartentasche Laubsack", + "a": "Gesponsert", + "u": "/GARDEBRUK®-Gartenabfallsack-Belastbarkeit-abwaschbar-Gartentasche/dp/B0D41X6RX2" }, "20": { - "t": "Corona Tools | ClassicCUT ClassicCUT® Branch & Stem Pruning Shears for Gardening | Cuts Branches and Stems up to 1-inch in Diameter | BP 15180", - "a": null, - "u": "https://www.amazon.com/Corona-BP15180-Forged-ClassicCUTBypass-Branch/dp/B083XCYMM4" + "t": "COSTWAY Rollsitz mit Rollen, 360 ° drehbar | höhenverstellbar, Mobiler Gartensitz Outdoor Gartenhocker, bis 150 kg | breite Reifen, Gartenrollsitz Metall, für Gartenarbeit & Autopflege, Grün", + "a": "Gesponsert", + "u": "/COSTWAY-höhenverstellbar-Gartenhocker-Gartenrollsitz-Gartenarbeit/dp/B0C6QMTWK1" }, "21": { - "t": "Nisaku NJP650 The Original Hori Hori Namibagata Japanese Stainless Steel Weeding Knife & Fiskars Ergo Gardening Hand Trowel - Ergonomic Handle Design with Hang Hole Heavy 
Duty Garden Tool for Digging", - "a": null, - "u": "https://www.amazon.com/Nisaku-Original-Namibagata-Stainless-Gardening/dp/B0C98W3PPL" + "t": "Ultiness® Balkon Gewächshaus Laura mit Pflanzkasten, Rankgitter & Schutzfolie, Tomatengewächshaus, Foliengewächshaus – Wetterfest, UV-beständig, Platzsparendes Urban Gardening für Balkon & Terrasse", + "a": "Gesponsert", + "u": "/Gewächshaus-Pflanzkasten-Schutzfolie-Tomatengewächshaus-Foliengewächshaus/dp/B0DGLGD98T" }, "22": { - "t": "Strongway Steel Yard Cart Jumbo Garden Wagon with Removable Sides 1400-Lb. Capacity, 50in.L x 24.1in.W x 26.75in.H", + "t": "RHS Gardening Through the Year: Month-by-month Planning Instructions and Inspiration", "a": null, - "u": "https://www.amazon.com/Strongway-Removable-1400-Lb-Capacity-26-75in-H/dp/B0C5Y5K3P5" + "u": "https://www.amazon.de/Gardening-Through-Year-Month-month/dp/0241315611" }, "23": { - "t": "Gigglebug", + "t": "RHS Complete Gardener's Manual: The one-stop guide to plan, sow, plant, and grow your garden", "a": null, - "u": "https://www.amazon.com/Gardening/dp/B09DM87PDT" + "u": "https://www.amazon.de/RHS-Complete-Gardeners-Manual-one-stop/dp/024143243X" }, "24": { - "t": "The Vegetable Garden Problem Solver Handbook: Identify and manage diseases and other common problems on edible plants", + "t": "Veg in One Bed: How to Grow an Abundance of Food in One Raised Bed, Month by Month", "a": null, - "u": "/Vegetable-Garden-Problem-Solver-Handbook-ebook/dp/B0BLZP3R5Q" + "u": "https://www.amazon.de/Veg-One-Bed-Abundance-Raised/dp/0241376521" }, "25": { - "t": "Senkichi SGKN-6 Double Edged Hatchet for Gardening, For Wood Splitting and Pruning Branches, 6.5 inches (165 mm)", + "t": "Wildlife Gardening: Die Kunst, im eigenen Garten die Welt zu retten", "a": null, - "u": "https://www.amazon.com/Gardening-Double-Beveled-Blade-SGKN-6/dp/B003UYTDEW" + "u": "https://www.amazon.de/Wildlife-Gardening-eigenen-Garten-retten/dp/3446261885" }, "26": { - "t": "Gorilla Carts GCR-4 Poly Dump Cart, 
2-Wheel Garden Wagon with Foldable Handle, 4 cu ft, 300 lb. Capacity, Black/Yellow", + "t": "Rebel Gardening: A beginner’s handbook to organic urban gardening", "a": null, - "u": "https://www.amazon.com/Gorilla-Carts-GCR-4-300-pound-Capacity/dp/B08B43QY61" + "u": "https://www.amazon.de/Rebel-Gardening-beginners-handbook-gardening/dp/1786786915" }, "27": { - "t": "A.M. Leonard Garden Scoooter with Flat-Free Tires", + "t": "The Money-Saving Garden Year: A Month-by-month Guide to a Great Garden that Costs Less", "a": null, - "u": "https://www.amazon.com/M-Leonard-Garden-Scoooter-Flat-Free/dp/B0CFB4L2W6" + "u": "https://www.amazon.de/Money-Saving-Garden-Year-Christmas-gardener/dp/0241733065" }, "28": { - "t": "Onebttl Gardening Gifts for Plant Lovers, Women, Cool Gifts for Gardeners, Wife, Mom- 20oz/590ml Double Wall Vacuum Insulated Stainless Steel Tumblers Cups - Wet Rosegold", + "t": "Gärtnern: kann jede*r!: Ökologisch. Nachhaltig. Urban. Von Social-Media-Star \"Spicy Moustache\". Urban Gardening - cool, inspirierend und unkonventionell", "a": null, - "u": "https://www.amazon.com/Onebttl-Gardening-Gardeners-Insulated-Stainless/dp/B0C6R2FG54" + "u": "https://www.amazon.de/Gärtnern-Ökologisch-Social-Media-Star-inspirierend-unkonventionell/dp/3831048355" }, "29": { - "t": "The Spiritual Gardener: Insights from the Jewish Tradition to Help your Garden Grow (Second Edition) (The Spiritual Garden Series)", + "t": "Slow Gardening: Unser Weg zum naturnahen Küchengarten. Selbstversorgung mit den @farmmade_sisters: nachhaltig anbauen, bewusst und saisonal genießen", "a": null, - "u": "/Spiritual-Gardener-Insights-Jewish-Tradition-ebook/dp/B0BNJTFKQ3" + "u": "https://www.amazon.de/Slow-Gardening-Küchengarten-Selbstversorgung-farmmade_sisters/dp/3440177807" }, "30": { - "t": "Gardening with Sacred Geometry: We use water for our crops that has been enriched with Sacred Geometry to increase its life force energy by 2.5. 
Our crops are larger, healthier, and more nutritous.", + "t": "Gartenwissen Pflanzenschnitt: Genaue Anleitungen für mehr als 200 Pflanzen", "a": null, - "u": "/Gardening-Sacred-Geometry-healthier-nutritous-ebook/dp/B096L5CZPF" + "u": "https://www.amazon.de/Gartenwissen-Pflanzenschnitt-Genaue-Anleitungen-Pflanzen/dp/3831042640" }, "31": { - "t": "Tomato Tales: A Garderner's Journey From Soil To Sauce", + "t": "Das Gartenjahr: Die richtige Planung Monat für Monat", "a": null, - "u": "/Tomato-Tales-Garderners-Journey-Sauce-ebook/dp/B0CJ772ZCW" + "u": "https://www.amazon.de/Das-Gartenjahr-richtige-Planung-Monat/dp/3831041539" }, "32": { - "t": "Garden Clubbed!: From Mums to Mayhem", + "t": "RHS Little Book of Small-Space Gardening: Easy-grow Ideas for Balconies, Window Boxes & Other Outdoor Areas (Royal Horticultural Society Handbooks)", "a": null, - "u": "/Garden-Clubbed-Mayhem-Josh-Langston-ebook/dp/B08JYVBB9M" + "u": "https://www.amazon.de/RHS-Little-Book-Small-Space-Gardening/dp/1784724262" }, "33": { - "t": "Fiskars Bypass Pruning Shears 5/8” Garden Clippers - Plant Cutting Scissors with Sharp Precision-Ground Steel Blade", + "t": "Unforgettable Gardens: 500 Years of Historic Gardens and Landscapes", "a": null, - "u": "https://www.amazon.com/Fiskars-91095935J-Bypass-Pruning-Shears/dp/B00002N66H" + "u": "https://www.amazon.de/Unforgettable-Gardens-Years-Historic-Landscapes/dp/1849949034" }, "34": { - "t": "Womanswork Stretch Gardening Glove with Micro Suede Palm, Hot Pink, Small", + "t": "A Year Full of Pots: Container Flowers for All Seasons", "a": null, - "u": "https://www.amazon.com/Womanswork-Stretch-Gardening-Glove-Micro/dp/B001KWFGO2" + "u": "https://www.amazon.de/Year-Full-Pots-Container-Flowers/dp/1526667479" }, "35": { - "t": "Garden Planner and Log Book: Monthly Gardening Journal and Notebook for Gardening Enthusiasts, Detailed Organizer to Help You Grow and Maintain Your Garden - An Ideal Gardening Gift", + "t": "Square Foot Gardening: A New Way to 
Garden in Less Space with Less Work", "a": null, - "u": "https://www.amazon.com/Garden-Planner-Log-Book-Enthusiasts/dp/B0C12JW9Z5" + "u": "https://www.amazon.de/Square-Foot-Gardening-Garden-Space/dp/1579548563" }, "36": { - "t": "Gorilla Carts 7GCG-NF Heavy-Duty Poly Dump Cart with No-Flat Tires, 7 cu ft, 1200 lb Capacity, Black", + "t": "Wie lange braucht eine Schnecke zurück in meinen Garten?: Kuriose Fragen und erstaunliche Antworten rund um den Garten. Über 100 praktische Tipps zu ... & Früchte, Erde, Klima und Gartenarbeit.", "a": null, - "u": "https://www.amazon.com/Gorilla-Carts-Heavy-Duty-No-Flat-Tires/dp/B084NY9PTB" + "u": "https://www.amazon.de/braucht-Schnecke-zurück-meinen-Garten/dp/3784355374" }, "37": { - "t": "Greenhouse Setup Manual", + "t": "Gardening for Self-Sufficiency - From Soil to Sustenance: A Beginner's Guide to Growing Your Way to Independence (English Edition)", "a": null, - "u": "/Greenhouse-Setup-Manual-Barry-Nadel-ebook/dp/B0BV7K2HBQ" + "u": "https://www.amazon.de/Gardening-Self-Sufficiency-Sustenance-Beginners-Independence-ebook/dp/B0DV8MSDD6" }, "38": { - "t": "Indoor Succulent Care: A Beginner's Guide on How Succulent Plants Can Keep You Out of Trouble and Make You a Better Person", + "t": "Companion Planting Made Simple For Beginners: Strategies to Improve Soil Enrichment, Maximize Garden Space, and Learn Natural Pest Control for a Healthier Bountiful Harvest (English Edition)", "a": null, - "u": "/Indoor-Succulent-Care-Beginners-Trouble-ebook/dp/B089KKHQ3Y" + "u": "https://www.amazon.de/Companion-Planting-Made-Simple-Beginners-ebook/dp/B0D5BC6G28" }, "39": { - "t": "Backyard Herb Gardening: How To Grow Herbs From Your Backyard and Use It For Everyday Life", + "t": "Ground Force Container Gardening", "a": null, - "u": "/Backyard-Herb-Gardening-Herbs-Everyday-ebook/dp/B01N788EJU" + "u": "https://www.amazon.de/Ground-Container-Gardening-Charlie-Dimmock/dp/0563488093" }, "40": { - "t": "Hydroponics: The Definitive Beginner’s Guide to 
Quickly Start Growing Vegetables, Fruits, & Herbs for Self-Sufficiency! (Gardening, Organic Gardening, Homesteading, Horticulture, Aquaculture)", + "t": "The Gardener's Almanac: A stunning month-by-month treasury of gardening wisdom and inspiration from the nation's best-loved gardener", "a": null, - "u": "/Hydroponics-Self-Sufficiency-Homesteading-Horticulture-Aquaculture-ebook/dp/B01BRQ06IG" + "u": "https://www.amazon.de/Gardeners-Almanac-Treasury-Knowledge-Inspiration/dp/1529389410" }, "41": { - "t": "Radius Garden 30511 Ergonomic Garden Hand Tool Set, Red", + "t": "Quickfinder Gartenjahr: Der beste Zeitpunkt für jede Gartenarbeit (GU Gartenpraxis)", "a": null, - "u": "https://www.amazon.com/Radius-Garden-30511-Ergonomic-Gardening/dp/B0785NCG16" + "u": "https://www.amazon.de/Quickfinder-Gartenjahr-Zeitpunkt-Gartenarbeit-Garten/dp/3833853980" }, "42": { - "t": "BIRASIL Outdoor Galvanized Raised Beds, Steel Garden Boxes for Flower Vegetables, Metal Planter Box for Gardening (8x4x1FT, Silver)", + "t": "RHS Let's Get Gardening", "a": null, - "u": "https://www.amazon.com/BIRASIL-Outdoor-Galvanized-Vegetables-Gardening/dp/B0B4D8V87N" + "u": "https://www.amazon.de/Lets-Gardening-Royal-Horticultural-Society/dp/0241382637" }, "43": { - "t": "Gardening Log Book and Journal: Complete Garden Planner and Organizer, Plant Logs for Vegetables, Fruits, Flowers - Monthly Calendars (Garden Organizer for All Ages)", + "t": "The Urban Gardening Blueprint : How to Plan, Grow, and Thrive in Limited Spaces (English Edition)", "a": null, - "u": "https://www.amazon.com/Gardening-Log-Book-Journal-Vegetables/dp/B0B9QTH68Q" + "u": "https://www.amazon.de/Urban-Gardening-Blueprint-Limited-English-ebook/dp/B0DTFM4YSZ" }, "44": { - "t": "Amyove Garage Tool Storage Rack, Garden Tool Organizer,Utility Rack,Holds Yard Tools, Wheeled for Garage, Shed, Outdoor, Garden Tool Stand", + "t": "Gardening for Beginners: 1", "a": null, - "u": 
"https://www.amazon.com/Amyove-Storage-Organizer-Utility-Wheeled/dp/B0BX46TVN5" + "u": "https://www.amazon.de/Gardening-Beginners-Abigail-Wheatley/dp/140955015X" }, "45": { - "t": "Womens I Love Gardening So Much I Wet My Plants Tshirt Cute Summer Tee", + "t": "RHS Encyclopedia of Gardening", "a": null, - "u": "https://www.amazon.com/Gardening-Crazy-Dog-T-Shirts-Comfortable/dp/B07RT1VP3Y" + "u": "https://www.amazon.de/RHS-Encyclopedia-Gardening-Christopher-Brickell/dp/1409383946" }, "46": { - "t": "Ironton Steel Garden Cart - 400-Lb. Capacity, 38in.L x 18 1/2in.W x 21in.H", + "t": "MINIGÄRTEN: Gardening-Ideen auf kleinstem Raum", "a": null, - "u": "https://www.amazon.com/Ironton-Steel-Garden-Cart-Capacity/dp/B0BVGN1MFW" + "u": "https://www.amazon.de/MINIGÄRTEN-Gardening-Ideen-auf-kleinstem-Raum/dp/3965630644" }, "47": { - "t": "Miracle Gro Women's High-Dexterity Gardening Work Gloves, Touchscreen Technologies, Abrasion Resistant, Comfort, Blue/Tan, Medium, (MG86207/WML)", + "t": "Tropical Plants: For Home and Garden", "a": null, - "u": "https://www.amazon.com/Miracle-Gro-High-Dexterity-Technologies-MG86207-WML/dp/B09SV9L6M3" + "u": "https://www.amazon.de/Tropical-Plants-Garden-William-Warren/dp/0500017956" }, "48": { - "t": "Florida Gardening Planner 2023: Florida month by month edible garden planner and reference guide for abundant harvests and self-sufficiency. 
(Southern Garden Regional Gardening Planners)", + "t": "Beatrix Potter's Gardening Life: The Plants and Places That Inspired the Classic Children's Tales", "a": null, - "u": "https://www.amazon.com/Florida-Gardening-Planner-2023-self-sufficiency/dp/1946050245" + "u": "https://www.amazon.de/Beatrix-Potters-Gardening-Life-Childrens/dp/1604693630" }, "49": { - "t": "Kate Spade New York Make It Pop Floral Box, 0.85 LB, Green/Navy", + "t": "Guerilla Gardening: Ein botanisches Manifest", "a": null, - "u": "https://www.amazon.com/Kate-Spade-Make-Floral-Green/dp/B0B88T3GBH" + "u": "https://www.amazon.de/Guerilla-Gardening-Richard-Reynolds/dp/3936086443" }, "50": { - "t": "Baby Baby", + "t": "RHS How to Create your Garden: Ideas and Advice for Transforming your Outdoor Space", "a": null, - "u": "https://www.amazon.com/Gardening/dp/B0CBD86VPX" + "u": "https://www.amazon.de/RHS-How-Create-your-Garden/dp/0241332311" }, "51": { - "t": "The Martha Stewart Gardening Collection - Martha's Spring Garden [DVD]", + "t": "Grow Containers: Essential Know-how and Expert Advice for Gardening Success", "a": null, - "u": "https://www.amazon.com/Martha-Stewart-Gardening-Collection-Marthas/dp/B000E8M0WY" + "u": "https://www.amazon.de/Grow-Containers-Essential-know-how-gardening/dp/024143582X" }, "52": { - "t": "Mykos RT4401 Mycorrhizae, Granular, 1-Lb. - Quantity 1", + "t": "Seasons at Highclere: Gardening, Growing, and Cooking through the Year at the Real Downton Abbey", "a": null, - "u": "https://www.amazon.com/Mykos-RT4401-Mycorrhizae-Granular-1-Lb/dp/B08QV8M84N" + "u": "https://www.amazon.de/Seasons-Highclere-Gardening-Growing-Cooking/dp/1529135583" }, "53": { - "t": "Fiskars 384490-1001 Ergo Garden Tool Set, Regular Package, Black/Orange", + "t": "Relaxdays Kniebank für Gartenarbeit, klappbar, bis 150kg, inkl. 
2 Taschen, gepolsterte Kniehilfe, 50x60x27,5 cm, schwarz", "a": null, - "u": "https://www.amazon.com/Fiskars-384490-1001-Garden-Piece-Orange/dp/B07DLQNBS9" + "u": "https://www.amazon.de/Relaxdays-Kniebank-Gartenarbeit-gepolsterte-Kniehilfe/dp/B08LN665NJ" }, "54": { - "t": "Gardener's Log Book:2019 A 5-Year Planner (New York Botanical Garden)", + "t": "No-Waste Organic Gardening: Eco-friendly Solutions to Improve any Garden (No-Waste Gardening)", "a": null, - "u": "https://www.amazon.com/Gardeners-Log-Book-Planner-Botanical/dp/1524759074" + "u": "https://www.amazon.de/No-Waste-Organic-Gardening-Eco-Friendly-Solutions/dp/0760367647" }, "55": { - "t": "New Stens OEM Replacement Belt 266-124 Compatible with/Replacement for John Deere M48444", + "t": "Gardening for Life - The Biodynamic Way: A Practical Introduction to a New Art of Gardening, Sowing, Planting, Harvesting (Art & Science)", "a": null, - "u": "https://www.amazon.com/Stens-266-124-Replacement-Belt-Black/dp/B07G7KT5FK" + "u": "https://www.amazon.de/Gardening-Life-Biodynamic-Way-Science/dp/1869890329" }, "56": { - "t": "Burgon & Ball KPADPLUM Plum Kneelo GKN Ultra-Cushion Kneeler/Gardening Knee Pad", + "t": "Martha Stewart's Gardening Handbook: The Essential Guide to Designing, Planting, and Growing (English Edition)", "a": null, - "u": "https://www.amazon.com/Burgon-Ball-GKN-KPADPLUM-Kneelo/dp/B06WGLDGWC" + "u": "https://www.amazon.de/Martha-Stewarts-Gardening-Handbook-Essential-ebook/dp/B0DKS76QB2" }, "57": { - "t": "Outdoor Garden Cleaning Shovel Multi-Functional Steel Flat Shovel Ice Shovel Weeding Planting Farm Weeding Tool (S: 21X25.5X3.4CM)", + "t": "On Guerrilla Gardening: A handbook for gardening without boundaries", "a": null, - "u": "https://www.amazon.com/Outdoor-Cleaning-Multi-Functional-Weeding-Planting/dp/B0BH4L53PK" + "u": "https://www.amazon.de/Guerrilla-Gardening-Handbook-Without-Boundaries/dp/0747590818" }, "58": { - "t": "Garden Jewels: The Comprehensive Guide to Ornamental Planting. 
(The A-Z of Gardening Plants)", + "t": "Zuhause im Skandi-Garten. Inspiration, Dekoideen und Rezepte von Januar bis Dezember: Beate Balz zu Gast bei Andrea Schliep und Sebastian Streich", "a": null, - "u": "/Garden-Jewels-Comprehensive-Ornamental-Planting-ebook/dp/B0CH4TYY7R" + "u": "https://www.amazon.de/Zuhause-Skandi-Garten-Inspiration-Dekoideen-Dezember/dp/3735852238" }, "59": { - "t": "RAISED BED GARDENING: A DIY GUIDE TO RAISED BED GARDENING", + "t": "Solar Gardening: Growing Vegetables Year-Round the American Intensive Way (The Real Goods Independent Living Books)", "a": null, - "u": "/RAISED-BED-GARDENING-DIY-GUIDE-ebook/dp/B0797MYX5C" + "u": "https://www.amazon.de/Solar-Gardening-Vegetables-Year-Round-Independent/dp/0930031695" } }, "q": "gardening", - "ctry": "de", - - // this is a limitation of the current test setup: - // in production, it should be the doublefetch URL, which would be: "https://www.amazon.de/s/?field-keywords=gardening" - "qurl": "https://www.amazon.com/s?k=gardening&ref=nb_sb_noss_1" + "qurl": "https://www.amazon.de/s?k=gardening", + "ctry": "de" } } ] -} +} \ No newline at end of file diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/test-2023-10-10/page.html.gz b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/test-2023-10-10/page.html.gz index 0d70f7d5..d0540f5d 100644 Binary files a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/test-2023-10-10/page.html.gz and b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/test-2023-10-10/page.html.gz differ diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/test-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/test-2023-10-10/scenario.json index 11d4f9cb..07718afe 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/test-2023-10-10/scenario.json +++ 
b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/am/test-2023-10-10/scenario.json @@ -1,6 +1,5 @@ { - "url": "https://www.amazon.de/s/ref=nb_sb_noss/261-9879674-9680331?__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&url=search-alias%3Daps&field-keywords=test", - + "url": "https://www.amazon.de/s?k=test", "mustContain": [ { "type": "wdp", @@ -8,313 +7,310 @@ "payload": { "r": { "0": { - "t": "Accu Chek Pro INSTANT 100 TEST STRIPS", - "a": null, - "u": "/Accu-Chek-Pro-INSTANT-STRIPS/dp/B09SZCL64P" + "t": "pDiagnostics 10x Corona Schnelltests | Nasaler COVID-19 Antigentest | Zuverlässig für neue Varianten 2025 | Hohe Sensitivität | Test für Laien", + "a": "Gesponsert", + "u": "/pDiagnostics-Schnelltests-Antigentest-Zuverlässig-Sensitivität/dp/B0DFWRWRR9" }, "1": { - "t": "Vitamin B12 Test Kit von CERASCREEN – Vitamin-B12-Spiegel einfach per Selbsttest von Zuhause bestimmen I Vitaminmangel erkennen | Holo-TC-Test | Zertifiziertes Labor & detaillierter Ergebnisbericht", - "a": null, - "u": "/Vitamin-Test-CERASCREEN-Vitamin-B12-Spiegel-Ergebnisbericht/dp/B07TS8DKYC" + "t": "Lebensmittelallergie Test von CERASCREEN - IgE-Analyse von 44 Lebensmitteln Testkit für Zuhause | Nahrungsmittelunverträglichkeit prüfen | Zertifiziertes Labor & detaillierter Ergebnisbericht", + "a": "Gesponsert", + "u": "/Lebensmittelallergie-Test-CERASCREEN-Nahrungsmittelunverträglichkeit-Ergebnisbericht/dp/B0DFCVN26Y" }, "2": { - "t": "Lebensmittelallergie Test von CERASCREEN - IgE-Analyse von 38 Lebensmitteln mit Selbsttest für Zuhause | Nahrungsmittelunverträglichkeit prüfen | Zertifiziertes Labor & detaillierter Ergebnisbericht", - "a": null, - "u": "/Lebensmittelallergie-Test-CERASCREEN-Ergebnisbericht-Nahrungsmittelunverträglichkeiten/dp/B08LDSYHMC" + "t": "Preventis SmarTest® Vitamin D | Vitamin D Test in 15 Min. 
| Nachweis von Vitamin D Mangel | Vitamin D Schnelltest für Zuhause (1 St.)", + "a": "Gesponsert", + "u": "/Preventis-SmarTest®-Vitamin-Nachweis-Schnelltest/dp/B0BQJBPBRJ" }, "3": { - "t": "Vitamin D Test Kit von CERASCREEN - Spiegel messen und Mangel erkennen mit dem Selbsttest für zu Hause | Zertifiziertes Labor | Konkrete Handlungsempfehlungen und Gesundheitsinformationen", - "a": null, - "u": "/cerascreen-Cerascreen-Vitamin-D-Testkit/dp/B088NJL1CV" + "t": "Eisen Test von CERASCREEN – Ferritinwert schnell & einfach mit Testkit von Zuhause bestimmen I Jetzt auf Eisenunterversorgung testen | Professionelle Laboranalyse | Detaillierte Ergebnisse", + "a": "Gesponsert", + "u": "/Eisenmangel-Test-von-CERASCREEN-Ferritinwert/dp/B08N55SXXW" }, "4": { - "t": "Hotgen Coronavirus (2019-nCoV) -Antigentest 20x 1er", + "t": "CorDx 4 in 1 Laien-Antigen Kombi-Test RSV Viren + Corona COVID-19 + Influenza A + B | Packung (1 Test)", "a": null, - "u": "https://www.amazon.de/Hotgen-Coronavirus-2019-nCoV-Antigentest-20x/dp/B002UCYKCU" + "u": "https://www.amazon.de/CorDx-Laien-Antigen-Kombi-Test-COVID-19-Influenza/dp/B0CP9MVMRQ" }, "5": { - "t": "NewGene COVID-19 Selbsttest für Laien, Antigen Schnelltest, 25 Stück, (5 x 5 Tests)", + "t": "10 Stück Corona Schnelltest Selbsttest - Nasen-Selbsttest für zu Hause SARS-CoV-2", "a": null, - "u": "https://www.amazon.de/NewGene-COVID-19-Selbsttest-Antigen-Schnelltest/dp/B0B7SN1HNM" + "u": "https://www.amazon.de/Stück-Corona-Schnelltest-Selbsttest-Nasen-Selbsttest/dp/B09QWH3T52" }, "6": { - "t": "Hotgen Coronavirus (2019-nCoV) - Covid 19 Antigen Schnelltest Corona 39 Stück", + "t": "fluorecare SARS-CoV-2, Influenza A/B & RSV Combo-Schnelltest, Corona schnelltest", "a": null, - "u": "https://www.amazon.de/Hotgen-Coronavirus-2019-nCoV-Antigen-Schnelltest/dp/B09W368MQS" + "u": "https://www.amazon.de/fluorecare®-SARS-CoV-2-Influenza-RSV-Combo-Schnelltest/dp/B0CGVY2W2D" }, "7": { - "t": "Hotgen Coronavirus (2019-nCoV) - Covid 19 Antigen Corona 
Schnelltest 100 Stück", + "t": "Hotgen Corona Schnelltest Selbsttest Antigentest auf SARS-CoV-2, 10 Stück (1er Pack)…", "a": null, - "u": "https://www.amazon.de/Hotgen-Coronavirus-2019-nCoV-Antigen-Schnelltest/dp/B09SS9S71L" + "u": "https://www.amazon.de/Schnelltest-Selbsttest-Antigentest-SARS-CoV-2-Eigenanwendung/dp/B0924W1CV5" }, "8": { - "t": "Hotgen Coronavirus (2019-nCoV) - Covid 19 Antigen Schnelltest Corona 400 Stück", + "t": "ExactSign Antigen Schnelltest - Nasal-Swab/Laientest 4 x 5er (20 Stück) MHD 08/2026", "a": null, - "u": "https://www.amazon.de/Hotgen-Coronavirus-2019-nCoV-Antigen-Schnelltest/dp/B09T3NKCDL" + "u": "https://www.amazon.de/ExactSign-Antigen-Schnelltest-Nasal-Swab-Laientest/dp/B0CR7T3PQB" }, "9": { - "t": "autotest VIH® von ratiopharm: Selbsttest zur schnellen und zuverlässigen Feststellung einer potentiellen HIV-Infektion. Diskret und einfach daheim durchführbar.", + "t": "NewGene COVID-19 Selbsttest für Laien, Antigen Schnelltest, 25 Stück, (5 x 5 Tests)", "a": null, - "u": "https://www.amazon.de/autotest-VIH®-ratiopharm-zuverlässigen-HIV-Infektion/dp/B07J9JJJ17" + "u": "https://www.amazon.de/NewGene-COVID-19-Selbsttest-Antigen-Schnelltest/dp/B0B7SN1HNM" }, "10": { - "t": "Clearblue Schwangerschaftstest Frühtest digital, Pregnancy Test, 1x Frühschwangerschaftstest / Schwangerschaftsfrühtest mit Wochenbestimmung, Schwangerschaft bestimmen, 25 mIU/ml", - "a": null, - "u": "https://www.amazon.de/Clearblue-Schwangerschaftstest-Anzeige-Wochen-digitaler/dp/B00D6B1900" + "t": "PSA Test von CERASCREEN – Unterstützt die Früherkennung von Prostataerkrankungen | Testkit für Zuhause | Professionelle Laboranalyse | Detaillierte Online-Ergebnisse & Handlungsempfehlungen", + "a": "Gesponsert", + "u": "/Test-CERASCREEN-Prostataerkrankungen-Professionelle-Ergebnisbericht/dp/B09N43VRF7" }, "11": { - "t": "The Test: A New Era For Australia's Team - Series 1", - "a": null, - "u": "https://www.amazon.de/Under-Siege/dp/B08BYY73XW" + "t": "Darmgesundheits 
Test von CERASCREEN – Gesundheitscheck Darmflora Plus | Stuhltest zur umfangreichen Untersuchung von 18 Darmbakterien und Pilzen (u.a. Candida) und dem pH-Wert des Stuhls", + "a": "Gesponsert", + "u": "/Darmgesundheits-Test-CERASCREEN-Gesundheitscheck-umfangreichen/dp/B0D1CJRMTR" }, "12": { - "t": "5er Set Covid-19 Antigen Schnelltests Corona Selbsttest SARS-CoV-19 (5 Tests)", - "a": null, - "u": "/Covid-19-Antigen-Schnelltests-Selbsttest-SARS-CoV-19/dp/B094YGDC3R" + "t": "ASPILOS Schilddrüse Schnelltest - Schilddrüsenunterfunktion Selbsttest für zu Hause - Ergebnis in 10 Minuten - Zuverlässig & Präzise - Labor-Qualität", + "a": "Gesponsert", + "u": "/ASPILOS-Schilddrüse-Schnelltest-Schilddrüsenunterfunktion-Labor-Qualität/dp/B0D5HTBYFY" }, "13": { - "t": "HIGHTOP 4+1 Grippe A+B/Rsv / 20 Stück", - "a": null, - "u": "/HIGHTOP-Grippe-Rsv-20-Stück/dp/B0CCPQ524B" + "t": "well2wellness® Pooltester Test Kit Chlor/pH mit je 20 Messtabletten DPD No. 1 (Chlor) + Phenol Red Rapid (pH)", + "a": "Gesponsert", + "u": "/well2wellness®-Pooltester-Chlor-Messtabletten-Phenol/dp/B0C48DJRW6" }, "14": { - "t": "Tropical 6 in 1 Test 50 Teststreifen, 1er Pack (1 x 12 g)", + "t": "pDiagnostics 20x Corona Schnelltests | Nasaler COVID-19 Antigentest | Zuverlässig für neue Varianten 2025 | Hohe Sensitivität | Test für Laien", "a": null, - "u": "/Tropical-Test-Teststreifen-1er-Pack/dp/B00FJZIR6A" + "u": "https://www.amazon.de/pDiagnostics-Schnelltests-Antigentest-Zuverlässig-Sensitivität/dp/B0DFWQ6YHL" }, "15": { - "t": "CITEST Corona Schnelltest für Zuhause, Testsíeger Stíftung Warentest, 5 Stück", + "t": "Clearblue Schwangerschaftstest Schnelle Erkennung, 5x Pregnancy Test, über 99 % zuverlässig mit FloorguardTM , Ergebnis innerhalb von 1 Minute", "a": null, - "u": "/CITEST-Schnelltest-Zuhause-Covid-19-Selbsttest/dp/B0BJ33Z7JQ" + "u": "https://www.amazon.de/Clearblue-Schwangerschaftstest-Erkennung-Pregnancy-zuverlässig/dp/B0CQJSL7Z6" }, "16": { - "t": "Vitamin D Test Kit von CERASCREEN - 
Spiegel messen und Mangel erkennen mit dem Selbsttest für zu Hause | Zertifiziertes Labor | Konkrete Handlungsempfehlungen und Gesundheitsinformationen", + "t": "Combur 5-Test HC Urinteststreifen 10 Stück", "a": null, - "u": "https://www.amazon.de/cerascreen-Cerascreen-Vitamin-D-Testkit/dp/B088NJL1CV" + "u": "https://www.amazon.de/Combur-5-Test-HC-Urinteststreifen-Stück/dp/B001GK1SXQ" }, "17": { - "t": "Clearblue Schwangerschaftstest Frühe Erkennung, Frühtest, Pregnancy Test, 2x Frühschwangerschaftstest / Schwangerschaftsfrühtest, über 99 % zuverlässig, Schwangerschaft bestimmen, 25 mIU/ml", + "t": "HIGHTOP SARS-CoV-2 Rapid Test (10 Stück einzeln verpackt), Corona Schnelltest, Antigen Schnelltest", "a": null, - "u": "https://www.amazon.de/Clearblue-Schwangerschaftstest-Schnell-Einfach-Tests/dp/B016I7W61Y" + "u": "https://www.amazon.de/HIGHTOP-SARS-CoV-2-einzeln-verpackt-Schnelltest/dp/B0CQ5JB1J2" }, "18": { - "t": "ZuhauseTEST Schilddrüse - TSH Test zur Feststellung einer Schilddrüsenunterfunktion", - "a": null, - "u": "https://www.amazon.de/NanoRepro-AG-ZUHAUSE-TEST-Schilddrüse/dp/B08WH7HHCR" + "t": "Leaky Gut Test von CERASCREEN - Leaky Gut Syndrom untersuchen | Durchlässiger Darm Test | Stuhltest einfach von Zuhause durchführen | Professionelle Analyse im medizinischen Labor", + "a": "Gesponsert", + "u": "/Leaky-Test-CERASCREEN-Durchlässiger-Professionelle/dp/B08CKXKCSL" }, "19": { - "t": "Clearblue Kinderwunsch Ovulationstest Kit, 20 Tests + 1 digitale Testhalterung, Fruchtbarkeitstest für Frauen / Eisprung, Fortschrittlich & Digital (testet 2 Hormone), schneller schwanger werden", - "a": null, - "u": "https://www.amazon.de/Clearblue-Ovulationstest-Fortschrittlich-Digital-Tests/dp/B077Q5FYX1" + "t": "Miniatur-Set zum testen - 24x 20 ml Überraschungspacket Warda Saunaduftkonzentrat Adventskalenderfüllung", + "a": "Gesponsert", + "u": "/Miniatur-Set-testen-Überraschungspacket-Saunaduftkonzentrat-Adventskalenderfüllung/dp/B07XPVNQ7K" }, "20": { - "t": "Vitamin D 
Test - Doppelpackung von CERASCREEN | Vitamin-D-Spiegel zu Hause messen und Mangel erkennen - Spare jetzt im Kombi- Paket | Professionelle Laboranalyse mit konkreten Handlungsinformationen", - "a": null, - "u": "/Vitamin-Test-Doppelpackung-Professionelle-Laboranalyse/dp/B094D3D6CH" + "t": "cerascreen® Serotonin Test Kit – Serotonin Spiegel schnell & einfach per Selbsttest von Zuhause bestimmen | Serotoninunterversorgung | Jetzt Serotoninspiegel online messen", + "a": "Gesponsert", + "u": "/cerascreen®-Serotonin-Test-Serotoninmangel-Serotoninspiegel/dp/B016V1QKLO" }, "21": { - "t": "Omega 3 Test von CERASCREEN - Fettsäuren-Analyse | Verhältnis von Omega-3- zu Omega-6-Fettsäuren per Selbsttest bequem von zu Hause bestimmen | Zertifiziertes Fachlabor | Detaillierter Ergebnisbericht", - "a": null, - "u": "/Omega-Test-CERASCREEN-Omega-6-Fettsäuren-Professionelles/dp/B095C386GR" + "t": "Histamin-Intoleranz Test Kit von CERASCREEN – Histamin-Unverträglichkeit einfach von Zuhause per Selbsttest bestimmen | Diaminoxidase-Wert (DAO) I Zertifiziertes Labor I Detaillierter Ergebnisbericht", + "a": "Gesponsert", + "u": "/Histamin-Intoleranz-Test-CERASCREEN-Histamin-Unverträglichkeit-Ergebnisbericht/dp/B07TXHKH31" }, "22": { - "t": "1x Sammelbecher Gurgeltest Röhrchen Behälter für Schule Test Kinder nach WICOVIR Studie", + "t": "Hotgen Coronavirus (2019-nCoV) - Covid 19 Antigen Schnelltest Corona 39 Stück", "a": null, - "u": "/Sammelbecher-Gurgeltest-Röhrchen-Behälter-WICOVIR/dp/B0968L4P5C" + "u": "https://www.amazon.de/Hotgen-Coronavirus-2019-nCoV-Antigen-Schnelltest/dp/B09W368MQS" }, "23": { - "t": "Großer Allergie Test von CERASCREEN - Umfangreiche Laboruntersuchung von 61 Allergenen wie Lebensmittel, Pollen und Katzen | Selbsttest für zu Hause | IgE-Antikörper | Detaillierter Ergebnisbericht", + "t": "autotest VIH® von ratiopharm: Selbsttest zur schnellen und zuverlässigen Feststellung einer potentiellen HIV-Infektion. 
Diskret und einfach daheim durchführbar.", "a": null, - "u": "/Großer-Allergie-CERASCREEN-Laboruntersuchung-Handlungsempfehlungen/dp/B09N3TZMLC" + "u": "https://www.amazon.de/autotest-VIH®-ratiopharm-zuverlässigen-HIV-Infektion/dp/B07J9JJJ17" }, "24": { - "t": "The Ultimate TOEFL iBT Test Prep Savings Bundle, Third Edition", + "t": "EUROPAPA® 40x Corona Laientest Selbsttest Covid-19 Antigentest auf SARS-CoV-2 Schnelltest zur Eigenanwendung Testkassete Probentupfer Antigenextrakt einzelverpackt", "a": null, - "u": "https://www.amazon.de/Ultimate-TOEFL-Savings-Bundle-Third/dp/1260473473" + "u": "https://www.amazon.de/Antigentest-Eigenanwendung-Probentupfer-Antigenextrakt-einzelverpackt/dp/B0DP2YF7M8" }, "25": { - "t": "ZuhauseTEST Gesunder Magen - Helicobacter Pylori Schnelltest", + "t": "30 Stück One+Step Ovulationstest mit optimaler Sensitivität - Fruchtbarkeitstest für Frauen - Ovulationstests für Eisprung", "a": null, - "u": "https://www.amazon.de/ZuhauseTEST-Schnelltest-Antikörper-Helicobacter-Magenschmerzen/dp/B08WHPX72F" + "u": "https://www.amazon.de/One-Step-Ovulationstests-optimaler-Sensitivität/dp/B019DQABHI" }, "26": { - "t": "Official Guide to the TOEFL Test", + "t": "fluorecare 1 x Corona Schnelltest Quick Influenza A/B & RSV | 4 in 1 Quick Home use Test | corona schnelltest neue variante 2024 1er pack", "a": null, - "u": "https://www.amazon.de/Official-Guide-Educational-Testing-Service/dp/1260470350" + "u": "https://www.amazon.de/fluorecare-Schnelltest-Influenza-schnelltest-variante/dp/B0BFBLH9Q7" }, "27": { - "t": "TFA Dostmann Batterietester BatteryCheck, 98.1126.01, für Batterien und Akkus(AAA,AA,C,D),Knopfzelle, Blockbatterie, Anzeige des Ladestatus/Volt, einfach und schnell,schwarz", + "t": "OneTouch Ultra Plus Reflect Start-Set, Blutzuckermessgerät für Diabetes, inkl. 
1 Blutzucker-Messgerät (mmol/L), 40 Teststreifen, 1 Stechhilfe, 40 Lanzetten, 1 Etui, 2 Batterien", "a": null, - "u": "https://www.amazon.de/TFA-Dostmann-Batterietester-98-1126-01-Blockbatterie-schwarz/dp/B08XQNZWYP" + "u": "https://www.amazon.de/OneTouch-Behandlung-Zucker-Krankheit-Blutzucker-Messgerät-Teststreifen/dp/B0BFC3T9QH" }, "28": { - "t": "Tetra Test 6in1 - Wassertest für das Aquarium, schnelle und einfache Überprüfung der Wasserqualität, 1 Dose (25 Teststreifen)", + "t": "12×SAFECARE BIO-TECH Covid-19 & Influenza A+B Antigen Combo Rapid Test 3in1 (1er)", "a": null, - "u": "https://www.amazon.de/Tetra-Wassertest-Aquarium-Überprüfung-Wasserqualität/dp/B001B65ZVQ" + "u": "https://www.amazon.de/12×SAFECARE-BIO-TECH-Covid-19-Influenza-Antigen/dp/B0BVZV82J6" }, "29": { - "t": "harren24 Testkit Set inkl. 60 Testtabletten (Rapid), Wassertester und Aufbewahrungsbox, 30x DPD1 Chlor/Brom + 30x Phenol Red pH-Wert, Pool Wasseranalyse (60 Tablets + Tester)", + "t": "Clearblue Schwangerschaftstest Frühe Erkennung, Frühtest, Pregnancy Test, 2x Frühschwangerschaftstest/Schwangerschaftsfrühtest, über 99% zuverlässig, Schwangerschaft bestimmen, 25 mIU/ml", "a": null, - "u": "https://www.amazon.de/harren24-Testtabletten-Wassertester-Aufbewahrungsbox-Wasseranalyse/dp/B0C3R15GL4" + "u": "https://www.amazon.de/Clearblue-Schwangerschaftstest-Frühschwangerschaftstest-Schwangerschaftsfrühtest-Schwangerschaft/dp/B016I7W61Y" }, "30": { - "t": "sera 04942 Silikat Test (SiO3), Wassertest, misst zuverlässig und genau den Silikatgehalt, die Ursache bei Kieselalgen, für Süß- & Meerwasser, im Aquarium oder Teich, farblos", + "t": "Hotgen Coronavirus (2019-nCoV) - Covid 19 Antigen Corona Schnelltest 100 Stück", "a": null, - "u": "https://www.amazon.de/sera-04942-SiO3-Test-Bestimmungen-Kieselalgen/dp/B001O68DQW" + "u": "https://www.amazon.de/Hotgen-Coronavirus-2019-nCoV-Antigen-Schnelltest/dp/B09SS9S71L" }, "31": { - "t": "harren24 Testtabletten für Schütteltester (Rapid) zur chemischen 
Wasseranalyse von pH-Wert (Phenol red) und freiem Chlor (DPD1) (100 Tablets)", + "t": "COMBUR 9 Test M Teststreifen", "a": null, - "u": "https://www.amazon.de/harren24-Testtabletten-Schütteltester-chemischen-Wasseranalyse/dp/B0C27XYQRZ" + "u": "https://www.amazon.de/COMBUR-Test-Teststreifen-100-St/dp/B0041MKU4E" }, "32": { - "t": "LPA Test Bayern: Einstellungstest öffentlicher Dienst in Bayern bestehen | 1.200 Aufgaben mit Lösungen | Mittlerer (2. QE) & gehobener (3. QE) Dienst | Allgemeinwissen, Logik, Konzentration, Sprache", + "t": "Fluorecare 4in1 Corona- & Grippetest (NEUE VARIANTE 2025) | Influenza A/B, RSV & COVID-19 Schnelltest | Laien-Selbsttest für Zuhause (4 Stück)", "a": null, - "u": "https://www.amazon.de/LPA-Test-Bayern-Einstellungstest-Konzentration/dp/3948144478" + "u": "https://www.amazon.de/Fluorecare-Grippetest-Influenza-Schnelltest-Laien-Selbsttest/dp/B0DV9TNHXN" }, "33": { - "t": "harren24 Photometer Testtabletten Chlor DPD1, DPD3, Aktivsauerstoff DPD4, pH-Wert Phenol Red, Alkalinity M, CYA Test, Pool Wassertester, 50x, 100x, 250x, 500x (DPD 1 - freies Chlor, 100 Tablets)", + "t": "CodeFree Blutzuckerteststreifen 50 Stück Diabetes-Tests zur Kontrolle bei Blutzucker (Zuckerkrankheit)", "a": null, - "u": "https://www.amazon.de/harren24-Photometer-Testtabletten-Aktivsauerstoff-Wassertester/dp/B0C1CBYM81" + "u": "https://www.amazon.de/CodeFree-Blutzuckerteststreifen-Diabetes-Tests-Kontrolle-Blutzucker/dp/B082XHVZCH" }, "34": { - "t": "Mellerud Schimmelpilz Schnelltest | 1 x Schimmeltest | Mikrobiologischer Test zur Erkennung von Schimmelbefall", + "t": "Tetra Test 6in1 - Wassertest für das Aquarium, schnelle und einfache Überprüfung der Wasserqualität, 1 Dose (25 Teststreifen)", "a": null, - "u": "https://www.amazon.de/Mellerud-2001009205-MELLERUD-Schimmelpilz-Test/dp/B004530EA4" + "u": "https://www.amazon.de/Tetra-Wassertest-Aquarium-Überprüfung-Wasserqualität/dp/B001B65ZVQ" }, "35": { - "t": "Ottolenghi Test Kitchen: Extra Good Things", + "t": 
"Hotgen Coronavirus (2019-nCoV) - Covid 19 Antigen Schnelltest Corona 60 Stück", "a": null, - "u": "https://www.amazon.de/Ottolenghi-Test-Kitchen-Extra-Things/dp/1529109477" + "u": "https://www.amazon.de/Hotgen-Coronavirus-2019-nCoV-Antigen-Schnelltest/dp/B09VYKBMJF" }, "36": { - "t": "Testtrainer LPA-Test Bayern: Fit für den Beamtentest – mittlerer Dienst (2. QE) und gehobener Dienst (3. QE) | Über 800 Aufgaben mit allen Lösungswegen | Einstellungstest Öffentlicher Dienst üben", + "t": "JBL 2414400 Aquarienwasser-Teststreifen, 50 Teststreifen, PROAQUATEST EASY 7in1", "a": null, - "u": "https://www.amazon.de/Testtrainer-LPA-Test-Bayern-Lösungswegen-Einstellungstest/dp/3956241177" + "u": "https://www.amazon.de/JBL-ProAquaTest-Easy-7in1-Teststreifen/dp/B07R5WDK5M" }, "37": { - "t": "NewGene COVID-19 Selbsttest für Laien, Antigen Schnelltest, 50 Stück, (10x 5 Tests)", + "t": "CYD® Check Your Drink 10 Teststreifen zum Nachweis von Ketamin und GHB (K.O. Tropfen) in Getränken", "a": null, - "u": "https://www.amazon.de/NewGene-COVID-19-Selbsttest-Antigen-Schnelltest/dp/B0B7SDYLFH" + "u": "https://www.amazon.de/Teststreifen-Nachweis-Ketamin-Tropfen-Getränken/dp/B0BZ587X1M" }, "38": { - "t": "NewGene COVID-19 Selbsttest für Laien, Antigen Schnelltest, 25 x 1 Stück", + "t": "JBL Wassertest-Koffer, Mit 14 Tests und Zubehör, Für Süßwasser-Aquarien und Leitungswasser, ProAquaTest Lab", "a": null, - "u": "https://www.amazon.de/NewGene-COVID-19-Selbsttest-Antigen-Schnelltest/dp/B0B7SKCDTY" + "u": "https://www.amazon.de/JBL-Wassertest-Koffer-Süßwasser-Aquarien-Leitungswasser-ProAquaTest/dp/B07QNYNS7T" }, "39": { - "t": "NewGene COVID-19 Selbsttest für Laien, Antigen Schnelltest, 100 Stück, (20x 5 Tests)", + "t": "EKNA Cannabis Test 3,5 ng - 2x Polizei Drogenschnelltest - Drogentest THC - Drogentest Speichel - THC Schnelltest für Fahrtauglichkeit - Zuverlässige & Sichere Ergebnisse (2 Stück)", "a": null, - "u": 
"https://www.amazon.de/NewGene-COVID-19-Selbsttest-Antigen-Schnelltest/dp/B0B7SCVB4B" + "u": "https://www.amazon.de/EKNA-Cannabis-Test-Drogenschnelltest-Fahrtauglichkeit/dp/B0DJRF58PP" }, "40": { - "t": "Lebensmittel-Reaktionstest von CERASCREEN – Allergien und Unverträglichkeiten von 40 verschiedenen Lebensmitteln testen | Weizen, Milch, Nuss | Zertifiziertes Labor I Detaillierter Ergebnisbericht", + "t": "CYD® Check Your Drink 5 Teststreifen zum Nachweis von Ketamin und GHB (K.O. Tropfen) in Getränken", "a": null, - "u": "https://www.amazon.de/Lebensmittel-Reaktionstest-CERASCREEN-Unverträglichkeiten-Zertifiziertes-Ergebnisbericht/dp/B08PPVDNGC" + "u": "https://www.amazon.de/Teststreifen-Nachweis-Ketamin-Tropfen-Getränken/dp/B0BZ55KW8C" }, "41": { - "t": "Gezielt vorbereitet - Originalaufgaben des LPA-Tests mit kommentierten Lösungen - Ideal für die Prüfungsvorbereitung: 3. Qualifikationsebene – Ausgabe ... für die Einstellungsjahre 2021, 2022 und 2023", + "t": "JBL ProScan 25420 Wassertest mit Smartphoneauswertung für Süßwasser Aquarien, 1 Stück (1er Pack)", "a": null, - "u": "https://www.amazon.de/Gezielt-vorbereitet-Prüfungsvorbereitung-Qualifikationsebene-Einstellungsjahre/dp/3896505491" + "u": "https://www.amazon.de/JBL-Wassertest-Smartphoneauswertung-Süßwasser-Aquarien/dp/B00R5S9EQ6" }, "42": { - "t": "Pontes Gesamtband 1: Fit für Tests und Klassenarbeiten. Arbeitsheft mit Lösungen 1. Lernjahr (Pontes Gesamtband. 
Ausgabe 2020)", + "t": "test Jahrbuch 2025 - Der Ratgeber für die besten Produkte und die optimale Kaufentscheidung, Überblick über zahlreiche Produkte mit ehrlichen Bewertungen: Mehr als 100 Tests und Reports", "a": null, - "u": "https://www.amazon.de/Pontes-Gesamtband-Klassenarbeiten-Arbeitsheft-Mediensammlung/dp/3126233201" + "u": "https://www.amazon.de/test-Jahrbuch-2025-Tests-Reports/dp/3747108202" }, "43": { - "t": "NanoRepro ZuhauseTEST Vaginalpilz, Schnelltest für Antikörper gegen Scheidenpilz, einfacher vaginaler Test für Frauen bei Symptomen wie Juckreiz oder Schmerzen beim Urinieren", + "t": "2𝟬 𝘅 𝗖𝗼𝗿𝗼𝗻𝗮 + 𝗜𝗻𝗳𝗹𝘂𝗲𝗻𝘇𝗮 𝗔/𝗕 𝟯𝗶𝗻𝟭 Test | 𝗠𝗛𝗗 𝟬𝟭/𝟮𝟲 | Set für Zuhause mit E-Book: Bleiben Sie gesund! by Saloot® | Grippe Influenza Covid-19 Corona Selbsttest Antigentest", "a": null, - "u": "https://www.amazon.de/ZUHAUSE-TEST-Vaginalpilz-1-Stück/dp/B07NJS3YZ3" + "u": "https://www.amazon.de/𝗖𝗼𝗿𝗼𝗻𝗮-𝗜𝗻𝗳𝗹𝘂𝗲𝗻𝘇𝗮-𝟯𝗶𝗻𝟭-𝗠𝗛𝗗-Zuhause/dp/B0DJRPW1R4" }, "44": { - "t": "sera 04110 Gesamthärte Test (GH), misst zuverlässig und genau die Gesamthärte, für Süßwasser, im Aquarium oder Teich, 15 ml (1er Pack)", + "t": "Basiswissen KI-Testen: Qualität von und mit KI-basierten Systemen Aus- und Weiterbildung zum »Certified Tester AI Testing«– Foundation Level Specialist nach ISTQB®-Standard", "a": null, - "u": "https://www.amazon.de/sera-04110-gH-Test-Gesamthärte-zuverlässig/dp/B00178JA4Q" + "u": "https://www.amazon.de/Basiswissen-KI-Testen-KI-basierten-Weiterbildung-ISTQB®-Standard/dp/3864909473" }, "45": { - "t": "NewGene COVID-19 Selbsttest für Laien, Antigen Schnelltest, 5 Stück", + "t": "Salifert Calcium (Ca) Test Kit - 50 to 100 Tests by", "a": null, - "u": "https://www.amazon.de/NEWGENE-Covid-19-Coronatest-Nasenabstich-Selbsttest/dp/B09LH8F397" + "u": "https://www.amazon.de/Salifert-Calcium-Ca-Test-Kit/dp/B001EIZT4Y" }, "46": { - "t": "Übungsheft mit Tests im Sachunterricht 4. 
Klasse: Echte Klassenarbeiten mit Punktevergabe und Lösungen für den Übertritt - Sachkunde (Lernzielkontrollen, Band 404)", + "t": "Hotgen Coronavirus (2019-nCoV) - Covid 19 Antigen Schnelltest Corona 80 Stück", "a": null, - "u": "https://www.amazon.de/Tests-Sachunterricht-Lernzielkontrollen-Klasse-Proben/dp/3881004041" + "u": "https://www.amazon.de/Hotgen-Coronavirus-2019-nCoV-Antigen-Schnelltest/dp/B09W38VN81" }, "47": { - "t": "Übungsheft mit Tests in Deutsch 4. Klasse: Echte Klassenarbeiten mit Punktevergabe und Lösungen für den Übertritt - Rechtschreibung, Grammatik und Lesen üben (Lernzielkontrollen, Band 284)", + "t": "CTEK MXS 5.0 Test & Charge, Batterieladegerät 12V, Batteriepfleger, Ladegerät Auto Und LKW Ladegerät, Testet Batterie Und Lichtmaschine, Entsulfatierungsprogramm Und Rekonditionierungsmodus", "a": null, - "u": "https://www.amazon.de/Tests-Deutsch-Lernzielkontrollen-Klasse-Klassenarbeiten/dp/3881002847" + "u": "https://www.amazon.de/CTEK-Batterieladegerät-Batteriepfleger-Entsulfatierungsprogramm-Rekonditionierungsmodus/dp/B00ARU3M5S" }, "48": { - "t": "sera 04210 Karbonathärte Test (KH), misst zuverlässig und genau die Karbonathärte, für Süß- & Meerwasser, im Aquarium oder Teich, 15 ml (1er Pack)", + "t": "TFA Dostmann Batterietester BatteryCheck, 98.1126.01, für Batterien und Akkus(AAA,AA,C,D),Knopfzelle, Blockbatterie, Anzeige des Ladestatus/Volt, einfach und schnell,schwarz", "a": null, - "u": "https://www.amazon.de/sera-04210-kH-Test-Bestimmung-Karbonathärte/dp/B0013ZEFPW" + "u": "https://www.amazon.de/TFA-Dostmann-Batterietester-98-1126-01-Blockbatterie-schwarz/dp/B08XQNZWYP" }, "49": { - "t": "Übungsheft mit Tests in Deutsch - Aufsatz Gymnasium 5. 
Klasse: Echte Klassenarbeiten mit Punktevergabe und Lösungen (Lernzielkontrollen, Band 285)", + "t": "Roche Combur - 7 Test, 100 Stück", "a": null, - "u": "https://www.amazon.de/Übungsheft-Tests-Deutsch-Klassenarbeiten-Lernzielkontrollen/dp/3881002855" + "u": "https://www.amazon.de/Economed-Roche-Combur-7-Test-100/dp/B002ZH676C" }, "50": { - "t": "Prüfung Express – Deutsch-Test für Zuwanderer A2–B1: Neuausgabe.Deutsch als Zweitsprache / Übungsbuch mit Audios online", + "t": "Übungsheft mit Tests in Mathe 3. Klasse: Echte Klassenarbeiten mit Punktevergabe und Lösungen - Rechnen üben (Lernzielkontrollen, Band 83)", "a": null, - "u": "https://www.amazon.de/Prüfung-Express-Deutsch-Test-Neuausgabe-Deutsch-Zweitsprache/dp/3198016517" + "u": "https://www.amazon.de/Tests-Mathe-Lernzielkontrollen-Klasse-Klassenarbeiten/dp/3881000836" }, "51": { - "t": "25x Longsee", + "t": "Übungsheft mit Tests in Mathe 4. Klasse: Echte Klassenarbeiten mit Punktevergabe und Lösungen - Rechnen üben für den Übertritt (Lernzielkontrollen, Band 84)", "a": null, - "u": "https://www.amazon.de/parahealth-25x-Longsee/dp/B0C74R6ZZB" + "u": "https://www.amazon.de/Tests-Mathe-Lernzielkontrollen-Lernzielkontrolle-Klassenarbeiten/dp/3881000844" }, "52": { - "t": "Tetra Pond Test 6in1 - Wassertest für den Teich, schnelle und einfache Überprüfung der Wasserqualität im Gartenteich, 1 Dose (25 Teststreifen)", + "t": "Adeste – 5 x 1er Corona Schnelltest für Zuhause COVID 19 Antigen Rapid Test Swab Selbsttest. Geprüft für alle neuen 2024 Varianten. 
Zertifiziert für den Heimgebrauch", "a": null, - "u": "https://www.amazon.de/Tetra-Teststreifen-Bestimmung-Wasserwerten-Gartenteich/dp/B004UAFVEG" + "u": "https://www.amazon.de/Adeste-Schnelltest-Zuhause-Covid-19-Selbsttest/dp/B09HVCKFTB" }, "53": { - "t": "Vitamin B12 Test Kit von CERASCREEN – Vitamin-B12-Spiegel einfach per Selbsttest von Zuhause bestimmen I Vitaminmangel erkennen | Holo-TC-Test | Zertifiziertes Labor & detaillierter Ergebnisbericht", + "t": "Übungsheft mit Tests in Deutsch 3. Klasse: Echte Klassenarbeiten mit Punktevergabe und Lösungen - Rechtschreibung, Grammatik und Lesen üben (Lernzielkontrollen, Band 283)", "a": null, - "u": "https://www.amazon.de/Vitamin-Test-CERASCREEN-Vitamin-B12-Spiegel-Ergebnisbericht/dp/B07TS8DKYC" + "u": "https://www.amazon.de/Tests-Deutsch-Lernzielkontrollen-Klasse-Klassenarbeiten/dp/3881002839" }, "54": { - "t": "Testtrainer IQ-Tests: . Mit Spaß trainieren - . Erfolgreich testen", + "t": "Prüfungstraining DaF - A2/B1: Deutsch-Test für Zuwanderer - Übungsbuch mit Lösungen und Audios als Download (2. Auflage 2022) - Mit Hörtexten und Beispielen", "a": null, - "u": "https://www.amazon.de/Testtrainer-IQ-Tests-trainieren-Erfolgreich-testen/dp/3442175313" + "u": "https://www.amazon.de/Prüfungstraining-DaF-Deutsch-Test-Zuwanderer-Lösungsbeileger/dp/3060203741" }, "55": { - "t": "Histamin-Intoleranz Test Kit von CERASCREEN – Histamin-Unverträglichkeit einfach von Zuhause per Selbsttest bestimmen | Diaminoxidase-Wert (DAO) I Zertifiziertes Labor I Detaillierter Ergebnisbericht", + "t": "COAGUCHEK XS PT Test PST 1X24 St", "a": null, - "u": "https://www.amazon.de/Histamin-Intoleranz-Test-CERASCREEN-Histamin-Unverträglichkeit-Ergebnisbericht/dp/B07TXHKH31" + "u": "https://www.amazon.de/CoaguChek-XS-PT-Test-24St/dp/B01MSHLE17" }, "56": { - "t": "test Jahrbuch 2023: Unsere Themen - Akkuwischer, E-Bikes, Geschirrspüler, Kinderwagen, Smartphones, Kopfhörer uvm.: Mehr als 100 Tests und Reports", + "t": "Benning SDT 1. 
Steckdosentester (Prüfung von Schutzkontaktsteckdosen, Prüfart 2-polig, Spannungsmessbereich 230 V AC, Klingenlänge 17 mm) 20053, schwarz, grau und rot", "a": null, - "u": "https://www.amazon.de/test-Jahrbuch-2023-Bikes-Geschirrspüler-ebook/dp/B0BNLSSWWX" + "u": "https://www.amazon.de/Benning-Sockeltester-SDT-Fingerkontakt-020053/dp/B01KWJPNGO" }, "57": { - "t": "The Test - Staffel 2", + "t": "Clean Code - Refactoring, Patterns, Testen und Techniken für: Deutsche Ausgabe", "a": null, - "u": "https://www.amazon.de/kannst-nicht-sein-wenn-sehen/dp/B0B7GKPLN9" + "u": "https://www.amazon.de/Clean-Code-Refactoring-Patterns-Techniken-ebook/dp/B00MIF2ANK" }, "58": { - "t": "Jamar 9-Loch Peg Test Kit, Pegboard Set zur Verbesserung der motorischen Koordination und Fingerfertigkeit, Handtrainer Spiel, Ergotherapie, Physiotherapie und für Kinder, Erwachsene, & Kleinkinder", + "t": "Übungsheft mit Tests in Mathe 2. Klasse: Echte Klassenarbeiten mit Punktevergabe und Lösungen - Rechnen lernen (Lernzielkontrollen, Band 82)", "a": null, - "u": "https://www.amazon.de/Verbesserung-Koordination-Fingerfertigkeit-Ergotherapie-Physiotherapie/dp/B0056PQ6VQ" + "u": "https://www.amazon.de/Tests-Mathe-Lernzielkontrollen-Klasse-Klassenarbeiten/dp/3881000828" }, "59": { - "t": "Test: Manchmal ist es nur eine Frage des Glucks. 
[OmU]", + "t": "The Official Guide to the TOEFL IBT Test (Official Guide to the TOEFL Test)", "a": null, - "u": "https://www.amazon.de/Test-Manchmal-Frage-Glücks-Untertiteln/dp/B00KXV74UA" + "u": "https://www.amazon.de/Official-Guide-TOEFL-Test-Toefl/dp/1265477310" } }, "q": "test", - "ctry": "de", - - // this is a limitation of the current test setup: - // in production, it should be the doublefetch URL, which would be: "https://www.amazon.de/s/?field-keywords=test" - "qurl": "https://www.amazon.de/s/ref=nb_sb_noss/261-9879674-9680331?__mk_de_DE=%C3%85M%C3%85%C5%BD%C3%95%C3%91&url=search-alias%3Daps&field-keywords=test" + "qurl": "https://www.amazon.de/s?k=test", + "ctry": "de" } } ] -} +} \ No newline at end of file diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/angela-merkel-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/angela-merkel-2023-10-10/scenario.json index 3e21c03d..06fedf4e 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/angela-merkel-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/angela-merkel-2023-10-10/scenario.json @@ -1,6 +1,16 @@ { "url": "https://www.google.com/search?q=angela%20merkel&gl=us&hl=en", "mustContain": [ + { + "type": "wdp", + "action": "query0", + "payload": { + "q": "angela merkel", + "qurl": "https://www.google.com/search?q=angela%20merkel&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, { "type": "wdp", "action": "query", @@ -10,109 +20,83 @@ "t": "Angela Merkel", "u": "https://en.wikipedia.org/wiki/Angela_Merkel", "age": null, - "m": null + "m": null, + "lang": "en" }, "1": { "t": "Angela Merkel", "u": "https://www.forbes.com/profile/angela-merkel/", "age": null, - "m": null + "m": null, + "lang": "en" }, "2": { "t": "Angela Merkel | World", "u": "https://www.theguardian.com/world/angela-merkel", "age": null, - "m": null + "m": null, + "lang": "en" 
}, "3": { "t": "Merkel will end her tenure in office as a leader who ...", "u": "https://www.pewresearch.org/short-reads/2021/09/22/merkel-will-end-her-tenure-in-office-as-a-leader-who-was-internationally-popular-during-tumultuous-times/", "age": null, - "m": null + "m": null, + "lang": "en" }, "4": { "t": "Angela Merkel | Biography, Education, Political Career ...", "u": "https://www.britannica.com/biography/Angela-Merkel", "age": null, - "m": null + "m": null, + "lang": "en" }, "5": { "t": "Angela Merkel", "u": "https://www.dw.com/en/angela-merkel/t-17280098", "age": null, - "m": null + "m": null, + "lang": "en" }, "6": { "t": "Angela Merkel - German Chancellor, Politics, Diplomacy", "u": "https://www.britannica.com/biography/Angela-Merkel/Chancellorship", "age": null, - "m": null + "m": null, + "lang": "en" }, "7": { "t": "Angela Merkel", "u": "https://www.nytimes.com/topic/person/angela-merkel", "age": null, - "m": null + "m": null, + "lang": "en" }, "8": { "t": "Angela Merkel, the scientist who became a world leader", "u": "https://news.harvard.edu/gazette/story/2019/05/those-who-have-known-angela-merkel-describe-her-rise-to-prominence/", "age": null, - "m": null + "m": null, + "lang": "en" }, "9": { "t": "Angela Merkel", "u": "https://www.cnbc.com/angela-merkel/", "age": null, - "m": null + "m": null, + "lang": "en" } }, "q": "angela merkel", "qurl": "https://www.google.com/search?q=angela%20merkel&gl=us&hl=en", - "ctry": "de" - } - }, - { - "type": "wdp", - "action": "snippet2", - "payload": { - "r": { - "0": { - "u": "https://en.wikipedia.org/wiki/Angela_Merkel" - } - }, - "q": "angela merkel", - "qurl": "https://www.google.com/search?q=angela%20merkel&gl=us&hl=en", - "ctry": "de" - } - }, - { - "type": "wdp", - "action": "img-p", - "payload": { - "q": "angela merkel", - "qurl": "https://www.google.com/search?q=angela%20merkel&gl=us&hl=en", - "r": { - "0": { - "u": "https://www.bbc.com/news/world-europe-58570507" - }, - "1": { - "u": 
"https://www.pewresearch.org/short-reads/2021/09/22/merkel-will-end-her-tenure-in-office-as-a-leader-who-was-internationally-popular-during-tumultuous-times/" - }, - "2": { - "u": "https://www.investopedia.com/angela-merkel-7559368" - } - }, - "ctry": "de" + "ctry": "de", + "lang": "en" } }, { "type": "wdp", "action": "infobox", "payload": { - "q": "angela merkel", - "qurl": "https://www.google.com/search?q=angela%20merkel&gl=us&hl=en", - "ctry": "de", "t": "About", "desc": "Angela Dorothea Merkel is a German former politician and scientist who served as chancellor of Germany from 2005 to 2021. A member of the Christian Democratic Union, she previously served as Leader of the Opposition from 2002 to 2005 and as Leader of the Christian Democratic Union from 2000 to 2018.", "u": "https://en.wikipedia.org/wiki/Angela_Merkel", @@ -145,7 +129,30 @@ "prop": "Parents", "val": "Horst Kasner, Herlind Kasner" } - } + }, + "q": "angela merkel", + "qurl": "https://www.google.com/search?q=angela%20merkel&gl=us&hl=en", + "ctry": "de" + } + }, + { + "type": "wdp", + "action": "img-p", + "payload": { + "r": { + "0": { + "u": "https://www.bbc.com/news/world-europe-58570507" + }, + "1": { + "u": "https://www.pewresearch.org/short-reads/2021/09/22/merkel-will-end-her-tenure-in-office-as-a-leader-who-was-internationally-popular-during-tumultuous-times/" + }, + "2": { + "u": "https://www.investopedia.com/angela-merkel-7559368" + } + }, + "q": "angela merkel", + "qurl": "https://www.google.com/search?q=angela%20merkel&gl=us&hl=en", + "ctry": "de" } } ] diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/cricket-icc-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/cricket-icc-2023-10-10/scenario.json index 8b90139e..f733172e 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/cricket-icc-2023-10-10/scenario.json +++ 
b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/cricket-icc-2023-10-10/scenario.json @@ -3,12 +3,170 @@ "mustContain": [ { "type": "wdp", - "action": "widgetTitle", + "action": "query0", "payload": { "q": "cricket icc", - "widgetTitle": "ICC Cricket World Cup", + "qurl": "https://www.google.com/search?q=cricket%20icc&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, + { + "type": "wdp", + "action": "query", + "payload": { + "r": { + "0": { + "t": "ICC (@ICC) · Twitter", + "u": "https://twitter.com/ICC", + "age": null, + "m": null, + "lang": null + }, + "1": { + "t": "Official International Cricket Council Website", + "u": "https://www.icc-cricket.com/homepage", + "age": null, + "m": null, + "lang": "en" + }, + "2": { + "t": "Official ICC Men's Cricket World Cup 2023 Website", + "u": "https://www.cricketworldcup.com/", + "age": null, + "m": null, + "lang": "en" + }, + "3": { + "t": "Upcoming Men's Fixtures", + "u": "https://www.icc-cricket.com/mens-schedule/list", + "age": null, + "m": null, + "lang": "en" + }, + "4": { + "t": "International Cricket Council", + "u": "https://en.wikipedia.org/wiki/International_Cricket_Council", + "age": null, + "m": null, + "lang": "en" + }, + "5": { + "t": "ICC (@icc) • Instagram photos and videos", + "u": "https://www.instagram.com/icc/", + "age": null, + "m": null, + "lang": "en" + }, + "6": { + "t": "ICC Cricket World Cup", + "u": "https://www.espn.com/espnplus/catalog/ac34effe-35e5-4e49-9fc0-89f001f9cbba/icc-cricket-world-cup", + "age": null, + "m": null, + "lang": "en" + }, + "7": { + "t": "ICC - International Cricket Council", + "u": "https://www.facebook.com/icc/", + "age": null, + "m": null, + "lang": "en" + } + }, + "q": "cricket icc", + "qurl": "https://www.google.com/search?q=cricket%20icc&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, + { + "type": "wdp", + "action": "top-stories", + "payload": { + "r": { + "0": { + "u": "https://www.bbc.com/sport/live/cricket/66854377", + "lu": "37 
mins ago", + "h": "England vs Bangladesh LIVE: ICC Men's Cricket World Cup 2023 - score, \ncommentary, video highlights & updates - Live" + }, + "1": { + "u": "https://www.hindustantimes.com/cricket/joe-root-breaks-graham-goochs-31-year-old-world-cup-record-with-dazzling-knock-against-bangladesh-101696935811252.html", + "lu": "2 hours ago", + "h": "Joe Root breaks Graham Gooch's 31-year-old World Cup record with dazzling \nknock" + }, + "2": { + "u": "https://www.espncricinfo.com/series/icc-cricket-world-cup-2023-24-1367856/bangladesh-vs-england-7th-match-1384398/full-scorecard", + "lu": "1 hour ago", + "h": "BAN vs ENG, ICC Cricket World Cup 2023/24, 7th Match at Dharamsala, October \n10, 2023 - Full Scorecard" + }, + "3": { + "u": "https://www.aljazeera.com/sports/liveblog/2023/10/10/live-pakistan-vs-sri-lanka-icc-cricket-world-cup-2023", + "lu": "LIVE", + "h": "LIVE: Pakistan vs Sri Lanka – ICC Cricket World Cup 2023" + }, + "4": { + "u": "https://zeenews.india.com/cricket/live-updates/live-cricket-score-pak-vs-sl-icc-odi-cricket-world-cup-2023-match-no-8-today-pakistan-vs-sri-lanka-rajiv-gandhi-international-stadium-hyderabad-babar-azam-dasun-shanaka-345-runs-2nd-innings-target-for-pak-2673187", + "lu": "LIVE", + "h": "PAK 56-2 (12) | PAK Vs SL ICC ODI World Cup 2023 Live Cricket Score and \nUpdates: Babar Azam Departs," + }, + "5": { + "u": "https://www.espncricinfo.com/series/icc-cricket-world-cup-2023-24-1367856", + "lu": "Jun 27, 2023", + "h": "World Cup 2023 | 2023/24 ICC Cricket World Cup | Live Score, Schedule, News" + }, + "6": { + "u": "https://thewire.in/sport/cricket-world-cup-pakistani-presenter-zainab-abbas-leaves-india", + "lu": "1 hour ago", + "h": "Cricket World Cup: Pakistani Presenter Leaves India After Complaint Over \n'Derogatory Remarks'" + } + }, + "q": "cricket icc", + "qurl": "https://www.google.com/search?q=cricket%20icc&gl=us&hl=en", + "ctry": "de" + } + }, + { + "type": "wdp", + "action": "infobox", + "payload": { + "t": 
"International Cricket Council", + "desc": "The International Cricket Council is the global governing body of cricket. It was founded as the Imperial Cricket Conference in 1909 by representatives from Australia, England and South Africa. It was renamed as the International Cricket Conference in 1965, and took up its current name in 1987.", + "u": "https://en.wikipedia.org/wiki/International_Cricket_Council", + "st": { + "0": { + "prop": "Headquarters", + "val": "Dubai, United Arab Emirates" + }, + "1": { + "prop": "Founded", + "val": "June 15, 1909" + }, + "2": { + "prop": "Abbreviation", + "val": "ICC" + }, + "3": { + "prop": "CEO", + "val": "Geoff Allardice" + }, + "4": { + "prop": "Chairman", + "val": "Greg Barclay" + } + }, + "q": "cricket icc", + "qurl": "https://www.google.com/search?q=cricket%20icc&gl=us&hl=en", + "ctry": "de" + } + }, + { + "type": "wdp", + "action": "widget-title", + "payload": { + "q": "cricket icc", + "wt": "ICC Cricket World Cup", "ctry": "de" } } ] -} +} \ No newline at end of file diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/elon-musk-twitter-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/elon-musk-twitter-2023-10-10/scenario.json index 610d05eb..75853670 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/elon-musk-twitter-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/elon-musk-twitter-2023-10-10/scenario.json @@ -1,6 +1,16 @@ { "url": "https://www.google.com/search?q=elon%20musk%20twitter&gl=us&hl=en", "mustContain": [ + { + "type": "wdp", + "action": "query0", + "payload": { + "q": "elon musk twitter", + "qurl": "https://www.google.com/search?q=elon%20musk%20twitter&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, { "type": "wdp", "action": "query", @@ -10,48 +20,56 @@ "t": "Elon Musk (@elonmusk) · Twitter", "u": "https://twitter.com/elonmusk", "age": 
null, - "m": null + "m": null, + "lang": null }, "1": { "t": "Elon Musk Deletes Tweet Telling People to Follow ...", "u": "https://www.rollingstone.com/politics/politics-news/elon-musk-israel-hamas-war-misinformation-twitter-1234848927/", "age": null, - "m": null + "m": null, + "lang": "en" }, "2": { "t": "Twitter under Elon Musk", "u": "https://en.wikipedia.org/wiki/Twitter_under_Elon_Musk", "age": null, - "m": null + "m": null, + "lang": "en" }, "3": { "t": "Twitter Is at Death's Door, One Year After Elon Musk's ...", "u": "https://www.rollingstone.com/culture/culture-commentary/elon-musk-killed-twitter-one-year-1234840622/", "age": null, - "m": null + "m": null, + "lang": "en" }, "4": { "t": "Elon Musk admits X 'may fail' after glitch deletes Twitter ...", "u": "https://www.theguardian.com/technology/2023/aug/21/elon-musk-x-glitch-deletes-twitter-photos-pictures-links", "age": null, - "m": null + "m": null, + "lang": "en" }, "5": { "t": "Elon Musk's Twitter Takeover | FRONTLINE", "u": "https://www.pbs.org/wgbh/frontline/documentary/elon-musks-twitter-takeover/", "age": null, - "m": null + "m": null, + "lang": "en" }, "6": { "t": "Musk says X will charge everyone to use the platform", "u": "https://www.axios.com/2023/09/19/musk-x-twitter-charge-all-users-monthly-subscription-fees", "age": null, - "m": null + "m": null, + "lang": "en" } }, "q": "elon musk twitter", "qurl": "https://www.google.com/search?q=elon%20musk%20twitter&gl=us&hl=en", - "ctry": "de" + "ctry": "de", + "lang": "en" } }, { @@ -117,72 +135,72 @@ }, { "type": "wdp", - "action": "img-p", + "action": "infobox", "payload": { - "q": "elon musk twitter", - "qurl": "https://www.google.com/search?q=elon%20musk%20twitter&gl=us&hl=en", - "r": { + "t": "Elon Musk", + "desc": "Elon Reeve Musk is a business magnate and investor. 
Musk is the founder, chairman, CEO and chief technology officer of SpaceX; angel investor, CEO, product architect and former chairman of Tesla, Inc.; ...", + "u": "https://en.wikipedia.org/wiki/Elon_Musk", + "st": { "0": { - "u": "http://google.com/search?tbm=isch&q=Elon+Musk" + "prop": "Born", + "val": "June 28, 1971 (age 52 years), Pretoria, South Africa" }, "1": { - "u": "https://www.analyticsinsight.net/elon-musk-twitter-to-embrace-dark-mode-only/" + "prop": "Children", + "val": "Vivian Jenna Wilson, Nevada Alexander Musk, MORE" }, "2": { - "u": "https://people.com/twitter-bird-logo-will-change-soon-elon-musk-says-7564408" + "prop": "Net worth", + "val": "252.6 billion USD (2023) Forbes" }, "3": { - "u": "https://www.cnbc.com/2023/07/27/harvard-expert-elon-musk-is-out-of-his-element-at-twitter-x.html" + "prop": "Spouse", + "val": "Talulah Riley (m. 2013–2016), Talulah Riley (m. 2010–2012), Justine Musk (m. 2000–2008)" }, "4": { - "u": "https://www.salon.com/2022/10/27/elon-musk-has-officially-dubbed-himself-chief-twit/" + "prop": "Parents", + "val": "Errol Musk, Maye Musk" }, "5": { - "u": "https://slate.com/technology/2022/09/elon-musk-twitter-gotta-pick-one.html" - }, - "6": { - "u": "https://www.npr.org/2022/06/16/1105608659/elon-musk-twitter-employees" + "prop": "Siblings", + "val": "Kimbal Musk, Tosca Musk" } }, + "q": "elon musk twitter", + "qurl": "https://www.google.com/search?q=elon%20musk%20twitter&gl=us&hl=en", "ctry": "de" } }, { "type": "wdp", - "action": "infobox", + "action": "img-p", "payload": { - "q": "elon musk twitter", - "qurl": "https://www.google.com/search?q=elon%20musk%20twitter&gl=us&hl=en", - "ctry": "de", - "t": "Choose what you’re giving feedback on", - "desc": "Elon Reeve Musk is a business magnate and investor. 
Musk is the founder, chairman, CEO and chief technology officer of SpaceX; angel investor, CEO, product architect and former chairman of Tesla, Inc.; ...", - "u": "https://en.wikipedia.org/wiki/Elon_Musk", - "st": { + "r": { "0": { - "prop": "Born", - "val": "June 28, 1971 (age 52 years), Pretoria, South Africa" + "u": "http://google.com/search?tbm=isch&q=Elon+Musk" }, "1": { - "prop": "Children", - "val": "Vivian Jenna Wilson, Nevada Alexander Musk, MORE" + "u": "https://www.analyticsinsight.net/elon-musk-twitter-to-embrace-dark-mode-only/" }, "2": { - "prop": "Net worth", - "val": "252.6 billion USD (2023) Forbes" + "u": "https://people.com/twitter-bird-logo-will-change-soon-elon-musk-says-7564408" }, "3": { - "prop": "Spouse", - "val": "Talulah Riley (m. 2013–2016), Talulah Riley (m. 2010–2012), Justine Musk (m. 2000–2008)" + "u": "https://www.cnbc.com/2023/07/27/harvard-expert-elon-musk-is-out-of-his-element-at-twitter-x.html" }, "4": { - "prop": "Parents", - "val": "Errol Musk, Maye Musk" + "u": "https://www.salon.com/2022/10/27/elon-musk-has-officially-dubbed-himself-chief-twit/" }, "5": { - "prop": "Siblings", - "val": "Kimbal Musk, Tosca Musk" + "u": "https://slate.com/technology/2022/09/elon-musk-twitter-gotta-pick-one.html" + }, + "6": { + "u": "https://www.npr.org/2022/06/16/1105608659/elon-musk-twitter-employees" } - } + }, + "q": "elon musk twitter", + "qurl": "https://www.google.com/search?q=elon%20musk%20twitter&gl=us&hl=en", + "ctry": "de" } } ] diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/f1-2023-09-27/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/f1-2023-09-27/scenario.json index 991a6cf3..eefe1d53 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/f1-2023-09-27/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/f1-2023-09-27/scenario.json @@ -3,10 +3,222 @@ "mustContain": [ { "type": "wdp", - 
"action": "widgetTitle", + "action": "query0", "payload": { "q": "f1", - "widgetTitle": "Formula 1", + "qurl": "https://www.google.com/search?q=f1&gl=us&hl=en", + "ctry": "de", + "lang": "en-UA" + } + }, + { + "type": "wdp", + "action": "query", + "payload": { + "r": { + "0": { + "t": "F1 - The Official Home of Formula 1® Racing", + "u": "https://www.formula1.com/", + "age": null, + "m": null, + "lang": null + }, + "1": { + "t": "Formula 1 (@F1) · Twitter", + "u": "https://twitter.com/F1", + "age": null, + "m": null, + "lang": null + }, + "2": { + "t": "Formula One", + "u": "https://en.wikipedia.org/wiki/Formula_One", + "age": null, + "m": null, + "lang": "en" + }, + "3": { + "t": "FORMULA 1® (@f1) • Instagram photos and videos", + "u": "https://www.instagram.com/f1/", + "age": null, + "m": null, + "lang": "en" + }, + "4": { + "t": "F1 News, Drivers, Results - Formula 1 Live Online", + "u": "https://www.skysports.com/f1", + "age": null, + "m": null, + "lang": "en" + }, + "5": { + "t": "Adrian Newey - F1 Beyond The Grid Podcast", + "u": "https://www.youtube.com/watch?v=tnIn_x2SZLQ", + "age": null, + "m": null, + "lang": "en" + } + }, + "q": "f1", + "qurl": "https://www.google.com/search?q=f1&gl=us&hl=en", + "ctry": "de", + "lang": "en-UA" + } + }, + { + "type": "wdp", + "action": "top-stories", + "payload": { + "r": { + "0": { + "u": "https://www.formula1.com/en/latest/article.hawkins-completes-debut-f1-test-with-aston-martin-in-hungary.4c3cvfQj3yEYpukGy5UxVm.html", + "lu": "18 hours ago", + "h": "Hawkins completes debut F1 test with Aston Martin in Hungary" + }, + "1": { + "u": "https://www.formula1.com/en/latest/article.watch-from-the-new-kids-on-the-block-to-six-time-champions-red-bull-racings.2CCwgBseMCK0tvp2uUMziB.html", + "lu": "9 hours ago", + "h": "WATCH: From the new kids on the block to six-time champions – Red Bull's \nstory so far" + }, + "2": { + "u": 
"https://www.formula1.com/en/latest/article.icymi-baby-pics-bubbles-and-plenty-of-buzzing-bees-its-the-best-social-media.4oHnRf4Jypw2QW33VCj4b2.html", + "lu": "12 hours ago", + "h": "ICYMI: Baby pics, bubbles and plenty of buzzing bees – it's the best social \nmedia from Japan" + }, + "3": { + "u": "https://www.formula1.com/en/latest/article.hinchs-heroes-who-does-hinch-reckon-was-super-in-suzuka.2OOpHwN02PBjUl5xSiNdnd.html", + "lu": "18 hours ago", + "h": "HINCH'S HEROES: Who does Hinch reckon was super in Suzuka? | Formula 1®" + }, + "4": { + "u": "https://www.formula1.com/en/latest/article.bottas-reflects-on-early-lap-collisions-in-japan-that-forced-him-to-retire.2FmrPBKleMeXb60mO9hj3T.html", + "lu": "20 hours ago", + "h": "Bottas reflects on early lap collisions in Japan that forced him to retire \n'undriveable' car" + }, + "5": { + "u": "https://www.formula1.com/en/latest/article.lawson-took-no-satisfaction-from-beating-tsunoda-in-japan-after-missing-out.5aVJF9mwEpVQPBadC62gDV.html", + "lu": "21 hours ago", + "h": "Lawson took no 'satisfaction' from beating Tsunoda in Japan after missing \nout on 2024 AlphaTauri seat" + }, + "6": { + "u": "https://www.formula1.com/en/latest/article.tech-tuesday-the-main-factors-behind-red-bulls-singapore-slump-and-how-they.7CZrbubuGeUSCgTEU6N24F.html", + "lu": "1 day ago", + "h": "TECH TUESDAY: The main factors behind Red Bull's Singapore slump and how \nthey bounced back in style at Suzuka ..." + }, + "7": { + "u": "https://www.formula1.com/en/latest/article.say-what-intra-team-squabbles-crashes-galore-and-verstappens-joy-all-feature.2ebYgI088M63WsKLTyUaNR.html", + "lu": "1 day ago", + "h": "SAY WHAT?! 
Intra-team squabbles, crashes galore and Verstappen's joy all \nfeature in the best team radio from Japan" + }, + "8": { + "u": "https://www.formula1.com/en/latest/article.5-winners-and-5-losers-from-the-japanese-gp-who-impressed-around-suzukas.3XGujxtX8Oum1cryW0UJPc.html", + "lu": "1 day ago", + "h": "5 Winners and 5 Losers from the Japanese GP – Who impressed around Suzuka's \nfamous corners?" + }, + "9": { + "u": "https://www.formula1.com/en/latest/article.i-dont-fully-understand-it-gasly-left-frustrated-by-alpines-call-to-let-ocon.4llfI6zk7Q2nq4iEsE2t8D.html", + "lu": "2 days ago", + "h": "'I don't fully understand it' – Gasly left 'frustrated' by Alpine's call to \nlet Ocon through on the final lap in Japan" + }, + "10": { + "u": "https://www.motorsport.com/f1/news/why-ferrari-doesnt-agree-with-mclaren-f1-fairytale-revival/10525559/", + "lu": "19 hours ago", + "h": "Why Ferrari doesn't agree with McLaren F1 fairytale revival" + }, + "11": { + "u": "https://www.the-race.com/formula-1/aston-martins-painful-contrast-and-why-it-matters-for-2024/", + "lu": "2 days ago", + "h": "Aston Martin's painful F1 contrast - and why it matters" + }, + "12": { + "u": "https://www.autosport.com/f1/news/the-two-f1-rules-problems-perezs-recent-mishaps-expose/10525719/", + "lu": "1 hour ago", + "h": "The two F1 rules problems Perez's recent mishaps expose" + }, + "13": { + "u": "https://www.skysports.com/f1/news/12433/12970520/george-russell-cant-argue-against-lewis-hamilton-status-in-mercedes-hierarchy-says-anthony-davidson", + "lu": "6 hours ago", + "h": "George Russell can't argue against Lewis Hamilton status in Mercedes \nhierarchy, says Anthony Davidson" + }, + "14": { + "u": "https://www.crash.net/f1/news/1036849/1/russell-has-zero-hard-feelings-about-mercedes-instruction", + "lu": "20 hours ago", + "h": "George Russell has “zero hard feelings” about Mercedes F1 team order to \nswap positions with Lewis Hamilton | F1 ..." 
+ }, + "15": { + "u": "https://www.the-race.com/formula-e/of-course-it-hurts-de-vries-rebuild-mission-after-f1-failure/", + "lu": "1 hour ago", + "h": "'Of course it hurts' - De Vries' rebuild mission after F1 failure" + }, + "16": { + "u": "https://racingnews365.com/alpine-provide-update-on-f1-team-principal-search", + "lu": "1 hour ago", + "h": "Alpine boss Famin 'not under pressure' in search for Szafnauer replacement" + } + }, + "q": "f1", + "qurl": "https://www.google.com/search?q=f1&gl=us&hl=en", + "ctry": "de" + } + }, + { + "type": "wdp", + "action": "infobox", + "payload": { + "t": "Formula 1", + "desc": "Formula One is the highest class of international racing for open-wheel single-seater formula racing cars sanctioned by the Fédération Internationale de l'Automobile. The FIA Formula One World Championship has been one of the premier forms of racing around the world since its inaugural season in 1950.", + "u": "https://en.wikipedia.org/wiki/Formula_One", + "st": { + "0": { + "prop": "Drivers", + "val": "20" + }, + "1": { + "prop": "Teams", + "val": "10" + }, + "2": { + "prop": "Category", + "val": "Open-wheel single-seater Formula auto racing" + } + }, + "q": "f1", + "qurl": "https://www.google.com/search?q=f1&gl=us&hl=en", + "ctry": "de" + } + }, + { + "type": "wdp", + "action": "img-p", + "payload": { + "r": { + "0": { + "u": "http://t0.gstatic.com/images?q=tbn:ANd9GcTxRmfGmozn5szS7lnaBIceJ9sweiO45WBJmnsRzTdcjFAlLFQ4" + }, + "1": { + "u": "https://www.formula1.com/en/latest/article.f1-announces-24-race-calendar-for-2023.7oNRaq4kZ2bwTAmL7r6dqg.html" + }, + "2": { + "u": "https://www.ea.com/games/f1/f1-23/news/f1-23-patch-notes-v109" + }, + "3": { + "u": "https://www.formula1.com/en/latest/article.10-things-to-be-excited-for-as-f1-gears-up-for-24-races-and-6-sprints-in.2sFaphuPkMnIz8mnt3EEuF.html" + }, + "4": { + "u": "https://www.amalgamcollection.com/products/ferrari-f1-75-bahrain" + } + }, + "q": "f1", + "qurl": 
"https://www.google.com/search?q=f1&gl=us&hl=en", + "ctry": "de" + } + }, + { + "type": "wdp", + "action": "widget-title", + "payload": { + "q": "f1", + "wt": "Formula 1", "ctry": "de" } } diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/goetze-wm-2014-tor-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/goetze-wm-2014-tor-2023-10-10/scenario.json index 967c24aa..a51d0aaa 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/goetze-wm-2014-tor-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/goetze-wm-2014-tor-2023-10-10/scenario.json @@ -1,6 +1,16 @@ { "url": "https://www.google.com/search?q=goetze%20wm%202014%20tor&gl=us&hl=en", "mustContain": [ + { + "type": "wdp", + "action": "query0", + "payload": { + "q": "goetze wm 2014 tor", + "qurl": "https://www.google.com/search?q=goetze%20wm%202014%20tor&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, { "type": "wdp", "action": "query", @@ -10,66 +20,77 @@ "t": "Mario Götze WM 2014 Finale Tor - YouTube", "u": "https://www.youtube.com/watch?v=TEWcZ-6l10U", "age": null, - "m": null + "m": null, + "lang": null }, "1": { "t": "Mario Götze WM 2014 Finale Tor in 4K/UHD 13.07.2014", "u": "https://www.youtube.com/watch?v=YiWllGdMITE", "age": null, - "m": null + "m": null, + "lang": null }, "2": { "t": "WM FINALE 2014 - Tor Götze ! 
( 5 TV Kommentatoren )", "u": "https://www.youtube.com/watch?v=t9xm_Ingtuc", "age": null, - "m": null + "m": null, + "lang": null }, "3": { "t": "WM 2014 Finale Tor Mario Götze - YouTube", "u": "https://www.youtube.com/watch?v=j_guXT4I-AQ", "age": null, - "m": null + "m": null, + "lang": null }, "4": { "t": "Gotze Goal vs Argentina | 2014 World Cup Final - YouTube", "u": "https://www.youtube.com/watch?v=lu_0zOUTdPE", "age": null, - "m": null + "m": null, + "lang": null }, "5": { "t": "Mario Götze Germany vs Argentina 2014 FIFA World Cup Goal.", "u": "https://www.youtube.com/watch?v=1QJ6P8Yxil4", "age": null, - "m": "tor" + "m": "tor", + "lang": null }, "6": { "t": "WM 2014 Finale Tor Mario Götze - YouTube", "u": "https://www.youtube.com/watch?v=AZ-AzW8Y06w", "age": null, - "m": null + "m": null, + "lang": null }, "7": { "t": "WM-Finale 2014 - Tor von Mario Götze - YouTube", "u": "https://www.youtube.com/watch?v=g67hj4VB8z0", "age": null, - "m": "goetze" + "m": "goetze", + "lang": null }, "8": { "t": "Mario Götzes WM-Siegtreffer ist das \"Tor des Jahrzehnts\"", "u": "https://www.dfb.de/news/detail/mario-goetzes-wm-siegtreffer-ist-das-tor-des-jahrzehnts-215608/", "age": null, - "m": null + "m": null, + "lang": "de" }, "9": { "t": "WM 2014: Das Tor, das ewig bleibt - Sport", "u": "https://www.sueddeutsche.de/sport/goetze-schuerrle-wm-2014-tor-finale-1.4971888", "age": null, - "m": null + "m": null, + "lang": "de" } }, "q": "goetze wm 2014 tor", "qurl": "https://www.google.com/search?q=goetze%20wm%202014%20tor&gl=us&hl=en", - "ctry": "de" + "ctry": "de", + "lang": "en" } } ] diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/green-apple-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/green-apple-2023-10-10/scenario.json index fed164e0..638b7fa3 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/green-apple-2023-10-10/scenario.json +++ 
b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/green-apple-2023-10-10/scenario.json @@ -1,6 +1,16 @@ { "url": "https://www.google.com/search?q=green%20apple&gl=us&hl=en", "mustContain": [ + { + "type": "wdp", + "action": "query0", + "payload": { + "q": "green apple", + "qurl": "https://www.google.com/search?q=green%20apple&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, { "type": "wdp", "action": "query", @@ -10,56 +20,62 @@ "t": "Green Apple Books | REAL {books, people, local}", "u": "https://www.greenapplebooks.com/", "age": null, - "m": null + "m": null, + "lang": "en" }, "1": { "t": "Green Apple Waterford", "u": "https://greenapplewaterford.com/", "age": null, - "m": null + "m": null, + "lang": "en" }, "2": { "t": "Green Apple China Bistro", "u": "http://www.greenapplechinabistro.com/", "age": null, - "m": null + "m": null, + "lang": "en" }, "3": { "t": "Granny Smith", "u": "https://en.wikipedia.org/wiki/Granny_Smith", "age": null, - "m": null + "m": null, + "lang": "en" }, "4": { "t": "Health Benefits of Green Apples", "u": "https://www.webmd.com/diet/health-benefits-green-apples", "age": null, - "m": null + "m": null, + "lang": "en" }, "5": { "t": "Green Apple NW", "u": "https://www.greenapplenw.com/", "age": null, - "m": null + "m": null, + "lang": "en" }, "6": { "t": "Green Apple Active | Organic Bamboo Active & Yoga Clothing", "u": "https://www.greenappleactive.com/", "age": null, - "m": null + "m": null, + "lang": "en" } }, "q": "green apple", "qurl": "https://www.google.com/search?q=green%20apple&gl=us&hl=en", - "ctry": "de" + "ctry": "de", + "lang": "en" } }, { "type": "wdp", "action": "img-p", "payload": { - "q": "green apple", - "qurl": "https://www.google.com/search?q=green%20apple&gl=us&hl=en", "r": { "0": { "u": "https://www.kroger.com/p/small-green-granny-smith-apple/0000000004139" @@ -92,29 +108,8 @@ "u": 
"https://www.bebeautiful.in/all-thing-lifestyle/health-and-wellness/heres-why-green-apple-is-the-super-fruit-you-need" } }, - "ctry": "de" - } - }, - { - "type": "wdp", - "action": "videos-p", - "payload": { "q": "green apple", "qurl": "https://www.google.com/search?q=green%20apple&gl=us&hl=en", - "r": { - "0": { - "t": "Mrs. GREEN APPLE - Feeling", - "u": "https://www.google.com/#" - }, - "1": { - "t": "Green Apple Ribbons: just the ASMR🍏😍 part 2", - "u": "https://www.google.com/#" - }, - "2": { - "t": "Green Apple Ribbons: just the ASMR 🍏😍 | Ribbon Candy", - "u": "https://www.tiktok.com/@loganscandies/video/7286219511290727722" - } - }, "ctry": "de" } } diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/grossformat-laserdrucker-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/grossformat-laserdrucker-2023-10-10/scenario.json index ed2e419e..a6808cb8 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/grossformat-laserdrucker-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/grossformat-laserdrucker-2023-10-10/scenario.json @@ -1,6 +1,16 @@ { "url": "https://www.google.com/search?q=grossformat%20laserdrucker&gl=us&hl=en", "mustContain": [ + { + "type": "wdp", + "action": "query0", + "payload": { + "q": "grossformat laserdrucker", + "qurl": "https://www.google.com/search?q=grossformat%20laserdrucker&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, { "type": "wdp", "action": "query", @@ -10,68 +20,76 @@ "t": "Large Format | Printers | For Work", "u": "https://epson.com/For-Work/Printers/Large-Format/c/w140", "age": null, - "m": null + "m": null, + "lang": "en" }, "1": { "t": "Suchergebnis Auf Amazon.de Für: Großformat-Laserdrucker", "u": "https://www.amazon.de/Großformat-Laserdrucker/s?k=Großformat-Laserdrucker", "age": null, - "m": null + "m": null, + "lang": "de" }, "2": { "t": "KIP - Wide Format 
Printers", "u": "https://de.kip.com/", "age": null, - "m": null + "m": null, + "lang": "en" }, "3": { "t": "SLM®280 2.0", "u": "https://www.slm-solutions.com/products-and-solutions/machines/slm-280/", "age": null, - "m": null + "m": null, + "lang": "en" }, "4": { "t": "Großformat-Kopierer MFP ab 1.190,00", - "u": null, + "u": "https://www.stoffel.de/cat/100215/gro%DFformat-kopierer-mfp.html", "age": null, - "m": null + "m": null, + "lang": "de" }, "5": { "t": "Großformat-Drucksysteme", "u": "https://www.canon.de/business/products/wide-format-printers/", "age": null, - "m": null + "m": null, + "lang": "de" }, "6": { "t": "Plotter und Großformatdrucker bis Din A0", "u": "https://www.fido-buerosysteme.de/Plotter", "age": null, - "m": null + "m": null, + "lang": "de" }, "7": { "t": "ZBAITU M81 Laser Assembly In detail - YouTube", "u": "https://www.youtube.com/watch?v=sfOKJ6AAHks", "age": null, - "m": null + "m": null, + "lang": null }, "8": { "t": "Großformat-Drucker - Bei OTTO Office günstig kaufen.", "u": "https://www.otto-office.com/de/Drucker-und-Multifunktionsgeraete/Grossformat-Drucker/205011/s", "age": null, - "m": null + "m": null, + "lang": "de" } }, "q": "grossformat laserdrucker", "qurl": "https://www.google.com/search?q=grossformat%20laserdrucker&gl=us&hl=en", - "ctry": "de" + "ctry": "de", + "lang": "en" } }, { "type": "wdp", "action": "img-p", "payload": { - "q": "grossformat laserdrucker", - "qurl": "https://www.google.com/search?q=grossformat%20laserdrucker&gl=us&hl=en", "r": { "0": { "u": "https://www.hp.com/de-de/shop/product.aspx?id=f9a29d&opt=b19&sel=prn" @@ -104,6 +122,8 @@ "u": "https://m.youtube.com/watch?v=CCiee5hfXyU" } }, + "q": "grossformat laserdrucker", + "qurl": "https://www.google.com/search?q=grossformat%20laserdrucker&gl=us&hl=en", "ctry": "de" } } diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/konrad-zuse-bilder-2023-10-10/scenario.json 
b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/konrad-zuse-bilder-2023-10-10/scenario.json index ea636c52..649e2106 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/konrad-zuse-bilder-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/konrad-zuse-bilder-2023-10-10/scenario.json @@ -1,6 +1,16 @@ { "url": "https://www.google.com/search?q=konrad%20zuse%20bilder&gl=us&hl=en", "mustContain": [ + { + "type": "wdp", + "action": "query0", + "payload": { + "q": "konrad zuse bilder", + "qurl": "https://www.google.com/search?q=konrad%20zuse%20bilder&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, { "type": "wdp", "action": "query", @@ -10,68 +20,76 @@ "t": "Konrad Zuse - 53 artworks - painting", "u": "https://www.wikiart.org/en/konrad-zuse", "age": null, - "m": null + "m": null, + "lang": "en" }, "1": { "t": "Konrad zuse Stock Photos and Images", "u": "https://www.alamy.com/stock-photo/konrad-zuse.html", "age": null, - "m": null + "m": null, + "lang": "en" }, "2": { "t": "Sold at Auction: Konrad Zuse", "u": "https://www.invaluable.com/artist/zuse-konrad-smxqcw2o4n/sold-at-auction-prices/", "age": null, - "m": null + "m": null, + "lang": "de" }, "3": { "t": "72 Konrad Zuse Photos & High Res Pictures", "u": "https://www.gettyimages.com/photos/konrad-zuse", "age": null, - "m": null + "m": null, + "lang": "de" }, "4": { "t": "20 Konrad Zuse Images, Stock Photos & Vectors", "u": "https://www.shutterstock.com/search/konrad-zuse", "age": null, - "m": null + "m": null, + "lang": "en" }, "5": { "t": "Konrad Zuse", "u": "http://www.artnet.com/artists/konrad-zuse/", "age": null, - "m": null + "m": null, + "lang": "en" }, "6": { "t": "Konrad Zuse", "u": "https://www.german-way.com/notable-people/featured-bios/konrad-zuse/", "age": null, - "m": null + "m": null, + "lang": "en" }, "7": { "t": "Art by Konrad Zuse", "u": "https://www.wut.de/e-174ww-ww-daus-000.php", "age": 
null, - "m": null + "m": null, + "lang": "en" }, "8": { "t": "Konrad Zuse Stock Photos - Free & Royalty- ... - Dreamstime", "u": "https://www.dreamstime.com/photos-images/konrad-zuse.html", "age": null, - "m": null + "m": null, + "lang": "en" } }, "q": "konrad zuse bilder", "qurl": "https://www.google.com/search?q=konrad%20zuse%20bilder&gl=us&hl=en", - "ctry": "de" + "ctry": "de", + "lang": "en" } }, { "type": "wdp", "action": "img-p", "payload": { - "q": "konrad zuse bilder", - "qurl": "https://www.google.com/search?q=konrad%20zuse%20bilder&gl=us&hl=en", "r": { "0": { "u": "https://www.wikiart.org/en/konrad-zuse" @@ -116,6 +134,8 @@ "u": "https://www.alamy.com/stock-photo/konrad-zuse.html" } }, + "q": "konrad zuse bilder", + "qurl": "https://www.google.com/search?q=konrad%20zuse%20bilder&gl=us&hl=en", "ctry": "de" } } diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/kuckucksuhr-kaufen-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/kuckucksuhr-kaufen-2023-10-10/scenario.json index 56c9f6ff..ab391547 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/kuckucksuhr-kaufen-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/kuckucksuhr-kaufen-2023-10-10/scenario.json @@ -1,6 +1,16 @@ { "url": "https://www.google.com/search?q=kuckucksuhr%20kaufen&gl=us&hl=en", "mustContain": [ + { + "type": "wdp", + "action": "query0", + "payload": { + "q": "kuckucksuhr kaufen", + "qurl": "https://www.google.com/search?q=kuckucksuhr%20kaufen&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, { "type": "wdp", "action": "query", @@ -10,66 +20,77 @@ "t": "Kuckucksuhren » jetzt online kaufen | uhren-park.de", "u": "https://www.uhren-park.de/index.php/cat/c1_Kuckucksuhren.html", "age": null, - "m": null + "m": null, + "lang": "de" }, "1": { "t": "Original Kuckucksuhren aus dem Schwarzwald", "u": 
"https://www.kuckucksuhr.net/", "age": null, - "m": null + "m": null, + "lang": "de" }, "2": { "t": "Kuckucksuhren aus dem Schwarzwald", "u": "https://www.schwarzwaldpalast.de/", "age": null, - "m": null + "m": null, + "lang": "de" }, "3": { "t": "Original Kuckucksuhren aus dem Schwarzwald", "u": "https://www.hausder1000uhren.de/", "age": null, - "m": null + "m": null, + "lang": "de" }, "4": { "t": "Kuckucksuhren / Spezialuhren: Küche, Haushalt & Wohnen", "u": "https://www.amazon.de/Kuckucksuhren/b?ie=UTF8&node=2970892031", "age": null, - "m": null + "m": null, + "lang": "de" }, "5": { "t": "Kuckucksuhren bestellen | Große Auswahl an ...", "u": "https://www.haus-der-schwarzwalduhren.de/", "age": null, - "m": null + "m": null, + "lang": "de" }, "6": { "t": "Kuckucksuhren online kaufen", "u": "https://www.ebay.de/b/Kuckucksuhren/79644/bn_2394739", "age": null, - "m": null + "m": null, + "lang": "de" }, "7": { "t": "Exklusive Kuckucksuhren Unikate", "u": "https://www.kuckucksuhr.com/", "age": null, - "m": null + "m": null, + "lang": "de" }, "8": { "t": "Kuckucksuhren günstig online kaufen", "u": "https://www.kaufland.de/wanduhren/kuckucksuhren/", "age": null, - "m": null + "m": null, + "lang": "de" }, "9": { "t": "Kuckucksuhren aus dem Schwarzwald", "u": "https://kuckucksuhren.shop/", "age": null, - "m": null + "m": null, + "lang": "de" } }, "q": "kuckucksuhr kaufen", "qurl": "https://www.google.com/search?q=kuckucksuhr%20kaufen&gl=us&hl=en", - "ctry": "de" + "ctry": "de", + "lang": "en" } } ] diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/nba-2023-09-27/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/nba-2023-09-27/scenario.json index cbfd05b4..2092ebcb 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/nba-2023-09-27/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/nba-2023-09-27/scenario.json @@ -3,10 +3,165 @@ 
"mustContain": [ { "type": "wdp", - "action": "widgetTitle", + "action": "query0", "payload": { "q": "nba", - "widgetTitle": "NBA", + "qurl": "https://www.google.com/search?q=nba&gl=us&hl=en", + "ctry": "de", + "lang": "en-UA" + } + }, + { + "type": "wdp", + "action": "query", + "payload": { + "r": { + "0": { + "t": "The official site of the NBA for the latest NBA Scores, Stats ...", + "u": "https://www.nba.com/", + "age": null, + "m": null, + "lang": null + }, + "1": { + "t": "List of oldest and youngest National Basketball Association players", + "u": "https://en.wikipedia.org/wiki/List_of_oldest_and_youngest_National_Basketball_Association_players#:~:text=He played in 1,611 regular,NBA after they turned 40.", + "age": null, + "m": null, + "lang": null + }, + "2": { + "t": "National Basketball Association Scheduling Simulation - CMU Math", + "u": "https://www.math.cmu.edu/~af1p/Teaching/OR2/Projects/P51/393FinalPaper.pdf", + "age": null, + "m": null, + "lang": null + }, + "3": { + "t": "List of National Basketball Association seasons played leaders - Wikipedia", + "u": "https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_seasons_played_leaders#:~:text=Vince Carter, who began his,have played in 22 seasons.", + "age": null, + "m": null, + "lang": null + }, + "4": { + "t": "Best Record Since NBA All Star Break | StatMuse", + "u": "https://www.statmuse.com/nba/ask/best-record-since-nba-all-star-break#:~:text=The Los Angeles Lakers have,a record of 1363-935.", + "age": null, + "m": null, + "lang": null + }, + "5": { + "t": "NBA (@NBA) · Twitter", + "u": "https://twitter.com/NBA", + "age": null, + "m": null, + "lang": null + }, + "6": { + "t": "National Basketball Association", + "u": "https://en.wikipedia.org/wiki/National_Basketball_Association", + "age": null, + "m": null, + "lang": "en" + }, + "7": { + "t": "NBA on ESPN - Scores, Stats and Highlights", + "u": "https://www.espn.com/nba/", + "age": null, + "m": null, + "lang": "en" + }, + "8": { + 
"t": "NBA (@nba) • Instagram photos and videos", + "u": "https://www.instagram.com/nba/", + "age": null, + "m": null, + "lang": "en" + }, + "9": { + "t": "NBA | National Basketball Association, News, Scores ...", + "u": "https://bleacherreport.com/nba", + "age": null, + "m": null, + "lang": "en" + }, + "10": { + "t": "NBA", + "u": "https://www.youtube.com/user/NBA", + "age": null, + "m": null, + "lang": "en" + } + }, + "q": "nba", + "qurl": "https://www.google.com/search?q=nba&gl=us&hl=en", + "ctry": "de", + "lang": "en-UA" + } + }, + { + "type": "wdp", + "action": "top-stories", + "payload": { + "r": { + "0": { + "u": "https://www.foxsports.com.au/basketball/nba/nba-2023-philadelphia-76ers-news-damien-lillard-trade-kelly-oubre-jr-james-harden-future-latest-trade-news-updates/news-story/62fd644009f224490ad5497a1604141c", + "lu": "11 hours ago", + "h": "NBA 2023: Philadelphia 76ers news; Damien Lillard trade; Kelly Oubre Jr. \nJames Harden future, latest trade news ..." + }, + "1": { + "u": "https://www.espn.com/nba/story/_/id/38498956/magic-johnson-says-knicks-only-nba-team-consider-owning", + "lu": "13 hours ago", + "h": "Magic Johnson says Knicks only NBA team he'd consider owning - ESPN" + }, + "2": { + "u": "https://www.si.com/nba/2023/09/26/nba-mailbag-does-damian-lillard-trade-make-sense-raptors", + "lu": "17 hours ago", + "h": "NBA Mailbag: Does a Damian Lillard Trade Make Sense for the Raptors?" + } + }, + "q": "nba", + "qurl": "https://www.google.com/search?q=nba&gl=us&hl=en", + "ctry": "de" + } + }, + { + "type": "wdp", + "action": "infobox", + "payload": { + "t": "NBA", + "desc": "The National Basketball Association is a professional basketball league in North America composed of 30 teams. 
It is one of the major professional sports leagues in the United States and Canada and is considered the premier professional basketball league in the world.", + "u": "https://en.wikipedia.org/wiki/National_Basketball_Association", + "st": { + "0": { + "prop": "Founded", + "val": "June 6, 1946, New York, New York, United States" + }, + "1": { + "prop": "Headquarters", + "val": "New York, New York, United States" + }, + "2": { + "prop": "Inaugural season", + "val": "1946–47" + }, + "3": { + "prop": "Most recent champion(s)", + "val": "Denver Nuggets; (1st title)" + } + }, + "q": "nba", + "qurl": "https://www.google.com/search?q=nba&gl=us&hl=en", + "ctry": "de" + } + }, + { + "type": "wdp", + "action": "widget-title", + "payload": { + "q": "nba", + "wt": "NBA", "ctry": "de" } } diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/nirvana-polly-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/nirvana-polly-2023-10-10/scenario.json index 447dcd47..04c43de3 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/nirvana-polly-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/nirvana-polly-2023-10-10/scenario.json @@ -3,45 +3,89 @@ "mustContain": [ { "type": "wdp", - "action": "video-p", + "action": "query0", "payload": { "q": "nirvana polly", "qurl": "https://www.google.com/search?q=nirvana%20polly&gl=us&hl=en", - "r": { - "0": { - "t": "Nirvana - Polly (Audio)", - "u": "https://www.youtube.com/watch?v=DrlaVYKWeLU" - } - }, - "ctry": "de" + "ctry": "de", + "lang": "en" } }, { "type": "wdp", - "action": "videos-p", + "action": "query", "payload": { - "q": "nirvana polly", - "qurl": "https://www.google.com/search?q=nirvana%20polly&gl=us&hl=en", "r": { "0": { - "t": "Nirvana - Polly (Live On MTV Unplugged, 1993 / Unedited)", - "u": "https://www.youtube.com/watch?v=3H0NHHKBemg" + "t": "Polly (Nirvana song)", + "u": 
"https://en.wikipedia.org/wiki/Polly_(Nirvana_song)", + "age": null, + "m": null, + "lang": "en" }, "1": { - "t": "Nirvana - Polly (Live On MTV Unplugged, 1993 / Rehearsal)", - "u": "https://www.youtube.com/watch?v=ZSEoYDA2txw" + "t": "Nirvana – Polly Lyrics", + "u": "https://genius.com/Nirvana-polly-lyrics", + "age": null, + "m": null, + "lang": "en" }, "2": { - "t": "Nirvana - Polly (Live At The Paramount, Seattle / 1991)", - "u": "https://www.youtube.com/watch?v=abBgsNx85mI" + "t": "The brutally tragic events that led to Nirvana song 'Polly'", + "u": "https://faroutmagazine.co.uk/tragic-events-led-to-nirvana-song-polly/", + "age": null, + "m": null, + "lang": "en" }, "3": { "t": "Polly", - "u": "https://www.google.com/#" + "u": "https://www.youtube.com/watch?v=J8XzaK43KbE", + "age": null, + "m": null, + "lang": "en" + }, + "4": { + "t": "Polly Lyrics - Nirvana", + "u": "https://songmeanings.com/songs/view/379/", + "age": null, + "m": null, + "lang": "en" + }, + "5": { + "t": "Polly | Nirvana Wiki - Fandom", + "u": "https://nirvana.fandom.com/wiki/Polly", + "age": null, + "m": null, + "lang": "en" + }, + "6": { + "t": "Polly - song and lyrics by Nirvana", + "u": "https://open.spotify.com/track/3tlXDvaNrrOmdvG0XVUOcv", + "age": null, + "m": null, + "lang": "en" } }, + "q": "nirvana polly", + "qurl": "https://www.google.com/search?q=nirvana%20polly&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, + { + "type": "wdp", + "action": "video-p", + "payload": { + "r": { + "0": { + "t": "Nirvana - Polly (Audio)", + "u": "https://www.youtube.com/watch?v=DrlaVYKWeLU" + } + }, + "q": "nirvana polly", + "qurl": "https://www.google.com/search?q=nirvana%20polly&gl=us&hl=en", "ctry": "de" } } ] -} +} \ No newline at end of file diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/passauer-dom-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/passauer-dom-2023-10-10/scenario.json index 
7ecbc46a..5b76b135 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/passauer-dom-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/passauer-dom-2023-10-10/scenario.json @@ -1,6 +1,16 @@ { "url": "https://www.google.com/search?q=passauer%20dom&gl=us&hl=en", "mustContain": [ + { + "type": "wdp", + "action": "query0", + "payload": { + "q": "passauer dom", + "qurl": "https://www.google.com/search?q=passauer%20dom&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, { "type": "wdp", "action": "query", @@ -10,80 +20,161 @@ "t": "St. Stephen's Cathedral, Passau", "u": "https://en.wikipedia.org/wiki/St._Stephen's_Cathedral,_Passau", "age": null, - "m": null + "m": null, + "lang": "en" }, "1": { "t": "St. Stephen's Cathedral - Passau Tourism", "u": "https://tourism.passau.de/discover-passau/sights/st-stephens-cathedral/", "age": null, - "m": null + "m": null, + "lang": "en" }, "2": { "t": "Dom St. Stephan, Passau", "u": "https://www.tripadvisor.com/Attraction_Review-g187311-d1899014-Reviews-Dom_St_Stephan-Passau_Lower_Bavaria_Bavaria.html", "age": null, - "m": null + "m": null, + "lang": "en" }, "3": { "t": "Dom St. Stephan", "u": "https://www.bistum-passau.de/dom-kultur/dom-st-stephan-passau", "age": null, - "m": null + "m": null, + "lang": "de" }, "4": { "t": "St. 
Stephen's Cathedral | Straße der Kaiser und Könige", "u": "https://www.routeofemperorsandkings.com/location/dom-st-stephan-in-passau/", "age": null, - "m": null + "m": null, + "lang": "en" }, "5": { "t": "Der Dom zu Passau: Vom Mittelalter bis zur Gegenwart", "u": "https://www.amazon.com/Der-Dom-zu-Passau/dp/3791727079", "age": null, - "m": null + "m": null, + "lang": "en" }, "6": { "t": "Passau Cathedral / Passauer Dom - Google My Maps", "u": "https://www.google.com/mymaps/viewer?mid=1C7BLBkYAgru9xUPPCRzoAfdS6uI&hl=en", "age": null, - "m": null + "m": null, + "lang": "de" }, "7": { "t": "Organ Dom Passau", "u": "https://www.amazon.com/Organ-Dom-Passau-Athanasiades/dp/B000024E99", "age": null, - "m": null + "m": null, + "lang": "en" }, "8": { "t": "Dom St. Stephan (Passau) - Cycle Routes and Map", "u": "https://www.komoot.com/highlight/295304", "age": null, - "m": null + "m": null, + "lang": "en" } }, "q": "passauer dom", "qurl": "https://www.google.com/search?q=passauer%20dom&gl=us&hl=en", - "ctry": "de" + "ctry": "de", + "lang": "en" } }, { "type": "wdp", - "action": "places2", + "action": "infobox", "payload": { + "t": "St. Stephan's Cathedral", + "desc": "St. Stephen's Cathedral is a baroque church from 1688 in Passau, Germany, dedicated to Saint Stephen. 
It is the seat of the Catholic Bishop of Passau and the main church of his diocese.\nSince 730, there have been many churches built on the site of the current cathedral.", + "u": "https://en.wikipedia.org/wiki/St._Stephen's_Cathedral,_Passau", + "st": { + "0": { + "prop": "Architect", + "val": "Carlo Lurago" + }, + "1": { + "prop": "Architectural style", + "val": "Baroque architecture" + }, + "2": { + "prop": "Opened", + "val": "1693" + }, + "3": { + "prop": "Bells", + "val": "8" + } + }, "q": "passauer dom", - "type": "Map", "qurl": "https://www.google.com/search?q=passauer%20dom&gl=us&hl=en", "ctry": "de" } }, { "type": "wdp", - "action": "place", + "action": "placeinfo", "payload": { + "ad": null, + "addr": "Domplatz, 94032 Passau, Germany", "t": "St. Stephan's Cathedral", "u": "https://www.bistum-passau.de/dom-kultur/dom-st-stephan-passau", - "addr": "Domplatz, 94032 Passau, Germany", - "loc": "48.5741597,13.4656793,15z", + "sum": "Catholic cathedral in Passau, Germany", + "desc": " ", + "p": "+49 851 3930", + "opt": null, + "open": { + "0": { + "item": "Tuesday" + }, + "1": { + "item": "6:30 AM–7 PM" + }, + "2": { + "item": "Wednesday" + }, + "3": { + "item": "6:30 AM–7 PM" + }, + "4": { + "item": "Thursday" + }, + "5": { + "item": "6:30 AM–7 PM" + }, + "6": { + "item": "Friday" + }, + "7": { + "item": "6:30 AM–7 PM" + }, + "8": { + "item": "Saturday" + }, + "9": { + "item": "6:30 AM–7 PM" + }, + "10": { + "item": "Sunday" + }, + "11": { + "item": "6:30 AM–7 PM" + }, + "12": { + "item": "Monday" + }, + "13": { + "item": "6:30 AM–7 PM" + } + }, + "shut": null, + "q": "passauer dom", + "qurl": "https://www.google.com/search?q=passauer%20dom&gl=us&hl=en", "ctry": "de" } }, @@ -91,8 +182,6 @@ "type": "wdp", "action": "img-p", "payload": { - "q": "passauer dom", - "qurl": "https://www.google.com/search?q=passauer%20dom&gl=us&hl=en", "r": { "0": { "u": "https://en.wikipedia.org/wiki/St._Stephen%27s_Cathedral,_Passau" @@ -125,37 +214,30 @@ "u": 
"https://www.pfarrbriefservice.de/image/dom-st-stephan-passau" } }, + "q": "passauer dom", + "qurl": "https://www.google.com/search?q=passauer%20dom&gl=us&hl=en", "ctry": "de" } }, { "type": "wdp", - "action": "infobox", + "action": "place", + "payload": { + "t": "St. Stephan's Cathedral", + "u": "https://www.bistum-passau.de/dom-kultur/dom-st-stephan-passau", + "addr": "Domplatz, 94032 Passau, Germany", + "loc": "48.5741597,13.4656793,15z", + "ctry": "de" + } + }, + { + "type": "wdp", + "action": "places2", "payload": { "q": "passauer dom", + "type": "map", "qurl": "https://www.google.com/search?q=passauer%20dom&gl=us&hl=en", - "ctry": "de", - "t": "St. Stephan's Cathedral", - "desc": "St. Stephen's Cathedral is a baroque church from 1688 in Passau, Germany, dedicated to Saint Stephen. It is the seat of the Catholic Bishop of Passau and the main church of his diocese.\nSince 730, there have been many churches built on the site of the current cathedral.", - "u": "https://en.wikipedia.org/wiki/St._Stephen's_Cathedral,_Passau", - "st": { - "0": { - "prop": "Architect", - "val": "Carlo Lurago" - }, - "1": { - "prop": "Architectural style", - "val": "Baroque architecture" - }, - "2": { - "prop": "Opened", - "val": "1693" - }, - "3": { - "prop": "Bells", - "val": "8" - } - } + "ctry": "de" } } ] diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/sneezing-panda-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/sneezing-panda-2023-10-10/scenario.json index 4e5eddb8..ba26e526 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/sneezing-panda-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/sneezing-panda-2023-10-10/scenario.json @@ -1,6 +1,16 @@ { "url": "https://www.google.com/search?q=sneezing%20panda&gl=us&hl=en", "mustContain": [ + { + "type": "wdp", + "action": "query0", + "payload": { + "q": "sneezing 
panda", + "qurl": "https://www.google.com/search?q=sneezing%20panda&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, { "type": "wdp", "action": "query", @@ -10,74 +20,83 @@ "t": "Sneezing Baby Panda | Original Video - YouTube", "u": "https://www.youtube.com/watch?v=93hq0YU3Gqk", "age": null, - "m": null + "m": null, + "lang": null }, "1": { "t": "Sneezing Baby Panda (2006) - YouTube", "u": "https://www.youtube.com/watch?v=YOdQ88rQ464", "age": null, - "m": null + "m": null, + "lang": null }, "2": { "t": "Sneezing Baby Panda the Original.mov - YouTube", "u": "https://www.youtube.com/watch?v=O4rfQSgkZOE", "age": null, - "m": null + "m": null, + "lang": null }, "3": { "t": "The Panda Keeps Sneezing | iPanda - YouTube", "u": "https://www.youtube.com/watch?v=cLn5NCBXYkQ", "age": null, - "m": null + "m": null, + "lang": null }, "4": { "t": "Sneezing Baby Panda", "u": "https://knowyourmeme.com/memes/sneezing-baby-panda", "age": null, - "m": null + "m": null, + "lang": "en" }, "5": { "t": "Sneezing Baby Panda: The Movie (2015)", "u": "https://www.imdb.com/title/tt1986164/", "age": null, - "m": null + "m": null, + "lang": "en" }, "6": { "t": "Adorable panda sneezes so loud he startles his playmates", "u": "https://www.dailymail.co.uk/video/news/video-2310152/Video-Adorable-panda-sneezes-loud-startles-playmates.html", "age": null, - "m": null + "m": null, + "lang": null }, "7": { "t": "Baby Pandas in Minecraft have a chance of sneezing ...", "u": "https://www.reddit.com/r/GamingDetails/comments/cokc75/baby_pandas_in_minecraft_have_a_chance_of/", "age": null, - "m": null + "m": null, + "lang": "en" }, "8": { "t": "Panda sneezing hilariously | By iPanda - Facebook", "u": "https://www.facebook.com/ipandacom/videos/panda-sneezing-hilariously/1027178498031751/", "age": null, - "m": null + "m": null, + "lang": null }, "9": { "t": "this poor mama panda, also who knew pandas even sneezed ...", "u": 
"https://www.tiktok.com/@wholesome_content1020/video/7022350825741683973?lang=en", "age": null, - "m": null + "m": null, + "lang": null } }, "q": "sneezing panda", "qurl": "https://www.google.com/search?q=sneezing%20panda&gl=us&hl=en", - "ctry": "de" + "ctry": "de", + "lang": "en" } }, { "type": "wdp", "action": "img-p", "payload": { - "q": "sneezing panda", - "qurl": "https://www.google.com/search?q=sneezing%20panda&gl=us&hl=en", "r": { "0": { "u": "http://google.com/search?tbm=isch&q=Sneezing+Baby+Panda+-+The+Movie" @@ -101,6 +120,8 @@ "u": "https://mashable.com/video/claymation-panda-sneeze" } }, + "q": "sneezing panda", + "qurl": "https://www.google.com/search?q=sneezing%20panda&gl=us&hl=en", "ctry": "de" } } diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/sq-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/sq-2023-10-10/scenario.json index 061bb9f3..7dce4045 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/sq-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/sq-2023-10-10/scenario.json @@ -1,6 +1,16 @@ { "url": "https://www.google.com/search?q=sq&gl=us&hl=en", "mustContain": [ + { + "type": "wdp", + "action": "query0", + "payload": { + "q": "sq", + "qurl": "https://www.google.com/search?q=sq&gl=us&hl=en", + "ctry": "de", + "lang": "en-UA" + } + }, { "type": "wdp", "action": "query", @@ -10,75 +20,106 @@ "t": "Loch Ness Monster", "u": "https://en.wikipedia.org/wiki/Loch_Ness_Monster", "age": null, - "m": null + "m": null, + "lang": "en" }, "1": { "t": "Ungeheuer von Loch Ness", "u": "https://de.wikipedia.org/wiki/Ungeheuer_von_Loch_Ness", "age": null, - "m": null + "m": null, + "lang": "de" }, "2": { "t": "Monster von Loch Ness: Was wir über Nessie wissen", "u": "https://www.galileo.tv/natur/monster-von-loch-ness-suche-nach-nessie-see-ungeheuer/", "age": null, - "m": null + "m": null, + 
"lang": "de" }, "3": { "t": "Das Ungeheuer von Loch Ness! Wie ist der Nessie ... - YouTube", "u": "https://www.youtube.com/watch?v=SlT232EQSQ8", "age": null, - "m": null + "m": null, + "lang": null }, "4": { "t": "Ein Ungeheuer? Loch Ness gibt sein Rätsel nicht preis", "u": "https://www.youtube.com/watch?v=2hryr-XK66Y", "age": null, - "m": null + "m": null, + "lang": null }, "5": { "t": "Entdecken Sie das Loch Ness Monster | Highland Titles", "u": "https://www.highlandtitles.de/das-loch-ness-monster/", "age": null, - "m": null + "m": null, + "lang": "de" }, "6": { "t": "Ist das ENDLICH der Beweis für „Nessie“? - YouTube", "u": "https://www.youtube.com/watch?v=tznzEUE3krc", "age": null, - "m": null + "m": null, + "lang": null }, "7": { "t": "Nessie – das Ungeheuer von Loch Ness - Fabelwesen - ...", "u": "https://www.planet-wissen.de/kultur/fabelwesen/britische_schauergestalten/pwienessiedasungeheuervonlochness100.html", "age": null, - "m": null + "m": null, + "lang": "de" }, "8": { "t": "Loch Ness in Schottland: Größte Suche nach Ungeheuer ...", "u": "https://www.spiegel.de/panorama/loch-ness-in-schottland-groesste-suche-nach-ungeheuer-nessie-seit-jahrzehnten-geplant-a-e005552e-425a-4367-9df1-f7a43f9d3ce5", "age": null, - "m": null + "m": null, + "lang": "de" }, "9": { "t": "Nessie - Das Ungeheuer von Loch Ness: Monster, Mythen ...", "u": "https://www.amazon.com/Nessie-Das-Ungeheuer-Loch-Ness/dp/3890943853", "age": null, - "m": null + "m": null, + "lang": "en" } }, "q": "ungeheuer von loch ness", "qurl": "https://www.google.com/search?q=sq&gl=us&hl=en", - "ctry": "de" + "ctry": "de", + "lang": "en-UA" } }, { "type": "wdp", - "action": "snippet2", + "action": "infobox", "payload": { - "r": { + "t": "About", + "desc": "The Loch Ness Monster, affectionately known as Nessie, is a creature in Scottish folklore that is said to inhabit Loch Ness in the Scottish Highlands. 
It is often described as large, long-necked, and with one or more humps protruding from the water.", + "u": "https://en.wikipedia.org/wiki/Loch_Ness_Monster", + "st": { "0": { - "u": "https://en.wikipedia.org/wiki/Loch_Ness_Monster" + "prop": "Notable aliases", + "val": "Loch Ness monster, Nessie, Nessie, Niseag" + }, + "1": { + "prop": "Similar Creatures", + "val": "Ogopogo, Mokele-mbembe, Champ" + }, + "2": { + "prop": "Based on", + "val": "Plesiosaur" + }, + "3": { + "prop": "First attested", + "val": "565" + }, + "4": { + "prop": "Similar entities", + "val": "Champ, Ogopogo, Altamaha-ha" } }, "q": "ungeheuer von loch ness", @@ -90,8 +131,6 @@ "type": "wdp", "action": "img-p", "payload": { - "q": "ungeheuer von loch ness", - "qurl": "https://www.google.com/search?q=sq&gl=us&hl=en", "r": { "0": { "u": "https://www.youtube.com/watch?v=SlT232EQSQ8" @@ -109,41 +148,9 @@ "u": "https://www.nzz.ch/panorama/monster-von-loch-ness-ein-hype-wird-90-ld.1735498" } }, - "ctry": "de" - } - }, - { - "type": "wdp", - "action": "infobox", - "payload": { "q": "ungeheuer von loch ness", "qurl": "https://www.google.com/search?q=sq&gl=us&hl=en", - "ctry": "de", - "t": "About", - "desc": "The Loch Ness Monster, affectionately known as Nessie, is a creature in Scottish folklore that is said to inhabit Loch Ness in the Scottish Highlands. 
It is often described as large, long-necked, and with one or more humps protruding from the water.", - "u": "https://en.wikipedia.org/wiki/Loch_Ness_Monster", - "st": { - "0": { - "prop": "Notable aliases", - "val": "Loch Ness monster, Nessie, Nessie, Niseag" - }, - "1": { - "prop": "Similar Creatures", - "val": "Ogopogo, Mokele-mbembe, Champ" - }, - "2": { - "prop": "Based on", - "val": "Plesiosaur" - }, - "3": { - "prop": "First attested", - "val": "565" - }, - "4": { - "prop": "Similar entities", - "val": "Champ, Ogopogo, Altamaha-ha" - } - } + "ctry": "de" } } ] diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/trump-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/trump-2023-10-10/scenario.json index 1abce3ac..973b06ef 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/trump-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/trump-2023-10-10/scenario.json @@ -1,6 +1,16 @@ { "url": "https://www.google.com/search?q=trump&gl=us&hl=en", "mustContain": [ + { + "type": "wdp", + "action": "query0", + "payload": { + "q": "trump", + "qurl": "https://www.google.com/search?q=trump&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, { "type": "wdp", "action": "query", @@ -10,60 +20,70 @@ "t": "Donald Trump", "u": "https://en.wikipedia.org/wiki/Donald_Trump", "age": null, - "m": null + "m": null, + "lang": "en" }, "1": { "t": "Donald Trump - Breaking News, Photos and Videos", "u": "https://thehill.com/people/donald-trump/", "age": null, - "m": null + "m": null, + "lang": "en" }, "2": { "t": "Donald Trump | Breaking News & Latest Updates", "u": "https://apnews.com/hub/donald-trump", "age": null, - "m": null + "m": null, + "lang": "en" }, "3": { "t": "Trump News, Commentary and Analysis", "u": "https://www.cnn.com/specials/politics/president-donald-trump-45", "age": null, - "m": null + "m": null, + "lang": 
"en" }, "4": { "t": "Donald J. Trump – The White House - National Archives |", "u": "https://trumpwhitehouse.archives.gov/people/donald-j-trump/", "age": null, - "m": null + "m": null, + "lang": "en" }, "5": { "t": "Donald Trump", "u": "https://www.cnbc.com/donald-trump/", "age": null, - "m": null + "m": null, + "lang": "en" }, "6": { "t": "Donald Trump | US news", "u": "https://www.theguardian.com/us-news/donaldtrump", "age": null, - "m": null + "m": null, + "lang": "en" }, "7": { "t": "Donald Trump", "u": "https://www.nytimes.com/spotlight/donald-trump", "age": null, - "m": null + "m": null, + "lang": "en" }, "8": { "t": "Donald Trump | Today's latest from Al Jazeera", "u": "https://www.aljazeera.com/tag/donald-trump/", "age": null, - "m": null + "m": null, + "lang": "en" } }, "q": "trump", "qurl": "https://www.google.com/search?q=trump&gl=us&hl=en", - "ctry": "de" + "ctry": "de", + "lang": "en" } }, { @@ -114,11 +134,31 @@ }, { "type": "wdp", - "action": "snippet2", + "action": "infobox", "payload": { - "r": { + "t": "About", + "desc": "Donald John Trump is an American politician, media personality, and businessman who served as the 45th president of the United States from 2017 to 2021.\nTrump received a BS in economics from the University of Pennsylvania in 1968, and his father named him president of his real-estate business in 1971.", + "u": "https://en.wikipedia.org/wiki/Donald_Trump", + "st": { "0": { - "u": "https://en.wikipedia.org/wiki/Donald_Trump" + "prop": "Born", + "val": "June 14, 1946 (age 77 years), Jamaica Hospital Medical Center, New York, NY" + }, + "1": { + "prop": "Party", + "val": "Republican Party" + }, + "2": { + "prop": "Net worth", + "val": "2.5 billion USD (2023) Forbes" + }, + "3": { + "prop": "Spouse", + "val": "Melania Trump (m. 2005), Marla Maples (m. 1993–1999), Ivana Trump (m. 
1977–1990)" + }, + "4": { + "prop": "Height", + "val": "6′ 3″" } }, "q": "trump", @@ -130,8 +170,6 @@ "type": "wdp", "action": "img-p", "payload": { - "q": "trump", - "qurl": "https://www.google.com/search?q=trump&gl=us&hl=en", "r": { "0": { "u": "http://t1.gstatic.com/images?q=tbn:ANd9GcQQn6_Hz9zTckXYuOa1biiMhulnHv6pKtadAFcdg79yocrL3Y29" @@ -149,41 +187,9 @@ "u": "https://encrypted-tbn0.gstatic.com/licensed-image?q=tbn:ANd9GcSxE9wxi30Xzg-PPmyB3ne1hIn-3O_unBBYHvhL-9K1c-oQQ1u7Y6fUH8jI6-EnbqVsXZAmsRU7G-YfEF0" } }, - "ctry": "de" - } - }, - { - "type": "wdp", - "action": "infobox", - "payload": { "q": "trump", "qurl": "https://www.google.com/search?q=trump&gl=us&hl=en", - "ctry": "de", - "t": "About", - "desc": "Donald John Trump is an American politician, media personality, and businessman who served as the 45th president of the United States from 2017 to 2021.\nTrump received a BS in economics from the University of Pennsylvania in 1968, and his father named him president of his real-estate business in 1971.", - "u": "https://en.wikipedia.org/wiki/Donald_Trump", - "st": { - "0": { - "prop": "Born", - "val": "June 14, 1946 (age 77 years), Jamaica Hospital Medical Center, New York, NY" - }, - "1": { - "prop": "Party", - "val": "Republican Party" - }, - "2": { - "prop": "Net worth", - "val": "2.5 billion USD (2023) Forbes" - }, - "3": { - "prop": "Spouse", - "val": "Melania Trump (m. 2005), Marla Maples (m. 1993–1999), Ivana Trump (m. 
1977–1990)" - }, - "4": { - "prop": "Height", - "val": "6′ 3″" - } - } + "ctry": "de" } } ] diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/von-der-leyen-2023-10-10/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/von-der-leyen-2023-10-10/scenario.json index 61441676..b46d70f4 100644 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/von-der-leyen-2023-10-10/scenario.json +++ b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/go/von-der-leyen-2023-10-10/scenario.json @@ -1,6 +1,16 @@ { "url": "https://www.google.com/search?q=von%20der%20leyen&gl=us&hl=en", "mustContain": [ + { + "type": "wdp", + "action": "query0", + "payload": { + "q": "von der leyen", + "qurl": "https://www.google.com/search?q=von%20der%20leyen&gl=us&hl=en", + "ctry": "de", + "lang": "en" + } + }, { "type": "wdp", "action": "query", @@ -10,109 +20,83 @@ "t": "Ursula von der Leyen", "u": "https://en.wikipedia.org/wiki/Ursula_von_der_Leyen", "age": null, - "m": null + "m": null, + "lang": "en" }, "1": { "t": "Ursula von der Leyen (@vonderleyen) · Twitter", "u": "https://twitter.com/vonderleyen", "age": null, - "m": null + "m": null, + "lang": null }, "2": { "t": "Ursula von der Leyen - The Commissioners", "u": "https://commissioners.ec.europa.eu/ursula-von-der-leyen_en", "age": null, - "m": null + "m": null, + "lang": "en" }, "3": { "t": "Ursula von der Leyen - European Commission", "u": "https://be.linkedin.com/in/ursula-von-der-leyen", "age": null, - "m": null + "m": null, + "lang": "en" }, "4": { "t": "Ursula von der Leyen", "u": "https://apnews.com/hub/ursula-von-der-leyen", "age": null, - "m": null + "m": null, + "lang": "en" }, "5": { "t": "Ursula von der Leyen (@ursulavonderleyen)", "u": "https://www.instagram.com/ursulavonderleyen/", "age": null, - "m": null + "m": null, + "lang": "en" }, "6": { "t": "2023 State of the Union Address by President Ursula von ...", "u": 
"https://www.wilsoncenter.org/article/2023-state-union-address-president-ursula-von-der-leyen-how-will-eu-answer-call-history", "age": null, - "m": null + "m": null, + "lang": "en" }, "7": { "t": "Ursula von der Leyen - Geschichte der CDU", "u": "https://www.kas.de/en/web/geschichte-der-cdu/biogram-detail/-/content/ursula-von-der-leyen-2", "age": null, - "m": null + "m": null, + "lang": "en" }, "8": { "t": "Ursula von der Leyen | Facts, European Commission, & ...", "u": "https://www.britannica.com/biography/Ursula-von-der-Leyen", "age": null, - "m": null + "m": null, + "lang": "en" }, "9": { "t": "Ursula von der Leyen | World", "u": "https://www.theguardian.com/world/ursula-von-der-leyen", "age": null, - "m": null + "m": null, + "lang": "en" } }, "q": "von der leyen", "qurl": "https://www.google.com/search?q=von%20der%20leyen&gl=us&hl=en", - "ctry": "de" - } - }, - { - "type": "wdp", - "action": "snippet2", - "payload": { - "r": { - "0": { - "u": "https://en.wikipedia.org/wiki/Ursula_von_der_Leyen" - } - }, - "q": "von der leyen", - "qurl": "https://www.google.com/search?q=von%20der%20leyen&gl=us&hl=en", - "ctry": "de" - } - }, - { - "type": "wdp", - "action": "img-p", - "payload": { - "q": "von der leyen", - "qurl": "https://www.google.com/search?q=von%20der%20leyen&gl=us&hl=en", - "r": { - "0": { - "u": "https://time.com/6185490/ursula-von-der-leyen-interview/" - }, - "1": { - "u": "https://encrypted-tbn0.gstatic.com/licensed-image?q=tbn:ANd9GcTFJJMhkRUA1RqVwF49U81zW0e9qoZG-r4afDeq0FuBVH71KJYwiEKoEYcTfdChTTzFD0AXUmyu3b9Vqe0" - }, - "2": { - "u": "https://encrypted-tbn3.gstatic.com/licensed-image?q=tbn:ANd9GcRN6TyNorrXvkQRDMmDSaPQuIJpU8qrD0y-6CHMoF_KsLWCIcOAZRxTBVr5bwoOCYawcJdO6Kgxfy7S680" - } - }, - "ctry": "de" + "ctry": "de", + "lang": "en" } }, { "type": "wdp", "action": "infobox", "payload": { - "q": "von der leyen", - "qurl": "https://www.google.com/search?q=von%20der%20leyen&gl=us&hl=en", - "ctry": "de", "t": "About", "desc": "Ursula Gertrud von der Leyen 
is a German physician and politician serving as the 13th president of the European Commission since 2019. She served in the German federal government between 2005 and 2019, holding successive positions in Angela Merkel's cabinet, most recently as minister of defence.", "u": "https://en.wikipedia.org/wiki/Ursula_von_der_Leyen", @@ -145,7 +129,30 @@ "prop": "Party", "val": "European People's Party group" } - } + }, + "q": "von der leyen", + "qurl": "https://www.google.com/search?q=von%20der%20leyen&gl=us&hl=en", + "ctry": "de" + } + }, + { + "type": "wdp", + "action": "img-p", + "payload": { + "r": { + "0": { + "u": "https://time.com/6185490/ursula-von-der-leyen-interview/" + }, + "1": { + "u": "https://encrypted-tbn0.gstatic.com/licensed-image?q=tbn:ANd9GcTFJJMhkRUA1RqVwF49U81zW0e9qoZG-r4afDeq0FuBVH71KJYwiEKoEYcTfdChTTzFD0AXUmyu3b9Vqe0" + }, + "2": { + "u": "https://encrypted-tbn3.gstatic.com/licensed-image?q=tbn:ANd9GcRN6TyNorrXvkQRDMmDSaPQuIJpU8qrD0y-6CHMoF_KsLWCIcOAZRxTBVr5bwoOCYawcJdO6Kgxfy7S680" + } + }, + "q": "von der leyen", + "qurl": "https://www.google.com/search?q=von%20der%20leyen&gl=us&hl=en", + "ctry": "de" } } ] diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/patterns.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/patterns.json deleted file mode 100644 index de9fea17..00000000 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/patterns.json +++ /dev/null @@ -1,239 +0,0 @@ -{ - "urlPatterns": [ - "\\.google\\..*?[#?&;]((q=[^&]+&([^&]+&)*tbm=isch)|(tbm=isch&([^&]+&)*q=[^&]+))", - "^https:[/][/][^/]*[.]google[.].*?[#?&;]((q=[^&]+&([^&]+&)*tbm=vid)|(tbm=vid&([^&]+&)*q=[^&]+))", - "^https:[/][/][^/]+[.]google[.][^/]+/search[?](.+[&])?q=[^$&]+", - ".search.yahoo\\..*?[#?&;]p=[^$&]+", - ".linkedin.*?\\/pub\\/dir+", - "\\.bing\\.[^/]+\\/images/search[?]q=[^$&]+", - "\\.bing\\..*?[#?&;]q=[^$&]+", - "\\.amazon\\.[^/]+\\/(s[?]k=[^$&]+|.*[?&]field-keywords=[^$&]+)", - 
"\\.amazon\\..*(/dp/|/gp/product/)" - ], - "searchEngines": ["0", "1", "2", "3", "5", "6", "7"], - "scrape": { - "0": { - "form[role=\"search\"]": { - "q": { - "item": "form input[name=\"q\"]", - "type": "searchQuery", - "etype": "value", - "keyName": "q" - } - } - }, - "1": { - "dont>match>body>head>html": { - "q": { - "type": "searchQuery", - "etype": "url", - "keyName": "q", - "functionsApplied": [["parseU", "qs", "q"]] - } - }, - "form[role=\"search\"]": { - "q": { - "item": "form .gLFyf[name=\"q\"]", - "type": "searchQuery", - "etype": "value", - "keyName": "q" - } - } - }, - "2": { - "dont>match>body>head>html": { - "q": { - "type": "searchQuery", - "etype": "url", - "keyName": "q", - "functionsApplied": [["parseU", "qs", "q"]] - } - }, - "#mobile-search #sfcnt": { - "q": { - "item": "input", - "type": "searchQuery", - "etype": "value", - "keyName": "q" - } - }, - "form[role=\"search\"]": { - "q": { - "item": "form .gLFyf[name=\"q\"]", - "type": "searchQuery", - "etype": "value", - "keyName": "q" - } - }, - "#search": { - "q": { - "item": "#rso", - "type": "searchQuery", - "etype": "data-async-context", - "keyName": "q", - "functionsApplied": [["splitF", "query:", 1]] - }, - "widgetTitle": { - "item": "div.EfDVh.viOShc div.ofy7ae, div.EfDVh.viOShc table.torspo_view__table span.tsp-ht", - "type": "widgetTitle", - "etype": "textContent", - "keyName": "wt" - } - } - }, - "3": { - ".sbq-w": { - "q": { - "item": "#yschsp", - "type": "searchQuery", - "etype": "value", - "keyName": "query" - } - } - }, - "4": { - ".profile-card": { - "img": { - "item": ".image", - "type": "arr", - "etype": "src", - "keyName": "imgl" - }, - "fullName": { - "item": ".content h3 a", - "type": "arr", - "etype": "text", - "keyName": "fn" - }, - "profileLink": { - "item": ".content h3 a", - "type": "arr", - "etype": "href", - "keyName": "pl" - }, - "currentWork": { - "item": ".content p.headline", - "type": "arr", - "etype": "textContent", - "keyName": "cw" - } - }, - "ctry": { - "ctry": 
{ - "type": "standard", - "etype": "ctry", - "keyName": "ctry" - } - } - }, - "5": { - "#sb_form": { - "q": { - "item": "#sb_form_q", - "type": "searchQuery", - "etype": "value", - "keyName": "query" - } - } - }, - "6": { - "#sb_form": { - "q": { - "item": "#sb_form_q", - "type": "searchQuery", - "etype": "value", - "keyName": "query" - } - } - }, - "7": { - "#twotabsearchtextbox": { - "q": { - "type": "searchQuery", - "etype": "value", - "keyName": "query" - } - }, - "[data-component-type=\"s-result-sort\"]": { - "q": { - "item": "form > input[name=\"k\"]", - "type": "searchQuery", - "etype": "value", - "keyName": "query" - } - } - }, - "8": { - ".nav-search-field > input": { - "q": { - "type": "searchQuery", - "etype": "value", - "keyName": "query" - } - }, - "#wayfinding-breadcrumbs_container": { - "c": { - "item": "div", - "etype": "textContent", - "keyName": "c" - } - }, - "ctry": { - "ctry": { - "type": "standard", - "etype": "ctry", - "keyName": "ctry" - } - }, - "link[rel='canonical']": { - "curl": { - "etype": "href", - "keyName": "curl" - } - } - } - }, - "payloads": { - "2": { - "#search": { - "type": "single", - "results": "single", - "action": "widgetTitle", - "fields": [ - ["#search", "q"], - ["#search", "widgetTitle"] - ] - } - }, - "4": { - ".profile-card": { - "type": "single", - "results": "single", - "action": "linkedin" - } - }, - "8": { - "q": { - "type": "query", - "results": "clustered", - "action": "category-am", - "fields": [ - ["#wayfinding-breadcrumbs_container", "c"], - ["link[rel='canonical']", "curl"], - ["ctry", "ctry"] - ] - } - } - }, - "idMapping": { - "0": "goi", - "1": "gov", - "2": "go", - "3": "ya", - "4": "lnkd", - "5": "bingi", - "6": "bing", - "7": "am", - "8": "amc" - } -} diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/rules.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/rules.json new file mode 100644 index 00000000..fbb3502f --- /dev/null +++ 
b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/rules.json @@ -0,0 +1,1880 @@ +{ + "search-go": { + "input": { + "#search": { + "first": { + "q": { + "select": "#rso", + "attr": "data-async-context", + "transform": [ + [ + "trySplit", + "query:", + 1 + ], + [ + "decodeURIComponent" + ] + ] + }, + "wt": { + "select": "div.EfDVh.viOShc div.ofy7ae, div.EfDVh.viOShc table.torspo_view__table span.tsp-ht", + "attr": "textContent" + } + } + }, + "html[lang]": { + "first": { + "lang": { + "attr": "lang", + "transform": [ + [ + "filterExact", + [ + "en", + "fr", + "en-GB", + "de", + "en-CA", + "ja", + "en-IN", + "en-AU", + "nl", + "it", + "pl", + "es-419", + "pt-BR", + "es", + "en-NL", + "en-DE", + "en-FR", + "de-AT", + "zh-TW", + "id", + "en-PH", + "en-SG", + "tr", + "nl-BE", + "en-ID", + "hu", + "cs", + "en-ZA", + "en-NZ", + "sv", + "fr-BE", + "en-PL", + "el", + "de-CH", + "pt-PT", + "en-MY", + "zh-CN", + "ko", + "da", + "en-BE", + "en-IE", + "ru-UA", + "fr-CA", + "en-IT", + "en-ES", + "en-RO", + "en-CH", + "ro", + "en-SE", + "fi", + "uk", + "en-GR", + "es-AR", + "en-PK", + "en-BR", + "fr-CH", + "es-MX", + "en-JP", + "no", + "en-AT", + "en-DK", + "sk", + "en-PT", + "en-CZ", + "vi", + "en-HK", + "en-FI", + "en-IL", + "zh-HK", + "en-NG", + "ru", + "en-TR", + "en-NO", + "en-VN", + "bg", + "en-BG", + "es-CO", + "en-TH", + "ru-BY", + "en-UA", + "en-MX", + "iw", + "en-HU", + "es-PE", + "en-EG", + "th", + "sl", + "hr", + "en-HR", + "en-RS", + "sr", + "ar-AE", + "bn", + "es-CL", + "en-BD", + "ar", + "ru-KZ", + "en-TW", + "fr-MA" + ] + ] + ] + } + } + }, + "div#rso div.g:not(:has(div.g))": { + "all": { + "t": { + "select": "a > br + h3, g-section-with-header g-link > a > h3", + "attr": "textContent" + }, + "u": { + "select": "div.yuRUbf > div > span > a[jsname], div.yuRUbf > div > a[jsname], div.nhaZ2c > div > span > a, div.DhN8Cf > div > a[jsname], g-section-with-header g-link > a.a-no-hover-decoration", + "attr": "href", + "transform": [ + [ + "trySplit", + 
"?ref_src=twsrc", + 0 + ], + [ + "tryDecodeURIComponent" + ] + ] + }, + "age": { + "select": "div > span.LEwnzc.Sqrs4e:not(:has(span[aria-hidden=\"true\"])) > span", + "attr": "textContent" + }, + "m": { + "select": ".TXwUJf a.fl", + "attr": "textContent" + }, + "lang": { + "attr": "lang" + } + } + }, + "#rso a.WlydOe": { + "all": { + "u": { + "attr": "href", + "transform": [ + [ + "maskU" + ] + ] + }, + "lu": { + "select": "div.OSrXXb > span", + "attr": "textContent" + }, + "h": { + "select": "div.ynAwRc[role='heading']", + "attr": "textContent" + } + } + }, + "#main": { + "first": { + "sq": { + "select": "#taw #fprsl.gL9Hy", + "attr": "textContent" + }, + "oq": { + "select": "#fprs a.spell_orig", + "attr": "textContent" + }, + "type": { + "firstMatch": [ + { + "select": "#lu_map", + "attr": "id", + "transform": [ + [ + "trySplit", + "lu_", + 1 + ] + ] + }, + { + "select": "#rso a > g-img > img[id^=dimg_][alt]", + "attr": "alt", + "transform": [ + [ + "trySplit", + " ", + 0 + ] + ] + }, + { + "select": "#rso a > g-img > img[id^=dimg_][alt]", + "attr": "alt", + "transform": [ + [ + "trySplit", + " ", + 0 + ] + ] + }, + { + "select": "#main div[jscontroller] a[aria-label][href*=\"/maps/\"]:has(img[alt])", + "attr": "aria-label", + "transform": [ + [ + "trySplit", + " ", + 0 + ] + ] + }, + { + "select": "div[jscontroller] a[href*=\"/maps/\"]:has(g-img > img[alt])", + "attr": "alt", + "transform": [ + [ + "trySplit", + " ", + 0 + ] + ] + } + ] + } + } + }, + "#main div[jsmodel] div.YzSd": { + "first": { + "type": { + "attr": "textContent" + } + } + }, + "#tads div[data-text-ad]": { + "all": { + "u": { + "firstMatch": [ + { + "select": "a.sVXRqc[data-pcu][href^=\"https://www.googleadservices.com/\"]", + "attr": "data-pcu", + "transform": [ + [ + "trySplit", + ",", + 0 + ], + [ + "maskU" + ] + ] + }, + { + "select": "a.sVXRqc", + "attr": "href", + "transform": [ + [ + "removeParams", + [ + "utm_source", + "utm_medium", + "utm_campaign", + "utm_term", + "utm_content", + 
"utm_cluster", + "utm_group", + "mtm_campaign", + "mtm_kwd", + "mtm_source", + "adg_ctx", + "_x_ads_sub_channel", + "_p_rfs", + "_x_ns_prz_type", + "_x_ns_sku_id", + "_x_ns_gid", + "mrk_rec", + "_x_ads_channel", + "_x_gmc_account", + "_x_login_type", + "_bg_fs", + "_p_jump_id", + "_x_vst_scene", + "source", + "ref_", + "smid", + "referralCode", + "locale_override", + "cpkey", + "Locale" + ] + ], + [ + "trySplit", + "#", + 0 + ], + [ + "relaxedMaskU" + ] + ] + } + ] + }, + "pcu": { + "select": "a.sVXRqc", + "attr": "data-pcu", + "transform": [ + [ + "trySplit", + ",", + 0 + ], + [ + "maskU" + ] + ] + }, + "t": { + "select": "a.sVXRqc > div.CCgQ5[role='heading'] > span", + "attr": "textContent" + }, + "text": { + "select": "div > div.Va3FIb", + "attr": "textContent" + } + } + }, + "[id^='vplap']": { + "all": { + "u": { + "attr": "href", + "transform": [ + [ + "removeParams", + [ + "utm_source", + "utm_medium", + "utm_campaign", + "utm_term", + "utm_content", + "utm_cluster", + "utm_group", + "mtm_campaign", + "mtm_kwd", + "mtm_source", + "adg_ctx", + "_x_ads_sub_channel", + "_p_rfs", + "_x_ns_prz_type", + "_x_ns_sku_id", + "_x_ns_gid", + "mrk_rec", + "_x_ads_channel", + "_x_gmc_account", + "_x_login_type", + "_bg_fs", + "_p_jump_id", + "_x_vst_scene", + "source", + "ref_", + "smid", + "referralCode", + "locale_override", + "cpkey", + "Locale" + ] + ], + [ + "trySplit", + "#", + 0 + ], + [ + "relaxedMaskU" + ] + ] + }, + "t": { + "attr": "aria-label" + } + } + }, + "#bottomads div[data-text-ad]": { + "all": { + "u": { + "firstMatch": [ + { + "select": "a[data-pcu][href^=\"https://www.googleadservices.com/\"]", + "attr": "data-pcu", + "transform": [ + [ + "trySplit", + ",", + 0 + ], + [ + "maskU" + ] + ] + }, + { + "select": "a[data-pcu]", + "attr": "href", + "transform": [ + [ + "removeParams", + [ + "utm_source", + "utm_medium", + "utm_campaign", + "utm_term", + "utm_content", + "utm_cluster", + "utm_group", + "mtm_campaign", + "mtm_kwd", + "mtm_source", + "adg_ctx", 
+ "_x_ads_sub_channel", + "_p_rfs", + "_x_ns_prz_type", + "_x_ns_sku_id", + "_x_ns_gid", + "mrk_rec", + "_x_ads_channel", + "_x_gmc_account", + "_x_login_type", + "_bg_fs", + "_p_jump_id", + "_x_vst_scene", + "source", + "ref_", + "smid", + "referralCode", + "locale_override", + "cpkey", + "Locale" + ] + ], + [ + "trySplit", + "#", + 0 + ], + [ + "relaxedMaskU" + ] + ] + } + ] + }, + "pcu": { + "select": "a[data-pcu]", + "attr": "data-pcu", + "transform": [ + [ + "trySplit", + ",", + 0 + ], + [ + "maskU" + ] + ] + }, + "t": { + "select": "div[role='heading']", + "attr": "textContent" + }, + "text": { + "select": "div > div.Va3FIb", + "attr": "textContent" + } + } + }, + "#rso div[data-id=\"jobs-detail-viewer\"] div.EimVGf[data-share-url]": { + "all": { + "t": { + "select": "div.tNxQIb", + "attr": "textContent" + }, + "c": { + "select": "div.wHYlTd", + "attr": "textContent" + }, + "loc": { + "select": "div.wHYlTd.FqK3wc", + "attr": "textContent", + "transform": [ + [ + "split", + "\u2022", + 0 + ], + [ + "trim" + ] + ] + }, + "via": { + "select": "div.wHYlTd.FqK3wc", + "attr": "textContent", + "transform": [ + [ + "split", + "\u2022", + 1 + ], + [ + "trySplit", + " \u00fcber ", + 1 + ], + [ + "trySplit", + " via ", + 1 + ], + [ + "trim" + ] + ] + }, + "age": { + "select": "div.ApHyTb div.I2Cbhb:has(svg > path[d^=\"M11.99 2C6\"])", + "attr": "textContent" + }, + "jty": { + "select": "div.ApHyTb div:has(svg > path[d^=\"M20 6h\"])", + "attr": "textContent" + }, + "jsa": { + "firstMatch": [ + { + "select": "div.ApHyTb div:has(svg > path[d^=\"M11.8 10.9\"])", + "attr": "textContent" + }, + { + "select": "div.ApHyTb div:has(svg > path[d^=\"M2 5v14h20V5H2zm18 12H4V7h16v10zm-5-5.25c1.03\"])", + "attr": "textContent" + } + ] + }, + "jhe": { + "select": "div.ApHyTb div:has(svg path[d^=\"M10.5,13H8v-3h2.5V7.5h3V10H16v3h-2.5v2.5h-3V13z\"])", + "attr": "textContent" + }, + "jde": { + "select": "div.ApHyTb div:has(svg path[d^=\"M15.05,20\"])", + "attr": "textContent" + }, + 
"jqu": { + "select": "div.ApHyTb div:has(svg path[d^=\"M21 5v13.5c-1.1-.35-2.3-.5-3.5-.5-1.7\"])", + "attr": "textContent" + }, + "jva": { + "select": "div.ApHyTb div:has(svg path[d^=\"M19 4h-1V2h-2v2H8V2H6v2H5c-1.11\"])", + "attr": "textContent" + }, + "jrp": { + "select": "div.ApHyTb div:has(svg path[d^=\"M15,10c0-0.55,0.45-1,1-1s1,0.45,1,1c0,0.55-0.45,1-1,1S15,10.55,15,10z\"])", + "attr": "textContent" + } + } + }, + "#rcnt div.ifM9O > div": { + "all": { + "ent": { + "select": "div[role=\"heading\"] > :nth-child(1)", + "attr": "textContent" + }, + "pred": { + "select": "div[role=\"heading\"] > :nth-child(3)", + "attr": "textContent" + }, + "ans": { + "select": "div.kno-fb-ctx.KBXm4e, .kp-header div[data-attrid] > div", + "attr": "textContent" + } + } + }, + "div#rcnt.GyAeWb div#rso > div.ULSxyf block-component, div#rcnt.GyAeWb > div.M8OgIe": { + "all": { + "u": { + "select": "div.yuRUbf a", + "attr": "href", + "transform": [ + [ + "trySplit", + "#", + 0 + ], + [ + "maskU" + ] + ] + } + } + }, + "#cnt:has(#rcnt > div.mNh24c + div.UFQ0Gb)": { + "all": { + "ent": { + "select": "#rcnt > div.mNh24c + div.UFQ0Gb div[data-attrid=\"title\"][role=\"heading\"]", + "attr": "textContent" + }, + "pred": { + "firstMatch": [ + { + "select": "#center_col[role=main] div[role=\"tabpanel\"] div[jsname=\"xQjRM\"] a[data-ti^=\"default_tab:kc:\"] div.T6zPgb > div[aria-level=\"2\"][role=\"heading\"] > span.mgAbYb.OSrXXb.RES9jf.IFnjPb", + "attr": "textContent" + }, + { + "select": "#center_col[role=main] div[role=\"tabpanel\"] div[jsname=\"xQjRM\"]:not(:has(a[role=\"link\"])) div.T6zPgb:not(.YC72Wc) > div[aria-level=\"2\"][role=\"heading\"] > span.mgAbYb.OSrXXb.RES9jf.IFnjPb", + "attr": "textContent" + } + ] + } + } + }, + "#cnt div.ifM9O div[data-attrid=\"wa:/description\"] > span.ILfuVd > span.hgKElc": { + "first": { + "t": { + "attr": "textContent" + } + } + }, + "#cnt div.ifM9O div[data-attrid=\"wa:/description\"] > span.ILfuVd > span.hgKElc b": { + "all": { + "k": { + "attr": 
"textContent" + } + } + }, + "div.webanswers-webanswers_table__webanswers-table tbody > tr": { + "all": { + "0": { + "select": "th:nth-child(1), td:nth-child(1)", + "attr": "textContent" + }, + "1": { + "select": "th:nth-child(2), td:nth-child(2)", + "attr": "textContent" + }, + "2": { + "select": "th:nth-child(3), td:nth-child(3)", + "attr": "textContent" + }, + "3": { + "select": "th:nth-child(4), td:nth-child(4)", + "attr": "textContent" + }, + "4": { + "select": "th:nth-child(5), td:nth-child(5)", + "attr": "textContent" + }, + "5": { + "select": "th:nth-child(6), td:nth-child(6)", + "attr": "textContent" + }, + "6": { + "select": "th:nth-child(7), td:nth-child(7)", + "attr": "textContent" + }, + "7": { + "select": "th:nth-child(8), td:nth-child(8)", + "attr": "textContent" + } + } + }, + "div.ifM9O:has(div.webanswers-webanswers_table__webanswers-table)": { + "first": { + "t": { + "select": "div[role=\"heading\"]", + "attr": "textContent" + }, + "u": { + "select": "div.webanswers-webanswers_table__webanswers-table a", + "attr": "href" + }, + "lang": { + "select": "div[lang]", + "attr": "lang" + } + } + }, + "#rcnt div.M8OgIe div[jscontroller] div[aria-valuetext=Generating][role=progressbar]": { + "first": { + "type": { + "attr": "role" + } + } + }, + "#rhs[role=\"complementary\"]": { + "first": { + "t": { + "firstMatch": [ + { + "select": "[data-attrid=\"title\"]", + "attr": "textContent" + }, + { + "select": "div[role=heading]", + "attr": "textContent" + } + ] + } + } + }, + "#rhs[role=\"complementary\"]:has(div[data-attrid])": { + "first": { + "desc": { + "firstMatch": [ + { + "select": "div[data-attrid=\"VisualDigestDescription\"] span.QoPDcf > span", + "attr": "textContent" + }, + { + "select": "div[data-attrid=\"description\"] div.V4pKmd div.kno-rdesc > h3 + span", + "attr": "textContent" + }, + { + "select": "div[data-attrid=\"description\"] div.kno-rdesc > h3 + span", + "attr": "textContent" + } + ] + }, + "u": { + "firstMatch": [ + { + "select": 
"div[data-attrid=\"VisualDigestDescription\"] > a", + "attr": "href" + }, + { + "select": "div[data-attrid=\"description\"] div.kno-rdesc > h3 + span + span > a", + "attr": "href" + } + ] + } + } + }, + "#rhs[role=\"complementary\"] div.wDYxhc[data-attrid^=\"kc:/\"], #rhs[role=\"complementary\"] div.wDYxhc[data-attrid^=\"ss:/\"]": { + "all": { + "prop": { + "select": "div.rVusze > span.w8qArf:nth-child(1)", + "attr": "textContent", + "transform": [ + [ + "split", + ": ", + 0 + ] + ] + }, + "val": { + "select": "div.rVusze > span:nth-child(2)", + "attr": "textContent" + } + } + }, + "#rso #rhs div.kp-wholepage.kp-wholepage-osrp": { + "first": { + "ad": { + "select": "div[data-attrid=\"kc:/local:promotions\"] span.PsFiZe", + "attr": "textContent" + }, + "t": { + "select": "h2[data-attrid=\"title\"] > span", + "attr": "textContent" + }, + "u": { + "firstMatch": [ + { + "select": "h2[data-attrid=\"title\"] + div > div > a[href^=\"http\"]", + "attr": "href", + "transform": [ + [ + "maskU" + ] + ] + }, + { + "select": "div[data-attrid=\"kc:/local:menu\"] a[href^=\"http\"]", + "attr": "href", + "transform": [ + [ + "maskU" + ] + ] + } + ] + }, + "sum": { + "select": "div[data-attrid=\"kc:/local:one line summary\"] > div > span.YhemCb:not([data-ved])", + "attr": "textContent" + }, + "desc": { + "firstMatch": [ + { + "select": "div[data-attrid=\"description\"] div.kno-rdesc > span > span", + "attr": "textContent" + }, + { + "select": "div[data-attrid=\"kc:/local:scalable attributes\"] > c-wiz > div > div > div > span", + "attr": "textContent" + } + ] + }, + "addr": { + "firstMatch": [ + { + "select": "div[data-attrid=\"kc:/location/location:address\"] a[data-url^=\"/maps/place\"] > span", + "attr": "textContent" + }, + { + "select": "div[data-attrid=\"kc:/location/location:address\"] > div > div > span.LrzXr", + "attr": "textContent" + } + ] + }, + "opt": { + "select": "div[data-attrid=\"kc:/local:business_availability_modes\"] > c-wiz > div", + "attr": "textContent" + }, + 
"p": { + "firstMatch": [ + { + "select": "div[data-attrid=\"kc:/collection/knowledge_panels/has_phone:phone\"] [data-dtype=\"d3ph\"]", + "attr": "textContent" + }, + { + "select": "div[data-attrid=\"kc:/local:alt phone\"] [data-dtype=\"d3ph\"]", + "attr": "textContent" + } + ] + }, + "shut": { + "select": "div[data-attrid=\"kc:/local:permanently closed\"] span#Shyhc", + "attr": "textContent" + } + } + }, + "#rso #rhs div.kp-wholepage.kp-wholepage-osrp div[data-attrid=\"kc:/location/location:hours\"] table tr > td": { + "all": { + "item": { + "attr": "textContent" + } + } + }, + "a.k8XOCe, a.ngTNl span.dg6jd": { + "all": { + "t": { + "attr": "textContent" + } + } + }, + "div.EyBRub div[data-lpage]": { + "all": { + "u": { + "attr": "data-lpage" + } + } + }, + "#main div.PhiYYd.QBl4oe": { + "all": { + "t": { + "select": "img", + "attr": "alt" + }, + "u": { + "select": "div.twQ0Be > a", + "attr": "href" + } + } + }, + "#main a[href^=http].xMqpbd:has(div.ZxS7Db)": { + "all": { + "t": { + "select": "div.KYaZsb div.ZxS7Db", + "attr": "textContent" + }, + "u": { + "attr": "href" + } + } + }, + "div.kp-wholepage-osrp": { + "first": { + "t": { + "select": "div.kp-header div.SPZz6b h2[data-attrid=\"title\"]", + "attr": "textContent" + }, + "u": { + "select": "div.kp-header div.SPZz6b h2[data-attrid=\"title\"] + div.IzNS7c > div.QqG1Sd > a.ab_button:not([data-url])", + "attr": "href" + }, + "addr": { + "select": "div[data-attrid=\"kc:/location/location:address\"] span.w8qArf + span.LrzXr", + "attr": "textContent" + }, + "loc": { + "select": "a[data-url^='/maps/place/']", + "attr": "data-url", + "transform": [ + [ + "trySplit", + "/", + 4 + ], + [ + "trySplit", + "@", + 1 + ], + [ + "trySplit", + "?", + 0 + ] + ] + } + } + }, + "#main .VkpGBb": { + "all": { + "t": { + "select": ".cXedhc div.dbg0pd", + "attr": "textContent" + }, + "addr": { + "select": "div.CNIbvd > div.rllt__details > [role='heading'] + div, div:not(.CNIbvd) > div.rllt__details > [role='heading'] + div + div", 
+ "attr": "textContent" + } + } + }, + "#rso div.KYLHhb div[role=heading] > span.mgAbYb": { + "first": { + "type": { + "attr": "textContent" + } + } + }, + "#rso div.KYLHhb div.VqeGe": { + "all": { + "t": { + "select": "div.T3Fozb > div[role=heading]", + "attr": "textContent" + }, + "u": { + "select": "a", + "attr": "href", + "transform": [ + [ + "relaxedMaskU" + ] + ] + }, + "site": { + "select": "div.R8BTeb", + "attr": "textContent" + } + } + } + }, + "output": { + "query0": { + "fields": [ + { + "key": "q" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + }, + { + "key": "lang", + "source": "html[lang]", + "optional": true + } + ] + }, + "query": { + "fields": [ + { + "key": "r", + "source": "div#rso div.g:not(:has(div.g))", + "requiredKeys": [ + "t", + "u" + ] + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + }, + { + "key": "lang", + "source": "html[lang]", + "optional": true + } + ] + }, + "top-stories": { + "fields": [ + { + "key": "r", + "source": "#rso a.WlydOe", + "requiredKeys": [ + "u", + "h" + ] + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + }, + "sq": { + "fields": [ + { + "key": "sq", + "source": "#main" + }, + { + "key": "oq", + "source": "#main" + }, + { + "key": "ctry" + } + ] + }, + "ads_A": { + "fields": [ + { + "key": "r", + "source": "#tads div[data-text-ad]", + "requiredKeys": [ + "t", + "u" + ] + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + }, + { + "key": "lang", + "source": "html[lang]", + "optional": true + } + ] + }, + "ads_C": { + "fields": [ + { + "key": "r", + "source": "[id^='vplap']" + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + }, + { + "key": "lang", + "source": "html[lang]", + "optional": true + } + ] + }, + "ads_D": { + "fields": [ + { + "key": "r", + "source": "#bottomads div[data-text-ad]", + "requiredKeys": [ + "t", + "u" 
+ ] + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + }, + { + "key": "lang", + "source": "html[lang]", + "optional": true + } + ] + }, + "job-ads": { + "fields": [ + { + "key": "r", + "source": "#rso div[data-id=\"jobs-detail-viewer\"] div.EimVGf[data-share-url]", + "requiredKeys": [ + "t", + "c", + "via", + "loc" + ] + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + }, + { + "key": "lang", + "source": "html[lang]", + "optional": true + } + ] + }, + "snippet": { + "fields": [ + { + "key": "r", + "source": "#rcnt div.ifM9O > div" + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + }, + "snippet2": { + "fields": [ + { + "key": "r", + "source": "div#rcnt.GyAeWb div#rso > div.ULSxyf block-component, div#rcnt.GyAeWb > div.M8OgIe" + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + }, + "snippet3": { + "fields": [ + { + "key": "q", + "source": "#search" + }, + { + "key": "r", + "source": "#cnt:has(#rcnt > div.mNh24c + div.UFQ0Gb)" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + }, + "snippet-t1": { + "fields": [ + { + "key": "t", + "source": "div.ifM9O:has(div.webanswers-webanswers_table__webanswers-table)" + }, + { + "key": "r", + "source": "div.webanswers-webanswers_table__webanswers-table tbody > tr", + "requiredKeys": [ + "0" + ] + }, + { + "key": "u", + "source": "div.ifM9O:has(div.webanswers-webanswers_table__webanswers-table)", + "optional": true + }, + { + "key": "lang", + "source": "div.ifM9O:has(div.webanswers-webanswers_table__webanswers-table)", + "optional": true + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + }, + "aio": { + "fields": [ + { + "key": "type", + "source": "#rcnt div.M8OgIe div[jscontroller] div[aria-valuetext=Generating][role=progressbar]" + }, + { + "key": "q", + "source": 
"#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + }, + { + "key": "lang", + "source": "html[lang]", + "optional": true + } + ] + }, + "infobox": { + "fields": [ + { + "key": "t", + "source": "#rhs[role=\"complementary\"]" + }, + { + "key": "desc", + "source": "#rhs[role=\"complementary\"]:has(div[data-attrid])", + "optional": true + }, + { + "key": "u", + "source": "#rhs[role=\"complementary\"]:has(div[data-attrid])" + }, + { + "key": "st", + "source": "#rhs[role=\"complementary\"] div.wDYxhc[data-attrid^=\"kc:/\"], #rhs[role=\"complementary\"] div.wDYxhc[data-attrid^=\"ss:/\"]", + "requiredKeys": [ + "prop", + "val" + ], + "optional": true + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + }, + "placeinfo": { + "fields": [ + { + "key": "ad", + "source": "#rso #rhs div.kp-wholepage.kp-wholepage-osrp", + "optional": true + }, + { + "key": "addr", + "source": "#rso #rhs div.kp-wholepage.kp-wholepage-osrp" + }, + { + "key": "t", + "source": "#rso #rhs div.kp-wholepage.kp-wholepage-osrp", + "optional": true + }, + { + "key": "u", + "source": "#rso #rhs div.kp-wholepage.kp-wholepage-osrp", + "optional": true + }, + { + "key": "sum", + "source": "#rso #rhs div.kp-wholepage.kp-wholepage-osrp", + "optional": true + }, + { + "key": "desc", + "source": "#rso #rhs div.kp-wholepage.kp-wholepage-osrp", + "optional": true + }, + { + "key": "p", + "source": "#rso #rhs div.kp-wholepage.kp-wholepage-osrp", + "optional": true + }, + { + "key": "opt", + "source": "#rso #rhs div.kp-wholepage.kp-wholepage-osrp", + "optional": true + }, + { + "key": "open", + "source": "#rso #rhs div.kp-wholepage.kp-wholepage-osrp div[data-attrid=\"kc:/location/location:hours\"] table tr > td", + "optional": true + }, + { + "key": "shut", + "source": "#rso #rhs div.kp-wholepage.kp-wholepage-osrp", + "optional": true + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + }, + "img-p": { + 
"fields": [ + { + "key": "r", + "source": "div.EyBRub div[data-lpage]" + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + }, + "video-p": { + "fields": [ + { + "key": "r", + "source": "#main div.PhiYYd.QBl4oe" + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + }, + "videos-p": { + "fields": [ + { + "key": "r", + "source": "#main a[href^=http].xMqpbd:has(div.ZxS7Db)" + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + }, + "place": { + "fields": [ + { + "key": "t", + "source": "div.kp-wholepage-osrp" + }, + { + "key": "u", + "source": "div.kp-wholepage-osrp", + "optional": true + }, + { + "key": "addr", + "source": "div.kp-wholepage-osrp" + }, + { + "key": "loc", + "source": "div.kp-wholepage-osrp" + }, + { + "key": "ctry" + } + ] + }, + "places": { + "fields": [ + { + "key": "r", + "source": "#main .VkpGBb" + }, + { + "key": "ctry" + } + ] + }, + "places2": { + "fields": [ + { + "key": "q", + "source": "#search" + }, + { + "key": "type", + "source": "#main" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + }, + "local": { + "fields": [ + { + "key": "type", + "source": "#main div[jsmodel] div.YzSd" + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + }, + "sites": { + "fields": [ + { + "key": "type", + "source": "#rso div.KYLHhb div[role=heading] > span.mgAbYb" + }, + { + "key": "r", + "source": "#rso div.KYLHhb div.VqeGe" + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + }, + { + "key": "lang", + "source": "html[lang]", + "optional": true + } + ] + }, + "widget-title": { + "fields": [ + { + "key": "q", + "source": "#search" + }, + { + "key": "wt", + "source": "#search" + }, + { + "key": "ctry" + } + ] + } + } + }, + "search-goi": { + "input": { + "#search": { + "first": { + "q": { + "select": "#rso", 
+ "attr": "data-async-context", + "transform": [ + [ + "trySplit", + "query:", + 1 + ], + [ + "decodeURIComponent" + ] + ] + } + } + }, + "div[jscontroller=\"Um3BXb\"]": { + "all": { + "u": { + "attr": "data-lpage" + }, + "ru": { + "select": "div.guK3rf.cHaqb > span", + "attr": "textContent" + }, + "t": { + "select": "div.toI8Rb.OSrXXb", + "attr": "textContent" + } + } + } + }, + "output": { + "img": { + "fields": [ + { + "key": "r", + "source": "div[jscontroller=\"Um3BXb\"]", + "requiredKeys": [ + "u", + "ru", + "t" + ] + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + } + } + }, + "search-gov": { + "input": { + "#search": { + "first": { + "q": { + "select": "#rso", + "attr": "data-async-context", + "transform": [ + [ + "trySplit", + "query:", + 1 + ], + [ + "decodeURIComponent" + ] + ] + } + } + }, + "#rso .g": { + "all": { + "u": { + "select": "div.xe8e1b a", + "attr": "href" + }, + "t": { + "select": "div.xe8e1b h3", + "attr": "textContent" + }, + "len": { + "select": "div.gY2b2c div.c8rnLc > span", + "attr": "textContent" + }, + "age": { + "select": "div.fzUZNc div.gqF9jc > span > span", + "attr": "textContent" + } + } + } + }, + "output": { + "videos": { + "fields": [ + { + "key": "r", + "source": "#rso .g", + "requiredKeys": [ + "t", + "u" + ] + }, + { + "key": "q", + "source": "#search" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + } + } + }, + "search-am": { + "input": { + "#twotabsearchtextbox": { + "first": { + "q": { + "attr": "value" + } + } + }, + "#search div.s-result-item[data-component-type=\"s-search-result\"]": { + "all": { + "t": { + "select": "div.sg-col-inner div > div > div > a > h2 > span", + "attr": "textContent" + }, + "a": { + "select": ".puis-sponsored-label-text > span > span", + "attr": "textContent" + }, + "u": { + "select": "div.sg-col-inner div > div > div > a", + "attr": "href", + "transform": [ + [ + "decodeURIComponent" + ], + [ + "decodeURIComponent" + ], + [ + 
"trySplit", + "&url=", + 1 + ], + [ + "trySplit", + "/ref=", + 0 + ] + ] + } + } + } + }, + "output": { + "query-am": { + "fields": [ + { + "key": "r", + "source": "#search div.s-result-item[data-component-type=\"s-search-result\"]", + "requiredKeys": [ + "t", + "u" + ] + }, + { + "key": "q", + "source": "#twotabsearchtextbox" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + } + } + }, + "search-bi": { + "input": { + ".b_algo": { + "all": { + "t": { + "select": "h2 [href]", + "attr": "textContent" + }, + "u": { + "select": "h2 [href]", + "attr": "href" + } + } + }, + "#sb_form": { + "first": { + "q": { + "select": "#sb_form_q", + "attr": "value" + } + } + } + }, + "output": { + "query": { + "fields": [ + { + "key": "r", + "source": ".b_algo", + "requiredKeys": [ + "t", + "u" + ] + }, + { + "key": "q", + "source": "#sb_form" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + } + } + }, + "search-bii": { + "input": { + "div[role=\"main\"] div.imgpt > a": { + "all": { + "t": { + "attr": "m", + "transform": [ + [ + "json", + "t" + ] + ] + }, + "u": { + "attr": "m", + "transform": [ + [ + "json", + "murl" + ] + ] + }, + "ru": { + "attr": "m", + "transform": [ + [ + "json", + "purl" + ] + ] + } + } + }, + "#sb_form_q": { + "first": { + "q": { + "attr": "value" + } + } + } + }, + "output": { + "img": { + "fields": [ + { + "key": "r", + "source": "div[role=\"main\"] div.imgpt > a", + "requiredKeys": [ + "u", + "ru", + "t" + ] + }, + { + "key": "q", + "source": "#sb_form_q" + }, + { + "key": "qurl" + }, + { + "key": "ctry" + } + ] + } + } + } +} diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/yahoo/donald-trump-2018-05-28/page.html.gz b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/yahoo/donald-trump-2018-05-28/page.html.gz deleted file mode 100644 index 47affb2a..00000000 Binary files a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/yahoo/donald-trump-2018-05-28/page.html.gz and 
/dev/null differ diff --git a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/yahoo/donald-trump-2018-05-28/scenario.json b/modules/web-discovery-project/tests/unit/fixtures/content-extractor/yahoo/donald-trump-2018-05-28/scenario.json deleted file mode 100644 index 864004ee..00000000 --- a/modules/web-discovery-project/tests/unit/fixtures/content-extractor/yahoo/donald-trump-2018-05-28/scenario.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "url": "https://de.search.yahoo.com/search?p=donald+trump&fr=yfp-t&fp=1&toggle=1&cop=mss&ei=UTF-8", - - "mustContain": [ - { - "type": "wdp", - "action": "query", - "payload": { - "r": { - "0": { - "t": "Donald Trump – Wikipedia", - "u": "https://de.wikipedia.org/wiki/Donald_Trump" - }, - "1": { - "t": "Donald Trump - SPIEGEL ONLINE", - "u": "http://www.spiegel.de/thema/donald_trump/" - }, - "2": { - "t": "Donald Trump: Präsident der USA | ZEIT ONLINE", - "u": "https://www.zeit.de/thema/donald-trump" - }, - "3": { - "t": "Donald Trump - Aktuelle Beiträge zum US-Präsidenten", - "u": "https://www.tagesspiegel.de/themen/donald-trump/" - }, - "4": { - "t": "Donald J. 
Trump (@realDonaldTrump) | Twitter", - "u": "https://twitter.com/realdonaldtrump" - }, - "5": { - "t": "Donald Trump | STERN.de", - "u": "https://www.stern.de/lifestyle/leute/themen/donald-trump-4540962.html" - }, - "6": { - "t": "Donald Trump- Steckbrief, News, Bilder | GALA.de", - "u": "https://www.gala.de/stars/starportraets/donald-trump-20526492.html" - }, - "7": { - "t": "Donald Trump - News von WELT", - "u": "https://www.welt.de/themen/donald-trump/" - }, - "8": { - "t": "Donald Trump | Us-news | The Guardian", - "u": "https://www.theguardian.com/us-news/donaldtrump" - }, - "9": { - "t": "Donald Trump - Wikipedia", - "u": "https://en.wikipedia.org/wiki/Donald_Trump" - } - }, - "q": "donald trump", - "qurl": "https://de.search.yahoo.com/search?p=donald+trump&fr=yfp-t&fp=1&toggle=1&cop=mss&ei=UTF-8", - "ctry": "de" - } - } - ] -} diff --git a/modules/web-discovery-project/tests/unit/generate-fixtures.js b/modules/web-discovery-project/tests/unit/generate-fixtures.js index 136b3f11..df10e49d 100755 --- a/modules/web-discovery-project/tests/unit/generate-fixtures.js +++ b/modules/web-discovery-project/tests/unit/generate-fixtures.js @@ -7,24 +7,20 @@ const { expect } = require("chai"); const sinon = require("sinon"); const FileHound = require("filehound"); const { gunzipSync, gzipSync } = require("zlib"); -const { JSDOM } = require("jsdom"); const stripJsonComments = require("strip-json-comments"); const { ContentExtractor } = require("../../../../build/web-discovery-project/content-extractor.js"); +const Patterns = require("../../../../build/web-discovery-project/patterns.js").default; +const { parseHtml } = require("../../../../build/web-discovery-project/html-helpers.js"); function jsonParse(text) { return JSON.parse(stripJsonComments(text)); } const FIXTURES_BASE_PATH = path.join(__dirname, "fixtures/content-extractor"); -const DEFAULT_PATTERNS = { - normal: jsonParse( - fs.readFileSync(`${FIXTURES_BASE_PATH}/patterns.json`, "utf8") - ), - strict: jsonParse( 
- fs.readFileSync(`${FIXTURES_BASE_PATH}/patterns-anon.json`, "utf8") - ), -}; -const ALLOWED_SOURCES = new Set(["go", "bing"]); +const DEFAULT_PATTERNS = jsonParse( + fs.readFileSync(`${FIXTURES_BASE_PATH}/rules.json`, "utf8") +); +const ALLOWED_SOURCES = new Set(["go", "bing", "am"]); function findAllFixtures() { function isFixtureDir(file) { @@ -78,18 +74,8 @@ const groupTelemetryCallsByAction = (sinonSpy) => { )(sinonSpy.args); }; -const setupDocument = function (html) { - const mockWindow = new JSDOM(`

Test DOM

`).window; - - const document = mockWindow.document; - document.open(); - document.write(html); - document.close(); - return document; -}; - const generateScenario = (url, html) => { - const WebDiscoveryProject = { + const WDP = { debug: false, msgType: "wdp", getCountryCode() { @@ -103,13 +89,22 @@ const generateScenario = (url, html) => { // args: url, query addStrictQueries: sinon.fake(), queryCache: {}, + patterns: new Patterns(), + checkURL: (doc, url) => { + const { messages } = WDP.contentExtractor.run(doc, url); + for (const message of messages) + WDP.telemetry({ + type: WDP.msgType, + action: message.action, + payload: message.payload, + }); + }, }; - const contentExtractor = new ContentExtractor(WebDiscoveryProject); - contentExtractor.updatePatterns(DEFAULT_PATTERNS.normal, "normal"); - contentExtractor.updatePatterns(DEFAULT_PATTERNS.strict, "strict"); - const document = setupDocument(html); - contentExtractor.checkURL(document, url, "strict"); - const messages = groupTelemetryCallsByAction(WebDiscoveryProject.telemetry); + WDP.patterns.update(DEFAULT_PATTERNS); + WDP.contentExtractor = new ContentExtractor(WDP.patterns, WDP); + const document = parseHtml(html); + WDP.checkURL(document, url); + const messages = groupTelemetryCallsByAction(WDP.telemetry); const mustContain = Object.values(messages).reduce((acc, v) => acc.concat(v), []); return {url, mustContain}; }; @@ -129,6 +124,9 @@ const generateFixture = async (dir) => { case "bing": url = `https://www.bing.com/search?q=${encodeURIComponent(query)}`; break; + case "am": + url = `https://www.amazon.de/s?k=${encodeURIComponent(query)}` + break; default: return; } diff --git a/modules/web-discovery-project/tests/unit/web-discovery-project-test.es b/modules/web-discovery-project/tests/unit/web-discovery-project-test.es index 592133b4..24bdb1fb 100644 --- a/modules/web-discovery-project/tests/unit/web-discovery-project-test.es +++ b/modules/web-discovery-project/tests/unit/web-discovery-project-test.es 
@@ -129,6 +129,9 @@ const MOCK = { flushExpiredCacheEntries() {} }, }, + "webextension-polyfill": { + default: {}, + }, }; export default describeModule( diff --git a/package-lock.json b/package-lock.json index b3a4090c..cff7ee7e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -14,7 +14,7 @@ "abortcontroller-polyfill": "1.7.5", "anonymous-credentials": "https://github.com/human-web/anonymous-credentials/releases/download/1.0.0/anonymous-credentials-1.0.0.tgz", "dexie": "3.2.4", - "linkedom": "0.14.12", + "linkedom": "^0.16.11", "pako": "2.1.0", "punycode": "2.3.1", "star-wasm": "https://github.com/brave/web-discovery-project/releases/download/star-wasm-0.1.3.tgz/star-wasm-0.1.3.tgz", @@ -2455,6 +2455,7 @@ "resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-2.0.0.tgz", "integrity": "sha512-XCuKFP5PS55gnMVu3dty8KPatLqUoy/ZYzDzAGCQ8JNFCkLXzmI7vNHCR+XpbZaMWQK/vQubr7PkYq8g470J/A==", "dev": true, + "license": "MIT", "engines": { "node": ">= 10" } @@ -3179,7 +3180,8 @@ "resolved": "https://registry.npmjs.org/abab/-/abab-2.0.6.tgz", "integrity": "sha512-j2afSsaIENvHZN2B8GOpF566vZ5WVk5opAiMTvWgaQT8DkbOqsTfvNAvHoRGU2zzP8cPoqys+xHTRDWW8L+/BA==", "deprecated": "Use your platform's native atob() and btoa() methods instead", - "dev": true + "dev": true, + "license": "BSD-3-Clause" }, "node_modules/abort-controller": { "version": "3.0.0", @@ -3839,6 +3841,7 @@ "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz", "integrity": "sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==", "dev": true, + "license": "MIT", "dependencies": { "debug": "4" }, @@ -6465,6 +6468,7 @@ "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-3.0.0.tgz", "integrity": "sha512-N4u2ABATi3Qplzf0hWbVCdjenim8F3ojEXpBDF5hBpjzW182MjNGLqfmQ0SkSPeQ+V86ZXgeH8aXj6kayd4jgg==", "dev": true, + "license": "MIT", "dependencies": { "rrweb-cssom": "^0.6.0" }, @@ -6498,6 +6502,7 @@ "resolved": 
"https://registry.npmjs.org/data-urls/-/data-urls-4.0.0.tgz", "integrity": "sha512-/mMTei/JXPqvFqQtfyTowxmJVwr2PVAeCcDxyFf6LhoOu/09TX2OX3kb2wzi4DMXcfj4OItwDOnhl5oziPnT6g==", "dev": true, + "license": "MIT", "dependencies": { "abab": "^2.0.6", "whatwg-mimetype": "^3.0.0", @@ -6829,6 +6834,7 @@ "integrity": "sha512-A2is4PLG+eeSfoTMA95/s4pvAoSo2mKtiM5jlHkAVewmiO8ISFTFKZjH7UAM1Atli/OT/7JHOrJRJiMKUZKYBw==", "deprecated": "Use your platform's native DOMException instead", "dev": true, + "license": "MIT", "dependencies": { "webidl-conversions": "^7.0.0" }, @@ -8886,10 +8892,11 @@ } }, "node_modules/form-data": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", - "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.1.tgz", + "integrity": "sha512-tzN8e4TX8+kkxGPK8D5u0FNmjPUjw3lwC9lSLxxoB/+GtsJG91CO8bSWy73APlgAZzZbXEYZJuxjkHH2w+Ezhw==", "dev": true, + "license": "MIT", "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", @@ -9803,6 +9810,7 @@ "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-3.0.0.tgz", "integrity": "sha512-oWv4T4yJ52iKrufjnyZPkrN0CH3QnrUqdB6In1g5Fe1mia8GmF36gnfNySxoZtxD5+NmYw1EElVXiBk93UeskA==", "dev": true, + "license": "MIT", "dependencies": { "whatwg-encoding": "^2.0.0" }, @@ -9819,6 +9827,7 @@ "version": "8.0.2", "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz", "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==", + "dev": true, "funding": [ "https://github.com/fb55/htmlparser2?sponsor=1", { @@ -9874,6 +9883,7 @@ "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz", "integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==", "dev": true, 
+ "license": "MIT", "dependencies": { "@tootallnate/once": "2", "agent-base": "6", @@ -9922,6 +9932,7 @@ "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz", "integrity": "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==", "dev": true, + "license": "MIT", "dependencies": { "agent-base": "6", "debug": "4" @@ -10613,6 +10624,7 @@ "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-22.1.0.tgz", "integrity": "sha512-/9AVW7xNbsBv6GfWho4TTNjEo9fe6Zhf9O7s0Fhhr3u+awPwAJMKwAMXnkk5vBxflqLW9hTHX/0cs+P3gW+cQw==", "dev": true, + "license": "MIT", "dependencies": { "abab": "^2.0.6", "cssstyle": "^3.0.0", @@ -10650,6 +10662,28 @@ } } }, + "node_modules/jsdom/node_modules/ws": { + "version": "8.18.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz", + "integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, "node_modules/jsesc": { "version": "2.5.2", "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.2.tgz", @@ -11012,15 +11046,35 @@ } }, "node_modules/linkedom": { - "version": "0.14.12", - "resolved": "https://registry.npmjs.org/linkedom/-/linkedom-0.14.12.tgz", - "integrity": "sha512-8uw8LZifCwyWeVWr80T79sQTMmNXt4Da7oN5yH5gTXRqQM+TuZWJyBqRMcIp32zx/f8anHNHyil9Avw9y76ziQ==", + "version": "0.16.11", + "resolved": "https://registry.npmjs.org/linkedom/-/linkedom-0.16.11.tgz", + "integrity": "sha512-WgaTVbj7itjyXTsCvgerpneERXShcnNJF5VIV+/4SLtyRLN+HppPre/WDHRofAr2IpEuujSNgJbCBd5lMl6lRw==", + "license": "ISC", "dependencies": { "css-select": "^5.1.0", "cssom": "^0.5.0", "html-escaper": "^3.0.3", - "htmlparser2": "^8.0.1", - "uhyphen": "^0.1.0" + 
"htmlparser2": "^9.1.0", + "uhyphen": "^0.2.0" + } + }, + "node_modules/linkedom/node_modules/htmlparser2": { + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz", + "integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.1.0", + "entities": "^4.5.0" } }, "node_modules/loader-runner": { @@ -12280,10 +12334,11 @@ } }, "node_modules/nwsapi": { - "version": "2.2.7", - "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.7.tgz", - "integrity": "sha512-ub5E4+FBPKwAZx0UwIQOjYWGHTEq5sPqHQNRN8Z9e4A7u3Tj1weLJsL59yH9vmvqEtBHaOmT6cYQKIZOxp35FQ==", - "dev": true + "version": "2.2.16", + "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.16.tgz", + "integrity": "sha512-F1I/bimDpj3ncaNDhfyMWuFqmQDBwDB0Fogc2qpL3BWvkQteFD/8BzWuIRl83rq0DXfm8SGt/HFhLXZyljTXcQ==", + "dev": true, + "license": "MIT" }, "node_modules/oauth-sign": { "version": "0.9.0", @@ -12814,12 +12869,13 @@ } }, "node_modules/parse5": { - "version": "7.1.2", - "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz", - "integrity": "sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==", + "version": "7.2.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.2.1.tgz", + "integrity": "sha512-BuBYQYlv1ckiPdQi/ohiivi9Sagc9JG+Ozs0r7b/0iK3sKmrb0b9FdWdBbOdx6hBCM/F9Ir82ofnBhtZOjCRPQ==", "dev": true, + "license": "MIT", "dependencies": { - "entities": "^4.4.0" + "entities": "^4.5.0" }, "funding": { "url": "https://github.com/inikulin/parse5?sponsor=1" @@ -14197,7 +14253,8 @@ "version": "0.6.0", "resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz", "integrity": 
"sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw==", - "dev": true + "dev": true, + "license": "MIT" }, "node_modules/rsvp": { "version": "4.8.5", @@ -16096,6 +16153,7 @@ "resolved": "https://registry.npmjs.org/tr46/-/tr46-4.1.1.tgz", "integrity": "sha512-2lv/66T7e5yNyhAAC4NaKe5nVavzuGJQVVtRYLyQ2OI8tsJ61PMLlelehb0wi2Hx6+hT/OJUWZcw8MjlSRnxvw==", "dev": true, + "license": "MIT", "dependencies": { "punycode": "^2.3.0" }, @@ -16271,9 +16329,10 @@ } }, "node_modules/uhyphen": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/uhyphen/-/uhyphen-0.1.0.tgz", - "integrity": "sha512-o0QVGuFg24FK765Qdd5kk0zU/U4dEsCtN/GSiwNI9i8xsSVtjIAOdTaVhLwZ1nrbWxFVMxNDDl+9fednsOMsBw==" + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/uhyphen/-/uhyphen-0.2.0.tgz", + "integrity": "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==", + "license": "ISC" }, "node_modules/underscore": { "version": "1.13.6", @@ -16684,6 +16743,7 @@ "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-4.0.0.tgz", "integrity": "sha512-d+BFHzbiCx6zGfz0HyQ6Rg69w9k19nviJspaj4yNscGjrHu94sVP+aRm75yEbCh+r2/yR+7q6hux9LVtbuTGBw==", "dev": true, + "license": "MIT", "dependencies": { "xml-name-validator": "^4.0.0" }, @@ -17033,6 +17093,7 @@ "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", "dev": true, + "license": "BSD-2-Clause", "engines": { "node": ">=12" } @@ -17135,6 +17196,7 @@ "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-2.0.0.tgz", "integrity": "sha512-p41ogyeMUrw3jWclHWTQg1k05DSVXPLcVxRTYsXUk+ZooOCZLcoYgPZ/HL/D/N+uQPOtcp1me1WhBEaX02mhWg==", "dev": true, + "license": "MIT", "dependencies": { "iconv-lite": "0.6.3" }, @@ -17147,6 +17209,7 @@ "resolved": 
"https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", "dev": true, + "license": "MIT", "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" }, @@ -17159,6 +17222,7 @@ "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-3.0.0.tgz", "integrity": "sha512-nt+N2dzIutVRxARx1nghPKGv1xHikU7HKdfafKkLNLindmPU/ch3U31NOCGGA/dmPcmb1VlofO0vnKAcsm0o/Q==", "dev": true, + "license": "MIT", "engines": { "node": ">=12" } @@ -17168,6 +17232,7 @@ "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-12.0.1.tgz", "integrity": "sha512-Ed/LrqB8EPlGxjS+TrsXcpUond1mhccS3pchLhzSgPCnTimUCKj3IZE75pAs5m6heB2U2TMerKFUXheyHY+VDQ==", "dev": true, + "license": "MIT", "dependencies": { "tr46": "^4.1.1", "webidl-conversions": "^7.0.0" @@ -17498,6 +17563,7 @@ "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-4.0.0.tgz", "integrity": "sha512-ICP2e+jsHvAj2E2lIHxa5tjXRlKDJo4IdvPvCXbXQGdzSfmSpNVyIKMvoZHjDY9DP0zV17iI85o90vRFXNccRw==", "dev": true, + "license": "Apache-2.0", "engines": { "node": ">=12" } diff --git a/package.json b/package.json index bf9335a5..4976c163 100644 --- a/package.json +++ b/package.json @@ -116,7 +116,7 @@ "abortcontroller-polyfill": "1.7.5", "anonymous-credentials": "https://github.com/human-web/anonymous-credentials/releases/download/1.0.0/anonymous-credentials-1.0.0.tgz", "dexie": "3.2.4", - "linkedom": "0.14.12", + "linkedom": "^0.16.11", "pako": "2.1.0", "punycode": "2.3.1", "star-wasm": "https://github.com/brave/web-discovery-project/releases/download/star-wasm-0.1.3.tgz/star-wasm-0.1.3.tgz",