@@ -39,6 +39,12 @@ function _log(...msg) {
   }
 }
 
+function getRandomIntInclusive(min, max) {
+  const _min = Math.ceil(min);
+  const _max = Math.floor(max);
+  return Math.floor(random() * (_max - _min + 1)) + _min;
+}
+
 function cleanFinalUrl(domain, href) {
   /*
   We need to get the final domain, there are 2 elements that we try to capture.
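Assuming `random()` is the project's drop-in for `Math.random()` (a float in [0, 1)), this helper follows the well-known inclusive-range recipe: both bounds are reachable because of the `+ 1` before flooring. A minimal usage sketch:

// Sketch: draws an integer from 1 to 20 inclusive, e.g. to jitter the
// delay (tDiff) attached to a queued strict query further down this diff.
const tDiff = getRandomIntInclusive(1, 20); // any of 1, 2, ..., 20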
@@ -1597,7 +1603,7 @@ const WebDiscoveryProject = {
       WebDiscoveryProject.patterns.update(rules);
       WebDiscoveryProject._patternsLastUpdated = new Date();
       logger.info(
-        "WebDiscoveryProject patterns successfully updated at ${this._patternsLastUpdated}"
+        `WebDiscoveryProject patterns successfully updated at ${WebDiscoveryProject._patternsLastUpdated}`
       );
     } catch (e) {
       logger.warn("Failed to apply new WebDiscoveryProject patterns", e);
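Worth spelling out why this line needed the change: JavaScript only interpolates `${...}` inside backtick template literals, so the old double-quoted string logged the placeholder text verbatim; `this` was evidently also not a reliable reference at that call site, hence the switch to the explicit `WebDiscoveryProject`. For illustration:

const ts = new Date();
console.log("updated at ${ts}"); // logs: updated at ${ts}
console.log(`updated at ${ts}`); // logs the actual timestamp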
@@ -3382,7 +3388,7 @@ const WebDiscoveryProject = {
       }
       getContentDocument(originalURL)
         .then((doc) => {
-          WebDiscoveryProject.checkURL(doc, url);
+          WebDiscoveryProject.checkURL(doc, url, true);
         })
         .catch((e) => {
           logger.info(
@@ -3518,7 +3524,7 @@ const WebDiscoveryProject = {
         )
       ) {
         try {
-          WebDiscoveryProject.checkURL(cd, currURL);
+          WebDiscoveryProject.checkURL(cd, currURL, false);
         } catch (e) {}
         //Check active usage...
         // WebDiscoveryProject.activeUsage += 1;
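Across the call sites in this diff, the new third argument appears to be `true` only on the double-fetch path above (documents re-fetched anonymously via `getContentDocument`) and `false` here and in the `docCache` path below; `checkURL` simply forwards it (see the signature change further down). Inferred convention, not confirmed by this diff alone:

// checkURL(doc, url, true)  -> doc came from an anonymous double-fetch
// checkURL(doc, url, false) -> doc came from the user's tab or docCache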
@@ -3619,7 +3625,7 @@ const WebDiscoveryProject = {
         }
       })
       .catch((e) => {
-        _log("Error fetching fetching the currentURL: " + e);
+        _log("Error fetching the currentURL: " + e);
       });
 
     WebDiscoveryProject.counter += 4;
@@ -4701,10 +4707,11 @@ const WebDiscoveryProject = {
     else return null;
   },
 
-  checkURL(pageContent, url) {
+  checkURL(pageContent, url, addStrictQuery) {
     const { messages } = WebDiscoveryProject.contentExtractor.run(
       pageContent,
-      url
+      url,
+      addStrictQuery
     );
     for (const message of messages)
       WebDiscoveryProject.telemetry({
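The receiving side, `ContentExtractor.run`, is not part of this diff, so the following is only a sketch of the shape it would need in order to accept and forward the flag; the class name, the `wdp` back-reference, and `extractQuery` are assumptions for illustration, not the project's confirmed API:

// Hypothetical sketch only; the real ContentExtractor lives outside this diff.
class ContentExtractorSketch {
  constructor(patterns, wdp) {
    this.patterns = patterns;
    this.wdp = wdp; // assumed back-reference to WebDiscoveryProject
  }
  run(pageContent, url, addStrictQuery) {
    const messages = []; // stand-in for the real pattern-based extraction
    if (addStrictQuery) {
      // Forward the page URL and its query so a privacy-preserving
      // re-fetch can be queued; extractQuery() is an assumed helper.
      this.wdp.addStrictQueries(url, this.extractQuery(url));
    }
    return { messages };
  }
  extractQuery(url) {
    return new URL(url).searchParams.get("q") || undefined;
  }
}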
@@ -5043,7 +5050,7 @@ const WebDiscoveryProject = {
       e.qurl,
       function (url, page_data, ourl, x) {
         let cd = WebDiscoveryProject.docCache[url]["doc"];
-        WebDiscoveryProject.checkURL(cd, url);
+        WebDiscoveryProject.checkURL(cd, url, false);
       },
       function (a, b, c, d) {
         _log("Error aux>>>> " + d);
@@ -5714,6 +5721,38 @@ const WebDiscoveryProject = {
       }
     });
   },
+  addStrictQueries(url, query) {
+    // In some cases, we get query undefined.
+    if (!query) {
+      _log(">> Got an undefined query >>> " + url);
+      return;
+    }
+
+    if (WebDiscoveryProject.isSuspiciousQuery(query)) {
+      _log("Dropping suspicious query before double-fetch:", query);
+      return;
+    }
+
+    const { isSearchEngineUrl, queryUrl } =
+      WebDiscoveryProject.contentExtractor.urlAnalyzer.checkAnonSearchURL(
+        url,
+        query
+      );
+    if (isSearchEngineUrl) {
+      try {
+        const qObj = {
+          qurl: queryUrl,
+          ts: Date.now(),
+          tDiff: getRandomIntInclusive(1, 20),
+        };
+        logger.debug("PCN: pushed to strictQueries:", queryUrl);
+        WebDiscoveryProject.strictQueries.push(qObj);
+        WebDiscoveryProject.saveStrictQueries();
+      } catch (ee) {
+        logger.error("Failed to add query:", ee);
+      }
+    }
+  },
 };
 WebDiscoveryProject.contentExtractor = new ContentExtractor(
   WebDiscoveryProject.patterns,
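Taken together, a plausible end-to-end flow for one queued strict query (the URL and query below are illustrative only; the real caller sits in ContentExtractor, outside this diff):

// Hedged usage sketch.
WebDiscoveryProject.addStrictQueries(
  "https://www.example.com/search?q=flights", // SERP URL as seen in the tab
  "flights"                                   // query extracted from it
);
// If checkAnonSearchURL accepts the pair, { qurl, ts, tDiff } is pushed to
// strictQueries and persisted; tDiff = getRandomIntInclusive(1, 20) staggers
// the later anonymous double-fetches instead of firing them all at once.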