@@ -321,44 +321,128 @@ def _remove_query(url):
321
321
# XXX chop off path after last slash??
322
322
# Canonicalizer that runs urlcanon's semantic steps and then applies
# _remove_query as an additional final step.
# NOTE(review): the name suggests this is used for site SURTs -- confirm
# against callers.
site_surt_canon = urlcanon .Canonicalizer (urlcanon .semantic .steps + [_remove_query ])
323
323
324
- import doublethink
325
- import datetime
326
324
327
- EPOCH_UTC = datetime .datetime .utcfromtimestamp (0.0 ).replace (tzinfo = doublethink .UTC )
325
def mdfind(identifier):
    """Look up an application bundle by bundle identifier using ``mdfind``.

    Runs the ``mdfind`` command with a ``kMDItemCFBundleIdentifier`` query
    for *identifier*.

    Args:
        identifier: bundle identifier to search for, e.g.
            ``"org.chromium.Chromium"``.

    Returns:
        The first non-empty line printed by ``mdfind`` (a bundle path), or
        ``None`` if nothing was printed or the command could not be run.
    """
    import subprocess

    try:
        output = subprocess.check_output(
            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
        )
    # Just treat any errors as "couldn't find app". OSError also covers the
    # case where the mdfind executable is missing or cannot be executed.
    except (subprocess.CalledProcessError, OSError):
        return None

    # Several bundles may share the identifier, giving one path per line;
    # return a single usable path instead of a multi-line string.
    for line in output.splitlines():
        if line:
            return line
    return None
338
+
339
+
340
def suggest_default_chrome_exe_mac():
    """Find a Chromium or Chrome executable on macOS.

    Asks ``mdfind`` for each browser's application bundle first, which
    allows finding installs in non-default locations, then falls back to
    the default ``/Applications`` paths.

    Returns:
        Path to the first executable that exists on disk, or ``None`` if
        none was found.
    """
    import os

    # Try Chromium first, then Chrome. Each candidate is checked on disk
    # before moving on, so a stale or incomplete bundle for one browser
    # does not hide a working install of the other.
    for bundle_id, executable in (
        ("org.chromium.Chromium", "Chromium"),
        ("com.google.Chrome", "Google Chrome"),
    ):
        bundle = mdfind(bundle_id)
        if bundle is None:
            continue
        path = f"{bundle}/Contents/MacOS/{executable}"
        if os.path.exists(path):
            return path

    # Fall back to default paths if mdfind couldn't find it
    # (mdfind might fail to find them even in their default paths
    # if the system has Spotlight disabled.)
    for path in [
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]:
        if os.path.exists(path):
            return path

    return None
365
+
366
+
367
def suggest_default_chrome_exe():
    """Suggest a Chrome/Chromium executable to launch.

    On macOS the mac-specific lookup is consulted first. Otherwise (or if
    that lookup finds nothing) the first known executable name present on
    ``PATH`` is returned.

    Returns:
        A path (macOS lookup) or a bare executable name; falls back to
        ``"chromium-browser"`` when nothing is found.
    """
    import shutil
    import sys

    # First ask mdfind, which lets us find it in non-default paths
    if sys.platform == "darwin":
        mac_path = suggest_default_chrome_exe_mac()
        if mac_path is not None:
            return mac_path

    # "chromium-browser" is the executable on ubuntu trusty
    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
    # google chrome executable names taken from these packages:
    # http://www.ubuntuupdates.org/ppa/google_chrome
    candidates = (
        "chromium-browser",
        "chromium",
        "google-chrome",
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-unstable",
    )
    on_path = (name for name in candidates if shutil.which(name))
    return next(on_path, "chromium-browser")
328
391
329
- # we could make this configurable if there's a good reason
330
- MAX_PAGE_FAILURES = 3
331
392
332
- from brozzler .worker import BrozzlerWorker
333
393
from brozzler .robots import is_permitted_by_robots
334
- from brozzler .frontier import RethinkDbFrontier
335
394
from brozzler .browser import Browser , BrowserPool , BrowsingException
336
- from brozzler .model import (
337
- new_job ,
338
- new_job_file ,
339
- new_site ,
340
- Job ,
341
- Page ,
342
- Site ,
343
- InvalidJobConf ,
344
- )
345
- from brozzler .cli import suggest_default_chrome_exe
346
395
347
396
__all__ = [
348
- "Page" ,
349
- "Site" ,
350
- "BrozzlerWorker" ,
351
397
"is_permitted_by_robots" ,
352
- "RethinkDbFrontier" ,
353
398
"Browser" ,
354
399
"BrowserPool" ,
355
400
"BrowsingException" ,
356
- "new_job" ,
357
- "new_site" ,
358
- "Job" ,
359
- "new_job_file" ,
360
- "InvalidJobConf" ,
361
401
"sleep" ,
362
402
"thread_accept_exceptions" ,
363
403
"thread_raise" ,
404
+ "suggest_default_chrome_exe" ,
364
405
]
406
+
407
import datetime

try:
    import doublethink

    # Synchronize epoch with doublethink if available.
    # Built directly rather than via datetime.utcfromtimestamp(), which is
    # deprecated since Python 3.12; the resulting value is the same.
    EPOCH_UTC = datetime.datetime(1970, 1, 1, tzinfo=doublethink.UTC)

    # All of these imports use doublethink for real and are unsafe
    # to do if doublethink is unavailable.
    from brozzler.worker import BrozzlerWorker
    from brozzler.frontier import RethinkDbFrontier
    from brozzler.model import (
        new_job,
        new_job_file,
        new_site,
        Job,
        Page,
        Site,
        InvalidJobConf,
    )

    # Only advertise the doublethink-backed names when they were imported.
    __all__.extend(
        [
            "Page",
            "BrozzlerWorker",
            "RethinkDbFrontier",
            "Site",
            "new_job",
            "new_site",
            "Job",
            "new_job_file",
            "InvalidJobConf",
        ]
    )
except ImportError:
    # Without doublethink, fall back to the standard library's UTC tzinfo.
    EPOCH_UTC = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)

# we could make this configurable if there's a good reason
MAX_PAGE_FAILURES = 3
0 commit comments