Skip to content

Commit 3b017ad

Browse files
committed
__init__.py: rework imports
Although doublethink is an optional dependency, so that brozzler can be used as a library without it, in practice some mandatory import statements prevented brozzler from being imported when doublethink was missing. This fixes that by gating off some of the imports and exports. If doublethink is available, brozzler works as it does now. If it isn't, we make a few changes:
* brozzler.worker, brozzler.cli and brozzler.model reexports are disabled
* One brozzler.cli function, which is used outside brozzler's own cli, has been moved into brozzler's __init__.py. For compatibility, it's reexported from brozzler.cli.
1 parent c59b08d commit 3b017ad

File tree

2 files changed

+112
-90
lines changed

2 files changed

+112
-90
lines changed

brozzler/__init__.py

+110-26
Original file line numberDiff line numberDiff line change
@@ -321,44 +321,128 @@ def _remove_query(url):
321321
# XXX chop off path after last slash??
322322
site_surt_canon = urlcanon.Canonicalizer(urlcanon.semantic.steps + [_remove_query])
323323

324-
import doublethink
325-
import datetime
326324

327-
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=doublethink.UTC)
325+
def mdfind(identifier):
    """Look up an installed macOS app bundle by its bundle identifier.

    Runs the ``mdfind`` command with a ``kMDItemCFBundleIdentifier`` query.

    Args:
        identifier: bundle identifier to search for, e.g.
            ``"org.chromium.Chromium"``.

    Returns:
        The first non-empty line of ``mdfind`` output, or ``None`` when the
        command fails, cannot be started, or prints nothing.
    """
    import subprocess

    try:
        output = subprocess.check_output(
            ["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
        )
    # Just treat any errors as "couldn't find app". OSError covers the
    # mdfind executable itself being missing or not runnable.
    except (subprocess.CalledProcessError, OSError):
        return None

    # mdfind prints one match per line, so several installed copies of the
    # same app produce several lines; callers build a single path from the
    # result, so hand back only the first match.
    for line in output.split("\n"):
        if line:
            return line
    return None
338+
339+
340+
def suggest_default_chrome_exe_mac():
    """Find a Chromium or Chrome executable on macOS.

    Asks ``mdfind`` for each browser's app bundle, then falls back to the
    default ``/Applications`` install locations.

    Returns:
        Path of the first browser executable that exists on disk, or
        ``None`` if none was found.
    """
    import os

    # Try Chromium first, then Chrome. Each candidate is checked on its own
    # so a stale or missing result for one browser doesn't hide the other.
    for bundle_id, binary_name in (
        ("org.chromium.Chromium", "Chromium"),
        ("com.google.Chrome", "Google Chrome"),
    ):
        app_path = mdfind(bundle_id)
        if app_path is None:
            continue
        exe_path = f"{app_path}/Contents/MacOS/{binary_name}"
        if os.path.exists(exe_path):
            return exe_path

    # Fall back to default paths if mdfind couldn't find it
    # (mdfind might fail to find them even in their default paths
    # if the system has Spotlight disabled.)
    for path in [
        "/Applications/Chromium.app/Contents/MacOS/Chromium",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]:
        if os.path.exists(path):
            return path
    return None
365+
366+
367+
def suggest_default_chrome_exe():
    """Suggest a Chrome/Chromium executable to use by default.

    On macOS the result of ``suggest_default_chrome_exe_mac()`` is returned
    when it is not ``None``. Otherwise the first known executable name found
    by ``shutil.which`` is returned, falling back to ``"chromium-browser"``.
    """
    import shutil
    import sys

    # First ask mdfind, which lets us find it in non-default paths
    if sys.platform == "darwin":
        mac_exe = suggest_default_chrome_exe_mac()
        if mac_exe is not None:
            return mac_exe

    # "chromium-browser" is the executable on ubuntu trusty
    # https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
    # google chrome executable names taken from these packages:
    # http://www.ubuntuupdates.org/ppa/google_chrome
    candidates = (
        "chromium-browser",
        "chromium",
        "google-chrome",
        "google-chrome-stable",
        "google-chrome-beta",
        "google-chrome-unstable",
    )
    found = (name for name in candidates if shutil.which(name))
    return next(found, "chromium-browser")
328391

329-
# we could make this configurable if there's a good reason
330-
MAX_PAGE_FAILURES = 3
331392

332-
from brozzler.worker import BrozzlerWorker
333393
from brozzler.robots import is_permitted_by_robots
334-
from brozzler.frontier import RethinkDbFrontier
335394
from brozzler.browser import Browser, BrowserPool, BrowsingException
336-
from brozzler.model import (
337-
new_job,
338-
new_job_file,
339-
new_site,
340-
Job,
341-
Page,
342-
Site,
343-
InvalidJobConf,
344-
)
345-
from brozzler.cli import suggest_default_chrome_exe
346395

347396
__all__ = [
348-
"Page",
349-
"Site",
350-
"BrozzlerWorker",
351397
"is_permitted_by_robots",
352-
"RethinkDbFrontier",
353398
"Browser",
354399
"BrowserPool",
355400
"BrowsingException",
356-
"new_job",
357-
"new_site",
358-
"Job",
359-
"new_job_file",
360-
"InvalidJobConf",
361401
"sleep",
362402
"thread_accept_exceptions",
363403
"thread_raise",
404+
"suggest_default_chrome_exe",
364405
]
406+
407+
import datetime

# doublethink is an optional dependency: when it (or anything the gated
# modules need) cannot be imported, only EPOCH_UTC is defined here and the
# doublethink-backed names are left out of the package namespace.
try:
    import doublethink

    # Synchronize epoch with doublethink if available.
    # Built explicitly rather than via datetime.utcfromtimestamp(), which is
    # deprecated since Python 3.12; the resulting value is the same.
    EPOCH_UTC = datetime.datetime(1970, 1, 1, tzinfo=doublethink.UTC)

    # All of these imports use doublethink for real and are unsafe
    # to do if doublethink is unavailable.
    from brozzler.worker import BrozzlerWorker
    from brozzler.frontier import RethinkDbFrontier
    from brozzler.model import (
        new_job,
        new_job_file,
        new_site,
        Job,
        Page,
        Site,
        InvalidJobConf,
    )

    __all__.extend(
        [
            "Page",
            "BrozzlerWorker",
            "RethinkDbFrontier",
            "Site",
            "new_job",
            "new_site",
            "Job",
            "new_job_file",
            "InvalidJobConf",
        ]
    )
except ImportError:
    # Same instant, using the standard library's UTC tzinfo.
    EPOCH_UTC = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)

# we could make this configurable if there's a good reason
MAX_PAGE_FAILURES = 3

brozzler/cli.py

+2-64
Original file line numberDiff line numberDiff line change
@@ -30,17 +30,17 @@
3030
import signal
3131
import string
3232
import structlog
33-
import subprocess
3433
import sys
3534
import threading
3635
import time
3736
import traceback
3837
import warnings
3938
import yaml
40-
import shutil
4139
import base64
4240
import rethinkdb as rdb
4341

42+
from brozzler import suggest_default_chrome_exe
43+
4444
r = rdb.RethinkDB()
4545

4646
logger = structlog.get_logger(logger_name=__name__)
@@ -174,68 +174,6 @@ def configure_logging(args):
174174
)
175175

176176

177-
def mdfind(identifier):
178-
try:
179-
result = subprocess.check_output(
180-
["mdfind", f"kMDItemCFBundleIdentifier == {identifier}"], text=True
181-
)
182-
# Just treat any errors as "couldn't find app"
183-
except subprocess.CalledProcessError:
184-
return None
185-
186-
if result:
187-
return result.rstrip("\n")
188-
189-
190-
def suggest_default_chrome_exe_mac():
191-
path = None
192-
# Try Chromium first, then Chrome
193-
result = mdfind("org.chromium.Chromium")
194-
if result is not None:
195-
path = f"{result}/Contents/MacOS/Chromium"
196-
197-
result = mdfind("com.google.Chrome")
198-
if result is not None:
199-
path = f"{result}/Contents/MacOS/Google Chrome"
200-
201-
if path is not None and os.path.exists(path):
202-
return path
203-
204-
# Fall back to default paths if mdfind couldn't find it
205-
# (mdfind might fail to find them even in their default paths
206-
# if the system has Spotlight disabled.)
207-
for path in [
208-
"/Applications/Chromium.app/Contents/MacOS/Chromium",
209-
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
210-
]:
211-
if os.path.exists(path):
212-
return path
213-
214-
215-
def suggest_default_chrome_exe():
216-
# First ask mdfind, which lets us find it in non-default paths
217-
if sys.platform == "darwin":
218-
path = suggest_default_chrome_exe_mac()
219-
if path is not None:
220-
return path
221-
222-
# "chromium-browser" is the executable on ubuntu trusty
223-
# https://github.com/internetarchive/brozzler/pull/6/files uses "chromium"
224-
# google chrome executable names taken from these packages:
225-
# http://www.ubuntuupdates.org/ppa/google_chrome
226-
for exe in [
227-
"chromium-browser",
228-
"chromium",
229-
"google-chrome",
230-
"google-chrome-stable",
231-
"google-chrome-beta",
232-
"google-chrome-unstable",
233-
]:
234-
if shutil.which(exe):
235-
return exe
236-
return "chromium-browser"
237-
238-
239177
class BetterArgumentDefaultsHelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
240178
"""
241179
Like argparse.ArgumentDefaultsHelpFormatter but omits the default value

0 commit comments

Comments
 (0)