Skip to content

Commit 1bbb319

Browse files
WT-2950 Implement Seed-level video capture setting handling + Job-level PDF-only option
1 parent 353cc1b commit 1bbb319

File tree

5 files changed

+105
-47
lines changed

5 files changed

+105
-47
lines changed

brozzler/cli.py

-17
Original file line numberDiff line numberDiff line change
@@ -664,21 +664,6 @@ def dump_state(signum, frame):
664664
finally:
665665
signal.signal(signal.SIGQUIT, dump_state)
666666

667-
def get_skip_av_seeds():
    """
    Load the set of seed IDs listed in the skip-AV seeds file.

    The file is read as one integer seed ID per line. Whitespace-only lines
    are ignored so that a trailing newline does not invalidate the whole
    file. This is best effort: if the file cannot be read, or an entry is not
    an integer, an empty set is returned and that fact is logged.

    Returns:
        A set of integer seed IDs (possibly empty).
    """
    # TODO: develop UI and refactor
    SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt"
    try:
        # make set from seed IDs in SKIP_AV_SEEDS_FILE
        with open(SKIP_AV_SEEDS_FILE) as skips:
            skip_av_seeds = {int(line) for line in skips if line.strip()}
    except (OSError, ValueError):
        # OSError: file missing or unreadable; ValueError: a non-integer
        # entry (this also covers undecodable bytes).
        logger.info("running with empty skip_av_seeds")
        return set()
    logger.info("running with skip_av_seeds file", skip_av_seeds=SKIP_AV_SEEDS_FILE)
    return skip_av_seeds
681-
682667
def get_ytdlp_proxy_endpoints():
683668
YTDLP_PROXY_ENDPOINTS_FILE = args.ytdlp_proxy_file
684669
try:
@@ -698,12 +683,10 @@ def get_ytdlp_proxy_endpoints():
698683
rr = rethinker(args)
699684
frontier = brozzler.RethinkDbFrontier(rr)
700685
service_registry = doublethink.ServiceRegistry(rr)
701-
skip_av_seeds_from_file = get_skip_av_seeds()
702686
ytdlp_proxy_endpoints_from_file = get_ytdlp_proxy_endpoints()
703687
worker = brozzler.worker.BrozzlerWorker(
704688
frontier,
705689
service_registry,
706-
skip_av_seeds=skip_av_seeds_from_file,
707690
ytdlp_proxy_endpoints=ytdlp_proxy_endpoints_from_file,
708691
max_browsers=int(args.max_browsers),
709692
chrome_exe=args.chrome_exe,

brozzler/job_schema.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,13 @@ seeds:
9595
password:
9696
type: string
9797

98+
video_capture:
99+
type: string
100+
98101
<<: *multi_level_options
99102

100103
max_claimed_sites:
101104
type: integer
102105

106+
pdfs_only:
107+
type: boolean

brozzler/model.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import urllib
2626
import uuid
2727
import zlib
28+
from enum import Enum
2829
from typing import Optional
2930

3031
import cerberus
@@ -101,6 +102,8 @@ def new_job(frontier, job_conf):
101102
job.id = job_conf["id"]
102103
if "max_claimed_sites" in job_conf:
103104
job.max_claimed_sites = job_conf["max_claimed_sites"]
105+
if "pdfs_only" in job_conf:
106+
job.pdfs_only = job_conf["pdfs_only"]
104107
job.save()
105108

106109
sites = []
@@ -199,6 +202,8 @@ class Job(doublethink.Document, ElapsedMixIn):
199202
def populate_defaults(self):
200203
if "status" not in self:
201204
self.status = "ACTIVE"
205+
if "pdfs_only" not in self:
206+
self.pdfs_only = False
202207
if "starts_and_stops" not in self:
203208
if self.get("started"): # backward compatibility
204209
self.starts_and_stops = [
@@ -221,6 +226,26 @@ def finish(self):
221226
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
222227

223228

229+
class VideoCaptureOptions(Enum):
    """
    Accepted values for the `video_capture` config key.

    Each member's value is the string spelling of its own name:

    - ENABLE_VIDEO_CAPTURE (default): every video is captured.
    - DISABLE_VIDEO_CAPTURE: no video is captured; equivalent to applying
      both BLOCK_VIDEO_MIME_TYPES and DISABLE_YTDLP_CAPTURE.
    - BLOCK_VIDEO_MIME_TYPES: responses whose Content-Type header contains
      the word "video" are not captured.
    - DISABLE_YTDLP_CAPTURE: yt-dlp based video capture is turned off.

    Note: fully blocking video MIME types also requires an extra entry in
    the Warcprox-Meta header `mime-type-filters` key.
    """

    ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
    DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
    BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
    DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"
247+
248+
224249
class Site(doublethink.Document, ElapsedMixIn):
225250
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
226251
table = "sites"
@@ -236,8 +261,8 @@ def populate_defaults(self):
236261
self.last_claimed = brozzler.EPOCH_UTC
237262
if "scope" not in self:
238263
self.scope = {}
239-
if "skip_ytdlp" not in self:
240-
self.skip_ytdlp = None
264+
if "video_capture" not in self:
265+
self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
241266

242267
# backward compatibility
243268
if "surt" in self.scope:

brozzler/worker.py

+40-28
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838

3939
import brozzler
4040
import brozzler.browser
41+
from brozzler.model import VideoCaptureOptions
4142

4243
from . import metrics
4344

@@ -60,7 +61,6 @@ def __init__(
6061
self,
6162
frontier,
6263
service_registry=None,
63-
skip_av_seeds=None,
6464
ytdlp_proxy_endpoints=None,
6565
max_browsers=1,
6666
chrome_exe="chromium-browser",
@@ -85,7 +85,6 @@ def __init__(
8585
):
8686
self._frontier = frontier
8787
self._service_registry = service_registry
88-
self._skip_av_seeds = skip_av_seeds
8988
self._ytdlp_proxy_endpoints = ytdlp_proxy_endpoints
9089
self._max_browsers = max_browsers
9190

@@ -274,36 +273,23 @@ def thumb_jpeg(self, full_jpeg):
274273
img.save(out, "jpeg", quality=95)
275274
return out.getbuffer()
276275

277-
def should_ytdlp(self, logger, site, page, page_status):
    """
    Decide whether yt-dlp should be run for a page.

    Called only after the page has passed the needs_browsing() check.

    Args:
        logger: logger used to report why yt-dlp is being skipped.
        site: the site being crawled; its `video_capture` setting is read.
        page: the page being processed; `redirect_url` is preferred over
            `url` when it is set.
        page_status: HTTP status code observed for the page.

    Returns:
        False when the page status is not 200, when the site's
        `video_capture` is DISABLE_VIDEO_CAPTURE or DISABLE_YTDLP_CAPTURE,
        or when the URL is a chrome error page; True otherwise.
    """
    if page_status != 200:
        logger.info("skipping ytdlp: non-200 page status", page_status=page_status)
        return False
    if site.video_capture in [
        VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
        VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
    ]:
        logger.info("skipping ytdlp: site has video capture disabled")
        # Without this return the setting was only logged and yt-dlp still
        # ran for sites that had disabled it.
        return False

    ytdlp_url = page.redirect_url if page.redirect_url else page.url

    if "chrome-error:" in ytdlp_url:
        return False

    return True
308294

309295
@metrics.brozzler_page_processing_duration_seconds.time()
@@ -325,7 +311,17 @@ def brozzle_page(
325311

326312
if not self._needs_browsing(page_headers):
327313
page_logger.info("needs fetch")
328-
self._fetch_url(site, page=page)
314+
if site.pdfs_only and not self._is_pdf(page_headers):
315+
self.logger.info("skipping non-PDF content: PDFs only option enabled")
316+
elif site.video_capture in [
317+
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
318+
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
319+
] and self._is_video_type(page_headers):
320+
self.logger.info(
321+
"skipping video content: video MIME type capture disabled for site"
322+
)
323+
else:
324+
self._fetch_url(site, page=page)
329325
else:
330326
page_logger.info("needs browsing")
331327
try:
@@ -340,7 +336,7 @@ def brozzle_page(
340336
page_logger.info("page interstitial shown (http auth)")
341337

342338
if enable_youtube_dl and self.should_ytdlp(
343-
page_logger, site, page, status_code, self._skip_av_seeds
339+
page_logger, site, page, status_code
344340
):
345341
try:
346342
from . import ydl
@@ -399,13 +395,29 @@ def _get_page_headers(self, site, page):
399395
url_logger.warning("Failed to get headers", exc_info=True)
400396
return {}
401397

402-
def _needs_browsing(self, page_headers):
403-
if (
398+
def _needs_browsing(self, page_headers) -> bool:
399+
return not bool(
404400
"content-type" in page_headers
405401
and "html" not in page_headers["content-type"]
406-
):
407-
return False
408-
return True
402+
)
403+
404+
def _is_video_type(self, page_headers) -> bool:
405+
"""
406+
Determines if the page's Content-Type header specifies that it contains
407+
a video.
408+
"""
409+
return (
410+
"content-type" in page_headers and "video" in page_headers["content-type"]
411+
)
412+
413+
def _is_pdf(self, page_headers) -> bool:
414+
"""
415+
Determines if the page's Content-Type header specifies that it is a PDF.
416+
"""
417+
return (
418+
"content-type" in page_headers
419+
and "application/pdf" in page_headers["content-type"]
420+
)
409421

410422
@metrics.brozzler_browsing_duration_seconds.time()
411423
@metrics.brozzler_in_progress_browses.track_inprogress()

job-conf.rst

+33
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
107107
simultaneously across the cluster. Addresses the problem of a job with many
108108
seeds starving out other jobs.
109109

110+
``pdfs_only``
111+
~~~~~~~~~~~~~~~~~~~~~
112+
+---------+----------+-----------+
113+
| type | required | default |
114+
+=========+==========+===========+
115+
| boolean | no | ``false`` |
116+
+---------+----------+-----------+
117+
Limits capture to PDFs based on the MIME type set in the HTTP response's
118+
Content-Type header. This value only impacts processing of outlinks within
119+
Brozzler.
120+
121+
*Note: Ensuring comprehensive limiting to only PDFs requires an additional
122+
entry in the Warcprox-Meta header `mime-type-filters` key.*
123+
110124
``seeds``
111125
~~~~~~~~~
112126
+------------------------+----------+---------+
@@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
158172
the default values in place. Brozzler submits login forms after page load.
159173
Then brozzling proceeds as usual.
160174

175+
``video_capture``
176+
~~~~~~~~~~~~~~~~~
177+
+--------+----------+--------------------------+
178+
| type | required | default |
179+
+========+==========+==========================+
180+
| string | no | ``ENABLE_VIDEO_CAPTURE`` |
181+
+--------+----------+--------------------------+
182+
Determines the level of video capture for the seed. This is an enumeration with four possible values:
183+
184+
* ENABLE_VIDEO_CAPTURE (default): All video is captured.
185+
* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
186+
combination of the next two values.
187+
* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
188+
the word "video" is not captured.
189+
* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
190+
191+
*Note: Ensuring full video MIME type blocking requires an additional entry in
192+
the Warcprox-Meta header `mime-type-filters` key.*
193+
161194
Seed-level / top-level settings
162195
-------------------------------
163196
These are seed settings that can also be specified at the top level, in which

0 commit comments

Comments
 (0)