Skip to content

Commit d16f036

Browse files
WT-2950 Implement Seed-level video capture setting handling + Job-level PDF-only option
1 parent 0d8721a commit d16f036

File tree

6 files changed

+115
-53
lines changed

6 files changed

+115
-53
lines changed

brozzler/cli.py

-15
Original file line numberDiff line numberDiff line change
@@ -544,27 +544,12 @@ def dump_state(signum, frame):
544544
finally:
545545
signal.signal(signal.SIGQUIT, dump_state)
546546

547-
def get_skip_av_seeds():
548-
# TODO: develop UI and refactor
549-
SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt"
550-
try:
551-
# make set from seed IDs in SKIP_AV_SEEDS_FILE
552-
with open(SKIP_AV_SEEDS_FILE) as skips:
553-
skip_av_seeds = {int(l) for l in skips.readlines()}
554-
logging.info("running with skip_av_seeds file %s" % SKIP_AV_SEEDS_FILE)
555-
except Exception as e:
556-
skip_av_seeds = set()
557-
logging.info("running with empty skip_av_seeds")
558-
return skip_av_seeds
559-
560547
rr = rethinker(args)
561548
frontier = brozzler.RethinkDbFrontier(rr)
562549
service_registry = doublethink.ServiceRegistry(rr)
563-
skip_av_seeds_from_file = get_skip_av_seeds()
564550
worker = brozzler.worker.BrozzlerWorker(
565551
frontier,
566552
service_registry,
567-
skip_av_seeds=skip_av_seeds_from_file,
568553
max_browsers=int(args.max_browsers),
569554
chrome_exe=args.chrome_exe,
570555
proxy=args.proxy,

brozzler/job_schema.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,13 @@ seeds:
9595
password:
9696
type: string
9797

98+
video_capture:
99+
type: string
100+
98101
<<: *multi_level_options
99102

100103
max_claimed_sites:
101104
type: integer
102105

106+
pdfs_only:
107+
type: boolean

brozzler/model.py

+36-11
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import uuid
3535
import yaml
3636
import zlib
37+
from enum import Enum
3738
from typing import Optional
3839

3940

@@ -100,6 +101,8 @@ def new_job(frontier, job_conf):
100101
job.id = job_conf["id"]
101102
if "max_claimed_sites" in job_conf:
102103
job.max_claimed_sites = job_conf["max_claimed_sites"]
104+
if "pdfs_only" in job_conf:
105+
job.pdfs_only = job_conf["pdfs_only"]
103106
job.save()
104107

105108
sites = []
@@ -198,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn):
198201
def populate_defaults(self):
199202
if not "status" in self:
200203
self.status = "ACTIVE"
204+
if "pdfs_only" not in self:
205+
self.pdfs_only = False
201206
if not "starts_and_stops" in self:
202207
if self.get("started"): # backward compatibility
203208
self.starts_and_stops = [
@@ -220,33 +225,53 @@ def finish(self):
220225
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()
221226

222227

228+
class VideoCaptureOptions(Enum):
229+
"""
230+
Enumeration of possible values for the `video_capture` config key.
231+
- ENABLE_VIDEO_CAPTURE (default): All video is captured.
232+
- DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
233+
combination of the next two values.
234+
- BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
235+
containing the word "video" is not captured.
236+
- DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
237+
238+
Note: Ensuring full video MIME type blocking requires an additional entry in the
239+
Warcprox-Meta header `mime-type-filters` key.
240+
"""
241+
242+
ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
243+
DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
244+
BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
245+
DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"
246+
247+
223248
class Site(doublethink.Document, ElapsedMixIn):
224249
logger = logging.getLogger(__module__ + "." + __qualname__)
225250
table = "sites"
226251

227252
def populate_defaults(self):
228-
if not "status" in self:
253+
if "status" not in self:
229254
self.status = "ACTIVE"
230-
if not "claimed" in self:
255+
if "claimed" not in self:
231256
self.claimed = False
232-
if not "last_disclaimed" in self:
257+
if "last_disclaimed" not in self:
233258
self.last_disclaimed = brozzler.EPOCH_UTC
234-
if not "last_claimed" in self:
259+
if "last_claimed" not in self:
235260
self.last_claimed = brozzler.EPOCH_UTC
236-
if not "scope" in self:
261+
if "scope" not in self:
237262
self.scope = {}
238-
if not "skip_ytdlp" in self:
239-
self.skip_ytdlp = None
263+
if "video_capture" not in self:
264+
self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value
240265

241266
# backward compatibility
242267
if "surt" in self.scope:
243-
if not "accepts" in self.scope:
268+
if "accepts" not in self.scope:
244269
self.scope["accepts"] = []
245270
self.scope["accepts"].append({"surt": self.scope["surt"]})
246271
del self.scope["surt"]
247272

248273
# backward compatibility
249-
if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
274+
if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
250275
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
251276
if "max_hops_off_surt" in self.scope:
252277
del self.scope["max_hops_off_surt"]
@@ -256,7 +281,7 @@ def populate_defaults(self):
256281
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
257282
)
258283

259-
if not "starts_and_stops" in self:
284+
if "starts_and_stops" not in self:
260285
if self.get("start_time"): # backward compatibility
261286
self.starts_and_stops = [
262287
{"start": self.get("start_time"), "stop": None}
@@ -271,7 +296,7 @@ def __str__(self):
271296
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
272297

273298
def _accept_ssurt_if_not_redundant(self, ssurt):
274-
if not "accepts" in self.scope:
299+
if "accepts" not in self.scope:
275300
self.scope["accepts"] = []
276301
simple_rule_ssurts = (
277302
rule["ssurt"]

brozzler/worker.py

+34-9
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import logging
2222
import brozzler
2323
import brozzler.browser
24+
from brozzler.model import VideoCaptureOptions
2425
import threading
2526
import time
2627
import urllib.request
@@ -54,7 +55,6 @@ def __init__(
5455
self,
5556
frontier,
5657
service_registry=None,
57-
skip_av_seeds=None,
5858
max_browsers=1,
5959
chrome_exe="chromium-browser",
6060
warcprox_auto=False,
@@ -74,7 +74,6 @@ def __init__(
7474
):
7575
self._frontier = frontier
7676
self._service_registry = service_registry
77-
self._skip_av_seeds = skip_av_seeds
7877
self._max_browsers = max_browsers
7978

8079
self._warcprox_auto = warcprox_auto
@@ -250,7 +249,17 @@ def brozzle_page(
250249

251250
if not self._needs_browsing(page_headers):
252251
self.logger.info("needs fetch: %s", page)
253-
self._fetch_url(site, page=page)
252+
if site.pdfs_only and not self._is_pdf(page_headers):
253+
self.logger.info("skipping non-PDF content: PDFs only option enabled")
254+
elif site.video_capture in [
255+
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
256+
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
257+
] and self._is_video_type(page_headers):
258+
self.logger.info(
259+
"skipping video content: video MIME type capture disabled for site"
260+
)
261+
else:
262+
self._fetch_url(site, page=page)
254263
else:
255264
self.logger.info("needs browsing: %s", page)
256265
try:
@@ -262,7 +271,7 @@ def brozzle_page(
262271
self.logger.info("page interstitial shown (http auth): %s", page)
263272

264273
if enable_youtube_dl and ydl.should_ytdlp(
265-
site, page, browser.websock_thread.page_status, self._skip_av_seeds
274+
site, page, browser.websock_thread.page_status
266275
):
267276
try:
268277
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
@@ -303,13 +312,29 @@ def _get_page_headers(self, page):
303312
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
304313
return {}
305314

306-
def _needs_browsing(self, page_headers):
307-
if (
315+
def _needs_browsing(self, page_headers) -> bool:
316+
return not bool(
308317
"content-type" in page_headers
309318
and "html" not in page_headers["content-type"]
310-
):
311-
return False
312-
return True
319+
)
320+
321+
def _is_video_type(self, page_headers) -> bool:
    """
    Determines if the page's Content-Type header specifies that it contains
    a video.

    The match is case-insensitive because media type names are
    case-insensitive (for example, "Video/MP4" is a video type). A missing,
    empty or None Content-Type value is treated as "not a video".
    """
    if "content-type" not in page_headers:
        return False
    # Guard against a None/empty header value before normalizing the case.
    content_type = page_headers["content-type"] or ""
    return "video" in str(content_type).lower()
329+
330+
def _is_pdf(self, page_headers) -> bool:
    """
    Determines if the page's Content-Type header specifies that it is a PDF.

    The match is case-insensitive because media type names are
    case-insensitive (for example, "Application/PDF"). A missing, empty or
    None Content-Type value is treated as "not a PDF".
    """
    if "content-type" not in page_headers:
        return False
    # Guard against a None/empty header value before normalizing the case.
    content_type = page_headers["content-type"] or ""
    return "application/pdf" in str(content_type).lower()
313338

314339
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
315340
def _on_screenshot(screenshot_jpeg):

brozzler/ydl.py

+7-18
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import yt_dlp
2121
from yt_dlp.utils import match_filter_func
2222
import brozzler
23+
from brozzler.model import VideoCaptureOptions
2324
import urllib.request
2425
import tempfile
2526
import urlcanon
@@ -32,36 +33,24 @@
3233
thread_local = threading.local()
3334

3435

35-
def should_ytdlp(site, page, page_status, skip_av_seeds):
36+
def should_ytdlp(site, page, page_status):
3637
# called only after we've passed needs_browsing() check
3738

3839
if page_status != 200:
3940
logging.info("skipping ytdlp: non-200 page status %s", page_status)
4041
return False
41-
if site.skip_ytdlp:
42-
logging.info("skipping ytdlp: site marked skip_ytdlp")
42+
if site.video_capture in [
43+
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
44+
VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
45+
]:
46+
logging.info("skipping ytdlp: site has video capture disabled")
4347
return False
4448

4549
ytdlp_url = page.redirect_url if page.redirect_url else page.url
4650

4751
if "chrome-error:" in ytdlp_url:
4852
return False
4953

50-
ytdlp_seed = (
51-
site["metadata"]["ait_seed_id"]
52-
if "metadata" in site and "ait_seed_id" in site["metadata"]
53-
else None
54-
)
55-
56-
# TODO: develop UI and refactor
57-
if ytdlp_seed:
58-
if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
59-
logging.info("skipping ytdlp: site in skip_av_seeds")
60-
site.skip_ytdlp = True
61-
return False
62-
else:
63-
site.skip_ytdlp = False
64-
6554
return True
6655

6756

job-conf.rst

+33
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,20 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
107107
simultaneously across the cluster. Addresses the problem of a job with many
108108
seeds starving out other jobs.
109109

110+
``pdfs_only``
111+
~~~~~~~~~~~~~~~~~~~~~
112+
+---------+----------+-----------+
113+
| type | required | default |
114+
+=========+==========+===========+
115+
| boolean | no | ``false`` |
116+
+---------+----------+-----------+
117+
Limits capture to PDFs based on the MIME type set in the HTTP response's
118+
Content-Type header. This value only impacts processing of outlinks within
119+
Brozzler.
120+
121+
*Note: Ensuring comprehensive limiting to only PDFs requires an additional
122+
entry in the Warcprox-Meta header ``mime-type-filters`` key.*
123+
110124
``seeds``
111125
~~~~~~~~~
112126
+------------------------+----------+---------+
@@ -158,6 +172,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
158172
the default values in place. Brozzler submits login forms after page load.
159173
Then brozzling proceeds as usual.
160174

175+
``video_capture``
176+
~~~~~~~~~~~~~~~~~
177+
+--------+----------+--------------------------+
178+
| type | required | default |
179+
+========+==========+==========================+
180+
| string | no | ``ENABLE_VIDEO_CAPTURE`` |
181+
+--------+----------+--------------------------+
182+
Determines the level of video capture for the seed. This is an enumeration with four possible values:
183+
184+
* ENABLE_VIDEO_CAPTURE (default): All video is captured.
185+
* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
186+
combination of the next two values.
187+
* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
188+
the word "video" is not captured.
189+
* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.
190+
191+
*Note: Ensuring full video MIME type blocking requires an additional entry in
192+
the Warcprox-Meta header ``mime-type-filters`` key.*
193+
161194
Seed-level / top-level settings
162195
-------------------------------
163196
These are seed settings that can also be specified at the top level, in which

0 commit comments

Comments
 (0)