-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy paththnr_scrapers.py
630 lines (527 loc) · 21.2 KB
/
thnr_scrapers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
import logging
import os
import time
import traceback
import magic
import requests
from bs4 import BeautifulSoup
import config
import thumbs
import utils_aws
import utils_http
import utils_random
import utils_text
import utils_time
from thnr_exceptions import *
# Module-level logger; level is pinned to INFO so this module's records are
# emitted even if the root logger is configured more restrictively.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def download_og_image2(
    story_id, hostname_dict, og_image_dict, first_time=True, url_queue=None
):
    """Download a story's og:image into TEMP_DIR and record metadata about it.

    Mutates ``og_image_dict`` in place, populating:
    - "og_image_url_possibly_redirected": final URL after any redirects
    - "og_image_filename_details_from_url"
    - "og_image_content_type": server-reported content type (HEAD request)
    - "normalized_og_image_filename" / "downloaded_orig_thumb_full_path"
    - "downloaded_og_image_magic_result": libmagic MIME type of the download

    Returns True if the image was downloaded to disk, False otherwise.
    """
    log_prefix_id = f"id={story_id}: "
    log_prefix_local = log_prefix_id + f"d_og_i2: "
    if first_time:
        url_queue = []
        if og_image_dict["og_image_url"]:
            url_queue.append(og_image_dict["og_image_url"])
    if not url_queue:
        logger.info(log_prefix_local + "failed to download og:image")
        return False
    cur_url = url_queue.pop(0)
    ## attempt to heal a few common URL problems
    """
    heal these problems:
    - update download og image2 to heal the following problems:
    <script src="//cdn.jsdelivr.net/npm/[email protected]/dist/purify.min.js"></script>
    <script src="/static/browse/0.3.4/js/toggle-labs.js?20230622" type="text/javascript">
    """
    # TODO: eventually move these validations and healings to a separate function
    # problems where the URLs start with two slashes
    if cur_url[0:2] == "//" and cur_url[2:3] != "/":
        # not yet implemented — see the notes below for the cases to handle
        pass
        # URL starts with two slashes and then the host
        # //example.com/images/foo.jpg
        # //example.com
        # URL starts with two slashes and then the path
        # //images/foo.jpg
        # //images
    # problems where the URLs start with one slash
    elif cur_url[0:1] == "/" and cur_url[1:2] != "/":
        # not yet implemented
        pass
        # URL starts with one slash and then the host (the slash is a malformed scheme separator)
        # /example.com/images/foo.jpg
        # URL starts with one slash and then the path
        # /images/foo.jpg
    elif not cur_url.startswith("http://") and not cur_url.startswith("https://"):
        # not yet implemented
        pass
        # URL starts with the host
        # example.com/images/foo.jpg
        # URL starts with the path but without the initial path separator (i.e., the root slash)
        # images/foo.jpg
        # idea: try adding "http://" and "https://" and trying to reach them to see if it works
    elif cur_url.startswith("./") and cur_url[2:3] != "/":
        # not yet implemented
        pass
        # URL begins with a single dot and a slash, but the next character is not a slash
        # saw in wild: ./noterous-banner.png s.b. example.com/noterous-banner.png
    # URL is any of various localhost development URLs
    elif cur_url[:15] in [
        "http://localhos",
        "https://localho",
        "http://192.168.",
        "https://192.168",
        "http://127.0.0.",
        "https://127.0.0",
    ]:
        localhost_url = cur_url
        healed_url = utils_text.heal_localhost_url(
            localhost_url=localhost_url,
            real_hostname=hostname_dict["full"],
        )
        og_image_dict["og_image_url"] = healed_url
        cur_url = healed_url
        logger.info(
            log_prefix_local
            + f"healed localhost og:image URL {localhost_url} to {healed_url} ~Tim~"
        )
    # by now, we have somewhat validated and possibly healed the URL
    get_response = None
    # final URL after redirects; only assigned on a 200 response, so default it
    # here to avoid a latent NameError
    possibly_redirected_url = None
    try:
        with requests.get(
            allow_redirects=True,
            headers={"User-Agent": config.settings["SCRAPING"]["UA_STR"]},
            timeout=config.settings["SCRAPING"]["REQUESTS_GET_TIMEOUT_S"],
            url=cur_url,
            verify=False,
        ) as response:
            if response and response.status_code == 200:
                get_response = response
                possibly_redirected_url = get_response.url
    except requests.exceptions.MissingSchema as exc:
        short_exc_name = exc.__class__.__name__
        exc_name = exc.__class__.__module__ + "." + short_exc_name
        exc_msg = str(exc)
        exc_slug = f"{exc_name}: {exc_msg}"
        tb_str = traceback.format_exc()
        logger.error(
            log_prefix_local
            + "unexpected exception "
            + exc_slug
            + f" for url {cur_url}"
        )
        logger.error(log_prefix_local + tb_str)
        return False
    except Exception as exc:
        utils_http.handle_exception(
            exc=exc,
            log_prefix=log_prefix_local + "get: ",
            context={"url": cur_url},
        )
        return False
    if not get_response:
        # I think we won't enter this block, because an exception would have occurred to cause this situation
        logger.error(log_prefix_local + "no get_response ~Tim~")
        return False
    # record the post-redirect URL before deriving filename details from it;
    # the previous version read this key without ever setting it (KeyError)
    og_image_dict["og_image_url_possibly_redirected"] = possibly_redirected_url
    og_image_dict["og_image_filename_details_from_url"] = (
        utils_text.get_filename_details_from_url(possibly_redirected_url)
    )
    og_image_dict["og_image_content_type"] = (
        utils_http.get_content_type_via_head_request(
            url=possibly_redirected_url, log_prefix=log_prefix_local
        )
    )
    if og_image_dict["og_image_content_type"]:
        logger.info(
            log_prefix_local
            + f"content-type is '{og_image_dict['og_image_content_type']}'"
            + f" for og:image uri {possibly_redirected_url}"
        )
    else:
        logger.info(
            log_prefix_local
            + f"content-type unavailable for og:image uri {possibly_redirected_url}"
        )
    # download og:image
    og_image_dict["normalized_og_image_filename"] = f"{story_id}-og-image"
    og_image_dict["downloaded_orig_thumb_full_path"] = os.path.join(
        config.settings["TEMP_DIR"],
        og_image_dict["normalized_og_image_filename"],
    )
    with open(og_image_dict["downloaded_orig_thumb_full_path"], "wb") as fout:
        fout.write(get_response.content)
    # determine magic type of downloaded og:image
    og_image_dict["downloaded_og_image_magic_result"] = magic.from_file(
        og_image_dict["downloaded_orig_thumb_full_path"], mime=True
    )
    logger.info(
        log_prefix_local
        + f"magic type is '{og_image_dict['downloaded_og_image_magic_result']}'"
        + f" for downloaded og:image"
    )
    return True
def download_og_image1(
    story_object, alt_url=None, use_url_queue=False, url_queue=None
) -> bool:
    """Download a story's og:image into TEMP_DIR and record metadata on it.

    Mutates ``story_object`` in place (og_image_url_possibly_redirected,
    og_image_filename_details_from_url, og_image_content_type,
    normalized_og_image_filename, downloaded_orig_thumb_full_path,
    downloaded_og_image_magic_result). When the URL is schemeless, retries
    once recursively with a healed URL passed via ``alt_url``.

    Returns True if the image was downloaded to disk, False otherwise.
    """
    log_prefix_id = f"id={story_object.id}: "
    log_prefix_local = log_prefix_id + f"d_og_i1: "
    # start a fresh queue unless the caller explicitly supplied one;
    # also guard against use_url_queue=True with url_queue=None (AttributeError)
    if not use_url_queue or url_queue is None:
        url_queue = []
        if story_object.og_image_url:
            url_queue.append(story_object.og_image_url)
    if alt_url:
        url_queue.insert(0, alt_url)
    if not url_queue:
        logger.info(log_prefix_local + "failed to download og:image")
        return False
    cur_url = url_queue.pop(0)
    ## attempt to heal a few common URL problems
    # URL is a dev URL using localhost
    if cur_url[:16] in [
        "http://localhost",
        "https://localhos",
    ]:
        localhost_url = cur_url
        possibly_healed_url = utils_text.heal_localhost_url(
            localhost_url=localhost_url,
            real_hostname=story_object.hostname_dict["full"],
        )
        story_object.og_image_url = possibly_healed_url
        cur_url = possibly_healed_url
        logger.info(
            log_prefix_local
            + f"healed localhost og:image URL {localhost_url} to {possibly_healed_url} ~Tim~"
        )
    # # check whether we disqualify this og:image url
    # if thumbs.image_url_is_disqualified(url=cur_url, log_prefix=log_prefix_id):
    #     return False
    get_response = None
    try:
        with requests.get(
            url=cur_url,
            allow_redirects=True,
            verify=False,
            timeout=config.settings["SCRAPING"]["REQUESTS_GET_TIMEOUT_S"],
            headers={"User-Agent": config.settings["SCRAPING"]["UA_STR"]},
        ) as response:
            get_response = response
            if get_response.url != cur_url:
                # we were redirected; re-check the destination URL
                if thumbs.image_url_is_disqualified(
                    url=get_response.url, log_prefix=log_prefix_id
                ):
                    return False
                story_object.og_image_url_possibly_redirected = get_response.url
    except requests.exceptions.MissingSchema as exc:
        short_exc_name = exc.__class__.__name__
        exc_name = f"{exc.__class__.__module__}.{short_exc_name}"
        exc_msg = str(exc)
        exc_slug = f"{exc_name}: {exc_msg}"
        # schemeless URL starting "//": prepend scheme and hostname, retry once
        if cur_url[0:2] == "//" and cur_url[2:3] != "/":
            possibly_healed_url = (
                f"http://{story_object.hostname_dict['minus_www']}/{cur_url[2:]}"
            )
            logger.info(
                log_prefix_local
                + "get: "
                + f"attempting to heal and retry schemeless og:image URL {cur_url} as {possibly_healed_url}"
            )
            time.sleep(utils_random.random_real(0.5, 1.5))
            return download_og_image1(story_object, alt_url=possibly_healed_url)
        # root-relative URL starting "/": prepend scheme and hostname, retry once
        elif cur_url[0:1] == "/" and cur_url[1:2] != "/":
            possibly_healed_url = (
                f"http://{story_object.hostname_dict['minus_www']}/{cur_url[1:]}"
            )
            logger.info(
                log_prefix_local
                + "get: "
                + f"attempting to heal and retry schemeless og:image URL {cur_url} as {possibly_healed_url}"
            )
            time.sleep(utils_random.random_real(0.5, 1.5))
            return download_og_image1(story_object, alt_url=possibly_healed_url)
        else:
            logger.info(log_prefix_local + "get: " + "failed to get og:image")
            logger.info(log_prefix_local + "get: " + exc_slug)
            tb_str = traceback.format_exc()
            logger.info(log_prefix_local + "get: " + tb_str)
            story_object.og_image_url = ""
            story_object.og_image_url_possibly_redirected = ""
            return False
    except Exception as exc:
        utils_http.handle_exception(
            exc=exc,
            log_prefix=log_prefix_local + "get: ",
            context={"url": cur_url},
        )
        return False
    if not get_response:
        logger.error(log_prefix_local + "no get_response ~Tim~")
        return False
    story_object.og_image_filename_details_from_url = (
        utils_text.get_filename_details_from_url(
            story_object.og_image_url_possibly_redirected
        )
    )
    # get server-reported content type for og:image
    if story_object.og_image_url_possibly_redirected:
        url_to_use = story_object.og_image_url_possibly_redirected
    elif story_object.og_image_url:
        url_to_use = story_object.og_image_url
    else:
        # previously url_to_use was left unbound here (NameError);
        # fall back to the URL we actually fetched
        url_to_use = cur_url
    story_object.og_image_content_type = utils_http.get_content_type_via_head_request(
        url=url_to_use, log_prefix=log_prefix_local
    )
    if story_object.og_image_content_type:
        logger.info(
            log_prefix_id
            + f"content-type is '{story_object.og_image_content_type}'"
            + f" for og:image uri {story_object.og_image_url_possibly_redirected}"
        )
    else:
        logger.info(
            log_prefix_id
            + f"content-type unavailable for og:image uri {story_object.og_image_url_possibly_redirected}"
        )
    # download og:image
    story_object.normalized_og_image_filename = f"{story_object.id}-og-image"
    story_object.downloaded_orig_thumb_full_path = os.path.join(
        config.settings["TEMP_DIR"],
        story_object.normalized_og_image_filename,
    )
    with open(story_object.downloaded_orig_thumb_full_path, "wb") as fout:
        fout.write(get_response.content)
    # determine magic type of downloaded og:image
    story_object.downloaded_og_image_magic_result = magic.from_file(
        story_object.downloaded_orig_thumb_full_path, mime=True
    )
    logger.info(
        log_prefix_id
        + f"magic type is '{story_object.downloaded_og_image_magic_result}'"
        + f" for downloaded og:image"
    )
    if alt_url:
        # we were retrying a healed URL: accept only image/* or PDF payloads
        if (
            story_object.downloaded_og_image_magic_result.startswith("image/")
            or story_object.downloaded_og_image_magic_result == "application/pdf"
        ):
            logger.info(
                log_prefix_id
                + f"successfully healed og:image URL {story_object.og_image_url} to {alt_url}"
            )
            return True
        else:
            logger.info(
                log_prefix_id
                + f"og:image URL {story_object.og_image_url} seemed healed to {alt_url}, but downloaded og:image has magic type {story_object.downloaded_og_image_magic_result}"
            )
            return False
    return True
def get_roster_for_story_type(roster_story_type: str = None, log_prefix=""):
    """Return a list of story ids for the given story type.

    "active"/"classic" rosters are screen-scraped; "best"/"new"/"top" come
    from the Firebase endpoint. Unrecognized types are logged and yield [].
    Any exception during retrieval is logged and an empty roster is returned.
    """
    roster = []
    # pick the retrieval strategy; the duplicated try/except arms of the
    # previous version are consolidated into one handler below
    if roster_story_type in ["active", "classic"]:
        def fetch():
            return get_roster_via_screen_scraping(
                roster_story_type=roster_story_type, log_prefix=log_prefix
            )
    elif roster_story_type in ["best", "new", "top"]:
        def fetch():
            return get_roster_via_endpoint(roster_story_type, log_prefix=log_prefix)
    else:
        logger.info(
            log_prefix
            + f"Error: cannot get a roster for unrecognized story type {roster_story_type}"
        )
        return roster
    try:
        roster = fetch()
    except Exception as exc:
        exc_module = exc.__class__.__module__
        exc_short_name = exc.__class__.__name__
        exc_name = exc_module + "." + exc_short_name
        exc_msg = str(exc)
        exc_slug = exc_name + ": " + exc_msg
        logger.error("unexpected exception: " + exc_slug)
        tb_str = traceback.format_exc()
        logger.error(tb_str)
    return roster
def get_roster_via_endpoint(story_type: str, log_prefix=""):
    """Fetch the story-id roster for *story_type* from the Firebase endpoint.

    Returns the query result, or an empty list when the query yields nothing.
    """
    endpoint_path = f"/v0/{story_type}stories.json"
    result = utils_http.firebaseio_endpoint_query(
        query=endpoint_path, log_prefix=log_prefix
    )
    if result:
        return result
    return []
def get_roster_via_screen_scraping(roster_story_type: str = None, log_prefix=""):
    """Build the 'active' or 'classic' roster by scraping news.ycombinator.com.

    A recent (< 2h old) roster cached in S3 is reused. Otherwise HN pages are
    paginated until no more rows appear; the fresh roster is uploaded to S3.
    On scrape/upload failure, falls back to the stale S3 roster if available.

    Returns a list of story ids (possibly empty).
    Raises: propagates CouldNotGetObjectFromS3Error from the S3 read.
    """
    log_prefix += "get_roster_via_screen_scraping: "
    if roster_story_type not in ["active", "classic"]:
        raise Exception(
            log_prefix
            + f"cannot create a roster for unrecognized story type '{roster_story_type}'"
        )
    prev_roster_as_dict = None
    try:
        prev_roster_as_dict = utils_aws.get_json_from_s3_as_dict(
            f"rosters/{roster_story_type}_roster.json"
        )
        if prev_roster_as_dict:
            prev_time = prev_roster_as_dict["time_retrieved"]
            cur_time = utils_time.get_time_now_in_epoch_seconds_int()
            roster_age_seconds = cur_time - prev_time
            # reuse the cached roster if it is less than two hours old
            if 0 < roster_age_seconds < 2 * utils_time.SECONDS_PER_HOUR:
                if prev_roster_as_dict["story_ids"]:
                    logger.info(
                        log_prefix
                        + f"reusing old '{roster_story_type}' roster with length {len(prev_roster_as_dict['story_ids'])} since it's still recent"
                    )
                    return prev_roster_as_dict["story_ids"]
            else:
                logger.info(
                    log_prefix + f"previous '{roster_story_type}' roster is too old"
                )
        else:
            logger.info(
                log_prefix
                + f"previous '{roster_story_type}' roster doesn't seem to exist"
            )
    except CouldNotGetObjectFromS3Error as exc:
        # deliberate: a failed S3 read aborts the whole refresh
        logger.warning(log_prefix + f"{str(exc)}")
        raise
    # invariant now: prev_roster_as_dict is None, or else prev_roster_as_dict["story_ids"] data is more than 2 hours old
    cur_page = 1
    tries_left = 3
    cur_roster = []
    try:
        while True:
            url = f"https://news.ycombinator.com/{roster_story_type}?p={str(cur_page)}"
            try:
                # pass the caller's prefix through (previously dropped as "")
                page_source = utils_http.get_page_source(url=url, log_prefix=log_prefix)
            except FailedAfterRetrying:
                break
            if page_source:
                soup = BeautifulSoup(page_source, "lxml")
                tr_els = soup.find_all(name="tr", class_="athing")
                if not tr_els:
                    # empty page: either end of listing or a transient miss
                    tries_left -= 1
                    if tries_left == 0:
                        break
                    else:
                        continue
                for row in tr_els:
                    cur_roster.append(int(row["id"]))
                cur_page += 1
            else:
                tries_left -= 1
                if tries_left == 0:
                    break
                else:
                    time.sleep(3)  # courtesy pause between scrape attempts
                    continue
    except Exception as exc:
        logger.error(f"Error during while loop: {str(exc)}")
        logger.error(traceback.format_exc())
    if not cur_roster:
        if prev_roster_as_dict:
            logger.info(
                log_prefix
                + f"reusing previous '{roster_story_type}' roster since new roster couldn't be scraped"
            )
            return prev_roster_as_dict["story_ids"]
        else:
            logger.info(
                log_prefix
                + f"no previous '{roster_story_type}' roster, and no current roster either"
            )
            return []
    else:
        new_roster = {}
        new_roster["story_type"] = roster_story_type
        new_roster["time_retrieved"] = utils_time.get_time_now_in_epoch_seconds_int()
        new_roster["story_ids"] = cur_roster
        res = utils_aws.upload_roster_to_s3(
            roster_dict=new_roster,
            roster_dest_fullpath=f"rosters/{roster_story_type}_roster.json",
        )
        if res:
            logger.info(
                log_prefix
                + f"populated a fresh '{roster_story_type}' roster with length {len(cur_roster)}"
            )
            return cur_roster
        else:
            logger.error(
                log_prefix
                + f"failed to upload fresh '{roster_story_type}' roster to S3"
            )
            if prev_roster_as_dict:
                logger.info(
                    log_prefix
                    + f"reusing previous '{roster_story_type}' roster since new roster couldn't be uploaded"
                )
                return prev_roster_as_dict["story_ids"]
            else:
                logger.info(
                    log_prefix
                    + f"no previous '{roster_story_type}' roster, and no current roster either"
                )
                return []
# NOTE: dead code below — superseded by utils_http.handle_exception (which the
# functions above call); kept commented out for reference.
# def handle_exception(exc: Exception = None, log_prefix="", context=None):
# short_exc_name = exc.__class__.__name__
# exc_name = exc.__class__.__module__ + "." + short_exc_name
# exc_msg = str(exc)
# exc_slug = f"{exc_name}: {exc_msg}"
# if isinstance(exc, requests.exceptions.ConnectTimeout):
# if "Max retries exceeded with url" in exc_msg:
# url = context["url"] if (context and "url" in context) else ""
# if url:
# url_slug = f" for url {url}"
# else:
# url_slug = ""
# logger.info(log_prefix + exc_name + url_slug)
# else:
# tb_str = traceback.format_exc()
# logger.error(log_prefix + "unexpected exception: " + exc_slug)
# logger.error(log_prefix + tb_str)
# elif isinstance(exc, requests.exceptions.ConnectionError):
# if any(
# _ in exc_msg
# for _ in ["Failed to resolve", "No address associated with hostname"]
# ):
# url = context["url"] if (context and "url" in context) else ""
# if url:
# url_slug = f" for url {url}"
# else:
# url_slug = ""
# logger.info(log_prefix + exc_name + url_slug)
# else:
# tb_str = traceback.format_exc()
# logger.error(log_prefix + "unexpected exception: " + exc_slug)
# logger.error(log_prefix + tb_str)
# elif isinstance(exc, requests.exceptions.ReadTimeout):
# if "Read timed out." in exc_msg:
# url = context["url"] if (context and "url" in context) else ""
# if url:
# url_slug = f" for url {url}"
# else:
# url_slug = ""
# logger.info(log_prefix + exc_name + url_slug)
# else:
# tb_str = traceback.format_exc()
# logger.error(log_prefix + "unexpected exception: " + exc_slug)
# logger.error(log_prefix + tb_str)
# elif isinstance(exc, requests.exceptions.HTTPError):
# if "502 Server Error: Bad Gateway for url" in exc_msg:
# url = context["url"] if (context and "url" in context) else ""
# if url:
# url_slug = f" for url {url}"
# else:
# url_slug = ""
# logger.info(log_prefix + exc_name + url_slug)
# else:
# tb_str = traceback.format_exc()
# logger.error(log_prefix + "unexpected exception: " + exc_slug)
# logger.error(log_prefix + tb_str)
# else:
# tb_str = traceback.format_exc()
# logger.error(log_prefix + "unexpected exception: " + exc_slug)
# logger.error(log_prefix + tb_str)