contents.pyx
import csv
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup


def fetch_and_parse(url):
    """Fetch a page and extract its title and paragraph text.

    Returns a dict with 'title', 'url', and 'content' keys, or None if the
    page could not be retrieved or the expected elements are missing.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"[ FAILED ]: failed to retrieve the page. "
                  f"status code -> {response.status_code} for url -> {url}")
            return None
        soup = BeautifulSoup(response.text, 'html.parser')
        # These selectors are tied to the target site's markup and will
        # break if its layout changes.
        title_element = soup.select_one(
            "body > div > main > div:nth-of-type(2) > "
            "div:nth-of-type(1) > div:nth-of-type(1) > "
            "div:nth-of-type(2) > div > div:nth-of-type(1) > a > h1"
        )
        if title_element is None:
            print(f"[ FAILED ]: title element not found for url -> {url}")
            return None
        title = title_element.get_text()
        target_div = soup.select_one(
            "body > div > main > div:nth-of-type(2) > "
            "div:nth-of-type(1) > div:nth-of-type(1) > div:nth-of-type(3)"
        )
        if target_div is None:
            print(f"[ FAILED ]: div not found for url -> {url}")
            return None
        # Join the text of every paragraph in the content div into one string.
        content = ' '.join(p.get_text() for p in target_div.find_all('p'))
        return {'title': title, 'url': url, 'content': content}
    except Exception as e:
        print(f"[ FAILED ]: {url} : {e}")
        return None
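
# Example call for a single page (hypothetical URL, shown for illustration):
#   row = fetch_and_parse("https://example.com/article")
#   if row:
#       print(row['title'], len(row['content']))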


def save_contents(workers):
    """Scrape every URL listed in links.json and write the results to
    datasets.csv, fetching pages concurrently with a thread pool."""
    # links.json is expected to hold an object with a "links" key that maps
    # to a list of URLs.
    with open("links.json", 'r') as file:
        data = json.load(file)
    with open("datasets.csv", 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['title', 'url', 'content']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        with ThreadPoolExecutor(max_workers=workers) as executor:
            future_to_url = {
                executor.submit(fetch_and_parse, url): url
                for url in data['links']
            }
            # Rows are written from this (main) thread as futures complete,
            # so the csv writer never needs locking.
            for future in as_completed(future_to_url):
                result = future.result()
                if result:
                    writer.writerow(result)
                    print(f"[ SUCCESS ]: {result['title']}")
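
# Minimal usage sketch (assumptions: the module is run directly as a plain
# Python script, links.json sits next to it and holds
# {"links": ["https://...", ...]}, and the worker count of 8 is arbitrary).
if __name__ == '__main__':
    save_contents(workers=8)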