scrape_requests.py
67 lines (57 loc) · 2.43 KB
import argparse
import json
import multiprocessing

import requests
import tqdm
from bs4 import BeautifulSoup
from pythainlp.ulmfit import rm_useless_spaces


def get_parallel_texts(parallel_url, timeout=(10, 10), use_min=True,
                       tags=('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'span', 'strong')):
    """Scrape one English/Thai URL pair and return texts aligned per HTML tag."""
    try:
        with requests.get(parallel_url['en_url'], timeout=timeout) as r:
            soup_en = BeautifulSoup(r.content, features='html.parser')
        with requests.get(parallel_url['th_url'], timeout=timeout) as r:
            soup_th = BeautifulSoup(r.content, features='html.parser')
    except Exception as e:
        print(f'Request error: {e}')
        return None
    parallel_texts = []
    for tag in tags:
        tags_en = soup_en.find_all(tag)
        tags_th = soup_th.find_all(tag)
        # Align the two tag lists; if the counts differ, either truncate to the
        # shorter list (use_min) or skip this tag entirely.
        if len(tags_en) != len(tags_th):
            if use_min:
                nb_tags = min(len(tags_en), len(tags_th))
            else:
                continue
        else:
            nb_tags = len(tags_en)
        if nb_tags == 0:
            continue
        for tag_en, tag_th in zip(tags_en[:nb_tags], tags_th[:nb_tags]):
            parallel_texts.append({
                'en_text': rm_useless_spaces(tag_en.get_text(separator=" ")),
                'th_text': rm_useless_spaces(tag_th.get_text(separator=" ")),
            })
    return parallel_texts


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', type=str, help='JSON file with a list of {en_url, th_url} dicts')
    parser.add_argument('--output_path', type=str, help='where to dump the scraped parallel texts as JSON')
    parser.add_argument('--bs', type=int, default=multiprocessing.cpu_count() * 50,
                        help='number of URL pairs scraped per batch')
    args = parser.parse_args()
    with open(args.input_path, 'r') as f:
        parallel_urls = json.load(f)
    print(f'There are {len(parallel_urls)} parallel urls')

    parallel_texts = []
    total_batches = len(parallel_urls) // args.bs + 1
    # checkpoint roughly every 10% of batches; floor at 1 to avoid modulo-by-zero for small inputs
    save_every = max(total_batches // 10, 1)
    for i in tqdm.tqdm(range(total_batches)):
        parallel_urls_subs = parallel_urls[i * args.bs:(i + 1) * args.bs]
        p = multiprocessing.Pool(multiprocessing.cpu_count())
        res = p.map(get_parallel_texts, parallel_urls_subs)
        # each successful result is the list of en/th text pairs scraped from one URL pair
        parallel_texts += [r for r in res if r is not None]
        p.terminate()
        p.join()
        if i % save_every == 0:
            # periodic checkpoint so partial results survive a crash
            with open(args.output_path, 'w') as f:
                json.dump(parallel_texts, f)
    # final save
    with open(args.output_path, 'w') as f:
        json.dump(parallel_texts, f)
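
A minimal usage sketch, assuming a small hand-made input file; the URLs and file names below are placeholders, not part of the repository. The script expects --input_path to point at a JSON list of dicts with 'en_url' and 'th_url' keys and writes the scraped pairs to --output_path.

import json

# Hypothetical sample input: one English/Thai URL pair (placeholder URLs).
sample_urls = [
    {'en_url': 'https://example.com/en/article', 'th_url': 'https://example.com/th/article'},
]
with open('parallel_urls_sample.json', 'w') as f:
    json.dump(sample_urls, f)

# Quick single-pair check without the multiprocessing pool:
# from scrape_requests import get_parallel_texts
# print(get_parallel_texts(sample_urls[0]))

The full run would then be something like: python scrape_requests.py --input_path parallel_urls_sample.json --output_path parallel_texts.json, with --bs controlling how many URL pairs each pool batch scrapes.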