-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.py
executable file
·61 lines (46 loc) · 1.5 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/python3
from bs4 import BeautifulSoup
import urllib.request
import os
import yaml
# Directory the crawler changes into before downloading.
# It is not created by this script, so it has to exist before the run starts.
dest_folder = "img"

# Maps each saved image filename to the title text of that comic's <img> tag.
alt_texts = {}

# Severity labels passed to log().
INFO = "INFO"
ERROR = "ERROR"

# Active verbosity: with INFO every message is printed, with ERROR only
# messages whose level is ERROR are printed.
log_level = INFO
def getImages(first=1, last=9):
    """Download xkcd comics ``first``..``last`` and record their title texts.

    For every comic number the page ``http://xkcd.com/<n>`` is fetched and the
    first ``<img>`` whose ``src`` contains ``comics`` is saved as ``<n><ext>``
    in ``dest_folder``. The tag's ``title`` attribute is stored in
    ``alt_texts`` under that filename. After the loop the collected texts are
    written out via ``saveAltTexts()``.

    Args:
        first: Number of the first comic to fetch (inclusive).
        last: Number of the last comic to fetch (inclusive). The defaults
            reproduce the previously hard-coded range of comics 1-9.

    Side effects:
        Changes the process working directory to ``dest_folder`` (which must
        already exist), writes image files there and mutates ``alt_texts``.

    Raises:
        OSError: If ``dest_folder`` cannot be entered.
    """
    # Function-scope imports keep this block usable without touching the
    # module's import section.
    from urllib.error import URLError
    from urllib.parse import urljoin, urlsplit

    known_extensions = ('.jpg', '.jpeg', '.png', '.gif')

    os.chdir(dest_folder)
    for number in range(first, last + 1):
        url = 'http://xkcd.com/' + str(number)
        try:
            # Hold the connection only for the download itself; parsing and
            # the image transfer happen after it has been closed.
            with urllib.request.urlopen(url) as conn:
                page = conn.read()
        except URLError as err:
            # A single missing or unreachable page must not abort the whole
            # run, otherwise the alt texts gathered so far are never saved.
            log(ERROR, 'Could not fetch ' + url + ': ' + str(err))
            continue

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(page, 'html.parser')
        for img in soup.find_all('img'):
            # Tags without a src attribute are skipped instead of raising
            # KeyError.
            src = img.get('src')
            if not src or 'comics' not in src:
                continue

            # Only the first comic image on the page is handled: every path
            # below leaves the loop.
            log(INFO, 'Getting ' + src)

            # Resolve protocol-relative ("//host/...") and relative sources
            # against the page URL; urlretrieve needs an absolute URL.
            image_url = urljoin(url, src)

            # Decide the format from the URL path's suffix rather than from a
            # substring match anywhere in the URL.
            extension = os.path.splitext(urlsplit(image_url).path)[1].lower()
            if extension not in known_extensions:
                log(ERROR, 'Did not recognize image format: ' + src)
                break

            filename = str(number) + extension
            try:
                urllib.request.urlretrieve(image_url, filename)
            except URLError as err:
                log(ERROR, 'Could not download ' + image_url + ': ' + str(err))
                break

            # A missing title is stored as an empty string instead of raising
            # KeyError after the image has already been written.
            alt_texts[filename] = img.get('title', '')
            break
    saveAltTexts()
def saveAltTexts():
    """Write the collected ``alt_texts`` mapping to ``alt.yaml`` as YAML.

    The file is opened in ``'w'`` mode relative to the current working
    directory, so an existing ``alt.yaml`` there is overwritten.
    """
    with open('alt.yaml', 'w') as yaml_file:
        yaml.dump(alt_texts, stream=yaml_file)
def log(level, msg):
    """Print ``"<level>: <msg>"`` if the module's ``log_level`` allows it.

    With ``log_level == INFO`` every message is printed. With
    ``log_level == ERROR`` only messages whose ``level`` is ``ERROR`` are
    printed. Any other ``log_level`` prints nothing.

    Args:
        level: Severity label of this message (``INFO`` or ``ERROR``).
        msg: Message text; it is concatenated to ``level``, so it must be a
            string.
    """
    print_everything = log_level == INFO
    print_errors_only = log_level == ERROR
    if print_everything or (print_errors_only and level == ERROR):
        print(level + ': ' + msg)
# Start the crawl only when this file is executed as a script, so that
# importing the module does not trigger network downloads and a chdir.
if __name__ == '__main__':
    getImages()