-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmto-scrape.py
36 lines (28 loc) · 949 Bytes
/
mto-scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import sys

from lxml import html
from lxml import etree
# Address of the first archive page; further pages are requested by
# appending "?p=<n>" to this URL.
ARCHIVE_URL = "http://mto.mediatakeout.com/archive"
def writeHeadlines(url, txtfile):
    """
    Write the headlines of a single archive page to a file.

    Parses the page at ``url`` with lxml, writes the text of every
    ``a.link`` / ``a.article`` element to ``txtfile`` (one headline per
    line) and returns the integer value of the last pagination link.
    The return value is really just for convenience.

    If MTO changes their DOM structure, this could break.

    Args:
        url: Location of the archive page (anything ``lxml.html.parse``
            accepts).
        txtfile: Open, writable file object that receives the headlines.

    Returns:
        The text of the first ``div.paging p.tr a:last-child`` match,
        converted to ``int``.

    Raises:
        IndexError: If the page contains no matching pagination link.
        ValueError: If the pagination link text is not an integer.
    """
    # Parse once and reuse the root for both selector queries.
    root = html.parse(url).getroot()

    # The page count is read before any headline is written, so a page
    # without pagination fails fast and writes nothing.
    last_page_link = root.cssselect('div.paging p.tr a:last-child')[0]
    limit = int(last_page_link.text_content())

    for element in root.cssselect('a.link,a.article'):
        line = element.text_content()
        # On Python 2 lxml returns a unicode object for non-ASCII text,
        # which a file opened with plain open() cannot write; encode it
        # so those headlines are kept instead of being dropped. On
        # Python 3 the result is already a str and is left as is.
        if not isinstance(line, str):
            line = line.encode("utf-8")
        try:
            txtfile.write(line + "\n")
        except UnicodeError:
            # Best effort: skip a headline the target file's encoding
            # cannot represent. Other errors (e.g. a failing disk) are
            # no longer silenced.
            continue
    return limit
# Iterate over all pages and scrape headline text.
#
# The ``with`` block closes headlines.txt even when fetching or parsing a
# page raises part-way through the run.
with open("headlines.txt", 'w') as txtfile:
    # The first request also reports the number of the last archive page.
    limit = writeHeadlines(ARCHIVE_URL, txtfile)
    # NOTE(review): range(1, limit) requests ?p=1 .. ?p=limit-1. Confirm
    # whether the site's ?p= parameter is zero-based; if it is one-based,
    # the last page is never fetched and ?p=1 repeats the first page.
    for i in range(1, limit):
        writeHeadlines(ARCHIVE_URL + "?p=" + str(i), txtfile)
        # Progress indicator: page numbers on one line, space separated.
        # Flushed so each number shows up as soon as its page is done.
        sys.stdout.write(str(i) + " ")
        sys.stdout.flush()

# Terminate the progress line, if anything was printed.
if limit > 1:
    sys.stdout.write("\n")