autosources.py
import collections
import csv
import json
import re
import sys
import time
from urllib.parse import urlparse, urljoin, quote
from urllib.request import urlopen, Request

import nltk
import lxml.html

REGEX_TWITTER = re.compile(r"\Ahttps?://(www\.)?twitter\.com/(?P<username>\w{1,15})\Z")
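# Example: the pattern matches profile URLs such as "https://twitter.com/jack"
# (username "jack"), but not status URLs like "https://twitter.com/jack/status/20".

# Browser-like headers so servers are less likely to reject the crawler as a bot.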
HTTP_HEADER = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'
}
# TODO: Rewrite this with Selenium
# TODO: Integrate OpenCV to match images
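# A minimal sketch of the Selenium rewrite mentioned above (assumes the
# selenium package and a matching browser driver are installed; untested):
#
#   from selenium import webdriver
#   driver = webdriver.Chrome()
#   driver.get(url)
#   htm = lxml.html.fromstring(driver.page_source)
#   driver.quit()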
def twittercrawl(url0, visited=None, depth=-1):
    """Breadth-first crawl from url0, staying on the same host, collecting
    Twitter handles from profile links. depth is a page budget: once it
    reaches 0, no new pages are enqueued (-1 means unlimited)."""
    if visited is None:  # avoid sharing a mutable default between calls
        visited = set()
    handles = []
    queue = collections.deque([url0])
    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        print(url, file=sys.stderr)
        visited.add(url)
        try:
            htm = lxml.html.parse(urlopen(Request(url, headers=HTTP_HEADER)))
        except Exception:
            continue
        for a in htm.xpath("//a"):
            a_href = a.get("href")
            if a_href is None:
                continue
            regex = REGEX_TWITTER.match(a_href)
            if regex is not None:
                handles.append(regex.group("username"))
            newurl = urlparse(a_href)
            oldurl = urlparse(url)
            if newurl.netloc and newurl.netloc != oldurl.netloc:
                continue  # off-site link: handle recorded above, not followed
            elif not newurl.netloc and not newurl.path:
                continue  # empty href or bare fragment
            nexturl = urljoin(url, a_href)  # resolve relative links correctly
            if depth != 0:
                queue.append(nexturl)
        depth -= 1
    return handles
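
# Example (hypothetical URL): twittercrawl("https://example.org", depth=20)
# visits up to ~20 pages on example.org and returns every Twitter username
# linked from them, duplicates included.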

# Use SerpApi to get the first few Google search results for query.
# Note: SerpApi normally also expects an api_key parameter in the URL.
def google_search(query):
    # Throttle: at most one request every 5 seconds.
    new_t = time.monotonic()
    dt = new_t - google_search.t
    google_search.t = new_t
    if dt < 5:
        time.sleep(5 - dt)
    url = "https://serpapi.com/search.json?hl=en&gl=us&q=" + quote(query, safe="")
    with urlopen(url) as request:
        r = json.loads(request.read())
    return [x["link"] for x in r["organic_results"]]
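
# Example (hypothetical query): google_search("Example Org twitter") might
# return ["https://example.org/", "https://twitter.com/exampleorg", ...].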

# Finds the closest match to str0 in the iterable strs, comparing
# stemmed, lowercased token strings by edit (Levenshtein) distance.
def fuzzy_match(str0, strs):
    stemmer = nltk.stem.PorterStemmer()
    def normalize(s):
        words = nltk.tokenize.wordpunct_tokenize(s.lower().strip())
        return ' '.join(stemmer.stem(w) for w in words)
    str0_norm = normalize(str0)
    def edit_distance(s):
        return nltk.edit_distance(str0_norm, normalize(s))
    return min(strs, key=edit_distance)
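
# Example: fuzzy_match("Example Organisation", ["exampleorg", "otherco"])
# compares the stemmed forms (roughly "exampl organis" vs. "exampleorg" and
# "otherco") and returns "exampleorg", the candidate at the smaller distance.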

google_search.t = time.monotonic()  # timestamp of the most recent SerpApi call

if __name__ == "__main__":
    # For each input row, treat column 0 as a name to look up and append a
    # column containing the best-guess Twitter handle (empty on failure).
    csvin = csv.reader(sys.stdin)
    csvout = csv.writer(sys.stdout)
    for irow in csvin:
        orow = irow + [""]
        handles = []
        visited = set()
        match = None
        try:
            for url in google_search(orow[0]):
                handles += twittercrawl(url, visited=visited, depth=20)
            if handles:
                match = fuzzy_match(orow[0], handles)
        except Exception:
            pass  # leave the new column empty if search or crawl fails
        if match is not None:
            orow[-1] = match
        csvout.writerow(orow)
        #sys.stdout.flush()
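
# Usage example (hypothetical file names):
#   python autosources.py < organizations.csv > organizations_with_handles.csv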