#!/usr/bin/python3
"""docstring"""
# Tim H 2023
#
# Visits a webpage and scrolls down until it stops giving new interesting links
# Outputs that list of interesting links to a text file. Useful for SPAs like
# Pinterest where a single snapshot of the DOM doesn't list all of the
# urls
#
# References:
# https://www.geeksforgeeks.org/driving-headless-chrome-with-python/
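#
# Example invocation (the URL, pattern, and output filename below are
# illustrative placeholders, not values built into the script):
#   python3 extract_interesting_links_from_url.py \
#       --url="https://www.tiktok.com/@shaq" \
#       --interesting_url_pattern="https://www.tiktok.com/@shaq/video/" \
#       --max_scroll_count=10 \
#       --output_file="shaq_video_links.txt"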
from time import sleep
import sys
import getopt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import numpy as np

WAIT_FOR_PAGE_TO_LOAD_TIME_IN_SEC = 3


def extract_unique_and_interesting_urls(var_driver, interesting_url_pattern_var,
                                        unique_links_output_file,
                                        max_scroll_down_count_var):
"""docstring"""
# interesting_url_pattern='https://www.tiktok.com/@shaq/video/'
# Initialize for first loop
new_links_found = True
# initialize an empty list of interesting URLs
unique_interesting_urls_list = []
# initialize loop counter to zero
loop_counter = 0
# do the scrolling in here, only add new URLs if they aren't
# already in the list
while loop_counter <= max_scroll_down_count_var and new_links_found:
# assume no new links have been found if on second iteration or later
if loop_counter >= 1:
new_links_found = False
# gather the list of all hyperlinks in the current DOM
        lnks = var_driver.find_elements(By.TAG_NAME, 'a')
if len(lnks) > 0:
# iterate through list of found hrefs
for iter_href_elem in lnks:
# extract just the href's URL as string, not object
iter_url = iter_href_elem.get_attribute('href')
# if the href isn't empty
if iter_url is not None:
# If the URL is considered interesting:
if iter_url.startswith(interesting_url_pattern_var):
# if it is new and hasn't been seen before:
if iter_url not in unique_interesting_urls_list:
# add it to the list
unique_interesting_urls_list.append(iter_url)
                            new_links_found = True
print('Scrolling down...')
var_driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
sleep(WAIT_FOR_PAGE_TO_LOAD_TIME_IN_SEC)
loop_counter += 1
print("Total number of interesting URLs (non-unique): ", len(unique_interesting_urls_list))
print("Total number of interesting and unique URLs : ", len(np.unique(unique_interesting_urls_list)))
print("Writing unique and interesting URLs to output file...")
with open(unique_links_output_file, "w", encoding='utf-8') as file_to_write:
file_to_write.write("\n".join(np.unique(unique_interesting_urls_list)))


def main(argv):
    """Parse the command-line arguments, fetch the URL in headless Chrome,
    and extract the interesting links to the output file."""
    # initialize variables with defaults; overwritten by the parsed arguments
    url_to_render = ''
    interesting_url_pattern = ''
    output_file = 'default.txt'
    # html_output_file = 'default.html'
    # screenshot_output_file = 'default.png'
    max_scroll_count = 0
    usage = ('extract_interesting_links_from_url.py --url=<url> '
             '--max_scroll_count=<int> '
             '--interesting_url_pattern=<url> '
             '--output_file=<filename>')
try:
        # extract the command line arguments; only the long options are used
        opts, _args = getopt.getopt(argv, "h",
                                    ["url=", "interesting_url_pattern=",
                                     "max_scroll_count=", "output_file="])
except getopt.GetoptError:
print(usage)
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print(usage)
sys.exit()
elif opt in ("--url"):
# assign the extracted parameter
url_to_render = arg
elif opt in ("--max_scroll_count"):
max_scroll_count = int(arg)
elif opt in ("--interesting_url_pattern"):
interesting_url_pattern = arg
elif opt in ("--output_file"):
output_file = arg
# initialize an empty Options object
options = Options()
    # run in headless mode so no GUI window pops up; the GUI slows things
    # down and prevents running on a machine without a display
options.add_argument("--headless=new")
options.add_argument("--window-size=1920,1200")
# construct a new instance of headless Chrome
print("Initializing browser...")
driver = webdriver.Chrome(options=options)
# have the headless Chrome fetch the specific URL
print("Fetching URL: ", url_to_render)
driver.get(url_to_render)
print("Finished fetching URL, now sleeping...")
# wait for page to finish loading
sleep(WAIT_FOR_PAGE_TO_LOAD_TIME_IN_SEC)
print("Finished sleeping.")
# print("counting links in HTML/DOM")
    extract_unique_and_interesting_urls(driver, interesting_url_pattern,
                                        output_file, max_scroll_count)
# save page's HTML as local file
#page_source = driver.page_source
#with open(html_output_file, "w", encoding='utf-8') as file_to_write:
# print("Writing HTML file: ", html_output_file)
# file_to_write.write(page_source)
# save local screenshot, if specified (not default)
#if screenshot_output_file != 'default.png':
# print("Saving screenshot: ", screenshot_output_file)
# driver.save_screenshot(screenshot_output_file)
# driver.get_screenshot_as_png()
# properly clean up the Chrome instance
print("Shutting down Chrome...")
driver.quit()


if __name__ == "__main__":
    main(sys.argv[1:])
    print("Python script finished successfully.")