Commit f3b98dc (initial commit): init

14 files changed (+622 / -0 lines)

.gitignore (+10 lines)

.editorconfig
.env
env
venv
__pycache__
*.pyc
.DS_Store
output_*
.pytest*
.vscode

README.md (+39 lines)

# Concurrent Web Scraping with Python and Selenium

## Want to learn how to build this project?

Check out the [blog post](https://testdriven.io/blog/building-a-concurrent-web-scraper-with-python-and-selenium/).

## Want to use this project?

1. Fork/Clone

1. Create and activate a virtual environment (example commands below)

1. Install the requirements (example commands below)

1. Run the scrapers:

    ```sh
    # sync
    (env)$ python script.py headless

    # parallel with multiprocessing
    (env)$ python script_parallel_1.py headless

    # parallel with concurrent.futures
    (env)$ python script_parallel_2.py headless

    # concurrent with concurrent.futures (should be the fastest!)
    (env)$ python script_concurrent.py headless

    # parallel with concurrent.futures and concurrent with asyncio
    (env)$ python script_asyncio.py headless
    ```

1. Run the tests:

    ```sh
    (env)$ python -m pytest test/test_scraper.py
    (env)$ python -m pytest test/test_scraper_mock.py
    ```
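For the virtual environment and requirements steps above, a minimal setup sketch, assuming Python 3 with the built-in `venv` module on macOS/Linux (the `env` directory name matches the `.gitignore` entry and the `(env)$` prompts used above):

```sh
# create and activate a virtual environment named "env"
$ python3 -m venv env
$ source env/bin/activate

# install the pinned dependencies
(env)$ pip install -r requirements.txt
```

The scrapers also assume Chrome and a matching ChromeDriver binary are available on your PATH, since `webdriver.Chrome()` drives a local browser.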

requirements.txt (+4 lines)

beautifulsoup4==4.9.1
pytest==6.0.1
requests==2.24.0
selenium==3.141.0

scrapers/__init__.py (whitespace-only file; marks scrapers/ as a Python package)

scrapers/scraper.py (+99 lines)

import csv
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# project root (two levels up from this file)
BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def get_driver(headless):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("--headless")

    # initialize driver
    driver = webdriver.Chrome(options=options)
    return driver


def connect_to_base(browser, page_number):
    base_url = f"https://news.ycombinator.com/news?p={page_number}"
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(base_url)
            # wait for table element with id = 'hnmain' to load
            # before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.ID, "hnmain"))
            )
            return True
        except Exception as e:
            print(e)
            connection_attempts += 1
            print(f"Error connecting to {base_url}.")
            print(f"Attempt #{connection_attempts}.")
    return False


def parse_html(html):
    # create soup object
    soup = BeautifulSoup(html, "html.parser")
    output_list = []
    # parse soup object to get article id, rank, score, and title
    tr_blocks = soup.find_all("tr", class_="athing")
    article = 0
    for tr in tr_blocks:
        article_id = tr.get("id")
        article_url = tr.find_all("a")[1]["href"]
        # check if article is a hacker news article
        if "item?id=" in article_url:
            article_url = f"https://news.ycombinator.com/{article_url}"
        load_time = get_load_time(article_url)
        try:
            score = soup.find(id=f"score_{article_id}").string
        except Exception as e:
            print(e)
            score = "0 points"
        article_info = {
            "id": article_id,
            "load_time": load_time,
            "rank": tr.span.string,
            "score": score,
            "title": tr.find(class_="storylink").string,
            "url": article_url,
        }
        # append article_info to output_list
        output_list.append(article_info)
        article += 1
    return output_list


def get_load_time(article_url):
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename):
    for row in output_list:
        # open in append mode so each batch of rows is added to the same CSV file
        with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
            fieldnames = ["id", "load_time", "rank", "score", "title", "url"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)

script.py (+46 lines)

import datetime
import sys
from time import sleep, time

from scrapers.scraper import connect_to_base, get_driver, parse_html, write_to_file


def run_process(page_number, filename, browser):
    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print("Error connecting to hacker news")


if __name__ == "__main__":

    # headless mode?
    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True

    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"

    # init browser
    browser = get_driver(headless=headless)

    # scrape and crawl
    while current_page <= 20:
        print(f"Scraping page #{current_page}...")
        run_process(current_page, output_filename, browser)
        current_page = current_page + 1

    # exit
    browser.quit()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time} seconds")

script_asyncio.py (+62 lines)

import asyncio
import datetime
import sys
from concurrent.futures import ProcessPoolExecutor
from time import sleep, time

from scrapers.scraper import connect_to_base, get_driver, parse_html, write_to_file


def run_process(page_number, filename, headless):

    # init browser
    browser = get_driver(headless)

    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)

        # exit
        browser.quit()
    else:
        print("Error connecting to hackernews")
        browser.quit()


async def run_blocking_tasks(executor):
    # headless mode?
    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True

    loop = asyncio.get_event_loop()

    # set variables
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"

    # scrape and crawl
    blocking_tasks = [
        loop.run_in_executor(executor, run_process, i, output_filename, headless)
        for i in range(1, 21)
    ]
    completed, pending = await asyncio.wait(blocking_tasks)


if __name__ == "__main__":
    start_time = time()
    executor = ProcessPoolExecutor()

    # create event loop
    event_loop = asyncio.get_event_loop()
    try:
        event_loop.run_until_complete(run_blocking_tasks(executor))
    finally:
        event_loop.close()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time} seconds")

script_concurrent.py (+52 lines)

import datetime
import sys
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time

from scrapers.scraper import connect_to_base, get_driver, parse_html, write_to_file


def run_process(page_number, filename, headless):

    # init browser
    browser = get_driver(headless)

    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)

        # exit
        browser.quit()
    else:
        print("Error connecting to hackernews")
        browser.quit()


if __name__ == "__main__":

    # headless mode?
    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True

    # set variables
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"
    futures = []

    # scrape and crawl
    with ThreadPoolExecutor() as executor:
        for number in range(1, 21):
            futures.append(
                executor.submit(run_process, number, output_filename, headless)
            )

    wait(futures)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time} seconds")

script_parallel_1.py (+50 lines)

import datetime
import sys
from itertools import repeat
from multiprocessing import Pool, cpu_count
from time import sleep, time

from scrapers.scraper import connect_to_base, get_driver, parse_html, write_to_file


def run_process(page_number, filename, headless):

    # init browser
    browser = get_driver(headless)

    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)

        # exit
        browser.quit()
    else:
        print("Error connecting to hackernews")
        browser.quit()


if __name__ == "__main__":

    # headless mode?
    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True

    # set variables
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"

    # scrape and crawl
    with Pool(cpu_count() - 1) as p:
        p.starmap(run_process, zip(range(1, 21), repeat(output_filename), repeat(headless)))
        p.close()
        p.join()

    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time} seconds")
