Commit f3b98dc (initial commit): init

14 files changed (+622 / -0 lines)

.gitignore (+10 lines)

.editorconfig
.env
env
venv
__pycache__
*.pyc
.DS_Store
output_*
.pytest*
.vscode

README.md (+39 lines)

# Concurrent Web Scraping with Python and Selenium

## Want to learn how to build this project?

Check out the [blog post](https://testdriven.io/blog/building-a-concurrent-web-scraper-with-python-and-selenium/).

## Want to use this project?

1. Fork/Clone

1. Create and activate a virtual environment (example commands below)

1. Install the requirements (example commands below)

1. Run the scrapers:

    ```sh
    # sync
    (env)$ python script.py headless

    # parallel with multiprocessing
    (env)$ python script_parallel_1.py headless

    # parallel with concurrent.futures
    (env)$ python script_parallel_2.py headless

    # concurrent with concurrent.futures (should be the fastest!)
    (env)$ python script_concurrent.py headless

    # parallel with concurrent.futures and concurrent with asyncio
    (env)$ python script_asyncio.py headless
    ```

1. Run the tests:

    ```sh
    (env)$ python -m pytest test/test_scraper.py
    (env)$ python -m pytest test/test_scraper_mock.py
    ```
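For the virtual environment and requirements steps above, a minimal setup sketch, assuming Python 3 with the built-in `venv` module on macOS/Linux (the `env` directory name matches the `.gitignore` entry and the `(env)$` prompts used above):

```sh
# create and activate a virtual environment named "env"
$ python3 -m venv env
$ source env/bin/activate

# install the pinned dependencies
(env)$ pip install -r requirements.txt
```

The scrapers also assume Chrome and a matching ChromeDriver binary are available on your PATH, since `webdriver.Chrome()` drives a local browser.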

requirements.txt (+4 lines)

beautifulsoup4==4.9.1
pytest==6.0.1
requests==2.24.0
selenium==3.141.0

scrapers/__init__.py (whitespace-only file; marks scrapers/ as a Python package)

scrapers/scraper.py (+99 lines)

import csv
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# project root (two levels up from this file)
BASE_DIR = Path(__file__).resolve(strict=True).parent.parent


def get_driver(headless):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument("--headless")

    # initialize driver
    driver = webdriver.Chrome(options=options)
    return driver


def connect_to_base(browser, page_number):
    base_url = f"https://news.ycombinator.com/news?p={page_number}"
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(base_url)
            # wait for table element with id = 'hnmain' to load
            # before returning True
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.ID, "hnmain"))
            )
            return True
        except Exception as e:
            print(e)
            connection_attempts += 1
            print(f"Error connecting to {base_url}.")
            print(f"Attempt #{connection_attempts}.")
    return False


def parse_html(html):
    # create soup object
    soup = BeautifulSoup(html, "html.parser")
    output_list = []
    # parse soup object to get article id, rank, score, and title
    tr_blocks = soup.find_all("tr", class_="athing")
    article = 0
    for tr in tr_blocks:
        article_id = tr.get("id")
        article_url = tr.find_all("a")[1]["href"]
        # check if article is a hacker news article
        if "item?id=" in article_url:
            article_url = f"https://news.ycombinator.com/{article_url}"
        load_time = get_load_time(article_url)
        try:
            score = soup.find(id=f"score_{article_id}").string
        except Exception as e:
            print(e)
            score = "0 points"
        article_info = {
            "id": article_id,
            "load_time": load_time,
            "rank": tr.span.string,
            "score": score,
            "title": tr.find(class_="storylink").string,
            "url": article_url,
        }
        # append article_info to output_list
        output_list.append(article_info)
        article += 1
    return output_list


def get_load_time(article_url):
    try:
        # set headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        print(e)
        load_time = "Loading Error"
    return load_time


def write_to_file(output_list, filename):
    for row in output_list:
        # open in append mode so each batch of rows is added to the same CSV file
        with open(Path(BASE_DIR).joinpath(filename), "a") as csvfile:
            fieldnames = ["id", "load_time", "rank", "score", "title", "url"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)

script.py (+46 lines)

import datetime
import sys
from time import sleep, time

from scrapers.scraper import connect_to_base, get_driver, parse_html, write_to_file


def run_process(page_number, filename, browser):
    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print("Error connecting to hacker news")


if __name__ == "__main__":

    # headless mode?
    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True

    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"

    # init browser
    browser = get_driver(headless=headless)

    # scrape and crawl
    while current_page <= 20:
        print(f"Scraping page #{current_page}...")
        run_process(current_page, output_filename, browser)
        current_page = current_page + 1

    # exit
    browser.quit()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time} seconds")

script_asyncio.py (+62 lines)

import asyncio
import datetime
import sys
from concurrent.futures import ProcessPoolExecutor
from time import sleep, time

from scrapers.scraper import connect_to_base, get_driver, parse_html, write_to_file


def run_process(page_number, filename, headless):

    # init browser
    browser = get_driver(headless)

    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)

        # exit
        browser.quit()
    else:
        print("Error connecting to hackernews")
        browser.quit()


async def run_blocking_tasks(executor):
    # headless mode?
    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True

    loop = asyncio.get_event_loop()

    # set variables
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"

    # scrape and crawl
    blocking_tasks = [
        loop.run_in_executor(executor, run_process, i, output_filename, headless)
        for i in range(1, 21)
    ]
    completed, pending = await asyncio.wait(blocking_tasks)


if __name__ == "__main__":
    start_time = time()
    executor = ProcessPoolExecutor()

    # create event loop
    event_loop = asyncio.get_event_loop()
    try:
        event_loop.run_until_complete(run_blocking_tasks(executor))
    finally:
        event_loop.close()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time} seconds")

script_concurrent.py (+52 lines)

import datetime
import sys
from concurrent.futures import ThreadPoolExecutor, wait
from time import sleep, time

from scrapers.scraper import connect_to_base, get_driver, parse_html, write_to_file


def run_process(page_number, filename, headless):

    # init browser
    browser = get_driver(headless)

    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)

        # exit
        browser.quit()
    else:
        print("Error connecting to hackernews")
        browser.quit()


if __name__ == "__main__":

    # headless mode?
    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True

    # set variables
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"
    futures = []

    # scrape and crawl
    with ThreadPoolExecutor() as executor:
        for number in range(1, 21):
            futures.append(
                executor.submit(run_process, number, output_filename, headless)
            )

    wait(futures)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time} seconds")

script_parallel_1.py (+50 lines)

import datetime
import sys
from itertools import repeat
from multiprocessing import Pool, cpu_count
from time import sleep, time

from scrapers.scraper import connect_to_base, get_driver, parse_html, write_to_file


def run_process(page_number, filename, headless):

    # init browser
    browser = get_driver(headless)

    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)

        # exit
        browser.quit()
    else:
        print("Error connecting to hackernews")
        browser.quit()


if __name__ == "__main__":

    # headless mode?
    headless = False
    if len(sys.argv) > 1:
        if sys.argv[1] == "headless":
            print("Running in headless mode")
            headless = True

    # set variables
    start_time = time()
    output_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_filename = f"output_{output_timestamp}.csv"

    # scrape and crawl
    with Pool(cpu_count() - 1) as p:
        p.starmap(run_process, zip(range(1, 21), repeat(output_filename), repeat(headless)))
        p.close()
        p.join()

    end_time = time()
    elapsed_time = end_time - start_time
    print(f"Elapsed run time: {elapsed_time} seconds")
