Commit 72ea57c

Created a Reddit Scraper for different topics
1 parent c42ad4b commit 72ea57c

File tree

1 file changed, +60 -0 lines changed


reddit_scraper.py

import csv
import time

import requests
from bs4 import BeautifulSoup


class HaikuScraper:
    """
    Scrapes haikus (short Japanese poems) from a subreddit on old.reddit.com.
    """
    def __init__(self, url: str, headers: dict):
        self.url = url
        self.headers = headers

    def make_request(self) -> BeautifulSoup:
        # Pause before every request to avoid hammering Reddit's servers.
        time.sleep(3)
        page = requests.get(self.url, headers=self.headers)
        page.raise_for_status()
        return BeautifulSoup(page.text, 'html.parser')

    def get_next_page(self, soup: BeautifulSoup):
        # The "next" button only exists on old.reddit.com listing pages;
        # return None on the last page instead of crashing on a missing tag.
        next_button = soup.find('span', class_='next-button')
        if next_button is None:
            return None
        return next_button.find('a').attrs['href']

    def get_haikus(self, soup: BeautifulSoup) -> list:
        # Post titles carry the classes "title" and "may-blank"; a CSS
        # selector is more robust than matching the exact class string
        # (the original matched "title may-blank " with a trailing space).
        return [title.get_text() for title in soup.select('a.title.may-blank')]

    def write_haikus_to_csv(self, haikus: list):
        # newline='' lets the csv module control line endings; the context
        # manager closes the file, so no explicit close() is needed.
        with open('scraped_haikus_v2.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            for haiku in haikus:
                writer.writerow([haiku])


url = "https://old.reddit.com/r/haiku/"
# Headers to mimic a browser visit
headers = {'User-Agent': 'Mozilla/5.0'}

scraper = HaikuScraper(url, headers)
soup = scraper.make_request()

haikus = scraper.get_haikus(soup)
scraper.write_haikus_to_csv(haikus)

# Follow "next" links for up to 2,500 further pages, stopping early
# if the listing runs out of pages.
for counter in range(1, 2501):
    link = scraper.get_next_page(soup)
    if link is None:
        break
    print(f"Page {counter + 1}. Link {link}.")
    scraper = HaikuScraper(link, headers)
    soup = scraper.make_request()
    haikus = scraper.get_haikus(soup)
    scraper.write_haikus_to_csv(haikus)
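
The commit message mentions scraping different topics. A minimal sketch of that reuse, assuming the target subreddit renders with the same old.reddit markup (the r/poetry URL and the poetry_* names below are hypothetical examples, not part of the commit):

# Hypothetical example: point the same class at another subreddit.
poetry_scraper = HaikuScraper("https://old.reddit.com/r/poetry/", headers)
poetry_soup = poetry_scraper.make_request()
poetry_titles = poetry_scraper.get_haikus(poetry_soup)
poetry_scraper.write_haikus_to_csv(poetry_titles)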
