import requests
import csv
import time
from bs4 import BeautifulSoup
class HaikuScraper:
    """Scrape haikus (short Japanese poems) from the r/haiku subreddit.

    Fetches an old-reddit listing page, extracts post titles (the haikus),
    follows "next" pagination links, and appends results to a CSV file.
    """

    def __init__(self, url: str, headers: dict, delay: float = 3.0):
        """
        Args:
            url: Listing-page URL to scrape.
            headers: HTTP headers sent with each request (e.g. User-Agent).
            delay: Seconds to sleep before each request, to rate-limit
                politely. Defaults to 3, matching the original behavior.
        """
        self.url = url
        self.headers = headers
        self.delay = delay

    def make_request(self) -> "BeautifulSoup":
        """Fetch ``self.url`` and return the parsed HTML document.

        Raises:
            requests.HTTPError: if the server answers 4xx/5xx (e.g. a 429
                rate-limit), instead of silently parsing the error page.
        """
        time.sleep(self.delay)  # rate-limit so we don't hammer reddit
        page = requests.get(self.url, headers=self.headers)
        page.raise_for_status()
        return BeautifulSoup(page.text, 'html.parser')

    def get_next_page(self, soup: "BeautifulSoup"):
        """Return the URL of the next listing page, or ``None`` on the
        last page (old reddit omits the next-button entirely there)."""
        time.sleep(self.delay)
        next_button = soup.find('span', class_='next-button')
        if next_button is None:
            return None  # last page reached
        anchor = next_button.find("a")
        if anchor is None:
            return None  # defensive: button present but no link inside
        return anchor.attrs['href']

    def get_haikus(self, soup: "BeautifulSoup") -> list:
        """Extract post titles (the haikus) from a parsed listing page.

        NOTE: the trailing space in "title may-blank " is the exact class
        string old reddit emits, so it must be preserved.
        """
        return [str(title.text)
                for title in soup.find_all("a", class_="title may-blank ")]

    def write_haikus_to_csv(self, haikus: list,
                            path: str = 'scraped_haikus_v2.txt'):
        """Append one haiku per CSV row to *path*.

        Args:
            haikus: Titles to append.
            path: Output file; defaults to the original hard-coded name.
        """
        # newline='' is required by the csv module (avoids blank lines on
        # Windows); the context manager closes the file — no manual close.
        with open(path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            for haiku in haikus:
                writer.writerow([haiku])
url = "https://old.reddit.com/r/haiku/"
# Headers to mimic a browser visit
headers = {'User-Agent': 'Mozilla/5.0'}


def main():
    """Scrape up to 2500 additional pages of r/haiku, appending each
    page's titles to the output file, stopping at the last page."""
    scraper = HaikuScraper(url, headers)
    soup = scraper.make_request()
    scraper.write_haikus_to_csv(scraper.get_haikus(soup))

    # Pages 2..2501 — same 2500-iteration bound as the original counter loop.
    for page_number in range(2, 2502):
        time.sleep(2)  # extra politeness delay between pages
        try:
            link = scraper.get_next_page(soup)
        except AttributeError:
            # No "next" button in the page markup: we've hit the last page.
            link = None
        if not link:
            print("No further pages; stopping.")
            break
        print(f"Page {page_number}. Link {link}.")
        scraper = HaikuScraper(link, headers)
        soup = scraper.make_request()
        scraper.write_haikus_to_csv(scraper.get_haikus(soup))


if __name__ == "__main__":
    main()