# my_spider.py
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 9 23:03:14 2019
@author: Ewe
"""
import scrapy
from bs4 import BeautifulSoup
HOME_URL = 'https://en.wikipedia.org'
MAX_LEVEL = 1 # max crawl depth

class WikipediaSpider(scrapy.Spider):
    name = 'wiki'

    def start_requests(self):
        start_urls = ['https://en.wikipedia.org/wiki/Unus_pro_omnibus,_omnes_pro_uno']
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        self.logger.debug('Parsing %s', response.url)
        currentLevel = response.meta.get('level', 0)  # crawl depth of the current page (0 for the start page)
        self_title = response.xpath('//h1[contains(@class,"firstHeading")]//text()').get()  # XPath query for the article heading; gets the first matched text node
        self_url = response.url  # URL of the page currently being scraped
        soup = BeautifulSoup(response.body, 'lxml')  # parses the page's HTML into a BeautifulSoup object using the lxml parser
        # collects all hyperlinks found in the page's paragraphs
        listTags = []  # list to store all hyperlink tags found
        for paragraph in soup.find_all('p'):  # for each paragraph found
            listTags.extend(paragraph.find_all('a'))  # stores all hyperlink tags in the paragraph
        # cleans up the list of hyperlinks; retains only relevant article links
        listLinks = []  # stores a (title, url) pair for each hyperlink kept
        listOfFilterKeywords = ['cite_note', 'File']  # keywords that mark links to be filtered out (citation notes and file pages)
        for tag in listTags:
            if any(keyword in str(tag) for keyword in listOfFilterKeywords):  # skips the tag if any filter keyword appears in it
                continue
            if 'title' in tag.attrs and 'href' in tag.attrs:  # keeps only tags that carry both a title and a link
                listLinks.append((tag['title'], HOME_URL + tag['href']))  # appends a (title, absolute url) pair to listLinks
        for link in listLinks:  # for each hyperlink kept
            yield {"self_title": self_title, "self_url": self_url, "ext_title": link[0], "ext_url": link[1], "current_level": currentLevel}  # emits a record tying each outgoing link to the page it was found on
            if currentLevel + 1 > MAX_LEVEL:  # stops sending requests if the spider would exceed the max crawl level
                continue
            request = scrapy.Request(link[1], callback=self.parse)
            request.meta['level'] = currentLevel + 1  # passes the next crawl depth along with the request
            yield request
            # yield response.follow(link[1], callback=self.parse)  # alternative: create the Request via response.follow
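
# Usage (a minimal sketch; assumes Scrapy, beautifulsoup4 and lxml are
# installed; the output filename below is only an example):
#   scrapy runspider my_spider.py -o wiki_links.json
# Each yielded dict becomes one record in the output feed, e.g.
#   {"self_title": "Unus pro omnibus, omnes pro uno", "self_url": "...",
#    "ext_title": "...", "ext_url": "...", "current_level": 0}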