-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
34 lines (29 loc) · 893 Bytes
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# Module-level accumulator: parse_obj() appends every discovered link URL
# here, and completeList() flushes the de-duplicated set to results.txt
# when the crawl ends.
url_list = []
class LadSpidyTest(CrawlSpider):
    """Crawl www.memset.com, following every link and recording each
    link URL seen into the module-level ``url_list``.

    When the spider shuts down, ``closed()`` hands off to
    ``completeList()`` to persist the collected URLs.
    """

    name = "lad_scraper"
    allowed_domains = ["www.memset.com"]
    start_urls = (
        'http://www.memset.com/',
    )

    # Follow every extracted link; each fetched page is passed to parse_obj.
    rules = (
        Rule(LinkExtractor(canonicalize=True, unique=True),
             callback="parse_obj", follow=True),
    )

    def parse_obj(self, response):
        """Record the URL of every link found in *response*."""
        # NOTE(review): LinkExtractor's `allow` parameter expects regex
        # patterns, not domain strings — passing allowed_domains only
        # matches because "." matches any character; confirm intent.
        link_extractor = LinkExtractor(allow=self.allowed_domains, unique=True)
        for link in link_extractor.extract_links(response):
            url_list.append(link.url)

    def closed(self, reason):
        """Spider-shutdown hook: dump the collected URLs to disk."""
        completeList()
def completeList(urls=None, path='results.txt'):
    """Write the unique collected URLs to *path*, one per line.

    Parameters
    ----------
    urls : iterable of str, optional
        URLs to persist. Defaults to the module-level ``url_list``
        populated by the spider.
    path : str, optional
        Destination file (default ``'results.txt'``).
    """
    if urls is None:
        urls = url_list
    # De-duplicate before writing (the spider may record the same URL
    # from several pages).
    url_set = set(urls)
    # Context manager guarantees the file is closed even if a write
    # raises (original opened/closed manually and could leak the handle).
    with open(path, 'w') as outfile:
        for url in url_set:
            # BUG FIX: the original wrote the whole set object on every
            # iteration (myfile.write(url_set)) instead of the URL itself.
            outfile.write(url)
            outfile.write('\n')
            print(url)  # audit call