-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper-old.py
29 lines (24 loc) · 1.05 KB
/
scraper-old.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
class LadSpider(scrapy.Spider):
    """Crawl memset.com and yield one item per link found on each page.

    Extends ``scrapy.Spider`` (not ``CrawlSpider``), so link following is
    done manually in :meth:`parse` rather than via a ``rules`` tuple.
    """

    # The name of the spider (used as `scrapy crawl lad_spider`)
    name = "lad_spider"
    # The domains that are allowed (links to other domains are skipped)
    allowed_domains = ['memset.com']
    # The URLs to start with
    start_urls = ['https://www.memset.com']

    # NOTE(review): the original code had a bare `Rule(LinkExtractor(...))`
    # expression here. It was a no-op: rules only take effect when assigned
    # to `rules` on a CrawlSpider subclass, and this class overrides parse()
    # anyway. Removed as dead code.

    def parse(self, response):
        """Extract links from the response and yield one item per link.

        Fixes vs. the original:
        - ``allow_domains=`` restricts extraction to ``allowed_domains``;
          the original passed them via ``allow=``, which expects URL regex
          patterns, not domain names.
        - ``tags``/``attrs`` are real one-element tuples; ``("a")`` is just
          the string ``"a"``.
        - Each item carries the single extracted link's URL; the original
          ignored ``link`` and re-extracted every href on the page for
          every link, yielding N duplicate items of the full href list.
        """
        links = LinkExtractor(
            allow_domains=self.allowed_domains,
            tags=("a",),
            attrs=("href",),
        ).extract_links(response)
        for link in links:
            yield {
                # key kept as in the original so downstream consumers
                # (pipelines/exporters) keyed on it keep working
                'XXXname': link.url,
            }