|
| 1 | +import re |
| 2 | +from datetime import date, datetime as dt |
| 3 | + |
| 4 | +import scrapy |
| 5 | + |
| 6 | +from gazette.items import Gazette |
| 7 | +from gazette.spiders.base import BaseGazetteSpider |
| 8 | + |
| 9 | + |
class RjBarraDoPiraiSpider(BaseGazetteSpider):
    """Spider for the "boletim municipal" pages of Barra do Piraí (RJ).

    The start page lists links whose last four characters are a year. Each of
    those pages contains a table of links to the gazette files; the link text
    carries the day/month ("Data dd-mm") and the edition number.
    """

    name = "rj_barra_do_pirai"
    TERRITORY_ID = "3300308"
    allowed_domains = ["transparencia.portalbarradopirai.com.br"]
    start_urls = [
        "https://transparencia.portalbarradopirai.com.br/index.php/pt/links/boletim-municipal"
    ]
    start_date = date(2009, 1, 7)

    def parse(self, response):
        """Request every yearly listing page that overlaps the crawl window.

        Links in the article body that do not end in a four-digit year are
        ignored instead of aborting the whole callback.
        """
        year_links = response.xpath("//div[@itemprop='articleBody']//a/@href").getall()
        range_years = range(self.start_date.year, self.end_date.year + 1)
        for year_link in year_links:
            try:
                year = int(year_link[-4:])
            except ValueError:
                # Not a year link (some other anchor in the article body).
                continue

            if year in range_years:
                yield scrapy.Request(
                    url=response.urljoin(year_link),
                    callback=self.parse_document,
                )

    def parse_document(self, response):
        """Yield one ``Gazette`` per dated link found in a yearly listing page.

        The year is taken from the last four characters of the page URL and
        combined with the "dd-mm" found in each link text. Entries without
        text, without a date, without an href, with an unparsable date, or
        outside ``[start_date, end_date]`` are skipped.
        """
        # Loop-invariant: every entry on this page belongs to the same year.
        year = response.url[-4:]

        gazette_links = response.xpath("//table[@class='easyfolderlisting ']//a")
        for gazette_link in gazette_links:
            raw_edition = gazette_link.xpath("./text()").get()
            if not raw_edition:
                # Anchor with no direct text node; nothing to parse.
                continue

            date_match = re.search(r"Data\s+(\d{2}-\d{2})", raw_edition)
            if not date_match:
                continue

            raw_date = f"{date_match.group(1)}-{year}"
            try:
                gazette_date = dt.strptime(raw_date, "%d-%m-%Y").date()
            except ValueError:
                # A single malformed entry must not discard the rest of the page.
                self.logger.warning(
                    "Skipping entry with invalid date %r in %s", raw_date, response.url
                )
                continue

            # Skip (rather than stop) on out-of-window dates so the result does
            # not depend on the order in which the site lists the editions.
            if not self.start_date <= gazette_date <= self.end_date:
                continue

            pdf_link = gazette_link.xpath(".//@href").get()
            if not pdf_link:
                continue

            # First run of digits in the link text is used as the edition number.
            edition_match = re.search(r"(\d+)", raw_edition)
            edition = edition_match.group(0) if edition_match else None
            extra = "extra" in raw_edition.lower()

            # Editions up to 2021-10-21 are tagged as executive only; later
            # ones as executive_legislative.
            if gazette_date <= date(2021, 10, 21):
                power_type = "executive"
            else:
                power_type = "executive_legislative"

            yield Gazette(
                date=gazette_date,
                edition_number=edition,
                is_extra_edition=extra,
                # Resolve relative hrefs against the listing page URL.
                file_urls=[response.urljoin(pdf_link)],
                power=power_type,
            )
0 commit comments