Skip to content

Commit 6b4cff4

Browse files
committed
cria spider Barra Do Pirai okfn-brasil#1192
aplicando alterações okfn-brasil#1362
1 parent 338dbbe commit 6b4cff4

File tree

1 file changed

+64
-0
lines changed

1 file changed

+64
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import re
2+
from datetime import date, datetime as dt
3+
4+
import scrapy
5+
6+
from gazette.items import Gazette
7+
from gazette.spiders.base import BaseGazetteSpider
8+
9+
10+
class RjBarraDoPiraiSpider(BaseGazetteSpider):
    """Collect the gazettes listed on the Barra do Piraí transparency portal.

    The start page is scanned for links whose last four characters are a
    year; each of those pages is then scanned for a table of anchors, one
    per gazette file.

    NOTE(review): ``self.end_date`` is not defined in this class; it is
    presumably supplied by ``BaseGazetteSpider`` -- confirm.
    """

    name = "rj_barra_do_pirai"
    TERRITORY_ID = "3300308"
    allowed_domains = ["transparencia.portalbarradopirai.com.br"]
    start_urls = [
        "https://transparencia.portalbarradopirai.com.br/index.php/pt/links/boletim-municipal"
    ]
    start_date = date(2009, 1, 7)

    def parse(self, response):
        """Request every yearly listing page that overlaps the crawl interval.

        Links in the article body that do not end with a four-digit year are
        skipped instead of aborting the whole callback.
        """
        year_links = response.xpath("//div[@itemprop='articleBody']//a/@href").getall()
        first_year = self.start_date.year
        last_year = self.end_date.year
        for year_link in year_links:
            year_match = re.search(r"([0-9]{4})$", year_link.strip())
            if not year_match:
                # Not a yearly listing link (menu entry, anchor, ...).
                continue

            year = int(year_match.group(1))
            if first_year <= year <= last_year:
                yield scrapy.Request(
                    url=response.urljoin(year_link),
                    callback=self.parse_document,
                    # Hand the year over explicitly so the next callback does
                    # not depend on the shape of the final (maybe redirected)
                    # URL.
                    cb_kwargs={"year": year},
                )

    def parse_document(self, response, year=None):
        """Yield one ``Gazette`` per dated link found in a yearly listing page.

        Args:
            response: the yearly listing page.
            year: year the listing refers to. When omitted, it is read from
                the last four digits of ``response.url``.

        Rows are skipped when they have no text, no ``Data dd-mm`` marker, an
        impossible date, a date outside the crawl interval or no ``href``.
        """
        if year is None:
            url_year = re.search(r"([0-9]{4})/?$", response.url)
            if not url_year:
                return
            year = url_year.group(1)

        gazette_links = response.xpath("//table[@class='easyfolderlisting ']//a")
        for gazette_link in gazette_links:
            raw_edition = gazette_link.xpath("./text()").get()
            if not raw_edition:
                # Anchor without a direct text node: nothing to parse.
                continue

            day_month = re.search(r"Data\s+(\d{2}-\d{2})", raw_edition)
            if not day_month:
                continue

            try:
                gazette_date = dt.strptime(
                    f"{day_month.group(1)}-{year}", "%d-%m-%Y"
                ).date()
            except ValueError:
                # The link text carries a day/month that is not a real date.
                self.logger.warning("Invalid gazette date in %r", raw_edition)
                continue

            # The table order is not guaranteed, so out-of-interval rows are
            # skipped one by one rather than stopping at the first old one.
            if not self.start_date <= gazette_date <= self.end_date:
                continue

            pdf_link = gazette_link.xpath(".//@href").get()
            if not pdf_link:
                continue

            # The edition number is taken as the first run of digits in the
            # link text.
            edition_match = re.search(r"(\d+)", raw_edition)
            edition = edition_match.group(0) if edition_match else None
            extra = "extra" in raw_edition.lower()

            if gazette_date <= date(2021, 10, 21):
                power_type = "executive"
            else:
                power_type = "executive_legislative"

            yield Gazette(
                date=gazette_date,
                edition_number=edition,
                is_extra_edition=extra,
                # Resolve against the page URL so relative hrefs still
                # produce a downloadable address.
                file_urls=[response.urljoin(pdf_link)],
                power=power_type,
            )

0 commit comments

Comments
 (0)