"""
This is an example web scraper for redfin.com used in the scrapfly blog article:
https://scrapfly.io/blog/how-to-scrape-redfin/
To run this scraper, set the env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import json
import os
from typing import List, Dict

from loguru import logger as log
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])

BASE_CONFIG = {
    # Redfin.com requires Anti Scraping Protection bypass feature:
    "asp": True,
    # Set the proxy location to US
    "country": "US",
}


def parse_search_api(response: ScrapeApiResponse) -> List[Dict]:
    """parse JSON data from the search API"""
    # strip the "{}&&" prefix Redfin prepends to the JSON body before parsing
    return json.loads(response.content.replace("{}&&", ""))["payload"]["homes"]


def parse_property_for_sale(response: ScrapeApiResponse) -> Dict:
    """parse property listing data for sale from the HTML"""
    selector = response.selector
    price = selector.xpath("//div[@data-rf-test-id='abp-price']/div/text()").get()
    estimated_monthly_price = "".join(selector.xpath("//span[@class='est-monthly-payment']/text()").getall())
    address = (
        "".join(selector.xpath("//div[contains(@class, 'street-address')]/text()").getall())
        + " " + "".join(selector.xpath("//div[contains(@class, 'cityStateZip')]/text()").getall())
    )
    description = selector.xpath("//div[@id='marketing-remarks-scroll']/p/span/text()").get()
    images = [
        image.attrib["src"]
        for image in selector.xpath("//img[contains(@class, 'widenPhoto')]")
    ]
    details = [
        "".join(text_content.getall())
        for text_content in selector.css("div .keyDetails-value::text")
    ]
    features_data = {}
    for feature_block in selector.css(".amenity-group ul div.title"):
        label = feature_block.css("::text").get()
        features = feature_block.xpath("following-sibling::li/span")
        features_data[label] = [
            "".join(feat.xpath(".//text()").getall()).strip() for feat in features
        ]
    return {
        "address": address,
        "description": description,
        "price": price,
        "estimatedMonthlyPrice": estimated_monthly_price,
        "propertyUrl": str(response.context["url"]),
        "attachments": images,
        "details": details,
        "features": features_data,
    }


def parse_property_for_rent(response: ScrapeApiResponse):
    """get the rental ID from the HTML to use it in the API"""
    selector = response.selector
    # the rental ID is embedded in the og:image meta tag URL
    data = selector.xpath("//meta[@property='og:image']").attrib["content"]
    try:
        rental_id = data.split("rent/")[1].split("/")[0]
        # validate the rentalId
        assert len(rental_id) == 36
        return rental_id
    except (IndexError, AssertionError):
        log.warning("property isn't for rent")
        return None


async def scrape_search(url: str) -> List[Dict]:
    """scrape search data from the search API"""
    # send a request to the search API
    search_api_response = await SCRAPFLY.async_scrape(
        ScrapeConfig(url, country="US")
    )
    search_data = parse_search_api(search_api_response)
    log.success(f"scraped ({len(search_data)}) search results from the search API")
    return search_data


async def scrape_property_for_sale(urls: List[str]) -> List[Dict]:
    """scrape properties for sale data from HTML"""
    # add the property pages to a scraping list
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    properties = []
    # scrape all property pages concurrently
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        data = parse_property_for_sale(response)
        properties.append(data)
    log.success(f"scraped {len(properties)} property listings for sale")
    return properties


async def scrape_property_for_rent(urls: List[str]) -> List[Dict]:
    """scrape properties for rent from the API"""
    api_urls = []
    properties = []
    for url in urls:
        response_html = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
        rental_id = parse_property_for_rent(response_html)
        if rental_id:
            api_urls.append(
                f"https://www.redfin.com/stingray/api/v1/rentals/{rental_id}/floorPlans"
            )
    # add the property pages API URLs to a scraping list
    to_scrape = [ScrapeConfig(url, country="US") for url in api_urls]
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        properties.append(json.loads(response.content))
    log.success(f"scraped {len(properties)} property listings for rent")
    return properties
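

if __name__ == "__main__":
    # minimal example run tying the scrapers above together. The URLs below are
    # illustrative placeholders (assumed, not real listings); replace them with
    # actual redfin.com search API and property page URLs before running.
    import asyncio

    async def run():
        # search API URL, e.g. copied from the browser's network inspector (placeholder values)
        search_data = await scrape_search(
            "https://www.redfin.com/stingray/api/gis?al=1&region_id=30749&region_type=6"
        )
        print(json.dumps(search_data, indent=2))
        # property listing page URLs (placeholders)
        for_sale = await scrape_property_for_sale(
            ["https://www.redfin.com/WA/Seattle/example-listing/home/123456"]
        )
        print(json.dumps(for_sale, indent=2))

    asyncio.run(run())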