glassdoor.py
"""
This is an example web scraper for Glassdoor.com used in scrapfly blog article:
https://scrapfly.io/blog/how-to-scrape-glassdoor/
To run this scraper, set the $SCRAPFLY_KEY env variable to your Scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
from enum import Enum
import json
import os
import re
from typing import Dict, List, Optional, Tuple, TypedDict
from urllib.parse import urljoin
from loguru import logger as log
from scrapfly import ScrapeApiResponse, ScrapeConfig, ScrapflyClient, ScrapflyScrapeError
SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])
BASE_CONFIG = {
# Glassdoor.com requires Anti Scraping Protection bypass feature.
# for more: https://scrapfly.io/docs/scrape-api/anti-scraping-protection
"asp": True,
"country": "GB",
"render_js": True
}
def find_hidden_data(result: ScrapeApiResponse) -> dict:
"""
    Extract the hidden web cache (Apollo GraphQL framework) from Glassdoor page HTML.
    It lives either in the __NEXT_DATA__ script or in a direct apolloState js variable.
"""
# data can be in __NEXT_DATA__ cache
data = result.selector.css("script#__NEXT_DATA__::text").get()
if data:
data = json.loads(data)["props"]["pageProps"]["apolloCache"]
else: # or in direct apolloState cache
data = re.findall(r'apolloState":\s*({.+})};', result.content)[0]
data = json.loads(data)
def _unpack_apollo_data(apollo_data):
"""
Glassdoor uses Apollo GraphQL client and the dataset is a graph of references.
This function unpacks the __ref references to actual values.
"""
def resolve_refs(data, root):
if isinstance(data, dict):
if "__ref" in data:
return resolve_refs(root[data["__ref"]], root)
else:
return {k: resolve_refs(v, root) for k, v in data.items()}
elif isinstance(data, list):
return [resolve_refs(i, root) for i in data]
else:
return data
return resolve_refs(apollo_data.get("ROOT_QUERY") or apollo_data, apollo_data)
return _unpack_apollo_data(data)
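# Illustrative shape of the unpacked cache (keys vary per page type; these are
# the entries the parse_* functions below look for):
# {"jobListings(...)": {"jobListings": [...], "paginationLinks": [...]},
#  "employerReviews(...)": {"reviews": [...], "numberOfPages": ...}, ...}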
def parse_jobs(result: ScrapeApiResponse) -> Tuple[List[Dict], List[str]]:
"""Parse Glassdoor jobs page for job data and other page pagination urls"""
cache = find_hidden_data(result)
job_cache = next(v for k, v in cache.items() if k.startswith("jobListings"))
jobs = [v["jobview"]["header"] for v in job_cache["jobListings"]]
other_pages = [
urljoin(result.context["url"], page["urlLink"])
for page in job_cache["paginationLinks"]
if page["isCurrentPage"] is False
]
return jobs, other_pages
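# each parsed job is the raw "header" object straight from the hidden cache;
# its field names follow Glassdoor's GraphQL schema and can change over time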
async def scrape_jobs(url: str, max_pages: Optional[int] = None) -> List[Dict]:
"""Scrape Glassdoor job listing page for job listings (with pagination)"""
log.info("scraping job listings from {}", url)
first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
jobs, other_page_urls = parse_jobs(first_page)
_total_pages = len(other_page_urls) + 1
if max_pages and _total_pages > max_pages:
        other_page_urls = other_page_urls[:max_pages - 1]  # the first page is already scraped
    log.info("scraped first page of jobs of {}, scraping remaining {} pages", url, len(other_page_urls))
other_pages = [ScrapeConfig(url, **BASE_CONFIG) for url in other_page_urls]
async for result in SCRAPFLY.concurrent_scrape(other_pages):
if not isinstance(result, ScrapflyScrapeError):
jobs.extend(parse_jobs(result)[0])
else:
log.error(f"failed to scrape {result.api_response.config['url']}, got: {result.message}")
log.info("scraped {} jobs from {} in {} pages", len(jobs), url, _total_pages)
return jobs
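# e.g. (using the illustrative "eBay"/7853 pair from the find_companies docstring below):
# jobs = asyncio.run(scrape_jobs(Url.jobs("eBay", "7853"), max_pages=3))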
def parse_reviews(result: ScrapeApiResponse) -> Dict:
"""parse Glassdoor reviews page for review data"""
cache = find_hidden_data(result)
reviews = next(v for k, v in cache.items() if k.startswith("employerReviews") and v.get("reviews"))
return reviews
async def scrape_reviews(url: str, max_pages: Optional[int] = None) -> Dict:
"""Scrape Glassdoor reviews listings from reviews page (with pagination)"""
log.info("scraping reviews from {}", url)
first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url=url, **BASE_CONFIG))
reviews = parse_reviews(first_page)
total_pages = reviews["numberOfPages"]
if max_pages and max_pages < total_pages:
total_pages = max_pages
log.info("scraped first page of reviews of {}, scraping remaining {} pages", url, total_pages - 1)
other_pages = [
ScrapeConfig(url=Url.change_page(first_page.context["url"], page=page), **BASE_CONFIG)
for page in range(2, total_pages + 1)
]
async for result in SCRAPFLY.concurrent_scrape(other_pages):
if not isinstance(result, ScrapflyScrapeError):
reviews["reviews"].extend(parse_reviews(result)["reviews"])
else:
log.error(f"failed to scrape {result.api_response.config['url']}, got: {result.message}")
log.info("scraped {} reviews from {} in {} pages", len(reviews["reviews"]), url, total_pages)
return reviews
def parse_salaries(result: ScrapeApiResponse) -> Dict:
"""Parse Glassdoor salaries page for salary data"""
cache = find_hidden_data(result)
salaries = next(v for k, v in cache.items() if k.startswith("aggregatedSalaryEstimates") and v.get("results"))
return salaries
async def scrape_salaries(url: str, max_pages: Optional[int] = None) -> Dict:
"""Scrape Glassdoor Salary page for salary listing data (with pagination)"""
log.info("scraping salaries from {}", url)
first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url=url, **BASE_CONFIG))
salaries = parse_salaries(first_page)
total_pages = salaries["numPages"]
if max_pages and total_pages > max_pages:
total_pages = max_pages
log.info("scraped first page of salaries of {}, scraping remaining {} pages", url, total_pages - 1)
other_pages = [
ScrapeConfig(url=Url.change_page(first_page.context["url"], page=page), **BASE_CONFIG)
for page in range(2, total_pages + 1)
]
async for result in SCRAPFLY.concurrent_scrape(other_pages):
if not isinstance(result, ScrapflyScrapeError):
salaries["results"].extend(parse_salaries(result)["results"])
else:
log.error(f"failed to scrape {result.api_response.config['url']}, got: {result.message}")
log.info("scraped {} salaries from {} in {} pages", len(salaries["results"]), url, total_pages)
return salaries
class FoundCompany(TypedDict):
"""type hint for company search result"""
name: str
id: int
logoURL: str
    employerId: Optional[int]  # from parentRelationshipVO, None when absent
    employerName: Optional[str]  # from parentRelationshipVO, None when absent
async def find_companies(query: str) -> List[FoundCompany]:
"""find company Glassdoor ID and name by query. e.g. "ebay" will return "eBay" with ID 7853"""
result = await SCRAPFLY.async_scrape(
ScrapeConfig(
url=f"https://www.glassdoor.com/api-web/employer/find.htm?autocomplete=true&maxEmployersForAutocomplete=50&term={query}",
**BASE_CONFIG,
)
)
data = json.loads(result.content)
companies = []
    for company in data:
        companies.append(
            {
                "name": company["label"],
                "id": company["id"],
                "logoURL": company["logoURL"],
                "employerId": (
                    company["parentRelationshipVO"]["employerId"] if company["parentRelationshipVO"] is not None else None
                ),
                "employerName": (
                    company["parentRelationshipVO"]["employerName"] if company["parentRelationshipVO"] is not None else None
                ),
            }
        )
return companies
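# e.g. find_companies("ebay") should return entries like (illustrative values):
# [{"name": "eBay", "id": 7853, "logoURL": "...", "employerId": None, "employerName": None}, ...]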
class Region(Enum):
"""glassdoor.com region codes"""
UNITED_STATES = "1"
UNITED_KINGDOM = "2"
CANADA_ENGLISH = "3"
INDIA = "4"
AUSTRALIA = "5"
FRANCE = "6"
GERMANY = "7"
SPAIN = "8"
BRAZIL = "9"
NETHERLANDS = "10"
AUSTRIA = "11"
MEXICO = "12"
ARGENTINA = "13"
BELGIUM_NEDERLANDS = "14"
BELGIUM_FRENCH = "15"
SWITZERLAND_GERMAN = "16"
SWITZERLAND_FRENCH = "17"
IRELAND = "18"
CANADA_FRENCH = "19"
HONG_KONG = "20"
NEW_ZEALAND = "21"
SINGAPORE = "22"
ITALY = "23"
class Url:
"""
Helper URL generator that generates full URLs for glassdoor.com pages
from given employer name and ID
For example:
    > Url.overview("eBay Motors Group", "4189745")
https://www.glassdoor.com/Overview/Working-at-eBay-Motors-Group-EI_IE4189745.11,28.htm
Note that URL formatting is important when it comes to scraping Glassdoor
as unusual URL formats can lead to scraper blocking.
"""
@staticmethod
def overview(employer: str, employer_id: str, region: Optional[Region] = None) -> str:
employer = employer.replace(" ", "-")
url = f"https://www.glassdoor.com/Overview/Working-at-{employer}-EI_IE{employer_id}"
        # Glassdoor allows any employer name in the overview URL slug; the
        # trailing numbers tell it where the employer name sits, e.g.:
        # https://www.glassdoor.com/Overview/Working-at-eBay-Motors-Group-EI_IE4189745.11,28.htm
        # where 11,28 is the start,end slice of the employer name in the path
_start = url.split("/Overview/")[1].find(employer)
_end = _start + len(employer)
url += f".{_start},{_end}.htm"
if region:
return url + f"?filter.countryId={region.value}"
return url
@staticmethod
def reviews(employer: str, employer_id: str, region: Optional[Region] = None) -> str:
employer = employer.replace(" ", "-")
url = f"https://www.glassdoor.com/Reviews/{employer}-Reviews-E{employer_id}.htm?"
if region:
return url + f"?filter.countryId={region.value}"
return url
@staticmethod
def salaries(employer: str, employer_id: str, region: Optional[Region] = None) -> str:
employer = employer.replace(" ", "-")
url = f"https://www.glassdoor.com/Salary/{employer}-Salaries-E{employer_id}.htm?"
if region:
return url + f"?filter.countryId={region.value}"
return url
@staticmethod
def jobs(employer: str, employer_id: str, region: Optional[Region] = None) -> str:
employer = employer.replace(" ", "-")
url = f"https://www.glassdoor.com/Jobs/{employer}-Jobs-E{employer_id}.htm?"
if region:
return url + f"?filter.countryId={region.value}"
return url
@staticmethod
def change_page(url: str, page: int) -> str:
"""update page number in a glassdoor url"""
        if re.search(r"_P\d+\.htm", url):
            new = re.sub(r"_P\d+\.htm", f"_P{page}.htm", url)
        else:
            new = re.sub(r"\.htm", f"_P{page}.htm", url)
assert new != url
return new
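if __name__ == "__main__":
    # Minimal usage sketch (illustrative; not part of the scraper itself).
    # The "eBay"/7853 pair comes from the find_companies docstring above;
    # substitute any company returned by find_companies.
    import asyncio

    async def _example():
        companies = await find_companies("ebay")
        log.info("found {} matching companies", len(companies))
        reviews_url = Url.reviews("eBay", "7853", region=Region.UNITED_STATES)
        reviews = await scrape_reviews(reviews_url, max_pages=2)
        print(json.dumps(reviews, indent=2))

    asyncio.run(_example())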