11
11
12
12
13
13
BASE_SALARY_RANGE_INDIA = (2_00_000 , 100_00_000 )
14
+ TOTAL_SALARY_RANGE_INDIA = (2_00_000 , 200_00_000 )
15
+ TOTAL_TO_BASE_MAX_RATIO = 2.5
14
16
INTERN_SALARY_RANGE_INDIA = (10_000 , 2_00_000 )
15
17
16
18
LABEL_SPECIFICATION = {
24
26
"RE_SALARY" : re .compile (r"(salary|base|base pay)\s?[:-]-?\s?(?P<label>[\w\,\₹\$\.\/\-\(\)\`\\u20b9₹\~ ]+)" ),
25
27
"RE_LOCATION" : re .compile (r"location\s?[:-]-?\s?(?P<label>[\w\,\` ]+)" ),
26
28
"RE_SALARY_TOTAL" : re .compile (
27
- r"\ntot?al (1st year\s)?(comp[e|a]nsation|comp|ctc)(\sfor 1st year)?(\s?\(\s?(salary|base).+?\))?(?P<label>.+)"
29
+ r"\\ ntot?al (1st year\s)?(comp[e|a]nsation|comp|ctc)(\sfor 1st year)?(\s?\(\s?(salary|base).+?\))?(?P<label>.+)"
28
30
),
29
31
"RE_SALARY_CLEAN_LPA" : re .compile (r"(\d{1,3}(\.\d{1,2})?)\s?(lpa|lakh|lac|l)" ),
30
32
}
@@ -63,16 +65,17 @@ def _find_matches(regex_pattern: Pattern[str], content: str) -> List[str]:
63
65
64
66
65
67
def _get_info_as_flat_list (
66
- companies : List [str ], titles : List [str ], yoes : List [str ], salaries : List [str ], info : Dict [str , Any ]
68
+ companies : List [str ], roles : List [str ], yoes : List [str ], pays : List [ str ], pays_t : List [str ], info : Dict [str , Any ]
67
69
) -> List [Dict [str , Any ]]:
68
- n_info = min ([len (companies ), len (titles ), len (yoes ), len (salaries )])
70
+ n_info = min ([len (companies ), len (roles ), len (yoes ), len (pays )])
69
71
expanded_info = []
70
72
for _ in range (n_info ):
71
73
_info = info .copy ()
72
74
_info ["company" ] = companies [0 ]
73
- _info ["role" ] = titles [0 ]
75
+ _info ["role" ] = roles [0 ]
74
76
_info ["yoe" ] = yoes [0 ]
75
- _info ["salary" ] = salaries [0 ]
77
+ _info ["salary" ] = pays [0 ]
78
+ _info ["salaryTotal" ] = pays_t [0 ] if pays_t else ""
76
79
expanded_info .append (_info )
77
80
return expanded_info
78
81
@@ -133,6 +136,9 @@ def _report(raw_info: List[Dict[str, Any]]) -> None:
133
136
logger .info (f"Posts with Location: { len ([r for r in raw_info if 'country' in r ])} " )
134
137
logger .info (f"Posts with YOE: { len ([r for r in raw_info if r ['cleanYoe' ] >= 0 ])} " )
135
138
logger .info (f"Posts from India: { len ([r for r in raw_info if 'country' in r and r ['country' ] == 'india' ])} " )
139
+ logger .info (
140
+ f"Posts with Total Comp: { len ([r for r in raw_info if 'cleanSalaryTotal' in r and r ['cleanSalaryTotal' ] != - 1.0 ])} "
141
+ )
136
142
137
143
138
144
def _is_valid_yearly_base_pay_from_india (base_pay : float ):
@@ -143,13 +149,23 @@ def _is_valid_monthly_internship_pay_from_india(base_pay: float):
143
149
return base_pay >= INTERN_SALARY_RANGE_INDIA [0 ] and base_pay <= INTERN_SALARY_RANGE_INDIA [1 ]
144
150
145
151
152
def _is_valid_monthly_total_pay_from_india(base_pay: float):
    """Return whether ``base_pay`` lies within ``TOTAL_SALARY_RANGE_INDIA``.

    Both ends of the range are inclusive.
    """
    lowest, highest = TOTAL_SALARY_RANGE_INDIA
    return lowest <= base_pay <= highest
154
+
155
+
146
156
def _filter_invalid_salaries (raw_info : List [Dict [str , Any ]]) -> List [Dict [str , Any ]]:
147
157
n_india = 0
148
158
n_dropped = 0
149
159
filtered_info = []
150
160
for r in raw_info :
151
161
if "country" in r and r ["country" ] == "india" :
152
162
n_india += 1
163
+ if "cleanSalaryTotal" in r and r ["cleanSalaryTotal" ] != - 1 :
164
+ if not _is_valid_monthly_total_pay_from_india (r ["cleanSalaryTotal" ]):
165
+ r ["cleanSalaryTotal" ] = - 1.0
166
+ elif r ["cleanSalaryTotal" ] / r ["cleanSalary" ] > TOTAL_TO_BASE_MAX_RATIO :
167
+ r ["cleanSalaryTotal" ] = - 1.0
168
+
153
169
if r ["yrOrPm" ] == "yearly" and not _is_valid_yearly_base_pay_from_india (r ["cleanSalary" ]):
154
170
n_dropped += 1
155
171
continue
@@ -162,6 +178,28 @@ def _filter_invalid_salaries(raw_info: List[Dict[str, Any]]) -> List[Dict[str, A
162
178
return filtered_info
163
179
164
180
181
def _add_clean_yoe_and_salaries(expanded_info: List[Dict[str, Any]], info: Dict[str, Any], title: str) -> None:
    """Add cleaned YOE and, for records from India, cleaned salary fields.

    Every record in ``expanded_info`` is updated in place:

    * ``cleanYoe`` is always set.
    * ``cleanSalary`` and ``yrOrPm`` are set only when the record's
      ``country`` is ``"india"``.
    * ``cleanSalaryTotal`` is set only when ``yrOrPm`` is ``"yearly"``; it is
      the parsed total when the base salary parsed successfully and the total
      exceeds it, otherwise ``-1``.

    Args:
        expanded_info: Records to update; each must hold ``yoe``, ``role``,
            ``salary`` and ``salaryTotal``.
        info: Not read by this function.
        title: Post title, preprocessed and lower-cased before being passed
            to the YOE cleaner.
    """
    for record in expanded_info:
        record["cleanYoe"] = _get_clean_yoe(
            record["yoe"].lower(), _preprocess_text(title).lower(), record["role"].lower()
        )
        if record.get("country") != "india":
            continue

        # Only the text before the first separator is parsed; split()[0]
        # returns the whole string when the separator is absent.
        # NOTE(review): the separator literal is copied exactly as it appears
        # in the source - confirm it matches the escaped newline in the data.
        base_text = record["salary"].replace(",", "").lower()
        record["cleanSalary"], record["yrOrPm"] = _get_clean_salary_for_india(base_text.split("\\ n")[0])
        if record["yrOrPm"] != "yearly":
            continue

        total_text = record["salaryTotal"].replace(",", "").lower()
        total_salary, _ = _get_clean_salary_for_india(total_text.split("\\ n")[0])
        # A total is kept only when the base parsed and the total is larger.
        total_is_usable = record["cleanSalary"] != -1 and total_salary > record["cleanSalary"]
        record["cleanSalaryTotal"] = total_salary if total_is_usable else -1
201
+
202
+
165
203
def _get_clean_company_text (company : str ) -> str :
166
204
return " " .join (re .findall (r"\w+" , company .lower ()))
167
205
@@ -179,6 +217,14 @@ def _add_clean_companies(raw_info: List[Dict[str, Any]]) -> None:
179
217
r ["cleanCompany" ] = " " .join ([txt .capitalize () for txt in clean_company .split (" " )])
180
218
181
219
220
+ def _drop_info (raw_info : List [Dict [str , Any ]]) -> None :
221
+ for r in raw_info :
222
+ try :
223
+ del r ["title" ], r ["yoe" ], r ["salary" ], r ["salaryTotal" ], r ["city" ], r ["country" ]
224
+ except KeyError :
225
+ continue
226
+
227
+
182
228
def _save_raw_info (raw_info : List [Dict [str , Any ]]) -> None :
183
229
with open ("data/posts_info.json" , "w" ) as f :
184
230
json .dump (raw_info , f )
@@ -197,6 +243,9 @@ def _save_meta_info(total_posts: int, raw_info: List[Dict[str, Any]]) -> Dict[st
197
243
meta_info = {
198
244
"totalPosts" : total_posts ,
199
245
"totalPostsFromIndia" : len ([r for r in raw_info if "country" in r and r ["country" ] == "india" ]),
246
+ "totalPostsWithTotalComp" : len (
247
+ [r for r in raw_info if "cleanSalaryTotal" in r and r ["cleanSalaryTotal" ] != - 1.0 ]
248
+ ),
200
249
"lastUpdated" : datetime .now ().strftime ("%Y/%m/%d %H:%M:%S" ),
201
250
"top20Companies" : top_20 ,
202
251
"mostOffersInLastMonth" : most_offers ,
@@ -229,39 +278,28 @@ def parse_posts_and_save_tagged_info() -> None:
229
278
roles = _find_matches (LABEL_SPECIFICATION ["RE_ROLE" ], clean_content )
230
279
yoes = _find_matches (LABEL_SPECIFICATION ["RE_YOE" ], clean_content )
231
280
salaries = _find_matches (LABEL_SPECIFICATION ["RE_SALARY" ], clean_content )
281
+ total_salaies = _find_matches (LABEL_SPECIFICATION ["RE_SALARY_TOTAL" ], clean_content )
232
282
if companies and roles and yoes and salaries :
233
- expanded_info = _get_info_as_flat_list (companies , roles , yoes , salaries , info )
283
+ expanded_info = _get_info_as_flat_list (companies , roles , yoes , salaries , total_salaies , info )
234
284
location = _get_clean_location (_preprocess_text (r .title ), clean_content )
235
285
if location [1 ]:
236
286
for info in expanded_info :
237
287
info ["city" ] = location [0 ]; info ["country" ] = location [1 ]
238
- for info in expanded_info :
239
- info ["cleanYoe" ] = _get_clean_yoe (
240
- info ["yoe" ].lower (), _preprocess_text (r .title ).lower (), info ["role" ].lower ()
241
- )
242
- if "country" in info and info ["country" ] == "india" :
243
- if "\\ n" in info ["salary" ].replace ("," , "" ).lower ():
244
- info ["cleanSalary" ], info ["yrOrPm" ] = _get_clean_salary_for_india (
245
- info ["salary" ].replace ("," , "" ).lower ().split ("\\ n" )[0 ]
246
- )
247
- else :
248
- info ["cleanSalary" ], info ["yrOrPm" ] = _get_clean_salary_for_india (
249
- info ["salary" ].replace ("," , "" ).lower ()
250
- )
288
+ _add_clean_yoe_and_salaries (expanded_info , info , r .title )
251
289
raw_info += expanded_info
252
290
else :
253
291
n_dropped += 1
254
292
# fmt: on
255
-
256
293
logger .info (f"Total posts: { total_posts } " )
257
294
logger .info (f"N posts dropped (missing data): { n_dropped } " )
258
295
_report (raw_info )
259
296
raw_info = _filter_invalid_salaries (raw_info )
260
297
261
298
_add_clean_companies (raw_info )
262
299
raw_info = sorted (raw_info , key = lambda x : x ["date" ], reverse = True )
263
- _save_raw_info (raw_info )
264
300
meta_info = _save_meta_info (total_posts , raw_info )
301
+ _drop_info (raw_info )
302
+ _save_raw_info (raw_info )
265
303
_update_data_in_js (raw_info , meta_info )
266
304
267
305
0 commit comments