# resume_parsing.py
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import spacy
import pdfminer
import re
import os
import pandas as pd
# In[2]:
import pdf2txt
# In[3]:
def convert_pdf(f):
    """Convert the PDF at path *f* to text and return the extracted text.

    ``pdf2txt.main`` is asked to write the text to ``output/txt/<stem>.txt``,
    where ``<stem>`` is the base name of *f* without its extension; that
    file is then read back and its contents returned.

    Args:
        f: Path of the PDF file to convert.

    Returns:
        The full contents of the generated ``.txt`` file as a string.
    """
    output_filename = os.path.basename(os.path.splitext(f)[0]) + '.txt'
    output_filepath = os.path.join('output/txt/', output_filename)
    # Make sure the target directory exists before converting.
    # NOTE(review): presumably pdf2txt does not create missing directories
    # itself -- confirm against the pdf2txt module in use.
    os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
    pdf2txt.main(args=[f, '--outfile', output_filepath])
    print(output_filepath + " saved successfully!!!")
    # Read through a context manager so the handle is closed right away
    # instead of being left open until garbage collection.
    with open(output_filepath) as txt_file:
        return txt_file.read()
# In[4]:
# Notebook leftover: computes the stem of a sample file name; the value is
# discarded when this file runs as a script.
os.path.splitext("Sally CrookCV.pdf")[0]
# In[5]:
# Load the spaCy model named "en_core_web_sm" once for the whole run.
nlp = spacy.load("en_core_web_sm")
# In[6]:
# Phone-number regex credit: https://stackoverflow.com/a/3868861
# In[7]:
# One column per extracted field; the key order here is the column order
# of the DataFrame built from this dictionary later on.
result_dict = {column: [] for column in ('name', 'phone', 'email', 'skills')}
# Accumulators filled by parse_content(), one entry per parsed resume.
names, phones, emails, skills = [], [], [], []
# In[8]:
# Skill keywords searched for in the lower-cased resume text.
_SKILLSET_RE = re.compile(r'python|java|sql|hadoop|tableau')
# Phone-number pattern (credit: https://stackoverflow.com/a/3868861).
# Raw strings keep the backslash escapes from being treated as (invalid)
# string escapes; the pattern text itself is unchanged.
_PHONE_NUM_RE = re.compile(
    r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}'
    r'|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}'
    r'|\d{3}[-\.\s]??\d{4})'
)


def parse_content(text):
    """Extract name, e-mail, phone numbers and skills from resume text.

    Each call appends exactly one entry to each of the module-level lists
    ``names``, ``emails``, ``phones`` and ``skills``; nothing is returned.

    * name   -- text of the first entity labelled ``PERSON``, or ``None``
    * email  -- text of the first token flagged ``like_email``, or ``None``
    * phone  -- ``str()`` of the list of all phone-pattern matches
    * skills -- ``str()`` of the set of matched skill keywords

    Args:
        text: Plain text of one resume.
    """
    doc = nlp(text)
    # Compare the label with ``==``: ``is`` tests object identity, which is
    # not a reliable equality check for strings.  ``next(..., None)``
    # replaces the former ``[...][0]``, which raised IndexError (aborting
    # the whole run) whenever no PERSON entity was found.
    name = next(
        (entity.text for entity in doc.ents if entity.label_ == 'PERSON'),
        None,
    )
    print(name)
    # Keep the token's text rather than the Token object, so the list holds
    # plain strings like the other columns; None when no e-mail is present.
    email = next((word.text for word in doc if word.like_email), None)
    print(email)
    lowered = text.lower()
    phone = str(_PHONE_NUM_RE.findall(lowered))
    # NOTE: the set's string form has no guaranteed element order.
    unique_skills_list = str(set(_SKILLSET_RE.findall(lowered)))
    names.append(name)
    emails.append(email)
    phones.append(phone)
    skills.append(unique_skills_list)
    print("Extraction completed successfully!!!")
# In[9]:
# Convert and parse every PDF found in the resumes/ directory.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# row order of the resulting table reproducible between runs.
for file in sorted(os.listdir('resumes/')):
    # Match the extension case-insensitively so "CV.PDF" is not skipped.
    if file.lower().endswith('.pdf'):
        print('Reading.....' + file)
        txt = convert_pdf(os.path.join('resumes/', file))
        parse_content(txt)
# In[10]:
# Attach the collected per-resume columns to the result dictionary.
result_dict['name'] = names
result_dict['phone'] = phones
result_dict['email'] = emails
result_dict['skills'] = skills
# In[11]:
# One row per parsed resume, one column per extracted field.
result_df = pd.DataFrame(result_dict)
# In[12]:
# Create the destination directory first so the export cannot fail on a
# missing folder after all resumes have already been processed.
os.makedirs('output/csv', exist_ok=True)
result_df.to_csv('output/csv/parsed_resumes.csv')