scrap_cs.py
"""
==========================================
Title: Web scraping academic staff profiles (static site)
Site: https://cs.usm.my/index.php/about/our-people/facultycs-member
Author: Teoh Sin Yee
Date: 11 Aug 2022
==========================================
"""
import requests
import pandas as pd
from collections import OrderedDict
from bs4 import BeautifulSoup, NavigableString, Tag
baseurl = "https://cs.usm.my"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36'
}
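
# Optional hardening (sketch, not used below): a shared requests.Session with a
# timeout would avoid hanging on a slow page and reuse the TCP connection, e.g.:
#   session = requests.Session()
#   session.headers.update(headers)
#   r = session.get(url, timeout=10)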
# Fetch the faculty listing page and collect the links to individual profiles
r = requests.get('https://cs.usm.my/index.php/about/our-people/facultycs-member', headers=headers)
soup = BeautifulSoup(r.content, 'lxml')

lecturer_list = soup.find_all('p')
lecturer_links = []
for item in lecturer_list:
    for link in item.find_all('a', href=True):
        lecturer_links.append(baseurl + link['href'])

# The first 16 links on the page are not lecturer profiles, so skip them
lecturer_links = lecturer_links[16:]
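
# Quick check (optional): report how many profile links were collected
print('profile links found:', len(lecturer_links))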
lecturer_profile = []
for link in lecturer_links:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    profile_description = OrderedDict()

    name = soup.find('h1', class_="uk-article-title").text.strip()
    title = soup.find('strong').text.strip()

    classname_research = "uk-grid uk-grid-divider uk-grid-width-1-1 uk-grid-width-medium-1-3 uk-grid-width-large-1-3"
    # email = soup.find('ul', class_=classname_research).find_all('p', class_=False)[0]
    # -- email can't be scraped because of spam protection --
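    # Possible workaround (untested sketch): the emails are likely cloaked by
    # JavaScript, so rendering the page in a real browser would be needed, e.g.
    # with selenium:
    #   from selenium import webdriver
    #   driver = webdriver.Chrome()
    #   driver.get(link)
    #   rendered = BeautifulSoup(driver.page_source, 'lxml')
    # The rendered DOM would then expose the decoded mailto: links.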
    try:
        research_cluster = soup.find('ul', class_=classname_research).find_all('p', class_=False)[2].text
        research_interest = soup.find('ul', class_=classname_research).find_all('p', class_=False)[3].text
        specialization = soup.find('ul', class_=classname_research).find_all('p', class_=False)[4].text
    except (AttributeError, IndexError):
        # Profile page lacks the research-info list; fall back to placeholders
        research_cluster = research_interest = specialization = "missing info"
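    # Sturdier alternative (sketch): the fixed indices [2]-[4] above assume the
    # field order on every profile page never changes; matching on each item's
    # label text would survive layout changes, e.g.:
    #   for li in soup.find('ul', class_=classname_research).find_all('li'):
    #       label = li.find('h5')
    #       ...  # map label text such as 'Research Cluster' to its sibling <p>
    # (the exact tags used for the labels on this site are an assumption)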
    profile_description.update({
        'Name': name,
        'Title': title,
        'Research Cluster': research_cluster,
        'Research Interest': research_interest,
        'Specialization': specialization
    })
    # Walk the article body: each h3/h5 header labels a section whose text is
    # collected from the sibling nodes that follow it
    profile_details = soup.find('div', class_="tm-article-content")
    all_headers = ['h3', 'h5']
    for header in profile_details.find_all(all_headers)[0:2]:
        info_get = header.get_text().strip()
        nextNode = header
        profile_details_text = []
        while True:
            nextNode = nextNode.next_sibling
            # End of the document: store the text collected for the last header
            if not nextNode:
                profile_description[info_get] = "\n".join(profile_details_text)
                break
            # Plain text between tags belongs to the current header's section
            elif isinstance(nextNode, NavigableString):
                if nextNode.strip():
                    profile_details_text.append(nextNode.strip())
            # The next h3/h5 closes the current section; store its text
            elif isinstance(nextNode, Tag):
                if nextNode.name in all_headers:
                    profile_description[info_get] = "\n".join(profile_details_text)
                    break
                profile_details_text.append(nextNode.get_text(strip=True))

    lecturer_profile.append(profile_description)
    print('saving:', profile_description['Name'])
    print('keys:', profile_description.keys())
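    # Politeness (optional sketch): pausing between requests would keep the
    # load on the site low, e.g.:
    #   import time
    #   time.sleep(1)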
df_profile = pd.DataFrame(lecturer_profile)
print(df_profile)

names = df_profile['Name'].tolist()
source_url = lecturer_links
# df_profile.to_csv('lecturer_profile.csv', index=None, header=True)

# Save a name -> source-URL directory ('URL' is our label for the link column)
directory = pd.DataFrame(list(zip(names, source_url)), columns=['Name', 'URL'])
directory.to_csv('directory.csv', index=None, header=True)
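
# Optional sanity check (sketch): reload the saved CSV and confirm one row was
# written per scraped profile link
check = pd.read_csv('directory.csv')
assert len(check) == len(lecturer_links), "unexpected row count in directory.csv"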