-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathspider.py
227 lines (203 loc) · 7.66 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
from selenium.webdriver import Chrome, ChromeOptions
import requests
import pymysql
import time
import json
import traceback
import sys
def get_conn():
    """Open a connection to the local ``cov`` MySQL database.

    :return: a ``(conn, cursor)`` tuple: the pymysql connection and a cursor
        created from it.
    """
    # Connection settings are hard-coded for a local development database.
    settings = {
        "host": "localhost",
        "user": "root",
        "password": "123456",
        "db": "cov",
        "charset": "utf8",
    }
    connection = pymysql.connect(**settings)
    # Original author's note: the default cursor returns result rows as tuples.
    return connection, connection.cursor()
def close_conn(conn, cursor):
    """Close the cursor first and then the connection, skipping falsy ones.

    The connection is closed even when closing the cursor raises, so a
    failing cursor can no longer leak the connection. Any exception raised
    by either ``close()`` call still propagates to the caller.

    :param conn: object exposing ``close()``, or ``None``.
    :param cursor: object exposing ``close()``, or ``None``.
    :return: None
    """
    try:
        if cursor:
            cursor.close()
    finally:
        # Runs regardless of what cursor.close() did.
        if conn:
            conn.close()
def _to_iso_date(month_day, year="2020"):
    """Convert an ``MM.DD`` string into ``YYYY-MM-DD`` for the given year."""
    parsed = time.strptime(f"{year}.{month_day}", "%Y.%m.%d")
    return time.strftime("%Y-%m-%d", parsed)


def get_tencent_data():
    """Download the Tencent ``disease_h5`` feed and reshape it.

    :return: a ``(history, details)`` tuple.

        * ``history`` maps a ``YYYY-MM-DD`` date string to a dict with the
          keys ``confirm``, ``suspect``, ``heal``, ``dead`` (taken from
          ``chinaDayList``) and ``confirm_add``, ``suspect_add``,
          ``heal_add``, ``dead_add`` (taken from ``chinaDayAddList``).
        * ``details`` is a list of
          ``[update_time, province, city, confirm, confirm_add, heal, dead]``
          rows, one per city.
    :raises: whatever ``requests`` / ``json`` raise on network or decoding
        failures, and ``KeyError`` if the payload lacks an expected field.
    """
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5'
    headers = {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Mobile Safari/537.36',
        # No leading space in the value: requests rejects header values that
        # start with whitespace.
        'Referer': 'https://news.qq.com/zt2020/page/feiyan.htm',
    }
    # headers must be passed by keyword; the second positional parameter of
    # requests.get() is `params`, so the headers were previously sent as a
    # query string instead. A timeout keeps a stalled server from hanging
    # the script forever.
    r = requests.get(url, headers=headers, timeout=10)
    res = json.loads(r.text)  # JSON string -> dict
    # The "data" field is itself a JSON-encoded string, hence the second decode.
    data_all = json.loads(res['data'])

    # NOTE(review): the year is hard-coded to 2020 because the feed's "date"
    # field only carries month and day; confirm this for later data.
    # Original author's note: the date is reformatted because the database
    # column is a datetime type and the raw format fails on insert.
    history = {}  # cumulative figures per day
    for day in data_all["chinaDayList"]:
        history[_to_iso_date(day["date"])] = {
            "confirm": day["confirm"],
            "suspect": day["suspect"],
            "heal": day["heal"],
            "dead": day["dead"],
        }
    for day in data_all["chinaDayAddList"]:
        # setdefault: a date present only in chinaDayAddList must not raise
        # KeyError; its cumulative fields are simply absent.
        history.setdefault(_to_iso_date(day["date"]), {}).update({
            "confirm_add": day["confirm"],
            "suspect_add": day["suspect"],
            "heal_add": day["heal"],
            "dead_add": day["dead"],
        })

    details = []  # per-city figures for the latest update
    update_time = data_all["lastUpdateTime"]
    # Per the original comments, areaTree is a list of countries and its
    # first entry holds the Chinese provinces as children.
    data_province = data_all["areaTree"][0]["children"]
    for pro_infos in data_province:
        province = pro_infos["name"]
        for city_infos in pro_infos["children"]:
            total = city_infos["total"]
            details.append([
                update_time,
                province,
                city_infos["name"],
                total["confirm"],
                city_infos["today"]["confirm"],
                total["heal"],
                total["dead"],
            ])
    return history, details
def get_baidu_hot():
    """Scrape the hot-search entries from Baidu's epidemic page.

    Starts a headless Chrome, opens the page, clicks the element located by
    the first XPath, waits one second and collects the text of every element
    matched by the second XPath. The browser is always shut down afterwards.

    :return: list of the collected text strings.
    """
    option = ChromeOptions()  # options for the Chrome instance
    option.add_argument("--headless")  # run without a visible window
    option.add_argument('--no-sandbox')
    url = "https://voice.baidu.com/act/virussearch/virussearch?from=osari_map&tab=0&infomore=1"
    # "\\Chrome" replaces the invalid escape "\C"; the runtime string is
    # unchanged.
    # NOTE(review): the path starts with ".C:", which looks like a typo for
    # "C:"; left as-is, confirm against the deployment machine.
    browser = Chrome(options=option, executable_path=".C:\\Program Files (x86)\\Google\\Chrome\\Application/chromedriver.exe")
    try:
        browser.get(url)
        # Locate and click the "expand" button (per the original comment).
        dl = browser.find_element_by_xpath('//*[@id="main"]/div/div/section/div[2]/div/div[2]/section/div')
        dl.click()
        time.sleep(1)  # presumably gives the page time to render the expanded list
        # Locate the hot-search labels.
        c = browser.find_elements_by_xpath('//*[@id="main"]/div/div/section/div[2]/div/div[2]/section/a/div/span[2]')
        context = [i.text for i in c]  # text of each label
    finally:
        # Without quit() every call leaves a Chrome/chromedriver process behind.
        browser.quit()
    print(context)
    return context
def update_hotsearch():
    """Insert the current Baidu hot-search entries into the ``hotsearch`` table.

    All entries of one run share the same timestamp. Failures are reported
    with ``traceback.print_exc()`` and not re-raised; the connection is
    closed in every case.

    :return: None
    """
    cursor = None
    conn = None
    try:
        context = get_baidu_hot()
        print(f"{time.asctime()}开始更新热搜数据")
        conn, cursor = get_conn()
        sql = "insert into hotsearch(dt,content) values(%s,%s)"
        # Explicit %H:%M:%S instead of the locale-dependent %X, so the value
        # is always in a form the database accepts.
        ts = time.strftime("%Y-%m-%d %H:%M:%S")
        for i in context:
            cursor.execute(sql, (ts, i))  # insert one entry
        conn.commit()  # commit the transaction to persist the rows
        print(f"{time.asctime()}数据更新完毕")
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and SystemExit are
        # not swallowed.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
def update_details():
    """Insert the latest per-city rows into the ``details`` table.

    The fetched update time is compared with the ``update_time`` of the most
    recently inserted row (highest ``id``). Rows are inserted only when the
    comparison is not true. Failures are reported with
    ``traceback.print_exc()`` and not re-raised; the connection is closed in
    every case.

    :return: None
    """
    cursor = None
    conn = None
    try:
        li = get_tencent_data()[1]  # index 0: history dict, index 1: detail rows
        conn, cursor = get_conn()
        sql = "insert into details(update_time,province,city,confirm,confirm_add,heal,dead) values(%s,%s,%s,%s,%s,%s,%s)"
        # Compare the fetched timestamp with the newest stored one.
        sql_query = 'select %s=(select update_time from details order by id desc limit 1)'
        cursor.execute(sql_query, (li[0][0],))
        if not cursor.fetchone()[0]:
            print(f"{time.asctime()}开始更新最新数据")
            # One batched call instead of one execute() per row.
            cursor.executemany(sql, li)
            conn.commit()  # commit the inserts
            print(f"{time.asctime()}更新最新数据完毕")
        else:
            print(f"{time.asctime()}已是最新数据!")
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and SystemExit are
        # not swallowed.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
def insert_history():
    """Insert every fetched history day into the ``history`` table.

    No check for existing rows is made. Failures are reported with
    ``traceback.print_exc()`` and not re-raised; the connection is closed in
    every case.

    :return: None
    """
    cursor = None
    conn = None
    try:
        dic = get_tencent_data()[0]  # index 0: history dict, index 1: detail rows
        print(f"{time.asctime()}开始插入历史数据")
        conn, cursor = get_conn()
        # NOTE(review): no column list, so the value order below must match
        # the table's column order; verify against the schema.
        sql = "insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # Entry shape: {'2020-01-13': {'confirm': 41, 'suspect': 0, 'heal': 0, 'dead': 1}}
        # .get() yields None (SQL NULL) for fields missing from a day.
        rows = [
            [k, v.get("confirm"), v.get("confirm_add"), v.get("suspect"),
             v.get("suspect_add"), v.get("heal"), v.get("heal_add"),
             v.get("dead"), v.get("dead_add")]
            for k, v in dic.items()
        ]
        # One batched call instead of one execute() per day.
        cursor.executemany(sql, rows)
        conn.commit()  # commit the inserts
        print(f"{time.asctime()}插入历史数据完毕")
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and SystemExit are
        # not swallowed.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
def update_history():
    """Insert history days that are not yet present in the ``history`` table.

    For every fetched date a lookup by ``ds`` is run; the row is inserted only
    when that lookup reports no rows. Failures are reported with
    ``traceback.print_exc()`` and not re-raised; the connection is closed in
    every case.

    :return: None
    """
    cursor = None
    conn = None
    try:
        dic = get_tencent_data()[0]  # index 0: history dict, index 1: detail rows
        print(f"{time.asctime()}开始更新历史数据")
        conn, cursor = get_conn()
        # NOTE(review): no column list, so the value order below must match
        # the table's column order; verify against the schema.
        sql = "insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        sql_query = "select confirm from history where ds=%s"
        for k, v in dic.items():
            # Entry shape: {'2020-01-13': {'confirm': 41, 'suspect': 0, 'heal': 0, 'dead': 1}}
            # execute() returns a row count; a truthy count means the date
            # is already stored, so it is skipped.
            if cursor.execute(sql_query, k):
                continue
            cursor.execute(sql, [k, v.get("confirm"), v.get("confirm_add"), v.get("suspect"),
                                 v.get("suspect_add"), v.get("heal"), v.get("heal_add"),
                                 v.get("dead"), v.get("dead_add")])
        conn.commit()  # commit the inserts
        print(f"{time.asctime()}历史数据更新完毕")
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and SystemExit are
        # not swallowed.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
if __name__ == "__main__":
    # Command-line entry point: the first argument selects which table to
    # refresh. With no argument, or with one that is not recognized, the
    # usage text is printed instead of silently doing nothing.
    usage = """
请输入参数
参数说明:
up_his 更新历史记录表
up_hot 更新实时热搜
up_det 更新详细表
"""
    commands = {
        "up_his": update_history,
        "up_det": update_details,
        "up_hot": update_hotsearch,
    }
    handler = commands.get(sys.argv[1]) if len(sys.argv) > 1 else None
    if handler is None:
        print(usage)
    else:
        handler()