get_website.py (forked from imfht/ScanSql)
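"""Scrapers for top.chinaz.com: Fuck_main searches ranked sites by keyword,
get_All walks the industry (hangye) index pages and prints every listed site."""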
import re
from urllib import parse

import requests
from bs4 import BeautifulSoup


def Fuck_Href(href):
    """Extract the site id from a link of the form 'site_<id>.html'."""
    pattern = re.compile(r'site_(.*?)\.html')
    return re.findall(pattern, href)[0]


def Fuck_main(keyWord):
    """Search top.chinaz.com for keyWord and print every site in the results."""
    query = parse.quote(str(keyWord))
    req = requests.get("http://search.top.chinaz.com/Search.aspx?p=1&url=" + query)
    # The results page reports the hit count as "相关的结果N条" ("N related results").
    pattern = re.compile("相关的结果(.*?)条")
    total = int(re.findall(pattern, req.text)[0])
    print(total)
    page = total // 30 + 1  # 30 results per page
    print(page)
    for i in range(1, page + 1):
        req = requests.get("http://search.top.chinaz.com/Search.aspx?p=%d&url=" % i + query)
        soup = BeautifulSoup(req.text, 'html.parser')
        for link in soup.find_all(class_='pr10 fz14'):
            print(Fuck_Href(link['href']), link.text)  # print the site id and its name


def get_All():
    """Walk the industry index pages and print every site listed on them."""
    for i in range(1, 1680):
        try:
            req = requests.get('http://top.chinaz.com/hangye/index_%d.html' % i)
            pattern = re.compile('<span class="col-gray">(.*?)</span>')
            for site in re.findall(pattern, req.text):
                print(site)
        except Exception as e:
            print(e)
            continue


# Fuck_main('山东')  # example: search for sites related to "山东" (Shandong)
get_All()