
Commit 4878bf5

everhopingandwaitingjy and 崔庆才丨静觅 authored

Add several proxy sources and optimize parts of the code (#108)

* Create ip89.py — www.89ip.cn free proxies
* Update ip89.py — rename the crawler class
* Create fatezero_proxylist.py — add http://proxylist.fatezero.org/ proxies
* Create ihuan.py — add ihuan (i幻) proxies
* Update example usage2
* Update requirements.txt
* Optimize the public crawlers
* Add the jiangxianli proxy source
* Add a single-proxy test method to the tester
* Reset setting, Dockerfile and docker-compose to defaults

Co-authored-by: jy <[email protected]>
Co-authored-by: 崔庆才丨静觅 <[email protected]>
1 parent cf03d87 commit 4878bf5

16 files changed, +231 -29 lines changed

Dockerfile

+2 -1

@@ -1,6 +1,7 @@
 FROM python:3.6
 WORKDIR /app
 COPY . .
-RUN pip install -r requirements.txt
+# RUN pip install -r requirements.txt -i https://pypi.douban.com/simple
+RUN pip install -r requirements.txt -i
 VOLUME ["/app/proxypool/crawlers/private"]
 CMD ["supervisord", "-c", "supervisord.conf"]

docker-compose.yml

+1 -1

@@ -6,7 +6,7 @@ services:
     command: redis-server
     ports:
       - "6379:6379"
-    # restart: always
+    # restart: always
   proxypool:
     build: .
     image: 'germey/proxypool'

examples/usage2.py

+95

@@ -0,0 +1,95 @@
+# -*- coding: UTF-8 -*-
+
+'''
+'''
+import requests
+import time
+import threading
+import urllib3
+from fake_headers import Headers
+import uuid
+from geolite2 import geolite2
+ips = []
+
+# thread class that crawls data
+
+
+def getChinaIP(ip='127.0.0.1'):
+    reader = geolite2.reader()
+    ip_info = reader.get(ip)
+    geolite2.close()
+    print(ip_info)
+    return True if ip_info['country']['iso_code'] == 'CN' else False
+
+
+class CrawlThread(threading.Thread):
+    def __init__(self, proxyip):
+        super(CrawlThread, self).__init__()
+        self.proxyip = proxyip
+
+    def run(self):
+        # start timing
+        pure_ip_address = self.proxyip.split(':')[0]
+        # check where the IP is located
+        if not getChinaIP(pure_ip_address):
+            # pass
+            raise ValueError('not a valid IP')
+        start = time.time()
+        # silence the warning about disabled certificate verification
+        urllib3.disable_warnings()
+        headers = Headers(headers=True).generate()
+        headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
+        headers['Pragma'] = 'no-cache'
+        headers['Host'] = 'bb.cf08tp.cn'
+        headers['x-forward-for'] = pure_ip_address
+        headers['Cookie'] = 'PHPSESSID={}'.format(
+            ''.join(str(uuid.uuid1()).split('-')))
+        print(headers)
+        html = requests.get(headers=headers, url=targetUrl, proxies={
+            "http": 'http://' + self.proxyip, "https": 'https://' + self.proxyip}, verify=False, timeout=2).content.decode()
+        # stop timing
+        end = time.time()
+        # print the result
+        print(threading.current_thread().getName() + " via proxy IP, took " + str(end - start) +
+              " seconds " + self.proxyip + " got the following HTML:\n" + html + "\n*************")
+
+
+# thread class that fetches proxy IPs
+
+
+class GetIpThread(threading.Thread):
+    def __init__(self, fetchSecond):
+        super(GetIpThread, self).__init__()
+        self.fetchSecond = fetchSecond
+
+    def run(self):
+        global ips
+        while True:
+            # fetch the IP list
+            res = requests.get(apiUrl).content.decode()
+            # split the returned IPs on \n
+            ips = res.split('\n')
+            # use each IP in turn
+            for proxyip in ips:
+                if proxyip.strip():
+                    # start a thread
+                    # CrawlThread(proxyip).start()
+                    try:
+                        CrawlThread(proxyip).run()
+                        time.sleep(1.5)
+                    except Exception as e:
+                        print(e)
+            # sleep before the next fetch
+            time.sleep(len(ips) / self.fetchSecond)
+
+
+if __name__ == '__main__':
+    # API endpoint that returns proxy IPs
+    # apiUrl = "http://127.0.0.1:5555/all"
+    apiUrl = "http://127.0.0.1:5555/random"
+    # URL of the target site to crawl
+    targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
+    # targetUrl = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp='
+    fetchSecond = 5
+    # start fetching IPs automatically
+    GetIpThread(fetchSecond).start()
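Note: for a quicker smoke test than the threaded example above, the same flow can be reduced to a few lines. This is only a sketch under the example's own assumptions — the pool's web server running locally on the default http://127.0.0.1:5555, with httpbin.org used here as a stand-in target:

import requests

PROXYPOOL_API = 'http://127.0.0.1:5555/random'  # assumed default server address

def fetch_via_random_proxy(url):
    # ask the pool for one proxy, e.g. '8.8.8.8:8888'
    proxy = requests.get(PROXYPOOL_API).text.strip()
    proxies = {'http': 'http://' + proxy, 'https': 'https://' + proxy}
    # short timeout: free proxies are frequently dead
    return requests.get(url, proxies=proxies, timeout=5).text

if __name__ == '__main__':
    print(fetch_via_random_proxy('https://httpbin.org/ip'))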

proxypool/crawlers/base.py

+6 -3

@@ -2,17 +2,19 @@
 import requests
 from loguru import logger
 from proxypool.setting import GET_TIMEOUT
-
-
+from fake_headers import Headers
+import time
 class BaseCrawler(object):
     urls = []

     @retry(stop_max_attempt_number=3, retry_on_result=lambda x: x is None, wait_fixed=2000)
     def fetch(self, url, **kwargs):
         try:
+            headers = Headers(headers=True).generate()
             kwargs.setdefault('timeout', GET_TIMEOUT)
             kwargs.setdefault('verify', False)
-            response = requests.get(url, **kwargs)
+            kwargs.setdefault('headers', headers)
+            response = requests.get(url, **kwargs)
             if response.status_code == 200:
                 response.encoding = 'utf-8'
                 return response.text
@@ -27,6 +29,7 @@ def crawl(self):
         for url in self.urls:
             logger.info(f'fetching {url}')
             html = self.fetch(url)
+            time.sleep(.5)
             for proxy in self.parse(html):
                 logger.info(f'fetched proxy {proxy.string()} from {url}')
                 yield proxy
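Note: the effect of this change is that every crawler request now carries randomized browser-like headers generated by fake_headers, and each page fetch is followed by a 0.5 s pause. A minimal sketch of the same idea in isolation (example.com is just a placeholder URL):

import time
import requests
from fake_headers import Headers

headers = Headers(headers=True).generate()  # random User-Agent, Accept, etc.
response = requests.get('http://example.com', headers=headers, timeout=10, verify=False)
print(response.status_code)
time.sleep(0.5)  # be polite between consecutive fetches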

proxypool/crawlers/public/daili66.py

+1 -1

@@ -4,7 +4,7 @@


 BASE_URL = 'http://www.66ip.cn/{page}.html'
-MAX_PAGE = 5
+MAX_PAGE = 50


 class Daili66Crawler(BaseCrawler):

proxypool/crawlers/public/fatezero_proxylist.py

+2 -3

@@ -19,13 +19,12 @@ def parse(self, html):

         hosts_ports = html.split('\n')
         for addr in hosts_ports:
-            ip_address = json.loads(addr)
-            if(True):
+            if(addr):
+                ip_address = json.loads(addr)
                 host = ip_address['host']
                 port = ip_address['port']
                 yield Proxy(host=host, port=port)

-
 if __name__ == '__main__':
     crawler = FatezeroCrawler()
     for proxy in crawler.crawl():
proxypool/crawlers/public/goubanjia.py

+44

@@ -0,0 +1,44 @@
+from proxypool.schemas.proxy import Proxy
+from proxypool.crawlers.base import BaseCrawler
+import re
+from pyquery import PyQuery as pq
+import time
+BASE_URL = 'http://www.goubanjia.com/'
+
+
+class GoubanjiaCrawler(BaseCrawler):
+    """
+    ip Goubanjia crawler, http://www.goubanjia.com/
+    """
+    urls = [BASE_URL]
+
+    def parse(self, html):
+        """
+        parse html file to get proxies
+        :return:
+        """
+        doc = pq(html)('.ip').items()
+        # ''.join([*filter(lambda x: x != '', re.compile('\>([\d:\.]*)\<').findall(td.html()))])
+        for td in doc:
+            trs = td.children()
+            ip_str = ''
+            for tr in trs:
+                attrib = tr.attrib
+                if 'style' in attrib and 'none' in tr.attrib['style']:
+                    continue
+                ip_str += '' if not tr.text else tr.text
+            addr_split = ip_str.split(':')
+            if(len(addr_split) == 2):
+                host = addr_split[0]
+                port = addr_split[1]
+                yield Proxy(host=host, port=port)
+            else:
+                port = trs[-1].text
+                host = ip_str.replace(port, '')
+                yield Proxy(host=host, port=port)
+
+
+if __name__ == '__main__':
+    crawler = GoubanjiaCrawler()
+    for proxy in crawler.crawl():
+        print(proxy)
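Note: the parser above works around goubanjia's obfuscation — decoy digits are injected in child elements hidden with style="display: none", so only the visible children are concatenated. A standalone sketch of that idea against hypothetical markup (the HTML below is invented for illustration):

from pyquery import PyQuery as pq

# hypothetical markup in the style the parser expects
html = ('<div><p class="ip"><span>1.2</span>'
        '<span style="display: none">9.9</span>'
        '<span>.3.4:8080</span></p></div>')
ip_str = ''
for el in pq(html)('.ip').children():
    # skip decoy children hidden via display: none
    if 'style' in el.attrib and 'none' in el.attrib['style']:
        continue
    ip_str += el.text or ''
print(ip_str)  # 1.2.3.4:8080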

proxypool/crawlers/public/ihuan.py

+3 -2

@@ -10,8 +10,9 @@ class IhuanCrawler(BaseCrawler):
     """
     ip ihuan crawler, https://ip.ihuan.me
     """
-    urls = [BASE_URL.format(path=time.strftime("%Y/%m/%d/%H", time.localtime()))]
-
+    path = time.strftime("%Y/%m/%d/%H", time.localtime())
+    urls = [BASE_URL.format(path=path)]
+    ignore = False
     def parse(self, html):
         """
         parse html file to get proxies

proxypool/crawlers/public/ip3366.py

+3 -3

@@ -3,15 +3,15 @@
 import re


-MAX_PAGE = 5
-BASE_URL = 'http://www.ip3366.net/free/?stype=1&page={page}'
+MAX_PAGE = 8
+BASE_URL = 'http://www.ip3366.net/free/?stype={stype}&page={page}'


 class IP3366Crawler(BaseCrawler):
     """
     ip3366 crawler, http://www.ip3366.net/
     """
-    urls = [BASE_URL.format(page=i) for i in range(1, 8)]
+    urls = [BASE_URL.format(stype=stype, page=i) for stype in range(1, 3) for i in range(1, 8)]

     def parse(self, html):
         """
proxypool/crawlers/public/jiangxianli.py

+35

@@ -0,0 +1,35 @@
+from proxypool.schemas.proxy import Proxy
+from proxypool.crawlers.base import BaseCrawler
+import re
+import json
+BASE_URL = 'https://ip.jiangxianli.com/api/proxy_ips?page={page}'
+
+MAX_PAGE = 10
+class JiangxianliCrawler(BaseCrawler):
+    """
+    jiangxianli crawler, https://ip.jiangxianli.com/
+    """
+    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
+
+    def parse(self, html):
+        """
+        parse html file to get proxies
+        :return:
+        """
+
+        result = json.loads(html)
+        if result['code'] != 0:
+            return
+        MAX_PAGE = int(result['data']['last_page'])
+        hosts_ports = result['data']['data']
+        for ip_address in hosts_ports:
+            if(ip_address):
+                host = ip_address['ip']
+                port = ip_address['port']
+                yield Proxy(host=host, port=port)
+
+
+if __name__ == '__main__':
+    crawler = JiangxianliCrawler()
+    for proxy in crawler.crawl():
+        print(proxy)
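Note: parse() above expects the jiangxianli API to return JSON with a code field, data.last_page, and a data.data list of records carrying ip and port. A sketch with an invented sample payload shows which fields are actually read:

sample = {
    'code': 0,
    'data': {
        'last_page': 1,
        'data': [
            {'ip': '1.2.3.4', 'port': 8080},   # invented values
        ],
    },
}

if sample['code'] == 0:
    for item in sample['data']['data']:
        print(item['ip'], item['port'])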

proxypool/crawlers/public/kuaidaili.py

+3 -3

@@ -4,15 +4,15 @@
 from pyquery import PyQuery as pq


-BASE_URL = 'https://www.kuaidaili.com/free/inha/{page}/'
-MAX_PAGE = 5
+BASE_URL = 'https://www.kuaidaili.com/free/{type}/{page}/'
+MAX_PAGE = 300


 class KuaidailiCrawler(BaseCrawler):
     """
     kuaidaili crawler, https://www.kuaidaili.com/
     """
-    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]
+    urls = [BASE_URL.format(type=type, page=page) for type in ('intr', 'inha') for page in range(1, MAX_PAGE + 1)]

     def parse(self, html):
         """

proxypool/crawlers/public/zhandaye.py

+1 -1

@@ -6,7 +6,7 @@


 BASE_URL = 'https://www.zdaye.com/dayProxy/{page}.html'
-MAX_PAGE = 5
+MAX_PAGE = 5 * 2

 class ZhandayeCrawler(BaseCrawler):
     """

proxypool/processors/server.py

+15

@@ -37,6 +37,21 @@ def get_proxy():
     return conn.random().string()


+@app.route('/all')
+def get_proxy_all():
+    """
+    get all proxies
+    :return: all proxies as a newline-separated string
+    """
+    conn = get_conn()
+    proxies = conn.all()
+    proxies_string = ''
+    for proxy in proxies:
+        proxies_string += str(proxy) + '\n'
+
+    return proxies_string
+
+
 @app.route('/count')
 def get_count():
     """

proxypool/processors/tester.py

+7

@@ -84,7 +84,14 @@ def run(self):
             if not cursor:
                 break

+def run_tester():
+    host = '96.113.165.182'
+    port = '3128'
+    tasks = [tester.test(Proxy(host=host, port=port))]
+    tester.loop.run_until_complete(asyncio.wait(tasks))

 if __name__ == '__main__':
     tester = Tester()
     tester.run()
+    # run_tester()
+
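Note: run_tester() is wired in behind a comment. To spot-check the single hard-coded proxy instead of running the full batch test, swap the two calls in the __main__ block — a sketch, with the address being just the placeholder from the commit:

if __name__ == '__main__':
    tester = Tester()
    # tester.run()
    run_tester()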

proxypool/storages/redis.py

+2 -2

@@ -51,11 +51,11 @@ def random(self) -> Proxy:
         :return: proxy, like 8.8.8.8:8
         """
         # try to get proxy with max score
-        proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX)
+        proxies = self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MAX , PROXY_SCORE_MAX)
         if len(proxies):
             return convert_proxy_or_proxies(choice(proxies))
         # else get proxy by rank
-        proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)
+        proxies = self.db.zrevrange(REDIS_KEY, PROXY_SCORE_MIN , PROXY_SCORE_MAX)
         if len(proxies):
             return convert_proxy_or_proxies(choice(proxies))
         # else raise error

requirements.txt

+11 -9

@@ -1,11 +1,13 @@
-environs==7.2.0
-Flask==1.0.3
-attrs==19.1.0
+environs==9.3.0
+Flask==1.1.2
+attrs==20.3.0
 retrying==1.3.3
 aiohttp==3.7.4
-requests==2.22.0
-loguru==0.3.2
-pyquery==1.4.0
-supervisor==4.1.0
-redis==2.10.6
-lxml==4.6.2
+requests==2.25.1
+loguru==0.5.3
+pyquery==1.4.3
+supervisor==4.2.1
+redis==3.5.3
+lxml==4.6.2
+fake_headers==1.0.2
+maxminddb_geolite2==2018.703
