|
| 1 | +# -*- coding: UTF-8 -*- |
| 2 | + |
| 3 | +''' |
| 4 | +''' |
| 5 | +import requests |
| 6 | +import time |
| 7 | +import threading |
| 8 | +import urllib3 |
| 9 | +from fake_headers import Headers |
| 10 | +import uuid |
| 11 | +from geolite2 import geolite2 |
# Most recently fetched proxy list; GetIpThread.run rebinds it via `global ips`.
ips = list()
| 13 | + |
| 14 | +# 爬数据的线程类 |
| 15 | + |
def getChinaIP(ip='127.0.0.1'):
    """Return True when the GeoLite2 database places ``ip`` in China.

    Args:
        ip: IP address string to look up.

    Returns:
        bool: True only if the record's ``country.iso_code`` equals ``'CN'``.
        False when the database has no record for the address (the lookup
        returns ``None``, as it does for loopback/private addresses) or the
        record carries no country entry.
    """
    reader = geolite2.reader()
    try:
        ip_info = reader.get(ip)
    finally:
        # Release the reader even if the lookup itself raises.
        geolite2.close()
    print(ip_info)
    if not ip_info:
        # No record for this address: it cannot be confirmed as Chinese.
        return False
    country = ip_info.get('country') or {}
    return country.get('iso_code') == 'CN'
| 22 | + |
| 23 | + |
| 24 | + |
class CrawlThread(threading.Thread):
    """Thread that requests the module-level ``targetUrl`` through one proxy.

    Args:
        proxyip: Proxy address in ``host:port`` form.
    """

    def __init__(self, proxyip):
        super(CrawlThread, self).__init__()
        self.proxyip = proxyip

    def run(self):
        """Check the proxy's country, send one GET via the proxy, print the body.

        Reads the module-level name ``targetUrl``, which this file only
        assigns inside its ``__main__`` section.

        Raises:
            ValueError: if ``getChinaIP`` reports the proxy host is not in China.
            Exception: errors from ``requests`` (timeout, connection failure)
                and from decoding the body propagate to the caller.
        """
        pure_ip_address = self.proxyip.split(':')[0]
        # Verify where the IP is located; only Chinese proxies are used.
        if not getChinaIP(pure_ip_address):
            raise ValueError('不是有效IP')

        # Start timing; perf_counter is monotonic, so the interval cannot go
        # negative if the wall clock is adjusted mid-request.
        start = time.perf_counter()
        # Silence the warnings caused by disabling certificate verification.
        urllib3.disable_warnings()
        headers = Headers(headers=True).generate()
        headers['Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
        headers['Pragma'] = 'no-cache'
        headers['Host'] = 'bb.cf08tp.cn'
        headers['x-forward-for'] = pure_ip_address
        headers['Cookie'] = 'PHPSESSID={}'.format(
            ''.join(str(uuid.uuid1()).split('-')))
        print(headers)
        html = requests.get(
            headers=headers,
            url=targetUrl,
            proxies={
                "http": 'http://' + self.proxyip,
                "https": 'https://' + self.proxyip,
            },
            verify=False,
            timeout=2,
        ).content.decode()
        # Stop timing. The message labels the value as milliseconds, so
        # convert from the seconds that perf_counter reports.
        elapsed_ms = (time.perf_counter() - start) * 1000
        print(threading.current_thread().name + "使用代理IP, 耗时 " + str(round(elapsed_ms, 1)) +
              "毫秒 " + self.proxyip + " 获取到如下HTML内容:\n" + html + "\n*************")
| 56 | + |
| 57 | +# 获取代理IP的线程类 |
| 58 | + |
| 59 | + |
class GetIpThread(threading.Thread):
    """Thread that polls the proxy API and tries each returned proxy in turn.

    Args:
        fetchSecond: Divisor for the pause between polls; after each batch
            the loop sleeps ``len(ips) / fetchSecond`` seconds.
    """

    def __init__(self, fetchSecond):
        super(GetIpThread, self).__init__()
        self.fetchSecond = fetchSecond

    def run(self):
        """Loop forever: fetch proxies, run a CrawlThread for each, then sleep.

        Reads the module-level name ``apiUrl`` and rebinds the module-level
        list ``ips`` on every successful poll.
        """
        global ips
        while True:
            # Fetch the proxy list.
            try:
                res = requests.get(apiUrl, timeout=10).content.decode()
            except (requests.RequestException, UnicodeDecodeError) as e:
                # A failed poll must not end the thread; report it and retry
                # after a short pause instead of spinning.
                print(e)
                time.sleep(1.5)
                continue
            # The API returns one proxy per line.
            ips = res.split('\n')
            for proxyip in ips:
                # Drop surrounding whitespace (e.g. '\r' from CRLF responses)
                # so it does not end up inside the proxy URL.
                proxyip = proxyip.strip()
                if not proxyip:
                    continue
                try:
                    # run() is called directly rather than start(), so the
                    # proxies are processed one at a time in this thread.
                    CrawlThread(proxyip).run()
                    time.sleep(1.5)
                except Exception as e:
                    print(e)
            # Pause before the next poll.
            time.sleep(len(ips) / self.fetchSecond)
| 84 | + |
| 85 | + |
if __name__ == '__main__':
    # Target page requested through each proxy (read by CrawlThread.run).
    # Previously used: 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335608&id=2676&tp='
    targetUrl = "http://bb.cf08tp.cn/Home/index.php?m=Index&a=vote&vid=335688&id=2676&tp="
    # Proxy-pool endpoint polled by GetIpThread.run.
    # Alternative endpoint: "http://127.0.0.1:5555/all"
    apiUrl = "http://127.0.0.1:5555/random"
    # Divisor for the pause between polls (see GetIpThread).
    fetchSecond = 5
    # Start fetching proxies automatically.
    fetcher = GetIpThread(fetchSecond)
    fetcher.start()
0 commit comments