Skip to content

Commit

Permalink
Merge pull request #68 from my-dev-app/scraper/additional-parsers
Browse files Browse the repository at this point in the history
Scraper/additional parsers
  • Loading branch information
0x78f1935 authored Feb 14, 2024
2 parents 2239247 + b5c8a13 commit cff511a
Show file tree
Hide file tree
Showing 10 changed files with 356 additions and 4 deletions.
3 changes: 3 additions & 0 deletions aproxyrelay/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ async def get_proxies(self) -> None:
self.logger.info('Scraper: Skip discovery of new proxy servers ...')

if self.filter and self.scrape:
self.logger.info(
f'Validating: Proxies ({self._queue_filter.qsize()}), checking if proxies meet connection requirements ...'
)
async with ClientSession(conn_timeout=15) as session:
await self._test_all_proxies(session)
self.logger.info(f'Filter: Found {self._filtered_failed} incompetent and {self._filtered_available} available proxy servers in {datetime.now(UTC) - self.started}') # noqa: B950
Expand Down
2 changes: 2 additions & 0 deletions aproxyrelay/req.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ async def _test_all_proxies(self, session):
_target['proxy'] = f"{_target['protocol'].replace('https', 'http')}://{_target['ip']}:{_target['port']}"
to_filter.append(_target)

# Remove duplicate entries
to_filter = [dict(x) for x in list(set([tuple(item.items()) for item in to_filter]))]
tasks = [self._test_proxy_link(proxy['proxy'], proxy, session) for proxy in to_filter]
await gather(*tasks)

Expand Down
30 changes: 30 additions & 0 deletions aproxyrelay/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@
from .parser_spys_nl import ParserSpysNL
from .parser_spys_us import ParserSpysUS
from .parser_ssl_proxies import ParserSSLProxies
from .parser_sunny9577_proxy_scraper import ParserSunnyProxyScraper
from .parser_roosterkid_openproxylist_socks4 import ParserRoosterkidOpenproxylistSocks4
from .parser_roosterkid_openproxylist_socks5 import ParserRoosterkidOpenproxylistSocks5
from .parser_murongpig_proxy_master_http import ParserMurongpigProxyMasterHttp
from .parser_murongpig_proxy_master_socks4 import ParserMurongpigProxyMasterSocks4
from .parser_murongpig_proxy_master_socks5 import ParserMurongpigProxyMasterSocks5


proxy_list = [
Expand Down Expand Up @@ -78,4 +84,28 @@
'url': 'https://gg.my-dev.app/api/v1/proxies/available?zone=nl&anonimity=all&protocol=all&page=1&size=1000',
'parser': ParserGGMyDevApp,
},
{
'url': 'https://raw.githubusercontent.com/sunny9577/proxy-scraper/master/proxies.json',
'parser': ParserSunnyProxyScraper,
},
{
'url': 'https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS4_RAW.txt',
'parser': ParserRoosterkidOpenproxylistSocks4,
},
{
'url': 'https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS5_RAW.txt',
'parser': ParserRoosterkidOpenproxylistSocks5,
},
{
'url': 'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/http.txt',
'parser': ParserMurongpigProxyMasterHttp,
},
{
'url': 'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/socks4.txt',
'parser': ParserMurongpigProxyMasterSocks4,
},
{
'url': 'https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/socks5.txt',
'parser': ParserMurongpigProxyMasterSocks5,
},
]
47 changes: 47 additions & 0 deletions aproxyrelay/scrapers/parser_murongpig_proxy_master_http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# -*- mode: python ; coding: utf-8 -*-
"""
░░ ░░ ░░ ░░░ ░░ ░░░░ ░ ░░░░ ░ ░░ ░ ░░░░░░░░ ░░ ░░░░ ░
▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒▒ ▒▒ ▒▒▒ ▒▒ ▒▒ ▒▒▒▒ ▒ ▒▒▒▒▒▒▒ ▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒ ▒▒
▓ ▓▓▓▓ ▓ ▓▓ ▓▓ ▓▓▓▓ ▓▓▓ ▓▓▓▓▓ ▓▓▓ ▓▓ ▓▓▓ ▓▓▓▓▓▓▓ ▓▓▓▓ ▓▓▓ ▓▓▓
█ █ ███████ ███ ██ ████ ██ ██ █████ ████ ███ ██ ███████ ███████ ████ ████
█ ████ █ ███████ ████ ██ ██ ████ ████ ████ ████ █ █ █ ████ ████ ████
By undeƒined
------------
Main parser example, other parsers can inherit from this class
"""
from queue import Queue

from .parser import MainScraper


class ParserMurongpigProxyMasterHttp(MainScraper):
    """Scraper for the MuRongPIG/Proxy-Master plain-text HTTP proxy feed.

    Each non-empty line of the feed is expected to be an ``ip:port`` pair
    (assumption based on the parsing below — confirm against the feed).
    """

    def __init__(self) -> None:
        super().__init__()
        # Target zone; populated by ``format_url`` before parsing starts.
        self.zone = None

    @classmethod
    async def format_url(cls, url, *args, **kwargs) -> str:
        """Remember the requested zone and return the URL unchanged.

        The feed is a static text file, so there are no query parameters to
        adjust; only the ``zone`` keyword (default ``"us"``) is stored on the
        class for later use by ``format_raw``.
        """
        cls.zone = kwargs.get("zone", "us")
        return url

    @classmethod
    async def format_raw(cls, html: str) -> list:
        """Turn the raw ``ip:port`` lines into proxy dictionaries."""
        proxies = []
        for line in html.split('\n'):
            if not line:
                continue  # skip blank lines (e.g. the trailing newline)
            pieces = line.split(':')
            proxies.append({
                'zone': cls.zone.upper(),
                'method': 'http',
                'anonymity': 'unknown',  # feed does not expose anonymity
                'protocol': 'https',
                'port': pieces[1],
                'ip': pieces[0],
            })
        return proxies

    @classmethod
    async def format_data(cls, zone: str, data: dict, queue: Queue) -> None:
        """Push one formatted proxy record onto the processing queue."""
        queue.put(data)
        return queue
47 changes: 47 additions & 0 deletions aproxyrelay/scrapers/parser_murongpig_proxy_master_socks4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# -*- mode: python ; coding: utf-8 -*-
"""
░░ ░░ ░░ ░░░ ░░ ░░░░ ░ ░░░░ ░ ░░ ░ ░░░░░░░░ ░░ ░░░░ ░
▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒▒ ▒▒ ▒▒▒ ▒▒ ▒▒ ▒▒▒▒ ▒ ▒▒▒▒▒▒▒ ▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒ ▒▒
▓ ▓▓▓▓ ▓ ▓▓ ▓▓ ▓▓▓▓ ▓▓▓ ▓▓▓▓▓ ▓▓▓ ▓▓ ▓▓▓ ▓▓▓▓▓▓▓ ▓▓▓▓ ▓▓▓ ▓▓▓
█ █ ███████ ███ ██ ████ ██ ██ █████ ████ ███ ██ ███████ ███████ ████ ████
█ ████ █ ███████ ████ ██ ██ ████ ████ ████ ████ █ █ █ ████ ████ ████
By undeƒined
------------
Main parser example, other parsers can inherit from this class
"""
from queue import Queue

from .parser import MainScraper


class ParserMurongpigProxyMasterSocks4(MainScraper):
    """Scraper for the MuRongPIG/Proxy-Master plain-text SOCKS4 proxy feed.

    Each non-empty line of the feed is expected to be an ``ip:port`` pair
    (assumption based on the parsing below — confirm against the feed).
    """

    def __init__(self) -> None:
        super().__init__()
        # Target zone; populated by ``format_url`` before parsing starts.
        self.zone = None

    @classmethod
    async def format_url(cls, url, *args, **kwargs) -> str:
        """Store the requested zone (default ``"us"``) and return the URL.

        The feed URL is static, so nothing about it is rewritten here.
        """
        cls.zone = kwargs.get("zone", "us")
        return url

    @classmethod
    async def format_raw(cls, html: str) -> list:
        """Convert the raw ``ip:port`` lines into proxy dictionaries."""
        records = []
        # filter(None, ...) drops empty lines, e.g. the trailing newline.
        for entry in filter(None, html.split('\n')):
            fields = entry.split(':')
            records.append({
                'zone': cls.zone.upper(),
                'method': 'socks4',
                'anonymity': 'unknown',  # feed does not expose anonymity
                'protocol': 'socks4',
                'port': fields[1],
                'ip': fields[0],
            })
        return records

    @classmethod
    async def format_data(cls, zone: str, data: dict, queue: Queue) -> None:
        """Push one formatted proxy record onto the processing queue."""
        queue.put(data)
        return queue
47 changes: 47 additions & 0 deletions aproxyrelay/scrapers/parser_murongpig_proxy_master_socks5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# -*- mode: python ; coding: utf-8 -*-
"""
░░ ░░ ░░ ░░░ ░░ ░░░░ ░ ░░░░ ░ ░░ ░ ░░░░░░░░ ░░ ░░░░ ░
▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒▒ ▒▒ ▒▒▒ ▒▒ ▒▒ ▒▒▒▒ ▒ ▒▒▒▒▒▒▒ ▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒ ▒▒
▓ ▓▓▓▓ ▓ ▓▓ ▓▓ ▓▓▓▓ ▓▓▓ ▓▓▓▓▓ ▓▓▓ ▓▓ ▓▓▓ ▓▓▓▓▓▓▓ ▓▓▓▓ ▓▓▓ ▓▓▓
█ █ ███████ ███ ██ ████ ██ ██ █████ ████ ███ ██ ███████ ███████ ████ ████
█ ████ █ ███████ ████ ██ ██ ████ ████ ████ ████ █ █ █ ████ ████ ████
By undeƒined
------------
Main parser example, other parsers can inherit from this class
"""
from queue import Queue

from .parser import MainScraper


class ParserMurongpigProxyMasterSocks5(MainScraper):
    """Scraper for the MuRongPIG/Proxy-Master plain-text SOCKS5 proxy feed.

    Each non-empty line of the feed is expected to be an ``ip:port`` pair
    (assumption based on the parsing below — confirm against the feed).
    """

    def __init__(self) -> None:
        super().__init__()
        # Target zone; populated by ``format_url`` before parsing starts.
        self.zone = None

    @classmethod
    async def format_url(cls, url, *args, **kwargs) -> str:
        """Store the requested zone (default ``"us"``) and return the URL.

        The feed URL is static, so nothing about it is rewritten here.
        """
        cls.zone = kwargs.get("zone", "us")
        return url

    @classmethod
    async def format_raw(cls, html: str) -> list:
        """Convert the raw ``ip:port`` lines into proxy dictionaries."""
        records = []
        for raw in html.split('\n'):
            if raw:  # guard against blank lines, e.g. the trailing newline
                fields = raw.split(':')
                records.append({
                    'zone': cls.zone.upper(),
                    'method': 'socks5',
                    'anonymity': 'unknown',  # feed does not expose anonymity
                    'protocol': 'socks5',
                    'port': fields[1],
                    'ip': fields[0],
                })
        return records

    @classmethod
    async def format_data(cls, zone: str, data: dict, queue: Queue) -> None:
        """Push one formatted proxy record onto the processing queue."""
        queue.put(data)
        return queue
47 changes: 47 additions & 0 deletions aproxyrelay/scrapers/parser_roosterkid_openproxylist_socks4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# -*- mode: python ; coding: utf-8 -*-
"""
░░ ░░ ░░ ░░░ ░░ ░░░░ ░ ░░░░ ░ ░░ ░ ░░░░░░░░ ░░ ░░░░ ░
▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒▒ ▒▒ ▒▒▒ ▒▒ ▒▒ ▒▒▒▒ ▒ ▒▒▒▒▒▒▒ ▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒ ▒▒
▓ ▓▓▓▓ ▓ ▓▓ ▓▓ ▓▓▓▓ ▓▓▓ ▓▓▓▓▓ ▓▓▓ ▓▓ ▓▓▓ ▓▓▓▓▓▓▓ ▓▓▓▓ ▓▓▓ ▓▓▓
█ █ ███████ ███ ██ ████ ██ ██ █████ ████ ███ ██ ███████ ███████ ████ ████
█ ████ █ ███████ ████ ██ ██ ████ ████ ████ ████ █ █ █ ████ ████ ████
By undeƒined
------------
Main parser example, other parsers can inherit from this class
"""
from queue import Queue

from .parser import MainScraper


class ParserRoosterkidOpenproxylistSocks4(MainScraper):
    """Scraper for the roosterkid/openproxylist SOCKS4 raw proxy feed.

    Each non-empty line of the feed is expected to contain an ``ip:port``
    pair (assumption based on the parsing below — confirm against the feed).
    """

    def __init__(self) -> None:
        MainScraper.__init__(self)
        # Target zone; populated by ``format_url`` before parsing starts.
        self.zone = None

    @classmethod
    async def format_url(cls, url, *args, **kwargs) -> str:
        """Store the requested zone (default ``"us"``) and return the URL.

        The feed URL is static, so there are no query parameters to adjust.
        """
        cls.zone = kwargs.get("zone", "us")
        return url

    @classmethod
    async def format_raw(cls, html: str) -> list:
        """Convert the raw ``ip:port`` lines into proxy dictionaries.

        Fix: skip empty lines (``if item``). The raw file ends with a
        trailing newline, so ``html.split('\\n')`` yields an empty final
        item and ``item.split(':')[1]`` raised IndexError on it. This also
        matches the behaviour of the other plain-text parsers in this
        package.
        """
        return [
            {
                'zone': cls.zone.upper(),
                'method': 'socks4',
                'anonymity': 'unknown',  # feed does not expose anonymity
                'protocol': 'socks4',
                'port': item.split(':')[1],
                'ip': item.split(':')[0],
            } for item in html.split('\n') if item
        ]

    @classmethod
    async def format_data(cls, zone: str, data: dict, queue: Queue) -> None:
        """Push one formatted proxy record onto the processing queue."""
        queue.put(data)
        return queue
47 changes: 47 additions & 0 deletions aproxyrelay/scrapers/parser_roosterkid_openproxylist_socks5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# -*- mode: python ; coding: utf-8 -*-
"""
░░ ░░ ░░ ░░░ ░░ ░░░░ ░ ░░░░ ░ ░░ ░ ░░░░░░░░ ░░ ░░░░ ░
▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒▒ ▒▒ ▒▒▒ ▒▒ ▒▒ ▒▒▒▒ ▒ ▒▒▒▒▒▒▒ ▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒ ▒▒
▓ ▓▓▓▓ ▓ ▓▓ ▓▓ ▓▓▓▓ ▓▓▓ ▓▓▓▓▓ ▓▓▓ ▓▓ ▓▓▓ ▓▓▓▓▓▓▓ ▓▓▓▓ ▓▓▓ ▓▓▓
█ █ ███████ ███ ██ ████ ██ ██ █████ ████ ███ ██ ███████ ███████ ████ ████
█ ████ █ ███████ ████ ██ ██ ████ ████ ████ ████ █ █ █ ████ ████ ████
By undeƒined
------------
Main parser example, other parsers can inherit from this class
"""
from queue import Queue

from .parser import MainScraper


class ParserRoosterkidOpenproxylistSocks5(MainScraper):
    """Scraper for the roosterkid/openproxylist SOCKS5 raw proxy feed.

    Each non-empty line of the feed is expected to contain an ``ip:port``
    pair (assumption based on the parsing below — confirm against the feed).
    """

    def __init__(self) -> None:
        MainScraper.__init__(self)
        # Target zone; populated by ``format_url`` before parsing starts.
        self.zone = None

    @classmethod
    async def format_url(cls, url, *args, **kwargs) -> str:
        """Store the requested zone (default ``"us"``) and return the URL.

        The feed URL is static, so there are no query parameters to adjust.
        """
        cls.zone = kwargs.get("zone", "us")
        return url

    @classmethod
    async def format_raw(cls, html: str) -> list:
        """Convert the raw ``ip:port`` lines into proxy dictionaries.

        Fix: skip empty lines (``if item``). The raw file ends with a
        trailing newline, so ``html.split('\\n')`` yields an empty final
        item and ``item.split(':')[1]`` raised IndexError on it. This also
        matches the behaviour of the other plain-text parsers in this
        package.
        """
        return [
            {
                'zone': cls.zone.upper(),
                'method': 'socks5',
                'anonymity': 'unknown',  # feed does not expose anonymity
                'protocol': 'socks5',
                'port': item.split(':')[1],
                'ip': item.split(':')[0],
            } for item in html.split('\n') if item
        ]

    @classmethod
    async def format_data(cls, zone: str, data: dict, queue: Queue) -> None:
        """Push one formatted proxy record onto the processing queue."""
        queue.put(data)
        return queue
84 changes: 84 additions & 0 deletions aproxyrelay/scrapers/parser_sunny9577_proxy_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# -*- mode: python ; coding: utf-8 -*-
"""
░░ ░░ ░░ ░░░ ░░ ░░░░ ░ ░░░░ ░ ░░ ░ ░░░░░░░░ ░░ ░░░░ ░
▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒ ▒▒▒▒ ▒▒ ▒▒ ▒▒▒ ▒▒ ▒▒ ▒▒▒▒ ▒ ▒▒▒▒▒▒▒ ▒▒▒▒▒▒▒ ▒▒▒▒ ▒▒ ▒▒ ▒▒
▓ ▓▓▓▓ ▓ ▓▓ ▓▓ ▓▓▓▓ ▓▓▓ ▓▓▓▓▓ ▓▓▓ ▓▓ ▓▓▓ ▓▓▓▓▓▓▓ ▓▓▓▓ ▓▓▓ ▓▓▓
█ █ ███████ ███ ██ ████ ██ ██ █████ ████ ███ ██ ███████ ███████ ████ ████
█ ████ █ ███████ ████ ██ ██ ████ ████ ████ ████ █ █ █ ████ ████ ████
By undeƒined
------------
Main parser example, other parsers can inherit from this class
"""
from queue import Queue

import ast

from .parser import MainScraper


class ParserSunnyProxyScraper(MainScraper):
    """Scraper for the sunny9577/proxy-scraper ``proxies.json`` feed.

    The feed is a JSON document whose items carry at least ``ip``, ``port``,
    ``type`` and ``anonymity`` keys (assumption based on the parsing below —
    confirm against the feed).
    """

    def __init__(self) -> None:
        MainScraper.__init__(self)
        # Target zone; populated by ``format_url`` before parsing starts.
        self.zone = None

    @classmethod
    async def format_url(cls, url, *args, **kwargs) -> str:
        """Store the requested zone (default ``"us"``) and return the URL.

        The feed URL is static, so there are no query parameters to adjust.
        """
        cls.zone = kwargs.get("zone", "us")
        return url

    @classmethod
    def generate_method(cls, target_method) -> str:
        """Map the feed's ``type`` field to a connection method.

        NOTE(review): plain HTTP intentionally maps to ``'https'``, mirroring
        the other http parsers in this package — confirm this is the desired
        convention.
        """
        method = target_method.lower()
        if 'socks4' in method:
            return 'socks4'
        if 'socks5' in method:
            return 'socks5'
        if 'http' in method:
            return 'https'
        return 'unknown'

    @classmethod
    def generate_protocol(cls, target_protocol) -> str:
        """Map the feed's ``type`` field to a protocol name.

        Checks 'https' before 'http' since the former contains the latter.
        """
        protocol = target_protocol.lower()
        if 'socks4' in protocol:
            return 'socks4'
        if 'socks5' in protocol:
            return 'socks5'
        if 'https' in protocol:
            return 'https'
        if 'http' in protocol:
            return 'http'
        return 'unknown'

    @classmethod
    def generate_anonymity(cls, target_anonimity) -> str:
        """Normalize the feed's anonymity label to anonymous/transparent/unknown."""
        anonymity = target_anonimity.lower()
        if anonymity in ('anonymous', 'elite'):
            return 'anonymous'
        if anonymity == 'transparent':
            return 'transparent'
        return 'unknown'

    @classmethod
    async def format_raw(cls, html: str) -> list:
        """Parse the JSON proxy document into proxy dictionaries.

        Fix: use ``json.loads`` instead of ``ast.literal_eval``. The feed is
        a JSON file, and ``literal_eval`` rejects JSON-only tokens such as
        ``true``/``false``/``null``, which would crash the parser.
        """
        import json  # local import: keeps this fix self-contained

        return [
            {
                'zone': cls.zone.upper(),
                'method': cls.generate_method(item['type']),
                'anonymity': cls.generate_anonymity(item['anonymity']),
                'protocol': cls.generate_protocol(item['type']),
                'port': item['port'],
                'ip': item['ip'],
            } for item in json.loads(html)
        ]

    @classmethod
    async def format_data(cls, zone: str, data: dict, queue: Queue) -> None:
        """Push one formatted proxy record onto the processing queue."""
        queue.put(data)
        return queue
Loading

0 comments on commit cff511a

Please sign in to comment.