paper-finder.py
#!/usr/bin/python3
# -*- coding: iso-8859-15 -*-
import argparse

from subprocess import check_output


class APINotSupported(ValueError):
    """ Raised if an API was specified that we do not support. """

    def __init__(self, api, supported_apis):
        super().__init__(
            "Requested api '{api}' is not supported. "
            "Supported apis are:\n-"
            "{apis}".format(api=api, apis="\n-".join(supported_apis))
        )


def list_apis(args):
    """ Return a list of all supported sources of information. """
    spiders = [
        spider_name for spider_name in
        map(lambda spider: spider.strip(),
            check_output(["scrapy", "list"]).decode().split("\n"))
        if spider_name
    ]
    return spiders


def query_apis(args):
    """ Query sources of information with given query arguments `args`. """
    from paper_finder.paper_spiders import ScraperPaperSpider
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.misc import walk_modules
    from scrapy.utils.spider import iter_spider_classes
    from scrapy.utils.project import get_project_settings

    from itertools import chain as iter_chain

    # Iterator over all our spider classes
    spiders = iter_chain(*[
        iter_spider_classes(module)
        for module in walk_modules("paper_finder.spiders")
    ])

    if args.apis:
        spiders = set(spiders)
        apis = {api.strip() for api in args.apis.split(",")}
        spider_names = {spider.name for spider in spiders}

        # sanitize user-specified apis - ensure they all exist {{{ #
        for api in apis:
            if api not in spider_names:
                raise APINotSupported(
                    api=api, supported_apis=sorted(spider_names)
                )
        # }}} sanitize user-specified apis - ensure they all exist #

        spiders = {
            spider for spider in spiders if spider.name in apis
        }

    crawler_process = CrawlerProcess(get_project_settings())
    for spider in spiders:
        # `spiders` yields spider *classes*, so check the class hierarchy
        # with `issubclass` rather than `isinstance`.
        if not args.scrape and issubclass(spider, ScraperPaperSpider):
            continue
        crawler_process.crawl(spider, query=args.search_term)
    crawler_process.start()


def main():
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    # List Supported Information Sources {{{ #
    list_apis_parser = subparsers.add_parser(
        "list", help="List all supported information sources."
    )
    list_apis_parser.set_defaults(fun=list_apis)
    # }}} List Supported Information Sources #

    # Find Papers {{{ #
    find_papers_parser = subparsers.add_parser(
        "find",
        help="Query (all) information sources for papers matching the given TERM."
    )
    find_papers_parser.add_argument(
        "search_term", help="Search term to find papers for."
    )
    find_papers_parser.add_argument(
        "--apis", help="Use only the given (comma-separated list of) APIs.",
        default=None, dest="apis", action="store"
    )
    find_papers_parser.add_argument(
        "--scrape",
        help="Boolean flag that turns on scraping for papers. "
             "Note that scraping may be disallowed by some information sources "
             "and specifying this flag may result in IP bans by the respective "
             "information source.",
        default=False,
        dest="scrape",
        action="store_true"
    )
    find_papers_parser.set_defaults(fun=query_apis)
    # }}} Find Papers #
    args = parser.parse_args()
    # Running without a sub-command leaves `fun` unset; print the help text
    # instead of crashing with an AttributeError.
    if not hasattr(args, "fun"):
        parser.print_help()
        return
    results = args.fun(args)
    if args.fun.__name__ == "list_apis":
        print("\n".join(sorted(results)))


if __name__ == "__main__":
    main()
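

# Example invocations (a sketch, not part of the original script; the API name
# "arxiv" below is an assumption -- the real spider names are whatever
# `./paper-finder.py list` reports for this project):
#
#   ./paper-finder.py list
#   ./paper-finder.py find "graph neural networks" --apis arxiv
#   ./paper-finder.py find "graph neural networks" --scrape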