Commit 46d0319

Refactor
1 parent 30d0cc4 commit 46d0319

16 files changed, +202 −164 lines changed

.gitignore (−1)

@@ -8,4 +8,3 @@ __pycache__/
 .idea

 *.egg-info/
-

.pre-commit-config.yaml (+26)

@@ -0,0 +1,26 @@
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.5.0
+    hooks:
+    -   id: check-yaml
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+    -   id: detect-aws-credentials
+        args: [--allow-missing-credentials]
+
+-   repo: https://github.com/humitos/mirrors-autoflake.git
+    rev: v1.3
+    hooks:
+    -   id: autoflake
+        args: ['--in-place', '--expand-star-imports', '--ignore-init-module-imports', '--remove-all-unused-imports']
+
+-   repo: https://github.com/psf/black
+    rev: stable
+    hooks:
+    -   id: black
+        args: [--line-length=120]
+
+#- repo: https://github.com/pre-commit/mirrors-isort
+#  rev: v4.3.21
+#  hooks:
+#  - id: isort
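
Taken together, these hooks validate YAML syntax, normalize end-of-file newlines and trailing whitespace, scan for accidentally committed AWS credentials, strip unused imports with autoflake, and format everything with black at a 120-column line length; isort stays commented out for now. With this file in place the hooks are typically enabled per-clone with `pre-commit install`, and a one-off sweep with `pre-commit run --all-files` is presumably what produced the mechanical quoting and reflow changes across the rest of this commit.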

README.md (−1)

@@ -7,4 +7,3 @@ From textract:
 apt-get install python-dev libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext
 tesseract-ocr \
 flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig
-

TODO (+1 −1)

@@ -2,4 +2,4 @@
 Language detection, pycld2 cld3?, polyglot
 - Verify TFIDF combination on queries
 - UI
-- boolean queries
+- boolean queries

(The last line changes only in whitespace, presumably the end-of-file-fixer and trailing-whitespace hooks normalizing the file.)

src/fusearch/fusearchd.py → bin/fusearchd.py (+64 −63)

@@ -7,7 +7,6 @@
 import signal
 import sys
 import logging
-import yaml
 import textract
 import functools
 import progressbar
@@ -16,25 +15,27 @@
 import io
 from fusearch.index import Index
 from fusearch.model import Document
-from tokenizer import get_tokenizer, tokfreq, Tokenizer
+from fusearch.tokenizer import get_tokenizer, tokfreq, Tokenizer
+from fusearch.util import bytes_to_str, file_generator_ext, filename_without_extension, mtime, pickle_loader
+from fusearch.config import Config
 from multiprocessing import Process, Queue, cpu_count
-import queue
 import collections.abc
-from util import *
-from config import Config

 progressbar_index_widgets_ = [
-    ' [',
-    progressbar.Timer(format='Elapsed %(elapsed)s'), ', ',
-    progressbar.SimpleProgress(), ' files'
-    #'count: ', progressbar.Counter(),
-    '] ',
+    " [",
+    progressbar.Timer(format="Elapsed %(elapsed)s"),
+    ", ",
+    progressbar.SimpleProgress(),
+    " files"
+    #'count: ', progressbar.Counter(),
+    "] ",
     progressbar.Bar(),
-    ' (', progressbar.ETA(), ') ',
+    " (",
+    progressbar.ETA(),
+    ") ",
 ]


-
 def cleanup() -> None:
     pass

@@ -86,37 +87,34 @@ def daemonize() -> None:
     fork_exit_parent()
     os.setsid()
     fork_exit_parent()
-    os.chdir('/')
+    os.chdir("/")
     config_signal_handlers()
     os.umask(0o022)
     redirect_stream(sys.stdin, None)
-    redirect_stream(sys.stdout, open('/tmp/fusearch.out', 'a'))
-    redirect_stream(sys.stderr, open('/tmp/fusearch.err', 'a'))
+    redirect_stream(sys.stdout, open("/tmp/fusearch.out", "a"))
+    redirect_stream(sys.stderr, open("/tmp/fusearch.err", "a"))
     fusearch_main()


 def config_argparse() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(description="fusearch daemon", epilog="")
-    parser.add_argument('-f', '--foreground', action='store_true',
-                        help="Don't daemonize")
-    parser.add_argument('-c', '--config', type=str,
-                        default='/etc/fusearch/config.yaml',
-                        help="config file")
+    parser.add_argument("-f", "--foreground", action="store_true", help="Don't daemonize")
+    parser.add_argument("-c", "--config", type=str, default="/etc/fusearch/config.yaml", help="config file")
     return parser


 def to_text(file: str) -> str:
     assert os.path.isfile(file)
     try:
-        txt_b = textract.process(file, method='pdftotext')
+        txt_b = textract.process(file, method="pdftotext")
         # TODO more intelligent decoding? there be dragons
         txt = bytes_to_str(txt_b)
-        #print(file)
-        #print(len(txt))
-        #print(txt[:80])
-        #print('-------------------')
+        # print(file)
+        # print(len(txt))
+        # print(txt[:80])
+        # print('-------------------')
     except Exception as e:
-        txt = ''
+        txt = ""
         logging.exception("Exception while extracting text from '%s'", file)
         # TODO mark it as failed instead of empty text
     return txt
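
The daemonize sequence above is the classic double fork: fork, setsid, and fork again so the daemon can never reacquire a controlling terminal, after which the working directory and umask are reset and the standard streams are redirected; pointing stdout and stderr at /tmp/fusearch.out and /tmp/fusearch.err keeps output from an unattended run inspectable.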
@@ -125,37 +123,28 @@ def to_text(file: str) -> str:
 def document_from_file(file: str, tokenizer: Tokenizer) -> Document:
     mtime_latest = mtime(file)
     filename = filename_without_extension(file)
-    txt = filename + '\n' + to_text(file)
+    txt = filename + "\n" + to_text(file)
     # Detect language and check that the document makes sense, OCR returns garbage sometimes
     # TODO: add filename to content
-    document = Document(
-        url=file,
-        filename=filename,
-        content=txt,
-        tokfreq=tokfreq(tokenizer(txt)),
-        mtime=mtime_latest)
+    document = Document(url=file, filename=filename, content=txt, tokfreq=tokfreq(tokenizer(txt)), mtime=mtime_latest)
     return document


 def needs_indexing(index: Index, file: str) -> bool:
     mtime_latest = mtime(file)
-    #document = index.document_from_url(file)
+    # document = index.document_from_url(file)
     mtime_last_known = index.mtime(file)
     if not mtime_last_known or mtime_last_known and mtime_latest > mtime_last_known:
-        #logging.debug("needs_indexing: need '%s'", file)
+        # logging.debug("needs_indexing: need '%s'", file)
         return True
     else:
-        #logging.debug("needs_indexing: NOT need '%s'", file)
+        # logging.debug("needs_indexing: NOT need '%s'", file)
         return False


 def get_index(path: str, config: Config) -> Index:
-    index_db = os.path.join(path, '.fusearch.db')
-    index = Index({
-        'provider':'sqlite',
-        'filename': index_db,
-        'create_db': True
-    }, tokenizer=get_tokenizer(config))
+    index_db = os.path.join(path, ".fusearch.db")
+    index = Index({"provider": "sqlite", "filename": index_db, "create_db": True}, tokenizer=get_tokenizer(config))
     logging.debug("get_index: '%s' %d docs", index_db, index.doc_count)
     return index

@@ -167,7 +156,6 @@ def __init__(self, path, config):
         self.index = get_index(path, config)
         assert os.path.isdir(path)

-
     def __call__(self) -> collections.abc.Iterable:
         """:returns a generator of files which are updated from the mtime in the index"""
         file_needs_indexing = functools.partial(needs_indexing, self.index)
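
A side note on the mtime test that file_needs_indexing binds here: since `and` binds tighter than `or` in Python, the condition in needs_indexing is equivalent to the simpler predicate below (a sketch of a possible simplification, not something this commit changes):

    def needs_indexing_simplified(index, file) -> bool:
        # Reindex when the file was never seen, or when it changed since last seen.
        mtime_last_known = index.mtime(file)
        return not mtime_last_known or mtime(file) > mtime_last_known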
@@ -176,22 +164,24 @@ def __call__(self) -> collections.abc.Iterable:

 def file_producer(path: str, config: Config, file_queue: Queue, file_inventory: io.IOBase) -> None:
     for file in pickle_loader(file_inventory):
-        #logging.debug("file_producer: %s", file)
+        # logging.debug("file_producer: %s", file)
         file_queue.put(file)
     logging.debug("file_producer is done")


 def text_extract(config: Config, file_queue: Queue, document_queue: Queue):
-    #logging.debug("text_extract started")
+    # logging.debug("text_extract started")
     tokenizer = get_tokenizer(config)
     while True:
         file = file_queue.get()
         if file is None:
             logging.debug("text_extract is done")
             return
-        logging.debug("text_extract: file_queue.qsize %d document_queue.qsize %d", file_queue.qsize(), document_queue.qsize())
+        logging.debug(
+            "text_extract: file_queue.qsize %d document_queue.qsize %d", file_queue.qsize(), document_queue.qsize()
+        )
         logging.debug("text_extract: '%s'", file)
-        #logging.debug("text_extract: %s", file)
+        # logging.debug("text_extract: %s", file)
         document = document_from_file(file, tokenizer)
         document_queue.put(document)

@@ -218,6 +208,7 @@ def document_consumer(path: str, config: Config, document_queue: Queue, file_cou
         pbar.update(file_i)
         file_i += 1

+
 def gather_files(path, config, file_inventory) -> int:
     """:returns file count"""
     if not os.path.isdir(path):
@@ -227,23 +218,25 @@
     logging.info("Calculating number of files to index (.=100files)")
     if config.verbose:
         widgets = [
-            ' [',
-            progressbar.Timer(format='Elapsed %(elapsed)s'), ' ',
-            'count: ', progressbar.Counter(),
-            '] ',
+            " [",
+            progressbar.Timer(format="Elapsed %(elapsed)s"),
+            " ",
+            "count: ",
+            progressbar.Counter(),
+            "] ",
             progressbar.BouncingBar(),
         ]
         pbar = progressbar.ProgressBar(widgets=widgets)
     file_count = 0
     for file in NeedsIndexFileGenerator(path, config)():
         pickle.dump(file, file_inventory)
         file_count += 1
-        #if config.verbose and (file_count % 100) == 0:
+        # if config.verbose and (file_count % 100) == 0:
         #    sys.stdout.write('.')
         #    sys.stdout.flush()
         if config.verbose:
             pbar.update(file_count)
-    #if config.verbose:
+    # if config.verbose:
     #    sys.stdout.write('\n')
     if config.verbose:
         pbar.finish()
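
gather_files spools each candidate path into file_inventory with one pickle.dump call per file, and file_producer later replays that inventory through pickle_loader. A minimal sketch of the read side, assuming pickle_loader is the usual read-until-EOF generator (its actual implementation lives in fusearch.util and is not shown in this diff):

    import pickle

    def pickle_loader(f):
        # Yield successive pickled objects until the stream is exhausted.
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return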
@@ -260,24 +253,31 @@ def index_do(path, config) -> None:
     else:
         index_serial(path, config, file_count, file_inventory)

+
 def index_parallel(path: str, config: Config, file_count: int, file_inventory) -> None:
     #
     # file_producer -> N * test_extract -> document_consumer
     #
     # TODO: check that processes are alive to prevent deadlocks on exceptions in children
-    file_queue = Queue(cpu_count()*8)
+    file_queue = Queue(cpu_count() * 8)
     document_queue = Queue(256)
     text_extract_procs = []
-    file_producer_proc = Process(name='file producer', target=file_producer, daemon=True,
-                                 args=(path, config, file_queue, file_inventory))
+    file_producer_proc = Process(
+        name="file producer", target=file_producer, daemon=True, args=(path, config, file_queue, file_inventory)
+    )
     file_producer_proc.start()

-    document_consumer_proc = Process(name='document consumer', target=document_consumer, daemon=True,
-                                     args=(path, config, document_queue, file_count))
+    document_consumer_proc = Process(
+        name="document consumer", target=document_consumer, daemon=True, args=(path, config, document_queue, file_count)
+    )

     for i in range(cpu_count()):
-        p = Process(name='text extractor {}'.format(i), target=text_extract, daemon=True,
-                    args=(config, file_queue, document_queue))
+        p = Process(
+            name="text extractor {}".format(i),
+            target=text_extract,
+            daemon=True,
+            args=(config, file_queue, document_queue),
+        )
         text_extract_procs.append(p)
         p.start()
     document_consumer_proc.start()
@@ -297,6 +297,7 @@ def index_parallel(path: str, config: Config, file_count: int, file_inventory) -
     document_consumer_proc.join()
     logging.info("Parallel indexing finished")

+
 def index_serial(path, config, file_count, file_inventory):
     if config.verbose:
         pbar = progressbar.ProgressBar(max_value=file_count, widgets=progressbar_index_widgets_)
@@ -332,10 +333,10 @@ def script_name() -> str:

 def config_logging() -> None:
     import time
+
     logging.getLogger().setLevel(logging.DEBUG)
     logging.getLogger("requests").setLevel(logging.WARNING)
-    logging.basicConfig(format='{}: %(asctime)sZ %(name)s %(levelname)s %(message)s'.
-                        format(script_name()))
+    logging.basicConfig(format="{}: %(asctime)sZ %(name)s %(levelname)s %(message)s".format(script_name()))
     logging.Formatter.converter = time.gmtime


@@ -348,5 +349,5 @@ def main() -> int:
     fusearch_main(args)


-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(main())
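
The pipeline index_parallel wires up is: file_producer feeding a bounded file_queue, cpu_count() text_extract workers turning files into Documents, and a single document_consumer draining document_queue, with None as the shutdown sentinel. A minimal self-contained sketch of that shape, where str.upper and print stand in for text extraction and index writes (the daemon's exact sentinel accounting is not shown in this diff):

    from multiprocessing import Process, Queue, cpu_count

    def producer(items, in_q, n_workers):
        for item in items:
            in_q.put(item)
        for _ in range(n_workers):  # one None sentinel per worker
            in_q.put(None)

    def worker(in_q, out_q):
        while True:
            item = in_q.get()
            if item is None:  # sentinel: no more work
                out_q.put(None)
                return
            out_q.put(item.upper())  # stand-in for document_from_file()

    def consumer(out_q, n_workers):
        finished = 0
        while finished < n_workers:  # stop once every worker has signed off
            result = out_q.get()
            if result is None:
                finished += 1
            else:
                print(result)  # stand-in for writing to the index

    if __name__ == "__main__":
        n = cpu_count()
        in_q, out_q = Queue(n * 8), Queue(256)  # bounded, as in index_parallel
        workers = [Process(target=worker, args=(in_q, out_q), daemon=True) for _ in range(n)]
        for w in workers:
            w.start()
        Process(target=producer, args=(["a", "b", "c"], in_q, n), daemon=True).start()
        consumer(out_q, n)
        for w in workers:
            w.join()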

fusearch.yml (+3 −5)

@@ -1,8 +1,6 @@
-#index_dirs: [/home/piotr/devel/fusearch/docu/]
-#index_dirs: [/tmp/programming]
-index_dirs: [/tmp/kkita]
-parallel_extraction: false
-#parallel_extraction: true
+index_dirs: [/home/piotr/storage/docu/]
+#parallel_extraction: false
+parallel_extraction: true
 verbose: true
 include_extensions:
 - pdf
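
For reference, the config file is plain YAML; a minimal sketch of reading it with PyYAML (an illustration only: the daemon itself goes through fusearch.config.Config, whose internals are not shown in this diff):

    import yaml

    # Load the fusearch config; keys mirror the file above.
    with open("fusearch.yml") as f:
        cfg = yaml.safe_load(f)

    assert cfg["parallel_extraction"] is True
    print(cfg["index_dirs"])  # ['/home/piotr/storage/docu/']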

setup.py (+26 −10)

@@ -1,22 +1,38 @@
-from setuptools import setup, find_packages, find_namespace_packages
+from setuptools import find_packages, setup
+
+
+INSTALL_REQUIRES = ["textract", "nltk"]
+
+EXTRAS_REQUIRE = {"test": ["flake8", "black", "mock", "pre-commit", "pytest"]}
+
+
+with open("README.md", "r") as f:
+    LONG_DESCRIPTION = f.read()

-from os import path

 setup(
     name="fusearch",
     version="0.1",
-    packages=find_packages('src'),
-    package_dir={"": "src"},
-    install_requires=['textract'],
-    tests_require=['nose'],
-    package_data={},
     author="Pedro Larroy",
     author_email="[email protected]",
     description="fusearch is a local full text search engine",
     license="Apache 2",
     keywords="search console fulltext documents",
     url="https://github.com/larroy/fusearch",
-    project_urls={
-        "Source Code": "https://github.com/larroy/fusearch",
-    }
+    project_urls={"Source Code": "https://github.com/larroy/fusearch",},
+    packages=find_packages("src"),
+    package_dir={"": "src"},
+    long_description=LONG_DESCRIPTION,
+    long_description_content_type="text/markdown",
+    install_requires=INSTALL_REQUIRES,
+    extras_require=EXTRAS_REQUIRE,
+    classifiers=[
+        "Development Status :: 1 - Planning",
+        "Intended Audience :: Developers",
+        "Natural Language :: English",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3.6",
+    ],
+    package_data={},
+    scripts=["bin/fusearchd.py"],
 )
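
With the requirements split into INSTALL_REQUIRES and EXTRAS_REQUIRE, a development checkout can pull the whole lint-and-test toolchain in one step with `pip install -e ".[test]"`, while a plain install only needs textract and nltk; and since scripts=["bin/fusearchd.py"] matches the file move above, the daemon script lands on PATH when the package is installed.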
