 import signal
 import sys
 import logging
-import yaml
 import textract
 import functools
 import progressbar
 import io
 from fusearch.index import Index
 from fusearch.model import Document
-from tokenizer import get_tokenizer, tokfreq, Tokenizer
+from fusearch.tokenizer import get_tokenizer, tokfreq, Tokenizer
+from fusearch.util import bytes_to_str, file_generator_ext, filename_without_extension, mtime, pickle_loader
+from fusearch.config import Config
 from multiprocessing import Process, Queue, cpu_count
-import queue
 import collections.abc
-from util import *
-from config import Config

 progressbar_index_widgets_ = [
-    ' [',
-    progressbar.Timer(format='Elapsed %(elapsed)s'), ', ',
-    progressbar.SimpleProgress(), ' files'
-    #'count: ', progressbar.Counter(),
-    '] ',
+    " [",
+    progressbar.Timer(format="Elapsed %(elapsed)s"),
+    ", ",
+    progressbar.SimpleProgress(),
+    " files"
+    #'count: ', progressbar.Counter(),
+    "] ",
     progressbar.Bar(),
-    ' (', progressbar.ETA(), ') ',
+    " (",
+    progressbar.ETA(),
+    ") ",
 ]


-
 def cleanup() -> None:
     pass

@@ -86,37 +87,34 @@ def daemonize() -> None:
     fork_exit_parent()
     os.setsid()
     fork_exit_parent()
-    os.chdir('/')
+    os.chdir("/")
     config_signal_handlers()
     os.umask(0o022)
     redirect_stream(sys.stdin, None)
-    redirect_stream(sys.stdout, open('/tmp/fusearch.out', 'a'))
-    redirect_stream(sys.stderr, open('/tmp/fusearch.err', 'a'))
+    redirect_stream(sys.stdout, open("/tmp/fusearch.out", "a"))
+    redirect_stream(sys.stderr, open("/tmp/fusearch.err", "a"))
     fusearch_main()


 def config_argparse() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(description="fusearch daemon", epilog="")
-    parser.add_argument('-f', '--foreground', action='store_true',
-                        help="Don't daemonize")
-    parser.add_argument('-c', '--config', type=str,
-                        default='/etc/fusearch/config.yaml',
-                        help="config file")
+    parser.add_argument("-f", "--foreground", action="store_true", help="Don't daemonize")
+    parser.add_argument("-c", "--config", type=str, default="/etc/fusearch/config.yaml", help="config file")
     return parser


 def to_text(file: str) -> str:
     assert os.path.isfile(file)
     try:
-        txt_b = textract.process(file, method='pdftotext')
+        txt_b = textract.process(file, method="pdftotext")
         # TODO more intelligent decoding? there be dragons
         txt = bytes_to_str(txt_b)
-        #print(file)
-        #print(len(txt))
-        #print(txt[:80])
-        #print('-------------------')
+        # print(file)
+        # print(len(txt))
+        # print(txt[:80])
+        # print('-------------------')
     except Exception as e:
-        txt = ''
+        txt = ""
         logging.exception("Exception while extracting text from '%s'", file)
         # TODO mark it as failed instead of empty text
     return txt
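
Note: fork_exit_parent() and redirect_stream(), which daemonize() calls above, are defined outside this diff. As a minimal sketch of the classic double-fork helpers they presumably correspond to (names taken from the call sites, bodies assumed):

import os

def fork_exit_parent() -> None:
    # Fork and terminate the parent, so the child detaches from whoever started it.
    if os.fork() > 0:
        os._exit(0)

def redirect_stream(stream, target_file) -> None:
    # Rebind a standard stream's file descriptor; None presumably means /dev/null.
    if target_file is None:
        target_file = open(os.devnull, "a")
    os.dup2(target_file.fileno(), stream.fileno())

Forking once on each side of os.setsid(), as daemonize() does, ensures the daemon is not a session leader and cannot reacquire a controlling terminal.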
@@ -125,37 +123,28 @@ def to_text(file: str) -> str:
 def document_from_file(file: str, tokenizer: Tokenizer) -> Document:
     mtime_latest = mtime(file)
     filename = filename_without_extension(file)
-    txt = filename + '\n' + to_text(file)
+    txt = filename + "\n" + to_text(file)
     # Detect language and check that the document makes sense, OCR returns garbage sometimes
     # TODO: add filename to content
-    document = Document(
-        url=file,
-        filename=filename,
-        content=txt,
-        tokfreq=tokfreq(tokenizer(txt)),
-        mtime=mtime_latest)
+    document = Document(url=file, filename=filename, content=txt, tokfreq=tokfreq(tokenizer(txt)), mtime=mtime_latest)
     return document


 def needs_indexing(index: Index, file: str) -> bool:
     mtime_latest = mtime(file)
-    #document = index.document_from_url(file)
+    # document = index.document_from_url(file)
     mtime_last_known = index.mtime(file)
     if not mtime_last_known or mtime_last_known and mtime_latest > mtime_last_known:
-        #logging.debug("needs_indexing: need '%s'", file)
+        # logging.debug("needs_indexing: need '%s'", file)
         return True
     else:
-        #logging.debug("needs_indexing: NOT need '%s'", file)
+        # logging.debug("needs_indexing: NOT need '%s'", file)
         return False


 def get_index(path: str, config: Config) -> Index:
-    index_db = os.path.join(path, '.fusearch.db')
-    index = Index({
-        'provider':'sqlite',
-        'filename': index_db,
-        'create_db': True
-    }, tokenizer=get_tokenizer(config))
+    index_db = os.path.join(path, ".fusearch.db")
+    index = Index({"provider": "sqlite", "filename": index_db, "create_db": True}, tokenizer=get_tokenizer(config))
     logging.debug("get_index: '%s' %d docs", index_db, index.doc_count)
     return index

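
Since `and` binds tighter than `or`, the condition in needs_indexing() reduces to "file unknown to the index, or modified since it was indexed". A behavior-preserving simplification (a suggestion, not part of this commit):

def needs_indexing(index: Index, file: str) -> bool:
    # Reindex files the index has never seen, or whose mtime advanced since last indexing.
    mtime_last_known = index.mtime(file)
    return not mtime_last_known or mtime(file) > mtime_last_known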
@@ -167,7 +156,6 @@ def __init__(self, path, config):
         self.index = get_index(path, config)
         assert os.path.isdir(path)

-
     def __call__(self) -> collections.abc.Iterable:
         """:returns a generator of files which are updated from the mtime in the index"""
         file_needs_indexing = functools.partial(needs_indexing, self.index)
@@ -176,22 +164,24 @@ def __call__(self) -> collections.abc.Iterable:

 def file_producer(path: str, config: Config, file_queue: Queue, file_inventory: io.IOBase) -> None:
     for file in pickle_loader(file_inventory):
-        #logging.debug("file_producer: %s", file)
+        # logging.debug("file_producer: %s", file)
         file_queue.put(file)
     logging.debug("file_producer is done")


 def text_extract(config: Config, file_queue: Queue, document_queue: Queue):
-    #logging.debug("text_extract started")
+    # logging.debug("text_extract started")
     tokenizer = get_tokenizer(config)
     while True:
         file = file_queue.get()
         if file is None:
             logging.debug("text_extract is done")
             return
-        logging.debug("text_extract: file_queue.qsize %d document_queue.qsize %d", file_queue.qsize(), document_queue.qsize())
+        logging.debug(
+            "text_extract: file_queue.qsize %d document_queue.qsize %d", file_queue.qsize(), document_queue.qsize()
+        )
         logging.debug("text_extract: '%s'", file)
-        #logging.debug("text_extract: %s", file)
+        # logging.debug("text_extract: %s", file)
         document = document_from_file(file, tokenizer)
         document_queue.put(document)

@@ -218,6 +208,7 @@ def document_consumer(path: str, config: Config, document_queue: Queue, file_cou
         pbar.update(file_i)
         file_i += 1

+
 def gather_files(path, config, file_inventory) -> int:
     """:returns file count"""
     if not os.path.isdir(path):
@@ -227,23 +218,25 @@ def gather_files(path, config, file_inventory) -> int:
     logging.info("Calculating number of files to index (.=100files)")
     if config.verbose:
         widgets = [
-            ' [',
-            progressbar.Timer(format='Elapsed %(elapsed)s'), ' ',
-            'count: ', progressbar.Counter(),
-            '] ',
+            " [",
+            progressbar.Timer(format="Elapsed %(elapsed)s"),
+            " ",
+            "count: ",
+            progressbar.Counter(),
+            "] ",
             progressbar.BouncingBar(),
         ]
         pbar = progressbar.ProgressBar(widgets=widgets)
     file_count = 0
     for file in NeedsIndexFileGenerator(path, config)():
         pickle.dump(file, file_inventory)
         file_count += 1
-        #if config.verbose and (file_count % 100) == 0:
+        # if config.verbose and (file_count % 100) == 0:
         #    sys.stdout.write('.')
         #    sys.stdout.flush()
         if config.verbose:
             pbar.update(file_count)
-    #if config.verbose:
+    # if config.verbose:
     #    sys.stdout.write('\n')
     if config.verbose:
         pbar.finish()
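
gather_files() spools one pickled filename at a time into file_inventory rather than holding the whole listing in memory, and file_producer() later replays it through pickle_loader() from fusearch.util. That helper is not shown in this diff; a minimal sketch of the pattern it presumably implements (repeated pickle.load() until the stream is exhausted):

import pickle
from typing import Iterator

def pickle_loader(stream) -> Iterator:
    # Yield each object that pickle.dump() appended, stopping cleanly at end of file.
    while True:
        try:
            yield pickle.load(stream)
        except EOFError:
            return

The inventory file must be rewound (stream.seek(0)) between the dump phase and the replay phase for this round-trip to work.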
@@ -260,24 +253,31 @@ def index_do(path, config) -> None:
     else:
         index_serial(path, config, file_count, file_inventory)

+
 def index_parallel(path: str, config: Config, file_count: int, file_inventory) -> None:
     #
     # file_producer -> N * test_extract -> document_consumer
     #
     # TODO: check that processes are alive to prevent deadlocks on exceptions in children
-    file_queue = Queue(cpu_count()*8)
+    file_queue = Queue(cpu_count() * 8)
     document_queue = Queue(256)
     text_extract_procs = []
-    file_producer_proc = Process(name='file producer', target=file_producer, daemon=True,
-                                 args=(path, config, file_queue, file_inventory))
+    file_producer_proc = Process(
+        name="file producer", target=file_producer, daemon=True, args=(path, config, file_queue, file_inventory)
+    )
     file_producer_proc.start()

-    document_consumer_proc = Process(name='document consumer', target=document_consumer, daemon=True,
-                                     args=(path, config, document_queue, file_count))
+    document_consumer_proc = Process(
+        name="document consumer", target=document_consumer, daemon=True, args=(path, config, document_queue, file_count)
+    )

     for i in range(cpu_count()):
-        p = Process(name='text extractor {}'.format(i), target=text_extract, daemon=True,
-                    args=(config, file_queue, document_queue))
+        p = Process(
+            name="text extractor {}".format(i),
+            target=text_extract,
+            daemon=True,
+            args=(config, file_queue, document_queue),
+        )
         text_extract_procs.append(p)
         p.start()
     document_consumer_proc.start()
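
These hunks wire up a file_producer -> N x text_extract -> document_consumer pipeline over bounded multiprocessing queues. text_extract() exits when it reads a None sentinel (see the earlier hunk), so the teardown elided between this hunk and the next presumably sends one sentinel per worker; a sketch of that conventional shutdown, assumed rather than taken from the elided lines:

from multiprocessing import Queue

def stop_extractors(file_queue: Queue, text_extract_procs: list) -> None:
    # One None per extractor: each worker consumes exactly one sentinel and returns.
    for _ in text_extract_procs:
        file_queue.put(None)
    for p in text_extract_procs:
        p.join()

Two caveats in the code as shown: multiprocessing's Queue.qsize(), which text_extract() logs, raises NotImplementedError on macOS, and daemon=True children are terminated abruptly if the parent exits before joining them.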
@@ -297,6 +297,7 @@ def index_parallel(path: str, config: Config, file_count: int, file_inventory) -
     document_consumer_proc.join()
     logging.info("Parallel indexing finished")

+
 def index_serial(path, config, file_count, file_inventory):
     if config.verbose:
         pbar = progressbar.ProgressBar(max_value=file_count, widgets=progressbar_index_widgets_)
@@ -332,10 +333,10 @@ def script_name() -> str:

 def config_logging() -> None:
     import time
+
     logging.getLogger().setLevel(logging.DEBUG)
     logging.getLogger("requests").setLevel(logging.WARNING)
-    logging.basicConfig(format='{}: %(asctime)sZ %(name)s %(levelname)s %(message)s'.
-                        format(script_name()))
+    logging.basicConfig(format="{}: %(asctime)sZ %(name)s %(levelname)s %(message)s".format(script_name()))
     logging.Formatter.converter = time.gmtime

@@ -348,5 +349,5 @@ def main() -> int:
     fusearch_main(args)


-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(main())