Commit 484cff6

Author: Pedro Larroy
Committed: Dec 21, 2018
TFIDF indexing
1 parent 16a0032 commit 484cff6

12 files changed: +196 −28 lines

3rdparty/textract

TODO

+2

@@ -0,0 +1,2 @@
+- Language detection to select the stemming algorithm
+- Verify TFIDF combination on queries

setup.py

+3 −2

@@ -1,15 +1,16 @@
-from setuptools import setup, find_packages
+from setuptools import setup, find_packages, find_namespace_packages
 
 from os import path
 
 setup(
     name="fusearch",
     version="0.1",
     packages=find_packages('src'),
+    package_dir={"": "src"},
     install_requires=['textract'],
     tests_require=['nose'],
     package_data={},
-    autho="Pedro Larroy",
+    author="Pedro Larroy",
     author_email="pedro.larroy.lists@gmail.com",
     description="fusearch is a local full text search engine",
     license="Apache 2",
File renamed without changes.

src/fusearch/fusearchd.py

+21 −14

@@ -7,12 +7,13 @@
 import signal
 import sys
 import logging
-import time
 import yaml
 import textract
 import filetype
 import functools
-from collections import namedtuple
+from fusearch.index import Index
+from fusearch.model import Document
+from fusearch.nltk_tokenizer import NLTKTokenizer
 
 
 def script_name() -> str:
@@ -24,7 +25,7 @@ def config_logging() -> None:
     import time
     logging.getLogger().setLevel(logging.INFO)
     logging.getLogger("requests").setLevel(logging.WARNING)
-    logging.basicConfig(format='{}: %(asctime)sZ %(levelname)s %(message)s'.
+    logging.basicConfig(format='{}: %(asctime)sZ %(name)s %(levelname)s %(message)s'.
                         format(script_name()))
     logging.Formatter.converter = time.gmtime
 
@@ -150,39 +151,45 @@ def file_generator(path):
 
 def to_text(file) -> None:
     try:
-        txt = textract.process(file)
+        txt_b = textract.process(file, method='pdftotext')
+        # TODO more intelligent decoding? there be dragons
+        txt = txt_b.decode('utf-8')
+        print(file)
         print(len(txt))
         print(txt[:80])
-    except RuntimeError as e:
+        print('-------------------')
+    except Exception as e:
         txt = ''
         logging.error("Exception while extracting text from '%s'", file)
     return txt
 
 
-Document = namedtuple('Document', ['path', 'filename', 'content'])
-def text_extraction(file) -> Document:
-    txt = to_text(file)
-    base = filename_without_extension(file)
-    return Document(file, base, txt)
+def text_extraction(path) -> Document:
+    assert os.path.isfile(path)
+    filename = filename_without_extension(path)
+    txt = to_text(path)
+    return Document(path, filename, txt)
 
 
 def index(path, include_extensions) -> None:
     if not os.path.isdir(path):
         logging.error("Not a directory: '%s', skipping indexing", path)
         return
     desired_filetype = functools.partial(filetype_admissible, include_extensions)
-    index = Index({'provider':'sqlite', 'filename':'fusearch.db', 'create_db': True})
+    index = Index({
+        'provider':'sqlite',
+        'filename': os.path.join(path,'fusearch.db'),
+        'create_db': True
+    }, tokenizer=NLTKTokenizer())
     for file in filter(desired_filetype, file_generator(path)):
         document = text_extraction(file)
+        index.add_document(document)
 
 
 def fusearch_main(args) -> int:
     logging.info("reading config from %s", args.config)
     config = Config.from_file(args.config)
     logging.info("%s", config)
-    # print(index_file('/Users/pllarroy/docu/books/edward_tufte_the_visual_display_of_quantitative_information_second_edition_2001.pdf'))
-    # index_file('/Users/pllarroy/docu/arts_design/Colour Management - A Comprehensive Guide For Graphic Designers - 150Dpi.pdf')
-    # return
     for path in config.index_dirs:
         index(path, set(config.include_extensions))
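
Taken together, the new pieces give index() a working end-to-end path: tokenize, store, query. A minimal standalone sketch of the same wiring (the document path and text below are invented, and NLTK's punkt data must be installed for word_tokenize to work):

# Hypothetical example, not part of the commit.
from fusearch.index import Index
from fusearch.model import Document
from fusearch.nltk_tokenizer import NLTKTokenizer

idx = Index({'provider': 'sqlite', 'filename': ':memory:'},
            tokenizer=NLTKTokenizer())
idx.add_document(Document('/docs/a.pdf', 'a', 'full text search for local files'))
print(idx.ranked('search'))  # ['/docs/a.pdf']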

src/fusearch/index.py

+90 −11

@@ -1,33 +1,112 @@
 from pony.orm import *
+from .tokenizer import Tokenizer
+from collections import defaultdict
+from .util import uniq
+import msgpack
+import math
+from .model import Result
+import operator
 
 
 class Index:
     """Inverted index"""
-    def __init__(self, bindargs):
+    def __init__(self, bindargs, tokenizer: Tokenizer):
         """
         :param bindargs: pony bind args such as {'provider':'sqlite', 'filename':':memory:'}
+        :param tokenizer: A class implementing :class:`tokenizer.Tokenizer`
         """
+        self.tokenizer = tokenizer
+        # set_sql_debug(True)
+
         db = Database()
-        set_sql_debug(True)
+
+        class Token(db.Entity):
+            tok = Required(str, unique=True)
+            doc_freq = Required(int)
+            documents = Set('Document')
 
         class Document(db.Entity):
-            path = Required(str)
+            url = Required(str)
             filename = Required(str)
-            content = Required(LongStr)
+            content = Optional(LongStr)
             tokens = Set('Token')
+            tokfreq = Required(bytes)
 
-        class Token(db.Entity):
-            tok = Required(str)
-            doc_freq = Required(int)
-            documents = Set(Document)
 
+        self.Token = Token
+        self.Document = Document
         db.bind(**bindargs)
         db.generate_mapping(create_tables=True)
+        self.doc_count = 0
 
     def add_document(self, document):
+        tokens = self.tokenizer.tokenize(document.content)
+        tokfreq = defaultdict(int)
+        for tok in tokens:
+            tokfreq[tok] += 1
         with db_session:
-            Document(document.path, document.filename, document.content)
+            doc = self.Document(
+                url=document.url,
+                filename=document.filename,
+                content=document.content,
+                tokfreq=msgpack.packb(tokfreq))
+
+            for tok, freq in tokfreq.items():
+                token = self.Token.get(tok=tok)
+                if token:
+                    token.doc_freq += freq
+                    token.documents += doc
+                else:
+                    self.Token(tok=tok, doc_freq=freq, documents=doc)
+        self.doc_count += 1
+
+    def query_token(self, token):
+        result = []
+        with db_session:
+            tok = self.Token.get(tok=token)
+            for doc in tok.documents:
+                result.append(doc.url)
+        return result
+
+    def update(self):
+        # TODO update doc_count
+        pass
+
+
+    def query(self, txt):
+        """Given a query string, return a list of search results"""
+        txt_tokens = uniq(self.tokenizer.tokenize(txt))
+        results = []
+        with db_session:
+            tokens = self.Token.select(lambda x: x.tok in txt_tokens)
+            for token in tokens:
+                numdocs_t = len(token.documents)
+                for document in token.documents:
+                    tokfreq = msgpack.unpackb(document.tokfreq, raw=False)
+                    tok = token.tok
+                    tfidf = tokfreq[tok] * math.log(self.doc_count/numdocs_t) / len(tokfreq)
+                    results.append(
+                        Result(
+                            tok=tok,
+                            tfidf=tfidf,
+                            url=document.url
+                        )
+                    )
+        return results
+
+    def rank(self, results):
+        """Convert list of Result to a ranked list of urls"""
+        by_doc = defaultdict(float)
+        # Is this the best way to combine TFIDF? probably not
+        for x in results:
+            by_doc[x.url] += x.tfidf
+        sorted_results = sorted(by_doc.items(), key=operator.itemgetter(1), reverse=True)
+        urls = [x[0] for x in sorted_results]
+        return urls
+
+    def ranked(self, txt):
+        return self.rank(self.query(txt))
 
 
-index = Index({'provider':'sqlite', 'filename':':memory:'})
-index.add_document()
+#index = Index({'provider':'sqlite', 'filename':':memory:'})
+#index.add_document()
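
The score computed in query() is tokfreq[tok] / len(tokfreq), the token's frequency over the number of distinct tokens in the document, multiplied by ln(doc_count / numdocs_t). Worked by hand on the two documents from tests/test_index.py, it predicts the ranking the test asserts:

import math

# query 'another days document' with doc_count = 2;
# doc2.pdf ('this is an another document days go by') has 8 distinct tokens
tf = 1 / 8                    # every token occurs exactly once
idf_rare = math.log(2 / 1)    # 'another' and 'days' appear in 1 of 2 docs
idf_common = math.log(2 / 2)  # 'document' appears in both docs -> 0.0
score_doc2 = 2 * tf * idf_rare + tf * idf_common  # ~= 0.173
# doc.pdf matches only 'document', scoring 0.0, so doc2.pdf ranks first.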

src/fusearch/model.py

+6

@@ -0,0 +1,6 @@
+from collections import namedtuple
+
+Document = namedtuple('Document', ['url', 'filename', 'content'])
+
+
+Result = namedtuple('Result', ['tok', 'tfidf', 'url'])

src/fusearch/nltk_tokenizer.py

+12

@@ -0,0 +1,12 @@
+from .tokenizer import Tokenizer
+
+from nltk.stem import PorterStemmer
+import nltk
+
+class NLTKTokenizer(Tokenizer):
+    def __init__(self):
+        self.stemmer = PorterStemmer()
+
+    def tokenize(self, x):
+        return list(map(self.stemmer.stem, nltk.word_tokenize(x)))
+
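
The Porter stemmer folds inflected forms together so index-time and query-time tokens match. A quick usage check (word_tokenize needs the punkt data, hence the download call):

import nltk
from fusearch.nltk_tokenizer import NLTKTokenizer

nltk.download('punkt')  # one-time fetch of the word tokenizer model
tok = NLTKTokenizer()
print(tok.tokenize('indexing documents'))  # ['index', 'document']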

src/fusearch/tokenizer.py

+9

@@ -0,0 +1,9 @@
+from abc import ABC, abstractmethod
+
+class Tokenizer(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def tokenize(self, x):
+        pass

src/fusearch/util.py

+9

@@ -0,0 +1,9 @@
+from typing import *
+def uniq(xs: List[Any]) -> List[Any]:
+    result = []
+    seen = set()
+    for x in xs:
+        if x not in seen:
+            result.append(x)
+            seen.add(x)
+    return result
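
uniq deduplicates while keeping first-seen order, which a plain set() would not:

from fusearch.util import uniq

print(uniq(['b', 'a', 'b', 'c', 'a']))  # ['b', 'a', 'c']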

tests/test_index.py

+31

@@ -0,0 +1,31 @@
+from fusearch.index import Index
+from fusearch.tokenizer import Tokenizer
+from pony.orm import *
+import logging
+from fusearch.model import Document
+from nose.tools import *
+
+class NaiveTokenizer(Tokenizer):
+    def tokenize(self, x):
+        return x.split()
+
+
+def test_query():
+    index = Index({'provider':'sqlite', 'filename':':memory:'}, NaiveTokenizer())
+    docs = [
+        Document('/path/doc.pdf', 'doc', 'this is an example document example'),
+        Document('/path/doc2.pdf', 'doc', 'this is an another document days go by')
+    ]
+    for doc in docs:
+        index.add_document(doc)
+    res = set(index.query_token('example'))
+    results = index.query('another days document')
+    urls = index.rank(results)
+    eq_(urls, ['/path/doc2.pdf', '/path/doc.pdf'])
+    eq_(index.ranked('another'), ['/path/doc2.pdf'])
+    eq_(index.ranked('nada'), [])
+
+
+if __name__ == '__main__':
+    import nose
+    nose.run(defaultTest=__name__)

tests/test_tokenizer.py

+12

@@ -0,0 +1,12 @@
+import nose
+
+from fusearch.nltk_tokenizer import NLTKTokenizer
+
+def test_tokenizer():
+    tok = NLTKTokenizer()
+    toks = tok.tokenize('directed the movie long time')
+    print(toks)
+
+if __name__ == '__main__':
+    import nose
+    nose.run(defaultTest=__name__)
