Commit 484cff6

Author: Pedro Larroy
Committed: Dec 21, 2018
TFIDF indexing
1 parent 16a0032 commit 484cff6

12 files changed: +196 −28 lines

3rdparty/textract

TODO

+2

@@ -0,0 +1,2 @@
+- Language detection to select the stemming algorithm
+- Verify TFIDF combination on queries

setup.py

+3 −2

@@ -1,15 +1,16 @@
-from setuptools import setup, find_packages
+from setuptools import setup, find_packages, find_namespace_packages
 
 from os import path
 
 setup(
     name="fusearch",
     version="0.1",
     packages=find_packages('src'),
+    package_dir={"": "src"},
     install_requires=['textract'],
     tests_require=['nose'],
     package_data={},
-    autho="Pedro Larroy",
+    author="Pedro Larroy",
     author_email="pedro.larroy.lists@gmail.com",
     description="fusearch is a local full text search engine",
     license="Apache 2",
File renamed without changes.

src/fusearch/fusearchd.py

+21 −14

@@ -7,12 +7,13 @@
 import signal
 import sys
 import logging
-import time
 import yaml
 import textract
 import filetype
 import functools
-from collections import namedtuple
+from fusearch.index import Index
+from fusearch.model import Document
+from fusearch.nltk_tokenizer import NLTKTokenizer
 
 
 def script_name() -> str:
@@ -24,7 +25,7 @@ def config_logging() -> None:
     import time
     logging.getLogger().setLevel(logging.INFO)
     logging.getLogger("requests").setLevel(logging.WARNING)
-    logging.basicConfig(format='{}: %(asctime)sZ %(levelname)s %(message)s'.
+    logging.basicConfig(format='{}: %(asctime)sZ %(name)s %(levelname)s %(message)s'.
                         format(script_name()))
     logging.Formatter.converter = time.gmtime
 
@@ -150,39 +151,45 @@ def file_generator(path):
 
 def to_text(file) -> None:
     try:
-        txt = textract.process(file)
+        txt_b = textract.process(file, method='pdftotext')
+        # TODO more intelligent decoding? there be dragons
+        txt = txt_b.decode('utf-8')
+        print(file)
         print(len(txt))
         print(txt[:80])
-    except RuntimeError as e:
+        print('-------------------')
+    except Exception as e:
         txt = ''
         logging.error("Exception while extracting text from '%s'", file)
     return txt
 
 
-Document = namedtuple('Document', ['path', 'filename', 'content'])
-def text_extraction(file) -> Document:
-    txt = to_text(file)
-    base = filename_without_extension(file)
-    return Document(file, base, txt)
+def text_extraction(path) -> Document:
+    assert os.path.isfile(path)
+    filename = filename_without_extension(path)
+    txt = to_text(path)
+    return Document(path, filename, txt)
 
 
 def index(path, include_extensions) -> None:
     if not os.path.isdir(path):
         logging.error("Not a directory: '%s', skipping indexing", path)
         return
     desired_filetype = functools.partial(filetype_admissible, include_extensions)
-    index = Index({'provider':'sqlite', 'filename':'fusearch.db', 'create_db': True})
+    index = Index({
+        'provider':'sqlite',
+        'filename': os.path.join(path,'fusearch.db'),
+        'create_db': True
+    }, tokenizer=NLTKTokenizer())
     for file in filter(desired_filetype, file_generator(path)):
         document = text_extraction(file)
+        index.add_document(document)
 
 
 def fusearch_main(args) -> int:
     logging.info("reading config from %s", args.config)
     config = Config.from_file(args.config)
     logging.info("%s", config)
-    # print(index_file('/Users/pllarroy/docu/books/edward_tufte_the_visual_display_of_quantitative_information_second_edition_2001.pdf'))
-    # index_file('/Users/pllarroy/docu/arts_design/Colour Management - A Comprehensive Guide For Graphic Designers - 150Dpi.pdf')
-    # return
     for path in config.index_dirs:
         index(path, set(config.include_extensions))
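
Taken together, the new pieces give index() a working end-to-end path: tokenize, store, query. A minimal standalone sketch of the same wiring (the document path and text below are invented, and NLTK's punkt data must be installed for word_tokenize to work):

# Hypothetical example, not part of the commit.
from fusearch.index import Index
from fusearch.model import Document
from fusearch.nltk_tokenizer import NLTKTokenizer

idx = Index({'provider': 'sqlite', 'filename': ':memory:'},
            tokenizer=NLTKTokenizer())
idx.add_document(Document('/docs/a.pdf', 'a', 'full text search for local files'))
print(idx.ranked('search'))  # ['/docs/a.pdf']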

src/fusearch/index.py

+90 −11

@@ -1,33 +1,112 @@
 from pony.orm import *
+from .tokenizer import Tokenizer
+from collections import defaultdict
+from .util import uniq
+import msgpack
+import math
+from .model import Result
+import operator
 
 
 class Index:
     """Inverted index"""
-    def __init__(self, bindargs):
+    def __init__(self, bindargs, tokenizer: Tokenizer):
         """
         :param bindargs: pony bind args such as {'provider':'sqlite', 'filename':':memory:'}
+        :param tokenizer: A class implementing :class:`tokenizer.Tokenizer`
         """
+        self.tokenizer = tokenizer
+        # set_sql_debug(True)
+
         db = Database()
-        set_sql_debug(True)
+
+        class Token(db.Entity):
+            tok = Required(str, unique=True)
+            doc_freq = Required(int)
+            documents = Set('Document')
 
         class Document(db.Entity):
-            path = Required(str)
+            url = Required(str)
             filename = Required(str)
-            content = Required(LongStr)
+            content = Optional(LongStr)
             tokens = Set('Token')
+            tokfreq = Required(bytes)
 
-        class Token(db.Entity):
-            tok = Required(str)
-            doc_freq = Required(int)
-            documents = Set(Document)
 
+        self.Token = Token
+        self.Document = Document
         db.bind(**bindargs)
         db.generate_mapping(create_tables=True)
+        self.doc_count = 0
 
     def add_document(self, document):
+        tokens = self.tokenizer.tokenize(document.content)
+        tokfreq = defaultdict(int)
+        for tok in tokens:
+            tokfreq[tok] += 1
         with db_session:
-            Document(document.path, document.filename, document.content)
+            doc = self.Document(
+                url=document.url,
+                filename=document.filename,
+                content=document.content,
+                tokfreq=msgpack.packb(tokfreq))
+
+            for tok, freq in tokfreq.items():
+                token = self.Token.get(tok=tok)
+                if token:
+                    token.doc_freq += freq
+                    token.documents += doc
+                else:
+                    self.Token(tok=tok, doc_freq=freq, documents=doc)
+        self.doc_count += 1
+
+    def query_token(self, token):
+        result = []
+        with db_session:
+            tok = self.Token.get(tok=token)
+            for doc in tok.documents:
+                result.append(doc.url)
+        return result
+
+    def update(self):
+        # TODO update doc_count
+        pass
+
+
+    def query(self, txt):
+        """Given a query string, return a list of search results"""
+        txt_tokens = uniq(self.tokenizer.tokenize(txt))
+        results = []
+        with db_session:
+            tokens = self.Token.select(lambda x: x.tok in txt_tokens)
+            for token in tokens:
+                numdocs_t = len(token.documents)
+                for document in token.documents:
+                    tokfreq = msgpack.unpackb(document.tokfreq, raw=False)
+                    tok = token.tok
+                    tfidf = tokfreq[tok] * math.log(self.doc_count/numdocs_t) / len(tokfreq)
+                    results.append(
+                        Result(
+                            tok=tok,
+                            tfidf=tfidf,
+                            url=document.url
+                        )
+                    )
+        return results
+
+    def rank(self, results):
+        """Convert list of Result to a ranked list of urls"""
+        by_doc = defaultdict(float)
+        # Is this the best way to combine TFIDF? probably not
+        for x in results:
+            by_doc[x.url] += x.tfidf
+        sorted_results = sorted(by_doc.items(), key=operator.itemgetter(1), reverse=True)
+        urls = [x[0] for x in sorted_results]
+        return urls
+
+    def ranked(self, txt):
+        return self.rank(self.query(txt))
 
 
-index = Index({'provider':'sqlite', 'filename':':memory:'})
-index.add_document()
+#index = Index({'provider':'sqlite', 'filename':':memory:'})
+#index.add_document()
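
The score computed in query() is tokfreq[tok] / len(tokfreq), the token's frequency over the number of distinct tokens in the document, multiplied by ln(doc_count / numdocs_t). Worked by hand on the two documents from tests/test_index.py, it predicts the ranking the test asserts:

import math

# query 'another days document' with doc_count = 2;
# doc2.pdf ('this is an another document days go by') has 8 distinct tokens
tf = 1 / 8                    # every token occurs exactly once
idf_rare = math.log(2 / 1)    # 'another' and 'days' appear in 1 of 2 docs
idf_common = math.log(2 / 2)  # 'document' appears in both docs -> 0.0
score_doc2 = 2 * tf * idf_rare + tf * idf_common  # ~= 0.173
# doc.pdf matches only 'document', scoring 0.0, so doc2.pdf ranks first.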

src/fusearch/model.py

+6

@@ -0,0 +1,6 @@
+from collections import namedtuple
+
+Document = namedtuple('Document', ['url', 'filename', 'content'])
+
+
+Result = namedtuple('Result', ['tok', 'tfidf', 'url'])

src/fusearch/nltk_tokenizer.py

+12

@@ -0,0 +1,12 @@
+from .tokenizer import Tokenizer
+
+from nltk.stem import PorterStemmer
+import nltk
+
+class NLTKTokenizer(Tokenizer):
+    def __init__(self):
+        self.stemmer = PorterStemmer()
+
+    def tokenize(self, x):
+        return list(map(self.stemmer.stem, nltk.word_tokenize(x)))
+
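
The Porter stemmer folds inflected forms together so index-time and query-time tokens match. A quick usage check (word_tokenize needs the punkt data, hence the download call):

import nltk
from fusearch.nltk_tokenizer import NLTKTokenizer

nltk.download('punkt')  # one-time fetch of the word tokenizer model
tok = NLTKTokenizer()
print(tok.tokenize('indexing documents'))  # ['index', 'document']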

src/fusearch/tokenizer.py

+9

@@ -0,0 +1,9 @@
+from abc import ABC, abstractmethod
+
+class Tokenizer(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def tokenize(self, x):
+        pass

src/fusearch/util.py

+9

@@ -0,0 +1,9 @@
+from typing import *
+def uniq(xs: List[Any]) -> List[Any]:
+    result = []
+    seen = set()
+    for x in xs:
+        if x not in seen:
+            result.append(x)
+            seen.add(x)
+    return result
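
uniq deduplicates while keeping first-seen order, which a plain set() would not:

from fusearch.util import uniq

print(uniq(['b', 'a', 'b', 'c', 'a']))  # ['b', 'a', 'c']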

tests/test_index.py

+31

@@ -0,0 +1,31 @@
+from fusearch.index import Index
+from fusearch.tokenizer import Tokenizer
+from pony.orm import *
+import logging
+from fusearch.model import Document
+from nose.tools import *
+
+class NaiveTokenizer(Tokenizer):
+    def tokenize(self, x):
+        return x.split()
+
+
+def test_query():
+    index = Index({'provider':'sqlite', 'filename':':memory:'}, NaiveTokenizer())
+    docs = [
+        Document('/path/doc.pdf', 'doc', 'this is an example document example'),
+        Document('/path/doc2.pdf', 'doc', 'this is an another document days go by')
+    ]
+    for doc in docs:
+        index.add_document(doc)
+    res = set(index.query_token('example'))
+    results = index.query('another days document')
+    urls = index.rank(results)
+    eq_(urls, ['/path/doc2.pdf', '/path/doc.pdf'])
+    eq_(index.ranked('another'), ['/path/doc2.pdf'])
+    eq_(index.ranked('nada'), [])
+
+
+if __name__ == '__main__':
+    import nose
+    nose.run(defaultTest=__name__)

tests/test_tokenizer.py

+12

@@ -0,0 +1,12 @@
+import nose
+
+from fusearch.nltk_tokenizer import NLTKTokenizer
+
+def test_tokenizer():
+    tok = NLTKTokenizer()
+    toks = tok.tokenize('directed the movie long time')
+    print(toks)
+
+if __name__ == '__main__':
+    import nose
+    nose.run(defaultTest=__name__)
