|
1 | 1 | from pony.orm import *
|
| 2 | +from .tokenizer import Tokenizer |
| 3 | +from collections import defaultdict |
| 4 | +from .util import uniq |
| 5 | +import msgpack |
| 6 | +import math |
| 7 | +from .model import Result |
| 8 | +import operator |
2 | 9 |
|
3 | 10 |
|
class Index:
    """Inverted index over documents, persisted through Pony ORM.

    Every document is tokenized once when it is added. Its per-token counts
    are stored msgpack-encoded on the document row, and each distinct token
    is linked to the documents that contain it, so a query only visits
    documents sharing at least one token with the query text.
    """

    def __init__(self, bindargs, tokenizer: "Tokenizer"):
        """
        :param bindargs: pony bind args such as {'provider':'sqlite', 'filename':':memory:'}
        :param tokenizer: object providing ``tokenize(text)``, see :class:`tokenizer.Tokenizer`
        """
        self.tokenizer = tokenizer
        # Uncomment to log the SQL generated by pony:
        # set_sql_debug(True)

        db = Database()

        class Token(db.Entity):
            tok = Required(str, unique=True)
            # NOTE(review): add_document() adds per-document occurrence
            # counts here, so this is not a count of documents despite the
            # name - confirm the intended meaning.
            doc_freq = Required(int)
            documents = Set('Document')

        class Document(db.Entity):
            url = Required(str)
            filename = Required(str)
            content = Optional(LongStr)
            tokens = Set('Token')
            # msgpack-encoded mapping of token -> occurrences in this document
            tokfreq = Required(bytes)

        self.Token = Token
        self.Document = Document
        db.bind(**bindargs)
        db.generate_mapping(create_tables=True)

        # The bound database may already contain documents (e.g. a file
        # backed sqlite database), so take the count from the tables
        # instead of assuming an empty index.
        self.doc_count = 0
        self.update()

    def add_document(self, document):
        """Tokenize *document* and store it together with its tokens.

        :param document: object exposing ``url``, ``filename`` and ``content``
        """
        tokfreq = defaultdict(int)
        for tok in self.tokenizer.tokenize(document.content):
            tokfreq[tok] += 1

        with db_session:
            doc = self.Document(
                url=document.url,
                filename=document.filename,
                content=document.content,
                tokfreq=msgpack.packb(tokfreq))

            for tok, freq in tokfreq.items():
                token = self.Token.get(tok=tok)
                if token:
                    token.doc_freq += freq
                    token.documents += doc
                else:
                    self.Token(tok=tok, doc_freq=freq, documents=doc)

        # Only count the document once the session has exited without error.
        self.doc_count += 1

    def query_token(self, token):
        """Return the urls of all documents containing *token*.

        :param token: a single, already tokenized term
        :return: list of urls; empty if the token is not in the index
        """
        with db_session:
            tok = self.Token.get(tok=token)
            if tok is None:
                # get() yields None for an unknown token.
                return []
            return [doc.url for doc in tok.documents]

    def update(self):
        """Refresh ``doc_count`` from the number of stored documents."""
        with db_session:
            self.doc_count = self.Document.select().count()

    def query(self, txt):
        """Given a query string, return a list of search results

        One :class:`Result` is produced per (query token, document) pair.
        Its ``tfidf`` is the token's count in the document, times
        ``log(doc_count / number of documents containing the token)``,
        divided by the number of distinct tokens in the document.
        """
        txt_tokens = uniq(self.tokenizer.tokenize(txt))
        results = []
        with db_session:
            tokens = self.Token.select(lambda x: x.tok in txt_tokens)
            for token in tokens:
                numdocs_t = len(token.documents)
                if not numdocs_t:
                    # No documents to score; also avoids dividing by zero.
                    continue
                tok = token.tok
                # Identical for every document of this token, so compute once.
                idf = math.log(self.doc_count / numdocs_t)
                for document in token.documents:
                    tokfreq = msgpack.unpackb(document.tokfreq, raw=False)
                    results.append(
                        Result(
                            tok=tok,
                            tfidf=tokfreq[tok] * idf / len(tokfreq),
                            url=document.url
                        )
                    )
        return results

    def rank(self, results):
        """Convert list of Result to a ranked list of urls

        Scores of results sharing a url are summed; urls are returned by
        descending total, ties keeping the order in which they first appear.
        """
        by_doc = defaultdict(float)
        # Is this the best way to combine TFIDF? probably not
        for result in results:
            by_doc[result.url] += result.tfidf
        ordered = sorted(by_doc.items(), key=operator.itemgetter(1), reverse=True)
        return [url for url, _score in ordered]

    def ranked(self, txt):
        """Return the urls matching the query string *txt*, best match first."""
        return self.rank(self.query(txt))
30 | 109 |
|
31 | 110 |
|
32 |
| -index = Index({'provider':'sqlite', 'filename':':memory:'}) |
33 |
| -index.add_document() |
| 111 | +#index = Index({'provider':'sqlite', 'filename':':memory:'}) |
| 112 | +#index.add_document() |
0 commit comments