4
4
from build import *
5
5
from utils import docs2text , id2details
6
6
from cloud_storage import test_file_exists , download_blob , upload_blob , pull_indices , download_pytorch_model
7
- from preprocessing import preprocess_QA_text , preprocess_string , ensure_good_content
7
+ from preprocessing import preprocess_QA_text , preprocess_string , ensure_good_content , ensure_good_str_list , ensure_good_string , ensure_good_list
8
8
from bson import ObjectId
9
9
from live_indexing import update_faiss
10
10
28
28
ids = load ('models/ids.joblib' )
29
29
qa_model = QA ('models' )
30
30
31
@strawberry.type
class Author:
    # One author entry attached to a document; every field is a plain string.
    url: str
    name: str
    email: str
36
+
37
@strawberry.type
class Image:
    # One image reference attached to a document; every field is a plain string.
    url: str
    description: str
    provider: str
    license: str
43
+
44
@strawberry.type
class Source:
    # The source a document was obtained from; every field is a plain string.
    id: str
    name: str
    description: str
    url: str
31
50
32
51
@strawberry.type
class Document:
    # One indexed document as exposed through the API.

    # Identity and location.
    id: str
    url: str
    directURL: str
    title: str
    dateIndexed: str
    # NOTE(review): the trailing underscore presumably avoids the builtin
    # name `type`; confirm the field name exposed to clients is intended.
    type_: str

    # Text content and alternate naming.
    content: List[str]
    alternateTitle: List[str]
    fileName: str

    # Bibliographic metadata.
    authors: List[Author]
    datePublished: str
    keywords: List[str]
    description: str
    alternateDescription: str
    imageURLS: List[Image]
    isbn: str
    issn: str
    doi: str
    meshHeadings: List[str]
    meshQualifiers: List[str]

    # Provenance and usage.
    source: Source
    rights: str
    language: str
41
- type : str
42
- directURL : str
43
- datePublished : str
44
- dateAdded : str
45
76
46
77
@strawberry .type
47
78
class SearchResult :
@@ -63,6 +94,52 @@ class IndexingResult:
63
94
status : str
64
95
metadata : MetaData
65
96
97
def _repair_title(title):
    # Undo UTF-8-read-as-Latin-1 mojibake. A title that cannot make the round
    # trip is returned unchanged, so one such record does not abort the
    # whole result list.
    try:
        return title.encode('latin1').decode('utf8')
    except UnicodeError:
        return title


def _author_from_record(author):
    # Build an Author from one entry of a document's 'authors' list.
    return Author(
        url=ensure_good_string(author, 'url'),
        name=ensure_good_string(author, 'name'),
        email=ensure_good_string(author, 'email'),
    )


def _image_from_record(image):
    # Build an Image from one entry of a document's 'imageURLS' list.
    return Image(
        url=ensure_good_string(image, 'url'),
        description=ensure_good_string(image, 'description'),
        provider=ensure_good_string(image, 'provider'),
        # The Image type declares this field as `license`; the record key is
        # still read as 'licence', exactly as before.
        license=ensure_good_string(image, 'licence'),
    )


def _source_from_record(source):
    # Build a Source from a document's 'source' sub-record.
    return Source(
        id=ensure_good_string(source, 'id'),
        name=ensure_good_string(source, 'name'),
        description=ensure_good_string(source, 'description'),
        url=ensure_good_string(source, 'url'),
    )


def _document_from_record(doc):
    # Build a Document from one raw document record.
    return Document(
        # These keys are read directly, so a record lacking any of them
        # raises KeyError.
        id=doc['_id'],
        url=doc['url'],
        directURL=doc['directURL'],
        title=_repair_title(doc['title']),
        dateIndexed=doc['dateIndexed'],
        type_=doc['type'],
        # Unlike the other fields, content lives one level down,
        # under 'content' -> 'text'.
        content=ensure_good_content(doc['content']['text']),
        alternateTitle=ensure_good_str_list(doc, 'alternateTitle'),
        # The record key is 'file_name', not 'fileName'.
        fileName=ensure_good_string(doc, 'file_name'),
        authors=[
            _author_from_record(author)
            for author in ensure_good_list(doc, 'authors')
        ],
        datePublished=ensure_good_string(doc, 'datePublished'),
        keywords=ensure_good_str_list(doc, 'keywords'),
        description=ensure_good_string(doc, 'description'),
        alternateDescription=ensure_good_string(doc, 'alternateDescription'),
        imageURLS=[
            _image_from_record(image)
            for image in ensure_good_list(doc, 'imageURLS')
        ],
        isbn=ensure_good_string(doc, 'isbn'),
        issn=ensure_good_string(doc, 'issn'),
        doi=ensure_good_string(doc, 'doi'),
        meshHeadings=ensure_good_str_list(doc, 'meshHeadings'),
        meshQualifiers=ensure_good_str_list(doc, 'meshQualifiers'),
        source=_source_from_record(doc['source']),
        # Rights are not read from the record; the field is always empty.
        rights="",
        language=doc['language'],
    )


# NOTE: the name is misspelled ("serach"), but callers use it as-is, so it
# is kept unchanged.
def serach_result_from_documents(documents):
    """Convert raw document records into a SearchResult.

    Args:
        documents: iterable of document records supporting ``doc[key]``
            lookup. Each record must provide '_id', 'url', 'directURL',
            'title', 'dateIndexed', 'type', 'content' (with a 'text'
            entry), 'source' and 'language'. The remaining fields are read
            through the ``ensure_good_*`` helpers imported from
            ``preprocessing``, which presumably supply a safe default for
            missing values (TODO confirm).

    Returns:
        A SearchResult wrapping one Document per record, in input order.

    Raises:
        KeyError: if a dict record lacks one of the directly accessed keys.
    """
    return SearchResult([_document_from_record(doc) for doc in documents])
142
+
66
143
@strawberry .type
67
144
class Query :
68
145
@@ -83,30 +160,10 @@ def search(self, q: str, language: str = 'en', type: str = None) -> SearchResult
83
160
filters ['type' ] = type
84
161
documents = list (collection .find (filters ))
85
162
86
-
87
- # def ensure_good_content(content_list):
88
- # '''
89
- # function to remove potential problems from the context, and preprocess it to look like normal text
90
- # '''
91
- # # remove None-s from the list
92
- # string_list = map(str,content_list)
93
- # # preprocess and join together the content list
94
- # string_list = [preprocess_string(page, stopping = False, stemming = False, lowercasing = False) for page in string_list ]
95
- # return [string_list]
96
-
97
163
# things = list(db.things.find({'_id': {'$in': id_array}}))
98
164
documents .sort (key = lambda doc : id_arr .index (doc ['_id' ]))
99
- return SearchResult ([Document (id = doc ['_id' ],
100
- title = doc ['title' ].encode ('latin1' ).decode ('utf8' ),
101
- description = doc ['description' ],
102
- content = ensure_good_content (doc ['content' ]['text' ]),
103
- url = doc ['url' ],
104
- directURL = doc ['directURL' ],
105
- type = doc ['type' ],
106
- language = doc ['language' ],
107
- rights = "" ,
108
- datePublished = doc ['datePublished' ],
109
- dateAdded = doc ['dateIndexed' ]) for doc in documents ])
165
+ return serach_result_from_documents (documents )
166
+
110
167
111
168
@strawberry .field
112
169
def more_docs (self , id : str ) -> SearchResult :
@@ -115,17 +172,8 @@ def more_docs(self, id: str) -> SearchResult:
115
172
id_arr = (np .array (ids )[I [0 ]]).tolist ()
116
173
documents = list (collection .find ({'_id' : {'$in' : id_arr }}))
117
174
118
- return SearchResult ([Document (id = doc ['_id' ],
119
- title = doc ['title' ].encode ('latin1' ).decode ('utf8' ),
120
- description = doc ['description' ],
121
- content = ensure_good_content (doc ['content' ]['text' ]),
122
- url = doc ['url' ],
123
- directURL = doc ['directURL' ],
124
- type = doc ['type' ],
125
- language = doc ['language' ],
126
- rights = "" ,
127
- datePublished = doc ['datePublished' ],
128
- dateAdded = doc ['dateIndexed' ]) for doc in documents ])
175
+ return serach_result_from_documents (documents )
176
+
129
177
# @strawberry.field
130
178
# def semantic_search(self, q: str) -> SearchResult:
131
179
# D, I = vector_search(q, bert_model, bert_faiss)
0 commit comments