
Commit 6f1294c

Return all database fields to the frontend after search and check for empty/nonexistent fields
1 parent 67319c2 commit 6f1294c

File tree: 3 files changed (+110 −44 lines)
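In short, the change stops reading optional MongoDB fields directly and instead routes them through small guard helpers, so an absent or null field comes back as an empty string or list instead of raising an error. A minimal sketch of that pattern, assuming the preprocessing.py from this commit is on the import path and using a made-up document:

from preprocessing import ensure_good_string

doc = {'title': 'Example record', 'doi': None}   # 'isbn' missing, 'doi' stored as null

print(repr(ensure_good_string(doc, 'isbn')))   # '' instead of a KeyError
print(repr(ensure_good_string(doc, 'doi')))    # '' instead of None
print(repr(ensure_good_string(doc, 'title')))  # 'Example record'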

app.py

+88-40
@@ -4,7 +4,7 @@
 from build import *
 from utils import docs2text, id2details
 from cloud_storage import test_file_exists, download_blob, upload_blob, pull_indices, download_pytorch_model
-from preprocessing import preprocess_QA_text, preprocess_string, ensure_good_content
+from preprocessing import preprocess_QA_text, preprocess_string, ensure_good_content, ensure_good_str_list, ensure_good_string, ensure_good_list
 from bson import ObjectId
 from live_indexing import update_faiss

@@ -28,20 +28,51 @@
 ids = load('models/ids.joblib')
 qa_model = QA('models')
 
+@strawberry.type
+class Author:
+    url: str
+    name: str
+    email: str
+
+@strawberry.type
+class Image:
+    url: str
+    description: str
+    provider: str
+    license: str
+
+@strawberry.type
+class Source:
+    id: str
+    name: str
+    description: str
+    url: str
 
 @strawberry.type
 class Document:
     id: str
+    url: str
+    directURL: str
     title: str
-    description: str
+    dateIndexed: str
+    type_: str
     content: List[str]
+    alternateTitle: List[str]
+    fileName: str
+    authors: List[Author]
+    datePublished: str
+    keywords: List[str]
+    description: str
+    alternateDescription: str
+    imageURLS: List[Image]
+    isbn: str
+    issn: str
+    doi: str
+    meshHeadings: List[str]
+    meshQualifiers: List[str]
+    source: Source
     rights: str
-    url: str
     language: str
-    type: str
-    directURL: str
-    datePublished: str
-    dateAdded: str
 
 @strawberry.type
 class SearchResult:
@@ -63,6 +94,52 @@ class IndexingResult:
     status: str
     metadata: MetaData
 
+def search_result_from_documents(documents):
+    return SearchResult(
+        [Document(
+            id = doc['_id'], #ensure_good_string(doc, '_id'),
+            url = doc['url'], #ensure_good_string(doc,'url'),
+            directURL = doc['directURL'], #ensure_good_string(doc,'directURL'),
+            title = doc['title'].encode('latin1').decode('utf8'),
+            dateIndexed = doc['dateIndexed'], #ensure_good_string(doc,'dateIndexed'),
+            type_ = doc['type'], #ensure_good_string(doc,'type'),
+            content = ensure_good_content(doc['content']['text']), # note difference!
+            alternateTitle = ensure_good_str_list(doc,'alternateTitle'),
+            fileName = ensure_good_string(doc,'file_name'),
+            authors = [
+                Author(
+                    url = ensure_good_string(author,'url'),
+                    name = ensure_good_string(author,'name'),
+                    email = ensure_good_string(author,'email')
+                )
+                for author in ensure_good_list(doc, 'authors')],
+            datePublished = ensure_good_string(doc,'datePublished'),
+            keywords = ensure_good_str_list(doc,'keywords'),
+            description = ensure_good_string(doc,'description'),
+            alternateDescription = ensure_good_string(doc,'alternateDescription'),
+            imageURLS = [
+                Image(
+                    url = ensure_good_string(image,'url'),
+                    description = ensure_good_string(image,'description'),
+                    provider = ensure_good_string(image,'provider'),
+                    license = ensure_good_string(image,'licence')
+                )
+                for image in ensure_good_list(doc, 'imageURLS')],
+            isbn = ensure_good_string(doc,'isbn'),
+            issn = ensure_good_string(doc,'issn'),
+            doi = ensure_good_string(doc,'doi'),
+            meshHeadings = ensure_good_str_list(doc,'meshHeadings'),
+            meshQualifiers = ensure_good_str_list(doc,'meshQualifiers'),
+            source = Source(
+                id = ensure_good_string(doc['source'],'id'),
+                name = ensure_good_string(doc['source'],'name'),
+                description = ensure_good_string(doc['source'],'description'),
+                url = ensure_good_string(doc['source'],'url')
+            ), #ensure_good_string(doc,'source'),
+            rights = "", #ensure_good_string(doc,'rights')
+            language = doc['language'], #ensure_good_string(doc,'language')
+        ) for doc in documents])
+
 @strawberry.type
 class Query:

@@ -83,30 +160,10 @@ def search(self, q: str, language: str = 'en', type: str = None) -> SearchResult
         filters['type'] = type
         documents = list(collection.find(filters))
 
-
-        # def ensure_good_content(content_list):
-        #     '''
-        #     function to remove potential problems from the context, and preprocess it to look like normal text
-        #     '''
-        #     # remove None-s from the list
-        #     string_list = map(str,content_list)
-        #     # preprocess and join together the content list
-        #     string_list = [preprocess_string(page, stopping = False, stemming = False, lowercasing = False) for page in string_list ]
-        #     return [string_list]
-
         # things = list(db.things.find({'_id': {'$in': id_array}}))
         documents.sort(key=lambda doc: id_arr.index(doc['_id']))
-        return SearchResult([Document(id=doc['_id'],
-                                      title=doc['title'].encode('latin1').decode('utf8'),
-                                      description=doc['description'],
-                                      content= ensure_good_content(doc['content']['text']),
-                                      url=doc['url'],
-                                      directURL=doc['directURL'],
-                                      type=doc['type'],
-                                      language=doc['language'],
-                                      rights="",
-                                      datePublished=doc['datePublished'],
-                                      dateAdded=doc['dateIndexed']) for doc in documents])
+        return search_result_from_documents(documents)
+
 
     @strawberry.field
     def more_docs(self, id: str) -> SearchResult:
@@ -115,17 +172,8 @@ def more_docs(self, id: str) -> SearchResult:
         id_arr = (np.array(ids)[I[0]]).tolist()
         documents = list(collection.find({'_id': {'$in': id_arr}}))
 
-        return SearchResult([Document(id=doc['_id'],
-                                      title=doc['title'].encode('latin1').decode('utf8'),
-                                      description=doc['description'],
-                                      content= ensure_good_content(doc['content']['text']),
-                                      url=doc['url'],
-                                      directURL=doc['directURL'],
-                                      type=doc['type'],
-                                      language=doc['language'],
-                                      rights="",
-                                      datePublished=doc['datePublished'],
-                                      dateAdded=doc['dateIndexed']) for doc in documents])
+        return search_result_from_documents(documents)
+
     # @strawberry.field
     # def semantic_search(self, q: str) -> SearchResult:
     #     D, I = vector_search(q, bert_model, bert_faiss)
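The one field search_result_from_documents does not route through a guard is the title, which is re-encoded as Latin-1 and decoded as UTF-8 to undo mojibake from a mis-decoded stored value. A standalone illustration of that round-trip (the sample string is made up):

title_from_db = 'CafÃ©'  # UTF-8 bytes for 'Café' that were mis-read as Latin-1
print(title_from_db.encode('latin1').decode('utf8'))  # Café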

mytest.py

+2-4
@@ -1,5 +1,3 @@
-from build import pull_and_preprocess_from_mongo
-
-print(pull_and_preprocess_from_mongo(0, 20)[2])
-
+from preprocessing import ensure_good_str_list
 
+print(ensure_good_str_list({'keywords': ["hello", None, "World"]}, 'keywords'))  # sample key; prints ['hello', '', 'World']

preprocessing.py

+20
@@ -61,6 +61,20 @@ def preprocess_QA_text(text):
     clean_text = re.sub('[^a-zA-Z0-9,\'.?!:\-()\[\] ]', '', clean_text)
     return clean_text
 
+def ensure_good_string(doc, string):
+    # ensures that the field exists in doc and that it is not null, else returns ""
+    if string in doc and doc[string]:
+        return doc[string]
+    else:
+        return ""
+
+def ensure_good_str_list(doc, string):
+    # like ensure_good_string, but for lists of strings: None entries become ""
+    if string in doc and doc[string]:
+        return [s if s is not None else "" for s in doc[string]]
+    else:
+        return []
+
 def ensure_good_content(content_list):
     '''
     function to remove potential problems from the context, and preprocess it to look like normal text
@@ -70,3 +84,9 @@ def ensure_good_content(content_list):
     # preprocess and join together the content list
     string_list = preprocess_string(" ".join(string_list), stopping = False, stemming = False, lowercasing = False)
     return [string_list]
+
+def ensure_good_list(doc, string):
+    if string in doc and doc[string]:
+        return doc[string]
+    else:
+        return []
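A short usage sketch for the list helpers (the sample document is made up): ensure_good_str_list sanitises None entries inside an existing list of strings, while ensure_good_list returns the stored list as-is and falls back to an empty list when the field is absent or empty.

from preprocessing import ensure_good_str_list, ensure_good_list

doc = {'keywords': ['malaria', None, 'fever'], 'authors': []}

print(ensure_good_str_list(doc, 'keywords'))  # ['malaria', '', 'fever']
print(ensure_good_list(doc, 'authors'))       # [] (present but empty)
print(ensure_good_list(doc, 'imageURLS'))     # [] (missing field)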
