Skip to content

Commit 9b4910b

Browse files
committed
Fix index errors.
1 parent b45ec6f commit 9b4910b

File tree

2 files changed

+44
-5
lines changed

2 files changed

+44
-5
lines changed

build.py

+9 -3
Original file line number | Diff line number | Diff line change
@@ -97,22 +97,24 @@ def build_faiss(model, name):
9797
c = 2000 #collection.find({}, projection={'title': True, 'description': True, "content.text": True}).count()
9898
encoder = None
9999
index = None
100+
# idmap = None
100101
if hasattr(model, 'encode'):
101102
encoder = lambda x: model.encode(x).astype("float32")
102103
else:
103104
encoder = lambda x:model.transform(x).toarray().astype("float32")
104105
i = 0
105-
docs = []
106106
ids = []
107107
while i < c:
108108
print(i)
109+
docs = []
109110
for x in collection.find({}, projection={'_id': True, 'title': True, 'description': True, "content.text": True}).skip(i).limit(500):
110111
docs.append(x.get("title","") + " " + x.get('description',"")+ " " + " ".join(filter(None,x.get('content',{}).get('text',[]))))
111112
ids.append(x['_id'])
113+
print('docs',len(docs))
112114
embeddings = encoder(docs)
113115
if i == 0:
114116
index = faiss.IndexFlatIP(embeddings.shape[1])
115-
i += len(embeddings)
117+
# idmap = faiss.IndexIDMap(index)
116118

117119

118120
# embeddings = np.array([embedding for embedding in embeddings]).astype("float32")
@@ -124,7 +126,11 @@ def build_faiss(model, name):
124126
# Step 3: Pass the index to IndexIDMap
125127
# index = faiss.IndexIDMap(index)
126128
# Step 4: Add vectors and their IDs
127-
index.add_with_ids(embeddings,list(range(i,len(embeddings))))
129+
print("range",len(np.arange(i,i+len(embeddings))))
130+
print("embeds",len(embeddings))
131+
# idmap.add_with_ids(embeddings,np.arange(i,i+len(embeddings)))
132+
index.add(embeddings)
133+
i += len(embeddings)
128134
faiss.write_index(index,f"models/{name}.index")
129135
dump(ids,'models/ids.joblib')
130136
print(f"Completed {name} index.")

test.py

+35 -2
Original file line number | Diff line number | Diff line change
@@ -29,5 +29,38 @@
2929
# # f.write("%s\n" % item.encode('latin1').decode('utf-8'))
3030
# model = load_tfidf_model()
3131
# build_faiss(model, "tfidf")
32-
ids = load('models/ids.joblib')
33-
print(ids)
32+
# Requires the PyMongo package.
33+
# https://api.mongodb.com/python/current
34+
35+
# client = MongoClient('mongodb+srv://ir:[email protected]/test?authSource=admin&replicaSet=atlas-5jw1an-shard-0&readPreference=primary&appname=MongoDB%20Compass&ssl=true')
36+
result = collection.aggregate([
37+
{
38+
'$project': {
39+
'content': {
40+
'$reduce': {
41+
'input': '$content.text',
42+
'initialValue': '',
43+
'in': {
44+
'$concat': [
45+
'$$value', ' ', '$$this'
46+
]
47+
}
48+
}
49+
},
50+
'title': 1,
51+
'description': 1
52+
}
53+
}, {
54+
'$project': {
55+
'text': {
56+
'$concat': [
57+
'$title', ' ', '$description', ' ', '$content'
58+
]
59+
}
60+
}
61+
},
62+
{ "$limit": 1 },
63+
{ "$skip": 0 }
64+
])
65+
# ids = load('models/ids.joblib')
66+
print(list(result))

0 commit comments

Comments
 (0)