-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdb.ts
278 lines (230 loc) · 8.9 KB
/
db.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
import { PGlite } from "~dist/electric-sql";
import { PGliteWorker } from 'dist/electric-sql/worker/index.js'
import { vector } from '~dist/electric-sql/vector';
import { extractDomain, zip } from "~lib/utils";
import type { Chunk } from "~lib/chunk";
// TODO: fix the content revalidation
// TODO: accept "" to get keyword search instead of semantic search
/** One row produced by `search`: an embedding chunk joined to its source page. */
export interface SearchResult {
  id: number;
  content: string;
  url: string;
  page_id: number;
  chunk_tag_id: string;
  /** Inner-product distance (`<#>`); more negative means more similar. */
  prob: number;
}
const DB_STORAGE = "idb://casper"

// Cache the creation *promise* (not the resolved instance). The old code
// assigned `dbInstance` only after `PGlite.create` resolved, so two callers
// racing through the first `getDB()` each created their own instance.
// TODO: a Web Lock (see lockAndRunPglite) still serializes actual usage.
let dbPromise: Promise<PGlite> | null = null;

/**
 * Returns the shared PGlite instance, creating it and running the schema
 * init exactly once. Logs the database UUID on every call.
 */
export async function getDB() {
  if (!dbPromise) {
    console.log("Attempting to create pglite db")
    dbPromise = (async () => {
      const db = await PGlite.create(DB_STORAGE, {
        extensions: {
          vector
        },
        // relaxedDurability: false,
        // debug: 1
      });
      await initSchema(db);
      return db;
    })();
    // On failure, drop the cached promise so a later call can retry
    // (mirrors the old behavior, where dbInstance stayed unset on failure).
    dbPromise.catch(() => { dbPromise = null; });
  }
  const db = await dbPromise;
  await getDBName(db);
  return db;
}
/**
 * Counts the rows of `table`. NOTE: the table name is interpolated into the
 * SQL text, so only call this with trusted, internal table names.
 */
export const countRows = async (db, table) => {
  const result = await db.query(`SELECT COUNT(*) FROM ${table};`);
  const [firstRow] = result.rows;
  return firstRow.count;
};
/**
 * Creates all tables and indexes if missing, then seeds two one-time rows:
 * a random UUID identifying this database, and the default site filters.
 * Safe to run on every startup.
 */
export const initSchema = async (db: PGlite) => {
  await db.exec(`
    --DROP TABLE IF EXISTS embedding;
    --DROP TABLE IF EXISTS page;
    --DROP TABLE IF EXISTS filters;
    --DROP TABLE IF EXISTS search_results_cache;
    CREATE EXTENSION IF NOT EXISTS vector;
    CREATE TABLE IF NOT EXISTS db(
      id SERIAL PRIMARY KEY,
      name TEXT UNIQUE
    );
    CREATE TABLE IF NOT EXISTS page(
      id SERIAL PRIMARY KEY,
      createdAt TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
      updatedAt TIMESTAMP DEFAULT NOW(),
      url TEXT NOT NULL UNIQUE,
      title TEXT
    );
    CREATE TABLE IF NOT EXISTS embedding(
      id SERIAL PRIMARY KEY,
      page_id INT REFERENCES page(id) ON DELETE CASCADE,
      content TEXT NOT NULL,
      embedding vector(384),
      chunk_tag_id TEXT,
      createdAt TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    CREATE TABLE IF NOT EXISTS filters(
      url TEXT NOT NULL,
      createdAt TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    );
    CREATE TABLE IF NOT EXISTS search_results_cache(
      id SERIAL PRIMARY KEY,
      embedding_id INT REFERENCES embedding(id) ON DELETE CASCADE,
      search_text TEXT NOT NULL,
      similarity FLOAT
    );
    CREATE INDEX IF NOT EXISTS page_created_at_index
    ON page(createdAt);
    -- Named + IF NOT EXISTS: the previous anonymous CREATE INDEX added a
    -- brand-new duplicate hnsw index every time initSchema ran against the
    -- persisted idb database.
    CREATE INDEX IF NOT EXISTS embedding_hnsw_index
    ON embedding USING hnsw (embedding vector_ip_ops);
    CREATE INDEX IF NOT EXISTS filters_url_index
    ON filters(url);
  `)
  // Seed the per-database UUID once.
  if (await countRows(db, "db") === 0) {
    const special = crypto.randomUUID();
    await db.query("INSERT INTO db(name) VALUES ($1) ON CONFLICT DO NOTHING", [special]);
  }
  // Seed the default blocked-sites list once.
  if (await countRows(db, "filters") === 0) {
    await db.exec(`
      INSERT INTO filters(url)
      VALUES ('facebook.com'), ('x.com'), ('google.com'), ('youtube.com')
      ON CONFLICT DO NOTHING;
    `)
  }
  return;
}
/** Looks up this database's UUID (seeded by initSchema) and logs it. */
export const getDBName = async (db: PGliteWorker) => {
  const result = await db.query('SELECT id, name FROM db;');
  const [firstRow] = result.rows;
  const dbId = firstRow ? firstRow.name : null;
  console.log("-------------- DATABASE UUID --------------")
  console.log(dbId);
  console.log("-------------- ------------- --------------")
}
/**
 * Runs `cb` while holding the "pglite" Web Lock, serializing database work.
 * IMPORTANT: callbacks receive the database under the `db` key of their
 * single options object. TODO: needs better docs + abstraction for this.
 */
export const lockAndRunPglite = async (cb, { ...options }) => {
  return navigator.locks.request("pglite", async (lock) => {
    const database = await getDB();
    return cb({ db: database, ...options });
  })
}
/**
 * Inserts one chunk + its embedding vector for the given page.
 * The vector is bound as a parameter in pgvector's '[x,y,...]' text form
 * (matching `search`) instead of being spliced into the SQL string, which
 * was both injection-prone and inconsistent with the rest of this file.
 * NOTE(review): urlId is typed string but page_id is an INT column — the
 * driver coerces it; confirm callers pass a numeric id.
 */
export const storeEmbeddings = async (db: PGliteWorker, urlId: string, chunk: Chunk, embedding: number[]) => {
  await db.query(
    `INSERT INTO embedding (page_id, content, embedding, chunk_tag_id) VALUES ($1, $2, $3, $4);`,
    [urlId, chunk.content, JSON.stringify(embedding), chunk.id]
  );
}
/** Returns the page id stored for `url`, or null when the page is unknown. */
export const getUrlId = async ({ db, url }: { db: PGliteWorker, url: string }) => {
  const lookup = await db.query(`SELECT id FROM page WHERE url = $1`, [url]);
  const [match] = lookup.rows;
  return match ? match.id : null;
}
/**
 * True when `url` should be skipped: either its domain is in the filters
 * list, or the page was already stored within the last `withinDays` days.
 * `withinDays` is now optional in the type (it always had a default) and is
 * bound as a SQL parameter instead of being interpolated into the query.
 */
export const urlIsPresentOrInDatetimeRange = async ({ db, url, withinDays = 3 }: { db: PGliteWorker, url: string, withinDays?: number }) => {
  const filterSites = await getFilterSites(db);
  if (filterSites.includes(extractDomain(url))) {
    // skip processing this site as it is in the filtered sites list
    return true;
  }
  const res = await db.query(`
    SELECT * FROM page
    WHERE url = $1
    AND createdAt > NOW() - ($2 * INTERVAL '1 day')
  `, [url, Number(withinDays)])
  return res.rows.length > 0;
}
/**
 * Semantic search: returns up to `limit` chunks whose inner-product distance
 * to `embedding` is below `matchThreshold`, joined to their pages.
 */
export const search = async (db: PGliteWorker, embedding: number[], matchThreshold = 0.8, limit = 5): Promise<SearchResult[]> => {
  const params = [JSON.stringify(embedding), Number(matchThreshold), Number(limit)];
  const res = await db.query(`
    SELECT embedding.id, content, page_id, page.url as url, embedding.embedding <#> $1 AS prob, embedding.chunk_tag_id
    FROM embedding
    -- the inner product is negative, so we negate matchThreshold
    LEFT JOIN page
    ON page.id = embedding.page_id
    WHERE embedding.embedding <#> $1 < $2
    ORDER BY prob
    LIMIT $3
  `, params);
  return res.rows.map(({ id, content, page_id, url, prob, chunk_tag_id }: any) => ({
    id,
    content,
    page_id,
    url,
    prob,
    chunk_tag_id
  }));
}
/**
 * Deletes pages older than `numDays` (embeddings cascade via FK) and returns
 * the number of rows removed. `numDays` is bound as a SQL parameter instead
 * of being interpolated into the query text.
 */
export const deletePagesOlderThan = async (db: PGliteWorker, numDays: number = 14) => {
  const res = await db.query(`
    DELETE FROM page
    WHERE page.createdAt < NOW() - ($1 * INTERVAL '1 day')
    RETURNING *;
  `, [Number(numDays)])
  return res.affectedRows;
}
/**
 * Inserts each site URL into the filters table inside one transaction.
 * Uses a sequential for..of with await: the previous forEach(async ...)
 * never awaited its callbacks, so the transaction callback returned (and
 * could commit) before the inserts actually ran.
 */
export const saveFilterSites = async (db: PGliteWorker, listOfSites: string[]) => {
  await db.transaction(async (tx) => {
    // not performant, but this will have to do for now.
    for (const siteUrl of listOfSites) {
      await tx.query('INSERT INTO filters(url) VALUES($1)', [siteUrl])
    }
  })
}
/**
 * Deletes each site URL from the filters table inside one transaction.
 * Uses a sequential for..of with await: the previous forEach(async ...)
 * never awaited its callbacks, so the transaction callback returned (and
 * could commit) before the deletes actually ran.
 */
export const removeFilterSites = async (db: PGliteWorker, listOfSites: string[]) => {
  await db.transaction(async (tx) => {
    // not performant, but this will have to do for now.
    for (const siteUrl of listOfSites) {
      await tx.query('DELETE FROM filters WHERE url = $1', [siteUrl])
    }
  })
}
/** Returns every filtered site URL as a flat string array. */
export const getFilterSites = async (db: PGliteWorker) => {
  const { rows } = await db.query("SELECT url FROM filters");
  return rows.map((row) => row.url);
}
/**
 * Empties every table. Deleting from page also cascades to its embeddings
 * (FK ON DELETE CASCADE), but each table is cleared explicitly anyway.
 */
export const nukeDb = async (db: PGliteWorker) => {
  console.log("Nuking database. Bye bye.")
  await db.query("DELETE FROM page")
  for (const statement of [
    `DELETE FROM embedding;`,
    `DELETE FROM search_results_cache;`,
    `DELETE FROM db;`,
  ]) {
    await db.exec(statement)
  }
}
/**
 * Replaces the search-results cache with the latest results: the old cache
 * rows are deleted and one row per (embedding id, similarity) pair is
 * inserted, all inside a single transaction. No-ops on empty input.
 */
export const storeSearchCache = async (db: PGliteWorker, searchResultIds: number[], similarities: number[], searchText: string) => {
  if (searchResultIds.length === 0 || searchText.length === 0) {
    return;
  }
  await db.transaction(async (tx) => {
    await tx.query("DELETE FROM search_results_cache;")
    for (const [_, embeddingId, similarity] of zip(searchResultIds, similarities)) {
      await tx.query(`INSERT INTO search_results_cache(embedding_id, search_text, similarity) VALUES ($1, $2, $3)`, [embeddingId, searchText, similarity])
    }
  });
}
/** Clears all rows from the search-results cache table. */
export const deleteStoreCache = async (db: PGliteWorker) => {
await db.exec(`DELETE FROM search_results_cache;`)
}
/**
 * Loads the cached search results joined to their embeddings and pages.
 * Returns the rows shaped like SearchResult plus the search text that
 * produced them (null when the cache is empty).
 */
export const getSearchResultsCache = async (db: PGliteWorker) => {
  const res = await db.query(`
    SELECT emb.id, emb.content, emb.page_id, page.url, src.similarity AS prob, emb.chunk_tag_id, src.search_text
    FROM search_results_cache AS src
    LEFT JOIN embedding emb ON emb.id = src.embedding_id
    LEFT JOIN page ON page.id = emb.page_id;
  `);
  const cache = res.rows.map(({ id, content, page_id, url, prob, chunk_tag_id }: any) => ({
    id,
    content,
    page_id,
    url,
    prob,
    chunk_tag_id
  }));
  const [firstRow] = res.rows;
  return {
    cache,
    searchText: firstRow ? firstRow.search_text : null
  }
}
// TODO: unimplemented stub — currently persists nothing.
export const saveModelType = async (db: PGliteWorker, modelType: string) => { }