Skip to content

Commit f74a888

Browse files
authored
Add database entrypoints (#11)
1 parent d90538b commit f74a888

13 files changed

+430
-157
lines changed

UI_GUIDE.md

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
## Guide for UI developers, no matter the platform.
2+
To interact with the rest of the system, exclusively utilize functions from `core/database/db_xxx`
3+
4+
For a simple application which only requests summaries and reads the results,<br>
5+
You have to use:
6+
* `db_add_completion_task` to create a task
7+
* `db_get_completions_by_page` to get the results
8+
9+
<br>
10+
Here is an example of what that would look like with `FastAPI`:
11+
12+
```py
13+
@app.post("/add_completion_task")
14+
def add_completion_task(prompt):
15+
db_add_completion_task(prompt)
16+
return {
17+
"status": "OK"
18+
}
19+
20+
21+
@app.get("/get_completions")
22+
def get_completions(page: int):
23+
completions = db_get_completions_by_page(page)
24+
return {
25+
"completions": completions
26+
}
27+
```
28+
29+
For a more complicated web page, which can also schedule crawls,
30+
you'll also make use of:
31+
* `db_add_crawl_task` to schedule a new crawl
32+
* `db_get_crawl_history_by_page` to see the crawls you scheduled, and their status
33+
34+
#### Important notes
35+
* Currently, there is no system present which would automatically populate
36+
the embeddings database after scheduling a completion task.
37+
This means that the UI has to ensure all the databases are appropriately populated.
38+
As a result, before requesting a summary, it's necessary to perform crawls
39+
to give our summaries enough context to work with.
40+
41+
* All db calls return lists of entire objects, unless it's specified otherwise.
42+
This is the default since we're prioritizing speed and minimal latency over
43+
security.

core/databases/README.md

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
## What's this?
2+
This directory contains all the getters and setters for all 5 global databases,
3+
with more potentially coming in later.
4+
5+
## Now vs Future
6+
Currently, these simple getters and setters utilize TinyDB,
7+
and effectively behave as singletons for all the detached, separate workers.
8+
9+
In the future, we'll want these functions to optionally call
10+
remote database providers, and act as a wrapper for these databases.
11+
12+
## Important notes
13+
All db calls return lists of entire objects, unless it's specified otherwise.
14+
This is the default since we're prioritizing speed and minimal latency over
15+
security or cleanliness, and these systems are not intended to be run publicly,
16+
but as a closed network.

core/databases/db_completion_tasks.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
from tinydb import Query
2+
3+
from core.databases import defaults
4+
from core.tools import utils
5+
from core.tools.utils import use_tinydb
6+
7+
db = use_tinydb("completion_tasks")
8+
9+
10+
def db_add_completion_task(prompt):
    """Queue a new completion task for the given prompt.

    :param prompt: the prompt text the completion should answer.
    :return: the UUID assigned to the newly created task.
    """
    task_uuid = utils.gen_uuid()
    created_at = utils.gen_unix_time()

    task_record = {
        "uuid": task_uuid,
        "prompt": prompt,
        "completed": False,
        "timestamp": created_at,
    }
    db.insert(task_record)

    return task_uuid
24+
25+
26+
def db_get_completion_tasks_by_page(page: int, per_page: int = defaults.ITEMS_PER_PAGE):
    """Return one page of completion tasks.

    Fix: the original ignored both pagination parameters and returned
    every task. TinyDB has no native pagination, so we fetch everything
    and slice in memory — fine at current data sizes; revisit once we
    move to SQLite or Cassandra.

    :param page: zero-based page index.
    :param per_page: number of tasks per page.
    :return: list of task documents for the requested page.
    """
    splice_start = page * per_page
    splice_end = splice_start + per_page

    return db.all()[splice_start:splice_end]
33+
34+
35+
def db_get_incomplete_completion_task():
    """Return a single not-yet-completed completion task, or None.

    Fix: the original used ``fields.completed is False``, which is an
    identity comparison on the Query fragment object (always False) and
    never built a valid TinyDB query; TinyDB queries must be built with
    the overloaded ``==`` operator.
    """
    fields = Query()

    # noqa: E712 — TinyDB requires `==`, not `is`, to build the query
    results = db.get(fields.completed == False)

    return results
41+
42+
43+
"""
44+
def db_add_smart_completion_task(prompt):
45+
# todo: this functions should automatically dispatch crawl tasks if they are needed
46+
new_uuid = utils.gen_uuid()
47+
timestamp = utils.gen_unix_time()
48+
49+
db.insert(
50+
{
51+
"uuid": new_uuid,
52+
"prompt": prompt,
53+
"complete": False,
54+
"timestamp": timestamp,
55+
}
56+
)
57+
58+
return new_uuid
59+
"""

core/databases/db_completions.py

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from tinydb import Query
2+
3+
from core.databases import defaults
4+
from core.tools import utils
5+
from core.tools.utils import use_tinydb
6+
7+
db = use_tinydb("completions")
8+
9+
# we have to use a document database with this one,
10+
# as completions will be large chunks of data of variable size
11+
12+
13+
def db_add_completion(text, prompt="N/A"):
    """Persist a completion (the model's response text).

    :param text: the generated response body.
    :param prompt: the prompt that produced it, "N/A" when unknown.
    :return: the UUID assigned to the stored completion.
    """
    completion_uuid = utils.gen_uuid()
    created_at = utils.gen_unix_time()

    record = {
        "uuid": completion_uuid,
        "prompt": prompt,
        "response": text,
        "timestamp": created_at,
    }
    db.insert(record)

    return completion_uuid
27+
28+
29+
def db_get_completions_by_date(start_date: int, end_date: int) -> list:
    """Return all completions with start_date < timestamp < end_date.

    Fix: the original used a chained comparison
    (``start_date < fields.timestamp < end_date``), which Python
    evaluates as two comparisons joined by ``and`` — that collapses the
    Query fragments to booleans instead of building a combined TinyDB
    query. Explicit ``&`` composition is required.

    :param start_date: exclusive lower bound, unix time.
    :param end_date: exclusive upper bound, unix time.
    """
    fields = Query()

    results = db.search(
        (fields.timestamp > start_date) & (fields.timestamp < end_date)
    )
    return results
34+
35+
36+
def db_get_completions_by_page(
    page: int, per_page: int = defaults.ITEMS_PER_PAGE
) -> list:
    """Return one page of stored completions.

    Fix: the original computed the splice bounds and then discarded
    them, returning every completion. TinyDB has no native pagination,
    so we fetch everything and slice in memory.

    :param page: zero-based page index.
    :param per_page: number of completions per page.
    """
    splice_start = page * per_page
    splice_end = splice_start + per_page

    return db.all()[splice_start:splice_end]

core/databases/db_crawl_history.py

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from core.databases import defaults
2+
from core.tools import utils
3+
from core.tools.utils import use_tinydb
4+
5+
db = use_tinydb("crawl_history")
6+
7+
# this db is completely optional, only used by the UI, and so its development can be delayed
8+
# most sensible solution here is to make items of the url_database point to entries of this database
9+
# even better, let's add a prompt field to each entry of the url_database, and count them here
10+
# this db will still be used to store the prompts, and their embeddings, so that the UI
11+
# will have an easy time comparing new prompts to historical ones
12+
13+
14+
def db_add_crawl_history(prompt: str) -> str:
    """Stub: record a new crawl-history entry for *prompt*.

    Persistence is not implemented yet; only generates and returns a
    fresh UUID for the would-be entry.
    """
    return utils.gen_uuid()
17+
18+
19+
def db_add_url_to_crawl_history(url: str, prompt: str) -> str:
    """Stub: attach *url* to the crawl history of *prompt*.

    Persistence is not implemented yet; only generates and returns a
    fresh UUID for the would-be entry.
    """
    return utils.gen_uuid()
22+
23+
24+
def db_get_similar_prompts(prompt: str) -> list:
    """Stub: return historical prompts similar to *prompt*.

    Intended for embedding-based comparison by the UI; not implemented
    yet, so this always returns an empty list.
    """
    similar: list = []
    return similar
26+
27+
28+
def db_get_crawl_history_by_page(page: int, per_page=defaults.ITEMS_PER_PAGE) -> list:
    """Stub: return one page of crawl-history entries.

    Not implemented yet; always returns an empty list regardless of the
    requested page.
    """
    return []

core/databases/db_crawl_tasks.py

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import os
2+
3+
from tinydb import Query
4+
5+
from core.tools import utils
6+
from core.tools.utils import use_tinydb
7+
8+
db = use_tinydb("crawl_tasks")
9+
10+
# we have to heartbeat our workers once we run out of tasks, websocks should suffice
11+
12+
13+
def db_add_crawl_task(prompt):
    """Queue a new crawl task for the given prompt.

    :param prompt: the prompt the crawl should gather context for.
    :return: the UUID assigned to the newly created task.
    """
    crawl_uuid = utils.gen_uuid()
    created_at = utils.gen_unix_time()

    task_record = {
        "uuid": crawl_uuid,
        "prompt": prompt,
        "completed": False,
        "executing": False,
        "completion_date": 0,  # unix time the crawl finished
        "execution_date": 0,  # unix time the crawl was picked up
        "timestamp": created_at,  # unix time the task was queued
    }
    db.insert(task_record)

    return crawl_uuid
30+
31+
32+
def db_set_crawl_completed(uuid: str):
    """Mark the crawl task identified by *uuid* as completed."""
    task = Query()
    db.update({"completed": True}, task.uuid == uuid)
35+
36+
37+
def db_get_crawl_task():
    """Return a single not-yet-completed crawl task, or None.

    Fix: the original used ``fields.completed is False``, an identity
    comparison on the Query fragment that never built a valid TinyDB
    query; the overloaded ``==`` operator is required.
    """
    fields = Query()

    # noqa: E712 — TinyDB requires `==`, not `is`, to build the query
    crawl_task = db.get(fields.completed == False)

    return crawl_task
42+
43+
44+
def db_get_incomplete_completion_task():
    """Claim one crawl task that is neither completed nor executing.

    Marks the claimed task as executing so other workers skip it, then
    returns the task document, or None when nothing is pending.

    Fixes vs. the original:
    * the query used ``is False`` / ``and``, which evaluate Python-side
      on the Query fragments instead of building a TinyDB query —
      ``== False`` joined with ``&`` is required;
    * ``task_uuid`` was assigned the whole document rather than its
      ``"uuid"`` field, so the "executing" flag was never actually set;
    * when no task matches, we now return None instead of updating with
      a bogus condition.
    """
    fields = Query()

    # noqa: E712 — TinyDB requires `==`, not `is`, to build the query
    task = db.get((fields.completed == False) & (fields.executing == False))
    if task is None:
        return None

    db.update({"executing": True}, fields.uuid == task["uuid"])

    return task

core/databases/db_embeddings.py

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from core.tools.utils import use_tinydb
2+
3+
db = use_tinydb("embeddings")
4+
5+
6+
# this global db has to actually be a set of multiple
7+
# separate dbs, each associated with its own embed model
8+
9+
# this file will be populated in a separate PR, along with an embedding server

core/databases/db_url_pool.py

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
from tinydb import Query
2+
3+
from core.tools import utils
4+
from core.tools.utils import use_tinydb
5+
6+
db = use_tinydb("url_pool")
7+
8+
# we have to heartbeat the workers once we run out of urls
9+
# i believe this db should remain local permanently
10+
# instead, we should have a separate global file db for embedder to use,
11+
# and a tiny global kv cache just to prevent duplicate urls
12+
13+
14+
def db_add_url(url: str, prompt: str, parent_uuid: str = None):
    """Insert a new URL record into the pool.

    :param url: the URL to track.
    :param prompt: the prompt that led to this URL being discovered.
    :param parent_uuid: UUID of the record this URL was found on, if any.
    :return: the full inserted document (not just its UUID).
    """
    record = {
        "uuid": utils.gen_uuid(),
        "parent_uuid": parent_uuid,
        "prompt": prompt,
        "url": url,
        "text": None,  # page text, filled in once downloaded
        "is_downloaded": False,
        "is_rubbish": False,
        "embedded_by": [],  # embedding models that have processed this URL
        "timestamp": utils.gen_unix_time(),
    }

    db.insert(record)

    return record
33+
34+
35+
def db_get_not_downloaded() -> list:
    """Return all URL records that are neither downloaded nor rubbish."""
    record = Query()
    pending_flags = {"is_downloaded": False, "is_rubbish": False}

    return db.search(record.fragment(pending_flags))
42+
43+
44+
def db_get_not_embedded(model: str) -> list:
    """Return all URL records not yet embedded by *model*.

    Fix: the original compared a Query fragment with ``is not True``,
    an identity test on the fragment object that is always truthy, so
    every record matched. TinyDB list-field membership is expressed
    with ``.any([...])`` and negated with ``~``.

    :param model: name of the embedding model to check for.
    """
    fields = Query()

    db_results = db.search(~fields.embedded_by.any([model]))

    return db_results
49+
50+
51+
def db_set_url_embedded(url_id: str, embedding_model: str):
    """Append *embedding_model* to the record's "embedded_by" list.

    Silently does nothing when *url_id* is unknown.
    """
    fields = Query()

    record = db.get(fields.uuid == url_id)
    if record is None:
        return

    updated_models = record["embedded_by"]
    updated_models.append(embedding_model)

    db.update({"embedded_by": updated_models}, fields.uuid == url_id)
61+
62+
63+
def db_set_url_downloaded(url_id: str, text: str):
    """Store the downloaded page *text* and flag the record as downloaded.

    Silently does nothing when *url_id* is unknown.
    """
    fields = Query()

    if db.get(fields.uuid == url_id) is None:
        return

    db.update({"is_downloaded": True, "text": text}, fields.uuid == url_id)
70+
71+
72+
def db_set_url_rubbish(url_id: str):
    """Flag the record identified by *url_id* as rubbish (not worth embedding).

    Silently does nothing when *url_id* is unknown.
    """
    fields = Query()

    if db.get(fields.uuid == url_id) is None:
        return

    db.update({"is_rubbish": True}, fields.uuid == url_id)
79+
80+
81+
def db_is_url_present(url: str):
    """Return True when *url* already exists in the pool, else False."""
    fields = Query()

    return db.get(fields.url == url) is not None

core/databases/defaults.py

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Default number of items returned per page by the paginated db getters.
ITEMS_PER_PAGE = 20
# Root directory for local TinyDB storage files.
DATA_PATH = "store/data/"

0 commit comments

Comments
 (0)