Skip to content

Commit f74a888

Browse files
authored
Add database entrypoints (#11)
1 parent d90538b commit f74a888

13 files changed

+430
-157
lines changed

UI_GUIDE.md

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
## Guide for UI developers, no matter the platform.
2+
To interact with the rest of the system, exclusively utilize functions from `core/database/db_xxx`
3+
4+
For a simple application which only requests summaries and reads the results,<br>
5+
You have to use:
6+
* `db_add_completion_task` to create a task
7+
* `db_get_completions_by_page` to get the results
8+
9+
<br>
10+
Here is an example of what that would look like with `FastAPI`:
11+
12+
```py
13+
@app.post("/add_completion_task")
14+
def add_completion_task(prompt):
15+
db_add_completion_task(prompt)
16+
return {
17+
"status": "OK"
18+
}
19+
20+
21+
@app.get("/get_completions")
22+
def get_completions(page: int):
23+
completions = db_get_completions_by_page(page)
24+
return {
25+
"completions": completions
26+
}
27+
```
28+
29+
For a more complicated web page, which can also schedule crawls,
30+
you'll also make use of:
31+
* `db_add_crawl_task` to schedule a new crawl
32+
* `db_get_crawl_history_by_page` to see the crawls you scheduled, and their status
33+
34+
#### Important notes
35+
* Currently, there is no system present which would automatically populate
36+
the embeddings database after scheduling a completion task.
37+
This means that the UI has to ensure all the databases are appropriately populated.
38+
As a result, before requesting a summary, it's necessary to perform crawls
39+
to give our summaries enough context to work with.
40+
41+
* All db calls return lists of entire objects, unless it's specified otherwise.
42+
This is the default since we're prioritizing speed and minimal latency over
43+
security.

core/databases/README.md

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
## What's this?
2+
This directory contains all the getters and setters for all 5 global databases,
3+
with more potentially coming in later.
4+
5+
## Now vs Future
6+
Currently, these simple getters and setters utilize TinyDB,
7+
and effectively behave as singletons for all the detached, separate workers.
8+
9+
In the future, we'll want these functions to optionally call
10+
remote database providers, and act as a wrapper for these databases.
11+
12+
## Important notes
13+
All db calls return lists of entire objects, unless it's specified otherwise.
14+
This is the default since we're prioritizing speed and minimal latency over
15+
security or cleanliness, and these systems are not intended to be run publicly,
16+
but as a closed network.

core/databases/db_completion_tasks.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
from tinydb import Query
2+
3+
from core.databases import defaults
4+
from core.tools import utils
5+
from core.tools.utils import use_tinydb
6+
7+
db = use_tinydb("completion_tasks")
8+
9+
10+
def db_add_completion_task(prompt):
    """Queue a new completion task for the given prompt.

    :param prompt: the prompt text the completion should answer.
    :return: the UUID assigned to the newly created task.
    """
    task_uuid = utils.gen_uuid()
    created_at = utils.gen_unix_time()

    task_record = {
        "uuid": task_uuid,
        "prompt": prompt,
        "completed": False,
        "timestamp": created_at,
    }
    db.insert(task_record)

    return task_uuid
24+
25+
26+
def db_get_completion_tasks_by_page(page: int, per_page: int = defaults.ITEMS_PER_PAGE):
    """Return one page of completion tasks.

    Fix: the original ignored both pagination parameters and returned
    every task. TinyDB has no native pagination, so we fetch everything
    and slice in memory — fine at current data sizes; revisit once we
    move to SQLite or Cassandra.

    :param page: zero-based page index.
    :param per_page: number of tasks per page.
    :return: list of task documents for the requested page.
    """
    splice_start = page * per_page
    splice_end = splice_start + per_page

    return db.all()[splice_start:splice_end]
33+
34+
35+
def db_get_incomplete_completion_task():
    """Return a single not-yet-completed completion task, or None.

    Fix: the original used ``fields.completed is False``, which is an
    identity comparison on the Query fragment object (always False) and
    never built a valid TinyDB query; TinyDB queries must be built with
    the overloaded ``==`` operator.
    """
    fields = Query()

    # noqa: E712 — TinyDB requires `==`, not `is`, to build the query
    results = db.get(fields.completed == False)

    return results
41+
42+
43+
"""
44+
def db_add_smart_completion_task(prompt):
45+
# todo: this functions should automatically dispatch crawl tasks if they are needed
46+
new_uuid = utils.gen_uuid()
47+
timestamp = utils.gen_unix_time()
48+
49+
db.insert(
50+
{
51+
"uuid": new_uuid,
52+
"prompt": prompt,
53+
"complete": False,
54+
"timestamp": timestamp,
55+
}
56+
)
57+
58+
return new_uuid
59+
"""

core/databases/db_completions.py

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from tinydb import Query
2+
3+
from core.databases import defaults
4+
from core.tools import utils
5+
from core.tools.utils import use_tinydb
6+
7+
db = use_tinydb("completions")
8+
9+
# we have to use a document database with this one,
10+
# as completions will be large chunks of data of variable size
11+
12+
13+
def db_add_completion(text, prompt="N/A"):
    """Persist a completion (the model's response text).

    :param text: the generated response body.
    :param prompt: the prompt that produced it, "N/A" when unknown.
    :return: the UUID assigned to the stored completion.
    """
    completion_uuid = utils.gen_uuid()
    created_at = utils.gen_unix_time()

    record = {
        "uuid": completion_uuid,
        "prompt": prompt,
        "response": text,
        "timestamp": created_at,
    }
    db.insert(record)

    return completion_uuid
27+
28+
29+
def db_get_completions_by_date(start_date: int, end_date: int) -> list:
    """Return all completions with start_date < timestamp < end_date.

    Fix: the original used a chained comparison
    (``start_date < fields.timestamp < end_date``), which Python
    evaluates as two comparisons joined by ``and`` — that collapses the
    Query fragments to booleans instead of building a combined TinyDB
    query. Explicit ``&`` composition is required.

    :param start_date: exclusive lower bound, unix time.
    :param end_date: exclusive upper bound, unix time.
    """
    fields = Query()

    results = db.search(
        (fields.timestamp > start_date) & (fields.timestamp < end_date)
    )
    return results
34+
35+
36+
def db_get_completions_by_page(
    page: int, per_page: int = defaults.ITEMS_PER_PAGE
) -> list:
    """Return one page of stored completions.

    Fix: the original computed the splice bounds and then discarded
    them, returning every completion. TinyDB has no native pagination,
    so we fetch everything and slice in memory.

    :param page: zero-based page index.
    :param per_page: number of completions per page.
    """
    splice_start = page * per_page
    splice_end = splice_start + per_page

    return db.all()[splice_start:splice_end]

core/databases/db_crawl_history.py

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from core.databases import defaults
2+
from core.tools import utils
3+
from core.tools.utils import use_tinydb
4+
5+
db = use_tinydb("crawl_history")
6+
7+
# this db is completely optional, only used by the UI, and so its development can be delayed
8+
# most sensible solution here is to make items of the url_database point to entries of this database
9+
# even better, let's add a prompt field to each entry of the url_database, and count them here
10+
# this db will still be used to store the prompts, and their embeddings, so that the UI
11+
# will have an easy time comparing new prompts to historical ones
12+
13+
14+
def db_add_crawl_history(prompt: str) -> str:
    """Stub: record a new crawl-history entry for *prompt*.

    Persistence is not implemented yet; only generates and returns a
    fresh UUID for the would-be entry.
    """
    return utils.gen_uuid()
17+
18+
19+
def db_add_url_to_crawl_history(url: str, prompt: str) -> str:
    """Stub: attach *url* to the crawl history of *prompt*.

    Persistence is not implemented yet; only generates and returns a
    fresh UUID for the would-be entry.
    """
    return utils.gen_uuid()
22+
23+
24+
def db_get_similar_prompts(prompt: str) -> list:
    """Stub: return historical prompts similar to *prompt*.

    Intended for embedding-based comparison by the UI; not implemented
    yet, so this always returns an empty list.
    """
    similar: list = []
    return similar
26+
27+
28+
def db_get_crawl_history_by_page(page: int, per_page=defaults.ITEMS_PER_PAGE) -> list:
    """Stub: return one page of crawl-history entries.

    Not implemented yet; always returns an empty list regardless of the
    requested page.
    """
    return []

core/databases/db_crawl_tasks.py

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import os
2+
3+
from tinydb import Query
4+
5+
from core.tools import utils
6+
from core.tools.utils import use_tinydb
7+
8+
db = use_tinydb("crawl_tasks")
9+
10+
# we have to heartbeat our workers once we run out of tasks, websocks should suffice
11+
12+
13+
def db_add_crawl_task(prompt):
    """Queue a new crawl task for the given prompt.

    :param prompt: the prompt the crawl should gather context for.
    :return: the UUID assigned to the newly created task.
    """
    crawl_uuid = utils.gen_uuid()
    created_at = utils.gen_unix_time()

    task_record = {
        "uuid": crawl_uuid,
        "prompt": prompt,
        "completed": False,
        "executing": False,
        "completion_date": 0,  # unix time the crawl finished
        "execution_date": 0,  # unix time the crawl was picked up
        "timestamp": created_at,  # unix time the task was queued
    }
    db.insert(task_record)

    return crawl_uuid
30+
31+
32+
def db_set_crawl_completed(uuid: str):
    """Mark the crawl task identified by *uuid* as completed."""
    task = Query()
    db.update({"completed": True}, task.uuid == uuid)
35+
36+
37+
def db_get_crawl_task():
    """Return a single not-yet-completed crawl task, or None.

    Fix: the original used ``fields.completed is False``, an identity
    comparison on the Query fragment that never built a valid TinyDB
    query; the overloaded ``==`` operator is required.
    """
    fields = Query()

    # noqa: E712 — TinyDB requires `==`, not `is`, to build the query
    crawl_task = db.get(fields.completed == False)

    return crawl_task
42+
43+
44+
def db_get_incomplete_completion_task():
    """Claim one crawl task that is neither completed nor executing.

    Marks the claimed task as executing so other workers skip it, then
    returns the task document, or None when nothing is pending.

    Fixes vs. the original:
    * the query used ``is False`` / ``and``, which evaluate Python-side
      on the Query fragments instead of building a TinyDB query —
      ``== False`` joined with ``&`` is required;
    * ``task_uuid`` was assigned the whole document rather than its
      ``"uuid"`` field, so the "executing" flag was never actually set;
    * when no task matches, we now return None instead of updating with
      a bogus condition.
    """
    fields = Query()

    # noqa: E712 — TinyDB requires `==`, not `is`, to build the query
    task = db.get((fields.completed == False) & (fields.executing == False))
    if task is None:
        return None

    db.update({"executing": True}, fields.uuid == task["uuid"])

    return task

core/databases/db_embeddings.py

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from core.tools.utils import use_tinydb
2+
3+
db = use_tinydb("embeddings")
4+
5+
6+
# this global db has to actually be a set of multiple
7+
# separate dbs, each associated with its own embed model
8+
9+
# this file will be populated in a separate PR, along with an embedding server

core/databases/db_url_pool.py

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
from tinydb import Query
2+
3+
from core.tools import utils
4+
from core.tools.utils import use_tinydb
5+
6+
db = use_tinydb("url_pool")
7+
8+
# we have to heartbeat the workers once we run out of urls
9+
# i believe this db should remain local permanently
10+
# instead, we should have a separate global file db for embedder to use,
11+
# and a tiny global kv cache just to prevent duplicate urls
12+
13+
14+
def db_add_url(url: str, prompt: str, parent_uuid: str = None):
    """Insert a new URL record into the pool.

    :param url: the URL to track.
    :param prompt: the prompt that led to this URL being discovered.
    :param parent_uuid: UUID of the record this URL was found on, if any.
    :return: the full inserted document (not just its UUID).
    """
    record = {
        "uuid": utils.gen_uuid(),
        "parent_uuid": parent_uuid,
        "prompt": prompt,
        "url": url,
        "text": None,  # page text, filled in once downloaded
        "is_downloaded": False,
        "is_rubbish": False,
        "embedded_by": [],  # embedding models that have processed this URL
        "timestamp": utils.gen_unix_time(),
    }

    db.insert(record)

    return record
33+
34+
35+
def db_get_not_downloaded() -> list:
    """Return all URL records that are neither downloaded nor rubbish."""
    record = Query()
    pending_flags = {"is_downloaded": False, "is_rubbish": False}

    return db.search(record.fragment(pending_flags))
42+
43+
44+
def db_get_not_embedded(model: str) -> list:
    """Return all URL records not yet embedded by *model*.

    Fix: the original compared a Query fragment with ``is not True``,
    an identity test on the fragment object that is always truthy, so
    every record matched. TinyDB list-field membership is expressed
    with ``.any([...])`` and negated with ``~``.

    :param model: name of the embedding model to check for.
    """
    fields = Query()

    db_results = db.search(~fields.embedded_by.any([model]))

    return db_results
49+
50+
51+
def db_set_url_embedded(url_id: str, embedding_model: str):
    """Append *embedding_model* to the record's "embedded_by" list.

    Silently does nothing when *url_id* is unknown.
    """
    fields = Query()

    record = db.get(fields.uuid == url_id)
    if record is None:
        return

    updated_models = record["embedded_by"]
    updated_models.append(embedding_model)

    db.update({"embedded_by": updated_models}, fields.uuid == url_id)
61+
62+
63+
def db_set_url_downloaded(url_id: str, text: str):
    """Store the downloaded page *text* and flag the record as downloaded.

    Silently does nothing when *url_id* is unknown.
    """
    fields = Query()

    if db.get(fields.uuid == url_id) is None:
        return

    db.update({"is_downloaded": True, "text": text}, fields.uuid == url_id)
70+
71+
72+
def db_set_url_rubbish(url_id: str):
    """Flag the record identified by *url_id* as rubbish (not worth embedding).

    Silently does nothing when *url_id* is unknown.
    """
    fields = Query()

    if db.get(fields.uuid == url_id) is None:
        return

    db.update({"is_rubbish": True}, fields.uuid == url_id)
79+
80+
81+
def db_is_url_present(url: str):
    """Return True when *url* already exists in the pool, else False."""
    fields = Query()

    return db.get(fields.url == url) is not None

core/databases/defaults.py

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Default number of items returned per page by the paginated db getters.
ITEMS_PER_PAGE = 20
# Root directory for local TinyDB storage files.
DATA_PATH = "store/data/"

0 commit comments

Comments
 (0)