|
| 1 | +from tinydb import Query |
| 2 | + |
| 3 | +from core.tools import utils |
| 4 | +from core.tools.utils import use_tinydb |
| 5 | + |
# Module-level handle that every db_* helper in this module calls
# insert/search/get/update on.
# NOTE(review): presumably use_tinydb opens (or creates) the TinyDB store named
# "url_pool" -- confirm against core.tools.utils.
db = use_tinydb("url_pool")
| 7 | + |
# We have to heartbeat the workers once we run out of URLs.
# I believe this db should remain local permanently.
# Instead, we should have a separate global file db for the embedder to use,
# and a tiny global KV cache just to prevent duplicate URLs.
| 12 | + |
| 13 | + |
def db_add_url(url: str, prompt: str, parent_uuid: str = None):
    """Insert a fresh, not-yet-processed URL record into the pool and return it.

    Args:
        url: Address to store in the record.
        prompt: Prompt stored alongside the URL.
        parent_uuid: Value stored under ``parent_uuid``; ``None`` when omitted.

    Returns:
        The dict that was handed to ``db.insert``.
    """
    entry = {
        "uuid": utils.gen_uuid(),
        "parent_uuid": parent_uuid,
        "prompt": prompt,
        "url": url,
        # Processing state: nothing fetched, flagged, or embedded yet.
        "text": None,
        "is_downloaded": False,
        "is_rubbish": False,
        "embedded_by": [],
        "timestamp": utils.gen_unix_time(),
    }
    db.insert(entry)
    return entry
| 33 | + |
| 34 | + |
def db_get_not_downloaded() -> list:
    """Return records that are neither downloaded nor marked as rubbish."""
    pending_state = {"is_downloaded": False, "is_rubbish": False}
    # fragment() matches documents that contain all of these key/value pairs.
    return db.search(Query().fragment(pending_state))
| 42 | + |
| 43 | + |
def db_get_not_embedded(model: str) -> list:
    """Return records whose ``embedded_by`` list does not include ``model``.

    Args:
        model: Name of the embedding model to check for.

    Returns:
        The list of matching records from ``db.search``.
    """
    fields = Query()
    # Negation has to be expressed with TinyDB's `~` operator. Python's
    # `is not True` compares object identity immediately and would hand
    # db.search a plain bool instead of a query.
    # `any([model])` matches when the stored list shares an element with [model].
    already_embedded = fields.embedded_by.any([model])
    return db.search(~already_embedded)
| 49 | + |
| 50 | + |
def db_set_url_embedded(url_id: str, embedding_model: str):
    """Record that ``embedding_model`` has embedded the URL with uuid ``url_id``.

    Does nothing when no record has that uuid, or when the model is already
    listed, so repeated calls do not pile up duplicate entries.

    Args:
        url_id: Value of the record's ``uuid`` field.
        embedding_model: Model name to add to the record's ``embedded_by`` list.
    """
    matches_uuid = Query().uuid == url_id
    record = db.get(matches_uuid)
    if record is None:
        return

    # Work on a copy so the fetched record is not mutated before the update.
    embedded_by = list(record["embedded_by"])
    if embedding_model in embedded_by:
        return
    embedded_by.append(embedding_model)

    db.update({"embedded_by": embedded_by}, matches_uuid)
| 61 | + |
| 62 | + |
def db_set_url_downloaded(url_id: str, text: str):
    """Store ``text`` on the record with uuid ``url_id`` and flag it as downloaded.

    Does nothing when no record has that uuid.
    """
    matches_uuid = Query().uuid == url_id
    if db.get(matches_uuid) is not None:
        db.update({"is_downloaded": True, "text": text}, matches_uuid)
| 70 | + |
| 71 | + |
def db_set_url_rubbish(url_id: str):
    """Flag the record with uuid ``url_id`` as rubbish.

    Does nothing when no record has that uuid.
    """
    matches_uuid = Query().uuid == url_id
    found = db.get(matches_uuid)
    if found is not None:
        db.update({"is_rubbish": True}, matches_uuid)
| 79 | + |
| 80 | + |
def db_is_url_present(url: str):
    """Return True when some record's ``url`` field equals ``url``, else False."""
    same_url = Query().url == url
    match = db.get(same_url)
    if match is None:
        return False
    return True
0 commit comments