diff --git a/.env-template b/.env-template
index 29c42e81b..30f0f9ff9 100644
--- a/.env-template
+++ b/.env-template
@@ -86,13 +86,6 @@ OIDC_WELL_KNOWN_ENDPOINT=
 SCIM_USER=
 SCIM_PASSWORD=
 
-# Compass
-
-COHERE_COMPASS_USERNAME=
-COHERE_COMPASS_PASSWORD=
-COHERE_COMPASS_API_URL=
-COHERE_COMPASS_PARSER_URL=
-
 # Google Drive
 
 GOOGLE_DRIVE_CLIENT_ID=
diff --git a/.gitignore b/.gitignore
index b8c78700a..db94c4ae5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -95,10 +95,6 @@ ipython_config.py
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 __pypackages__/
 
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
 # SageMath parsed files
 *.sage.py
 
diff --git a/Makefile b/Makefile
index 47379b13b..6540320fb 100644
--- a/Makefile
+++ b/Makefile
@@ -123,11 +123,3 @@ test-db:
 	docker compose stop test_db
 	docker compose rm -f test_db
 	docker compose up test_db -d
-
-.PHONY: dev-sync
-dev-sync:
-	@docker compose up --build sync_worker sync_publisher flower -d
-
-.PHONY: dev-sync-down
-dev-sync-down:
-	@docker compose down sync_worker sync_publisher flower
diff --git a/docker-compose.yml b/docker-compose.yml
index 68e22f243..954ff6310 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -85,59 +85,6 @@ services:
     networks:
       - proxynet
 
-  sync_worker:
-    build:
-      context: .
-      args:
-        DOCKER_BUILDKIT: 1
-      dockerfile: ./src/backend/Dockerfile
-    command: ["./src/backend/services/sync/executor.sh", "${ENVIRONMENT}"]
-    restart: unless-stopped
-    profiles:
-      - sync
-    develop:
-      watch:
-        - action: sync
-          path: ./src/backend
-          target: /workspace/src/backend
-          ignore:
-            - __pycache__/
-            - alembic/
-            - data/
-    stdin_open: true
-    tty: true
-    depends_on:
-      - db
-      - redis
-    networks:
-      - proxynet
-
-  sync_publisher:
-    build:
-      context: .
-      args:
-        DOCKER_BUILDKIT: 1
-      dockerfile: ./src/backend/services/sync/Dockerfile
-    restart: unless-stopped
-    profiles:
-      - sync
-    develop:
-      watch:
-        - action: sync
-          path: ./src/backend
-          target: /workspace/src/backend
-          ignore:
-            - __pycache__/
-            - alembic/
-            - data/
-    stdin_open: true
-    tty: true
-    depends_on:
-      - db
-      - redis
-    networks:
-      - proxynet
-
   frontend:
     build:
       target: ${BUILD_TARGET:-prod}
@@ -172,18 +119,6 @@ services:
     networks:
       - proxynet
 
-  flower:
-    image: mher/flower
-    profiles:
-      - sync
-    environment:
-      - CELERY_BROKER_URL=redis://:redis@redis:6379
-      - FLOWER_PORT=5555
-    ports:
-      - 5555:5555
-    networks:
-      - proxynet
-
 volumes:
   db:
     name: cohere_toolkit_db
diff --git a/poetry.lock b/poetry.lock
index 1c815e864..2f7b24490 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -371,17 +371,6 @@ charset-normalizer = ["charset-normalizer"]
 html5lib = ["html5lib"]
 lxml = ["lxml"]
 
-[[package]]
-name = "billiard"
-version = "4.2.0"
-description = "Python multiprocessing fork with improvements and bugfixes"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "billiard-4.2.0-py3-none-any.whl", hash = "sha256:07aa978b308f334ff8282bd4a746e681b3513db5c9a514cbdd810cbbdc19714d"},
-    {file = "billiard-4.2.0.tar.gz", hash = "sha256:9a3c3184cb275aa17a732f93f65b20c525d3d9f253722d26a82194803ade5a2c"},
-]
-
 [[package]]
 name = "blessed"
 version = "1.20.0"
@@ -470,63 +459,6 @@ files = [
     {file = "cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"},
 ]
 
-[[package]]
-name = "celery"
-version = "5.4.0"
-description = "Distributed Task Queue."
-optional = false -python-versions = ">=3.8" -files = [ - {file = "celery-5.4.0-py3-none-any.whl", hash = "sha256:369631eb580cf8c51a82721ec538684994f8277637edde2dfc0dacd73ed97f64"}, - {file = "celery-5.4.0.tar.gz", hash = "sha256:504a19140e8d3029d5acad88330c541d4c3f64c789d85f94756762d8bca7e706"}, -] - -[package.dependencies] -billiard = ">=4.2.0,<5.0" -click = ">=8.1.2,<9.0" -click-didyoumean = ">=0.3.0" -click-plugins = ">=1.1.1" -click-repl = ">=0.2.0" -gevent = {version = ">=1.5.0", optional = true, markers = "extra == \"gevent\""} -kombu = ">=5.3.4,<6.0" -python-dateutil = ">=2.8.2" -tzdata = ">=2022.7" -vine = ">=5.1.0,<6.0" - -[package.extras] -arangodb = ["pyArango (>=2.0.2)"] -auth = ["cryptography (==42.0.5)"] -azureblockblob = ["azure-storage-blob (>=12.15.0)"] -brotli = ["brotli (>=1.0.0)", "brotlipy (>=0.7.0)"] -cassandra = ["cassandra-driver (>=3.25.0,<4)"] -consul = ["python-consul2 (==0.1.5)"] -cosmosdbsql = ["pydocumentdb (==2.3.5)"] -couchbase = ["couchbase (>=3.0.0)"] -couchdb = ["pycouchdb (==1.14.2)"] -django = ["Django (>=2.2.28)"] -dynamodb = ["boto3 (>=1.26.143)"] -elasticsearch = ["elastic-transport (<=8.13.0)", "elasticsearch (<=8.13.0)"] -eventlet = ["eventlet (>=0.32.0)"] -gcs = ["google-cloud-storage (>=2.10.0)"] -gevent = ["gevent (>=1.5.0)"] -librabbitmq = ["librabbitmq (>=2.0.0)"] -memcache = ["pylibmc (==1.6.3)"] -mongodb = ["pymongo[srv] (>=4.0.2)"] -msgpack = ["msgpack (==1.0.8)"] -pymemcache = ["python-memcached (>=1.61)"] -pyro = ["pyro4 (==4.82)"] -pytest = ["pytest-celery[all] (>=1.0.0)"] -redis = ["redis (>=4.5.2,!=4.5.5,<6.0.0)"] -s3 = ["boto3 (>=1.26.143)"] -slmq = ["softlayer-messaging (>=1.0.3)"] -solar = ["ephem (==4.1.5)"] -sqlalchemy = ["sqlalchemy (>=1.4.48,<2.1)"] -sqs = ["boto3 (>=1.26.143)", "kombu[sqs] (>=5.3.4)", "pycurl (>=7.43.0.5)", "urllib3 (>=1.26.16)"] -tblib = ["tblib (>=1.3.0)", "tblib (>=1.5.0)"] -yaml = ["PyYAML (>=3.10)"] -zookeeper = ["kazoo (>=1.3.1)"] -zstd = ["zstandard (==0.22.0)"] - [[package]] name = "certifi" version = "2024.7.4" @@ -819,55 +751,6 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} -[[package]] -name = "click-didyoumean" -version = "0.3.1" -description = "Enables git-like *did-you-mean* feature in click" -optional = false -python-versions = ">=3.6.2" -files = [ - {file = "click_didyoumean-0.3.1-py3-none-any.whl", hash = "sha256:5c4bb6007cfea5f2fd6583a2fb6701a22a41eb98957e63d0fac41c10e7c3117c"}, - {file = "click_didyoumean-0.3.1.tar.gz", hash = "sha256:4f82fdff0dbe64ef8ab2279bd6aa3f6a99c3b28c05aa09cbfc07c9d7fbb5a463"}, -] - -[package.dependencies] -click = ">=7" - -[[package]] -name = "click-plugins" -version = "1.1.1" -description = "An extension module for click to enable registering CLI commands via setuptools entry-points." 
-optional = false -python-versions = "*" -files = [ - {file = "click-plugins-1.1.1.tar.gz", hash = "sha256:46ab999744a9d831159c3411bb0c79346d94a444df9a3a3742e9ed63645f264b"}, - {file = "click_plugins-1.1.1-py2.py3-none-any.whl", hash = "sha256:5d262006d3222f5057fd81e1623d4443e41dcda5dc815c06b442aa3c02889fc8"}, -] - -[package.dependencies] -click = ">=4.0" - -[package.extras] -dev = ["coveralls", "pytest (>=3.6)", "pytest-cov", "wheel"] - -[[package]] -name = "click-repl" -version = "0.3.0" -description = "REPL plugin for Click" -optional = false -python-versions = ">=3.6" -files = [ - {file = "click-repl-0.3.0.tar.gz", hash = "sha256:17849c23dba3d667247dc4defe1757fff98694e90fe37474f3feebb69ced26a9"}, - {file = "click_repl-0.3.0-py3-none-any.whl", hash = "sha256:fb7e06deb8da8de86180a33a9da97ac316751c094c6899382da7feeeeb51b812"}, -] - -[package.dependencies] -click = ">=7.0" -prompt-toolkit = ">=3.0.36" - -[package.extras] -testing = ["pytest (>=7.2.1)", "pytest-cov (>=4.0.0)", "tox (>=4.4.3)"] - [[package]] name = "cohere" version = "5.8.1" @@ -1428,69 +1311,6 @@ smb = ["smbprotocol"] ssh = ["paramiko"] tqdm = ["tqdm"] -[[package]] -name = "gevent" -version = "24.2.1" -description = "Coroutine-based network library" -optional = false -python-versions = ">=3.8" -files = [ - {file = "gevent-24.2.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:6f947a9abc1a129858391b3d9334c45041c08a0f23d14333d5b844b6e5c17a07"}, - {file = "gevent-24.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bde283313daf0b34a8d1bab30325f5cb0f4e11b5869dbe5bc61f8fe09a8f66f3"}, - {file = "gevent-24.2.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5a1df555431f5cd5cc189a6ee3544d24f8c52f2529134685f1e878c4972ab026"}, - {file = "gevent-24.2.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:14532a67f7cb29fb055a0e9b39f16b88ed22c66b96641df8c04bdc38c26b9ea5"}, - {file = "gevent-24.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd23df885318391856415e20acfd51a985cba6919f0be78ed89f5db9ff3a31cb"}, - {file = "gevent-24.2.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ca80b121bbec76d7794fcb45e65a7eca660a76cc1a104ed439cdbd7df5f0b060"}, - {file = "gevent-24.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b9913c45d1be52d7a5db0c63977eebb51f68a2d5e6fd922d1d9b5e5fd758cc98"}, - {file = "gevent-24.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:918cdf8751b24986f915d743225ad6b702f83e1106e08a63b736e3a4c6ead789"}, - {file = "gevent-24.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:3d5325ccfadfd3dcf72ff88a92fb8fc0b56cacc7225f0f4b6dcf186c1a6eeabc"}, - {file = "gevent-24.2.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:03aa5879acd6b7076f6a2a307410fb1e0d288b84b03cdfd8c74db8b4bc882fc5"}, - {file = "gevent-24.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8bb35ce57a63c9a6896c71a285818a3922d8ca05d150fd1fe49a7f57287b836"}, - {file = "gevent-24.2.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d7f87c2c02e03d99b95cfa6f7a776409083a9e4d468912e18c7680437b29222c"}, - {file = "gevent-24.2.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:968581d1717bbcf170758580f5f97a2925854943c45a19be4d47299507db2eb7"}, - {file = "gevent-24.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7899a38d0ae7e817e99adb217f586d0a4620e315e4de577444ebeeed2c5729be"}, - {file = 
"gevent-24.2.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f5e8e8d60e18d5f7fd49983f0c4696deeddaf6e608fbab33397671e2fcc6cc91"}, - {file = "gevent-24.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fbfdce91239fe306772faab57597186710d5699213f4df099d1612da7320d682"}, - {file = "gevent-24.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cdf66977a976d6a3cfb006afdf825d1482f84f7b81179db33941f2fc9673bb1d"}, - {file = "gevent-24.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:1dffb395e500613e0452b9503153f8f7ba587c67dd4a85fc7cd7aa7430cb02cc"}, - {file = "gevent-24.2.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:6c47ae7d1174617b3509f5d884935e788f325eb8f1a7efc95d295c68d83cce40"}, - {file = "gevent-24.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7cac622e11b4253ac4536a654fe221249065d9a69feb6cdcd4d9af3503602e0"}, - {file = "gevent-24.2.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf5b9c72b884c6f0c4ed26ef204ee1f768b9437330422492c319470954bc4cc7"}, - {file = "gevent-24.2.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f5de3c676e57177b38857f6e3cdfbe8f38d1cd754b63200c0615eaa31f514b4f"}, - {file = "gevent-24.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4faf846ed132fd7ebfbbf4fde588a62d21faa0faa06e6f468b7faa6f436b661"}, - {file = "gevent-24.2.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:368a277bd9278ddb0fde308e6a43f544222d76ed0c4166e0d9f6b036586819d9"}, - {file = "gevent-24.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f8a04cf0c5b7139bc6368b461257d4a757ea2fe89b3773e494d235b7dd51119f"}, - {file = "gevent-24.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9d8d0642c63d453179058abc4143e30718b19a85cbf58c2744c9a63f06a1d388"}, - {file = "gevent-24.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:94138682e68ec197db42ad7442d3cf9b328069c3ad8e4e5022e6b5cd3e7ffae5"}, - {file = "gevent-24.2.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:8f4b8e777d39013595a7740b4463e61b1cfe5f462f1b609b28fbc1e4c4ff01e5"}, - {file = "gevent-24.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:141a2b24ad14f7b9576965c0c84927fc85f824a9bb19f6ec1e61e845d87c9cd8"}, - {file = "gevent-24.2.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:9202f22ef811053077d01f43cc02b4aaf4472792f9fd0f5081b0b05c926cca19"}, - {file = "gevent-24.2.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2955eea9c44c842c626feebf4459c42ce168685aa99594e049d03bedf53c2800"}, - {file = "gevent-24.2.1-cp38-cp38-win32.whl", hash = "sha256:44098038d5e2749b0784aabb27f1fcbb3f43edebedf64d0af0d26955611be8d6"}, - {file = "gevent-24.2.1-cp38-cp38-win_amd64.whl", hash = "sha256:117e5837bc74a1673605fb53f8bfe22feb6e5afa411f524c835b2ddf768db0de"}, - {file = "gevent-24.2.1-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:2ae3a25ecce0a5b0cd0808ab716bfca180230112bb4bc89b46ae0061d62d4afe"}, - {file = "gevent-24.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7ceb59986456ce851160867ce4929edaffbd2f069ae25717150199f8e1548b8"}, - {file = "gevent-24.2.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:2e9ac06f225b696cdedbb22f9e805e2dd87bf82e8fa5e17756f94e88a9d37cf7"}, - {file = "gevent-24.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:90cbac1ec05b305a1b90ede61ef73126afdeb5a804ae04480d6da12c56378df1"}, - {file = "gevent-24.2.1-cp39-cp39-win32.whl", hash = 
"sha256:782a771424fe74bc7e75c228a1da671578c2ba4ddb2ca09b8f959abdf787331e"}, - {file = "gevent-24.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:3adfb96637f44010be8abd1b5e73b5070f851b817a0b182e601202f20fa06533"}, - {file = "gevent-24.2.1-pp310-pypy310_pp73-macosx_11_0_universal2.whl", hash = "sha256:7b00f8c9065de3ad226f7979154a7b27f3b9151c8055c162332369262fc025d8"}, - {file = "gevent-24.2.1.tar.gz", hash = "sha256:432fc76f680acf7cf188c2ee0f5d3ab73b63c1f03114c7cd8a34cebbe5aa2056"}, -] - -[package.dependencies] -cffi = {version = ">=1.12.2", markers = "platform_python_implementation == \"CPython\" and sys_platform == \"win32\""} -greenlet = {version = ">=3.0rc3", markers = "platform_python_implementation == \"CPython\" and python_version >= \"3.11\""} -"zope.event" = "*" -"zope.interface" = "*" - -[package.extras] -dnspython = ["dnspython (>=1.16.0,<2.0)", "idna"] -docs = ["furo", "repoze.sphinx.autointerface", "sphinx", "sphinxcontrib-programoutput", "zope.schema"] -monitor = ["psutil (>=5.7.0)"] -recommended = ["cffi (>=1.12.2)", "dnspython (>=1.16.0,<2.0)", "idna", "psutil (>=5.7.0)"] -test = ["cffi (>=1.12.2)", "coverage (>=5.0)", "dnspython (>=1.16.0,<2.0)", "idna", "objgraph", "psutil (>=5.7.0)", "requests"] - [[package]] name = "google-api-core" version = "2.19.1" @@ -4214,20 +4034,6 @@ nodeenv = ">=0.11.1" pyyaml = ">=5.1" virtualenv = ">=20.10.0" -[[package]] -name = "prompt-toolkit" -version = "3.0.47" -description = "Library for building powerful interactive command lines in Python" -optional = false -python-versions = ">=3.7.0" -files = [ - {file = "prompt_toolkit-3.0.47-py3-none-any.whl", hash = "sha256:0d7bfa67001d5e39d02c224b663abc33687405033a8c422d0d675a5a13361d10"}, - {file = "prompt_toolkit-3.0.47.tar.gz", hash = "sha256:1e1b29cb58080b1e69f207c893a1a7bf16d127a5c30c9d17a25a5d77792e5360"}, -] - -[package.dependencies] -wcwidth = "*" - [[package]] name = "proto-plus" version = "1.24.0" @@ -6727,76 +6533,7 @@ files = [ doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] -[[package]] -name = "zope-event" -version = "5.0" -description = "Very basic event publishing system" -optional = false -python-versions = ">=3.7" -files = [ - {file = "zope.event-5.0-py3-none-any.whl", hash = "sha256:2832e95014f4db26c47a13fdaef84cef2f4df37e66b59d8f1f4a8f319a632c26"}, - {file = "zope.event-5.0.tar.gz", hash = "sha256:bac440d8d9891b4068e2b5a2c5e2c9765a9df762944bda6955f96bb9b91e67cd"}, -] - -[package.dependencies] -setuptools = "*" - -[package.extras] -docs = ["Sphinx"] -test = ["zope.testrunner"] - -[[package]] -name = "zope-interface" -version = "7.0.1" -description = "Interfaces for Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "zope.interface-7.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ec4e87e6fdc511a535254daa122c20e11959ce043b4e3425494b237692a34f1c"}, - {file = "zope.interface-7.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:51d5713e8e38f2d3ec26e0dfdca398ed0c20abda2eb49ffc15a15a23eb8e5f6d"}, - {file = "zope.interface-7.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea8d51e5eb29e57d34744369cd08267637aa5a0fefc9b5d33775ab7ff2ebf2e3"}, - {file = 
"zope.interface-7.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:55bbcc74dc0c7ab489c315c28b61d7a1d03cf938cc99cc58092eb065f120c3a5"}, - {file = "zope.interface-7.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10ebac566dd0cec66f942dc759d46a994a2b3ba7179420f0e2130f88f8a5f400"}, - {file = "zope.interface-7.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:7039e624bcb820f77cc2ff3d1adcce531932990eee16121077eb51d9c76b6c14"}, - {file = "zope.interface-7.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:03bd5c0db82237bbc47833a8b25f1cc090646e212f86b601903d79d7e6b37031"}, - {file = "zope.interface-7.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3f52050c6a10d4a039ec6f2c58e5b3ade5cc570d16cf9d102711e6b8413c90e6"}, - {file = "zope.interface-7.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af0b33f04677b57843d529b9257a475d2865403300b48c67654c40abac2f9f24"}, - {file = "zope.interface-7.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:696c2a381fc7876b3056711717dba5eddd07c2c9e5ccd50da54029a1293b6e43"}, - {file = "zope.interface-7.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f89a420cf5a6f2aa7849dd59e1ff0e477f562d97cf8d6a1ee03461e1eec39887"}, - {file = "zope.interface-7.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:b59deb0ddc7b431e41d720c00f99d68b52cb9bd1d5605a085dc18f502fe9c47f"}, - {file = "zope.interface-7.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:52f5253cca1b35eaeefa51abd366b87f48f8714097c99b131ba61f3fdbbb58e7"}, - {file = "zope.interface-7.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:88d108d004e0df25224de77ce349a7e73494ea2cb194031f7c9687e68a88ec9b"}, - {file = "zope.interface-7.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c203d82069ba31e1f3bc7ba530b2461ec86366cd4bfc9b95ec6ce58b1b559c34"}, - {file = "zope.interface-7.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f3495462bc0438b76536a0e10d765b168ae636092082531b88340dc40dcd118"}, - {file = "zope.interface-7.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:192b7a792e3145ed880ff6b1a206fdb783697cfdb4915083bfca7065ec845e60"}, - {file = "zope.interface-7.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:400d06c9ec8dbcc96f56e79376297e7be07a315605c9a2208720da263d44d76f"}, - {file = "zope.interface-7.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c1dff87b30fd150c61367d0e2cdc49bb55f8b9fd2a303560bbc24b951573ae1"}, - {file = "zope.interface-7.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f749ca804648d00eda62fe1098f229b082dfca930d8bad8386e572a6eafa7525"}, - {file = "zope.interface-7.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ec212037becf6d2f705b7ed4538d56980b1e7bba237df0d8995cbbed29961dc"}, - {file = "zope.interface-7.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d33cb526efdc235a2531433fc1287fcb80d807d5b401f9b801b78bf22df560dd"}, - {file = "zope.interface-7.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b419f2144e1762ab845f20316f1df36b15431f2622ebae8a6d5f7e8e712b413c"}, - {file 
= "zope.interface-7.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03f1452d5d1f279184d5bdb663a3dc39902d9320eceb63276240791e849054b6"}, - {file = "zope.interface-7.0.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ba4b3638d014918b918aa90a9c8370bd74a03abf8fcf9deb353b3a461a59a84"}, - {file = "zope.interface-7.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc0615351221926a36a0fbcb2520fb52e0b23e8c22a43754d9cb8f21358c33c0"}, - {file = "zope.interface-7.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:ce6cbb852fb8f2f9bb7b9cdca44e2e37bce783b5f4c167ff82cb5f5128163c8f"}, - {file = "zope.interface-7.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5566fd9271c89ad03d81b0831c37d46ae5e2ed211122c998637130159a120cf1"}, - {file = "zope.interface-7.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:da0cef4d7e3f19c3bd1d71658d6900321af0492fee36ec01b550a10924cffb9c"}, - {file = "zope.interface-7.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f32ca483e6ade23c7caaee9d5ee5d550cf4146e9b68d2fb6c68bac183aa41c37"}, - {file = "zope.interface-7.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:da21e7eec49252df34d426c2ee9cf0361c923026d37c24728b0fa4cc0599fd03"}, - {file = "zope.interface-7.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a8195b99e650e6f329ce4e5eb22d448bdfef0406404080812bc96e2a05674cb"}, - {file = "zope.interface-7.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:19c829d52e921b9fe0b2c0c6a8f9a2508c49678ee1be598f87d143335b6a35dc"}, - {file = "zope.interface-7.0.1.tar.gz", hash = "sha256:f0f5fda7cbf890371a59ab1d06512da4f2c89a6ea194e595808123c863c38eff"}, -] - -[package.dependencies] -setuptools = "*" - -[package.extras] -docs = ["Sphinx", "repoze.sphinx.autointerface", "sphinx-rtd-theme"] -test = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] -testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] - [metadata] lock-version = "2.0" python-versions = "~3.11" -content-hash = "04a023459d113e34eaf3cd43c04f1c5953507a0d4f2b4b9d59583b7320ccc25c" +content-hash = "8c1e1cd8cac4fd0c00a62cece0077d039153cdba0c7822ca3ce7387ff3b133e4" diff --git a/pyproject.toml b/pyproject.toml index 105977e7c..f29456781 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,6 @@ google_auth_oauthlib = "^1.2.0" google-auth-httplib2 = "^0.2.0" google-api-python-client = "^2.133.0" openpyxl = "^3.1.5" -celery = {version = "^5.4.0", extras = ["gevent"]} kombu = "^5.3.7" watchdog = "^4.0.1" redis = {extras = ["hiredis"], version = "^5.0.7"} @@ -55,19 +54,6 @@ structlog = "^24.4.0" pyyaml = "^6.0.1" nltk = "^3.9.1" -[tool.poetry.group.compass] -optional = false - -[tool.poetry.group.compass.dependencies] -# Compass dependencies - To be removed once Compass is OSS -fsspec = "2024.2.0" -joblib = "*" -pydantic = ">=2.6.3" -python = ">=3.9,<3.12" -requests = ">=2.25.0,<3.0.0" -tenacity = "8.2.3" -tqdm = ">=4.42.1" - [tool.poetry.group.dev] optional = true diff --git a/src/backend/alembic/versions/2024_09_18_803535b4e118_.py b/src/backend/alembic/versions/2024_09_18_803535b4e118_.py new file mode 100644 index 000000000..4b041b0e6 --- /dev/null +++ b/src/backend/alembic/versions/2024_09_18_803535b4e118_.py @@ -0,0 +1,34 @@ +""" + +Revision ID: 803535b4e118 +Revises: ac3933258035 +Create Date: 2024-09-18 
17:04:40.969832 + +""" +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = '803535b4e118' +down_revision: Union[str, None] = 'ac3933258035' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('agent_tool_metadata', 'artifacts', + existing_type=postgresql.JSON(astext_type=sa.Text()), + nullable=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('agent_tool_metadata', 'artifacts', + existing_type=postgresql.JSON(astext_type=sa.Text()), + nullable=True) + # ### end Alembic commands ### diff --git a/src/backend/compass_sdk/__init__.py b/src/backend/compass_sdk/__init__.py deleted file mode 100644 index 9db81f481..000000000 --- a/src/backend/compass_sdk/__init__.py +++ /dev/null @@ -1,393 +0,0 @@ -import logging -from enum import Enum -from os import getenv -from typing import Any, Dict, List, Optional, Union - -from pydantic import BaseModel - -from backend.compass_sdk.constants import ( - COHERE_API_ENV_VAR, - DEFAULT_COMMANDR_EXTRACTABLE_ATTRIBUTES, - DEFAULT_COMMANDR_PROMPT, - DEFAULT_MIN_CHARS_PER_ELEMENT, - DEFAULT_MIN_NUM_CHUNKS_IN_TITLE, - DEFAULT_MIN_NUM_TOKENS_CHUNK, - DEFAULT_NUM_TOKENS_CHUNK_OVERLAP, - DEFAULT_NUM_TOKENS_PER_CHUNK, - METADATA_HEURISTICS_ATTRIBUTES, - SKIP_INFER_TABLE_TYPES, -) - - -class Logger: - def __init__(self, name: str, log_level: int = logging.INFO): - self._logger = logging.getLogger(name) - self._logger.setLevel(log_level) - - formatter = logging.Formatter( - f"%(asctime)s-{name}-PID:%(process)d: %(message)s", "%d-%m-%y:%H:%M:%S" - ) - stream_handler = logging.StreamHandler() - stream_handler.setFormatter(formatter) - self._logger.addHandler(stream_handler) - - def info(self, msg: str): - self._logger.info(msg) - - def debug(self, msg: str): - self._logger.debug(msg) - - def error(self, msg: str): - self._logger.error(msg) - - def critical(self, msg: str): - self._logger.critical(msg) - - def warning(self, msg: str): - self._logger.warning(msg) - - def flush(self): - for handler in self._logger.handlers: - handler.flush() - - def setLevel(self, level: Union[int, str]): - self._logger.setLevel(level) - - -logger = Logger(name="compass-sdk", log_level=logging.INFO) - - -class ValidatedModel(BaseModel): - class Config: - arbitrary_types_allowed = True - use_enum_values = True - - @classmethod - def attribute_in_model(cls, attr_name): - return attr_name in cls.__fields__ - - def __init__(self, **data): - for name, value in data.items(): - if not self.attribute_in_model(name): - raise ValueError( - f"{name} is not a valid attribute for {self.__class__.__name__}" - ) - super().__init__(**data) - - -class CompassDocumentMetadata(ValidatedModel): - """ - Compass document metadata - """ - - doc_id: str = "" - filename: str = "" - meta: List = [] - - -class CompassDocumentStatus(str, Enum): - """ - Compass document status - """ - - Success = "success" - ParsingErrors = "parsing-errors" - MetadataErrors = "metadata-errors" - IndexingErrors = "indexing-errors" - - -class CompassSdkStage(str, Enum): - """ - Compass SDK stages - """ - - Parsing = "parsing" - Metadata = "metadata" - Chunking = "chunking" - Indexing = "indexing" - - -class 
CompassDocumentChunkOrigin(BaseModel): - page_number: Optional[int] = None - - -class CompassDocumentChunk(BaseModel): - chunk_id: str - sort_id: str - doc_id: str - content: Dict[str, Any] - origin: Optional[CompassDocumentChunkOrigin] = None - - -class CompassDocument(ValidatedModel): - """ - A Compass document contains all the information required to process a document and insert it into the index - It includes: - - metadata: the document metadata (e.g., filename, title, authors, date) - - content: the document content in string format - - elements: the document's Unstructured elements (e.g., tables, images, text). Used for chunking - - chunks: the document's chunks (e.g., paragraphs, tables, images). Used for indexing - - index_fields: the fields to be indexed. Used by the indexer - """ - - filebytes: bytes = b"" - metadata: CompassDocumentMetadata = CompassDocumentMetadata() - content: Dict[str, str] = {} - elements: List[Any] = [] - chunks: List[CompassDocumentChunk] = [] - index_fields: List[str] = [] - errors: List[Dict[CompassSdkStage, str]] = [] - ignore_metadata_errors: bool = True - is_dataset: bool = False - markdown: Optional[str] = None - - def has_data(self) -> bool: - return len(self.filebytes) > 0 - - def has_markdown(self) -> bool: - return self.markdown is not None - - def has_filename(self) -> bool: - return len(self.metadata.filename) > 0 - - def has_metadata(self) -> bool: - return len(self.metadata.meta) > 0 - - def has_parsing_errors(self) -> bool: - return any( - stage == CompassSdkStage.Parsing - for error in self.errors - for stage, _ in error.items() - ) - - def has_metadata_errors(self) -> bool: - return any( - stage == CompassSdkStage.Metadata - for error in self.errors - for stage, _ in error.items() - ) - - def has_indexing_errors(self) -> bool: - return any( - stage == CompassSdkStage.Indexing - for error in self.errors - for stage, _ in error.items() - ) - - @property - def status(self) -> CompassDocumentStatus: - if self.has_parsing_errors(): - return CompassDocumentStatus.ParsingErrors - - if not self.ignore_metadata_errors and self.has_metadata_errors(): - return CompassDocumentStatus.MetadataErrors - - if self.has_indexing_errors(): - return CompassDocumentStatus.IndexingErrors - - return CompassDocumentStatus.Success - - -class MetadataStrategy(str, Enum): - No_Metadata = "no_metadata" - Naive_Title = "naive_title" - KeywordSearch = "keyword_search" - Bart = "bart" - Command_R = "command_r" - Custom = "custom" - - @classmethod - def _missing_(cls, value): - return cls.No_Metadata - - -class LoggerLevel(str, Enum): - DEBUG = "DEBUG" - INFO = "INFO" - WARNING = "WARNING" - ERROR = "ERROR" - CRITICAL = "CRITICAL" - - @classmethod - def _missing_(cls, value): - return cls.INFO - - -class MetadataConfig(ValidatedModel): - """ - Configuration class for metadata detection. - :param pre_build_detectors: whether to pre-build all metadata detectors. If set to False (default), - detectors will be built on the fly when needed - :param metadata_strategy: the metadata detection strategy to use. 
One of: - - No_Metadata: no metadata is inferred - - Heuristics: metadata is inferred using heuristics - - Bart: metadata is inferred using the BART summarization model - - Command_R: metadata is inferred using the Command-R summarization model - :param cohere_api_key: the Cohere API key to use for metadata detection - :param commandr_model_name: the name of the Command-R model to use for metadata detection - :param commandr_prompt: the prompt to use for the Command-R model - :param commandr_extractable_attributes: the extractable attributes for the Command-R model - :param commandr_max_tokens: the maximum number of tokens to use for the Command-R model - :param keyword_search_attributes: the attributes to search for in the document when using keyword search - :param keyword_search_separator: the separator to use for nested attributes when using keyword search - :param ignore_errors: if set to True, metadata detection errors will not be raised or stop the parsing process - - """ - - pre_build_detectors: bool = False - metadata_strategy: MetadataStrategy = MetadataStrategy.No_Metadata - cohere_api_key: Optional[str] = getenv(COHERE_API_ENV_VAR, None) - commandr_model_name: str = "command-r" - commandr_prompt: str = DEFAULT_COMMANDR_PROMPT - commandr_max_tokens: int = 500 - commandr_extractable_attributes: List[str] = DEFAULT_COMMANDR_EXTRACTABLE_ATTRIBUTES - keyword_search_attributes: List[str] = METADATA_HEURISTICS_ATTRIBUTES - keyword_search_separator: str = "." - ignore_errors: bool = True - - -class ParsingStrategy(str, Enum): - Fast = "fast" - Hi_Res = "hi_res" - - @classmethod - def _missing_(cls, value): - return cls.Fast - - -class ParsingModel(str, Enum): - Marker = "marker" # Default model, it is actually a combination of models used by the Marker PDF parser - YoloX_Quantized = ( - "yolox_quantized" # Only PDF parsing working option from Unstructured - ) - - @classmethod - def _missing_(cls, value): - return cls.Marker - - -class DocumentFormat(str, Enum): - Markdown = "markdown" - Text = "text" - - @classmethod - def _missing_(cls, value): - return cls.Markdown - - -class ParserConfig(ValidatedModel): - """ - CompassParser configuration. Important parameters: - :param parsing_strategy: the parsing strategy to use: - - 'auto' (default): automatically determine the best strategy - - 'fast': leverage traditional NLP extraction techniques to quickly pull all the - text elements. “Fast” strategy is not good for image based file types. - - 'hi_res': identifies the layout of the document using detectron2. The advantage of “hi_res” - is that it uses the document layout to gain additional information about document elements. - We recommend using this strategy if your use case is highly sensitive to correct - classifications for document elements. - - 'ocr_only': leverage Optical Character Recognition to extract text from the image based files. - :param parsing_model: the parsing model to use. One of: - - yolox_quantized (default): single-stage object detection model, quantized. Runs faster than YoloX - See https://unstructured-io.github.io/unstructured/best_practices/models.html for more details. - We have temporarily removed the option to use other models because - of ongoing stability issues. 
- - """ - - # CompassParser configuration - logger_level: LoggerLevel = LoggerLevel.INFO - parse_tables: bool = True - parse_images: bool = True - parsed_images_output_dir: Optional[str] = None - allowed_image_types: Optional[List[str]] = None - min_chars_per_element: int = DEFAULT_MIN_CHARS_PER_ELEMENT - skip_infer_table_types: List[str] = SKIP_INFER_TABLE_TYPES - detect_datasets: bool = True - parsing_strategy: ParsingStrategy = ParsingStrategy.Fast - parsing_model: ParsingModel = ParsingModel.Marker - - # CompassChunker configuration - num_tokens_per_chunk: int = DEFAULT_NUM_TOKENS_PER_CHUNK - num_tokens_overlap: int = DEFAULT_NUM_TOKENS_CHUNK_OVERLAP - min_chunk_tokens: int = DEFAULT_MIN_NUM_TOKENS_CHUNK - num_chunks_in_title: int = DEFAULT_MIN_NUM_CHUNKS_IN_TITLE - max_tokens_metadata: int = 1000 - include_tables: bool = True - - # Formatting configuration - output_format: DocumentFormat = DocumentFormat.Markdown - - -### Document indexing - - -class Chunk(BaseModel): - chunk_id: str - sort_id: int - content: Dict[str, Any] - origin: Optional[CompassDocumentChunkOrigin] = None - - -class Document(BaseModel): - """ - A document that can be indexed in Compass (i.e., a list of indexable chunks) - """ - - doc_id: str - path: str - content: Dict[str, Any] - chunks: List[Chunk] - index_fields: List[str] = [] - - -class SearchFilter(BaseModel): - class FilterType(str, Enum): - EQ = "$eq" - LT_EQ = "$lte" - GT_EQ = "$gte" - - field: str - type: FilterType - value: Any - - -class SearchInput(BaseModel): - """ - Search query input - """ - - query: str - top_k: int - filters: Optional[List[SearchFilter]] = None - - -class PutDocumentsInput(BaseModel): - """ - A Compass request to put a list of Document - """ - - docs: List[Document] - - -class BatchPutDocumentsInput(BaseModel): - uuid: str - - -class ProcessFileParameters(ValidatedModel): - parser_config: ParserConfig - metadata_config: MetadataConfig - doc_id: Optional[str] = None - is_dataset: Optional[bool] = None - - -class ProcessFilesParameters(ValidatedModel): - doc_ids: Optional[List[str]] = None - parser_config: ParserConfig - metadata_config: MetadataConfig - are_datasets: Optional[bool] = None - - -class BatchProcessFilesParameters(ProcessFilesParameters): - uuid: str - file_name_to_doc_ids: Optional[Dict[str, str]] = None diff --git a/src/backend/compass_sdk/compass.py b/src/backend/compass_sdk/compass.py deleted file mode 100644 index 33ec3b2dc..000000000 --- a/src/backend/compass_sdk/compass.py +++ /dev/null @@ -1,549 +0,0 @@ -import os -import threading -from collections import deque -from dataclasses import dataclass -from statistics import mean -from typing import Dict, Iterator, List, Optional, Tuple, Union - -import requests -from joblib import Parallel, delayed -from pydantic import BaseModel -from requests.exceptions import InvalidSchema -from tenacity import ( - RetryError, - retry, - retry_if_not_exception_type, - stop_after_attempt, - wait_fixed, -) -from tqdm import tqdm - -from backend.compass_sdk import ( - BatchPutDocumentsInput, - Chunk, - CompassDocument, - CompassDocumentStatus, - CompassSdkStage, - Document, - LoggerLevel, - PutDocumentsInput, - SearchFilter, - SearchInput, -) -from backend.compass_sdk.constants import ( - DEFAULT_MAX_CHUNKS_PER_REQUEST, - DEFAULT_MAX_ERROR_RATE, - DEFAULT_MAX_RETRIES, - DEFAULT_SLEEP_RETRY_SECONDS, -) -from backend.services.logger.utils import LoggerFactory - -logger = LoggerFactory().get_logger() - - -@dataclass -class RetryResult: - result: Optional[dict] = None - error: 
Optional[str] = None - - -class CompassAuthError(Exception): - """Exception raised for authentication errors in the Compass client.""" - - def __init__( - self, - message=( - "Unauthorized. Please check your username and password, " - "which can be passed into CompassClient or set in the " - "secrets.yaml, in the tools.compass.username and tools.compass.password " - "config variables." - ), - ): - self.message = message - super().__init__(self.message) - - -class CompassMaxErrorRateExceeded(Exception): - """Exception raised when the error rate exceeds the maximum allowed error rate in the Compass client.""" - - def __init__( - self, - message="The maximum error rate was exceeded. Stopping the insertion process.", - ): - self.message = message - super().__init__(self.message) - - -class CompassClient: - def __init__( - self, - *, - index_url: str, - username: Optional[str] = None, - password: Optional[str] = None, - logger_level: LoggerLevel = LoggerLevel.INFO, - ): - """ - A compass client to interact with the Compass API - :param index_url: the url of the Compass instance - :param username: the username for the Compass instance - :param password: the password for the Compass instance - """ - self.index_url = index_url - self.username = username or os.getenv("COHERE_COMPASS_USERNAME") - self.password = password or os.getenv("COHERE_COMPASS_PASSWORD") - self.session = requests.Session() - - self.function_call = { - "create_index": self.session.put, - "list_indexes": self.session.get, - "delete_index": self.session.delete, - "delete_document": self.session.delete, - "get_document": self.session.get, - "put_documents": self.session.put, - "put_documents_batch": self.session.post, - "search_documents": self.session.post, - "add_context": self.session.post, - "refresh": self.session.post, - } - self.function_endpoint = { - "create_index": "/api/v1/indexes/{index_name}", - "list_indexes": "/api/v1/indexes", - "delete_index": "/api/v1/indexes/{index_name}", - "delete_document": "/api/v1/indexes/{index_name}/documents/{doc_id}", - "get_document": "/api/v1/indexes/{index_name}/documents/{doc_id}", - "put_documents": "/api/v1/indexes/{index_name}/documents", - "put_documents_batch": "/api/v1/batch/indexes/{index_name}", - "search_documents": "/api/v1/indexes/{index_name}/documents/search", - "add_context": "/api/v1/indexes/{index_name}/documents/add_context/{doc_id}", - "refresh": "/api/v1/indexes/{index_name}/refresh", - } - - def create_index(self, *, index_name: str): - """ - Create an index in Compass - :param index_name: the name of the index - :return: the response from the Compass API - """ - return self._send_request( - function="create_index", - index_name=index_name, - max_retries=DEFAULT_MAX_RETRIES, - sleep_retry_seconds=DEFAULT_SLEEP_RETRY_SECONDS, - ) - - def refresh(self, *, index_name: str): - """ - Refresh index - :param index_name: the name of the index - :return: the response from the Compass API - """ - return self._send_request( - function="refresh", - index_name=index_name, - max_retries=DEFAULT_MAX_RETRIES, - sleep_retry_seconds=DEFAULT_SLEEP_RETRY_SECONDS, - ) - - def delete_index(self, *, index_name: str): - """ - Delete an index from Compass - :param index_name: the name of the index - :return: the response from the Compass API - """ - return self._send_request( - function="delete_index", - index_name=index_name, - max_retries=DEFAULT_MAX_RETRIES, - sleep_retry_seconds=DEFAULT_SLEEP_RETRY_SECONDS, - ) - - def delete_document(self, *, index_name: str, doc_id: str): - """ - 
Delete a document from Compass - :param index_name: the name of the index - :doc_id: the id of the document - :return: the response from the Compass API - """ - return self._send_request( - function="delete_document", - index_name=index_name, - doc_id=doc_id, - max_retries=DEFAULT_MAX_RETRIES, - sleep_retry_seconds=DEFAULT_SLEEP_RETRY_SECONDS, - ) - - def get_document(self, *, index_name: str, doc_id: str): - """ - Get a document from Compass - :param index_name: the name of the index - :doc_id: the id of the document - :return: the response from the Compass API - """ - return self._send_request( - function="get_document", - index_name=index_name, - doc_id=doc_id, - max_retries=DEFAULT_MAX_RETRIES, - sleep_retry_seconds=DEFAULT_SLEEP_RETRY_SECONDS, - ) - - def list_indexes(self): - """ - List all indexes in Compass - :return: the response from the Compass API - """ - return self._send_request( - function="list_indexes", - index_name="", - max_retries=DEFAULT_MAX_RETRIES, - sleep_retry_seconds=DEFAULT_SLEEP_RETRY_SECONDS, - ) - - def add_context( - self, - *, - index_name: str, - doc_id: str, - context: Dict, - max_retries: int = DEFAULT_MAX_RETRIES, - sleep_retry_seconds: int = DEFAULT_SLEEP_RETRY_SECONDS, - ) -> Optional[RetryResult]: - """ - Update the content field of an existing document with additional context - - :param index_name: the name of the index - :param doc_id: the document to modify - :param context: A dictionary of key:value pairs to insert into the content field of a document - :param max_retries: the maximum number of times to retry a doc insertion - :param sleep_retry_seconds: number of seconds to go to sleep before retrying a doc insertion - """ - - return self._send_request( - function="add_context", - index_name=index_name, - doc_id=doc_id, - data=context, - max_retries=max_retries, - sleep_retry_seconds=sleep_retry_seconds, - ) - - def insert_doc( - self, - *, - index_name: str, - doc: CompassDocument, - max_retries: int = DEFAULT_MAX_RETRIES, - sleep_retry_seconds: int = DEFAULT_SLEEP_RETRY_SECONDS, - ) -> Optional[List[CompassDocument]]: - """ - Insert a parsed document into an index in Compass - :param index_name: the name of the index - :param doc: the parsed compass document - :param max_retries: the maximum number of times to retry a doc insertion - :param sleep_retry_seconds: number of seconds to go to sleep before retrying a doc insertion - """ - return self.insert_docs( - index_name=index_name, - docs=iter([doc]), - max_retries=max_retries, - sleep_retry_seconds=sleep_retry_seconds, - ) - - def insert_docs_batch(self, *, uuid: str, index_name: str): - """ - Insert a batch of parsed documents into an index in Compass - :param uuid: the uuid of the batch - :param index_name: the name of the index - """ - return self._send_request( - function="put_documents_batch", - index_name=index_name, - data=BatchPutDocumentsInput(uuid=uuid), - max_retries=DEFAULT_MAX_RETRIES, - sleep_retry_seconds=DEFAULT_SLEEP_RETRY_SECONDS, - ) - - def batch_status(self, *, uuid: str): - """ - Get the status of a batch - :param uuid: the uuid of the batch - """ - auth = ( - (self.username, self.password) if self.username and self.password else None - ) - resp = self.session.get( - url=f"{self.index_url}/api/v1/batch/status/{uuid}", - auth=auth, - ) - - if resp.ok: - return resp.json() - else: - raise Exception( - f"Failed to get batch status: {resp.status_code} {resp.text}" - ) - - def insert_docs( - self, - *, - index_name: str, - docs: Iterator[CompassDocument], - 
max_chunks_per_request: int = DEFAULT_MAX_CHUNKS_PER_REQUEST, - max_error_rate: float = DEFAULT_MAX_ERROR_RATE, - max_retries: int = DEFAULT_MAX_RETRIES, - sleep_retry_seconds: int = DEFAULT_SLEEP_RETRY_SECONDS, - errors_sliding_window_size: Optional[int] = 10, - skip_first_n_docs: int = 0, - num_jobs: Optional[int] = None, - ) -> Optional[List[CompassDocument]]: - """ - Insert multiple parsed documents into an index in Compass - :param index_name: the name of the index - :param docs: the parsed documents - :param max_chunks_per_request: the maximum number of chunks to send in a single API request - :param num_jobs: the number of parallel jobs to use - :param max_error_rate: the maximum error rate allowed - :param max_retries: the maximum number of times to retry a request if it fails - :param sleep_retry_seconds: the number of seconds to wait before retrying an API request - :param errors_sliding_window_size: the size of the sliding window to keep track of errors - :param skip_first_n_docs: number of docs to skip indexing. Useful when insertion failed after N documents - """ - - def put_request( - request_data: List[Tuple[CompassDocument, Document]], - previous_errors: List[CompassDocument], - num_doc: int, - ) -> None: - nonlocal num_succeeded, errors - errors.extend(previous_errors) - compass_docs: List[CompassDocument] = [ - compass_doc for compass_doc, _ in request_data - ] - put_docs_input = PutDocumentsInput( - docs=[input_doc for _, input_doc in request_data] - ) - - # It could be that all documents have errors, in which case we should not send a request - # to the Compass Server. This is a common case when the parsing of the documents fails. - # In this case, only errors will appear in the insertion_docs response - if not request_data: - return - - results = self._send_request( - function="put_documents", - index_name=index_name, - data=put_docs_input, - max_retries=max_retries, - sleep_retry_seconds=sleep_retry_seconds, - ) - - if results.error: - for doc in compass_docs: - doc.errors.append({CompassSdkStage.Indexing: results.error}) - errors.append(doc) - else: - num_succeeded += len(compass_docs) - - # Keep track of the results of the last N API calls to calculate the error rate - # If the error rate is higher than the threshold, stop the insertion process - error_window.append(results.error) - error_rate = ( - mean([1 if x else 0 for x in error_window]) - if len(error_window) == error_window.maxlen - else 0 - ) - if error_rate > max_error_rate: - raise CompassMaxErrorRateExceeded( - f"[Thread {threading.get_native_id()}]{error_rate * 100}% of insertions failed " - f"in the last {errors_sliding_window_size} API calls. Stopping the insertion process." 
- ) - - error_window = deque( - maxlen=errors_sliding_window_size - ) # Keep track of the results of the last N API calls - num_succeeded = 0 - errors = [] - requests_iter = tqdm(self._get_request_blocks(docs, max_chunks_per_request)) - - try: - num_jobs = num_jobs or os.cpu_count() - Parallel(n_jobs=num_jobs, backend="threading")( - delayed(put_request)( - request_data=request_block, - previous_errors=previous_errors, - num_doc=i, - ) - for i, (request_block, previous_errors) in enumerate(requests_iter, 1) - if i > skip_first_n_docs - ) - except CompassMaxErrorRateExceeded as e: - logger.error(event="[CompassError]", error=e.message) - return errors if len(errors) > 0 else None - - @staticmethod - def _get_request_blocks( - docs: Iterator[CompassDocument], - max_chunks_per_request: int, - ) -> Iterator: - """ - Create request blocks to send to the Compass API - :param docs: the documents to send - :param max_chunks_per_request: the maximum number of chunks to send in a single API request - :return: an iterator over the request blocks - """ - - request_block, errors = [], [] - num_chunks = 0 - for num_doc, doc in enumerate(docs, 1): - if doc.status != CompassDocumentStatus.Success: - logger.error( - event="Document errors", - errors=doc.errors, - num_doc=num_doc, - thread_id=threading.get_native_id(), - ) - errors.append(doc) - else: - num_chunks += ( - len(doc.chunks) - if doc.status == CompassDocumentStatus.Success - else 0 - ) - if num_chunks > max_chunks_per_request: - yield request_block, errors - request_block, errors = [], [] - num_chunks = 0 - - request_block.append( - ( - doc, - Document( - doc_id=doc.metadata.doc_id, - path=doc.metadata.filename, - content=doc.content, - chunks=[Chunk(**c.model_dump()) for c in doc.chunks], - index_fields=doc.index_fields, - ), - ) - ) - - if len(request_block) > 0 or len(errors) > 0: - yield request_block, errors - - def search( - self, - *, - index_name: str, - query: str, - top_k: int = 10, - filters: Optional[List[SearchFilter]] = None, - ): - """ - Search your Compass index - :param index_name: the name of the index - :param query: query to search for - :param top_k: number of documents to return - """ - return self._send_request( - function="search_documents", - index_name=index_name, - data=SearchInput(query=query, top_k=top_k, filters=filters), - max_retries=1, - sleep_retry_seconds=1, - ) - - def _send_request( - self, - index_name: str, - function: str, - max_retries: int, - sleep_retry_seconds: int, - data: Optional[Union[Dict, BaseModel]] = None, - doc_id: Optional[str] = None, - ) -> RetryResult: - """ - Send a request to the Compass API - :param function: the function to call - :param index_name: the name of the index - :param max_retries: the number of times to retry the request - :param sleep_retry_seconds: the number of seconds to sleep between retries - :param data: the data to send - :return: An error message if the request failed, otherwise None - """ - - @retry( - stop=stop_after_attempt(max_retries), - wait=wait_fixed(sleep_retry_seconds), - retry=retry_if_not_exception_type((CompassAuthError, InvalidSchema)), - ) - def _send_request_with_retry(): - nonlocal error - try: - if data: - if isinstance(data, BaseModel): - data_dict = data.model_dump() - elif isinstance(data, Dict): - data_dict = data - - response = self.function_call[function]( - target_path, json=data_dict, auth=(self.username, self.password) - ) - else: - response = self.function_call[function]( - target_path, auth=(self.username, self.password) - ) - - if 
response.ok: - error = None - return RetryResult(result=response.json(), error=None) - else: - response.raise_for_status() - - except requests.exceptions.HTTPError as e: - if e.response.status_code == 401: - error = "Unauthorized. Please check your username and password." - raise CompassAuthError() - else: - error = str(e) + " " + e.response.text - logger.error( - event="Failed to send request to", - function=function, - target_path=target_path, - type=type(e), - error=error, - sleep_retry_seconds=sleep_retry_seconds, - thread_id=threading.get_native_id(), - ) - raise e - - except Exception as e: - error = str(e) - logger.error( - event="Failed to send request to", - function=function, - target_path=target_path, - type=type(e), - error=error, - sleep_retry_seconds=sleep_retry_seconds, - thread_id=threading.get_native_id(), - ) - raise e - - error = None - try: - target_path = self.index_url + self.function_endpoint[function].format( - index_name=index_name, doc_id=doc_id - ) - res = _send_request_with_retry() - if res: - return res - else: - return RetryResult(result=None, error=error) - except RetryError: - logger.error( - event="Failed to send request after max_retries attempts. Aborting.", - max_retries=max_retries, - thread_id=threading.get_native_id(), - ) - return RetryResult(result=None, error=error) diff --git a/src/backend/compass_sdk/constants.py b/src/backend/compass_sdk/constants.py deleted file mode 100644 index c684d73d3..000000000 --- a/src/backend/compass_sdk/constants.py +++ /dev/null @@ -1,35 +0,0 @@ -DEFAULT_MAX_CHUNKS_PER_REQUEST = 100 -DEFAULT_SLEEP_RETRY_SECONDS = 5 -DEFAULT_MAX_RETRIES = 1 -DEFAULT_MAX_ERROR_RATE = 0.5 -DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES = 50_000_000 - -DEFAULT_MIN_CHARS_PER_ELEMENT = 3 -DEFAULT_NUM_TOKENS_PER_CHUNK = 500 -DEFAULT_NUM_TOKENS_CHUNK_OVERLAP = 15 -DEFAULT_MIN_NUM_TOKENS_CHUNK = 5 -DEFAULT_MIN_NUM_CHUNKS_IN_TITLE = 1 - -DEFAULT_WIDTH_HEIGHT_VERTICAL_RATIO = 0.6 -SKIP_INFER_TABLE_TYPES = ["jpg", "png", "xls", "xlsx", "heic"] - -# Metadata detection constants -COHERE_API_ENV_VAR = "COHERE_API_KEY" -DEFAULT_COMMANDR_EXTRACTABLE_ATTRIBUTES = ["title", "authors", "date"] -DEFAULT_COMMANDR_PROMPT = """ - Given the following document: - {text}. - Extract the following attributes from the document: {attributes}. - Write the output in JSON format. For example, if the document title is "Hello World" - and the authors are "John Doe" and "Jane Smith", the output should be: - {{"title": "Hello World", "authors": ["John Doe", "Jane Smith"]}}. - Do not write the ```json (...) ``` tag. The output should be a valid JSON. - If you cannot find the information, write "" for the corresponding field. 
- Answer: - """ -METADATA_HEURISTICS_ATTRIBUTES = [ - "title", - "name", - "date", - "authors", -] diff --git a/src/backend/compass_sdk/parser.py b/src/backend/compass_sdk/parser.py deleted file mode 100644 index fd13b8ea0..000000000 --- a/src/backend/compass_sdk/parser.py +++ /dev/null @@ -1,351 +0,0 @@ -import json -import os -from concurrent.futures import ThreadPoolExecutor -from typing import Any, Callable, Dict, Iterable, List, Optional, Union - -import requests - -from backend.compass_sdk import ( - BatchProcessFilesParameters, - CompassDocument, - MetadataConfig, - ParserConfig, - ProcessFileParameters, -) -from backend.compass_sdk.constants import DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES -from backend.compass_sdk.utils import imap_queued, open_document, scan_folder -from backend.services.logger.utils import LoggerFactory - -logger = LoggerFactory().get_logger() - -Fn_or_Dict = Union[Dict[str, Any], Callable[[CompassDocument], Dict[str, Any]]] - - -class CompassParserClient: - """ - Client to interact with the CompassParser API. It allows to process files using the parser and metadata - configurations specified in the parameters. The client is stateful, that is, it can be initialized with - parser and metadata configurations that will be used for all subsequent files processed by the client. - Also, independently of the default configurations, the client allows to pass specific configurations for each file - when calling the process_file or process_files methods. The client is responsible for opening the files and - sending them to the CompassParser API for processing. The resulting documents are returned as CompassDocument - objects. - - :param parser_url: URL of the CompassParser API - :param parser_config: Default parser configuration to use when processing files - :param metadata_config: Default metadata configuration to use when processing files - - """ - - def __init__( - self, - *, - parser_url: str, - parser_config: ParserConfig = ParserConfig(), - metadata_config: MetadataConfig = MetadataConfig(), - username: Optional[str] = None, - password: Optional[str] = None, - num_workers: int = 4, - ): - """ - Initializes the CompassParserClient with the specified parser_url, parser_config, and metadata_config. - The parser_config and metadata_config are optional, and if not provided, the default configurations will be used. - If the parser/metadata configs are provided, they will be used for all subsequent files processed by the client - unless specific configs are passed when calling the process_file or process_files methods. 
- - :param parser_url: the URL of the CompassParser API - :param parser_config: the parser configuration to use when processing files if no parser configuration - is specified in the method calls (process_file or process_files) - :param metadata_config: the metadata configuration to use when processing files if no metadata configuration - is specified in the method calls (process_file or process_files) - """ - self.parser_url = ( - parser_url if not parser_url.endswith("/") else parser_url[:-1] - ) - self.parser_config = parser_config - self.username = username or os.getenv("COHERE_COMPASS_USERNAME") - self.password = password or os.getenv("COHERE_COMPASS_PASSWORD") - self.session = requests.Session() - self.thread_pool = ThreadPoolExecutor(num_workers) - self.num_workers = num_workers - self.metadata_config = metadata_config - - def process_folder( - self, - *, - folder_path: str, - allowed_extensions: Optional[List[str]] = None, - recursive: bool = False, - parser_config: Optional[ParserConfig] = None, - metadata_config: Optional[MetadataConfig] = None, - custom_context: Optional[Fn_or_Dict] = None, - ): - """ - Processes all the files in the specified folder using the default parser and metadata configurations - passed when creating the client. The method iterates over all the files in the folder and processes them - using the process_file method. The resulting documents are returned as a list of CompassDocument objects. - - :param folder_path: the folder to process - :param allowed_extensions: the list of allowed extensions to process - :param recursive: whether to process the folder recursively - :param parser_config: the parser configuration to use when processing files if no parser configuration - is specified in the method calls (process_file or process_files) - :param metadata_config: the metadata configuration to use when processing files if no metadata configuration - is specified in the method calls (process_file or process_files) - :param custom_context: Additional data to add to compass document. Fields will be filterable but not semantically searchable. - Can either be a dictionary or a callable that takes a CompassDocument and returns a dictionary. - - :return: the list of processed documents - """ - filenames = scan_folder( - folder_path=folder_path, - allowed_extensions=allowed_extensions, - recursive=recursive, - ) - return self.process_files( - filenames=filenames, - parser_config=parser_config, - metadata_config=metadata_config, - custom_context=custom_context if custom_context else None, - ) - - def process_files( - self, - *, - filenames: List[str], - file_ids: Optional[List[str]] = None, - parser_config: Optional[ParserConfig] = None, - metadata_config: Optional[MetadataConfig] = None, - are_datasets: Optional[List[bool]] = None, - custom_context: Optional[Fn_or_Dict] = None, - ) -> Iterable[CompassDocument]: - """ - Processes a list of files provided as filenames, using the specified parser and metadata configurations. - - If the parser/metadata configs are not provided, then the default configs passed by parameter when - creating the client will be used. This makes the CompassParserClient stateful. That is, we can set the - parser/metadata configs only once when creating the parser client, and process all subsequent files - without having to pass the config every time. - - All the documents passed as filenames and opened to obtain their bytes. 
Then, they are packed into a - ProcessFilesParameters object that contains a list of ProcessFileParameters, each contain a file, - its id, and the parser/metadata config - - :param filenames: List of filenames to process - :param file_ids: List of ids for the files - :param parser_config: ParserConfig object (applies the same config to all docs) - :param metadata_config: MetadataConfig object (applies the same config to all docs) - :param are_datasets: List of booleans indicating whether each file is a dataset - :param custom_context: Additional data to add to compass document. Fields will be filterable but not semantically searchable. - Can either be a dictionary or a callable that takes a CompassDocument and returns a dictionary. - - :return: List of processed documents - """ - - def process_file(i: int) -> List[CompassDocument]: - return self.process_file( - filename=filenames[i], - file_id=file_ids[i] if file_ids else None, - parser_config=parser_config, - metadata_config=metadata_config, - is_dataset=are_datasets[i] if are_datasets else None, - custom_context=custom_context, - ) - - for results in imap_queued( - self.thread_pool, - process_file, - range(len(filenames)), - max_queued=self.num_workers, - ): - yield from results - - @staticmethod - def _get_metadata( - doc: CompassDocument, custom_context: Optional[Fn_or_Dict] = None - ) -> Dict[str, Any]: - if custom_context is None: - return {} - elif callable(custom_context): - return custom_context(doc) - else: - return custom_context - - def process_file( - self, - *, - filename: str, - file_id: Optional[str] = None, - parser_config: Optional[ParserConfig] = None, - metadata_config: Optional[MetadataConfig] = None, - is_dataset: Optional[bool] = None, - custom_context: Optional[Fn_or_Dict] = None, - ) -> List[CompassDocument]: - """ - Takes in a file, its id, and the parser/metadata config. If the config is None, then it uses the - default configs passed by parameter when creating the client. This makes the CompassParserClient - stateful for convenience, that is, one can pass in the parser/metadata config only once when creating the - CompassParserClient, and process files without having to pass the config every time - - :param filename: Filename to process - :param file_id: Id for the file - :param parser_config: ParserConfig object with the config to use for parsing the file - :param metadata_config: MetadataConfig object with the config to use for extracting metadata for each document - :param is_dataset: Boolean indicating whether the file is a dataset. If True, the file will be processed - as a dataset and multiple CompassDocument objects might be returned (one per dataset record). Otherwise, - the file will be processed as a single document (e.g., a PDF file). Default is None, which means that - the server will try to infer whether the file is a dataset or not. - :param custom_context: Additional data to add to compass document. Fields will be filterable but not semantically searchable. - Can either be a dictionary or a callable that takes a CompassDocument and returns a dictionary. 
- - :return: List of resulting documents - """ - doc = open_document(filename) - if doc.errors: - logger.error( - event="Error opening document", - errors=doc.errors, - ) - return [] - if len(doc.filebytes) > DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES: - logger.error( - event="File too large, supported file size in mb", - supported_file_size={DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES / 1000_000}, - filename=doc.metadata.filename, - ) - return [] - - parser_config = parser_config or self.parser_config - metadata_config = metadata_config or self.metadata_config - - params = ProcessFileParameters( - parser_config=parser_config, - metadata_config=metadata_config, - doc_id=file_id, - is_dataset=is_dataset, - ) - auth = ( - (self.username, self.password) if self.username and self.password else None - ) - res = self.session.post( - url=f"{self.parser_url}/v1/process_file", - data={"data": json.dumps(params.model_dump())}, - files={"file": (filename, doc.filebytes)}, - auth=auth, - ) - - if res.ok: - docs = [CompassDocument(**doc) for doc in res.json()["docs"]] - for doc in docs: - additional_metadata = CompassParserClient._get_metadata( - doc=doc, custom_context=custom_context - ) - doc.content = {**doc.content, **additional_metadata} - else: - docs = [] - logger.error( - event="Error processing file", - error=res.text, - ) - - return docs - - def batch_upload(self, *, zip_file_path: str) -> str: - """ - Uploads a zip file to the for offline processing. The zip file should contain the files to process. - The zip file is sent to the server, and the server will process each file in the zip file using the default - parser and metadata configurations passed when creating the client. - - :param zip_file_path: the path to the zip file to upload - :return: uuid for the uploaded zip file - """ - if not zip_file_path.endswith(".zip"): - raise Exception("Allowed type is only zip") - - auth = ( - (self.username, self.password) if self.username and self.password else None - ) - with open(zip_file_path, "rb") as zip_file: - zip_data = zip_file.read() - res = self.session.post( - url=f"{self.parser_url}/v1/batch/upload", - data={"data": {"is_dataset": False}}, - files={"file": ("data.zip", zip_data)}, - auth=auth, - ) - - if res.ok: - return res.json() - else: - logger.error( - event="Error uploading file", - error=res.text, - ) - raise Exception(f"Error uploading zip file: {res.text}") - - def batch_status(self, uuid: str) -> str: - """ - Returns the status of the batch processing job with the specified uuid. 
The status can be one of the following: - - "PROCESSING": the job is being processed - - "DONE": the job has been processed successfully - - "ERROR": the job has failed to process - - :param uuid: the uuid of the batch processing job - :return: the status of the batch processing job - """ - auth = ( - (self.username, self.password) if self.username and self.password else None - ) - res = self.session.get( - url=f"{self.parser_url}/v1/batch/status", - params={"uuid": uuid}, - auth=auth, - ) - - if res.ok: - return res.json() - else: - logger.error(event="Error getting batch status", error=res.text) - raise Exception(f"Error getting batch status: {res.text}") - - def batch_run( - self, - *, - uuid: str, - file_name_to_doc_ids: Optional[Dict[str, str]] = None, - parser_config: Optional[ParserConfig] = None, - metadata_config: Optional[MetadataConfig] = None, - are_datasets: Optional[bool] = None, - ) -> List[CompassDocument]: - parser_config = parser_config or self.parser_config - metadata_config = metadata_config or self.metadata_config - - params = BatchProcessFilesParameters( - uuid=uuid, - file_name_to_doc_ids=file_name_to_doc_ids, - parser_config=parser_config, - metadata_config=metadata_config, - are_datasets=are_datasets, - ) - auth = ( - (self.username, self.password) if self.username and self.password else None - ) - res = self.session.post( - url=f"{self.parser_url}/v1/batch/run", - data={"data": json.dumps(params.model_dump())}, - auth=auth, - ) - - if res.ok: - return res.json() - else: - docs = [] - logger.error( - event="Error processing file", - error=res.text, - ) - - # # Run metadata detection locally if a metadata detector was provided. - # # This overrides the metadata generated by the server using the metadata_config provided in the method call - # self._add_metadata(docs=docs, metadata_detector=metadata_detector, metadata_config=metadata_config) - return docs diff --git a/src/backend/compass_sdk/utils.py b/src/backend/compass_sdk/utils.py deleted file mode 100644 index d661341e4..000000000 --- a/src/backend/compass_sdk/utils.py +++ /dev/null @@ -1,100 +0,0 @@ -import glob -import os -from collections import deque -from concurrent.futures import Executor, Future -from typing import Callable, Iterable, Iterator, List, Optional, TypeVar - -import fsspec -from fsspec import AbstractFileSystem - -from backend.compass_sdk import ( - CompassDocument, - CompassDocumentMetadata, - CompassSdkStage, -) - -T = TypeVar("T") -U = TypeVar("U") - - -def imap_queued( - executor: Executor, f: Callable[[T], U], it: Iterable[T], max_queued: int -) -> Iterator[U]: - assert max_queued >= 1 - tasks = deque[Future[U]]() - - for x in it: - tasks.append(executor.submit(f, x)) - - while len(tasks) > max_queued: - yield tasks.popleft().result() - - while tasks: - yield tasks.popleft().result() - - -def get_fs(document_path: str) -> AbstractFileSystem: - """ - Get the filesystem object for the given document path - :param document_path: the path to the document - :return: the filesystem object - """ - if document_path.find("://") >= 0: - file_system = document_path.split("://")[0] - fs = fsspec.filesystem(file_system) - else: - fs = fsspec.filesystem("local") - return fs - - -def open_document(document_path) -> CompassDocument: - """ - Opens a document regardless of the file system (local, GCS, S3, etc.) 
and returns a file-like object - :param document_path: the path to the document - :return: a file-like object - """ - doc = CompassDocument(metadata=CompassDocumentMetadata(filename=document_path)) - try: - fs = get_fs(document_path) - with fs.open(document_path, "rb") as f: - val = f.read() - if isinstance(val, bytes): - doc.filebytes = val - else: - raise Exception(f"Expected bytes, got {type(val)}") - except Exception as e: - doc.errors = [{CompassSdkStage.Parsing: str(e)}] - return doc - - -def scan_folder( - folder_path: str, - allowed_extensions: Optional[List[str]] = None, - recursive: bool = False, -) -> List[str]: - """ - Scans a folder for files with the given extensions - :param folder_path: the path to the folder - :param allowed_extensions: the allowed extensions - :param recursive: whether to scan the folder recursively or to only scan the top level - :return: a list of file paths - """ - fs = get_fs(folder_path) - all_files = [] - path_prepend = ( - f"{folder_path.split('://')[0]}://" if folder_path.find("://") >= 0 else "" - ) - - if allowed_extensions is None: - allowed_extensions = [""] - else: - allowed_extensions = [ - f".{ext}" if not ext.startswith(".") else ext for ext in allowed_extensions - ] - - for ext in allowed_extensions: - rec_glob = "**/" if recursive else "" - pattern = os.path.join(glob.escape(folder_path), f"{rec_glob}*{ext}") - scanned_files = fs.glob(pattern, recursive=recursive) - all_files.extend([f"{path_prepend}{f}" for f in scanned_files]) - return all_files diff --git a/src/backend/config/configuration.template.yaml b/src/backend/config/configuration.template.yaml index a97101238..6c385b756 100644 --- a/src/backend/config/configuration.template.yaml +++ b/src/backend/config/configuration.template.yaml @@ -41,7 +41,6 @@ feature_flags: # Experimental features use_experimental_langchain: false use_agents_view: false - use_compass_file_storage: false # Community features use_community_features: true auth: @@ -55,6 +54,3 @@ logger: strategy: structlog renderer: console level: info -sync: - broker_url: redis://:redis@redis:6379 - worker_concurrency: 4 diff --git a/src/backend/config/secrets.template.yaml b/src/backend/config/secrets.template.yaml index 94db2c14a..2ff9f1116 100644 --- a/src/backend/config/secrets.template.yaml +++ b/src/backend/config/secrets.template.yaml @@ -20,9 +20,6 @@ tools: api_key: wolfram_alpha: app_id: - compass: - username: - password: google_drive: client_id: client_secret: @@ -38,9 +35,4 @@ auth: oidc: client_id: client_secret: - well_known_endpoint: -compass: - username: - password: - api_url: - parser_url: + well_known_endpoint: \ No newline at end of file diff --git a/src/backend/config/settings.py b/src/backend/config/settings.py index 63f92a245..2fcf24103 100644 --- a/src/backend/config/settings.py +++ b/src/backend/config/settings.py @@ -112,13 +112,6 @@ class FeatureFlags(BaseSettings, BaseModel): "USE_COMMUNITY_FEATURES", "use_community_features" ), ) - use_compass_file_storage: Optional[bool] = Field( - default=False, - validation_alias=AliasChoices( - "USE_COMPASS_FILE_STORAGE", "use_compass_file_storage" - ), - ) - class PythonToolSettings(BaseSettings, BaseModel): model_config = SETTINGS_CONFIG @@ -127,25 +120,6 @@ class PythonToolSettings(BaseSettings, BaseModel): ) -class CompassSettings(BaseSettings, BaseModel): - model_config = SETTINGS_CONFIG - username: Optional[str] = Field( - default=None, - validation_alias=AliasChoices("COHERE_COMPASS_USERNAME", "username"), - ) - password: Optional[str] = Field( - 
default=None, - validation_alias=AliasChoices("COHERE_COMPASS_PASSWORD", "password"), - ) - api_url: Optional[str] = Field( - default=None, validation_alias=AliasChoices("COHERE_COMPASS_API_URL", "api_url") - ) - parser_url: Optional[str] = Field( - default=None, - validation_alias=AliasChoices("COHERE_COMPASS_PARSER_URL", "parser_url"), - ) - - class TavilySearchSettings(BaseSettings, BaseModel): model_config = SETTINGS_CONFIG api_key: Optional[str] = Field( @@ -308,17 +282,6 @@ class LoggerSettings(BaseSettings, BaseModel): ) -class SyncSettings(BaseSettings, BaseModel): - model_config = SETTINGS_CONFIG - broker_url: Optional[str] = Field( - default=None, validation_alias=AliasChoices("BROKER_URL", "broker_url") - ) - worker_concurrency: Optional[int] = Field( - default=4, - validation_alias=AliasChoices("WORKER_CONCURRENCY", "worker_concurrency"), - ) - - class Settings(BaseSettings): """ Settings class used to grab environment variables from configuration.yaml @@ -335,8 +298,6 @@ class Settings(BaseSettings): redis: Optional[RedisSettings] = Field(default=RedisSettings()) deployments: Optional[DeploymentSettings] = Field(default=DeploymentSettings()) logger: Optional[LoggerSettings] = Field(default=LoggerSettings()) - compass: Optional[CompassSettings] = Field(default=CompassSettings()) - sync: Optional[SyncSettings] = Field(default=SyncSettings()) @classmethod def settings_customise_sources( diff --git a/src/backend/crud/agent_task.py b/src/backend/crud/agent_task.py deleted file mode 100644 index 55c7c05c1..000000000 --- a/src/backend/crud/agent_task.py +++ /dev/null @@ -1,28 +0,0 @@ -from typing import List - -from sqlalchemy.orm import Session - -from backend.database_models.agent_task import AgentTask, SyncCeleryTaskMeta -from backend.services.logger.utils import LoggerFactory -from backend.services.transaction import validate_transaction - -logger = LoggerFactory().get_logger() - - -@validate_transaction -def create_agent_task(db: Session, agent_id: str, task_id: str) -> AgentTask: - agent_task = AgentTask(agent_id=agent_id, task_id=task_id) - db.add(agent_task) - db.commit() - db.refresh(agent_task) - return agent_task - - -@validate_transaction -def get_agent_tasks_by_agent_id(db: Session, agent_id: str) -> List[SyncCeleryTaskMeta]: - return ( - db.query(SyncCeleryTaskMeta) - .join(AgentTask, AgentTask.task_id == SyncCeleryTaskMeta.task_id) - .filter(AgentTask.agent_id == agent_id) - .all() - ) diff --git a/src/backend/database_models/__init__.py b/src/backend/database_models/__init__.py index d339338e6..20d1c116a 100644 --- a/src/backend/database_models/__init__.py +++ b/src/backend/database_models/__init__.py @@ -1,6 +1,5 @@ # ruff: noqa from backend.database_models.agent import * -from backend.database_models.agent_task import * from backend.database_models.agent_tool_metadata import * from backend.database_models.base import * from backend.database_models.blacklist import * diff --git a/src/backend/database_models/agent_task.py b/src/backend/database_models/agent_task.py deleted file mode 100644 index d19c25c95..000000000 --- a/src/backend/database_models/agent_task.py +++ /dev/null @@ -1,43 +0,0 @@ -from sqlalchemy import ( - DateTime, - ForeignKey, - Integer, - LargeBinary, - String, - Text, - UniqueConstraint, -) -from sqlalchemy.orm import Mapped, mapped_column - -from backend.database_models.base import Base, MinimalBase - - -class SyncCeleryTaskMeta(MinimalBase): - __tablename__ = "sync_celery_taskmeta" - - id: Mapped[int] = mapped_column(Integer, primary_key=True) - 
task_id: Mapped[str] = mapped_column(String(155), unique=True) - status: Mapped[str] = mapped_column(String(50)) - result: Mapped[bytes] = mapped_column(LargeBinary) - date_done: Mapped[DateTime] = mapped_column(DateTime) - traceback: Mapped[str] = mapped_column(Text) - name: Mapped[str] = mapped_column(String(155)) - args: Mapped[bytes] = mapped_column(LargeBinary) - kwargs: Mapped[bytes] = mapped_column(LargeBinary) - worker: Mapped[str] = mapped_column(String(155)) - retries: Mapped[int] = mapped_column(Integer) - queue: Mapped[str] = mapped_column(String(155)) - - -class AgentTask(Base): - __tablename__ = "agent_tasks" - - agent_id: Mapped[str] = mapped_column( - ForeignKey("agents.id", ondelete="CASCADE"), nullable=False - ) - - task_id: Mapped[str] = mapped_column(nullable=False) - - __table_args__ = ( - UniqueConstraint("agent_id", "task_id", name="unique_agent_task"), - ) diff --git a/src/backend/routers/agent.py b/src/backend/routers/agent.py index cb218e8ce..d15a2d4b2 100644 --- a/src/backend/routers/agent.py +++ b/src/backend/routers/agent.py @@ -1,17 +1,14 @@ import asyncio -from typing import List, Optional +from typing import Optional from fastapi import APIRouter, Depends, HTTPException from fastapi import File as RequestFile from fastapi import UploadFile as FastAPIUploadFile from backend.config.routers import RouterName -from backend.config.settings import Settings -from backend.config.tools import ToolName from backend.crud import agent as agent_crud from backend.crud import agent_tool_metadata as agent_tool_metadata_crud from backend.crud import snapshot as snapshot_crud -from backend.crud.agent_task import get_agent_tasks_by_agent_id from backend.database_models.agent import Agent as AgentModel from backend.database_models.agent_tool_metadata import ( AgentToolMetadata as AgentToolMetadataModel, @@ -21,7 +18,6 @@ from backend.schemas.agent import ( Agent, AgentPublic, - AgentTaskResponse, AgentToolMetadata, AgentToolMetadataPublic, AgentVisibility, @@ -42,14 +38,12 @@ agent_to_metrics_agent, ) from backend.services.agent import ( - parse_task, raise_db_error, validate_agent_exists, validate_agent_tool_metadata_exists, ) from backend.services.context import get_context from backend.services.file import ( - consolidate_agent_files_in_compass, get_file_service, validate_file, ) @@ -58,8 +52,6 @@ validate_update_agent_request, validate_user_header, ) -from backend.services.sync.jobs.sync_agent import sync_agent -from backend.tools.files import FileToolsArtifactTypes router = APIRouter( prefix="/v1/agents", @@ -118,30 +110,6 @@ async def create_agent( created_agent, tool_metadata, session, ctx ) - # Consolidate agent files into one index in compass - file_tools = [ToolName.Read_File, ToolName.Search_File] - if ( - Settings().feature_flags.use_compass_file_storage - and created_agent.tools_metadata - ): - artifacts = next( - ( - tool_metadata.artifacts - for tool_metadata in created_agent.tools_metadata - if tool_metadata.tool_name in file_tools - ), - [], - ) - file_ids = list( - { - artifact.get("id") - for artifact in artifacts - if artifact.get("type") == FileToolsArtifactTypes.local_file - } - ) - if file_ids: - await consolidate_agent_files_in_compass(file_ids, created_agent.id, ctx) - if deployment_db and model_db: deployment_config = ( agent.deployment_config @@ -161,9 +129,6 @@ async def create_agent( ctx.with_agent(agent_schema) ctx.with_metrics_agent(agent_to_metrics_agent(agent_schema)) - # initiate agent sync job - sync_agent.apply_async(args=[created_agent.id]) 
- return created_agent except Exception as e: logger.exception(event=e) @@ -279,23 +244,6 @@ async def get_agent_deployments( ] -@router.get( - "/{agent_id}/tasks", - response_model=List[AgentTaskResponse], - dependencies=[ - Depends(validate_user_header), - ], -) -async def get_agent_tasks( - agent_id: str, - session: DBSessionDep, - ctx: Context = Depends(get_context), -) -> List[AgentTaskResponse]: - raw_tasks = get_agent_tasks_by_agent_id(session, agent_id) - tasks = [parse_task(t) for t in raw_tasks] - return tasks - - @router.put( "/{agent_id}", response_model=AgentPublic, diff --git a/src/backend/schemas/metrics.py b/src/backend/schemas/metrics.py index 5eca412cf..46d79e6b1 100644 --- a/src/backend/schemas/metrics.py +++ b/src/backend/schemas/metrics.py @@ -30,10 +30,6 @@ class MetricsMessageType(str, Enum): RERANK_API_FAIL = "rerank_api_call_failure" # pending implementation ENV_LIVENESS = "env_liveness" - COMPASS_NEW_INDEX = "compass_new_index" - COMPASS_REMOVE_INDEX = "compass_remove_index" - COMPASS_NEW_USER = "compass_new_user" - COMPASS_REMOVE_USER = "compass_remove_user" UNKNOWN_SIGNAL = "unknown" diff --git a/src/backend/services/agent.py b/src/backend/services/agent.py index c9b5e12d0..69f5d71da 100644 --- a/src/backend/services/agent.py +++ b/src/backend/services/agent.py @@ -1,13 +1,10 @@ -import pickle from fastapi import HTTPException from backend.crud import agent as agent_crud from backend.crud import agent_tool_metadata as agent_tool_metadata_crud from backend.database_models.agent import Agent, AgentToolMetadata -from backend.database_models.agent_task import SyncCeleryTaskMeta from backend.database_models.database import DBSessionDep -from backend.schemas.agent import AgentTaskResponse TASK_TRACE_PREVIEW_LIMIT = 200 @@ -47,25 +44,3 @@ def raise_db_error(e: Exception, type: str, name: str): ) raise HTTPException(status_code=500, detail=str(e)) - - -def parse_task(t: SyncCeleryTaskMeta) -> AgentTaskResponse: - result = None - exception_snippet = None - if t.status == "SUCCESS": - result = pickle.loads(t.result) - if t.status == "FAILURE": - trace_lines = t.traceback.split("\n") - if len(trace_lines) >= 2: - # first 200 characters of the exception - exception_snippet = trace_lines[-2][:TASK_TRACE_PREVIEW_LIMIT] + "...check logs for details" - - return AgentTaskResponse( - task_id=t.task_id, - status=t.status, - name=t.name, - retries=t.retries, - result=result, - exception_snippet=exception_snippet, - date_done=str(t.date_done), - ) diff --git a/src/backend/services/compass.py b/src/backend/services/compass.py deleted file mode 100644 index 32db53bc9..000000000 --- a/src/backend/services/compass.py +++ /dev/null @@ -1,344 +0,0 @@ -import json -import os -from enum import Enum -from typing import Any, Dict, List, Optional - -from backend.compass_sdk import ( - CompassDocument, - MetadataConfig, - ParserConfig, - ProcessFileParameters, -) -from backend.compass_sdk.compass import CompassClient -from backend.compass_sdk.constants import DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES -from backend.compass_sdk.parser import CompassParserClient -from backend.config.settings import Settings -from backend.services.logger.utils import LoggerFactory - -logger = LoggerFactory().get_logger() -compass = None - -def get_compass(): - """ - Initialize a singular instance of Compass if not initialized yet - - Returns: - Compass: The singleton Compass instance - """ - global compass - - if compass is None: - try: - compass = Compass( - compass_api_url=Settings().compass.api_url, # type: ignore - 
compass_parser_url=Settings().compass.parser_url, # type: ignore - compass_username=Settings().compass.username, # type: ignore - compass_password=Settings().compass.password, # type: ignore - ) - except Exception as e: - logger.error( - event=f"[Compass File Service] Error initializing Compass: {e}" - ) - raise e - return compass - -class Compass: - """Interface to interact with a Compass instance.""" - - class ValidActions(Enum): - LIST_INDEXES = "list_indexes" - CREATE_INDEX = "create_index" - DELETE_INDEX = "delete_index" - CREATE = "create" - SEARCH = "search" - UPDATE = "update" - DELETE = "delete" - GET_DOCUMENT = "get_document" - ADD_CONTEXT = "add_context" - REFRESH = "refresh" - PROCESS_FILE = "process_file" - - def __init__( - self, - compass_api_url: Optional[str] = None, - compass_parser_url: Optional[str] = None, - compass_username: Optional[str] = None, - compass_password: Optional[str] = None, - metadata_config=MetadataConfig(), - parser_config=ParserConfig(), - ): - """Initialize the Compass tool. Pass the Compass URL, username, and password - as arguments or as environment variables.""" - self.compass_api_url = compass_api_url or Settings().compass.api_url - self.compass_parser_url = compass_parser_url or Settings().compass.parser_url - self.username = compass_username or Settings().compass.username - self.password = compass_password or Settings().compass.password - if self.compass_api_url is None or self.compass_parser_url is None: - message = "[Compass] Error initializing Compass client: API url or parser url missing." - logger.exception(event=message) - raise Exception(message) - self.parser_config = parser_config - self.metadata_config = metadata_config - # Try initializing Compass Parser and Client and call list_indexes - # to check if the credentials are correct. - self.parser_client = CompassParserClient( - parser_url=self.compass_parser_url, - username=self.username, - password=self.password, - parser_config=self.parser_config, - metadata_config=self.metadata_config, - ) - self.compass_client = CompassClient( - index_url=self.compass_api_url, - username=self.username, - password=self.password, - ) - - def health_check(self): - try: - self.compass_client.list_indexes() - except Exception as e: - logger.exception(event=f"[Compass] Error initializing Compass client: {e}") - raise e - - def invoke( - self, - action: ValidActions, - parameters: dict = {}, - **kwargs: Any, - ) -> List[Dict[str, Any]]: - """Call the Compass tool. Allowed `action` values: - - list_indexes: List all indexes in Compass. - - create_index: Create a new index in Compass. - - delete_index: Delete an existing index in Compass. - - create: Create a new document in Compass. - - search: Search for documents in Compass. - - update: Update an existing document in Compass. - - delete: Delete an existing document in Compass. 
- """ - - # Check if index is specified - if not parameters.get("index", None) and action.value not in [ - self.ValidActions.LIST_INDEXES.value, - self.ValidActions.PROCESS_FILE.value, - ]: - raise Exception( - f"[Compass] Error invoking Compass: No index specified in parameters {parameters}", - ) - - # Index-related actions - try: - match action.value: - case self.ValidActions.LIST_INDEXES.value: - return self.compass_client.list_indexes() - case self.ValidActions.CREATE_INDEX.value: - return self.compass_client.create_index( - index_name=parameters["index"] - ) - case self.ValidActions.DELETE_INDEX.value: - return self.compass_client.delete_index( - index_name=parameters["index"] - ) - case self.ValidActions.CREATE.value: - return self._create(parameters, **kwargs) - case self.ValidActions.SEARCH.value: - return self._search(parameters, **kwargs) - case self.ValidActions.UPDATE.value: - return self._update(parameters, **kwargs) - case self.ValidActions.DELETE.value: - return self._delete(parameters, **kwargs) - case self.ValidActions.GET_DOCUMENT.value: - return self._get_document(parameters, **kwargs) - case self.ValidActions.ADD_CONTEXT.value: - return self._add_context(parameters, **kwargs) - case self.ValidActions.REFRESH.value: - return self._refresh(parameters, **kwargs) - case self.ValidActions.PROCESS_FILE.value: - return self._process_file(parameters, **kwargs) - case _: - raise Exception( - f"[Compass] Error invoking Compass: Invalid action in parameters {parameters}" - ) - except Exception as error: - logger.error( - event="[Compass] Error invoking Compass", - error=error, - ) - raise Exception(f"[Compass] Error invoking Compass: {error}") - - def _create(self, parameters: dict, **kwargs: Any) -> Dict[str, str]: - """Insert the document into Compass""" - compass_docs = self._process_file(parameters, **kwargs) - if compass_docs is None: - raise Exception( - "[Compass] Error inserting document: Failed to process file" - ) - - if doc_metadata := parameters.get("metadata", None): - for doc in compass_docs: - doc.metadata.meta.append(doc_metadata) - - error = self.compass_client.insert_docs( - index_name=parameters["index"], - docs=compass_docs, - ) - if error is not None: - message = (f"[Compass] Error inserting document: {error}",) - logger.error(event=message) - raise Exception(message) - - def _search(self, parameters: dict, **kwargs: Any) -> None: - """Run a search query on Compass and return the - top_k results. 
By default, k=10.""" - if not parameters.get("query", None): - message = f"[Compass] Error searching Compass: No search query specified in parameters {parameters}" - raise Exception(message) - - return self.compass_client.search( - index_name=parameters["index"], - query=parameters["query"], - top_k=parameters.get("top_k", 10), - filters=parameters.get("filters", None), - ) - - def _update(self, parameters: dict, **kwargs: Any) -> None: - """Update file in Compass""" - self._delete(parameters, **kwargs) - self._create(parameters, **kwargs) - - def _delete(self, parameters: dict, **kwargs: Any) -> None: - """Delete file from Compass""" - # Check if file_id is specified for file-related actions - if not parameters.get("file_id", None): - raise Exception( - f"[Compass] Error deleting file: No file_id in parameters {parameters}" - ) - self.compass_client.delete_document( - index_name=parameters["index"], - doc_id=parameters["file_id"], - ) - - def _get_document(self, parameters: dict, **kwargs: Any) -> None: - """Get document with id from Compass""" - # Check if file_id is specified for file-related actions - if not parameters.get("file_id", None): - raise Exception( - f"[Compass] Error fetching document: No file_id in parameters {parameters}" - ) - return self.compass_client.get_document( - index_name=parameters["index"], - doc_id=parameters["file_id"], - ) - - def _add_context(self, parameters: dict, **kwargs: Any) -> None: - """Adds context to a document with id in Compass""" - # Check if file_id is specified for file-related actions - if not parameters.get("file_id", None): - raise Exception( - f"[Compass] Error adding context: No file_id in parameters {parameters}" - ) - if not parameters.get("context", None): - raise Exception( - f"[Compass] Error adding context: Context cannot be empty for parameters {parameters}" - ) - self.compass_client.add_context( - index_name=parameters["index"], - doc_id=parameters["file_id"], - context=parameters["context"], - ) - - def _refresh(self, parameters: dict, **kwargs: Any) -> None: - """Refresh an index in Compass""" - self.compass_client.refresh(index_name=parameters["index"]) - - def _process_file(self, parameters: dict, **kwargs: Any) -> None: - """Parse the input file.""" - # Check if file_id is specified for file-related actions - if not parameters.get("file_id", None): - raise Exception( - f"[Compass] Error processing file: No file_id specified in parameters {parameters}" - ) - - # Check if filename is specified for file-related actions - if not parameters.get("filename", None) and ( - not parameters.get("file_bytes", None) - or not parameters.get("file_extension", None) - ): - logger.error( - event=f"[Compass] Error processing file: No filename or file_text or file_extension specified in parameters {parameters.keys()}" - ) - return None - - file_id = parameters["file_id"] - filename = parameters.get("filename", None) - file_bytes = parameters.get("file_bytes", None) - file_extension = parameters.get("file_extension", None) - - if filename and not os.path.exists(filename): - logger.error( - event="[Compass] Error processing file: Invalid filename in parameters", - filename=filename, - parameters=parameters, - ) - return None - - parser_config = self.parser_config or parameters.get("parser_config", None) - metadata_config = metadata_config = self.metadata_config or parameters.get( - "metadata_config", None - ) - - if filename: - return self.parser_client.process_file( - filename=filename, - file_id=file_id, - parser_config=parser_config, - 
metadata_config=metadata_config, - is_dataset=False, - custom_context=parameters.get("custom_context", None), - ) - else: - return self._raw_parsing( - file_id=file_id, - file_bytes=file_bytes, - file_extension=file_extension, - custom_context=parameters.get("custom_context", {}), - ) - - def _raw_parsing( - self, file_id: str, file_bytes: str, file_extension: str, custom_context: dict - ): - if len(file_bytes) > DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES: - logger.error( - event="[Compass] Error parsing file: File Size is too large", - file_bytes=len(file_bytes), - max_size=DEFAULT_MAX_ACCEPTED_FILE_SIZE_BYTES, - ) - return [] - - params = ProcessFileParameters( - parser_config=self.parser_config, - metadata_config=self.metadata_config, - doc_id=file_id, - ) - auth = ( - (self.username, self.password) if self.username and self.password else None - ) - res = self.parser_client.session.post( - url=f"{self.parser_client.parser_url}/v1/process_file", - data={"data": json.dumps(params.model_dump())}, - files={"file": ("{}.{}".format(file_id, file_extension), file_bytes)}, - auth=auth, - ) - - if res.ok: - docs = [CompassDocument(**doc) for doc in res.json()["docs"]] - for doc in docs: - additional_metadata = CompassParserClient._get_metadata( - doc=doc, custom_context=custom_context - ) - doc.content = {**doc.content, **additional_metadata} - else: - docs = [] - logger.error(event=f"[Compass] Error processing file: {res.text}") - - return docs diff --git a/src/backend/services/file.py b/src/backend/services/file.py index cc227a1a5..fa61401cb 100644 --- a/src/backend/services/file.py +++ b/src/backend/services/file.py @@ -1,6 +1,4 @@ import io -import uuid -from datetime import datetime import pandas as pd from docx import Document @@ -10,7 +8,6 @@ import backend.crud.conversation as conversation_crud import backend.crud.file as file_crud -from backend.config.settings import Settings from backend.crud import message as message_crud from backend.database_models.conversation import ConversationFileAssociation from backend.database_models.database import DBSessionDep @@ -19,7 +16,6 @@ from backend.schemas.file import ConversationFilePublic, File from backend.services import utils from backend.services.agent import validate_agent_exists -from backend.services.compass import Compass, get_compass from backend.services.context import get_context from backend.services.logger.utils import LoggerFactory @@ -60,19 +56,10 @@ class FileService: """ FileService class - This class manages interfacing with different file storage solutions. Currently it supports storing files in the Postgres DB and or using Compass. - By default Toolkit will run with Postgres DB as the storage solution for files. - To enable Compass as the storage solution, set the `use_compass_file_storage` feature flag to `true` in the .env or .configuration file. - Also be sure to add the appropriate Compass environment variables to the .env or .configuration file. + This class manages interfacing with different file storage solutions, + currently supports storing files in PostgreSQL. 
""" - @property - def is_compass_enabled(self) -> bool: - """ - Returns whether Compass is enabled as the file storage solution - """ - return Settings().feature_flags.use_compass_file_storage - async def create_conversation_files( self, session: DBSessionDep, @@ -94,12 +81,7 @@ async def create_conversation_files( Returns: list[File]: The files that were created """ - if self.is_compass_enabled: - uploaded_files = await insert_files_in_compass( - files, user_id, ctx, conversation_id - ) - else: - uploaded_files = await insert_files_in_db(session, files, user_id) + uploaded_files = await insert_files_in_db(session, files, user_id) for file in uploaded_files: conversation_crud.create_conversation_file_association( @@ -130,15 +112,7 @@ async def create_agent_files( Returns: list[File]: The files that were created """ - uploaded_files = [] - if self.is_compass_enabled: - """ - Since agents are created after the files are upload we index files into dummy indices first - We later consolidate them in consolidate_agent_files_in_compass() to a singular index when an agent is created. - """ - uploaded_files = await insert_files_in_compass(files, user_id, ctx) - else: - uploaded_files = await insert_files_in_db(session, files, user_id) + uploaded_files = await insert_files_in_db(session, files, user_id) return uploaded_files @@ -182,10 +156,7 @@ def get_files_by_agent_id( } ) - if self.is_compass_enabled: - files = get_files_in_compass(agent_id, file_ids, user_id, ctx) - else: - files = file_crud.get_files_by_ids(session, file_ids, user_id) + files = file_crud.get_files_by_ids(session, file_ids, user_id) return files @@ -215,10 +186,7 @@ def get_files_by_conversation_id( files = [] if file_ids is not None: - if self.is_compass_enabled: - files = get_files_in_compass(conversation_id, file_ids, user_id, ctx) - else: - files = file_crud.get_files_by_ids(session, file_ids, user_id) + files = file_crud.get_files_by_ids(session, file_ids, user_id) return files @@ -243,10 +211,7 @@ def delete_conversation_file_by_id( session, conversation_id, file_id, user_id ) - if self.is_compass_enabled: - delete_file_in_compass(conversation_id, file_id, user_id, ctx) - else: - file_crud.delete_file(session, file_id, user_id) + file_crud.delete_file(session, file_id, user_id) return @@ -267,10 +232,7 @@ def delete_agent_file_by_id( file_id (str): The file ID user_id (str): The user ID """ - if self.is_compass_enabled: - delete_file_in_compass(agent_id, file_id, user_id, ctx) - else: - file_crud.delete_file(session, file_id, user_id) + file_crud.delete_file(session, file_id, user_id) return @@ -294,24 +256,10 @@ def delete_all_conversation_files( """ logger = ctx.get_logger() - if self.is_compass_enabled: - compass = get_compass() - try: - logger.info( - event=f"[Compass File Service] Deleting conversation {conversation_id} files from Compass" - ) - compass.invoke( - action=Compass.ValidActions.DELETE_INDEX, - parameters={"index": conversation_id}, - ) - except Exception as e: - logger.error( - event=f"[Compass File Service] Error deleting conversation {conversation_id} files from Compass: {e}" - ) - else: - file_crud.bulk_delete_files(session, file_ids, user_id) - - return + logger.info( + event=f"Deleting conversation {conversation_id} files from DB." 
+ ) + file_crud.bulk_delete_files(session, file_ids, user_id) def get_files_by_message_id( self, session: DBSessionDep, message_id: str, user_id: str, ctx: Context @@ -330,271 +278,17 @@ def get_files_by_message_id( message = message_crud.get_message(session, message_id, user_id) files = [] if message.file_ids is not None: - if self.is_compass_enabled: - files = get_files_in_compass( - message.conversation_id, message.file_ids, user_id, ctx - ) - else: - files = file_crud.get_files_by_ids(session, message.file_ids, user_id) - return files - - -# Compass Operations -def delete_file_in_compass( - index: str, file_id: str, user_id: str, ctx: Context -) -> None: - """ - Delete a file from Compass - - Args: - index (str): The index - file_id (str): The file ID - user_id (str): The user ID - ctx (Context): Context object - - Raises: - HTTPException: If the file is not found - """ - logger = ctx.get_logger() - compass = get_compass() - - try: - logger.info( - event=f"[Compass File Service] Deleting file {file_id} from Compass {index}" - ) - compass.invoke( - action=Compass.ValidActions.DELETE, - parameters={"index": index, "file_id": file_id}, - ) - except Exception as e: - logger.error( - event=f"[Compass File Service] Error deleting file {file_id} on index {index} from Compass: {e}" - ) - - -def get_files_in_compass( - index: str, file_ids: list[str], user_id: str, ctx: Context -) -> list[File]: - """ - Get files from Compass - - Args: - index (str): The index - file_ids (list[str]): The file IDs - user_id (str): The user ID - - Returns: - list[File]: The files that were created - """ - compass = get_compass() - logger = ctx.get_logger() - - files = [] - for file_id in file_ids: - try: - fetched_doc = compass.invoke( - action=Compass.ValidActions.GET_DOCUMENT, - parameters={"index": index, "file_id": file_id}, - ).result["doc"]["content"] - except Exception as e: - logger.error( - event=f"[Compass File Service] Error fetching file {file_id} on index {index} from Compass: {e}" - ) - raise HTTPException( - status_code=404, detail=f"File with ID: {file_id} not found." - ) - - files.append( - File( - id=file_id, - file_name=fetched_doc["file_name"], - file_size=fetched_doc["file_size"], - file_content=fetched_doc["text"], - user_id=user_id, - created_at=datetime.fromisoformat(fetched_doc["created_at"]), - updated_at=datetime.fromisoformat(fetched_doc["updated_at"]), - ) - ) - - return files - + files = file_crud.get_files_by_ids(session, message.file_ids, user_id) -async def consolidate_agent_files_in_compass( - file_ids, - agent_id, - ctx: Context, -) -> None: - """ - Consolidate files into a single index (agent ID) in Compass. - We do this because when agents are created after a file is uploaded, the file is not associated with the agent. - We consolidate them in a single index to under one agent ID when an agent is created. 
- - Args: - file_ids (list[str]): The file IDs - agent_id (str): The agent ID - ctx (Context): Context object - """ - logger = ctx.get_logger() - compass = get_compass() - - try: - logger.info( - event="[Compass File Service] Creating index for agent files", - agent_id=agent_id - ) - response = compass.invoke( - action=Compass.ValidActions.CREATE_INDEX, - parameters={ - "index": agent_id, - }, - ) - logger.info( - event="[Compass File Service] Finished creating index for agent files", - agent_id=agent_id, - response=response - ) - except Exception as e: - logger.Error( - event=f"[Compass File Service] Error creating index for agent files: {agent_id}, error: {e}" - ) - raise HTTPException( - status_code=500, - detail=f"Error creating index for agent files: {agent_id}, error: {e}", - ) - - for file_id in file_ids: - try: - fetched_doc = compass.invoke( - action=Compass.ValidActions.GET_DOCUMENT, - parameters={"index": file_id, "file_id": file_id}, - ).result["doc"]["content"] - compass.invoke( - action=Compass.ValidActions.CREATE, - parameters={ - "index": agent_id, - "file_id": file_id, - "file_bytes": fetched_doc["text"], - "file_extension": get_file_extension(fetched_doc["file_name"]), - "custom_context": { - "file_id": file_id, - "file_name": fetched_doc["file_name"], - "file_size": fetched_doc["file_size"], - "user_id": fetched_doc["user_id"], - "created_at": fetched_doc["created_at"], - "updated_at": fetched_doc["updated_at"], - }, - }, - ) - compass.invoke( - action=Compass.ValidActions.REFRESH, - parameters={"index": agent_id}, - ) - logger.info( - event=f"[Compass File Service] Delete temporary file index: {file_id}" - ) - # Remove the temporary file index entry - compass.invoke( - action=Compass.ValidActions.DELETE_INDEX, parameters={"index": file_id} - ) - except Exception as e: - logger.error( - event=f"[Compass File Service] Error consolidating file {file_id} into agent {agent_id}, error: {e}" - ) - raise HTTPException( - status_code=500, - detail=f"Error consolidating file {file_id} into agent {agent_id}, error: {e}", - ) - - -async def insert_files_in_compass( - files: list[FastAPIUploadFile], - user_id: str, - ctx: Context, - index: str = None, -) -> list[File]: - logger = ctx.get_logger() - compass = get_compass() - - if index is not None: - try: - compass.invoke( - action=Compass.ValidActions.CREATE_INDEX, - parameters={ - "index": index, - }, - ) - except Exception as e: - logger.error( - event=f"[Compass File Service] Failed to create index: {index}, error: {e}" - ) - - uploaded_files = [] - for file in files: - filename = file.filename.encode("ascii", "ignore").decode("utf-8") - file_bytes = await file.read() - new_file_id = str(uuid.uuid4()) - - # Create temporary index for individual file (files not associated with conversations) - # Consolidate them under one agent index during agent creation - if index is None: - try: - compass.invoke( - action=Compass.ValidActions.CREATE_INDEX, - parameters={ - "index": new_file_id, - }, - ) - except Exception as e: - logger.error( - event=f"[Compass File Service] Failed to create index: {index}, error: {e}" - ) - - try: - compass.invoke( - action=Compass.ValidActions.CREATE, - parameters={ - "index": new_file_id if index is None else index, - "file_id": new_file_id, - "file_bytes": file_bytes, - "file_extension": get_file_extension(filename), - "custom_context": { - "file_id": new_file_id, - "file_name": filename, - "file_size": file.size, - "user_id": user_id, - "created_at": datetime.now().isoformat(), - "updated_at": 
datetime.now().isoformat(), - }, - }, - ) - compass.invoke( - action=Compass.ValidActions.REFRESH, - parameters={"index": new_file_id if index is None else index}, - ) - except Exception as e: - logger.error( - event=f"[Compass File Service] Failed to create document on index: {index}, error: {e}" - ) - - uploaded_files.append( - File( - file_name=filename, - id=new_file_id, - file_size=file.size, - user_id=user_id, - created_at=datetime.now(), - updated_at=datetime.now(), - ) - ) - - return uploaded_files + return files # Misc def validate_file( session: DBSessionDep, file_id: str, user_id: str, index: str, ctx: Context ) -> File: - """Validates if a file exists and belongs to the user + """ + Validates if a file exists and belongs to the user Args: session (DBSessionDep): Database session @@ -607,10 +301,7 @@ def validate_file( Raises: HTTPException: If the file is not found """ - if Settings().feature_flags.use_compass_file_storage: - file = get_files_in_compass(index, [file_id], user_id, ctx)[0] - else: - file = file_crud.get_file(session, file_id, user_id) + file = file_crud.get_file(session, file_id, user_id) if not file: raise HTTPException( diff --git a/src/backend/services/request_validators.py b/src/backend/services/request_validators.py index c55c27b11..961cccc1c 100644 --- a/src/backend/services/request_validators.py +++ b/src/backend/services/request_validators.py @@ -186,12 +186,12 @@ async def validate_chat_request(session: DBSessionDep, request: Request): return managed_tools = [tool["name"] for tool in tools if tool["name"] in AVAILABLE_TOOLS] - if len(managed_tools) > 0 and len(tools) != len(managed_tools): + if managed_tools and len(tools) != len(managed_tools): raise HTTPException( status_code=400, detail="Cannot mix both managed and custom tools" ) - if len(managed_tools) == 0: + if not managed_tools: for tool in tools: if not tool.get("description"): raise HTTPException( diff --git a/src/backend/services/sync/Dockerfile b/src/backend/services/sync/Dockerfile deleted file mode 100644 index d1574a19a..000000000 --- a/src/backend/services/sync/Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -FROM python:3.11 - -RUN apt-get update && apt-get -y install cron vim - -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 -ENV PYTHONPATH=/app -ENV PYTHONIOENCODING=utf-8 -ENV PYTHONPATH=/app/src/ -ENV POETRY_VIRTUALENVS_IN_PROJECT=true -ENV VIRTUAL_ENV=/app/.venv -ENV PATH="$VIRTUAL_ENV/bin:$PATH" - -WORKDIR /app - -RUN pip install poetry==1.6.1 -ENV POETRY_NO_INTERACTION=1 \ - POETRY_VIRTUALENVS_IN_PROJECT=1 \ - POETRY_VIRTUALENVS_CREATE=1 \ - POETRY_CACHE_DIR=/tmp/poetry_cache -COPY ./pyproject.toml ./poetry.lock ./ -RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry install --no-root - -COPY ./src/backend/services/sync/crontab /etc/cron.d/crontab -COPY ./src/backend /app/src/backend - -RUN chmod 0644 /etc/cron.d/crontab -RUN /usr/bin/crontab /etc/cron.d/crontab -RUN touch /etc/cron.log -RUN chmod +x /app/src/backend/services/sync/publisher.sh - -# run crond as main process of container -CMD ["cron", "-f"] \ No newline at end of file diff --git a/src/backend/services/sync/__init__.py b/src/backend/services/sync/__init__.py deleted file mode 100644 index 78f5f03aa..000000000 --- a/src/backend/services/sync/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from celery import Celery - -""" -Run celery as a listener - -celery -A src.backend.services.sync worker -P gevent --loglevel=INFO - -Run celery as a daemon -todo -""" -# Start celery. 
-app = Celery("sync") -app.config_from_object("src.backend.services.sync.celeryconfig") diff --git a/src/backend/services/sync/celeryconfig.py b/src/backend/services/sync/celeryconfig.py deleted file mode 100644 index e008ca7df..000000000 --- a/src/backend/services/sync/celeryconfig.py +++ /dev/null @@ -1,68 +0,0 @@ -# Celery configuration -# http://docs.celeryproject.org/en/latest/configuration.html - -import socket - -from backend.services.sync.constants import ( - DEFAULT_TIME_OUT, - SYNC_BROKER_URL, - SYNC_DATABASE_URL, - SYNC_WORKER_CONCURRENCY, -) - -# Sockets timeout -# This helps with GDrive SDK download timeouts -socket.setdefaulttimeout(DEFAULT_TIME_OUT) - -# broker and db -broker_url = SYNC_BROKER_URL - -# see https://docs.celeryq.dev/en/stable/userguide/configuration.html#database-url-examples -result_backend = "db+{}".format(SYNC_DATABASE_URL) - -# retry connecting if disconnected -broker_connection_retry_on_startup = True - -# extended tasks format -result_extended = True - -# retry connecting if disconnected -broker_connection_retry_on_startup = True - -# update task status when STARTED -task_track_started = True - -# How many messages to prefetch at a time multiplied by the number of concurrent processes -worker_prefetch_multiplier = 1 - -# If True, then the backend object is shared across threads -# result_backend_thread_safe = True - -# The maximum number of connections that can be open in the connection pool -broker_pool_limit = SYNC_WORKER_CONCURRENCY - -# Use UTC instead of localtime -CELERY_enable_utc = True - -# result format -task_serializer = "json" -result_serializer = "json" -accept_content = ["json"] - -# custom table names -database_table_names = { - "task": "sync_celery_taskmeta", - "group": "sync_celery_tasksetmeta", -} - -worker_concurrency = SYNC_WORKER_CONCURRENCY - -# Use UTC instead of localtime -CELERY_enable_utc = True - -# modules to include -include = [ - "backend.tools.google_drive", - "backend.services.sync.jobs.sync_agent", - "backend.services.sync.jobs.sync_agent_activity", -] diff --git a/src/backend/services/sync/config.py b/src/backend/services/sync/config.py deleted file mode 100644 index b06121772..000000000 --- a/src/backend/services/sync/config.py +++ /dev/null @@ -1,8 +0,0 @@ -import httpx -from dotenv import load_dotenv - -load_dotenv() - - -class Configuration: - HTTPX_CLIENT: httpx.Client = httpx.Client(http2=True, timeout=300) diff --git a/src/backend/services/sync/constants.py b/src/backend/services/sync/constants.py deleted file mode 100644 index 56df315aa..000000000 --- a/src/backend/services/sync/constants.py +++ /dev/null @@ -1,19 +0,0 @@ -from enum import Enum - -from dotenv import load_dotenv - -from backend.config.settings import Settings - -load_dotenv() - -# Very high, extreme fallback in case Compass is busy -DEFAULT_TIME_OUT = 10 * 60 -SYNC_BROKER_URL = Settings().sync.broker_url -SYNC_DATABASE_URL = Settings().database.url -SYNC_WORKER_CONCURRENCY = int(Settings().sync.worker_concurrency) - - -class Status(Enum): - SUCCESS = "success" - CANCELLED = "cancelled" - FAIL = "fail" diff --git a/src/backend/services/sync/crontab b/src/backend/services/sync/crontab deleted file mode 100644 index 32dbc9d70..000000000 --- a/src/backend/services/sync/crontab +++ /dev/null @@ -1,3 +0,0 @@ -# START CRON JOB -0 * * * * /app/src/backend/services/sync/publisher.sh > /proc/1/fd/1 2>/proc/1/fd/2 -# END CRON JOB \ No newline at end of file diff --git a/src/backend/services/sync/env.py b/src/backend/services/sync/env.py deleted file mode 
100644 index cc83c8a0c..000000000 --- a/src/backend/services/sync/env.py +++ /dev/null @@ -1,13 +0,0 @@ -from functools import lru_cache - -from pydantic import ValidationError - -from backend.services.sync.config import Configuration - - -@lru_cache(maxsize=1) -def env() -> Configuration: - try: - return Configuration() - except ValidationError as e: - Configuration.handle_validation_error(e) diff --git a/src/backend/services/sync/executor.sh b/src/backend/services/sync/executor.sh deleted file mode 100755 index 09adf4e81..000000000 --- a/src/backend/services/sync/executor.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash - -# Default to the production version. -if [ "$1" = "development" ]; then - exec watchmedo auto-restart \ - --directory=src/backend --pattern=*.py --recursive -- \ - celery -A backend.services.sync worker -P gevent --loglevel=INFO -else - exec celery -A backend.services.sync worker -P gevent --loglevel=INFO -fi diff --git a/src/backend/services/sync/jobs/sync_agent.py b/src/backend/services/sync/jobs/sync_agent.py deleted file mode 100644 index 9d0f583a2..000000000 --- a/src/backend/services/sync/jobs/sync_agent.py +++ /dev/null @@ -1,42 +0,0 @@ -from backend.config.tools import ToolName -from backend.crud.agent import get_agent_by_id -from backend.crud.agent_tool_metadata import get_all_agent_tool_metadata_by_agent_id -from backend.database_models.database import get_session -from backend.schemas.agent import Agent, AgentToolMetadata -from backend.services.sync import app -from backend.services.sync.constants import DEFAULT_TIME_OUT -from backend.tools.google_drive import ( - handle_google_drive_sync, - list_google_drive_artifacts_file_ids, -) - - -@app.task(time_limit=DEFAULT_TIME_OUT) -def sync_agent(agent_id: str): - """ - sync_agent is a job that aims to one time sync the remote artifact with Compass - Once that job is complete, future jobs will be purely about recent activity (sync_agent_activity) - """ - agent_tool_metadata = [] - session = next(get_session()) - agent = get_agent_by_id(session, agent_id, override_user_id=True) - agent_schema = Agent.model_validate(agent) - agent_tool_metadata = get_all_agent_tool_metadata_by_agent_id(session, agent_id) - agent_tool_metadata_schema = [ - AgentToolMetadata.model_validate(x) for x in agent_tool_metadata - ] - for metadata in agent_tool_metadata_schema: - match metadata.tool_name: - case ToolName.Google_Drive: - file_ids = list_google_drive_artifacts_file_ids( - session=session, - user_id=agent_schema.user_id, - agent_artifacts=metadata.artifacts, - verbose=True, - ) - session.close() - handle_google_drive_sync( - file_ids=file_ids, agent_id=agent_id, user_id=agent_schema.user_id - ) - case _: - continue diff --git a/src/backend/services/sync/jobs/sync_agent_activity.py b/src/backend/services/sync/jobs/sync_agent_activity.py deleted file mode 100644 index b4a08456f..000000000 --- a/src/backend/services/sync/jobs/sync_agent_activity.py +++ /dev/null @@ -1,69 +0,0 @@ -from backend.config.tools import ToolName -from backend.crud.agent import get_agent_by_id -from backend.crud.agent_tool_metadata import get_all_agent_tool_metadata_by_agent_id -from backend.database_models.database import get_session -from backend.services.logger.utils import LoggerFactory -from backend.services.sync import app -from backend.services.sync.constants import DEFAULT_TIME_OUT -from backend.tools.google_drive import ( - handle_google_drive_activity_event, - query_google_drive_activity, -) -from 
backend.tools.google_drive.sync.consolidation import consolidate - -logger = LoggerFactory().get_logger() - - -@app.task(time_limit=DEFAULT_TIME_OUT) -def sync_agent_activity(agent_id: str): - """ - sync_agent_activity is a job that aims to sync Compass with the agent artifacts recent activity - """ - agent_tool_metadata = [] - session = next(get_session()) - agent = get_agent_by_id(session, agent_id, override_user_id=True) - agent_tool_metadata = get_all_agent_tool_metadata_by_agent_id(session, agent_id) - for metadata in agent_tool_metadata: - try: - match metadata.tool_name: - case ToolName.Google_Drive: - activities = query_google_drive_activity( - session=session, - user_id=agent.user_id, - agent_artifacts=metadata.artifacts, - ) - session.close() - - consolidated_activities = { - key: consolidate(activities=value) - for key, value in activities.items() - } - logger.info( - event=f"Publishing {sum([len(x) for x in consolidated_activities.values()])} activity tasks for agent {agent_id}" - ) - for artifact_id, activity in consolidated_activities.items(): - for activity_item in activity: - event_type = list( - activity_item["primaryActionDetail"].keys() - )[0] - # NOTE: This is an unfortunate hack because the Google APi - # does not provide consistency over the request and response - # format of this action - if event_type == "permissionChange": - event_type = "permission_change" - - handle_google_drive_activity_event( - event_type=event_type, - activity=activity_item, - agent_id=agent_id, - user_id=agent.user_id, - artifact_id=artifact_id, - ) - case _: - continue - except Exception as e: - logger.error( - event=f"Error syncing agent activity for agent {agent_id}", - exception=e, - ) - continue diff --git a/src/backend/services/sync/publish.py b/src/backend/services/sync/publish.py deleted file mode 100644 index 686c834d0..000000000 --- a/src/backend/services/sync/publish.py +++ /dev/null @@ -1,26 +0,0 @@ -from backend.crud.agent import get_agents -from backend.database_models.database import get_session -from backend.schemas.agent import Agent -from backend.services.sync.jobs.sync_agent_activity import sync_agent_activity - -# NOTE Variable to limit the number of agents you are syncing at once -# Helpful for first time setups -LIMIT = None -# LIMIT = 1 - - -def main(): - session = next(get_session()) - agents = [ - Agent.model_validate(x) for x in get_agents(session, override_user_id=True) - ] - if LIMIT: - agents = agents[:LIMIT] - session.close() - - for agent in agents: - sync_agent_activity(agent_id=agent.id) - - -if __name__ == "__main__": - main() diff --git a/src/backend/services/sync/publisher.sh b/src/backend/services/sync/publisher.sh deleted file mode 100755 index 3d75633ea..000000000 --- a/src/backend/services/sync/publisher.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -cd /app -export PYTHONPATH="$PYTHONPATH:/app/src" -/app/.venv/bin/python3 src/backend/services/sync/publish.py \ No newline at end of file diff --git a/src/backend/tests/integration/services/test_compass.py b/src/backend/tests/integration/services/test_compass.py deleted file mode 100644 index 12c5bb2f3..000000000 --- a/src/backend/tests/integration/services/test_compass.py +++ /dev/null @@ -1,27 +0,0 @@ -import os - -import pytest - -from backend.services.compass import Compass - -is_compass_env_set = all( - os.getenv(var) is not None - for var in [ - "COHERE_COMPASS_API_URL", - "COHERE_COMPASS_PARSER_URL", - "COHERE_COMPASS_USERNAME", - "COHERE_COMPASS_PASSWORD", - ] -) - - -@pytest.mark.skipif( - not 
is_compass_env_set, reason="Compass environment variables are not set" -) -def test_compass() -> None: - compass = Compass() - result = compass.invoke( - action=Compass.ValidActions.CREATE_INDEX, parameters={"index": "foobar"} - ) - assert result.result is None - assert result.result is None diff --git a/src/backend/tests/unit/configuration.yaml b/src/backend/tests/unit/configuration.yaml index 8449bcb6a..078ea5339 100644 --- a/src/backend/tests/unit/configuration.yaml +++ b/src/backend/tests/unit/configuration.yaml @@ -19,13 +19,10 @@ tools: enabled_tools: python_interpreter: url: - compass: - api_url: - parser_url: feature_flags: # Experimental features use_experimental_langchain: false - use_agents_view: true + use_agents_view: false # Community features use_community_features: true auth: diff --git a/src/backend/tests/unit/conftest.py b/src/backend/tests/unit/conftest.py index 218fb1977..3e0c6c8ca 100644 --- a/src/backend/tests/unit/conftest.py +++ b/src/backend/tests/unit/conftest.py @@ -205,19 +205,3 @@ def mock_available_model_deployments(request): with patch.dict(AVAILABLE_MODEL_DEPLOYMENTS, MOCKED_DEPLOYMENTS) as mock: yield mock - - -@pytest.fixture -def mock_compass_settings(): - with patch("backend.services.file.Settings") as MockSettings: - mock_settings = MockSettings.return_value - mock_settings.feature_flags.use_compass_file_storage = os.getenv( - "ENABLE_COMPASS_FILE_STORAGE", "False" - ).lower() in ("true", "1") - mock_settings.tools.compass.api_url = os.getenv("COHERE_COMPASS_API_URL") - mock_settings.tools.compass.api_parser_url = os.getenv( - "COHERE_COMPASS_API_PARSER_URL" - ) - mock_settings.tools.compass.username = os.getenv("COHERE_COMPASS_USERNAME") - mock_settings.tools.compass.password = os.getenv("COHERE_COMPASS_PASSWORD") - yield mock_settings diff --git a/src/backend/tests/unit/routers/test_chat.py b/src/backend/tests/unit/routers/test_chat.py index fdcead931..8f5bf3535 100644 --- a/src/backend/tests/unit/routers/test_chat.py +++ b/src/backend/tests/unit/routers/test_chat.py @@ -1009,7 +1009,6 @@ def test_streaming_chat_with_files( session_client_chat: TestClient, session_chat: Session, user: User, - mock_compass_settings, ): # Create convo conversation = get_factory("Conversation", session_chat).create(user_id=user.id) diff --git a/src/backend/tests/unit/routers/test_conversation.py b/src/backend/tests/unit/routers/test_conversation.py index 89274db89..9adc5e175 100644 --- a/src/backend/tests/unit/routers/test_conversation.py +++ b/src/backend/tests/unit/routers/test_conversation.py @@ -495,7 +495,7 @@ def test_search_conversations_missing_user_id( # FILES def test_list_files( - session_client: TestClient, session: Session, user: User, mock_compass_settings + session_client: TestClient, session: Session, user: User ) -> None: conversation = get_factory("Conversation", session).create(user_id=user.id) files = [ @@ -531,7 +531,7 @@ def test_list_files( def test_list_files_no_files( - session_client: TestClient, session: Session, user: User, mock_compass_settings + session_client: TestClient, session: Session, user: User ) -> None: conversation = get_factory("Conversation", session).create(user_id=user.id) response = session_client.get( @@ -544,7 +544,7 @@ def test_list_files_no_files( def test_list_files_missing_user_id( - session_client: TestClient, session: Session, user: User, mock_compass_settings + session_client: TestClient, session: Session, user: User ) -> None: conversation = get_factory("Conversation", session).create(user_id=user.id) response = 
session_client.get(f"/v1/conversations/{conversation.id}/files") @@ -554,7 +554,7 @@ def test_list_files_missing_user_id( def test_batch_upload_file_existing_conversation( - session_client: TestClient, session: Session, user, mock_compass_settings + session_client: TestClient, session: Session, user ) -> None: file_paths = { "Mariana_Trench.pdf": "src/backend/tests/unit/test_data/Mariana_Trench.pdf", @@ -595,7 +595,7 @@ def test_batch_upload_file_existing_conversation( def test_batch_upload_file_nonexistent_conversation_creates_new_conversation( - session_client: TestClient, session: Session, user, mock_compass_settings + session_client: TestClient, session: Session, user ) -> None: file_paths = { "Mariana_Trench.pdf": "src/backend/tests/unit/test_data/Mariana_Trench.pdf", @@ -649,7 +649,7 @@ def test_batch_upload_file_nonexistent_conversation_creates_new_conversation( def test_batch_upload_file_nonexistent_conversation_fails_if_user_id_not_provided( - session_client: TestClient, session: Session, user: User, mock_compass_settings + session_client: TestClient, session: Session, user: User ) -> None: file_paths = { "Mariana_Trench.pdf": "src/backend/tests/unit/test_data/Mariana_Trench.pdf", @@ -672,7 +672,6 @@ def test_delete_file( session_client: TestClient, session: Session, user: User, - mock_compass_settings, ) -> None: conversation = get_factory("Conversation", session).create(user_id=user.id) files = [ @@ -722,7 +721,7 @@ def test_delete_file( def test_fail_delete_nonexistent_file( - session_client: TestClient, session: Session, user: User, mock_compass_settings + session_client: TestClient, session: Session, user: User ) -> None: conversation = get_factory("Conversation", session).create(user_id=user.id) response = session_client.delete( @@ -735,7 +734,7 @@ def test_fail_delete_nonexistent_file( def test_fail_delete_file_missing_user_id( - session_client: TestClient, session: Session, user: User, mock_compass_settings + session_client: TestClient, session: Session, user: User ) -> None: conversation = get_factory("Conversation", session).create(user_id=user.id) file = get_factory("File", session).create( diff --git a/src/backend/tests/unit/secrets.yaml b/src/backend/tests/unit/secrets.yaml index c7222ffec..acd6e6f5e 100644 --- a/src/backend/tests/unit/secrets.yaml +++ b/src/backend/tests/unit/secrets.yaml @@ -20,9 +20,6 @@ tools: api_key: wolfram_alpha: app_id: - compass: - username: - password: google_drive: client_id: client_secret: diff --git a/src/backend/tools/base.py b/src/backend/tools/base.py index 5f1664688..d16170967 100644 --- a/src/backend/tools/base.py +++ b/src/backend/tools/base.py @@ -151,3 +151,9 @@ def insert_tool_auth_cache(self, user_id: str, tool_id: str) -> str: cache_put(key, payload) return key + + +class ToolAuthException(Exception): + def __init__(self, message, tool_id: str): + self.message = message + self.tool_id = tool_id diff --git a/src/backend/tools/files.py b/src/backend/tools/files.py index 86f181ce9..6df05fc0e 100644 --- a/src/backend/tools/files.py +++ b/src/backend/tools/files.py @@ -2,89 +2,15 @@ from typing import Any, Dict, List import backend.crud.file as file_crud -from backend.compass_sdk import SearchFilter -from backend.config.settings import Settings -from backend.services.compass import Compass -from backend.services.file import get_compass from backend.tools.base import BaseTool class FileToolsArtifactTypes(StrEnum): local_file = "file" - -def compass_file_search( - file_ids: List[str], - conversation_id: str, - agent_id: str, - query: 
str, - search_limit: int = 5, -) -> List[Dict[str, Any]]: - results = [] - - # Note: Compass search currently has an issue where the type of the context is not directly referenced - # Temporarily add `.keyword` to workaround this issue. - search_filters = [ - SearchFilter( - field="content.file_id.keyword", - type=SearchFilter.FilterType.EQ, - value=file_id, - ) - for file_id in file_ids - ] - - compass = get_compass() - - # Search conversation ID index - search_results = compass.invoke( - action=Compass.ValidActions.SEARCH, - parameters={ - "index": conversation_id, - "query": query, - "top_k": search_limit, - "filters": search_filters, - }, - ) - - if search_results.result: - results.extend(search_results.result["hits"]) - - # Search agent ID index - if agent_id: - search_results = compass.invoke( - action=Compass.ValidActions.SEARCH, - parameters={ - "index": agent_id, - "query": query, - "top_k": search_limit, - "filters": search_filters, - }, - ) - - if search_results.result: - results.extend(search_results.result["hits"]) - - chunks = sorted( - [ - { - "text": chunk["content"]["text"], - "score": chunk["score"], - "url": result["content"].get("file_name", ""), - "title": result["content"].get("file_name", ""), - } - for result in results - for chunk in result["chunks"] - ], - key=lambda x: x["score"], - reverse=True, - )[:search_limit] - - return chunks - - class ReadFileTool(BaseTool): """ - This class reads a file from the file system. + Tool to read a file from the file system. """ NAME = "read_document" @@ -103,37 +29,25 @@ async def call(self, parameters: dict, **kwargs: Any) -> List[Dict[str, Any]]: session = kwargs.get("session") user_id = kwargs.get("user_id") - agent_id = kwargs.get("agent_id") - conversation_id = kwargs.get("conversation_id") if not file: return [] _, file_id = file - if Settings().feature_flags.use_compass_file_storage: - return compass_file_search( - [file_id], - conversation_id, - agent_id, - "*", - search_limit=self.SEARCH_LIMIT, - ) - else: - retrieved_file = file_crud.get_file(session, file_id, user_id) - if not retrieved_file: - return [] - - return [ - { - "text": retrieved_file.file_content, - "title": retrieved_file.file_name, - "url": retrieved_file.file_name, - } - ] + retrieved_file = file_crud.get_file(session, file_id, user_id) + if not retrieved_file: + return [] + return [ + { + "text": retrieved_file.file_content, + "title": retrieved_file.file_name, + "url": retrieved_file.file_name, + } + ] class SearchFileTool(BaseTool): """ - This class searches for a query in a file. + Tool to query a list of files. 
""" NAME = "search_file" @@ -153,8 +67,6 @@ async def call( query = parameters.get("search_query") files = parameters.get("files") - agent_id = kwargs.get("agent_id") - conversation_id = kwargs.get("conversation_id") session = kwargs.get("session") user_id = kwargs.get("user_id") @@ -162,26 +74,17 @@ async def call( return [] file_ids = [file_id for _, file_id in files] - if Settings().feature_flags.use_compass_file_storage: - return compass_file_search( - file_ids, - conversation_id, - agent_id, - query, - search_limit=self.SEARCH_LIMIT, + retrieved_files = file_crud.get_files_by_ids(session, file_ids, user_id) + if not retrieved_files: + return [] + + results = [] + for file in retrieved_files: + results.append( + { + "text": file.file_content, + "title": file.file_name, + "url": file.file_name, + } ) - else: - retrieved_files = file_crud.get_files_by_ids(session, file_ids, user_id) - if not retrieved_files: - return [] - - results = [] - for file in retrieved_files: - results.append( - { - "text": file.file_content, - "title": file.file_name, - "url": file.file_name, - } - ) - return results + return results diff --git a/src/backend/tools/google_drive/__init__.py b/src/backend/tools/google_drive/__init__.py index 136656772..cfe087ab2 100644 --- a/src/backend/tools/google_drive/__init__.py +++ b/src/backend/tools/google_drive/__init__.py @@ -1,39 +1,11 @@ from backend.tools.google_drive.auth import GoogleDriveAuth from backend.tools.google_drive.constants import ( GOOGLE_DRIVE_TOOL_ID, - GoogleDriveActions, -) -from backend.tools.google_drive.sync import ( - handle_google_drive_sync, - list_google_drive_artifacts_file_ids, -) -from backend.tools.google_drive.sync.actions import ( - create, - delete, - edit, - move, - permission_change, - rename, -) -from backend.tools.google_drive.sync.activity import ( - handle_google_drive_activity_event, - query_google_drive_activity, ) from backend.tools.google_drive.tool import GoogleDrive __all__ = [ "GoogleDriveAuth", "GoogleDrive", - "handle_google_drive_activity_event", "GOOGLE_DRIVE_TOOL_ID", - "GoogleDriveActions", - "query_google_drive_activity", - "list_google_drive_artifacts_file_ids", - "handle_google_drive_sync", - "create", - "delete", - "edit", - "move", - "permission_change", - "rename", ] diff --git a/src/backend/tools/google_drive/constants.py b/src/backend/tools/google_drive/constants.py index fb27f109a..4f6ee656a 100644 --- a/src/backend/tools/google_drive/constants.py +++ b/src/backend/tools/google_drive/constants.py @@ -1,5 +1,3 @@ -from enum import Enum - CSV_MIMETYPE = "text/csv" TEXT_MIMETYPE = "text/plain" SEARCH_LIMIT = 10 @@ -32,13 +30,3 @@ DOC_FIELDS = "id, name, mimeType, webViewLink, exportLinks, shortcutDetails, trashed, parents, fileExtension, permissions" GOOGLE_DRIVE_TOOL_ID = "google_drive" - - -class GoogleDriveActions(Enum): - CREATE = "create" - EDIT = "edit" - MOVE = "move" - RENAME = "rename" - DELETE = "delete" - RESTORE = "restore" - PERMISSION_CHANGE = "permission_change" diff --git a/src/backend/tools/google_drive/sync/__init__.py b/src/backend/tools/google_drive/sync/__init__.py deleted file mode 100644 index 32a633077..000000000 --- a/src/backend/tools/google_drive/sync/__init__.py +++ /dev/null @@ -1,149 +0,0 @@ -from typing import Any, Dict, List, Optional - -from sqlalchemy.orm import Session - -from backend.services.logger.utils import LoggerFactory -from backend.tools.google_drive.auth import GoogleDriveAuth -from backend.tools.google_drive.constants import FOLDER_MIME_TYPE, SEARCH_MIME_TYPES -from 
backend.tools.google_drive.sync.actions import create -from backend.tools.google_drive.sync.utils import get_service -from backend.tools.google_drive.tool import GoogleDrive - -logger = LoggerFactory().get_logger() - - -def handle_google_drive_sync( - file_ids: List[str], agent_id: str, user_id: str, **kwargs -): - index_name = "{}_{}".format( - agent_id if agent_id is not None else user_id, GoogleDrive.NAME - ) - [ - create.apply_async( - args=[file_id, index_name, user_id, agent_id], - kwargs={ - "artifact_id": "", - "skip_file_exists": True, - **kwargs, - }, - ) - for file_id in file_ids - ] - - -def list_google_drive_artifacts_file_ids( - session: Session, user_id: str, agent_artifacts: List[Dict[str, str]], verbose=False -): - gdrive_auth = GoogleDriveAuth() - if gdrive_auth.is_auth_required(session, user_id=user_id): - raise Exception( - "Sync GDrive Error: Agent creator credentials need to re-authenticate" - ) - - agent_creator_auth_token = gdrive_auth.get_token(session=session, user_id=user_id) - if agent_creator_auth_token is None: - raise Exception("Sync GDrive Error: No agent creator credentials found") - - (service,) = ( - get_service(api="drive", user_id=user_id)[key] for key in ("service",) - ) - - folder_artifacts = [x for x in agent_artifacts if x["type"] == "folder"] - file_artifacts = [x for x in agent_artifacts if x["type"] != "folder"] - - file_ids = [] - for folder_artifact in folder_artifacts: - file_ids += _recursively_list_google_drive_artifact_file_ids( - service=service, - user_id=user_id, - artifact_id=folder_artifact["id"], - verbose=verbose, - ) - - return [*file_ids, *[x["id"] for x in file_artifacts]] - - -def _recursively_list_google_drive_artifact_file_ids( - service: Any, - user_id: str, - artifact_id: str, - next_page_token: Optional[str] = None, - verbose=False, -): - if verbose: - logger.info( - event="[list_google_drive_artifacts_file_ids] Fetching artifacts for", - artifact_id=artifact_id, - ) - - # fetch files and folders - conditions = [ - "(" - + " or ".join( - [ - f"mimeType = '{mime_type}'" - for mime_type in [*SEARCH_MIME_TYPES, FOLDER_MIME_TYPE] - ] - ) - + ")", - "'{}' in parents".format(artifact_id), - ] - q = " and ".join(conditions) - - fields = "nextPageToken, files(id, mimeType)" - response = ( - service.files() - .list( - q=q, - includeItemsFromAllDrives=True, - supportsAllDrives=True, - fields=fields, - pageToken=next_page_token, - ) - .execute() - ) - - artifacts = response["files"] if response else [] - folder_artifacts = [x["id"] for x in artifacts if x["mimeType"] == FOLDER_MIME_TYPE] - file_artifacts = [x["id"] for x in artifacts if x["mimeType"] != FOLDER_MIME_TYPE] - - if response_next_page_token := response.get("nextPageToken", None): - artifacts = [ - # existing files - *file_artifacts, - # same folder artifact with next page token - *_recursively_list_google_drive_artifact_file_ids( - service=service, - user_id=user_id, - artifact_id=artifact_id, - next_page_token=response_next_page_token, - verbose=verbose, - ), - # new folder artifacts with next page token - *[ - x - for new_artifact_id in folder_artifacts - for x in _recursively_list_google_drive_artifact_file_ids( - service=service, - user_id=user_id, - artifact_id=new_artifact_id, - verbose=verbose, - ) - ], - ] - - return [ - # existing files - *file_artifacts, - # new folder artifacts with next page token - *[ - x - for new_artifact_id in folder_artifacts - for x in _recursively_list_google_drive_artifact_file_ids( - service=service, - user_id=user_id, - 
artifact_id=new_artifact_id, - verbose=verbose, - ) - ], - ] diff --git a/src/backend/tools/google_drive/sync/actions/__init__.py b/src/backend/tools/google_drive/sync/actions/__init__.py deleted file mode 100644 index 2c8f8bf68..000000000 --- a/src/backend/tools/google_drive/sync/actions/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from backend.tools.google_drive.sync.actions.create import create -from backend.tools.google_drive.sync.actions.delete import delete -from backend.tools.google_drive.sync.actions.edit import edit -from backend.tools.google_drive.sync.actions.move import move -from backend.tools.google_drive.sync.actions.permission_change import permission_change -from backend.tools.google_drive.sync.actions.rename import rename - -__all__ = [ - "create", - "delete", - "edit", - "move", - "permission_change", - "rename", -] diff --git a/src/backend/tools/google_drive/sync/actions/create.py b/src/backend/tools/google_drive/sync/actions/create.py deleted file mode 100644 index 0d1b5b6a4..000000000 --- a/src/backend/tools/google_drive/sync/actions/create.py +++ /dev/null @@ -1,130 +0,0 @@ -import time - -from backend.services.compass import get_compass -from backend.services.logger.utils import LoggerFactory -from backend.services.sync import app -from backend.services.sync.constants import DEFAULT_TIME_OUT, Status -from backend.tools.google_drive.sync.actions.utils import ( - check_if_file_exists_in_artifact, - get_file_details, -) -from backend.tools.google_drive.sync.utils import persist_agent_task - -ACTION_NAME = "create" -logger = LoggerFactory().get_logger() - - -@app.task(time_limit=DEFAULT_TIME_OUT, bind=True) -@persist_agent_task -def create(self, file_id: str, index_name: str, user_id: str, agent_id: str, **kwargs): - # check if file exists - # NOTE Important when a file has a move and create action - artifact_id = kwargs["artifact_id"] - file_details = get_file_details(file_id=file_id, user_id=user_id, just_title=True) - if file_details is None: - err_msg = f"empty file details for file_id: {file_id}" - raise Exception(err_msg) - - title = file_details["title"] - if not kwargs.get("skip_file_exists"): - exists = check_if_file_exists_in_artifact( - file_id=file_id, - artifact_id=artifact_id, - user_id=user_id, - title=title, - ) - if not exists: - err_msg = f"{file_id} does not exist agent_id" - raise Exception(err_msg) - - # Get file bytes, web view link, title - file_details = get_file_details( - file_id=file_id, user_id=user_id, include_permissions=True - ) - if not file_details: - err_msg = f"Error creating file {file_id} with link on Compass. File details could not be parsed" - raise Exception(err_msg) - file_bytes, web_view_link, extension, permissions = ( - file_details[key] - for key in ("file_bytes", "web_view_link", "extension", "permissions") - ) - if not file_bytes: - err_msg = f"Error creating file {file_id} with link: {web_view_link} on Compass. 
File bytes could not be parsed" - raise Exception(err_msg) - - file_meta = file_details.copy() - del file_meta["file_bytes"] - - compass = get_compass() - try: - # idempotent create index - logger.info( - event="[Google Drive Create] Initiating Compass create_index action for index", - index_name=index_name, - ) - compass.invoke( - compass.ValidActions.CREATE_INDEX, - { - "index": index_name, - }, - ) - logger.info( - event="[Google Drive Create] Finished Compass create_index action for index", - index_name=index_name, - ) - # Create or replace doc (if already exists) - logger.info( - event="[Google Drive Create] Initiating Compass create action for file", - web_view_link=web_view_link, - ) - compass.invoke( - compass.ValidActions.CREATE, - { - "index": index_name, - "file_id": file_id, - "file_bytes": file_bytes, - "file_extension": extension, - }, - ) - logger.info( - event="[Google Drive Create] Finished Compass create action for file", - web_view_link=web_view_link, - ) - logger.info( - event="[Google Drive Create] Initiating Compass add context for file", - web_view_link=web_view_link, - ) - # Add title and url context - compass.invoke( - compass.ValidActions.ADD_CONTEXT, - { - "index": index_name, - "file_id": file_id, - "context": { - "url": web_view_link, - "title": title, - "last_updated": int(time.time()), - "permissions": permissions, - }, - }, - ) - logger.info( - event="[Google Drive Create] Finished Compass add context action for file", - web_view_link=web_view_link, - ) - except Exception as error: - logger.info( - event="[Google Drive Create] Errors indexing on compass", - web_view_link=web_view_link, - error=str(error), - ) - err_msg = f"Error creating file {file_id} with link: {web_view_link} on Compass: {error}" - raise Exception(err_msg) - - action_name = kwargs.get("action_name_override", ACTION_NAME) - return { - "action": action_name, - "status": Status.SUCCESS.value, - "file_id": file_id, - **file_meta, - } diff --git a/src/backend/tools/google_drive/sync/actions/delete.py b/src/backend/tools/google_drive/sync/actions/delete.py deleted file mode 100644 index 50def5b25..000000000 --- a/src/backend/tools/google_drive/sync/actions/delete.py +++ /dev/null @@ -1,49 +0,0 @@ -from backend.services.compass import get_compass -from backend.services.logger.utils import LoggerFactory -from backend.services.sync import app -from backend.services.sync.constants import DEFAULT_TIME_OUT, Status -from backend.tools.google_drive.sync.utils import persist_agent_task - -ACTION_NAME = "delete" -logger = LoggerFactory().get_logger() - - -@app.task(time_limit=DEFAULT_TIME_OUT, bind=True) -@persist_agent_task -def delete(self, file_id: str, index_name: str, user_id: str, agent_id: str, **kwargs): - compass = get_compass() - - try: - # Delete document - logger.info( - event="[Google Drive Delete] Initiating Compass delete for file", - file_id=file_id, - ) - compass.invoke( - compass.ValidActions.DELETE, - { - "index": index_name, - "file_id": file_id, - }, - ) - logger.info( - event="[Google Drive Delete] Finished Compass delete action for file", - file_id=file_id, - ) - return { - "action": ACTION_NAME, - "status": Status.SUCCESS.value, - "file_id": file_id, - } - except Exception as error: - logger.error( - event="Failed to delete document in Compass for file", - user_id=user_id, - agent_id=agent_id, - index_name=index_name, - file_id=file_id, - ) - err_msg = ( - f"Error deleting file {file_id} for agent {agent_id} on Compass: {error}" - ) - raise Exception(err_msg) diff --git 
a/src/backend/tools/google_drive/sync/actions/edit.py b/src/backend/tools/google_drive/sync/actions/edit.py deleted file mode 100644 index 8d5620085..000000000 --- a/src/backend/tools/google_drive/sync/actions/edit.py +++ /dev/null @@ -1,102 +0,0 @@ -import time - -from backend.services.compass import get_compass -from backend.services.logger.utils import LoggerFactory -from backend.services.sync import app -from backend.services.sync.constants import DEFAULT_TIME_OUT, Status -from backend.tools.google_drive.sync.actions.utils import ( - check_if_file_exists_in_artifact, - get_file_details, -) -from backend.tools.google_drive.sync.utils import persist_agent_task - -ACTION_NAME = "edit" -logger = LoggerFactory().get_logger() - - -@app.task(time_limit=DEFAULT_TIME_OUT, bind=True) -@persist_agent_task -def edit(file_id: str, index_name: str, user_id: str, agent_id: str, **kwargs): - # check if file exists - # NOTE Important when a file has a move and create action - artifact_id = kwargs["artifact_id"] - file_details = get_file_details(file_id=file_id, user_id=user_id, just_title=True) - if file_details is None: - err_msg = f"empty file details for file_id: {file_id}, agent_id: {agent_id}" - raise Exception(err_msg) - - title = file_details["title"] - exists = check_if_file_exists_in_artifact( - file_id=file_id, - artifact_id=artifact_id, - user_id=user_id, - title=title, - ) - if not exists: - err_msg = f"{file_id} does not exist" - raise Exception(err_msg) - - # Get file bytes, web view link, title - file_details = get_file_details(file_id=file_id, user_id=user_id) - if file_details is None: - err_msg = f"empty file details for file_id: {file_id}" - raise Exception(err_msg) - file_bytes, web_view_link, extension, permissions = ( - file_details[key] - for key in ("file_bytes", "web_view_link", "extension", "permissions") - ) - if not file_bytes: - err_msg = f"Error creating file {file_id} with link: {web_view_link} on Compass. File bytes could not be parsed." 
- raise Exception(err_msg) - - compass = get_compass() - try: - # Update doc - logger.info( - event="[Google Drive Edit] Initiating Compass update action for file", - web_view_link=web_view_link, - ) - compass.invoke( - compass.ValidActions.UPDATE, - { - "index": index_name, - "file_id": file_id, - "file_bytes": file_bytes, - "file_extension": extension, - }, - ) - logger.info( - event="[Google Drive Edit] Finished Compass update action for file", - web_view_link=web_view_link, - ) - logger.info( - event="[Google Drive Edit] Initiating Compass add context for file", - web_view_link=web_view_link, - ) - # Update last_updated - compass.invoke( - compass.ValidActions.ADD_CONTEXT, - { - "index": index_name, - "file_id": file_id, - "context": { - "url": web_view_link, - "title": title, - "last_updated": int(time.time()), - "permissions": permissions, - }, - }, - ) - logger.info( - event="[Google Drive Edit] Finished Compass add context action for file", - web_view_link=web_view_link, - ) - except Exception as error: - logger.info( - event="[Google Drive Edit] Failed to edit document in Compass for file", - web_view_link=web_view_link, - ) - err_msg = f"Error editing file {file_id} with link: {web_view_link} on Compass: {error}" - raise Exception(err_msg) - - return {"action": ACTION_NAME, "status": Status.SUCCESS.value, "file_id": file_id} diff --git a/src/backend/tools/google_drive/sync/actions/move.py b/src/backend/tools/google_drive/sync/actions/move.py deleted file mode 100644 index 53089063b..000000000 --- a/src/backend/tools/google_drive/sync/actions/move.py +++ /dev/null @@ -1,78 +0,0 @@ -from backend.services.compass import get_compass -from backend.services.logger.utils import LoggerFactory -from backend.services.sync import app -from backend.services.sync.constants import DEFAULT_TIME_OUT, Status -from backend.tools.google_drive.sync.actions.utils import ( - check_if_file_exists_in_artifact, - get_file_details, -) -from backend.tools.google_drive.sync.utils import persist_agent_task - -ACTION_NAME = "move" -logger = LoggerFactory().get_logger() - - -@app.task(time_limit=DEFAULT_TIME_OUT, bind=True) -@persist_agent_task -def move(file_id: str, index_name: str, user_id: str, agent_id: str, **kwargs): - artifact_id = kwargs["artifact_id"] - if artifact_id == file_id: - return { - "action": ACTION_NAME, - "status": Status.CANCELLED.value, - "file_id": file_id, - } - - file_details = get_file_details(file_id=file_id, user_id=user_id, just_title=True) - if file_details is None: - err_msg = f"empty file details for file_id: {file_id}" - raise Exception(err_msg) - - file_meta = file_details.copy() - del file_meta["file_bytes"] - - title = file_details["title"] - exists = check_if_file_exists_in_artifact( - file_id=file_id, - artifact_id=artifact_id, - user_id=user_id, - title=title, - ) - compass = get_compass() - if exists: - err_msg = f"file already exists: {file_id}" - raise Exception(err_msg) - - # Delete file if moved out of agent's artifacts - try: - logger.info( - event="[Google Drive Move] Initiating Compass delete action for file_id", - file_id=file_id, - ) - compass.invoke( - compass.ValidActions.DELETE, - { - "index": index_name, - "file_id": file_id, - }, - ) - logger.info( - event="[Google Drive Move] Finished Compass delete action for file_id", - file_id=file_id, - ) - return { - "action": ACTION_NAME, - "status": Status.SUCCESS.value, - "file_id": file_id, - **file_meta, - } - except Exception as error: - logger.error( - event="Failed to delete document in Compass", - 
user_id=user_id, - index_name=index_name, - file_id=file_id, - agent_id=agent_id, - ) - err_msg = f"Error deleting file {file_id} on Compass: {error}" - raise Exception(err_msg) diff --git a/src/backend/tools/google_drive/sync/actions/permission_change.py b/src/backend/tools/google_drive/sync/actions/permission_change.py deleted file mode 100644 index f83ffa752..000000000 --- a/src/backend/tools/google_drive/sync/actions/permission_change.py +++ /dev/null @@ -1,80 +0,0 @@ -import time - -from backend.services.compass import get_compass -from backend.services.logger.utils import LoggerFactory -from backend.services.sync import app -from backend.services.sync.constants import DEFAULT_TIME_OUT, Status -from backend.tools.google_drive.sync.actions.utils import ( - check_if_file_exists_in_artifact, - get_file_details, - list_permissions, -) -from backend.tools.google_drive.sync.utils import persist_agent_task - -ACTION_NAME = "permission_change" -logger = LoggerFactory().get_logger() - - -@app.task(time_limit=DEFAULT_TIME_OUT, bind=True) -@persist_agent_task -def permission_change( - file_id: str, index_name: str, user_id: str, agent_id: str, **kwargs -): - # check if file exists - # NOTE Important when a file has a move and permission_change action - artifact_id = kwargs["artifact_id"] - file_details = get_file_details(file_id=file_id, user_id=user_id, just_title=True) - if file_details is None: - err_msg = f"empty file details for file_id: {file_id}" - raise Exception(err_msg) - - title = file_details["title"] - exists = check_if_file_exists_in_artifact( - file_id=file_id, - artifact_id=artifact_id, - user_id=user_id, - title=title, - ) - if not exists: - err_msg = f"{file_id} does not exist" - raise Exception(err_msg) - - permissions = list_permissions(file_id=file_id, user_id=user_id) - compass = get_compass() - - # Update permissions array - logger.info( - event="[Google Drive Permission Change] Initiating Compass add_context action for file", - file_id=file_id, - ) - try: - compass.invoke( - compass.ValidActions.ADD_CONTEXT, - { - "index": index_name, - "file_id": file_id, - "context": { - "last_updated": int(time.time()), - "permissions": permissions, - }, - }, - ) - logger.info( - event="[Google Drive Permission Change] Finished Compass add_context action for file", - file_id=file_id, - ) - return { - "action": ACTION_NAME, - "status": Status.SUCCESS.value, - "file_id": file_id, - } - except Exception as error: - logger.error( - event="Failed to update permissions in Compass for file", - user_id=user_id, - agent_id=agent_id, - index_name=index_name, - file_id=file_id, - ) - err_msg = f"Error updating permissions for file {file_id} on Compass: {error}" - raise Exception(err_msg) diff --git a/src/backend/tools/google_drive/sync/actions/rename.py b/src/backend/tools/google_drive/sync/actions/rename.py deleted file mode 100644 index 2df2adbc1..000000000 --- a/src/backend/tools/google_drive/sync/actions/rename.py +++ /dev/null @@ -1,60 +0,0 @@ -import time - -from backend.services.compass import get_compass -from backend.services.logger.utils import LoggerFactory -from backend.services.sync import app -from backend.services.sync.constants import DEFAULT_TIME_OUT, Status -from backend.tools.google_drive.sync.actions.utils import get_file_details -from backend.tools.google_drive.sync.utils import persist_agent_task - -ACTION_NAME = "rename" -logger = LoggerFactory().get_logger() - - -@app.task(time_limit=DEFAULT_TIME_OUT, bind=True) -@persist_agent_task -def rename(file_id: str, index_name: 
str, user_id: str, agent_id: str, **kwargs): - file_details = get_file_details(file_id=file_id, user_id=user_id, just_title=True) - if file_details is None: - err_msg = f"empty file details for file_id: {file_id}" - raise Exception(err_msg) - - title = file_details["title"] - compass = get_compass() - - # Modify title - logger.info( - event="[Google Drive Rename] Initiating Compass add context for file", - file_id=file_id, - ) - try: - compass.invoke( - compass.ValidActions.ADD_CONTEXT, - { - "index": index_name, - "file_id": file_id, - "context": { - "title": title, - "last_updated": int(time.time()), - }, - }, - ) - logger.info( - event="[Google Drive Rename] Finished Compass add context action for file", - file_id=file_id, - ) - return { - "action": ACTION_NAME, - "status": Status.SUCCESS.value, - "file_id": file_id, - } - except Exception as error: - logger.error( - event="Failed to rename document in Compass for file", - user_id=user_id, - index_name=index_name, - file_id=file_id, - agent_id=agent_id, - ) - err_msg = f"Error renaming file {file_id} on Compass: {error}" - raise Exception(err_msg) diff --git a/src/backend/tools/google_drive/sync/actions/utils.py b/src/backend/tools/google_drive/sync/actions/utils.py deleted file mode 100644 index cde5cfda6..000000000 --- a/src/backend/tools/google_drive/sync/actions/utils.py +++ /dev/null @@ -1,140 +0,0 @@ -from typing import Dict, List, Optional - -from backend.services.logger.utils import LoggerFactory -from backend.services.sync.env import env -from backend.tools.google_drive.constants import ( - NATIVE_SEARCH_MIME_TYPES, - SEARCH_MIME_TYPES, -) -from backend.tools.google_drive.sync.utils import ( - extract_export_link, - extract_file_extension, - extract_title, - extract_web_view_link, - get_service, - perform_get_single, - perform_non_native_single, - process_shortcut_file, -) -from backend.tools.utils import download - -logger = LoggerFactory().get_logger() - - -def get_file_details( - file_id: str, user_id: str, include_permissions=False, just_title=False -): - """ - Return file bytes, web view link and title - """ - # get service - service, creds = ( - get_service(api="drive", user_id=user_id)[key] for key in ("service", "creds") - ) - - # get file details - file_get = perform_get_single(file_id=file_id, user_id=user_id) - if file_get["trashed"]: - return None - - processed_file = process_shortcut_file(service, file_get) - if processed_file["mimeType"] not in SEARCH_MIME_TYPES: - return None - - extension = extract_file_extension(processed_file) - web_view_link = extract_web_view_link(processed_file) - title = extract_title(processed_file) - - if just_title: - return {"title": title} - - # get file content bytes - file_bytes = None - if processed_file["mimeType"] in NATIVE_SEARCH_MIME_TYPES: - # native files - export_link = extract_export_link(processed_file) - if export_link: - file_text = download.perform_single( - httpx_client=env().HTTPX_CLIENT, - url=export_link, - access_token=creds.token, - ) - file_bytes = file_text.encode() - else: - # non-native files - file_bytes = perform_non_native_single( - service=service, file_id=processed_file["id"] - ) - - permissions = [] - if include_permissions: - permissions = list_permissions(file_id=file_id, user_id=user_id) - - return { - "file_bytes": file_bytes, - "extension": extension, - "web_view_link": web_view_link, - "title": title, - "permissions": permissions, - } - - -def list_permissions(file_id: str, user_id: str, next_page_token: Optional[str] = None): - (service,) = ( - 
get_service(api="drive", user_id=user_id)[key] for key in ("service",) - ) - response = ( - service.permissions() - .list( - fileId=file_id, - supportsAllDrives=True, - pageToken=next_page_token, - fields="permissions(emailAddress, type, domain), nextPageToken", - ) - .execute() - ) - if response_next_page_token := response.get("nextPageToken", None): - return [ - *_format_permissions(response.get("permissions", [])), - *list_permissions( - file_id=file_id, - user_id=user_id, - next_page_token=response_next_page_token, - ), - ] - return _format_permissions(response.get("permissions", [])) - - -def check_if_file_exists_in_artifact( - file_id: str, artifact_id: str, user_id: str, title: str -): - (service,) = ( - get_service(api="drive", user_id=user_id)[key] for key in ("service",) - ) - response = ( - service.files() - .list( - q="'{}' in parents and name = '{}'".format(artifact_id, title), - includeItemsFromAllDrives=True, - supportsAllDrives=True, - ) - .execute() - ) - - if files := response.get("files", None): - found_file = [x for x in files if x["id"] == file_id] - if found_file: - return True - return False - - -def _format_permissions(permissions: List[Dict[str, str]]): - return [ - *[ - {"id": x["emailAddress"], "type": x["type"]} - for x in permissions - if "emailAddress" in x - ], - *[{"id": x["domain"], "type": x["type"]} for x in permissions if "domain" in x], - *[{"id": x["group"], "type": x["type"]} for x in permissions if "group" in x], - ] diff --git a/src/backend/tools/google_drive/sync/activity.py b/src/backend/tools/google_drive/sync/activity.py deleted file mode 100644 index 9bc9e3a09..000000000 --- a/src/backend/tools/google_drive/sync/activity.py +++ /dev/null @@ -1,205 +0,0 @@ -from concurrent import futures -from typing import Any, Dict, List, Optional - -from sqlalchemy.orm import Session - -from backend.tools.google_drive.auth import GoogleDriveAuth -from backend.tools.google_drive.constants import ( - ACTIVITY_TRACKING_WINDOW, - GoogleDriveActions, -) -from backend.tools.google_drive.sync.actions import ( - create, - delete, - edit, - move, - permission_change, - rename, -) -from backend.tools.google_drive.sync.utils import ( - extract_file_ids_from_target, - get_current_timestamp_in_ms, - get_service, -) -from backend.tools.google_drive.tool import GoogleDrive - -RESTORE_ACTION_NAME="restore" - -def handle_google_drive_activity_event( - event_type: str, activity: Dict[str, str], agent_id: str, user_id: str, **kwargs -): - index_name = "{}_{}".format( - agent_id if agent_id is not None else user_id, GoogleDrive.NAME - ) - file_ids = extract_file_ids_from_target(activity=activity) - if not file_ids: - return - - match event_type: - case GoogleDriveActions.CREATE.value: - [ - create.apply_async( - args=[file_id, index_name, user_id, agent_id], - kwargs={ - "artifact_id": kwargs["artifact_id"], - **kwargs, - }, - ) - for file_id in file_ids - ] - case GoogleDriveActions.EDIT.value: - [ - edit.apply_async( - args=[file_id, index_name, user_id, agent_id], - kwargs={ - "artifact_id": kwargs["artifact_id"], - **kwargs, - }, - ) - for file_id in file_ids - ] - case GoogleDriveActions.MOVE.value: - [ - move.apply_async( - args=[file_id, index_name, user_id, agent_id], - kwargs={ - "artifact_id": kwargs["artifact_id"], - **kwargs, - }, - ) - for file_id in file_ids - ] - case GoogleDriveActions.RENAME.value: - [ - rename.apply_async( - args=[file_id, index_name, user_id, agent_id], - kwargs={ - **kwargs, - }, - ) - for file_id in file_ids - ] - case 
GoogleDriveActions.DELETE.value: - [ - delete.apply_async( - args=[file_id, index_name, user_id, agent_id], - kwargs=kwargs, - ) - for file_id in file_ids - ] - case GoogleDriveActions.RESTORE.value: - [ - create.apply_async( - args=[file_id, index_name, user_id, agent_id], - action_name_override=RESTORE_ACTION_NAME, - **kwargs, - ) - for file_id in file_ids - ] - case GoogleDriveActions.PERMISSION_CHANGE.value: - [ - permission_change.apply_async( - args=[file_id, index_name, user_id, agent_id], - kwargs={ - "artifact_id": kwargs["artifact_id"], - **kwargs, - }, - ) - for file_id in file_ids - ] - case _: - raise Exception("This action is not tracked for Google Drive") - - -def query_google_drive_activity( - session: Session, user_id: str, agent_artifacts: List[Dict[str, str]] -): - gdrive_auth = GoogleDriveAuth() - agent_creator_auth_token = gdrive_auth.get_token(session=session, user_id=user_id) - if agent_creator_auth_token is None: - raise Exception( - f"Sync GDrive Error: No agent creator credentials found user id: {user_id}" - ) - - if gdrive_auth.is_auth_required(session, user_id=user_id): - raise Exception( - "Sync GDrive Error: Agent creator credentials need to re-authenticate" - ) - - (service,) = ( - get_service(api="driveactivity", version="v2", user_id=user_id)[key] - for key in ("service",) - ) - - activity_ts_filter = get_current_timestamp_in_ms( - negative_offset=ACTIVITY_TRACKING_WINDOW - ) - activities = {} - with futures.ThreadPoolExecutor(max_workers=10) as executor: - futures_list = [ - executor.submit(_get_activity, service, artifact, activity_ts_filter) - for artifact in agent_artifacts - ] - for future in futures.as_completed(futures_list): - try: - (artifact_id, artifact_activities) = ( - future.result()[key] for key in ("id", "activities") - ) - activities[artifact_id] = artifact_activities - except Exception as e: - raise e - - return activities - - -def _get_activity( - service: Any, - artifact: Dict[str, str], - activity_ts_filter: int, - next_page_token: Optional[str] = None, -): - artifact_id = artifact["id"] - artifact_type = artifact["type"] - response = ( - service.activity() - .query( - body={ - "filter": "time >= {} AND detail.action_detail_case:({})".format( - activity_ts_filter, - " ".join([e.value.upper() for e in GoogleDriveActions]), - ), - **( - {"ancestorName": "items/{}".format(artifact_id)} - if artifact_type == "folder" - else {} - ), - **( - {"itemName": "items/{}".format(artifact_id)} - if artifact_type != "folder" - else {} - ), - "pageToken": next_page_token, - "consolidationStrategy": { - "legacy": {}, - }, - } - ) - .execute() - ) - if response_next_page_token := response.get("nextPageToken", None): - return { - "id": artifact_id, - "activities": [ - *response.get("activities", []), - *_get_activity( - service=service, - artifact=artifact, - activity_ts_filter=activity_ts_filter, - next_page_token=response_next_page_token, - )["activities"], - ], - } - return { - "id": artifact_id, - "activities": response["activities"] if response else [], - } diff --git a/src/backend/tools/google_drive/sync/consolidation.py b/src/backend/tools/google_drive/sync/consolidation.py deleted file mode 100644 index 6422d7f19..000000000 --- a/src/backend/tools/google_drive/sync/consolidation.py +++ /dev/null @@ -1,55 +0,0 @@ -from collections import defaultdict -from typing import Dict - -from backend.tools.google_drive.constants import GoogleDriveActions -from backend.tools.google_drive.sync.utils import extract_file_ids_from_target - - -def 
consolidate(activities: Dict[str, str]): - """ - Notes - - - GDrive actions come in chronological order - - Purposefully have Create and Edit actions check if a file exists, helps with - move + create/edit combos. Without it there is no great way to async execute them. - - Create action also pulls latest permissions and name, thus making create a superset of permissions_change and rename - - Edit action also pulls latest permissions and name, thus making edit a superset of permissions_change and rename - """ - file_id_actions = defaultdict(list) - for activity in activities: - file_ids = extract_file_ids_from_target(activity=activity) - for file_id in file_ids: - file_id_actions[file_id].append(activity) - - consolidated_file_id_actions = defaultdict(list) - for file_id, activities in file_id_actions.items(): - actions = [list(x["primaryActionDetail"].keys())[0] for x in activities] - # NOTE Debugs below help with understanding the consolidation logic - # print(file_id) - # print("Before") - # print(actions) - - if GoogleDriveActions.MOVE.value in actions: - found_index = actions.index(GoogleDriveActions.MOVE.value) - consolidated_file_id_actions[file_id].append(activities[found_index]) - - for action in actions: - consolidated_file_id_actions[file_id].append(activities[0]) - if action in [ - GoogleDriveActions.DELETE.value, - GoogleDriveActions.RESTORE.value, - GoogleDriveActions.CREATE.value, - GoogleDriveActions.EDIT.value, - ]: - break - - # NOTE Debugs below help with understanding the consolidation logic - # after_actions = [list(x["primaryActionDetail"].keys())[0] for x in consolidated_file_id_actions[file_id]] - # print("After") - # print(after_actions) - # print("\n") - return [ - x - for file_id in consolidated_file_id_actions - for x in consolidated_file_id_actions[file_id] - ] diff --git a/src/backend/tools/google_drive/sync/utils.py b/src/backend/tools/google_drive/sync/utils.py deleted file mode 100644 index bfd63090f..000000000 --- a/src/backend/tools/google_drive/sync/utils.py +++ /dev/null @@ -1,253 +0,0 @@ -import io -import time -from concurrent import futures -from functools import wraps -from typing import Any, Dict, List - -from google.oauth2.credentials import Credentials -from googleapiclient.discovery import build -from googleapiclient.errors import HttpError -from googleapiclient.http import MediaIoBaseDownload - -from backend.crud.agent_task import create_agent_task -from backend.database_models.database import get_session -from backend.services.logger.utils import LoggerFactory -from backend.tools.google_drive.auth import GoogleDriveAuth -from backend.tools.google_drive.constants import ( - CSV_MIMETYPE, - DOC_FIELDS, - NATIVE_EXTENSION_MAPPINGS, - SEARCH_MIME_TYPES, - TEXT_MIMETYPE, -) - -""" -Get service -""" - -logger = LoggerFactory().get_logger() - - -def get_service(api: str, user_id: str, version: str = "v3"): - # Get google credentials - gdrive_auth = GoogleDriveAuth() - agent_creator_auth_token = None - - session = next(get_session()) - if gdrive_auth.is_auth_required(session, user_id=user_id): - session.close() - raise Exception( - "Sync GDrive Error: Agent creator credentials need to re-authenticate" - ) - - agent_creator_auth_token = gdrive_auth.get_token(session=session, user_id=user_id) - if agent_creator_auth_token is None: - session.close() - raise Exception("Sync GDrive Error: No agent creator credentials found") - - creds = Credentials(agent_creator_auth_token) - service = build(api, version, credentials=creds, cache_discovery=False) - 
session.close() - return {"service": service, "creds": creds} - - -""" -GDrive GET file -""" - - -def perform_get_batch(file_ids: List[str], user_id: str) -> List[Dict[str, str]]: - results = [] - - with futures.ThreadPoolExecutor(max_workers=10) as executor: - futures_list = [ - executor.submit(_get_file, file_id, user_id) for file_id in file_ids - ] - for future in futures.as_completed(futures_list): - try: - results.append(future.result()) - except Exception as e: - raise e - return results - - -def perform_get_single(file_id: str, user_id: str) -> Dict[str, str]: - return _get_file(file_id=file_id, user_id=user_id) - - -def _get_file(file_id: str, user_id: str): - (service,) = ( - get_service(api="drive", user_id=user_id)[key] for key in ("service",) - ) - return ( - service.files() - .get( - fileId=file_id, - fields=DOC_FIELDS, - supportsAllDrives=True, - ) - .execute() - ) - - -""" -NON-NATIVE DOWNLOAD -""" - - -def perform_non_native_batch(service: Any, file_ids: List[str]) -> Dict[str, str]: - tasks = [] - - with futures.ThreadPoolExecutor(max_workers=10) as executor: - futures_list = [ - executor.submit(_download_non_native_file, service, file_id) - for file_id in file_ids - ] - for future in futures.as_completed(futures_list): - try: - tasks.append(future.result()) - except Exception as e: - raise e - - return { - "{}".format(task.get("file_id", "")): task.get("file_text", "") - for task in tasks - } - - -def perform_non_native_single(service: Any, file_id: str): - return _download_non_native_file(service=service, file_id=file_id) - - -def _download_non_native_file(service: Any, file_id: str): - request = service.files().get_media(fileId=file_id) - file = io.BytesIO() - downloader = MediaIoBaseDownload(file, request) - - try: - done = False - while done is False: - status, done = downloader.next_chunk(num_retries=5) - logger.info( - event="Downloading", - file_id=file_id, - status=status.progress(), - ) - logger.info( - event="Finished downloading", - file_id=file_id, - ) - except HttpError as error: - logger.error( - event="[Google Drive] Error downloading file", - file_id=file_id, - type=(type(error)), - error=error, - ) - return "" - except Exception as error: - logger.error( - event="[Google Drive] Error downloading file", - file_id=file_id, - type=(type(error)), - error=error, - ) - return "" - if file is None: - return "" - - file_bytes = file.getvalue() - return file_bytes - - -""" -OTHER -""" - - -def get_current_timestamp_in_ms(positive_offset: int = 0, negative_offset: int = 0): - return int((time.time() + positive_offset - negative_offset) * 1000) - - -def process_shortcut_file(service: Any, file: Dict[str, str]) -> Dict[str, str]: - if file["mimeType"] == "application/vnd.google-apps.shortcut": - try: - targetId = file["shortcutDetails"]["targetId"] - targetFile = ( - service.files() - .get( - fileId=targetId, - fields=DOC_FIELDS, - supportsAllDrives=True, - ) - .execute() - ) - return targetFile - except Exception as error: - file_id = file["id"] - logger.error( - event="An error occurred processing a shortcut file with id", - file_id=file_id, - type=type(error), - error=error, - ) - return {} - else: - return file - - -def extract_web_view_link(file: Dict[str, str]) -> str: - return file.pop("webViewLink", "") - - -def extract_file_extension(file: Dict[str, str]) -> str: - extension = file.pop("fileExtension", "") - if not extension: - # NOTE: Mean native file - # ref. 
docs https://developers.google.com/drive/api/reference/rest/v3/files#File - return NATIVE_EXTENSION_MAPPINGS[file["mimeType"]] - return extension - - -def extract_title(file: Dict[str, str]) -> str: - return file.pop("name", "") - - -def extract_export_link(file: Dict[str, str]) -> str: - export_links = file.pop("exportLinks", {}) - if TEXT_MIMETYPE in export_links: - return export_links[TEXT_MIMETYPE] - elif CSV_MIMETYPE in export_links: - return export_links[CSV_MIMETYPE] - return "" - - -def extract_file_ids_from_target(activity: Dict[str, str]): - file_ids = set() - targets = activity["targets"] - for target in targets: - # NOTE: if not a drive item then skip - if driveItem := target["driveItem"]: - mimeType = driveItem["mimeType"] - # NOTE: if mime type not being tracked then skip - if mimeType in SEARCH_MIME_TYPES: - file_id = driveItem["name"].split("/")[1] - file_ids.add(file_id) - return file_ids - -def persist_agent_task(method): - @wraps(method) - def wrapper( - self, file_id: str, index_name: str, user_id: str, agent_id: str, **kwargs - ): - task_id = self.request.id - logger.info( - event=f"Executing task id {self.request.id}, args: {self.request.args} kwargs: {self.request.kwargs}", - agent_id=agent_id, - ) - session = next(get_session()) - create_agent_task(session, agent_id=agent_id, task_id=task_id) - session.close() - return method(self, file_id, index_name, user_id, agent_id, **kwargs) - - return wrapper diff --git a/src/backend/tools/google_drive/tool.py b/src/backend/tools/google_drive/tool.py index 74212bd20..cae3b54fe 100644 --- a/src/backend/tools/google_drive/tool.py +++ b/src/backend/tools/google_drive/tool.py @@ -4,10 +4,17 @@ from backend.config.settings import Settings from backend.crud import tool_auth as tool_auth_crud -from backend.services.compass import Compass from backend.services.logger.utils import LoggerFactory from backend.tools.base import BaseTool from backend.tools.google_drive.constants import GOOGLE_DRIVE_TOOL_ID, SEARCH_LIMIT +from backend.tools.google_drive.utils import ( + extract_export_link, + extract_title, + extract_web_view_link, + get_service, + perform_get_batch, + process_shortcut_file, +) logger = LoggerFactory().get_logger() @@ -43,68 +50,15 @@ def _handle_tool_specific_errors(self, error: Exception, **kwargs: Any): async def call(self, parameters: dict, **kwargs: Any) -> List[Dict[str, Any]]: user_id = kwargs.get("user_id") - agent_id = kwargs["agent_id"] - index_name = "{}_{}".format( - agent_id if agent_id is not None else user_id, GOOGLE_DRIVE_TOOL_ID - ) query = parameters.get("query", "").replace("'", "\\'") - compass = None - try: - compass = Compass( - compass_api_url=Settings().compass.api_url, - compass_parser_url=Settings().compass.parser_url, - compass_username=Settings().compass.username, - compass_password=Settings().compass.password, - ) - except Exception as e: - logger.error(event=f"[Google Drive] Compass setup not found. 
{e}") - pass - - if compass is not None: - # Compass setup found - # Query Compass - documents = [] - documents = compass.invoke( - compass.ValidActions.SEARCH, - { - "index": index_name, - "query": query, - # TODO filter on permissions - }, - ) - if documents.error: - raise Exception( - f"Error getting documents for {query} with {documents.error}" - ) - - hits = documents.result["hits"] - chunks = sorted( - [ - { - "text": chunk["content"]["text"], - "score": chunk["score"], - "url": hit["content"].get("url", ""), - "title": hit["content"].get("title", ""), - } - for hit in hits - for chunk in hit["chunks"] - ], - key=lambda x: x["score"], - reverse=True, - )[:SEARCH_LIMIT] - if chunks == []: - raise Exception(f"Compass no documents found for search query {query}") - return chunks - else: - # No compass setup - # Default to raw gdrive search - logger.info(event="[Google Drive] Defaulting to raw Google Drive search.") - agent_tool_metadata = kwargs["agent_tool_metadata"] - documents = await _default_gdrive_list_files( - user_id=user_id, query=query, agent_tool_metadata=agent_tool_metadata - ) - return documents + # Search Google Drive + logger.info(event="[Google Drive] Defaulting to raw Google Drive search.") + agent_tool_metadata = kwargs["agent_tool_metadata"] + documents = await _default_gdrive_list_files( + user_id=user_id, query=query, agent_tool_metadata=agent_tool_metadata + ) + return documents async def _default_gdrive_list_files( @@ -115,15 +69,7 @@ async def _default_gdrive_list_files( NATIVE_SEARCH_MIME_TYPES, SEARCH_MIME_TYPES, ) - from backend.tools.google_drive.sync.utils import ( - extract_export_link, - extract_title, - extract_web_view_link, - get_service, - perform_get_batch, - process_shortcut_file, - ) - from backend.tools.utils import async_download + from backend.tools.utils.async_download import async_perform (service, creds) = ( get_service(api="drive", user_id=user_id)[key] for key in ("service", "creds") @@ -212,7 +158,7 @@ async def _default_gdrive_list_files( file_id: extract_export_link(x) for file_id, x in native_files.items() } if id_to_urls: - id_to_texts = await async_download.async_perform(id_to_urls, creds.token) + id_to_texts = await async_perform(id_to_urls, creds.token) return [ { diff --git a/src/backend/tools/google_drive/utils.py b/src/backend/tools/google_drive/utils.py new file mode 100644 index 000000000..2c781ccf8 --- /dev/null +++ b/src/backend/tools/google_drive/utils.py @@ -0,0 +1,124 @@ +from concurrent import futures +from typing import Any, Dict, List, TypedDict + +from google.oauth2.credentials import Credentials +from googleapiclient.discovery import build + +from backend.database_models.database import get_session +from backend.services.logger.utils import LoggerFactory +from backend.tools.base import ToolAuthException +from backend.tools.google_drive.auth import GoogleDriveAuth +from backend.tools.google_drive.constants import ( + CSV_MIMETYPE, + DOC_FIELDS, + GOOGLE_DRIVE_TOOL_ID, + TEXT_MIMETYPE, +) + +logger = LoggerFactory().get_logger() + + +class Service(TypedDict): + service: Any + creds: Credentials + + +def get_service(api: str, user_id: str, version: str = "v3") -> Service: + # Get google credentials + gdrive_auth = GoogleDriveAuth() + agent_creator_auth_token = None + + session = next(get_session()) + if gdrive_auth.is_auth_required(session, user_id=user_id): + session.close() + raise ToolAuthException( + "Sync GDrive Error: Agent creator credentials need to re-authenticate", + GOOGLE_DRIVE_TOOL_ID, + ) + + 
agent_creator_auth_token = gdrive_auth.get_token(session=session, user_id=user_id) + if agent_creator_auth_token is None: + session.close() + raise Exception("Sync GDrive Error: No agent creator credentials found") + + creds = Credentials(agent_creator_auth_token) + service = build(api, version, credentials=creds, cache_discovery=False) + session.close() + return {"service": service, "creds": creds} + + +""" +GDrive GET file +""" + + +def perform_get_batch(file_ids: List[str], user_id: str) -> List[Dict[str, str]]: + results = [] + + with futures.ThreadPoolExecutor(max_workers=10) as executor: + futures_list = [ + executor.submit(_get_file, file_id, user_id) for file_id in file_ids + ] + for future in futures.as_completed(futures_list): + try: + results.append(future.result()) + except Exception as e: + raise e + return results + + +def _get_file(file_id: str, user_id: str): + (service,) = ( + get_service(api="drive", user_id=user_id)[key] for key in ("service",) + ) + return ( + service.files() + .get( + fileId=file_id, + fields=DOC_FIELDS, + supportsAllDrives=True, + ) + .execute() + ) + +def process_shortcut_file(service: Any, file: Dict[str, str]) -> Dict[str, str]: + if file["mimeType"] == "application/vnd.google-apps.shortcut": + try: + targetId = file["shortcutDetails"]["targetId"] + targetFile = ( + service.files() + .get( + fileId=targetId, + fields=DOC_FIELDS, + supportsAllDrives=True, + ) + .execute() + ) + return targetFile + except Exception as error: + file_id = file["id"] + logger.error( + event="An error occurred processing a shortcut file with id", + file_id=file_id, + type=type(error), + error=error, + ) + return {} + else: + return file + + +def extract_web_view_link(file: Dict[str, str]) -> str: + return file.pop("webViewLink", "") + +def extract_title(file: Dict[str, str]) -> str: + return file.pop("name", "") + + +def extract_export_link(file: Dict[str, str]) -> str: + export_links = file.pop("exportLinks", {}) + if TEXT_MIMETYPE in export_links: + return export_links[TEXT_MIMETYPE] + elif CSV_MIMETYPE in export_links: + return export_links[CSV_MIMETYPE] + return ""
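The hunks above replace the Compass-backed Google Drive sync path with direct Drive access: get_service and the extract_* helpers move into src/backend/tools/google_drive/utils.py, and get_service now raises the new ToolAuthException (added in src/backend/tools/base.py) instead of a bare Exception when re-authentication is needed. A minimal sketch of how a caller might consume these helpers follows; the function list_drive_files_for_user and its return shape are illustrative assumptions and are not part of this change.

from backend.tools.base import ToolAuthException
from backend.tools.google_drive.constants import DOC_FIELDS
from backend.tools.google_drive.utils import get_service


def list_drive_files_for_user(user_id: str, query: str) -> list:
    # Hypothetical caller; shows the intended error-handling contract only.
    try:
        service = get_service(api="drive", user_id=user_id)["service"]
    except ToolAuthException as err:
        # err.tool_id is GOOGLE_DRIVE_TOOL_ID here; a caller can surface a
        # re-authentication prompt for that tool instead of failing outright.
        return [{"auth_required": True, "tool_id": err.tool_id, "detail": err.message}]

    # DOC_FIELDS and the shared-drive flags mirror the usage in utils.py above.
    response = (
        service.files()
        .list(
            q=query,
            fields=f"nextPageToken, files({DOC_FIELDS})",
            supportsAllDrives=True,
            includeItemsFromAllDrives=True,
        )
        .execute()
    )
    return response.get("files", [])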