roboflow · SkalskiP · Sep 16, 2024 · Sep 15, 2024 · Sep 15, 2024 · Sep 15, 2024
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,16 @@
+version: 2
+updates:
+  # GitHub Actions: check daily for newer versions of the actions used by the workflows
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "daily"
+    commit-message:
+      prefix: ⬆️  # prepended to the commit messages of the update PRs
+  # Python: check daily for newer versions of the pip dependencies declared at the repo root
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "daily"
+    commit-message:
+      prefix: ⬆️  # prepended to the commit messages of the update PRs
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -1,21 +1,23 @@
-name: Docs WorkFlow
+name: 📚 Docs WorkFlow
 
 on:
   push:
-    branches:
-      - master
-      - main
-      - develop
+    branches: [main, develop]
+
 jobs:
   deploy:
     runs-on: ubuntu-latest
     permissions:
       contents: write
     steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
+      - name: 🛎️ Checkout
+        uses: actions/checkout@v4  # v4, matching the test and publish workflows
+      - name: 🐍 Set up Python
+        uses: actions/setup-python@v5  # v5, matching the test and publish workflows
         with:
           python-version: 3.x
-      - run: pip install mkdocs-material
-      - run: pip install "mkdocstrings[python]"
-      - run: mkdocs gh-deploy --force
+      - name: 📦 Install dependencies
+        run: |
+          pip install mkdocs-material "mkdocstrings[python]"
+      - name: 🚀 Deploy Docs
+        run: mkdocs gh-deploy --force  # builds the site and force-pushes it to the Pages branch
diff --git a/.github/workflows/maestro-tests.yml b/.github/workflows/maestro-tests.yml
@@ -0,0 +1,40 @@
+name: Test WorkFlow
+
+on:
+  pull_request:
+    branches: [main, develop]
+
+jobs:
+  build-dev-test:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+    steps:
+      - name: 🛎️ Checkout
+        uses: actions/checkout@v4
+      - name: 🐍 Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          check-latest: true
+
+      - name: 📦 Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          # Pin torch to the 2.4 series: the flash-attn wheel below is built for
+          # torch 2.4 and pyproject.toml requires torch~=2.4.0 anyway.
+          pip install "torch~=2.4.0"
+          if [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then
+            # flash-attn is installed from a prebuilt Linux wheel; its CPython
+            # tag (e.g. cp310 for "3.10") is derived from the matrix version.
+            PY_VERSION="${{ matrix.python-version }}"
+            PY_TAG="cp${PY_VERSION//./}"
+            pip install "https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-${PY_TAG}-${PY_TAG}-linux_x86_64.whl"
+          fi
+          pip install .
+          pip install pytest
+
+      - name: 🧪 Test
+        run: "python -m pytest ./test"
diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml
@@ -0,0 +1,43 @@
+name: Multimodal Maestro Releases to PyPi
+on:
+  push:
+    tags:
+      # Final releases only, e.g. 0.2.0 or 1.10.3. In GitHub filter patterns
+      # "+" means "one or more of the preceding character" and "." is literal,
+      # so each version component accepts one or more digits.
+      - '[0-9]+.[0-9]+.[0-9]+'
+
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+    steps:
+      - name: 🛎️ Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
+      - name: 🐍 Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: 🏗️ Build source and wheel distributions
+        run: |
+          python -m pip install --upgrade build twine
+          python -m build
+          twine check --strict dist/*
+      - name: 🚀 Publish to PyPi
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: ${{ secrets.PYPI_USERNAME }}
+          password: ${{ secrets.PYPI_PASSWORD }}
+      - name: 🚀 Publish to Test-PyPi
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          repository-url: https://test.pypi.org/legacy/
+          user: ${{ secrets.PYPI_TEST_USERNAME }}
+          password: ${{ secrets.PYPI_TEST_PASSWORD }}
diff --git a/.github/workflows/pypi-test-publish.yml b/.github/workflows/pypi-test-publish.yml
@@ -0,0 +1,46 @@
+name: Multimodal Maestro Test Releases to PyPi
+on:
+  push:
+    tags:
+      # Pre-releases only: alpha (a), beta (b) and release candidate (rc) tags,
+      # e.g. 0.2.0a1, 0.2.0b2, 0.2.0rc3. "+" means "one or more of the preceding
+      # character", so every numeric component accepts one or more digits.
+      - '[0-9]+.[0-9]+.[0-9]+a[0-9]+'
+      - '[0-9]+.[0-9]+.[0-9]+b[0-9]+'
+      - '[0-9]+.[0-9]+.[0-9]+rc[0-9]+'
+
+  workflow_dispatch:
+
+jobs:
+  build-n-publish:
+    name: Build and publish to PyPI
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+    steps:
+      - name: 🛎️ Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
+      - name: 🐍 Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: 🏗️ Build source and wheel distributions
+        run: |
+          python -m pip install --upgrade build twine
+          python -m build
+          twine check --strict dist/*
+      - name: 🚀 Publish distribution to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: ${{ secrets.PYPI_USERNAME }}
+          password: ${{ secrets.PYPI_PASSWORD }}
+      - name: 🚀 Publish to Test-PyPi
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          repository-url: https://test.pypi.org/legacy/
+          user: ${{ secrets.PYPI_TEST_USERNAME }}
+          password: ${{ secrets.PYPI_TEST_PASSWORD }}
diff --git a/.gitignore b/.gitignore
@@ -132,3 +132,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# macOS folder metadata files
+.DS_Store
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,47 @@
+
+# CHANGELOGS
+
+## multimodal-maestro-0.1.0
+
+### 🚀 Added
+
+- [`SegmentAnythingMarkGenerator`](https://roboflow.github.io/multimodal-maestro/markers/#multimodalmaestro.markers.sam.SegmentAnythingMarkGenerator) allowing the generation of segmentation marks.
+- [`MarkVisualizer`](https://roboflow.github.io/multimodal-maestro/visualizers/#multimodalmaestro.visualizers.MarkVisualizer) for visualizing the generated marks.
+- [`prompt_image`](https://roboflow.github.io/multimodal-maestro/lmms/#multimodalmaestro.lmms.gpt4.prompt_image) allowing for convenient GPT-4 Vision API querying.
+- 🤗 Hugging Face Set-of-Mark [space](https://huggingface.co/spaces/Roboflow/SoM).
+
+```python
+>>> import cv2
+>>> import torch
+>>> import multimodalmaestro as mm
+
+>>> image = cv2.imread("...")
+
+>>> generator = mm.SegmentAnythingMarkGenerator()
+>>> visualizer = mm.MarkVisualizer()
+
+>>> marks = generator.generate(image=image)
+>>> marks = mm.refine_marks(marks=marks)
+
+>>> image_prompt = visualizer.visualize(image=image, marks=marks)
+>>> text_prompt = "Find dog."
+
+>>> response = mm.prompt_image(api_key=api_key, image=image_prompt, prompt=text_prompt)
+>>> response
+
+"The dog is prominently featured in the center of the image with the label [9]."
+
+>>> masks = mm.extract_relevant_masks(text=response, detections=marks)
+
+{'6': array([
+    [False, False, False, ..., False, False, False],
+    [False, False, False, ..., False, False, False],
+    [False, False, False, ..., False, False, False],
+    ...,
+    [ True,  True,  True, ..., False, False, False],
+    [ True,  True,  True, ..., False, False, False],
+    [ True,  True,  True, ..., False, False, False]])
+}
+```
+
+![multimodal-maestro-2](https://github.com/roboflow/multimodal-maestro/assets/26109316/118feb2e-654e-473c-b534-65bc01df7480)
diff --git a/README.md b/README.md
@@ -8,9 +8,9 @@
 
 ## 👋 hello
 
-**maestro** is a tool designed to streamline and accelerate the fine-tuning process for 
-multimodal models. It provides ready-to-use recipes for fine-tuning popular 
-vision-language models (VLMs) such as **Florence-2**, **PaliGemma**, and 
+**maestro** is a tool designed to streamline and accelerate the fine-tuning process for
+multimodal models. It provides ready-to-use recipes for fine-tuning popular
+vision-language models (VLMs) such as **Florence-2**, **PaliGemma**, and
 **Phi-3.5 Vision** on downstream vision-language tasks.
 
 ## 💻 install
@@ -26,7 +26,7 @@ pip install maestro
 
 ### CLI
 
-VLMs can be fine-tuned on downstream tasks directly from the command line with 
+VLMs can be fine-tuned on downstream tasks directly from the command line with
 `maestro` command:
 
 ```bash
@@ -35,7 +35,7 @@ maestro florence2 train --dataset='<DATASET_PATH>' --epochs=10 --batch-size=8
 
 ### SDK
 
-Alternatively, you can fine-tune VLMs using the Python SDK, which accepts the same 
+Alternatively, you can fine-tune VLMs using the Python SDK, which accepts the same
 arguments as the CLI example above:
 
 ```python
@@ -54,8 +54,8 @@ train(config)
 
 ## 📚 notebooks
 
-Explore our collection of notebooks that demonstrate how to fine-tune various 
-vision-language models using maestro. Each notebook provides step-by-step instructions 
+Explore our collection of notebooks that demonstrate how to fine-tune various
+vision-language models using maestro. Each notebook provides step-by-step instructions
 and code examples to help you get started quickly.
 
 | model and task | colab | video                                                                                  |
@@ -65,8 +65,8 @@ and code examples to help you get started quickly.
 
 ## 🦸 contribution
 
-We would love your help in making this repository even better! We are especially 
-looking for contributors with experience in fine-tuning vision-language models (VLMs). 
-If you notice any bugs or have suggestions for improvement, feel free to open an 
-[issue](https://github.com/roboflow/multimodal-maestro/issues) or submit a 
+We would love your help in making this repository even better! We are especially
+looking for contributors with experience in fine-tuning vision-language models (VLMs).
+If you notice any bugs or have suggestions for improvement, feel free to open an
+[issue](https://github.com/roboflow/multimodal-maestro/issues) or submit a
 [pull request](https://github.com/roboflow/multimodal-maestro/pulls).
diff --git a/maestro/cli/main.py b/maestro/cli/main.py
@@ -1,6 +1,7 @@
 import typer
 
 from maestro.cli.introspection import find_training_recipes
+from maestro import __version__
 
 app = typer.Typer()
 find_training_recipes(app=app)
@@ -10,6 +11,12 @@
 def info():
     typer.echo("Welcome to maestro CLI. Let's train some VLM! 🏋")
 
+
+@app.command(help="Display version of maestro")
+def version():
+    """Echo the maestro package version taken from `maestro.__version__`."""
+    typer.echo(f"Maestro version: {__version__}")
+
 
 if __name__ == "__main__":
     app()
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,80 @@
+[build-system]
+requires = ["setuptools", "setuptools-scm", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "maestro"
+version = "0.2.0rc3"
+description = "Visual Prompting for Large Multimodal Models (LMMs)"
+readme = "README.md"
+authors = [
+    {name = "Roboflow", email = "[email protected]"}  # NOTE(review): looks like a redaction placeholder, not a real address — confirm
+]
+license = {file = "LICENSE"}
+keywords = ["roboflow","maestro","multimodal-maestro","transformers", "torch", "accelerate", "multimodal", "lmm", "vision", "nlp", "prompting"]
+requires-python = ">=3.9,<3.13"  # same range as the 3.9–3.12 classifiers below
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Education",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development",
+    "Topic :: Scientific/Engineering",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Typing :: Typed",
+    "Operating System :: Microsoft :: Windows",
+    "Operating System :: POSIX :: Linux",
+    "Operating System :: MacOS"
+]
+
+dependencies = [
+    "supervision~=0.24.0rc1",
+    "requests>=2.31.0,<=2.32.3",
+    "transformers~=4.44.2",
+    "torch~=2.4.0",
+    "accelerate~=0.33.0",
+    "sentencepiece~=0.2.0",
+    "peft~=0.12.0",
+    "flash-attn~=2.6.3; sys_platform != 'darwin'",  # environment marker: not installed on macOS
+    "einops~=0.8.0",
+    "timm~=1.0.9",
+    "typer~=0.12.5"
+]
+
+[project.urls]
+Homepage = "https://roboflow.github.io/multimodal-maestro/"
+Documentation = "https://roboflow.github.io/multimodal-maestro/"
+Repository = "https://github.com/roboflow/multimodal-maestro"
+Issues = "https://github.com/roboflow/multimodal-maestro/issues"
+Changelog = "https://github.com/roboflow/multimodal-maestro/blob/main/CHANGELOG.md"
+
+
+[project.optional-dependencies]
+docs = [
+    "mkdocs-material~=9.5.33",
+    "mkdocstrings[python]>=0.20.0,<0.25.2"
+]
+dev = [
+    "pytest~=8.3.2",
+    "black~=24.8.0",
+    "pre-commit~=3.8.0",
+    "mypy~=1.11.2",
+    "flake8~=7.1.1",
+    "tox~=4.18.1"
+]
+
+[project.scripts]
+maestro = "maestro.cli.main:app"  # "maestro" console command -> `app` object in maestro/cli/main.py
+
+[tool.setuptools]
+include-package-data = true # include non-python files in the package (default)
+
+[tool.setuptools.packages.find]
+where = ["."]
+exclude = ["cookbooks", "docs", "docs.*", "test", "test.*", "mkdocs", "mkdocs.*"]  # keep these directories out of package discovery
diff --git a/requirements/requirements.docs.txt b/requirements/requirements.docs.txt
diff --git a/requirements/requirements.test.txt b/requirements/requirements.test.txt