Florence2 workflows block #661

Open · wants to merge 16 commits into base: main
82 changes: 82 additions & 0 deletions docker/dockerfiles/Dockerfile.onnx.gpu.dev
@@ -0,0 +1,82 @@
FROM nvcr.io/nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS base

WORKDIR /app

RUN rm -rf /var/lib/apt/lists/* && apt-get clean && apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y \
ffmpeg \
libxext6 \
libopencv-dev \
uvicorn \
python3-pip \
git \
libgdal-dev \
wget \
&& rm -rf /var/lib/apt/lists/*

COPY requirements/requirements.sam.txt \
requirements/requirements.clip.txt \
requirements/requirements.http.txt \
requirements/requirements.gpu.txt \
requirements/requirements.waf.txt \
requirements/requirements.gaze.txt \
requirements/requirements.doctr.txt \
requirements/requirements.groundingdino.txt \
requirements/requirements.cogvlm.txt \
requirements/requirements.yolo_world.txt \
requirements/_requirements.txt \
requirements/requirements.transformers.txt \
requirements/requirements.pali.flash_attn.txt \
requirements/requirements.sdk.http.txt \
requirements/requirements.cli.txt \
./

RUN python3 -m pip install -U pip
RUN python3 -m pip install --extra-index-url https://download.pytorch.org/whl/cu118 \
-r _requirements.txt \
-r requirements.sam.txt \
-r requirements.clip.txt \
-r requirements.http.txt \
-r requirements.gpu.txt \
-r requirements.waf.txt \
-r requirements.gaze.txt \
-r requirements.groundingdino.txt \
-r requirements.doctr.txt \
-r requirements.cogvlm.txt \
-r requirements.yolo_world.txt \
-r requirements.transformers.txt \
-r requirements.sdk.http.txt \
-r requirements.cli.txt \
jupyterlab \
--upgrade \
&& rm -rf ~/.cache/pip

# Install setup.py requirements for flash_attn
RUN python3 -m pip install packaging==24.1 && rm -rf ~/.cache/pip

# Install flash_attn required for Paligemma and Florence2
RUN python3 -m pip install -r requirements.pali.flash_attn.txt --no-build-isolation && rm -rf ~/.cache/pip

# Flatten the build stage into a single layer
FROM scratch
COPY --from=base / /

WORKDIR /app/
COPY inference inference
COPY inference_sdk inference_sdk
COPY inference_cli inference_cli
ENV PYTHONPATH=/app/
COPY docker/config/gpu_http.py gpu_http.py

ENV VERSION_CHECK_MODE=continuous
ENV PROJECT=roboflow-platform
ENV NUM_WORKERS=1
ENV HOST=0.0.0.0
ENV PORT=9001
ENV WORKFLOWS_STEP_EXECUTION_MODE=local
ENV WORKFLOWS_MAX_CONCURRENT_STEPS=1
ENV API_LOGGING_ENABLED=True
ENV LMM_ENABLED=True
ENV CORE_MODEL_SAM2_ENABLED=True
ENV CORE_MODEL_OWLV2_ENABLED=True

ENTRYPOINT uvicorn gpu_http:app --workers $NUM_WORKERS --host $HOST --port $PORT
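
To sanity-check an image built from this Dockerfile, here is a minimal client sketch against the server it starts; the call uses the inference_sdk client API shipped in the image, but the model id is hypothetical:

# Assumes the container was started with the defaults above and published
# on localhost:9001 (e.g. docker run with -p 9001:9001).
from inference_sdk import InferenceHTTPClient

client = InferenceHTTPClient(api_url="http://localhost:9001")
result = client.infer("path/to/image.jpg", model_id="some-project/1")  # hypothetical model id
print(result)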
4 changes: 3 additions & 1 deletion inference/core/entities/responses/inference.py
@@ -291,7 +291,9 @@ class MultiLabelClassificationInferenceResponse(


class LMMInferenceResponse(CvInferenceResponse):
-    response: str = Field(description="Text generated by PaliGemma")
+    response: Union[str, dict] = Field(
+        description="Text/structured response generated by model"
+    )


class FaceDetectionPrediction(ObjectDetectionPrediction):
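
The widened response type matters because Florence-2 grounding tasks return structured payloads rather than plain text. A minimal, self-contained sketch of the same pattern (DemoResponse is illustrative, not the actual class):

from typing import Union

from pydantic import BaseModel, Field

class DemoResponse(BaseModel):
    response: Union[str, dict] = Field(
        description="Text/structured response generated by model"
    )

DemoResponse(response="a plain caption")  # still valid
DemoResponse(response={"bboxes": [[0, 0, 10, 10]], "labels": ["cat"]})  # now also valid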
@@ -1,3 +1,4 @@
+import hashlib
import json
import logging
import re
@@ -90,15 +91,16 @@ class BlockManifest(WorkflowBlockManifest):
    classes: Union[
        WorkflowParameterSelector(kind=[LIST_OF_VALUES_KIND]),
        StepOutputSelector(kind=[LIST_OF_VALUES_KIND]),
-        List[str],
+        Optional[List[str]],
    ] = Field(
        description="List of all classes used by the model, required to "
        "generate mapping between class name and class id.",
        examples=[["$steps.lmm.classes", "$inputs.classes", ["class_a", "class_b"]]],
+        default=None,
    )
-    model_type: Literal["google-gemini", "anthropic-claude"] = Field(
+    model_type: Literal["google-gemini", "anthropic-claude", "florence-2"] = Field(
        description="Type of the model that generated prediction",
-        examples=[["google-gemini", "anthropic-claude"]],
+        examples=[["google-gemini", "anthropic-claude", "florence-2"]],
    )
    task_type: Literal["object-detection"]

@@ -108,6 +110,11 @@ def validate(self) -> "BlockManifest":
            raise ValueError(
                f"Could not parse result of task {self.task_type} for model {self.model_type}"
            )
+        if self.model_type != "florence-2" and self.classes is None:
+            raise ValueError(
+                "Must pass list of classes to this block when using gemini or claude"
+            )
+
        return self

    @classmethod
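
The intent of the new check, reduced to a self-contained sketch (class and field names are illustrative): classes may be omitted only for florence-2, which grounds its own labels, while gemini and claude need the list to build the class-name-to-id mapping.

from typing import List, Optional

from pydantic import BaseModel, model_validator

class ManifestSketch(BaseModel):
    model_type: str
    classes: Optional[List[str]] = None

    @model_validator(mode="after")
    def check_classes(self) -> "ManifestSketch":
        if self.model_type != "florence-2" and self.classes is None:
            raise ValueError("Must pass list of classes when using gemini or claude")
        return self

ManifestSketch(model_type="florence-2")  # OK: classes may stay None
try:
    ManifestSketch(model_type="google-gemini")  # rejected: classes missing
except ValueError as error:
    print(error)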
@@ -135,7 +142,7 @@ def run(
        self,
        image: WorkflowImageData,
        vlm_output: str,
-        classes: List[str],
+        classes: Optional[List[str]],
        model_type: str,
        task_type: str,
    ) -> BlockResult:
@@ -255,7 +262,58 @@ def scale_confidence(value: float) -> float:
    return min(max(float(value), 0.0), 1.0)


def parse_florence2_object_detection_response(
    image: WorkflowImageData,
    parsed_data: dict,
    classes: Optional[List[str]],
    inference_id: str,
):
    image_height, image_width = image.numpy_image.shape[:2]
    detections = sv.Detections.from_lmm(
        "florence_2",
        parsed_data,
        resolution_wh=(image_width, image_height),
    )
    detection_ids = np.array([str(uuid4()) for _ in range(len(detections))])
    inference_ids = np.array([inference_id] * len(detections))
    prediction_type = np.array(["object-detection"] * len(detections))
    detections.data.update(
        {
            INFERENCE_ID_KEY: inference_ids,
            DETECTION_ID_KEY: detection_ids,
            PREDICTION_TYPE_KEY: prediction_type,
        }
    )
    # Florence-2 does not report per-box scores, so confidence defaults to 1.0.
    detections.confidence = np.array([1.0 for _ in detections])
    detected_class_names = detections.data[CLASS_NAME_DATA_FIELD]
    if classes is not None:
        # Keep only detections whose class name appears in the user-provided
        # list, and map each class name to its index in that list.
        bool_array = np.array([c in classes for c in detected_class_names])
        filtered_detections = detections[bool_array]
        filtered_classes = filtered_detections.data[CLASS_NAME_DATA_FIELD]
        filtered_detections.class_id = np.array(
            [classes.index(c) for c in filtered_classes]
        )
        return attach_parents_coordinates_to_sv_detections(
            detections=filtered_detections,
            image=image,
        )
    # classes is None: derive a stable pseudo class id from each class name.
    class_ids = [get_4digit_from_md5(c) for c in detected_class_names]
    detections.class_id = np.array(class_ids)
    return attach_parents_coordinates_to_sv_detections(
        detections=detections, image=image
    )
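
The heavy lifting above is supervision's from_lmm, which gained Florence-2 support in supervision 0.23. A standalone sketch, assuming the raw Florence-2 "<OD>" payload shape (values are illustrative):

import supervision as sv

payload = {"<OD>": {"bboxes": [[10.0, 20.0, 110.0, 220.0]], "labels": ["cat"]}}
detections = sv.Detections.from_lmm(
    "florence_2",
    payload,
    resolution_wh=(640, 480),
)
print(detections.xyxy)  # [[ 10.  20. 110. 220.]]
print(detections.data["class_name"])  # ['cat']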


def get_4digit_from_md5(input_string: str) -> int:
    # Derive a stable 4-digit pseudo class id from the MD5 of the class name,
    # so the same name maps to the same id across runs and processes.
    md5_hash = hashlib.md5(input_string.encode("utf-8"))
    hex_digest = md5_hash.hexdigest()
    integer_value = int(hex_digest[:9], 16)
    return integer_value % 10000
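
These pseudo ids are deterministic across runs and processes (unlike Python's salted hash()), though two distinct names can collide within the 0-9999 range. A quick usage check against the function above:

assert get_4digit_from_md5("car") == get_4digit_from_md5("car")  # deterministic
print(get_4digit_from_md5("car"), get_4digit_from_md5("dog"))  # two 4-digit ids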


REGISTERED_PARSERS = {
    ("google-gemini", "object-detection"): parse_gemini_object_detection_response,
    ("anthropic-claude", "object-detection"): parse_gemini_object_detection_response,
+    ("florence-2", "object-detection"): parse_florence2_object_detection_response,
}
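
Parser dispatch is then a plain dictionary lookup keyed by (model_type, task_type); the validate hook above guarantees the key exists before run is called. A sketch of the lookup (resolve_parser is an illustrative wrapper, not part of this PR):

def resolve_parser(model_type: str, task_type: str):
    try:
        return REGISTERED_PARSERS[(model_type, task_type)]
    except KeyError:
        raise ValueError(f"No parser registered for {(model_type, task_type)!r}")

parser = resolve_parser("florence-2", "object-detection")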
4 changes: 4 additions & 0 deletions inference/core/workflows/core_steps/loader.py
@@ -85,6 +85,9 @@
from inference.core.workflows.core_steps.models.foundation.cog_vlm.v1 import (
    CogVLMBlockV1,
)
+from inference.core.workflows.core_steps.models.foundation.florence2.v1 import (
+    Florence2BlockV1,
+)
from inference.core.workflows.core_steps.models.foundation.google_gemini.v1 import (
    GoogleGeminiBlockV1,
)
@@ -339,6 +342,7 @@ def load_blocks() -> List[Type[WorkflowBlock]]:
        AntropicClaudeBlockV1,
        LineCounterBlockV1,
        PolygonZoneVisualizationBlockV1,
+        Florence2BlockV1,
    ]
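
With the block registered in load_blocks, a workflow definition can reference it by its manifest alias. A hypothetical step specification (the type value and property names are illustrative, not taken from this PR):

florence_step = {
    "type": "roboflow_core/florence_2@v1",  # illustrative alias; check the block manifest
    "name": "florence_2",
    "images": "$inputs.image",
    "task_type": "object-detection",
}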

