Implemented Coca architecture #2371

Open: wants to merge 13 commits into base master
Changes from 1 commit
Lowercased coca model directory and added to kokoro build
VarunS1997 committed Mar 5, 2024
commit f15408f2e7713dc39a6bc6cb9dcad49ee5869edb
2 changes: 2 additions & 0 deletions .kokoro/github/ubuntu/gpu/build.sh
@@ -69,6 +69,7 @@ then
   keras_cv/models/object_detection/retinanet \
   keras_cv/models/object_detection/yolo_v8 \
   keras_cv/models/object_detection_3d \
+  keras_cv/models/feature_extractor/coca \
   keras_cv/models/segmentation \
   keras_cv/models/stable_diffusion
 else
@@ -83,6 +84,7 @@ else
   keras_cv/models/object_detection/retinanet \
   keras_cv/models/object_detection/yolo_v8 \
   keras_cv/models/object_detection_3d \
+  keras_cv/models/feature_extractor/coca \
   keras_cv/models/segmentation \
   keras_cv/models/stable_diffusion
 fi
2 changes: 1 addition & 1 deletion keras_cv/layers/attention_pooling.py
@@ -2,7 +2,7 @@


 class AttentionPooling(layers.Layer):
-    """Implements the Pooled Attention Layer used in "CoCa": Contrastive Captioners are Image-Text Foundation Models"
+    """Implements the Pooled Attention Layer used in "coca": Contrastive Captioners are Image-Text Foundation Models"
     (https://arxiv.org/pdf/2205.01917.pdf), consisting of a Multiheaded Attention followed by Layer Normalization.

     Args:
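The docstring above describes the layer as multi-head attention followed by layer normalization. As a rough illustration of that computation (not keras_cv's actual implementation, which wraps Keras attention and normalization layers), here is a single-head numpy sketch; the function name and shapes are hypothetical:

```python
import numpy as np

def attention_pool(queries, features, eps=1e-6):
    # Single-head sketch: learned queries cross-attend over patch
    # features, then the pooled output is layer-normalized.
    d = queries.shape[-1]
    scores = queries @ features.T / np.sqrt(d)      # [n_queries, n_patches]
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)  # softmax over patches
    pooled = weights @ features                     # [n_queries, d]
    # layer normalization over the feature dimension
    mean = pooled.mean(axis=-1, keepdims=True)
    var = pooled.var(axis=-1, keepdims=True)
    return (pooled - mean) / np.sqrt(var + eps)

rng = np.random.default_rng(0)
features = rng.normal(size=(49, 64))  # e.g. 7x7 patch embeddings, width 64
queries = rng.normal(size=(2, 64))    # two learned pooling queries
pooled = attention_pool(queries, features)
print(pooled.shape)  # (2, 64)
```

Each query produces one pooled vector, so the layer reduces a variable-length patch sequence to a fixed number of outputs.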
(third changed file; file header not captured in this view — the hunks below edit the CoCa model definition)
@@ -21,7 +21,7 @@
 from keras_cv.layers.vit_layers import PatchingAndEmbedding


-@keras_cv_export(["keras_cv.models.CoCa"])
+@keras_cv_export(["keras_cv.models.coca"])
 class CoCa(Task):
     def __init__(self,
                  img_patch_size=18,
@@ -91,7 +91,7 @@ def __init__(self,
""" Contrastive Captioner foundational model implementation.
This model implements the "Contrastive Captioners are image-Text Foundational Models" by Yu, et al.
(https://arxiv.org/pdf/2205.01917.pdf). In short, the CoCa model combines the ideas of Contrastive techniques
(https://arxiv.org/pdf/2205.01917.pdf). In short, the coca model combines the ideas of Contrastive techniques
such as CLIP, with Generative Captioning approaches such as SimVLM.
The architecture of clip can be described as an Image Visual Transformer Encoder in parallel to self-attention-only
@@ -105,7 +105,7 @@ def __init__(self,
     images = ... # [batch_size, height, width, channel]
     text = ... # [batch_size, text_dim, sequence_length]

-    coca = CoCa()
+    coca = coca()

     # [batch_size, sequence_length, captioning_query_length]
     output = coca(images, text)
@@ -118,7 +118,7 @@ def __init__(self,
     encoder_depth: number of image encoder blocks
     encoder_heads: number of attention heads used in each image encoder block
     encoder_intermediate_dim: dimensionality of the encoder blocks' intermediate representation (MLP dimensionality)
-    encoder_width: dimensionality of the encoder's projection, consistent with wording used in CoCa paper.
+    encoder_width: dimensionality of the encoder's projection, consistent with wording used in coca paper.
     unimodal_decoder_depth: number of decoder blocks used for text self-attention/embedding
     multimodal_decoder_depth: number of decoder blocks used for image-text cross-attention and captioning
     decoder_intermediate_dim: dimensionality of the decoder blocks' MLPs
@@ -137,7 +137,7 @@ def build(self, input_shape):

         # Validate Input Shape
         if len(input_shape) < 2:
-            raise ValueError("Build arguments to CoCa expected to contain shapes of both text and image data; "
+            raise ValueError("Build arguments to coca expected to contain shapes of both text and image data; "
                              f"got {len(input_shape)} shapes.")

         images_shape = input_shape[0]
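The model docstring in this diff says CoCa pairs a CLIP-style contrastive objective with generative captioning. A minimal numpy sketch of just the contrastive half (symmetric InfoNCE over normalized image/text embeddings) may make that pairing concrete; the function names and temperature value here are illustrative, not keras_cv API:

```python
import numpy as np

def cross_entropy(logits, labels):
    # mean negative log-likelihood of the target class per row
    shifted = logits - logits.max(axis=-1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    return -log_probs[np.arange(len(labels)), labels].mean()

def contrastive_loss(img_emb, txt_emb, temperature=0.07):
    # CLIP-style symmetric InfoNCE: matching image/text pairs sit on
    # the diagonal of the similarity matrix and act as the targets.
    img = img_emb / np.linalg.norm(img_emb, axis=-1, keepdims=True)
    txt = txt_emb / np.linalg.norm(txt_emb, axis=-1, keepdims=True)
    logits = img @ txt.T / temperature       # [batch, batch] similarities
    labels = np.arange(logits.shape[0])
    # average the image-to-text and text-to-image directions
    return 0.5 * (cross_entropy(logits, labels) + cross_entropy(logits.T, labels))

rng = np.random.default_rng(0)
emb = rng.normal(size=(4, 8))
matched = contrastive_loss(emb, emb)           # aligned pairs: low loss
mismatched = contrastive_loss(emb, emb[::-1])  # shuffled pairs: high loss
print(matched < mismatched)  # True
```

In the full CoCa objective this term is summed with a captioning (next-token cross-entropy) loss from the multimodal decoder, which is omitted here.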