Skip to content

Commit 5b677e3

Browse files
authored
Add top 3 HF Presets for Mobilenet (#2105)
* test and preset fixes * add version_number to kaggle_handle * add top3 HF presets * fix the new arguments * fix new arguments to other test * update/add mobilenet presets * update model nomenclature * define __init__.py for mobilenet, further fix nomenclature * remove extra line * update expected output (batch size mismatch) on test * classifier definition slight refactor * include more specific condition
1 parent c9b2ee1 commit 5b677e3

9 files changed

+202
-50
lines changed

Diff for: keras_hub/src/models/mobilenet/mobilenet_backbone.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,8 @@ class DepthwiseConvBlock(keras.layers.Layer):
142142
signal into before re-exciting back out. If (>1) technically, it's an
143143
excite & squeeze layer. If this doesn't exist there is no
144144
SqueezeExcite layer.
145+
residual: bool, default False. True if we want a residual connection. If
146+
False, there is no residual connection.
145147
name: str, name of the layer
146148
dtype: `None` or str or `keras.mixed_precision.DTypePolicy`. The dtype
147149
to use for the model's computations and weights.
@@ -161,6 +163,7 @@ def __init__(
161163
kernel_size=3,
162164
stride=2,
163165
squeeze_excite_ratio=None,
166+
residual=False,
164167
name=None,
165168
dtype=None,
166169
**kwargs,
@@ -171,6 +174,7 @@ def __init__(
171174
self.kernel_size = kernel_size
172175
self.stride = stride
173176
self.squeeze_excite_ratio = squeeze_excite_ratio
177+
self.residual = residual
174178
self.name = name
175179

176180
channel_axis = (
@@ -256,11 +260,15 @@ def call(self, inputs):
256260
x = self.batch_normalization1(x)
257261
x = self.activation1(x)
258262

259-
if self.se_layer:
263+
if self.squeeze_excite_ratio:
260264
x = self.se_layer(x)
261265

262266
x = self.conv2(x)
263267
x = self.batch_normalization2(x)
268+
269+
if self.residual:
270+
x = x + inputs
271+
264272
return x
265273

266274
def get_config(self):
@@ -272,6 +280,7 @@ def get_config(self):
272280
"kernel_size": self.kernel_size,
273281
"stride": self.stride,
274282
"squeeze_excite_ratio": self.squeeze_excite_ratio,
283+
"residual": self.residual,
275284
"name": self.name,
276285
}
277286
)
@@ -675,6 +684,8 @@ def __init__(
675684
stackwise_padding,
676685
output_num_filters,
677686
depthwise_filters,
687+
depthwise_stride,
688+
depthwise_residual,
678689
last_layer_filter,
679690
squeeze_and_excite=None,
680691
image_shape=(None, None, 3),
@@ -722,7 +733,9 @@ def __init__(
722733
x = DepthwiseConvBlock(
723734
input_num_filters,
724735
depthwise_filters,
736+
stride=depthwise_stride,
725737
squeeze_excite_ratio=squeeze_and_excite,
738+
residual=depthwise_residual,
726739
name="block_0",
727740
dtype=dtype,
728741
)(x)
@@ -768,6 +781,8 @@ def __init__(
768781
self.input_num_filters = input_num_filters
769782
self.output_num_filters = output_num_filters
770783
self.depthwise_filters = depthwise_filters
784+
self.depthwise_stride = depthwise_stride
785+
self.depthwise_residual = depthwise_residual
771786
self.last_layer_filter = last_layer_filter
772787
self.squeeze_and_excite = squeeze_and_excite
773788
self.input_activation = input_activation
@@ -790,6 +805,8 @@ def get_config(self):
790805
"input_num_filters": self.input_num_filters,
791806
"output_num_filters": self.output_num_filters,
792807
"depthwise_filters": self.depthwise_filters,
808+
"depthwise_stride": self.depthwise_stride,
809+
"depthwise_residual": self.depthwise_residual,
793810
"last_layer_filter": self.last_layer_filter,
794811
"squeeze_and_excite": self.squeeze_and_excite,
795812
"input_activation": self.input_activation,

Diff for: keras_hub/src/models/mobilenet/mobilenet_backbone_test.py

+2
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ def setUp(self):
5353
"input_num_filters": 16,
5454
"image_shape": (32, 32, 3),
5555
"depthwise_filters": 8,
56+
"depthwise_stride": 2,
57+
"depthwise_residual": False,
5658
"squeeze_and_excite": 0.5,
5759
"last_layer_filter": 288,
5860
}

Diff for: keras_hub/src/models/mobilenet/mobilenet_image_classifier.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def __init__(
1818
self,
1919
backbone,
2020
num_classes,
21+
num_features=1024,
2122
preprocessor=None,
2223
head_dtype=None,
2324
**kwargs,
@@ -33,7 +34,7 @@ def __init__(
3334
)
3435

3536
self.output_conv = keras.layers.Conv2D(
36-
filters=1024,
37+
filters=num_features,
3738
kernel_size=(1, 1),
3839
strides=(1, 1),
3940
use_bias=True,
@@ -69,13 +70,15 @@ def __init__(
6970

7071
# === Config ===
7172
self.num_classes = num_classes
73+
self.num_features = num_features
7274

7375
def get_config(self):
7476
# Skip ImageClassifier
7577
config = Task.get_config(self)
7678
config.update(
7779
{
7880
"num_classes": self.num_classes,
81+
"num_features": self.num_features,
7982
}
8083
)
8184
return config

Diff for: keras_hub/src/models/mobilenet/mobilenet_image_classifier_test.py

+2
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ def setUp(self):
5454
input_num_filters=16,
5555
image_shape=(32, 32, 3),
5656
depthwise_filters=8,
57+
depthwise_stride=2,
58+
depthwise_residual=False,
5759
squeeze_and_excite=0.5,
5860
last_layer_filter=288,
5961
)

Diff for: keras_hub/src/models/mobilenet/mobilenet_presets.py

+38-2
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,48 @@
44
"mobilenet_v3_small_050_imagenet": {
55
"metadata": {
66
"description": (
7-
"Small MobileNet V3 model pre-trained on the ImageNet 1k "
8-
"dataset at a 224x224 resolution."
7+
"Small Mobilenet V3 model pre-trained on the ImageNet 1k "
8+
"dataset at a 224x224 resolution. Has half channel multiplier."
99
),
1010
"params": 278784,
1111
"path": "mobilenetv3",
1212
},
1313
"kaggle_handle": "kaggle://keras/mobilenetv3/keras/mobilenet_v3_small_050_imagenet/1",
1414
},
15+
"mobilenet_v3_small_100_imagenet": {
16+
"metadata": {
17+
"description": (
18+
"Small Mobilenet V3 model pre-trained on the ImageNet 1k "
19+
"dataset at a 224x224 resolution. Has baseline channel "
20+
"multiplier."
21+
),
22+
"params": 939120,
23+
"path": "mobilenetv3",
24+
},
25+
"kaggle_handle": "kaggle://keras/mobilenetv3/keras/mobilenet_v3_small_100_imagenet/1",
26+
},
27+
"mobilenet_v3_large_100_imagenet": {
28+
"metadata": {
29+
"description": (
30+
"Large Mobilenet V3 model pre-trained on the ImageNet 1k "
31+
"dataset at a 224x224 resolution. Has baseline channel "
32+
"multiplier."
33+
),
34+
"params": 2996352,
35+
"path": "mobilenetv3",
36+
},
37+
"kaggle_handle": "kaggle://keras/mobilenetv3/keras/mobilenet_v3_large_100_imagenet/1",
38+
},
39+
"mobilenet_v3_large_100_imagenet_21k": {
40+
"metadata": {
41+
"description": (
42+
"Large Mobilenet V3 model pre-trained on the ImageNet 21k "
43+
"dataset at a 224x224 resolution. Has baseline channel "
44+
"multiplier."
45+
),
46+
"params": 2996352,
47+
"path": "mobilenetv3",
48+
},
49+
"kaggle_handle": "kaggle://keras/mobilenetv3/keras/mobilenet_v3_large_100_imagenet_21k/1",
50+
},
1551
}

Diff for: keras_hub/src/utils/preset_utils.py

+1
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,7 @@ def load_task(self, cls, load_weights, load_task_weights, **kwargs):
622622
kwargs["preprocessor"] = self.load_preprocessor(
623623
cls.preprocessor_cls,
624624
)
625+
625626
return cls(**kwargs)
626627

627628
def load_preprocessor(

Diff for: keras_hub/src/utils/timm/convert_mobilenet.py

+120-44
Original file line numberDiff line numberDiff line change
@@ -8,64 +8,135 @@
88
def convert_backbone_config(timm_config):
99
timm_architecture = timm_config["architecture"]
1010

11-
if "mobilenetv3_" in timm_architecture:
12-
input_activation = "hard_swish"
13-
output_activation = "hard_swish"
14-
else:
15-
input_activation = "relu6"
16-
output_activation = "relu6"
17-
18-
if timm_architecture == "mobilenetv3_small_050":
19-
stackwise_num_blocks = [2, 3, 2, 3]
20-
stackwise_expansion = [
11+
kwargs = {
12+
"stackwise_num_blocks": [2, 3, 2, 3],
13+
"stackwise_expansion": [
2114
[40, 56],
2215
[64, 144, 144],
2316
[72, 72],
2417
[144, 288, 288],
25-
]
26-
stackwise_num_filters = [[16, 16], [24, 24, 24], [24, 24], [48, 48, 48]]
27-
stackwise_kernel_size = [[3, 3], [5, 5, 5], [5, 5], [5, 5, 5]]
28-
stackwise_num_strides = [[2, 1], [2, 1, 1], [1, 1], [2, 1, 1]]
29-
stackwise_se_ratio = [
18+
],
19+
"stackwise_num_filters": [
20+
[16, 16],
21+
[24, 24, 24],
22+
[24, 24],
23+
[48, 48, 48],
24+
],
25+
"stackwise_kernel_size": [[3, 3], [5, 5, 5], [5, 5], [5, 5, 5]],
26+
"stackwise_num_strides": [[2, 1], [2, 1, 1], [1, 1], [2, 1, 1]],
27+
"stackwise_se_ratio": [
3028
[None, None],
3129
[0.25, 0.25, 0.25],
3230
[0.25, 0.25],
3331
[0.25, 0.25, 0.25],
34-
]
35-
stackwise_activation = [
32+
],
33+
"stackwise_activation": [
3634
["relu", "relu"],
3735
["hard_swish", "hard_swish", "hard_swish"],
3836
["hard_swish", "hard_swish"],
3937
["hard_swish", "hard_swish", "hard_swish"],
40-
]
41-
stackwise_padding = [[1, 1], [2, 2, 2], [2, 2], [2, 2, 2]]
42-
output_num_filters = 1024
43-
input_num_filters = 16
44-
depthwise_filters = 8
45-
squeeze_and_excite = 0.5
46-
last_layer_filter = 288
38+
],
39+
"stackwise_padding": [[1, 1], [2, 2, 2], [2, 2], [2, 2, 2]],
40+
"output_num_filters": 1024,
41+
"input_num_filters": 16,
42+
"depthwise_filters": 8,
43+
"depthwise_stride": 2,
44+
"depthwise_residual": False,
45+
"squeeze_and_excite": 0.5,
46+
"last_layer_filter": 288,
47+
"input_activation": "relu6",
48+
"output_activation": "relu6",
49+
}
50+
51+
if "mobilenetv3_" in timm_architecture:
52+
kwargs["input_activation"] = "hard_swish"
53+
kwargs["output_activation"] = "hard_swish"
54+
55+
if timm_architecture == "mobilenetv3_small_050":
56+
pass
57+
elif timm_architecture == "mobilenetv3_small_100":
58+
modified_kwargs = {
59+
"stackwise_expansion": [
60+
[72, 88],
61+
[96, 240, 240],
62+
[120, 144],
63+
[288, 576, 576],
64+
],
65+
"stackwise_num_filters": [
66+
[24, 24],
67+
[40, 40, 40],
68+
[48, 48],
69+
[96, 96, 96],
70+
],
71+
"depthwise_filters": 16,
72+
"last_layer_filter": 576,
73+
}
74+
kwargs.update(modified_kwargs)
75+
elif timm_architecture.startswith("mobilenetv3_large_100"):
76+
modified_kwargs = {
77+
"stackwise_num_blocks": [2, 3, 4, 2, 3],
78+
"stackwise_expansion": [
79+
[64, 72],
80+
[72, 120, 120],
81+
[240, 200, 184, 184],
82+
[480, 672],
83+
[672, 960, 960],
84+
],
85+
"stackwise_num_filters": [
86+
[24, 24],
87+
[40, 40, 40],
88+
[80, 80, 80, 80],
89+
[112, 112],
90+
[160, 160, 160],
91+
],
92+
"stackwise_kernel_size": [
93+
[3, 3],
94+
[5, 5, 5],
95+
[3, 3, 3, 3],
96+
[3, 3],
97+
[5, 5, 5],
98+
],
99+
"stackwise_num_strides": [
100+
[2, 1],
101+
[2, 1, 1],
102+
[2, 1, 1, 1],
103+
[1, 1],
104+
[2, 1, 1],
105+
],
106+
"stackwise_se_ratio": [
107+
[None, None],
108+
[0.25, 0.25, 0.25],
109+
[None, None, None, None],
110+
[0.25, 0.25],
111+
[0.25, 0.25, 0.25],
112+
],
113+
"stackwise_activation": [
114+
["relu", "relu"],
115+
["relu", "relu", "relu"],
116+
["hard_swish", "hard_swish", "hard_swish", "hard_swish"],
117+
["hard_swish", "hard_swish"],
118+
["hard_swish", "hard_swish", "hard_swish"],
119+
],
120+
"stackwise_padding": [
121+
[1, 1],
122+
[2, 2, 2],
123+
[1, 1, 1, 1],
124+
[1, 1],
125+
[2, 2, 2],
126+
],
127+
"depthwise_filters": 16,
128+
"depthwise_stride": 1,
129+
"depthwise_residual": True,
130+
"squeeze_and_excite": None,
131+
"last_layer_filter": 960,
132+
}
133+
kwargs.update(modified_kwargs)
47134
else:
48135
raise ValueError(
49136
f"Currently, the architecture {timm_architecture} is not supported."
50137
)
51138

52-
return dict(
53-
input_num_filters=input_num_filters,
54-
input_activation=input_activation,
55-
depthwise_filters=depthwise_filters,
56-
squeeze_and_excite=squeeze_and_excite,
57-
stackwise_num_blocks=stackwise_num_blocks,
58-
stackwise_expansion=stackwise_expansion,
59-
stackwise_num_filters=stackwise_num_filters,
60-
stackwise_kernel_size=stackwise_kernel_size,
61-
stackwise_num_strides=stackwise_num_strides,
62-
stackwise_se_ratio=stackwise_se_ratio,
63-
stackwise_activation=stackwise_activation,
64-
stackwise_padding=stackwise_padding,
65-
output_num_filters=output_num_filters,
66-
output_activation=output_activation,
67-
last_layer_filter=last_layer_filter,
68-
)
139+
return kwargs
69140

70141

71142
def convert_weights(backbone, loader, timm_config):
@@ -120,9 +191,14 @@ def port_batch_normalization(keras_layer, hf_weight_prefix):
120191
port_conv2d(stem_block.conv1, f"{hf_name}.conv_dw")
121192
port_batch_normalization(stem_block.batch_normalization1, f"{hf_name}.bn1")
122193

123-
stem_se_block = stem_block.se_layer
124-
port_conv2d(stem_se_block.conv_reduce, f"{hf_name}.se.conv_reduce", True)
125-
port_conv2d(stem_se_block.conv_expand, f"{hf_name}.se.conv_expand", True)
194+
if stem_block.squeeze_excite_ratio:
195+
stem_se_block = stem_block.se_layer
196+
port_conv2d(
197+
stem_se_block.conv_reduce, f"{hf_name}.se.conv_reduce", True
198+
)
199+
port_conv2d(
200+
stem_se_block.conv_expand, f"{hf_name}.se.conv_expand", True
201+
)
126202

127203
port_conv2d(stem_block.conv2, f"{hf_name}.conv_pw")
128204
port_batch_normalization(stem_block.batch_normalization2, f"{hf_name}.bn2")

0 commit comments

Comments
 (0)