diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e8da536747d4..c0ad2f21a733 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -762,6 +762,7 @@ "RoFormerConfig", "RoFormerTokenizer", ], + "models.rf_detr": ["RFDetrConfig", "RFDetrDinov2WithRegistersConfig"], "models.rt_detr": ["RTDetrConfig", "RTDetrResNetConfig"], "models.rt_detr_v2": ["RTDetrV2Config"], "models.rwkv": ["RwkvConfig"], @@ -3536,6 +3537,15 @@ "load_tf_weights_in_roformer", ] ) + _import_structure["models.rf_detr"].extend( + [ + "RFDetrForObjectDetection", + "RFDetrModel", + "RFDetrPreTrainedModel", + "RFDetrDinov2WithRegistersBackbone", + "RFDetrDinov2WithRegistersPreTrainedModel", + ] + ) _import_structure["models.rt_detr"].extend( [ "RTDetrForObjectDetection", @@ -5987,6 +5997,7 @@ from .models.regnet import RegNetConfig from .models.rembert import RemBertConfig from .models.resnet import ResNetConfig + from .models.rf_detr import RFDetrConfig, RFDetrDinov2WithRegistersConfig from .models.roberta import ( RobertaConfig, RobertaTokenizer, @@ -8316,6 +8327,13 @@ ResNetModel, ResNetPreTrainedModel, ) + from .models.rf_detr import ( + RFDetrDinov2WithRegistersBackbone, + RFDetrDinov2WithRegistersPreTrainedModel, + RFDetrForObjectDetection, + RFDetrModel, + RFDetrPreTrainedModel, + ) from .models.roberta import ( RobertaForCausalLM, RobertaForMaskedLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 49ce48dd6c04..d3c044778c46 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -235,6 +235,7 @@ regnet, rembert, resnet, + rf_detr, roberta, roberta_prelayernorm, roc_bert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c7ef472882ba..27945df21f45 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -261,6 +261,7 @@ ("rembert", "RemBertConfig"), ("resnet", "ResNetConfig"), ("retribert", "RetriBertConfig"), + ("rf_detr", "RFDetrConfig"), ("roberta", "RobertaConfig"), ("roberta-prelayernorm", "RobertaPreLayerNormConfig"), ("roc_bert", "RoCBertConfig"), @@ -615,6 +616,8 @@ ("rembert", "RemBERT"), ("resnet", "ResNet"), ("retribert", "RetriBERT"), + ("rf_detr", "RF-DETR"), + ("rf_detr_dinov2_with_registers", "RF-DETR-DINOv2 with Registers"), ("roberta", "RoBERTa"), ("roberta-prelayernorm", "RoBERTa-PreLayerNorm"), ("roc_bert", "RoCBert"), @@ -766,6 +769,7 @@ ("smolvlm_vision", "smolvlm"), ("chinese_clip_vision_model", "chinese_clip"), ("rt_detr_resnet", "rt_detr"), + ("rf_detr_dinov2_with_registers", "rf_detr"), ("granitevision", "llava_next"), ] ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 05a415741413..72f06cb2d1cb 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -238,6 +238,7 @@ ("rembert", "RemBertModel"), ("resnet", "ResNetModel"), ("retribert", "RetriBertModel"), + ("rf_detr", "RFDetrModel"), ("roberta", "RobertaModel"), ("roberta-prelayernorm", "RobertaPreLayerNormModel"), ("roc_bert", "RoCBertModel"), @@ -920,6 +921,7 @@ ("deformable_detr", "DeformableDetrForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), + ("rf_detr", "RFDetrForObjectDetection"), ("rt_detr", "RTDetrForObjectDetection"), ("rt_detr_v2", "RTDetrV2ForObjectDetection"), ("table-transformer", 
"TableTransformerForObjectDetection"), @@ -1440,6 +1442,7 @@ ("nat", "NatBackbone"), ("pvt_v2", "PvtV2Backbone"), ("resnet", "ResNetBackbone"), + ("rf_detr_dinov2_with_registers", "RFDetrDinov2WithRegistersBackbone"), ("rt_detr_resnet", "RTDetrResNetBackbone"), ("swin", "SwinBackbone"), ("swinv2", "Swinv2Backbone"), diff --git a/src/transformers/models/rf_detr/__init__.py b/src/transformers/models/rf_detr/__init__.py new file mode 100644 index 000000000000..46dba76871ff --- /dev/null +++ b/src/transformers/models/rf_detr/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_rf_detr import * + from .configuration_rf_detr_dinov2_with_registers import * + from .modeling_rf_detr import * + from .modeling_rf_detr_dinov2_with_registers import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/rf_detr/configuration_rf_detr.py b/src/transformers/models/rf_detr/configuration_rf_detr.py new file mode 100644 index 000000000000..0bce9b42a0aa --- /dev/null +++ b/src/transformers/models/rf_detr/configuration_rf_detr.py @@ -0,0 +1,316 @@ +from typing import List, Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto import CONFIG_MAPPING +from .configuration_rf_detr_dinov2_with_registers import RFDetrDinov2WithRegistersConfig + + +logger = logging.get_logger(__name__) + + +class RFDetrConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RFDetrModel`]. It is used to instantiate + an RF DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the RF DETR + [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. + + TODO: Add more details about the architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + backbone_config (`PretrainedConfig` or `dict`, *optional*): + The configuration of the backbone model. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`RFDetrModel`] can detect in a single image. In case `two_stage` is set to `True`, we use + `two_stage_num_proposals` instead. 
+ max_position_embeddings (``, *optional*, defaults to 1024): + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + encoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in the encoder. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_layers (`int`, *optional*, defaults to 3): + Number of decoder layers. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in the decoder. + decoder_self_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each self-attention layer in the Transformer decoder. + decoder_cross_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each cross-attention (deformable attention) layer in the Transformer decoder. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether the model is used as an encoder/decoder or not. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1.0): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + return_intermediate (``, *optional*, defaults to `True`): + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when + `use_timm_backbone` = `True`. + num_feature_levels (`int`, *optional*, defaults to 1): + The number of input feature levels, derived from `len(scale_factors)`. 
+ encoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the encoder. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + two_stage (`bool`, *optional*, defaults to `False`): + Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of + Deformable DETR, which are further fed into the decoder for iterative bounding box refinement. + two_stage_num_proposals (`int`, *optional*, defaults to 300): + The number of region proposals to be generated, in case `two_stage` is set to `True`. + with_box_refine (`bool`, *optional*, defaults to `False`): + Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes + based on the predictions from the previous layer. + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss. + disable_custom_kernels (`bool`, *optional*, defaults to `False`): + Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom + kernels are not supported by PyTorch ONNX export. 
+ out_feature_indexes (`List`, *optional*, defaults to `[2, 5, 8, 11]`): + scale_factors (`List`, *optional*, defaults to `[1.0]`): + layer_norm (`bool`, *optional*, defaults to `False`): + projector_in_channels (`int`, *optional*, defaults to 256): + projector_num_blocks (`int`, *optional*, defaults to 3): + projector_survival_prob (`float`, *optional*, defaults to 1.0): + projector_force_drop_last_n_features (`int`, *optional*, defaults to 0): + + Examples: + + ```python + >>> from transformers import RFDetrConfig, RFDetrModel + + >>> # Initializing a Deformable DETR SenseTime/deformable-detr style configuration + >>> configuration = RFDetrConfig() + + >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration + >>> model = RFDetrModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "rf_detr" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + init_std=0.02, + init_xavier_std=1.0, + # backbone + use_timm_backbone=False, + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + backbone_kwargs=None, + # RFDetrModel + num_queries=300, + # RFDetrEncoder + encoder_layers=6, + encoder_ffn_dim=1024, + encoder_attention_heads=8, + encoder_layerdrop=0.0, + encoder_n_points=4, + # RFDetrDecoder + decoder_layers=3, + d_model=256, + attention_dropout=0.0, + dropout=0.1, + activation_function="relu", + activation_dropout=0.0, + decoder_self_attention_heads=8, + decoder_cross_attention_heads=16, + decoder_n_points=4, + decoder_ffn_dim=2048, + # LWDetr + layer_norm: bool = True, + ## + auxiliary_loss=False, + position_embedding_type="sine", + dilation=False, + two_stage=True, + two_stage_num_proposals=300, + with_box_refine=True, + class_cost=1, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.1, + focal_alpha=0.25, + disable_custom_kernels=False, + out_feature_indexes: List[int] = [2, 5, 8, 11], + scale_factors: List[float] = [1.0], + projector_in_channels: Optional[List[int]] = None, + projector_num_blocks: int = 3, # TODO rename + projector_survival_prob: float = 1.0, + projector_force_drop_last_n_features: int = 0, + projector_activation_function: str = "silu", + csp_hidden_expansion: float = 0.5, + bottleneck_hidden_expansion: float = 0.5, + batch_norm_eps: float = 1e-5, + bbox_reparam: bool = True, + is_encoder_decoder=True, + num_groups=13, + light_reference_point_refinement: bool = True, + **kwargs, + ): + if backbone_config is None and backbone is None: + logger.info( + "`backbone_config` and `backbone` are `None`. Initializing the config with the default `RTDetr-ResNet` backbone." 
+ ) + backbone_config = RFDetrDinov2WithRegistersConfig( + out_features=[f"stage{i}" for i in out_feature_indexes], + return_dict=False, + ) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.pop("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.use_timm_backbone = use_timm_backbone + self.backbone_config = backbone_config + self.num_queries = num_queries + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_self_attention_heads = decoder_self_attention_heads + self.decoder_cross_attention_heads = decoder_cross_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.backbone_kwargs = backbone_kwargs + self.dilation = dilation + # deformable attributes + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.two_stage = two_stage + self.two_stage_num_proposals = two_stage_num_proposals + self.with_box_refine = with_box_refine + if two_stage is True and with_box_refine is False: + raise ValueError("If two_stage is True, with_box_refine must be True.") + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.eos_coefficient = eos_coefficient + self.focal_alpha = focal_alpha + self.disable_custom_kernels = disable_custom_kernels + + self.scale_factors = [1.0] if scale_factors is None else scale_factors + assert len(self.scale_factors) > 0, "scale_factors must be a list of at least one element" + assert sorted(self.scale_factors, reverse=True) == self.scale_factors, "scale_factors must be reverse sorted" + assert all(scale in [2.0, 1.0, 0.5, 0.25] for scale in self.scale_factors), ( + "scale_factors must be a consecutive list subset of [2.0, 1.0, 0.5, 0.25]" + ) + + self.num_feature_levels = len(scale_factors) + self.layer_norm = layer_norm + self.projector_in_channels = ( + projector_in_channels + if projector_in_channels is not None + else [backbone_config.hidden_size] * len(out_feature_indexes) + ) + assert len(self.projector_in_channels) == len(out_feature_indexes), ( + "projector_in_channels must have the same length as out_feature_indexes" + ) + self.projector_num_blocks = projector_num_blocks + self.projector_survival_prob = projector_survival_prob + self.projector_force_drop_last_n_features = projector_force_drop_last_n_features + self.projector_activation_function = projector_activation_function + 
self.csp_hidden_expansion = csp_hidden_expansion + self.bottleneck_expansion = bottleneck_hidden_expansion + self.batch_norm_eps = batch_norm_eps + self.encoder_hidden_dim = backbone_config.hidden_size + self.bbox_reparam = bbox_reparam + self.num_groups = num_groups + self.light_reference_point_refinement = light_reference_point_refinement + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + +__all__ = ["RFDetrConfig"] diff --git a/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py new file mode 100644 index 000000000000..b43b8ec4dc04 --- /dev/null +++ b/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py @@ -0,0 +1,151 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_rf_detr_dinov2_with_registers.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from ...configuration_utils import PretrainedConfig +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +class RFDetrDinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RFDetrDinov2WithRegistersModel`]. It is used to instantiate an + RFDetrDinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DINOv2 with Registers + [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the hidden size of the MLPs relative to the `hidden_size`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value to use for layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Stochastic depth rate per sample (when applied in the main path of residual layers). + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + num_register_tokens (`int`, *optional*, defaults to 4): + Number of register tokens to use. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + apply_layernorm (`bool`, *optional*, defaults to `True`): + Whether to apply layer normalization to the feature maps in case the model is used as backbone. + reshape_hidden_states (`bool`, *optional*, defaults to `True`): + Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in + case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, + seq_len, hidden_size)`. 
+ + Example: + + ```python + >>> from transformers import RFDetrDinov2WithRegistersConfig, RFDetrDinov2WithRegistersModel + + >>> # Initializing a RFDetrDinov2WithRegisters base style configuration + >>> configuration = RFDetrDinov2WithRegistersConfig() + + >>> # Initializing a model (with random weights) from the base style configuration + >>> model = RFDetrDinov2WithRegistersModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "rf_detr_dinov2_with_registers" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + num_register_tokens=4, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + num_windows: int = 4, + window_block_indexes=None, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.num_register_tokens = num_register_tokens + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + + self.num_windows = num_windows + window_block_indexes = set(range(self._out_indices[-1] + 1)) + window_block_indexes.difference_update(self._out_indices) + window_block_indexes = list(window_block_indexes) + self.window_block_indexes = window_block_indexes + + +__all__ = ["RFDetrDinov2WithRegistersConfig"] diff --git a/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py b/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py new file mode 100644 index 000000000000..8f920f19b6fc --- /dev/null +++ b/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py @@ -0,0 +1,337 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
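Reviewer note on `RFDetrDinov2WithRegistersConfig.__init__` above: the `window_block_indexes` attribute is recomputed from the aligned `out_indices`, so the `window_block_indexes` constructor argument is effectively ignored. A minimal hand trace of that derivation, assuming `out_indices=[2, 5, 8, 11]` (the value passed by the conversion script further down, not the config's own default, which falls back to the last stage only):

```python
# Hand trace of the window_block_indexes derivation from __init__ above.
out_indices = [2, 5, 8, 11]  # assumption: value used by the conversion script

window_block_indexes = set(range(out_indices[-1] + 1))  # {0, 1, ..., 11}
window_block_indexes.difference_update(out_indices)     # drop the stages that are output
print(sorted(window_block_indexes))                     # [0, 1, 3, 4, 6, 7, 9, 10]
```

In other words, every block up to the last output stage runs windowed attention except the blocks whose features are taken as backbone outputs.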
+"""Convert RF Detr checkpoints to Hugging Face Transformers format.""" + +import argparse +import json +import re +from pathlib import Path + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image +from torchvision import transforms + +from transformers import ( + AutoConfig, + RFDetrConfig, + RFDetrDinov2WithRegistersConfig, + RFDetrForObjectDetection, + RTDetrImageProcessor, + RTDetrImageProcessorFast, +) +from transformers.utils import logging + + +torch.set_printoptions(precision=6, sci_mode=False) + + +def custom_repr(self): + # return f"{tuple(self.shape)} {self.flatten()[-10:].tolist()} {original_repr(self)}" + return f"{tuple(self.shape)} {self.flatten()[-3:].tolist()}" + + +original_repr = torch.Tensor.__repr__ +torch.Tensor.__repr__ = custom_repr + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_rt_detr_v2_config(model_name: str) -> RFDetrConfig: + if model_name in ["rf-detr-base", "rf-detr-base-2"]: + dinov2_size = "small" + elif model_name == "rf-detr-large": + dinov2_size = "base" + + base_backbone_model_name = f"facebook/dinov2-with-registers-{dinov2_size}" + num_register_tokens = 0 + out_indices = [2, 5, 8, 11] + base_backbone = AutoConfig.from_pretrained( + base_backbone_model_name, + num_register_tokens=num_register_tokens, + out_indices=out_indices, + ) + + num_windows = 4 + backbone_config = RFDetrDinov2WithRegistersConfig( + **base_backbone.to_dict(), + num_windows=num_windows, + ) + + scale_factors = [2.0, 0.5] + d_model = 384 + decoder_self_attention_heads = 12 + decoder_cross_attention_heads = 24 + num_labels = 91 + config = RFDetrConfig( + backbone_config=backbone_config, + scale_factors=scale_factors, + d_model=d_model, + decoder_self_attention_heads=decoder_self_attention_heads, + decoder_cross_attention_heads=decoder_cross_attention_heads, + num_labels=num_labels, + ) + + config.num_labels = 80 + repo_id = "huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + if model_name in ["rf-detr-base", "rf-detr-base-2"]: + pass + # config.backbone_config.hidden_sizes = [64, 128, 256, 512] + # config.backbone_config.depths = [2, 2, 2, 2] + # config.backbone_config.layer_type = "basic" + # config.encoder_in_channels = [128, 256, 512] + # config.hidden_expansion = 0.5 + # config.decoder_layers = 3 + elif model_name == "rf-detr-large": + pass + # config.backbone_config.hidden_sizes = [64, 128, 256, 512] + # config.backbone_config.depths = [3, 4, 6, 3] + # config.backbone_config.layer_type = "basic" + # config.encoder_in_channels = [128, 256, 512] + # config.hidden_expansion = 0.5 + # config.decoder_layers = 4 + + return config + + +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + r"backbone.0.encoder.encoder": r"model.backbone.conv_encoder.model", + r"backbone.0.projector.stages_sampling.(\d+).(\d+).(\d+).(weight|bias)": r"model.backbone.conv_encoder.projector.scale_layers.\1.sampling_layers.\2.layers.\3.\4", + r"backbone.0.projector.stages_sampling.(\d+).(\d+).(\d+).conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.sampling_layers.\2.layers.\3.conv", + r"backbone.0.projector.stages_sampling.(\d+).(\d+).(\d+).bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.sampling_layers.\2.layers.\3.norm", + 
r"backbone.0.projector.stages.(\d+).0.cv1.conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.conv1.conv", + r"backbone.0.projector.stages.(\d+).0.cv1.bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.conv1.norm", + r"backbone.0.projector.stages.(\d+).0.cv2.conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.conv2.conv", + r"backbone.0.projector.stages.(\d+).0.cv2.bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.conv2.norm", + r"backbone.0.projector.stages.(\d+).0.m.(\d+).cv1.conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.bottlenecks.\2.conv1.conv", + r"backbone.0.projector.stages.(\d+).0.m.(\d+).cv1.bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.bottlenecks.\2.conv1.norm", + r"backbone.0.projector.stages.(\d+).0.m.(\d+).cv2.conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.bottlenecks.\2.conv2.conv", + r"backbone.0.projector.stages.(\d+).0.m.(\d+).cv2.bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.bottlenecks.\2.conv2.norm", + r"backbone.0.projector.stages.(\d+).1": r"model.backbone.conv_encoder.projector.scale_layers.\1.layer_norm", + r"transformer.decoder.layers.(\d+).self_attn.out_proj": r"model.decoder.layers.\1.self_attn.out_proj", + r"transformer.decoder.layers.(\d+).norm1": r"model.decoder.layers.\1.self_attn_layer_norm", + r"transformer.decoder.layers.(\d+).cross_attn.sampling_offsets": r"model.decoder.layers.\1.encoder_attn.sampling_offsets", + r"transformer.decoder.layers.(\d+).cross_attn.attention_weights": r"model.decoder.layers.\1.encoder_attn.attention_weights", + r"transformer.decoder.layers.(\d+).cross_attn.value_proj": r"model.decoder.layers.\1.encoder_attn.value_proj", + r"transformer.decoder.layers.(\d+).cross_attn.output_proj": r"model.decoder.layers.\1.encoder_attn.output_proj", + r"transformer.decoder.layers.(\d+).norm2": r"model.decoder.layers.\1.encoder_attn_layer_norm", + r"transformer.decoder.layers.(\d+).linear1": r"model.decoder.layers.\1.fc1", + r"transformer.decoder.layers.(\d+).linear2": r"model.decoder.layers.\1.fc2", + r"transformer.decoder.layers.(\d+).norm3": r"model.decoder.layers.\1.final_layer_norm", + r"transformer.decoder.norm": r"model.decoder.norm", + r"transformer.decoder.ref_point_head": r"model.decoder.reference_points_head", + r"refpoint_embed": r"model.reference_point_embeddings", + r"transformer.enc_output": r"model.enc_output", + r"transformer.enc_output_norm": r"model.enc_output_norm", + r"transformer.enc_out_bbox_embed": r"model.enc_out_bbox_embed", + r"transformer.enc_out_class_embed": r"model.enc_out_class_embed", + r"query_feat": r"model.query_position_embeddings", +} + + +def convert_old_keys_to_new_keys(state_dict_keys: dict = None): + # Use the mapping to rename keys + for original_key, converted_key in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + for key in list(state_dict_keys.keys()): + new_key = re.sub(original_key, converted_key, key) + if new_key != key: + state_dict_keys[new_key] = state_dict_keys.pop(key) + + return state_dict_keys + + +def read_in_q_k_v(state_dict, config: RFDetrConfig): + prefix = "transformer.decoder.layers" + decoder_hidden_dim = config.d_model + + for i in range(config.decoder_layers): + # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"{prefix}.{i}.self_attn.in_proj_weight") + in_proj_bias = 
state_dict.pop(f"{prefix}.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:decoder_hidden_dim, :] + state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:decoder_hidden_dim] + state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ + decoder_hidden_dim : 2 * decoder_hidden_dim, : + ] + state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[ + decoder_hidden_dim : 2 * decoder_hidden_dim + ] + state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-decoder_hidden_dim:, :] + state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-decoder_hidden_dim:] + + +def copy_weights(state_dict, config): + for key, value in dict(state_dict.items()).items(): + if key.startswith("bbox_embed"): + new_key = f"model.decoder.{key}" + state_dict[new_key] = value + if key.startswith("class_embed"): + new_key = f"model.decoder.{key}" + state_dict[new_key] = value + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def write_model_and_image_processor(model_name, output_dir, push_to_hub, repo_id): + """ + Copy/paste/tweak model's weights to our RTDETR structure. + """ + + # load default config + config = get_rt_detr_v2_config(model_name) + + # load original model from torch hub + model_name_to_checkpoint_url = { + "rf-detr-base": "https://storage.googleapis.com/rfdetr/rf-detr-base-coco.pth", + # below is a less converged model that may be better for finetuning but worse for inference + "rf-detr-base-2": "https://storage.googleapis.com/rfdetr/rf-detr-base-2.pth", + "rf-detr-large": "https://storage.googleapis.com/rfdetr/rf-detr-large.pth", + } + logger.info(f"Converting model {model_name}...") + state_dict = torch.hub.load_state_dict_from_url(model_name_to_checkpoint_url[model_name], map_location="cpu")[ + "model" + ] + original_state_dict = state_dict.copy() + # rename keys + state_dict = convert_old_keys_to_new_keys(state_dict) + for key in state_dict.copy().keys(): + if key.startswith("query_feat"): + del state_dict[key] + + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict, config) + # certain weights are copied from the RFDetrForObjectDetection to the RFDetrDecoder + copy_weights(state_dict, config) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + for key in state_dict.copy().keys(): + if key.endswith("num_batches_tracked"): + del state_dict[key] + + # finally, create HuggingFace model and load state dict + model = RFDetrForObjectDetection(config) + target_state_dict = model.state_dict() + model.load_state_dict(state_dict) + loaded_state_dict = model.state_dict() + model.eval() + + # load image processor + image_processor = RTDetrImageProcessorFast(size={"height": 560, "width": 560}, do_normalize=True) + + # prepare image + img = prepare_img() + + # preprocess image + transformations = transforms.Compose( + [ + transforms.Resize([560, 560], interpolation=transforms.InterpolationMode.BILINEAR), + transforms.ToTensor(), + ] + ) + original_pixel_values = transformations(img).unsqueeze(0) # insert batch dimension + + encoding = image_processor(images=img, return_tensors="pt") 
+ pixel_values = encoding["pixel_values"] + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + pixel_values = pixel_values.to(device) + + # Pass image by the model + with torch.no_grad(): + outputs = model(pixel_values) + + if model_name == "rf-detr-base": + expected_slice_logits = torch.tensor( + [[-3.7045, -5.1913, -6.1787], [-4.0106, -9.3450, -5.2043], [-4.1287, -4.7463, -5.8634]] + ) + expected_slice_boxes = torch.tensor( + [[0.2582, 0.5497, 0.4764], [0.1684, 0.1985, 0.2120], [0.7665, 0.4146, 0.4669]] + ) + elif model_name == "rf-detr-base-2": + expected_slice_logits = torch.tensor( + [[-4.6108, -5.9453, -3.8505], [-3.8702, -6.1136, -5.5677], [-3.7790, -6.4538, -5.9449]] + ) + expected_slice_boxes = torch.tensor( + [[0.1691, 0.1984, 0.2118], [0.2594, 0.5506, 0.4736], [0.7669, 0.4136, 0.4654]] + ) + elif model_name == "rf-detr-large": + expected_slice_logits = torch.tensor( + [[-4.7881, -4.6754, -6.1624], [-5.4441, -6.6486, -4.3840], [-3.5455, -4.9318, -6.3544]] + ) + expected_slice_boxes = torch.tensor( + [[0.2588, 0.5487, 0.4747], [0.5497, 0.2760, 0.0573], [0.7688, 0.4133, 0.4634]] + ) + else: + raise ValueError(f"Unknown rf_detr_name: {model_name}") + assert torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits.to(outputs.logits.device), atol=1e-4) + assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes.to(outputs.pred_boxes.device), atol=1e-3) + + if output_dir is not None: + Path(output_dir).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {output_dir}") + model.save_pretrained(output_dir) + print(f"Saving image processor to {output_dir}") + image_processor.save_pretrained(output_dir) + + if push_to_hub: + # Upload model, image processor and config to the hub + logger.info("Uploading PyTorch model and image processor to the hub...") + config.push_to_hub( + repo_id=repo_id, + commit_message="Add config from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py", + ) + model.push_to_hub( + repo_id=repo_id, + commit_message="Add model from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py", + ) + image_processor.push_to_hub( + repo_id=repo_id, + commit_message="Add image processor from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py", + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", + default="rf-detr-large", + type=str, + help="model_name of the checkpoint you'd like to convert.", + ) + parser.add_argument("--output_dir", default=None, type=str, help="Location to write HF model and image processor") + parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.") + parser.add_argument( + "--repo_id", + type=str, + help="repo_id where the model will be pushed to.", + ) + args = parser.parse_args() + write_model_and_image_processor(args.model_name, args.output_dir, args.push_to_hub, args.repo_id) diff --git a/src/transformers/models/rf_detr/modeling_rf_detr.py b/src/transformers/models/rf_detr/modeling_rf_detr.py new file mode 100644 index 000000000000..c719f942b46c --- /dev/null +++ b/src/transformers/models/rf_detr/modeling_rf_detr.py @@ -0,0 +1,2118 @@ +import copy +import math +import warnings +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from ...activations import ACT2CLS, ACT2FN +from ...integrations.hub_kernels 
import use_kernel_forward_from_hub +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import meshgrid +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_timm_available, + logging, + replace_return_docstrings, +) +from ...utils.backbone_utils import load_backbone +from .configuration_rf_detr import RFDetrConfig + + +logger = logging.get_logger(__name__) + + +if is_timm_available(): + pass + +_CONFIG_FOR_DOC = "RFDetrConfig" +_CHECKPOINT_FOR_DOC = "sensetime/deformable-detr" # TODO + + +@use_kernel_forward_from_hub("MultiScaleDeformableAttention") +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttention +class MultiScaleDeformableAttention(nn.Module): + def forward( + self, + value: Tensor, + value_spatial_shapes: Tensor, + value_spatial_shapes_list: List[Tuple], + level_start_index: Tensor, + sampling_locations: Tensor, + attention_weights: Tensor, + im2col_step: int, + ): + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height * width for height, width in value_spatial_shapes_list], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes_list): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id] + .flatten(2) + .transpose(1, 2) + .reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, + sampling_grid_l_, + mode="bilinear", + padding_mode="zeros", + align_corners=False, + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->RFDetr +class RFDetrDecoderOutput(ModelOutput): + """ + Base class for outputs of the RFDetrDecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + """ + + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->RFDetr +class RFDetrModelOutput(ModelOutput): + """ + Base class for outputs of the Deformable DETR encoder-decoder model. + + Args: + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. 
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + """ + + init_reference_points: torch.FloatTensor = None + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrObjectDetectionOutput with DeformableDetr->RFDetr +class RFDetrObjectDetectionOutput(ModelOutput): + """ + Output type of [`RFDetrForObjectDetection`]. 
+ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~RFDetrProcessor.post_process_object_detection`] to retrieve the + unnormalized bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. 
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4, + 4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average + in the self-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + init_reference_points: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional = None + enc_outputs_coord_logits: Optional = None + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr._get_clones +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.inverse_sigmoid +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrFrozenBatchNorm2d with DeformableDetr->RFDetr +class RFDetrFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. 
+ """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.replace_batch_norm with DeformableDetr->RFDetr +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `RFDetrFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = RFDetrFrozenBatchNorm2d(module.num_features) + + if not module.weight.device == torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +class RFDetrConvNormLayer(nn.Module): + def __init__( + self, + config: RFDetrConfig, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int, + padding: int = None, + activation: str = None, + ): + super().__init__() + activation = config.projector_activation_function if activation is None else activation + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2 if padding is None else padding, + bias=False, + ) + self.norm = ( + RFDetrLayerNorm(out_channels, data_format="channels_first") + if config.layer_norm + else nn.BatchNorm2d(out_channels, config.batch_norm_eps) + ) + self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() + + def forward(self, hidden_state): + hidden_state = self.conv(hidden_state) + hidden_state = self.norm(hidden_state) + hidden_state = self.activation(hidden_state) + return hidden_state + + +# Copied from transformers.models.rt_detr.modeling_rt_detr.RTDetrRepVggBlock with RTDetr->RFDetr, activation_function->projector_activation_function +class RFDetrCSPRepBottleneck(nn.Module): + """ + RepVGG architecture block introduced by the work "RepVGG: Making VGG-style ConvNets Great Again". 
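+
+    The forward pass applies two successive 3x3 conv-norm layers and adds the input back, i.e. it computes
+    `hidden_states + conv2(conv1(hidden_states))`.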
+ """ + + def __init__(self, config: RFDetrConfig, hidden_channels: int): + super().__init__() + + activation = config.projector_activation_function + self.conv1 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 3, 1) + self.conv2 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 3, 1) + self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() + + def forward(self, hidden_states): + output_states = self.conv1(hidden_states) + output_states = self.conv2(output_states) + return hidden_states + output_states + + +class RFDetrCSPRepLayer(nn.Module): + """ + Cross Stage Partial (CSP) network layer with RepVGG blocks. + """ + + def __init__(self, config: RFDetrConfig, in_channels: int): + super().__init__() + + out_channels = config.d_model + num_blocks = config.projector_num_blocks + + self.hidden_channels = int(out_channels * config.csp_hidden_expansion) + self.conv1 = RFDetrConvNormLayer(config, in_channels, 2 * self.hidden_channels, 1, 1) + self.conv2 = RFDetrConvNormLayer(config, (2 + num_blocks) * self.hidden_channels, out_channels, 1, 1) + self.bottlenecks = nn.ModuleList( + [RFDetrCSPRepBottleneck(config, self.hidden_channels) for _ in range(num_blocks)] + ) + + def forward(self, hidden_states): + hidden_states = self.conv1(hidden_states) + all_hidden_states = list(hidden_states.split(self.hidden_channels, 1)) + hidden_states = all_hidden_states[-1] + for bottleneck in self.bottlenecks: + new_hidden_states = bottleneck(hidden_states) + all_hidden_states.append(new_hidden_states) + # all_hidden_states.extend(bottleneck(hidden_states) for bottleneck in self.bottlenecks) + hidden_states = torch.cat(all_hidden_states, 1) + hidden_states = self.conv2(hidden_states) + return hidden_states + + +class RFDetrScaleProjectorLayer(nn.Module): + def __init__(self, config: RFDetrConfig, scale: float, in_channels: int): + super().__init__() + self.use_extra_pool = False + layers = [] + if scale == 2.0: + layers.append(nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)) + elif scale == 1.0: + pass + elif scale == 0.5: + layers.append(RFDetrConvNormLayer(config, in_channels, in_channels, 3, 2, activation="relu")) + else: + raise NotImplementedError("Unsupported scale_factor:{}".format(scale)) + self.layers = nn.Sequential(*layers) + + def forward(self, hidden_state): + return self.layers(hidden_state) + + +# Copied from transformers.models.convnext.modeling_convnext.ConvNextLayerNorm with ConvNext->RFDetr +class RFDetrLayerNorm(nn.Module): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, + width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). 
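+
+    For `channels_first` inputs the mean and variance are computed over the channel dimension in float32, the
+    normalized tensor is cast back to the input dtype, and the per-channel `weight` and `bias` are then applied.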
+ """ + + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {self.data_format}") + self.normalized_shape = (normalized_shape,) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.data_format == "channels_last": + x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + elif self.data_format == "channels_first": + input_dtype = x.dtype + x = x.float() + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = x.to(dtype=input_dtype) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class RFDetrScaleProjector(nn.Module): + def __init__(self, config: RFDetrConfig, scale: float, in_channels: List[int]): + super().__init__() + + self.sampling_layers = nn.ModuleList( + [RFDetrScaleProjectorLayer(config, scale, in_channel) for in_channel in in_channels] + ) + + in_dim = int(sum(in_channel // max(1, scale) for in_channel in in_channels)) + self.stage_layer = RFDetrCSPRepLayer(config, in_dim) + self.layer_norm = RFDetrLayerNorm(config.d_model, data_format="channels_first") + + def forward(self, hidden_states): + features = [layer(hidden_state) for layer, hidden_state in zip(self.sampling_layers, hidden_states)] + features = torch.cat(features, dim=1) + hidden_state = self.stage_layer(features) + hidden_state = self.layer_norm(hidden_state) + return hidden_state + + +class RFDetrMultiScaleProjector(nn.Module): + """ + This module implements MultiScaleProjector in :paper:`lwdetr`. + It creates pyramid features built on top of the input feature map. + """ + + def __init__( + self, + config: RFDetrConfig, + ): + """ + Args: + net (Backbone): module representing the subnetwork backbone. + Must be a subclass of :class:`Backbone`. + out_channels (int): number of channels in the output feature maps. + scale_factors (list[float]): list of scaling factors to upsample or downsample + the input features for creating pyramid features. + """ + super().__init__() + in_channels = config.projector_in_channels + self.survival_prob = config.projector_survival_prob + self.force_drop_last_n_features = config.projector_force_drop_last_n_features + scale_factors = config.scale_factors + + self.scale_layers = nn.ModuleList( + [RFDetrScaleProjector(config, scale, in_channels) for scale in scale_factors] + ) + + def forward(self, features): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: + mapping from feature map name to pyramid feature map tensor + in high to low resolution order. Returned feature names follow the FPN + convention: "p", where stage has stride = 2 ** stage e.g., + ["p2", "p3", ..., "p6"]. 
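+
+        Note: this implementation returns the projected feature maps as a plain list, with one entry per scale
+        factor in `config.scale_factors`, rather than a name-to-tensor mapping.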
+ """ + num_features = len(features) + if self.survival_prob < 1.0 and self.training: + final_drop_prob = 1 - self.survival_prob + drop_p = np.random.uniform() + for i in range(1, num_features): + critical_drop_prob = i * (final_drop_prob / (num_features - 1)) + if drop_p < critical_drop_prob: + features[i][:] = 0 + elif self.force_drop_last_n_features > 0: + for i in range(self.force_drop_last_n_features): + # don't do it inplace to ensure the compiler can optimize out the backbone layers + features[-(i + 1)] = torch.zeros_like(features[-(i + 1)]) + + outputs = [layer(features) for layer in self.scale_layers] + return outputs + + +class RFDetrConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by RFDetrFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + backbone = load_backbone(config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = None + if config.backbone is not None: + backbone_model_type = config.backbone + elif config.backbone_config is not None: + backbone_model_type = config.backbone_config.model_type + else: + raise ValueError("Either `backbone` or `backbone_config` should be provided in the config") + + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + self.projector = RFDetrMultiScaleProjector(config) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + features = self.projector(features) + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrConvModel with DeformableDetr->RFDetr +class RFDetrConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. 
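+
+    The forward pass returns the `(feature_map, mask)` tuples produced by the convolutional encoder together with a
+    position embedding for each feature map, cast to the feature map's dtype.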
+ """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->RFDetr +class RFDetrSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=pixel_values.dtype) + x_embed = pixel_mask.cumsum(2, dtype=pixel_values.dtype) + if self.normalize: + eps = 1e-6 + y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=pixel_values.dtype, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLearnedPositionEmbedding with DeformableDetr->RFDetr +class RFDetrLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. 
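+
+    Row and column indices (up to 50 each) are embedded separately and the two embeddings are concatenated, yielding
+    a position embedding of shape `(batch_size, 2 * embedding_dim, height, width)`.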
+ """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat( + [ + x_emb.unsqueeze(0).repeat(height, 1, 1), + y_emb.unsqueeze(1).repeat(1, width, 1), + ], + dim=-1, + ) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.build_position_encoding with DeformableDetr->RFDetr +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = RFDetrSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = RFDetrLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->RFDetr +class RFDetrMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Deformable DETR. + """ + + def __init__(self, config: RFDetrConfig, num_heads: int, n_points: int): + super().__init__() + + self.attn = MultiScaleDeformableAttention() + + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in RFDetrMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." 
+ ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + total_elements = sum(height * width for height, width in spatial_shapes_list) + if total_elements != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + num_coordinates = reference_points.shape[-1] + if num_coordinates == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif num_coordinates == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + output = self.attn( + value, + spatial_shapes, + spatial_shapes_list, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + + output = self.output_proj(output) + + return output, attention_weights + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->RFDetr +class RFDetrMultiheadAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. 
+ + Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, target_len, embed_dim = hidden_states.size() + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # get queries, keys and values + query_states = self.q_proj(hidden_states) * self.scaling + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != ( + batch_size * self.num_heads, + target_len, + self.head_dim, + ): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->RFDetr +class RFDetrEncoderLayer(nn.Module): + def __init__(self, config: RFDetrConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = RFDetrMultiscaleDeformableAttention( + config, + num_heads=config.encoder_attention_heads, + n_points=config.encoder_n_points, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. 
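+        # In the encoder the deformable attention attends over the flattened multi-scale features themselves, so the
+        # layer input doubles as `encoder_hidden_states` and its mask as `encoder_attention_mask`; each position only
+        # samples a small set of points around its reference point instead of attending to the full sequence.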
+ hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->RFDetr +class RFDetrDecoderLayer(nn.Module): + def __init__(self, config: RFDetrConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = RFDetrMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_self_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention + self.encoder_attn = RFDetrMultiscaleDeformableAttention( + config, + num_heads=config.decoder_cross_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(seq_len, batch, embed_dim)`. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. 
+ encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->RFDetr +class RFDetrPreTrainedModel(PreTrainedModel): + config_class = RFDetrConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = [ + r"RFDetrConvEncoder", + r"RFDetrEncoderLayer", + r"RFDetrDecoderLayer", + ] + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, RFDetrLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, RFDetrMultiscaleDeformableAttention): + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(module.n_heads, 1, 1, 2) + .repeat(1, module.n_levels, module.n_points, 1) + ) + for i in range(module.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + module.sampling_offsets.bias = 
nn.Parameter(grid_init.view(-1)) + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(module.value_proj.weight.data) + nn.init.constant_(module.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(module.output_proj.weight.data) + nn.init.constant_(module.output_proj.bias.data, 0.0) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + + +RFDETR_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`RFDetrConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +RFDETR_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`AutoImageProcessor`]. See [`DeformableDetrImageProcessor.__call__`] + for details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. 
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->RFDetr +class RFDetrDecoder(RFDetrPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`RFDetrDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Deformable DETR: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. + + Args: + config: RFDetrConfig + """ + + def __init__(self, config: RFDetrConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([RFDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.gradient_checkpointing = False + + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR + self.bbox_embed = None + self.class_embed = None + + self.config = config + self.reference_points_head = RFDetrMLPPredictionHead(2 * config.d_model, config.d_model, config.d_model, 2) + + self.norm = RFDetrLayerNorm(config.d_model, data_format="channels_first") + + # Initialize weights and apply final processing + self.post_init() + + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=proposals.dtype, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def get_reference_points(self, reference_points_embeds, valid_ratios): + obj_center = reference_points_embeds[..., :4] + + refpoints_input = ( + obj_center[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) # bs, nq, nlevel, 4 + query_sine_embed = self.get_proposal_pos_embed(refpoints_input[:, :, 0, :], self.d_model / 2) # bs, nq, 256*2 + position_query_embeddings = self.reference_points_head(query_sine_embed) + return refpoints_input, position_query_embeddings + + def forward( + self, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + reference_points_embeddings=None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + 
output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+                The query embeddings that are passed into the decoder.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                in `[0, 1]`:
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+            position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` when `config.two_stage=True`, else `(batch_size, num_queries, 2)`, *optional*):
+                Reference points in range `[0, 1]`, with top-left at (0, 0) and bottom-right at (1, 1), including the
+                padding area.
+            spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of the feature maps.
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+                Indices for the start of each feature level. In range `[0, sequence_length]`.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+                Ratio of valid area in each feature level.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + + for idx, decoder_layer in enumerate(self.layers): + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + reference_points_input = ( + reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) + elif reference_points.shape[-1] == 2: + reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + else: + raise ValueError("Reference points' last dimension must be of size 2") + + if self.config.bbox_reparam: + query_sine_embed = self.get_proposal_pos_embed( + reference_points_input[:, :, 0, :], self.d_model / 2 + ) # bs, nq, 256*2 + position_embeddings = self.reference_points_head(query_sine_embed) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + position_embeddings, + reference_points_input, + spatial_shapes, + spatial_shapes_list, + level_start_index, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + encoder_hidden_states=encoder_hidden_states, + reference_points=reference_points_input, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + num_coordinates = reference_points.shape[-1] + if self.config.bbox_reparam: + new_reference_points_cxcy = ( + tmp[..., :2] * reference_points_embeddings[..., 2:] + reference_points_embeddings[..., :2] + ) + new_reference_points_wh = tmp[..., 2:].exp() * reference_points_embeddings[..., 2:] + new_reference_points = torch.concat([new_reference_points_cxcy, new_reference_points_wh], dim=-1) + else: + if num_coordinates == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + elif num_coordinates == 2: + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" + ) + reference_points = new_reference_points.detach() + + intermediate += (hidden_states,) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # Keep batch_size as first dimension + 
intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + return RFDetrDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + """ + The bare RF DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. + """, + RFDETR_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel with DeformableDetr->RFDetr, DEFORMABLE_DETR->RFDETR +class RFDetrModel(RFDetrPreTrainedModel): + def __init__(self, config: RFDetrConfig): + super().__init__(config) + + self.bbox_reparam = config.bbox_reparam + self.two_stage = config.two_stage + self.num_groups = config.num_groups + self.num_queries = config.num_queries + self.d_model = config.d_model + + # Create backbone + positional encoding + backbone = RFDetrConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = RFDetrConvModel(backbone, position_embeddings) + + self.query_position_embeddings = nn.Embedding(config.num_queries * config.num_groups, config.d_model) + + self.reference_point_embeddings = nn.Embedding(config.num_queries * config.num_groups, 4) + nn.init.constant_(self.reference_point_embeddings.weight.data, 0) + + self.decoder = RFDetrDecoder(config) + + if config.two_stage: + self.enc_output = nn.ModuleList( + [nn.Linear(config.d_model, config.d_model) for _ in range(config.num_groups)] + ) + self.enc_output_norm = nn.ModuleList([nn.LayerNorm(config.d_model) for _ in range(config.num_groups)]) + # self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) + # self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) + else: + self.reference_points = nn.Linear(config.d_model, 2) + + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + def get_valid_ratio(self, mask, dtype=torch.float32): + """Get the valid ratio of all feature maps.""" + + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_height = valid_height.to(dtype) / height + valid_ratio_width = valid_width.to(dtype) / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1) + return valid_ratio + + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=proposals.dtype, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / 
num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. + + Args: + enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. + padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. + spatial_shapes (List[Tuple[int, int]]): Spatial shapes of the feature maps. + + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. + """ + batch_size = enc_output.shape[0] + proposals = [] + _cur = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace( + 0, + height - 1, + height, + dtype=enc_output.dtype, + device=enc_output.device, + ), + torch.linspace( + 0, + width - 1, + width, + dtype=enc_output.dtype, + device=enc_output.device, + ), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + proposals.append(proposal) + _cur += height * width + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + # object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals + + @add_start_docstrings_to_model_forward(RFDETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=RFDetrModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = 
None,
+    ) -> Union[Tuple[torch.FloatTensor], RFDetrModelOutput]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, RFDetrModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")
+        >>> model = RFDetrModel.from_pretrained("SenseTime/deformable-detr")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 300, 256]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device)
+
+        # Extract multi-scale feature maps, all projected to `config.d_model` channels.
+        # First, send pixel_values + pixel_mask through the backbone to obtain the features,
+        # which is a list of (feature_map, mask) tuples.
+        features, _ = self.backbone(pixel_values, pixel_mask)
+
+        # The multi-scale projector inside the conv encoder already maps each feature map to d_model channels (256 by default)
+        sources = []
+        masks = []
+        for source, mask in features:
+            sources.append(source)
+            masks.append(mask)
+            if mask is None:
+                raise ValueError("No attention mask was provided")
+
+        # Create queries
+        query_embeds = None
+        if not self.config.two_stage:
+            if self.training:
+                query_embeds = self.query_position_embeddings.weight
+            else:
+                query_embeds = self.query_position_embeddings.weight[: self.num_queries]
+
+        if self.training:
+            reference_point_embeds = self.reference_point_embeddings.weight
+        else:
+            reference_point_embeds = self.reference_point_embeddings.weight[: self.num_queries]
+
+        # Prepare encoder inputs (by flattening)
+        source_flatten = []
+        mask_flatten = []
+        spatial_shapes_list = []
+        for source, mask in zip(sources, masks):
+            batch_size, num_channels, height, width = source.shape
+            spatial_shape = (height, width)
+            spatial_shapes_list.append(spatial_shape)
+            source = source.flatten(2).transpose(1, 2)
+            mask = mask.flatten(1)
+            source_flatten.append(source)
+            mask_flatten.append(mask)
+        source_flatten = torch.cat(source_flatten, 1)
+        mask_flatten = torch.cat(mask_flatten, 1)
+        spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m, dtype=source_flatten.dtype) for m in masks], 1)
+
+        # Prepare decoder inputs
+        encoder_hidden_states = source_flatten
+        batch_size, _, num_channels = encoder_hidden_states.shape
+        enc_outputs_class = None
+        enc_outputs_coord_logits = None
+
+        if self.two_stage:
+            object_query_embedding, output_proposals = self.gen_encoder_output_proposals(
+                source_flatten, ~mask_flatten, spatial_shapes_list
+            )
+
+            reference_points = []
+            encoder_hidden_states = []
+            boxes_ts = []
+            num_groups = self.num_groups if self.training else 1
+            for
group_id in range(num_groups): + object_query_embedding = self.enc_output[group_id](object_query_embedding) + object_query_embedding = self.enc_output_norm[group_id](object_query_embedding) + + enc_outputs_class = self.enc_out_class_embed[group_id](object_query_embedding) + + if self.bbox_reparam: + enc_outputs_coord_delta = self.enc_out_bbox_embed[group_id](object_query_embedding) + enc_outputs_coord_cxcy = ( + enc_outputs_coord_delta[..., :2] * output_proposals[..., 2:] + output_proposals[..., :2] + ) + enc_outputs_coord_wh = enc_outputs_coord_delta[..., 2:].exp() * output_proposals[..., 2:] + enc_outputs_coord_logits = torch.concat([enc_outputs_coord_cxcy, enc_outputs_coord_wh], dim=-1) + else: + delta_bbox = self.enc_out_bbox_embed[group_id](object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + topk = self.config.two_stage_num_proposals + enc_outputs_class = torch.max(enc_outputs_class, dim=-1)[0] + topk_proposals = torch.topk(enc_outputs_class, topk, dim=1)[1] # bs, nq + + reference_point_embedding = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) # unsigmoid + # for decoder layer, detached as initial ones, (bs, nq, 4) + reference_point_embedding_detached = reference_point_embedding.detach() + + # get memory tgt + target = torch.gather( + object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ) + + reference_points.append(reference_point_embedding_detached) + encoder_hidden_states.append(target) + boxes_ts.append(reference_point_embedding) + reference_points = torch.cat(reference_points, dim=1) + encoder_hidden_states = torch.cat(encoder_hidden_states, dim=1) + boxes_ts = torch.cat(boxes_ts, dim=1) + + # hack implementation for two-stage Deformable DETR + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. 
foreground and background) + # enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + # delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + # enc_outputs_coord_logits = delta_bbox + output_proposals + + # only keep top scoring `config.two_stage_num_proposals` proposals + topk = self.config.two_stage_num_proposals + topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, + 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4), + ) + + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) + query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + else: + query_embed, target = torch.split(query_embeds, num_channels, dim=1) + query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) + target = target.unsqueeze(0).expand(batch_size, -1, -1) + reference_points = self.reference_points(query_embed).sigmoid() + init_reference_points = reference_points + + decoder_outputs = self.decoder( + inputs_embeds=target, + position_embeddings=query_embed, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) + tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + + return tuple_outputs + + return RFDetrModelOutput( + init_reference_points=init_reference_points, + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + # TODO + # encoder_last_hidden_state=features.last_hidden_state, + # encoder_hidden_states=features.hidden_states, + # encoder_attentions=features.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + ) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMLPPredictionHead with DeformableDetr->RFDetr +class RFDetrMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. 
+ + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +@add_start_docstrings( + """ + RF DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. + """, + RFDETR_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with Deformable->RF, DEFORMABLE_DETR->RFDETR +class RFDetrForObjectDetection(RFDetrPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] + # We can't initialize the model on meta device as some weights are modified during the initialization + _no_split_modules = None + + def __init__(self, config: RFDetrConfig): + super().__init__(config) + + # RF DETR encoder-decoder model + self.model = RFDetrModel(config) + # Detection heads on top + self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.bbox_embed = RFDetrMLPPredictionHead( + input_dim=config.d_model, + hidden_dim=config.d_model, + output_dim=4, + num_layers=3, + ) + + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers + if config.with_box_refine: + # self.class_embed = _get_clones(self.class_embed, num_pred) + # self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + # nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # hack implementation for iterative bounding box refinement + self.model.decoder.bbox_embed = self.bbox_embed + else: + # nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + # self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + # self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.model.decoder.bbox_embed = None + if config.two_stage: + # hack implementation for two-stage + self.model.decoder.class_embed = self.class_embed + # for box_embed in self.bbox_embed: + # nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + + self.model.enc_out_bbox_embed = nn.ModuleList( + [copy.deepcopy(self.bbox_embed) for _ in range(config.num_groups)] + ) + self.model.enc_out_class_embed = nn.ModuleList( + [copy.deepcopy(self.class_embed) for _ in range(config.num_groups)] + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(RFDETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=RFDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: 
Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[List[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], RFDetrObjectDetectionOutput]: + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, RFDetrForObjectDetection + >>> from PIL import Image + >>> import requests + >>> import torch + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = RFDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ + ... 0 + ... ] + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ...
) + Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] + Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] + Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through RfDETR base model to obtain backbone + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + init_reference = outputs.init_reference_points if return_dict else outputs[0] + inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] + + for level in range(hidden_states.shape[1]): + if level == 0: + reference = init_reference + else: + reference = inter_references[:, level - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[level](hidden_states[:, level]) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + if reference.shape[-1] == 4: + outputs_coord_logits = delta_bbox + reference + elif reference.shape[-1] == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, + labels, + self.device, + pred_boxes, + self.config, + outputs_class, + outputs_coord, + ) + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + + return tuple_outputs + + dict_outputs = RFDetrObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + ) + + return dict_outputs + + +__all__ = [ + "RFDetrForObjectDetection", + "RFDetrModel", + "RFDetrPreTrainedModel", +] diff --git a/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py 
b/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py new file mode 100644 index 000000000000..54530aacffa7 --- /dev/null +++ b/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py @@ -0,0 +1,763 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_rf_detr_dinov2_with_registers.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import collections.abc +from typing import Callable, Optional, Set, Tuple, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BackboneOutput, BaseModelOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from ...utils.backbone_utils import BackboneMixin +from .configuration_rf_detr_dinov2_with_registers import RFDetrDinov2WithRegistersConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "RFDetrDinov2WithRegistersConfig" + + +class RFDetrDinov2WithRegistersPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +class RFDetrDinov2WithRegistersEmbeddings(nn.Module): + """ + Construct the CLS token, mask token, register tokens, position and patch embeddings. 
+ """ + + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.register_tokens = ( + nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + if config.num_register_tokens > 0 + else None + ) + self.patch_embeddings = RFDetrDinov2WithRegistersPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility + with the original implementation. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py + - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py + """ + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # Skip interpolation for matching dimensions (unless tracing) + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + # Handle class token and patch embeddings separately + class_pos_embed = self.position_embeddings[:, 0] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] + + # Calculate new dimensions + height = height // self.config.patch_size + width = width // self.config.patch_size + + # Reshape for interpolation + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + # Store original dtype for restoration after interpolation + target_dtype = patch_pos_embed.dtype + + # Interpolate at float32 precision + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(dtype=torch.float32), + size=(torch_int(height), torch_int(width)), # Explicit size instead of scale_factor + mode="bicubic", + align_corners=False, + antialias=True, + ).to(dtype=target_dtype) + + # Validate output dimensions if not tracing + if not torch.jit.is_tracing(): + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + + # Reshape back to original format + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + # Combine class and patch embeddings + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + if bool_masked_pos is not None: + embeddings = torch.where( + bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = 
self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + if self.config.num_windows > 1: + # reshape for windows + num_h_patches = height // self.config.patch_size + num_w_patches = width // self.config.patch_size + cls_token_with_pos_embed = embeddings[:, :1] + pixel_tokens_with_pos_embed = embeddings[:, 1:] + pixel_tokens_with_pos_embed = pixel_tokens_with_pos_embed.view( + batch_size, num_h_patches, num_w_patches, -1 + ) + num_w_patches_per_window = num_w_patches // self.config.num_windows + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_windows = self.config.num_windows + windowed_pixel_tokens = pixel_tokens_with_pos_embed.view( + batch_size, num_windows, num_h_patches_per_window, num_windows, num_h_patches_per_window, -1 + ) + windowed_pixel_tokens = windowed_pixel_tokens.permute(0, 1, 3, 2, 4, 5) + windowed_pixel_tokens = windowed_pixel_tokens.reshape( + batch_size * num_windows**2, num_h_patches_per_window * num_w_patches_per_window, -1 + ) + windowed_cls_token_with_pos_embed = cls_token_with_pos_embed.repeat(num_windows**2, 1, 1) + embeddings = torch.cat((windowed_cls_token_with_pos_embed, windowed_pixel_tokens), dim=1) + + # add register tokens + embeddings = ( + torch.cat( + (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1 + ) + if self.config.num_register_tokens > 0 + else embeddings + ) + + embeddings = self.dropout(embeddings) + + return embeddings + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class RFDetrDinov2WithRegistersSelfAttention(nn.Module): + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = None + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class RFDetrDinov2WithRegistersSelfOutput(nn.Module): + """ + The residual connection is defined in RFDetrDinov2WithRegistersLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
+ """ + + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class RFDetrDinov2WithRegistersAttention(nn.Module): + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + self.attention = RFDetrDinov2WithRegistersSelfAttention(config) + self.output = RFDetrDinov2WithRegistersSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class RFDetrDinov2WithRegistersLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class RFDetrDinov2WithRegistersDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class RFDetrDinov2WithRegistersMLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +class RFDetrDinov2WithRegistersSwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +class RFDetrDinov2WithRegistersLayer(nn.Module): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = RFDetrDinov2WithRegistersAttention(config) + self.layer_scale1 = RFDetrDinov2WithRegistersLayerScale(config) + self.drop_path = ( + RFDetrDinov2WithRegistersDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + ) + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = RFDetrDinov2WithRegistersSwiGLUFFN(config) + else: + self.mlp = RFDetrDinov2WithRegistersMLP(config) + self.layer_scale2 = RFDetrDinov2WithRegistersLayerScale(config) + self.num_windows = config.num_windows + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + remove_windows: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + assert head_mask is None, "head_mask is not supported for windowed attention" + assert not output_attentions, "output_attentions is not supported for windowed attention" + shortcut = 
hidden_states + if remove_windows: + # reshape x to remove windows + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + hidden_states = hidden_states.view(B // num_windows_squared, num_windows_squared * HW, C) + + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in Dinov2WithRegisters, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + if remove_windows: + # reshape x to add windows back + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + # hidden_states = hidden_states.view(B * num_windows_squared, HW // num_windows_squared, C) + attention_output = attention_output.view(B * num_windows_squared, HW // num_windows_squared, C) + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + shortcut + + # in Dinov2WithRegisters, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +class RFDetrDinov2WithRegistersEncoder(nn.Module): + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([RFDetrDinov2WithRegistersLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if i > int(self.config.out_features[-1][5:]): + # early stop if we have reached the last output feature + break + + remove_windows = i not in self.config.window_block_indexes + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + remove_windows, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, remove_windows) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class RFDetrDinov2WithRegistersPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = RFDetrDinov2WithRegistersConfig + base_model_prefix = "rf_detr_dinov2_with_registers" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["RFDetrDinov2WithRegistersSwiGLUFFN"] + _supports_sdpa = True + _supports_flash_attn_2 = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, RFDetrDinov2WithRegistersEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + + +RF_DETR_DINOV2_WITH_REGISTERS_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`RFDetrDinov2WithRegistersConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +RF_DETR_DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`BitImageProcessor.preprocess`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + """ + RFDetrDinov2WithRegisters backbone, to be used with frameworks like DETR and MaskFormer. 
+ """, + RF_DETR_DINOV2_WITH_REGISTERS_START_DOCSTRING, +) +class RFDetrDinov2WithRegistersBackbone(RFDetrDinov2WithRegistersPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] + self.embeddings = RFDetrDinov2WithRegistersEmbeddings(config) + self.encoder = RFDetrDinov2WithRegistersEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.num_register_tokens = config.num_register_tokens + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> RFDetrDinov2WithRegistersPatchEmbeddings: + return self.embeddings.patch_embeddings + + @add_start_docstrings_to_model_forward(RF_DETR_DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + Returns: + + Examples: + Returns: + + Examples: + Returns: + + Examples: + + + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"] + ... 
) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 16, 16] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + if self.config.apply_layernorm: + hidden_state = self.layernorm(hidden_state) + if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, self.num_register_tokens + 1 :] + # this was actually a bug in the original implementation that we copied here, + # cause normally the order is height, width + batch_size, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + + num_h_patches = height // patch_size + num_w_patches = width // patch_size + + if self.config.num_windows > 1: + # undo windowing + num_windows_squared = self.config.num_windows**2 + B, HW, C = hidden_state.shape + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_w_patches_per_window = num_w_patches // self.config.num_windows + hidden_state = hidden_state.reshape(B // num_windows_squared, num_windows_squared * HW, C) + hidden_state = hidden_state.view( + B // num_windows_squared, + self.config.num_windows, + self.config.num_windows, + num_h_patches_per_window, + num_w_patches_per_window, + C, + ) + hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5) + + hidden_state = hidden_state.reshape(batch_size, num_h_patches, num_w_patches, -1) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + + feature_maps += (hidden_state,) + + if not return_dict: + if output_hidden_states: + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions if output_attentions else None, + ) + + +__all__ = ["RFDetrDinov2WithRegistersBackbone"] diff --git a/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py new file mode 100644 index 000000000000..7e7a785111d6 --- /dev/null +++ b/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py @@ -0,0 +1,308 @@ +from typing import Optional, Tuple, Union + +import torch +from torch import nn + +from ...modeling_outputs import BackboneOutput, BaseModelOutput +from ..dinov2_with_registers.configuration_dinov2_with_registers import Dinov2WithRegistersConfig +from ..dinov2_with_registers.modeling_dinov2_with_registers import ( + Dinov2WithRegistersBackbone, + Dinov2WithRegistersEmbeddings, + Dinov2WithRegistersEncoder, + Dinov2WithRegistersLayer, + Dinov2WithRegistersSelfAttention, +) + + +class RFDetrDinov2WithRegistersConfig(Dinov2WithRegistersConfig): + def __init__(self, num_windows: int = 4, window_block_indexes=None, **super_kwargs): + 
super().__init__(**super_kwargs) + + self.num_windows = num_windows + window_block_indexes = set(range(self._out_indices[-1] + 1)) + window_block_indexes.difference_update(self._out_indices) + window_block_indexes = list(window_block_indexes) + self.window_block_indexes = window_block_indexes + + +class RFDetrDinov2WithRegistersEmbeddings(Dinov2WithRegistersEmbeddings): + def __init__(self, config: RFDetrDinov2WithRegistersConfig): + super().__init__(config) + self.register_tokens = ( + nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + if config.num_register_tokens > 0 + else None + ) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + if bool_masked_pos is not None: + embeddings = torch.where( + bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + if self.config.num_windows > 1: + # reshape for windows + num_h_patches = height // self.config.patch_size + num_w_patches = width // self.config.patch_size + cls_token_with_pos_embed = embeddings[:, :1] + pixel_tokens_with_pos_embed = embeddings[:, 1:] + pixel_tokens_with_pos_embed = pixel_tokens_with_pos_embed.view( + batch_size, num_h_patches, num_w_patches, -1 + ) + num_w_patches_per_window = num_w_patches // self.config.num_windows + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_windows = self.config.num_windows + windowed_pixel_tokens = pixel_tokens_with_pos_embed.view( + batch_size, num_windows, num_h_patches_per_window, num_windows, num_h_patches_per_window, -1 + ) + windowed_pixel_tokens = windowed_pixel_tokens.permute(0, 1, 3, 2, 4, 5) + windowed_pixel_tokens = windowed_pixel_tokens.reshape( + batch_size * num_windows**2, num_h_patches_per_window * num_w_patches_per_window, -1 + ) + windowed_cls_token_with_pos_embed = cls_token_with_pos_embed.repeat(num_windows**2, 1, 1) + embeddings = torch.cat((windowed_cls_token_with_pos_embed, windowed_pixel_tokens), dim=1) + + # add register tokens + embeddings = ( + torch.cat( + (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1 + ) + if self.config.num_register_tokens > 0 + else embeddings + ) + + embeddings = self.dropout(embeddings) + + return embeddings + + +class RFDetrDinov2WithRegistersSelfAttention(Dinov2WithRegistersSelfAttention): + def __init__(self, config: RFDetrDinov2WithRegistersConfig): + super().__init__(config) + self.scaling = None + + +class RFDetrDinov2WithRegistersLayer(Dinov2WithRegistersLayer): + def __init__(self, config: RFDetrDinov2WithRegistersConfig): + super().__init__(config) + self.num_windows = config.num_windows + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + remove_windows: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + assert head_mask is None,
"head_mask is not supported for windowed attention" + assert not output_attentions, "output_attentions is not supported for windowed attention" + shortcut = hidden_states + if remove_windows: + # reshape x to remove windows + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + hidden_states = hidden_states.view(B // num_windows_squared, num_windows_squared * HW, C) + + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in Dinov2WithRegisters, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + if remove_windows: + # reshape x to add windows back + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + # hidden_states = hidden_states.view(B * num_windows_squared, HW // num_windows_squared, C) + attention_output = attention_output.view(B * num_windows_squared, HW // num_windows_squared, C) + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + shortcut + + # in Dinov2WithRegisters, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +class RFDetrDinov2WithRegistersEncoder(Dinov2WithRegistersEncoder): + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if i > int(self.config.out_features[-1][5:]): + # early stop if we have reached the last output feature + break + + remove_windows = i not in self.config.window_block_indexes + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + remove_windows, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, remove_windows) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class RFDetrDinov2WithRegistersBackbone(Dinov2WithRegistersBackbone): + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + Returns: + + Examples: + + + ```python + >>> from transformers 
import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"] + ... ) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 16, 16] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + if self.config.apply_layernorm: + hidden_state = self.layernorm(hidden_state) + if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, self.num_register_tokens + 1 :] + # this was actually a bug in the original implementation that we copied here, + # cause normally the order is height, width + batch_size, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + + num_h_patches = height // patch_size + num_w_patches = width // patch_size + + if self.config.num_windows > 1: + # undo windowing + num_windows_squared = self.config.num_windows**2 + B, HW, C = hidden_state.shape + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_w_patches_per_window = num_w_patches // self.config.num_windows + hidden_state = hidden_state.reshape(B // num_windows_squared, num_windows_squared * HW, C) + hidden_state = hidden_state.view( + B // num_windows_squared, + self.config.num_windows, + self.config.num_windows, + num_h_patches_per_window, + num_w_patches_per_window, + C, + ) + hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5) + + hidden_state = hidden_state.reshape(batch_size, num_h_patches, num_w_patches, -1) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + + feature_maps += (hidden_state,) + + if not return_dict: + if output_hidden_states: + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions if output_attentions else None, + ) + + +__all__ = [ + "RFDetrDinov2WithRegistersConfig", + "RFDetrDinov2WithRegistersBackbone", +] diff --git a/src/transformers/models/rf_detr/run_rfdetr.py b/src/transformers/models/rf_detr/run_rfdetr.py new file mode 100644 index 000000000000..0ec581d49343 --- /dev/null +++ b/src/transformers/models/rf_detr/run_rfdetr.py @@ -0,0 +1,18 @@ +import io + +import requests +from PIL import Image + +from transformers import AutoImageProcessor, RFDetrBackbone, RFDetrConfig + + +images = 
["https://media.roboflow.com/notebooks/examples/dog-2.jpeg"] + +images = [Image.open(io.BytesIO(requests.get(url).content)) for url in images] + +processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") +inputs = processor(images, return_tensors="pt") + +config = RFDetrConfig() +backbone = RFDetrBackbone(config=config.backbone_config) +# model = RFDetrForObjectDetection.from_config() diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index a7051cffca81..a0683b901966 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -8387,6 +8387,34 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class RFDetrDinov2WithRegistersBackbone(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RFDetrForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RFDetrModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RFDetrPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class RobertaForCausalLM(metaclass=DummyObject): _backends = ["torch"]