diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e8da536747d4..c0ad2f21a733 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -762,6 +762,7 @@ "RoFormerConfig", "RoFormerTokenizer", ], + "models.rf_detr": ["RFDetrConfig", "RFDetrDinov2WithRegistersConfig"], "models.rt_detr": ["RTDetrConfig", "RTDetrResNetConfig"], "models.rt_detr_v2": ["RTDetrV2Config"], "models.rwkv": ["RwkvConfig"], @@ -3536,6 +3537,15 @@ "load_tf_weights_in_roformer", ] ) + _import_structure["models.rf_detr"].extend( + [ + "RFDetrForObjectDetection", + "RFDetrModel", + "RFDetrPreTrainedModel", + "RFDetrDinov2WithRegistersBackbone", + "RFDetrDinov2WithRegistersPreTrainedModel", + ] + ) _import_structure["models.rt_detr"].extend( [ "RTDetrForObjectDetection", @@ -5987,6 +5997,7 @@ from .models.regnet import RegNetConfig from .models.rembert import RemBertConfig from .models.resnet import ResNetConfig + from .models.rf_detr import RFDetrConfig, RFDetrDinov2WithRegistersConfig from .models.roberta import ( RobertaConfig, RobertaTokenizer, @@ -8316,6 +8327,13 @@ ResNetModel, ResNetPreTrainedModel, ) + from .models.rf_detr import ( + RFDetrDinov2WithRegistersBackbone, + RFDetrDinov2WithRegistersPreTrainedModel, + RFDetrForObjectDetection, + RFDetrModel, + RFDetrPreTrainedModel, + ) from .models.roberta import ( RobertaForCausalLM, RobertaForMaskedLM, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 49ce48dd6c04..d3c044778c46 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -235,6 +235,7 @@ regnet, rembert, resnet, + rf_detr, roberta, roberta_prelayernorm, roc_bert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c7ef472882ba..27945df21f45 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -261,6 +261,7 @@ ("rembert", "RemBertConfig"), ("resnet", "ResNetConfig"), ("retribert", "RetriBertConfig"), + ("rf_detr", "RFDetrConfig"), ("roberta", "RobertaConfig"), ("roberta-prelayernorm", "RobertaPreLayerNormConfig"), ("roc_bert", "RoCBertConfig"), @@ -615,6 +616,8 @@ ("rembert", "RemBERT"), ("resnet", "ResNet"), ("retribert", "RetriBERT"), + ("rf_detr", "RF-DETR"), + ("rf_detr_dinov2_with_registers", "RF-DETR-DINOv2 with Registers"), ("roberta", "RoBERTa"), ("roberta-prelayernorm", "RoBERTa-PreLayerNorm"), ("roc_bert", "RoCBert"), @@ -766,6 +769,7 @@ ("smolvlm_vision", "smolvlm"), ("chinese_clip_vision_model", "chinese_clip"), ("rt_detr_resnet", "rt_detr"), + ("rf_detr_dinov2_with_registers", "rf_detr"), ("granitevision", "llava_next"), ] ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 05a415741413..72f06cb2d1cb 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -238,6 +238,7 @@ ("rembert", "RemBertModel"), ("resnet", "ResNetModel"), ("retribert", "RetriBertModel"), + ("rf_detr", "RFDetrModel"), ("roberta", "RobertaModel"), ("roberta-prelayernorm", "RobertaPreLayerNormModel"), ("roc_bert", "RoCBertModel"), @@ -920,6 +921,7 @@ ("deformable_detr", "DeformableDetrForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), + ("rf_detr", "RFDetrForObjectDetection"), ("rt_detr", "RTDetrForObjectDetection"), ("rt_detr_v2", "RTDetrV2ForObjectDetection"), ("table-transformer", 
"TableTransformerForObjectDetection"), @@ -1440,6 +1442,7 @@ ("nat", "NatBackbone"), ("pvt_v2", "PvtV2Backbone"), ("resnet", "ResNetBackbone"), + ("rf_detr_dinov2_with_registers", "RFDetrDinov2WithRegistersBackbone"), ("rt_detr_resnet", "RTDetrResNetBackbone"), ("swin", "SwinBackbone"), ("swinv2", "Swinv2Backbone"), diff --git a/src/transformers/models/rf_detr/__init__.py b/src/transformers/models/rf_detr/__init__.py new file mode 100644 index 000000000000..46dba76871ff --- /dev/null +++ b/src/transformers/models/rf_detr/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_rf_detr import * + from .configuration_rf_detr_dinov2_with_registers import * + from .modeling_rf_detr import * + from .modeling_rf_detr_dinov2_with_registers import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/rf_detr/configuration_rf_detr.py b/src/transformers/models/rf_detr/configuration_rf_detr.py new file mode 100644 index 000000000000..0bce9b42a0aa --- /dev/null +++ b/src/transformers/models/rf_detr/configuration_rf_detr.py @@ -0,0 +1,316 @@ +from typing import List, Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto import CONFIG_MAPPING +from .configuration_rf_detr_dinov2_with_registers import RFDetrDinov2WithRegistersConfig + + +logger = logging.get_logger(__name__) + + +class RFDetrConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RFDetrModel`]. It is used to instantiate + an RF DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the RF DETR + [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. + + TODO: Add more details about the architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + backbone_config (`PretrainedConfig` or `dict`, *optional*): + The configuration of the backbone model. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`RFDetrModel`] can detect in a single image. In case `two_stage` is set to `True`, we use + `two_stage_num_proposals` instead. 
+ max_position_embeddings (``, *optional*, defaults to 1024): + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + encoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in the encoder. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_layers (`int`, *optional*, defaults to 3): + Number of decoder layers. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in the decoder. + decoder_self_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each self-attention layer in the Transformer decoder. + decoder_cross_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each cross-attention (deformable attention) layer in the Transformer decoder. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether the model is used as an encoder/decoder or not. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1.0): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + return_intermediate (``, *optional*, defaults to `True`): + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when + `use_timm_backbone` = `True`. + num_feature_levels (`int`, *optional*, defaults to 1): + The number of input feature levels, derived from `len(scale_factors)`. 
+ encoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the encoder. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + two_stage (`bool`, *optional*, defaults to `False`): + Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of + Deformable DETR, which are further fed into the decoder for iterative bounding box refinement. + two_stage_num_proposals (`int`, *optional*, defaults to 300): + The number of region proposals to be generated, in case `two_stage` is set to `True`. + with_box_refine (`bool`, *optional*, defaults to `False`): + Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes + based on the predictions from the previous layer. + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss. + disable_custom_kernels (`bool`, *optional*, defaults to `False`): + Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom + kernels are not supported by PyTorch ONNX export. 
+ out_feature_indexes (`List`, *optional*, defaults to `[2, 5, 8, 11]`): + scale_factors (`List`, *optional*, defaults to `[1.0]`): + layer_norm (`bool`, *optional*, defaults to `False`): + projector_in_channels (`int`, *optional*, defaults to 256): + projector_num_blocks (`int`, *optional*, defaults to 3): + projector_survival_prob (`float`, *optional*, defaults to 1.0): + projector_force_drop_last_n_features (`int`, *optional*, defaults to 0): + + Examples: + + ```python + >>> from transformers import RFDetrConfig, RFDetrModel + + >>> # Initializing a Deformable DETR SenseTime/deformable-detr style configuration + >>> configuration = RFDetrConfig() + + >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration + >>> model = RFDetrModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "rf_detr" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + init_std=0.02, + init_xavier_std=1.0, + # backbone + use_timm_backbone=False, + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + backbone_kwargs=None, + # RFDetrModel + num_queries=300, + # RFDetrEncoder + encoder_layers=6, + encoder_ffn_dim=1024, + encoder_attention_heads=8, + encoder_layerdrop=0.0, + encoder_n_points=4, + # RFDetrDecoder + decoder_layers=3, + d_model=256, + attention_dropout=0.0, + dropout=0.1, + activation_function="relu", + activation_dropout=0.0, + decoder_self_attention_heads=8, + decoder_cross_attention_heads=16, + decoder_n_points=4, + decoder_ffn_dim=2048, + # LWDetr + layer_norm: bool = True, + ## + auxiliary_loss=False, + position_embedding_type="sine", + dilation=False, + two_stage=True, + two_stage_num_proposals=300, + with_box_refine=True, + class_cost=1, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.1, + focal_alpha=0.25, + disable_custom_kernels=False, + out_feature_indexes: List[int] = [2, 5, 8, 11], + scale_factors: List[float] = [1.0], + projector_in_channels: Optional[List[int]] = None, + projector_num_blocks: int = 3, # TODO rename + projector_survival_prob: float = 1.0, + projector_force_drop_last_n_features: int = 0, + projector_activation_function: str = "silu", + csp_hidden_expansion: float = 0.5, + bottleneck_hidden_expansion: float = 0.5, + batch_norm_eps: float = 1e-5, + bbox_reparam: bool = True, + is_encoder_decoder=True, + num_groups=13, + light_reference_point_refinement: bool = True, + **kwargs, + ): + if backbone_config is None and backbone is None: + logger.info( + "`backbone_config` and `backbone` are `None`. Initializing the config with the default `RTDetr-ResNet` backbone." 
+ ) + backbone_config = RFDetrDinov2WithRegistersConfig( + out_features=[f"stage{i}" for i in out_feature_indexes], + return_dict=False, + ) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.pop("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.use_timm_backbone = use_timm_backbone + self.backbone_config = backbone_config + self.num_queries = num_queries + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_self_attention_heads = decoder_self_attention_heads + self.decoder_cross_attention_heads = decoder_cross_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.backbone_kwargs = backbone_kwargs + self.dilation = dilation + # deformable attributes + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.two_stage = two_stage + self.two_stage_num_proposals = two_stage_num_proposals + self.with_box_refine = with_box_refine + if two_stage is True and with_box_refine is False: + raise ValueError("If two_stage is True, with_box_refine must be True.") + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.eos_coefficient = eos_coefficient + self.focal_alpha = focal_alpha + self.disable_custom_kernels = disable_custom_kernels + + self.scale_factors = [1.0] if scale_factors is None else scale_factors + assert len(self.scale_factors) > 0, "scale_factors must be a list of at least one element" + assert sorted(self.scale_factors, reverse=True) == self.scale_factors, "scale_factors must be reverse sorted" + assert all(scale in [2.0, 1.0, 0.5, 0.25] for scale in self.scale_factors), ( + "scale_factors must be a consecutive list subset of [2.0, 1.0, 0.5, 0.25]" + ) + + self.num_feature_levels = len(scale_factors) + self.layer_norm = layer_norm + self.projector_in_channels = ( + projector_in_channels + if projector_in_channels is not None + else [backbone_config.hidden_size] * len(out_feature_indexes) + ) + assert len(self.projector_in_channels) == len(out_feature_indexes), ( + "projector_in_channels must have the same length as out_feature_indexes" + ) + self.projector_num_blocks = projector_num_blocks + self.projector_survival_prob = projector_survival_prob + self.projector_force_drop_last_n_features = projector_force_drop_last_n_features + self.projector_activation_function = projector_activation_function + 
self.csp_hidden_expansion = csp_hidden_expansion + self.bottleneck_expansion = bottleneck_hidden_expansion + self.batch_norm_eps = batch_norm_eps + self.encoder_hidden_dim = backbone_config.hidden_size + self.bbox_reparam = bbox_reparam + self.num_groups = num_groups + self.light_reference_point_refinement = light_reference_point_refinement + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + +__all__ = ["RFDetrConfig"] diff --git a/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py new file mode 100644 index 000000000000..b43b8ec4dc04 --- /dev/null +++ b/src/transformers/models/rf_detr/configuration_rf_detr_dinov2_with_registers.py @@ -0,0 +1,151 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_rf_detr_dinov2_with_registers.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from ...configuration_utils import PretrainedConfig +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +class RFDetrDinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RFDetrDinov2WithRegistersModel`]. It is used to instantiate an + RFDetrDinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DINOv2 with Registers + [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the hidden size of the MLPs relative to the `hidden_size`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value to use for layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Stochastic depth rate per sample (when applied in the main path of residual layers). + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + num_register_tokens (`int`, *optional*, defaults to 4): + Number of register tokens to use. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + apply_layernorm (`bool`, *optional*, defaults to `True`): + Whether to apply layer normalization to the feature maps in case the model is used as backbone. + reshape_hidden_states (`bool`, *optional*, defaults to `True`): + Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in + case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, + seq_len, hidden_size)`. 
+ + Example: + + ```python + >>> from transformers import RFDetrDinov2WithRegistersConfig, RFDetrDinov2WithRegistersModel + + >>> # Initializing a RFDetrDinov2WithRegisters base style configuration + >>> configuration = RFDetrDinov2WithRegistersConfig() + + >>> # Initializing a model (with random weights) from the base style configuration + >>> model = RFDetrDinov2WithRegistersModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "rf_detr_dinov2_with_registers" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + num_register_tokens=4, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + num_windows: int = 4, + window_block_indexes=None, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.num_register_tokens = num_register_tokens + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + + self.num_windows = num_windows + window_block_indexes = set(range(self._out_indices[-1] + 1)) + window_block_indexes.difference_update(self._out_indices) + window_block_indexes = list(window_block_indexes) + self.window_block_indexes = window_block_indexes + + +__all__ = ["RFDetrDinov2WithRegistersConfig"] diff --git a/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py b/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py new file mode 100644 index 000000000000..8f920f19b6fc --- /dev/null +++ b/src/transformers/models/rf_detr/convert_rf_detr_weights_to_hf.py @@ -0,0 +1,337 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
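Reviewer note on `RFDetrDinov2WithRegistersConfig.__init__` above: the `window_block_indexes` attribute is recomputed from the aligned `out_indices`, so the `window_block_indexes` constructor argument is effectively ignored. A minimal hand trace of that derivation, assuming `out_indices=[2, 5, 8, 11]` (the value passed by the conversion script further down, not the config's own default, which falls back to the last stage only):

```python
# Hand trace of the window_block_indexes derivation from __init__ above.
out_indices = [2, 5, 8, 11]  # assumption: value used by the conversion script

window_block_indexes = set(range(out_indices[-1] + 1))  # {0, 1, ..., 11}
window_block_indexes.difference_update(out_indices)     # drop the stages that are output
print(sorted(window_block_indexes))                     # [0, 1, 3, 4, 6, 7, 9, 10]
```

In other words, every block up to the last output stage runs windowed attention except the blocks whose features are taken as backbone outputs.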
+"""Convert RF Detr checkpoints to Hugging Face Transformers format.""" + +import argparse +import json +import re +from pathlib import Path + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image +from torchvision import transforms + +from transformers import ( + AutoConfig, + RFDetrConfig, + RFDetrDinov2WithRegistersConfig, + RFDetrForObjectDetection, + RTDetrImageProcessor, + RTDetrImageProcessorFast, +) +from transformers.utils import logging + + +torch.set_printoptions(precision=6, sci_mode=False) + + +def custom_repr(self): + # return f"{tuple(self.shape)} {self.flatten()[-10:].tolist()} {original_repr(self)}" + return f"{tuple(self.shape)} {self.flatten()[-3:].tolist()}" + + +original_repr = torch.Tensor.__repr__ +torch.Tensor.__repr__ = custom_repr + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_rt_detr_v2_config(model_name: str) -> RFDetrConfig: + if model_name in ["rf-detr-base", "rf-detr-base-2"]: + dinov2_size = "small" + elif model_name == "rf-detr-large": + dinov2_size = "base" + + base_backbone_model_name = f"facebook/dinov2-with-registers-{dinov2_size}" + num_register_tokens = 0 + out_indices = [2, 5, 8, 11] + base_backbone = AutoConfig.from_pretrained( + base_backbone_model_name, + num_register_tokens=num_register_tokens, + out_indices=out_indices, + ) + + num_windows = 4 + backbone_config = RFDetrDinov2WithRegistersConfig( + **base_backbone.to_dict(), + num_windows=num_windows, + ) + + scale_factors = [2.0, 0.5] + d_model = 384 + decoder_self_attention_heads = 12 + decoder_cross_attention_heads = 24 + num_labels = 91 + config = RFDetrConfig( + backbone_config=backbone_config, + scale_factors=scale_factors, + d_model=d_model, + decoder_self_attention_heads=decoder_self_attention_heads, + decoder_cross_attention_heads=decoder_cross_attention_heads, + num_labels=num_labels, + ) + + config.num_labels = 80 + repo_id = "huggingface/label-files" + filename = "coco-detection-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + if model_name in ["rf-detr-base", "rf-detr-base-2"]: + pass + # config.backbone_config.hidden_sizes = [64, 128, 256, 512] + # config.backbone_config.depths = [2, 2, 2, 2] + # config.backbone_config.layer_type = "basic" + # config.encoder_in_channels = [128, 256, 512] + # config.hidden_expansion = 0.5 + # config.decoder_layers = 3 + elif model_name == "rf-detr-large": + pass + # config.backbone_config.hidden_sizes = [64, 128, 256, 512] + # config.backbone_config.depths = [3, 4, 6, 3] + # config.backbone_config.layer_type = "basic" + # config.encoder_in_channels = [128, 256, 512] + # config.hidden_expansion = 0.5 + # config.decoder_layers = 4 + + return config + + +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + r"backbone.0.encoder.encoder": r"model.backbone.conv_encoder.model", + r"backbone.0.projector.stages_sampling.(\d+).(\d+).(\d+).(weight|bias)": r"model.backbone.conv_encoder.projector.scale_layers.\1.sampling_layers.\2.layers.\3.\4", + r"backbone.0.projector.stages_sampling.(\d+).(\d+).(\d+).conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.sampling_layers.\2.layers.\3.conv", + r"backbone.0.projector.stages_sampling.(\d+).(\d+).(\d+).bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.sampling_layers.\2.layers.\3.norm", + 
r"backbone.0.projector.stages.(\d+).0.cv1.conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.conv1.conv", + r"backbone.0.projector.stages.(\d+).0.cv1.bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.conv1.norm", + r"backbone.0.projector.stages.(\d+).0.cv2.conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.conv2.conv", + r"backbone.0.projector.stages.(\d+).0.cv2.bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.conv2.norm", + r"backbone.0.projector.stages.(\d+).0.m.(\d+).cv1.conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.bottlenecks.\2.conv1.conv", + r"backbone.0.projector.stages.(\d+).0.m.(\d+).cv1.bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.bottlenecks.\2.conv1.norm", + r"backbone.0.projector.stages.(\d+).0.m.(\d+).cv2.conv": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.bottlenecks.\2.conv2.conv", + r"backbone.0.projector.stages.(\d+).0.m.(\d+).cv2.bn": r"model.backbone.conv_encoder.projector.scale_layers.\1.stage_layer.bottlenecks.\2.conv2.norm", + r"backbone.0.projector.stages.(\d+).1": r"model.backbone.conv_encoder.projector.scale_layers.\1.layer_norm", + r"transformer.decoder.layers.(\d+).self_attn.out_proj": r"model.decoder.layers.\1.self_attn.out_proj", + r"transformer.decoder.layers.(\d+).norm1": r"model.decoder.layers.\1.self_attn_layer_norm", + r"transformer.decoder.layers.(\d+).cross_attn.sampling_offsets": r"model.decoder.layers.\1.encoder_attn.sampling_offsets", + r"transformer.decoder.layers.(\d+).cross_attn.attention_weights": r"model.decoder.layers.\1.encoder_attn.attention_weights", + r"transformer.decoder.layers.(\d+).cross_attn.value_proj": r"model.decoder.layers.\1.encoder_attn.value_proj", + r"transformer.decoder.layers.(\d+).cross_attn.output_proj": r"model.decoder.layers.\1.encoder_attn.output_proj", + r"transformer.decoder.layers.(\d+).norm2": r"model.decoder.layers.\1.encoder_attn_layer_norm", + r"transformer.decoder.layers.(\d+).linear1": r"model.decoder.layers.\1.fc1", + r"transformer.decoder.layers.(\d+).linear2": r"model.decoder.layers.\1.fc2", + r"transformer.decoder.layers.(\d+).norm3": r"model.decoder.layers.\1.final_layer_norm", + r"transformer.decoder.norm": r"model.decoder.norm", + r"transformer.decoder.ref_point_head": r"model.decoder.reference_points_head", + r"refpoint_embed": r"model.reference_point_embeddings", + r"transformer.enc_output": r"model.enc_output", + r"transformer.enc_output_norm": r"model.enc_output_norm", + r"transformer.enc_out_bbox_embed": r"model.enc_out_bbox_embed", + r"transformer.enc_out_class_embed": r"model.enc_out_class_embed", + r"query_feat": r"model.query_position_embeddings", +} + + +def convert_old_keys_to_new_keys(state_dict_keys: dict = None): + # Use the mapping to rename keys + for original_key, converted_key in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + for key in list(state_dict_keys.keys()): + new_key = re.sub(original_key, converted_key, key) + if new_key != key: + state_dict_keys[new_key] = state_dict_keys.pop(key) + + return state_dict_keys + + +def read_in_q_k_v(state_dict, config: RFDetrConfig): + prefix = "transformer.decoder.layers" + decoder_hidden_dim = config.d_model + + for i in range(config.decoder_layers): + # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"{prefix}.{i}.self_attn.in_proj_weight") + in_proj_bias = 
state_dict.pop(f"{prefix}.{i}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:decoder_hidden_dim, :] + state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:decoder_hidden_dim] + state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ + decoder_hidden_dim : 2 * decoder_hidden_dim, : + ] + state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[ + decoder_hidden_dim : 2 * decoder_hidden_dim + ] + state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-decoder_hidden_dim:, :] + state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-decoder_hidden_dim:] + + +def copy_weights(state_dict, config): + for key, value in dict(state_dict.items()).items(): + if key.startswith("bbox_embed"): + new_key = f"model.decoder.{key}" + state_dict[new_key] = value + if key.startswith("class_embed"): + new_key = f"model.decoder.{key}" + state_dict[new_key] = value + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + + return im + + +@torch.no_grad() +def write_model_and_image_processor(model_name, output_dir, push_to_hub, repo_id): + """ + Copy/paste/tweak model's weights to our RTDETR structure. + """ + + # load default config + config = get_rt_detr_v2_config(model_name) + + # load original model from torch hub + model_name_to_checkpoint_url = { + "rf-detr-base": "https://storage.googleapis.com/rfdetr/rf-detr-base-coco.pth", + # below is a less converged model that may be better for finetuning but worse for inference + "rf-detr-base-2": "https://storage.googleapis.com/rfdetr/rf-detr-base-2.pth", + "rf-detr-large": "https://storage.googleapis.com/rfdetr/rf-detr-large.pth", + } + logger.info(f"Converting model {model_name}...") + state_dict = torch.hub.load_state_dict_from_url(model_name_to_checkpoint_url[model_name], map_location="cpu")[ + "model" + ] + original_state_dict = state_dict.copy() + # rename keys + state_dict = convert_old_keys_to_new_keys(state_dict) + for key in state_dict.copy().keys(): + if key.startswith("query_feat"): + del state_dict[key] + + # query, key and value matrices need special treatment + read_in_q_k_v(state_dict, config) + # certain weights are copied from the RFDetrForObjectDetection to the RFDetrDecoder + copy_weights(state_dict, config) + # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them + for key in state_dict.copy().keys(): + if key.endswith("num_batches_tracked"): + del state_dict[key] + + # finally, create HuggingFace model and load state dict + model = RFDetrForObjectDetection(config) + target_state_dict = model.state_dict() + model.load_state_dict(state_dict) + loaded_state_dict = model.state_dict() + model.eval() + + # load image processor + image_processor = RTDetrImageProcessorFast(size={"height": 560, "width": 560}, do_normalize=True) + + # prepare image + img = prepare_img() + + # preprocess image + transformations = transforms.Compose( + [ + transforms.Resize([560, 560], interpolation=transforms.InterpolationMode.BILINEAR), + transforms.ToTensor(), + ] + ) + original_pixel_values = transformations(img).unsqueeze(0) # insert batch dimension + + encoding = image_processor(images=img, return_tensors="pt") 
+ pixel_values = encoding["pixel_values"] + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + pixel_values = pixel_values.to(device) + + # Pass image by the model + with torch.no_grad(): + outputs = model(pixel_values) + + if model_name == "rf-detr-base": + expected_slice_logits = torch.tensor( + [[-3.7045, -5.1913, -6.1787], [-4.0106, -9.3450, -5.2043], [-4.1287, -4.7463, -5.8634]] + ) + expected_slice_boxes = torch.tensor( + [[0.2582, 0.5497, 0.4764], [0.1684, 0.1985, 0.2120], [0.7665, 0.4146, 0.4669]] + ) + elif model_name == "rf-detr-base-2": + expected_slice_logits = torch.tensor( + [[-4.6108, -5.9453, -3.8505], [-3.8702, -6.1136, -5.5677], [-3.7790, -6.4538, -5.9449]] + ) + expected_slice_boxes = torch.tensor( + [[0.1691, 0.1984, 0.2118], [0.2594, 0.5506, 0.4736], [0.7669, 0.4136, 0.4654]] + ) + elif model_name == "rf-detr-large": + expected_slice_logits = torch.tensor( + [[-4.7881, -4.6754, -6.1624], [-5.4441, -6.6486, -4.3840], [-3.5455, -4.9318, -6.3544]] + ) + expected_slice_boxes = torch.tensor( + [[0.2588, 0.5487, 0.4747], [0.5497, 0.2760, 0.0573], [0.7688, 0.4133, 0.4634]] + ) + else: + raise ValueError(f"Unknown rf_detr_name: {model_name}") + assert torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits.to(outputs.logits.device), atol=1e-4) + assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes.to(outputs.pred_boxes.device), atol=1e-3) + + if output_dir is not None: + Path(output_dir).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {output_dir}") + model.save_pretrained(output_dir) + print(f"Saving image processor to {output_dir}") + image_processor.save_pretrained(output_dir) + + if push_to_hub: + # Upload model, image processor and config to the hub + logger.info("Uploading PyTorch model and image processor to the hub...") + config.push_to_hub( + repo_id=repo_id, + commit_message="Add config from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py", + ) + model.push_to_hub( + repo_id=repo_id, + commit_message="Add model from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py", + ) + image_processor.push_to_hub( + repo_id=repo_id, + commit_message="Add image processor from convert_rt_detr_v2_original_pytorch_checkpoint_to_pytorch.py", + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", + default="rf-detr-large", + type=str, + help="model_name of the checkpoint you'd like to convert.", + ) + parser.add_argument("--output_dir", default=None, type=str, help="Location to write HF model and image processor") + parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.") + parser.add_argument( + "--repo_id", + type=str, + help="repo_id where the model will be pushed to.", + ) + args = parser.parse_args() + write_model_and_image_processor(args.model_name, args.output_dir, args.push_to_hub, args.repo_id) diff --git a/src/transformers/models/rf_detr/modeling_rf_detr.py b/src/transformers/models/rf_detr/modeling_rf_detr.py new file mode 100644 index 000000000000..c719f942b46c --- /dev/null +++ b/src/transformers/models/rf_detr/modeling_rf_detr.py @@ -0,0 +1,2118 @@ +import copy +import math +import warnings +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from ...activations import ACT2CLS, ACT2FN +from ...integrations.hub_kernels 
import use_kernel_forward_from_hub +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import meshgrid +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_timm_available, + logging, + replace_return_docstrings, +) +from ...utils.backbone_utils import load_backbone +from .configuration_rf_detr import RFDetrConfig + + +logger = logging.get_logger(__name__) + + +if is_timm_available(): + pass + +_CONFIG_FOR_DOC = "RFDetrConfig" +_CHECKPOINT_FOR_DOC = "sensetime/deformable-detr" # TODO + + +@use_kernel_forward_from_hub("MultiScaleDeformableAttention") +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttention +class MultiScaleDeformableAttention(nn.Module): + def forward( + self, + value: Tensor, + value_spatial_shapes: Tensor, + value_spatial_shapes_list: List[Tuple], + level_start_index: Tensor, + sampling_locations: Tensor, + attention_weights: Tensor, + im2col_step: int, + ): + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height * width for height, width in value_spatial_shapes_list], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes_list): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id] + .flatten(2) + .transpose(1, 2) + .reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, + sampling_grid_l_, + mode="bilinear", + padding_mode="zeros", + align_corners=False, + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderOutput with DeformableDetr->RFDetr +class RFDetrDecoderOutput(ModelOutput): + """ + Base class for outputs of the RFDetrDecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + """ + + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModelOutput with DeformableDetr->RFDetr +class RFDetrModelOutput(ModelOutput): + """ + Base class for outputs of the Deformable DETR encoder-decoder model. + + Args: + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. 
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + """ + + init_reference_points: torch.FloatTensor = None + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + + +@dataclass +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrObjectDetectionOutput with DeformableDetr->RFDetr +class RFDetrObjectDetectionOutput(ModelOutput): + """ + Output type of [`RFDetrForObjectDetection`]. 
+ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~RFDetrProcessor.post_process_object_detection`] to retrieve the + unnormalized bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer + plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries, + num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted + average in the self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each + layer plus the initial embedding outputs. 
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4, + 4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average + in the self-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + init_reference_points: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional = None + enc_outputs_coord_logits: Optional = None + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr._get_clones +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.inverse_sigmoid +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrFrozenBatchNorm2d with DeformableDetr->RFDetr +class RFDetrFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. 
+ """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.replace_batch_norm with DeformableDetr->RFDetr +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `RFDetrFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = RFDetrFrozenBatchNorm2d(module.num_features) + + if not module.weight.device == torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +class RFDetrConvNormLayer(nn.Module): + def __init__( + self, + config: RFDetrConfig, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int, + padding: int = None, + activation: str = None, + ): + super().__init__() + activation = config.projector_activation_function if activation is None else activation + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2 if padding is None else padding, + bias=False, + ) + self.norm = ( + RFDetrLayerNorm(out_channels, data_format="channels_first") + if config.layer_norm + else nn.BatchNorm2d(out_channels, config.batch_norm_eps) + ) + self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() + + def forward(self, hidden_state): + hidden_state = self.conv(hidden_state) + hidden_state = self.norm(hidden_state) + hidden_state = self.activation(hidden_state) + return hidden_state + + +# Copied from transformers.models.rt_detr.modeling_rt_detr.RTDetrRepVggBlock with RTDetr->RFDetr, activation_function->projector_activation_function +class RFDetrCSPRepBottleneck(nn.Module): + """ + RepVGG architecture block introduced by the work "RepVGG: Making VGG-style ConvNets Great Again". 
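+
+    The forward pass applies two successive 3x3 conv-norm layers and adds the input back, i.e. it computes
+    `hidden_states + conv2(conv1(hidden_states))`.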
+ """ + + def __init__(self, config: RFDetrConfig, hidden_channels: int): + super().__init__() + + activation = config.projector_activation_function + self.conv1 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 3, 1) + self.conv2 = RFDetrConvNormLayer(config, hidden_channels, hidden_channels, 3, 1) + self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() + + def forward(self, hidden_states): + output_states = self.conv1(hidden_states) + output_states = self.conv2(output_states) + return hidden_states + output_states + + +class RFDetrCSPRepLayer(nn.Module): + """ + Cross Stage Partial (CSP) network layer with RepVGG blocks. + """ + + def __init__(self, config: RFDetrConfig, in_channels: int): + super().__init__() + + out_channels = config.d_model + num_blocks = config.projector_num_blocks + + self.hidden_channels = int(out_channels * config.csp_hidden_expansion) + self.conv1 = RFDetrConvNormLayer(config, in_channels, 2 * self.hidden_channels, 1, 1) + self.conv2 = RFDetrConvNormLayer(config, (2 + num_blocks) * self.hidden_channels, out_channels, 1, 1) + self.bottlenecks = nn.ModuleList( + [RFDetrCSPRepBottleneck(config, self.hidden_channels) for _ in range(num_blocks)] + ) + + def forward(self, hidden_states): + hidden_states = self.conv1(hidden_states) + all_hidden_states = list(hidden_states.split(self.hidden_channels, 1)) + hidden_states = all_hidden_states[-1] + for bottleneck in self.bottlenecks: + new_hidden_states = bottleneck(hidden_states) + all_hidden_states.append(new_hidden_states) + # all_hidden_states.extend(bottleneck(hidden_states) for bottleneck in self.bottlenecks) + hidden_states = torch.cat(all_hidden_states, 1) + hidden_states = self.conv2(hidden_states) + return hidden_states + + +class RFDetrScaleProjectorLayer(nn.Module): + def __init__(self, config: RFDetrConfig, scale: float, in_channels: int): + super().__init__() + self.use_extra_pool = False + layers = [] + if scale == 2.0: + layers.append(nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)) + elif scale == 1.0: + pass + elif scale == 0.5: + layers.append(RFDetrConvNormLayer(config, in_channels, in_channels, 3, 2, activation="relu")) + else: + raise NotImplementedError("Unsupported scale_factor:{}".format(scale)) + self.layers = nn.Sequential(*layers) + + def forward(self, hidden_state): + return self.layers(hidden_state) + + +# Copied from transformers.models.convnext.modeling_convnext.ConvNextLayerNorm with ConvNext->RFDetr +class RFDetrLayerNorm(nn.Module): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, + width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). 
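+
+    For `channels_first` inputs the mean and variance are computed over the channel dimension in float32, the
+    normalized tensor is cast back to the input dtype, and the per-channel `weight` and `bias` are then applied.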
+ """ + + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {self.data_format}") + self.normalized_shape = (normalized_shape,) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.data_format == "channels_last": + x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + elif self.data_format == "channels_first": + input_dtype = x.dtype + x = x.float() + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = x.to(dtype=input_dtype) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class RFDetrScaleProjector(nn.Module): + def __init__(self, config: RFDetrConfig, scale: float, in_channels: List[int]): + super().__init__() + + self.sampling_layers = nn.ModuleList( + [RFDetrScaleProjectorLayer(config, scale, in_channel) for in_channel in in_channels] + ) + + in_dim = int(sum(in_channel // max(1, scale) for in_channel in in_channels)) + self.stage_layer = RFDetrCSPRepLayer(config, in_dim) + self.layer_norm = RFDetrLayerNorm(config.d_model, data_format="channels_first") + + def forward(self, hidden_states): + features = [layer(hidden_state) for layer, hidden_state in zip(self.sampling_layers, hidden_states)] + features = torch.cat(features, dim=1) + hidden_state = self.stage_layer(features) + hidden_state = self.layer_norm(hidden_state) + return hidden_state + + +class RFDetrMultiScaleProjector(nn.Module): + """ + This module implements MultiScaleProjector in :paper:`lwdetr`. + It creates pyramid features built on top of the input feature map. + """ + + def __init__( + self, + config: RFDetrConfig, + ): + """ + Args: + net (Backbone): module representing the subnetwork backbone. + Must be a subclass of :class:`Backbone`. + out_channels (int): number of channels in the output feature maps. + scale_factors (list[float]): list of scaling factors to upsample or downsample + the input features for creating pyramid features. + """ + super().__init__() + in_channels = config.projector_in_channels + self.survival_prob = config.projector_survival_prob + self.force_drop_last_n_features = config.projector_force_drop_last_n_features + scale_factors = config.scale_factors + + self.scale_layers = nn.ModuleList( + [RFDetrScaleProjector(config, scale, in_channels) for scale in scale_factors] + ) + + def forward(self, features): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: + mapping from feature map name to pyramid feature map tensor + in high to low resolution order. Returned feature names follow the FPN + convention: "p", where stage has stride = 2 ** stage e.g., + ["p2", "p3", ..., "p6"]. 
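+
+        Note: this implementation returns the projected feature maps as a plain list, with one entry per scale
+        factor in `config.scale_factors`, rather than a name-to-tensor mapping.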
+ """ + num_features = len(features) + if self.survival_prob < 1.0 and self.training: + final_drop_prob = 1 - self.survival_prob + drop_p = np.random.uniform() + for i in range(1, num_features): + critical_drop_prob = i * (final_drop_prob / (num_features - 1)) + if drop_p < critical_drop_prob: + features[i][:] = 0 + elif self.force_drop_last_n_features > 0: + for i in range(self.force_drop_last_n_features): + # don't do it inplace to ensure the compiler can optimize out the backbone layers + features[-(i + 1)] = torch.zeros_like(features[-(i + 1)]) + + outputs = [layer(features) for layer in self.scale_layers] + return outputs + + +class RFDetrConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by RFDetrFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + backbone = load_backbone(config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = None + if config.backbone is not None: + backbone_model_type = config.backbone + elif config.backbone_config is not None: + backbone_model_type = config.backbone_config.model_type + else: + raise ValueError("Either `backbone` or `backbone_config` should be provided in the config") + + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + self.projector = RFDetrMultiScaleProjector(config) + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + features = self.projector(features) + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrConvModel with DeformableDetr->RFDetr +class RFDetrConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. 
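+
+    The forward pass returns the `(feature_map, mask)` tuples produced by the convolutional encoder together with a
+    position embedding for each feature map, cast to the feature map's dtype.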
+ """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrSinePositionEmbedding with DeformableDetr->RFDetr +class RFDetrSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=pixel_values.dtype) + x_embed = pixel_mask.cumsum(2, dtype=pixel_values.dtype) + if self.normalize: + eps = 1e-6 + y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=pixel_values.dtype, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLearnedPositionEmbedding with DeformableDetr->RFDetr +class RFDetrLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. 
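+
+    Row and column indices (up to 50 each) are embedded separately and the two embeddings are concatenated, yielding
+    a position embedding of shape `(batch_size, 2 * embedding_dim, height, width)`.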
+ """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat( + [ + x_emb.unsqueeze(0).repeat(height, 1, 1), + y_emb.unsqueeze(1).repeat(1, width, 1), + ], + dim=-1, + ) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.build_position_encoding with DeformableDetr->RFDetr +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = RFDetrSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = RFDetrLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->RFDetr +class RFDetrMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Deformable DETR. + """ + + def __init__(self, config: RFDetrConfig, num_heads: int, n_points: int): + super().__init__() + + self.attn = MultiScaleDeformableAttention() + + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in RFDetrMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." 
+ ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + total_elements = sum(height * width for height, width in spatial_shapes_list) + if total_elements != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + num_coordinates = reference_points.shape[-1] + if num_coordinates == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif num_coordinates == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + output = self.attn( + value, + spatial_shapes, + spatial_shapes_list, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + + output = self.output_proj(output) + + return output, attention_weights + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiheadAttention with DeformableDetr->RFDetr +class RFDetrMultiheadAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. 
+ + Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, target_len, embed_dim = hidden_states.size() + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # get queries, keys and values + query_states = self.q_proj(hidden_states) * self.scaling + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != ( + batch_size * self.num_heads, + target_len, + self.head_dim, + ): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrEncoderLayer with DeformableDetr->RFDetr +class RFDetrEncoderLayer(nn.Module): + def __init__(self, config: RFDetrConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = RFDetrMultiscaleDeformableAttention( + config, + num_heads=config.encoder_attention_heads, + n_points=config.encoder_n_points, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. 
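+        # In the encoder the deformable attention attends over the flattened multi-scale features themselves, so the
+        # layer input doubles as `encoder_hidden_states` and its mask as `encoder_attention_mask`; each position only
+        # samples a small set of points around its reference point instead of attending to the full sequence.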
+ hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoderLayer with DeformableDetr->RFDetr +class RFDetrDecoderLayer(nn.Module): + def __init__(self, config: RFDetrConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = RFDetrMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_self_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention + self.encoder_attn = RFDetrMultiscaleDeformableAttention( + config, + num_heads=config.decoder_cross_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(seq_len, batch, embed_dim)`. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. 
+ encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrPreTrainedModel with DeformableDetr->RFDetr +class RFDetrPreTrainedModel(PreTrainedModel): + config_class = RFDetrConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = [ + r"RFDetrConvEncoder", + r"RFDetrEncoderLayer", + r"RFDetrDecoderLayer", + ] + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, RFDetrLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, RFDetrMultiscaleDeformableAttention): + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(module.n_heads, 1, 1, 2) + .repeat(1, module.n_levels, module.n_points, 1) + ) + for i in range(module.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + module.sampling_offsets.bias = 
nn.Parameter(grid_init.view(-1)) + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(module.value_proj.weight.data) + nn.init.constant_(module.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(module.output_proj.weight.data) + nn.init.constant_(module.output_proj.bias.data, 0.0) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + + +RFDETR_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`RFDetrConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +RFDETR_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`AutoImageProcessor`]. See [`DeformableDetrImageProcessor.__call__`] + for details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. 
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrDecoder with DeformableDetr->RFDetr +class RFDetrDecoder(RFDetrPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`RFDetrDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Deformable DETR: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. + + Args: + config: RFDetrConfig + """ + + def __init__(self, config: RFDetrConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([RFDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.gradient_checkpointing = False + + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR + self.bbox_embed = None + self.class_embed = None + + self.config = config + self.reference_points_head = RFDetrMLPPredictionHead(2 * config.d_model, config.d_model, config.d_model, 2) + + self.norm = RFDetrLayerNorm(config.d_model, data_format="channels_first") + + # Initialize weights and apply final processing + self.post_init() + + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=proposals.dtype, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def get_reference_points(self, reference_points_embeds, valid_ratios): + obj_center = reference_points_embeds[..., :4] + + refpoints_input = ( + obj_center[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) # bs, nq, nlevel, 4 + query_sine_embed = self.get_proposal_pos_embed(refpoints_input[:, :, 0, :], self.d_model / 2) # bs, nq, 256*2 + position_query_embeddings = self.reference_points_head(query_sine_embed) + return refpoints_input, position_query_embeddings + + def forward( + self, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + reference_points_embeddings=None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + 
output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+                The query embeddings that are passed into the decoder.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                in `[0, 1]`:
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+            position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` when `config.two_stage=True`, else `(batch_size, num_queries, 2)`, *optional*):
+                Reference points in range `[0, 1]`, with top-left at (0, 0) and bottom-right at (1, 1), including the
+                padding area.
+            spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of the feature maps.
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+                Indices for the start of each feature level. In range `[0, sequence_length]`.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+                Ratio of valid area in each feature level.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + + for idx, decoder_layer in enumerate(self.layers): + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + reference_points_input = ( + reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) + elif reference_points.shape[-1] == 2: + reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + else: + raise ValueError("Reference points' last dimension must be of size 2") + + if self.config.bbox_reparam: + query_sine_embed = self.get_proposal_pos_embed( + reference_points_input[:, :, 0, :], self.d_model / 2 + ) # bs, nq, 256*2 + position_embeddings = self.reference_points_head(query_sine_embed) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + position_embeddings, + reference_points_input, + spatial_shapes, + spatial_shapes_list, + level_start_index, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + encoder_hidden_states=encoder_hidden_states, + reference_points=reference_points_input, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + num_coordinates = reference_points.shape[-1] + if self.config.bbox_reparam: + new_reference_points_cxcy = ( + tmp[..., :2] * reference_points_embeddings[..., 2:] + reference_points_embeddings[..., :2] + ) + new_reference_points_wh = tmp[..., 2:].exp() * reference_points_embeddings[..., 2:] + new_reference_points = torch.concat([new_reference_points_cxcy, new_reference_points_wh], dim=-1) + else: + if num_coordinates == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + elif num_coordinates == 2: + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" + ) + reference_points = new_reference_points.detach() + + intermediate += (hidden_states,) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # Keep batch_size as first dimension + 
intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + return RFDetrDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + """ + The bare RF DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. + """, + RFDETR_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel with DeformableDetr->RFDetr, DEFORMABLE_DETR->RFDETR +class RFDetrModel(RFDetrPreTrainedModel): + def __init__(self, config: RFDetrConfig): + super().__init__(config) + + self.bbox_reparam = config.bbox_reparam + self.two_stage = config.two_stage + self.num_groups = config.num_groups + self.num_queries = config.num_queries + self.d_model = config.d_model + + # Create backbone + positional encoding + backbone = RFDetrConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = RFDetrConvModel(backbone, position_embeddings) + + self.query_position_embeddings = nn.Embedding(config.num_queries * config.num_groups, config.d_model) + + self.reference_point_embeddings = nn.Embedding(config.num_queries * config.num_groups, 4) + nn.init.constant_(self.reference_point_embeddings.weight.data, 0) + + self.decoder = RFDetrDecoder(config) + + if config.two_stage: + self.enc_output = nn.ModuleList( + [nn.Linear(config.d_model, config.d_model) for _ in range(config.num_groups)] + ) + self.enc_output_norm = nn.ModuleList([nn.LayerNorm(config.d_model) for _ in range(config.num_groups)]) + # self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) + # self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) + else: + self.reference_points = nn.Linear(config.d_model, 2) + + self.post_init() + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + def get_valid_ratio(self, mask, dtype=torch.float32): + """Get the valid ratio of all feature maps.""" + + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_height = valid_height.to(dtype) / height + valid_ratio_width = valid_width.to(dtype) / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1) + return valid_ratio + + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=proposals.dtype, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / 
num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. + + Args: + enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. + padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. + spatial_shapes (List[Tuple[int, int]]): Spatial shapes of the feature maps. + + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. + """ + batch_size = enc_output.shape[0] + proposals = [] + _cur = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace( + 0, + height - 1, + height, + dtype=enc_output.dtype, + device=enc_output.device, + ), + torch.linspace( + 0, + width - 1, + width, + dtype=enc_output.dtype, + device=enc_output.device, + ), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + proposals.append(proposal) + _cur += height * width + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + # object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals + + @add_start_docstrings_to_model_forward(RFDETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=RFDetrModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = 
None,
+    ) -> Union[Tuple[torch.FloatTensor], RFDetrModelOutput]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, RFDetrModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")
+        >>> model = RFDetrModel.from_pretrained("SenseTime/deformable-detr")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 300, 256]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device)
+
+        # Extract multi-scale feature maps, all projected to `config.d_model` channels.
+        # First, send pixel_values + pixel_mask through the backbone to obtain the features,
+        # which is a list of (feature_map, mask) tuples.
+        features, _ = self.backbone(pixel_values, pixel_mask)
+
+        # The multi-scale projector inside the conv encoder already maps each feature map to d_model channels (256 by default)
+        sources = []
+        masks = []
+        for source, mask in features:
+            sources.append(source)
+            masks.append(mask)
+            if mask is None:
+                raise ValueError("No attention mask was provided")
+
+        # Create queries
+        query_embeds = None
+        if not self.config.two_stage:
+            if self.training:
+                query_embeds = self.query_position_embeddings.weight
+            else:
+                query_embeds = self.query_position_embeddings.weight[: self.num_queries]
+
+        if self.training:
+            reference_point_embeds = self.reference_point_embeddings.weight
+        else:
+            reference_point_embeds = self.reference_point_embeddings.weight[: self.num_queries]
+
+        # Prepare encoder inputs (by flattening)
+        source_flatten = []
+        mask_flatten = []
+        spatial_shapes_list = []
+        for source, mask in zip(sources, masks):
+            batch_size, num_channels, height, width = source.shape
+            spatial_shape = (height, width)
+            spatial_shapes_list.append(spatial_shape)
+            source = source.flatten(2).transpose(1, 2)
+            mask = mask.flatten(1)
+            source_flatten.append(source)
+            mask_flatten.append(mask)
+        source_flatten = torch.cat(source_flatten, 1)
+        mask_flatten = torch.cat(mask_flatten, 1)
+        spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m, dtype=source_flatten.dtype) for m in masks], 1)
+
+        # Prepare decoder inputs
+        encoder_hidden_states = source_flatten
+        batch_size, _, num_channels = encoder_hidden_states.shape
+        enc_outputs_class = None
+        enc_outputs_coord_logits = None
+
+        if self.two_stage:
+            object_query_embedding, output_proposals = self.gen_encoder_output_proposals(
+                source_flatten, ~mask_flatten, spatial_shapes_list
+            )
+
+            reference_points = []
+            encoder_hidden_states = []
+            boxes_ts = []
+            num_groups = self.num_groups if self.training else 1
+            for
group_id in range(num_groups): + object_query_embedding = self.enc_output[group_id](object_query_embedding) + object_query_embedding = self.enc_output_norm[group_id](object_query_embedding) + + enc_outputs_class = self.enc_out_class_embed[group_id](object_query_embedding) + + if self.bbox_reparam: + enc_outputs_coord_delta = self.enc_out_bbox_embed[group_id](object_query_embedding) + enc_outputs_coord_cxcy = ( + enc_outputs_coord_delta[..., :2] * output_proposals[..., 2:] + output_proposals[..., :2] + ) + enc_outputs_coord_wh = enc_outputs_coord_delta[..., 2:].exp() * output_proposals[..., 2:] + enc_outputs_coord_logits = torch.concat([enc_outputs_coord_cxcy, enc_outputs_coord_wh], dim=-1) + else: + delta_bbox = self.enc_out_bbox_embed[group_id](object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + topk = self.config.two_stage_num_proposals + enc_outputs_class = torch.max(enc_outputs_class, dim=-1)[0] + topk_proposals = torch.topk(enc_outputs_class, topk, dim=1)[1] # bs, nq + + reference_point_embedding = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) # unsigmoid + # for decoder layer, detached as initial ones, (bs, nq, 4) + reference_point_embedding_detached = reference_point_embedding.detach() + + # get memory tgt + target = torch.gather( + object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ) + + reference_points.append(reference_point_embedding_detached) + encoder_hidden_states.append(target) + boxes_ts.append(reference_point_embedding) + reference_points = torch.cat(reference_points, dim=1) + encoder_hidden_states = torch.cat(encoder_hidden_states, dim=1) + boxes_ts = torch.cat(boxes_ts, dim=1) + + # hack implementation for two-stage Deformable DETR + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. 
foreground and background) + # enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + # delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + # enc_outputs_coord_logits = delta_bbox + output_proposals + + # only keep top scoring `config.two_stage_num_proposals` proposals + topk = self.config.two_stage_num_proposals + topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, + 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4), + ) + + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) + query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + else: + query_embed, target = torch.split(query_embeds, num_channels, dim=1) + query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) + target = target.unsqueeze(0).expand(batch_size, -1, -1) + reference_points = self.reference_points(query_embed).sigmoid() + init_reference_points = reference_points + + decoder_outputs = self.decoder( + inputs_embeds=target, + position_embeddings=query_embed, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None) + tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + + return tuple_outputs + + return RFDetrModelOutput( + init_reference_points=init_reference_points, + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + # TODO + # encoder_last_hidden_state=features.last_hidden_state, + # encoder_hidden_states=features.hidden_states, + # encoder_attentions=features.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + ) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMLPPredictionHead with DeformableDetr->RFDetr +class RFDetrMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. 
+ + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +@add_start_docstrings( + """ + RF DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. + """, + RFDETR_START_DOCSTRING, +) +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection with Deformable->RF, DEFORMABLE_DETR->RFDETR +class RFDetrForObjectDetection(RFDetrPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] + # We can't initialize the model on meta device as some weights are modified during the initialization + _no_split_modules = None + + def __init__(self, config: RFDetrConfig): + super().__init__(config) + + # RF DETR encoder-decoder model + self.model = RFDetrModel(config) + # Detection heads on top + self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.bbox_embed = RFDetrMLPPredictionHead( + input_dim=config.d_model, + hidden_dim=config.d_model, + output_dim=4, + num_layers=3, + ) + + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(config.num_labels) * bias_value + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers + if config.with_box_refine: + # self.class_embed = _get_clones(self.class_embed, num_pred) + # self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + # nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0) + # hack implementation for iterative bounding box refinement + self.model.decoder.bbox_embed = self.bbox_embed + else: + # nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + # self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + # self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.model.decoder.bbox_embed = None + if config.two_stage: + # hack implementation for two-stage + self.model.decoder.class_embed = self.class_embed + # for box_embed in self.bbox_embed: + # nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0) + + self.model.enc_out_bbox_embed = nn.ModuleList( + [copy.deepcopy(self.bbox_embed) for _ in range(config.num_groups)] + ) + self.model.enc_out_class_embed = nn.ModuleList( + [copy.deepcopy(self.class_embed) for _ in range(config.num_groups)] + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(RFDETR_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=RFDetrObjectDetectionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: 
Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[List[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], RFDetrObjectDetectionOutput]: + r""" + labels (`List[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, RFDetrForObjectDetection + >>> from PIL import Image + >>> import requests + >>> import torch + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = RFDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ + ... 0 + ... ] + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ...
) + Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] + Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] + Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through RfDETR base model to obtain backbone + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + init_reference = outputs.init_reference_points if return_dict else outputs[0] + inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] + + for level in range(hidden_states.shape[1]): + if level == 0: + reference = init_reference + else: + reference = inter_references[:, level - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[level](hidden_states[:, level]) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + if reference.shape[-1] == 4: + outputs_coord_logits = delta_bbox + reference + elif reference.shape[-1] == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, + labels, + self.device, + pred_boxes, + self.config, + outputs_class, + outputs_coord, + ) + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + outputs + tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + + return tuple_outputs + + dict_outputs = RFDetrObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + ) + + return dict_outputs + + +__all__ = [ + "RFDetrForObjectDetection", + "RFDetrModel", + "RFDetrPreTrainedModel", +] diff --git a/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py 
b/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py new file mode 100644 index 000000000000..54530aacffa7 --- /dev/null +++ b/src/transformers/models/rf_detr/modeling_rf_detr_dinov2_with_registers.py @@ -0,0 +1,763 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_rf_detr_dinov2_with_registers.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import collections.abc +from typing import Callable, Optional, Set, Tuple, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BackboneOutput, BaseModelOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from ...utils.backbone_utils import BackboneMixin +from .configuration_rf_detr_dinov2_with_registers import RFDetrDinov2WithRegistersConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "RFDetrDinov2WithRegistersConfig" + + +class RFDetrDinov2WithRegistersPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +class RFDetrDinov2WithRegistersEmbeddings(nn.Module): + """ + Construct the CLS token, mask token, register tokens, position and patch embeddings. 
+ """ + + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.register_tokens = ( + nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + if config.num_register_tokens > 0 + else None + ) + self.patch_embeddings = RFDetrDinov2WithRegistersPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility + with the original implementation. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py + - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py + """ + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # Skip interpolation for matching dimensions (unless tracing) + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + # Handle class token and patch embeddings separately + class_pos_embed = self.position_embeddings[:, 0] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] + + # Calculate new dimensions + height = height // self.config.patch_size + width = width // self.config.patch_size + + # Reshape for interpolation + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + # Store original dtype for restoration after interpolation + target_dtype = patch_pos_embed.dtype + + # Interpolate at float32 precision + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(dtype=torch.float32), + size=(torch_int(height), torch_int(width)), # Explicit size instead of scale_factor + mode="bicubic", + align_corners=False, + antialias=True, + ).to(dtype=target_dtype) + + # Validate output dimensions if not tracing + if not torch.jit.is_tracing(): + if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: + raise ValueError("Width or height does not match with the interpolated position embeddings") + + # Reshape back to original format + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + # Combine class and patch embeddings + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + if bool_masked_pos is not None: + embeddings = torch.where( + bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = 
self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + if self.config.num_windows > 1: + # reshape for windows + num_h_patches = height // self.config.patch_size + num_w_patches = width // self.config.patch_size + cls_token_with_pos_embed = embeddings[:, :1] + pixel_tokens_with_pos_embed = embeddings[:, 1:] + pixel_tokens_with_pos_embed = pixel_tokens_with_pos_embed.view( + batch_size, num_h_patches, num_w_patches, -1 + ) + num_w_patches_per_window = num_w_patches // self.config.num_windows + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_windows = self.config.num_windows + windowed_pixel_tokens = pixel_tokens_with_pos_embed.view( + batch_size, num_windows, num_h_patches_per_window, num_windows, num_h_patches_per_window, -1 + ) + windowed_pixel_tokens = windowed_pixel_tokens.permute(0, 1, 3, 2, 4, 5) + windowed_pixel_tokens = windowed_pixel_tokens.reshape( + batch_size * num_windows**2, num_h_patches_per_window * num_w_patches_per_window, -1 + ) + windowed_cls_token_with_pos_embed = cls_token_with_pos_embed.repeat(num_windows**2, 1, 1) + embeddings = torch.cat((windowed_cls_token_with_pos_embed, windowed_pixel_tokens), dim=1) + + # add register tokens + embeddings = ( + torch.cat( + (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1 + ) + if self.config.num_register_tokens > 0 + else embeddings + ) + + embeddings = self.dropout(embeddings) + + return embeddings + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class RFDetrDinov2WithRegistersSelfAttention(nn.Module): + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = None + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class RFDetrDinov2WithRegistersSelfOutput(nn.Module): + """ + The residual connection is defined in RFDetrDinov2WithRegistersLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
+ """ + + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class RFDetrDinov2WithRegistersAttention(nn.Module): + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + self.attention = RFDetrDinov2WithRegistersSelfAttention(config) + self.output = RFDetrDinov2WithRegistersSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class RFDetrDinov2WithRegistersLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class RFDetrDinov2WithRegistersDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class RFDetrDinov2WithRegistersMLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +class RFDetrDinov2WithRegistersSwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +class RFDetrDinov2WithRegistersLayer(nn.Module): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = RFDetrDinov2WithRegistersAttention(config) + self.layer_scale1 = RFDetrDinov2WithRegistersLayerScale(config) + self.drop_path = ( + RFDetrDinov2WithRegistersDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + ) + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = RFDetrDinov2WithRegistersSwiGLUFFN(config) + else: + self.mlp = RFDetrDinov2WithRegistersMLP(config) + self.layer_scale2 = RFDetrDinov2WithRegistersLayerScale(config) + self.num_windows = config.num_windows + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + remove_windows: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + assert head_mask is None, "head_mask is not supported for windowed attention" + assert not output_attentions, "output_attentions is not supported for windowed attention" + shortcut = 
hidden_states + if remove_windows: + # reshape x to remove windows + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + hidden_states = hidden_states.view(B // num_windows_squared, num_windows_squared * HW, C) + + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in Dinov2WithRegisters, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + if remove_windows: + # reshape x to add windows back + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + # hidden_states = hidden_states.view(B * num_windows_squared, HW // num_windows_squared, C) + attention_output = attention_output.view(B * num_windows_squared, HW // num_windows_squared, C) + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + shortcut + + # in Dinov2WithRegisters, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +class RFDetrDinov2WithRegistersEncoder(nn.Module): + def __init__(self, config: RFDetrDinov2WithRegistersConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([RFDetrDinov2WithRegistersLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if i > int(self.config.out_features[-1][5:]): + # early stop if we have reached the last output feature + break + + remove_windows = i not in self.config.window_block_indexes + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + remove_windows, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, remove_windows) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class RFDetrDinov2WithRegistersPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = RFDetrDinov2WithRegistersConfig + base_model_prefix = "rf_detr_dinov2_with_registers" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["RFDetrDinov2WithRegistersSwiGLUFFN"] + _supports_sdpa = True + _supports_flash_attn_2 = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, RFDetrDinov2WithRegistersEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + + +RF_DETR_DINOV2_WITH_REGISTERS_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`RFDetrDinov2WithRegistersConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +RF_DETR_DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`BitImageProcessor.preprocess`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + """ + RFDetrDinov2WithRegisters backbone, to be used with frameworks like DETR and MaskFormer. 
+ """, + RF_DETR_DINOV2_WITH_REGISTERS_START_DOCSTRING, +) +class RFDetrDinov2WithRegistersBackbone(RFDetrDinov2WithRegistersPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] + self.embeddings = RFDetrDinov2WithRegistersEmbeddings(config) + self.encoder = RFDetrDinov2WithRegistersEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.num_register_tokens = config.num_register_tokens + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> RFDetrDinov2WithRegistersPatchEmbeddings: + return self.embeddings.patch_embeddings + + @add_start_docstrings_to_model_forward(RF_DETR_DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + Returns: + + Examples: + Returns: + + Examples: + Returns: + + Examples: + + + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"] + ... 
) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 16, 16] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + if self.config.apply_layernorm: + hidden_state = self.layernorm(hidden_state) + if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, self.num_register_tokens + 1 :] + # this was actually a bug in the original implementation that we copied here, + # cause normally the order is height, width + batch_size, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + + num_h_patches = height // patch_size + num_w_patches = width // patch_size + + if self.config.num_windows > 1: + # undo windowing + num_windows_squared = self.config.num_windows**2 + B, HW, C = hidden_state.shape + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_w_patches_per_window = num_w_patches // self.config.num_windows + hidden_state = hidden_state.reshape(B // num_windows_squared, num_windows_squared * HW, C) + hidden_state = hidden_state.view( + B // num_windows_squared, + self.config.num_windows, + self.config.num_windows, + num_h_patches_per_window, + num_w_patches_per_window, + C, + ) + hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5) + + hidden_state = hidden_state.reshape(batch_size, num_h_patches, num_w_patches, -1) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + + feature_maps += (hidden_state,) + + if not return_dict: + if output_hidden_states: + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions if output_attentions else None, + ) + + +__all__ = ["RFDetrDinov2WithRegistersBackbone"] diff --git a/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py b/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py new file mode 100644 index 000000000000..7e7a785111d6 --- /dev/null +++ b/src/transformers/models/rf_detr/modular_rf_detr_dinov2_with_registers.py @@ -0,0 +1,308 @@ +from typing import Optional, Tuple, Union + +import torch +from torch import nn + +from ...modeling_outputs import BackboneOutput, BaseModelOutput +from ..dinov2_with_registers.configuration_dinov2_with_registers import Dinov2WithRegistersConfig +from ..dinov2_with_registers.modeling_dinov2_with_registers import ( + Dinov2WithRegistersBackbone, + Dinov2WithRegistersEmbeddings, + Dinov2WithRegistersEncoder, + Dinov2WithRegistersLayer, + Dinov2WithRegistersSelfAttention, +) + + +class RFDetrDinov2WithRegistersConfig(Dinov2WithRegistersConfig): + def __init__(self, num_windows: int = 4, window_block_indexes=None, **super_kwargs): + 
super().__init__(**super_kwargs) + + self.num_windows = num_windows + window_block_indexes = set(range(self._out_indices[-1] + 1)) + window_block_indexes.difference_update(self._out_indices) + window_block_indexes = list(window_block_indexes) + self.window_block_indexes = window_block_indexes + + +class RFDetrDinov2WithRegistersEmbeddings(Dinov2WithRegistersEmbeddings): + def __init__(self, config: RFDetrDinov2WithRegistersConfig): + super().__init__(config) + self.register_tokens = ( + nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + if config.num_register_tokens > 0 + else None + ) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + if bool_masked_pos is not None: + embeddings = torch.where( + bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + if self.config.num_windows > 1: + # reshape for windows + num_h_patches = height // self.config.patch_size + num_w_patches = width // self.config.patch_size + cls_token_with_pos_embed = embeddings[:, :1] + pixel_tokens_with_pos_embed = embeddings[:, 1:] + pixel_tokens_with_pos_embed = pixel_tokens_with_pos_embed.view( + batch_size, num_h_patches, num_w_patches, -1 + ) + num_w_patches_per_window = num_w_patches // self.config.num_windows + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_windows = self.config.num_windows + windowed_pixel_tokens = pixel_tokens_with_pos_embed.view( + batch_size, num_windows, num_h_patches_per_window, num_windows, num_h_patches_per_window, -1 + ) + windowed_pixel_tokens = windowed_pixel_tokens.permute(0, 1, 3, 2, 4, 5) + windowed_pixel_tokens = windowed_pixel_tokens.reshape( + batch_size * num_windows**2, num_h_patches_per_window * num_w_patches_per_window, -1 + ) + windowed_cls_token_with_pos_embed = cls_token_with_pos_embed.repeat(num_windows**2, 1, 1) + embeddings = torch.cat((windowed_cls_token_with_pos_embed, windowed_pixel_tokens), dim=1) + + # add register tokens + embeddings = ( + torch.cat( + (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1 + ) + if self.config.num_register_tokens > 0 + else embeddings + ) + + embeddings = self.dropout(embeddings) + + return embeddings + + +class RFDetrDinov2WithRegistersSelfAttention(Dinov2WithRegistersSelfAttention): + def __init__(self, config: RFDetrDinov2WithRegistersConfig): + super().__init__(config) + self.scaling = None + + +class RFDetrDinov2WithRegistersLayer(Dinov2WithRegistersLayer): + def __init__(self, config: RFDetrDinov2WithRegistersConfig): + super().__init__(config) + self.num_windows = config.num_windows + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + remove_windows: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + assert head_mask is None,
"head_mask is not supported for windowed attention" + assert not output_attentions, "output_attentions is not supported for windowed attention" + shortcut = hidden_states + if remove_windows: + # reshape x to remove windows + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + hidden_states = hidden_states.view(B // num_windows_squared, num_windows_squared * HW, C) + + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in Dinov2WithRegisters, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + if remove_windows: + # reshape x to add windows back + B, HW, C = hidden_states.shape + num_windows_squared = self.num_windows**2 + # hidden_states = hidden_states.view(B * num_windows_squared, HW // num_windows_squared, C) + attention_output = attention_output.view(B * num_windows_squared, HW // num_windows_squared, C) + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + shortcut + + # in Dinov2WithRegisters, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +class RFDetrDinov2WithRegistersEncoder(Dinov2WithRegistersEncoder): + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if i > int(self.config.out_features[-1][5:]): + # early stop if we have reached the last output feature + break + + remove_windows = i not in self.config.window_block_indexes + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + remove_windows, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, remove_windows) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class RFDetrDinov2WithRegistersBackbone(Dinov2WithRegistersBackbone): + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + Returns: + + Examples: + + + ```python + >>> from transformers 
import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base") + >>> model = AutoBackbone.from_pretrained( + ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"] + ... ) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 16, 16] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + if self.config.apply_layernorm: + hidden_state = self.layernorm(hidden_state) + if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, self.num_register_tokens + 1 :] + # this was actually a bug in the original implementation that we copied here, + # cause normally the order is height, width + batch_size, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + + num_h_patches = height // patch_size + num_w_patches = width // patch_size + + if self.config.num_windows > 1: + # undo windowing + num_windows_squared = self.config.num_windows**2 + B, HW, C = hidden_state.shape + num_h_patches_per_window = num_h_patches // self.config.num_windows + num_w_patches_per_window = num_w_patches // self.config.num_windows + hidden_state = hidden_state.reshape(B // num_windows_squared, num_windows_squared * HW, C) + hidden_state = hidden_state.view( + B // num_windows_squared, + self.config.num_windows, + self.config.num_windows, + num_h_patches_per_window, + num_w_patches_per_window, + C, + ) + hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5) + + hidden_state = hidden_state.reshape(batch_size, num_h_patches, num_w_patches, -1) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + + feature_maps += (hidden_state,) + + if not return_dict: + if output_hidden_states: + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions if output_attentions else None, + ) + + +__all__ = [ + "RFDetrDinov2WithRegistersConfig", + "RFDetrDinov2WithRegistersBackbone", +] diff --git a/src/transformers/models/rf_detr/run_rfdetr.py b/src/transformers/models/rf_detr/run_rfdetr.py new file mode 100644 index 000000000000..0ec581d49343 --- /dev/null +++ b/src/transformers/models/rf_detr/run_rfdetr.py @@ -0,0 +1,18 @@ +import io + +import requests +from PIL import Image + +from transformers import AutoImageProcessor, RFDetrBackbone, RFDetrConfig + + +images = 
["https://media.roboflow.com/notebooks/examples/dog-2.jpeg"] + +images = [Image.open(io.BytesIO(requests.get(url).content)) for url in images] + +processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") +inputs = processor(images, return_tensors="pt") + +config = RFDetrConfig() +backbone = RFDetrBackbone(config=config.backbone_config) +# model = RFDetrForObjectDetection.from_config() diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index a7051cffca81..a0683b901966 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -8387,6 +8387,34 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class RFDetrDinov2WithRegistersBackbone(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RFDetrForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RFDetrModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RFDetrPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class RobertaForCausalLM(metaclass=DummyObject): _backends = ["torch"]