Commit 606b88b

update swinb & swinl results
1 parent 892e8ec commit 606b88b

17 files changed: +478 -0 lines changed
@@ -0,0 +1,85 @@
_base_ = '../grounding_dino_swin-l_pretrain_obj365.py'

data_root = 'data/coco/'

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', prob=0.5),
    dict(
        type='RandomChoice',
        transforms=[
            [
                dict(
                    type='RandomChoiceResize',
                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                            (736, 1333), (768, 1333), (800, 1333)],
                    keep_ratio=True)
            ],
            [
                dict(
                    type='RandomChoiceResize',
                    # The aspect ratio of all images in the train dataset is < 7,
                    # following the original implementation
                    scales=[(400, 4200), (500, 4200), (600, 4200)],
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=True),
                dict(
                    type='RandomChoiceResize',
                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                            (736, 1333), (768, 1333), (800, 1333)],
                    keep_ratio=True)
            ]
        ]),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'flip', 'flip_direction', 'text',
                   'custom_entities'))
]

train_dataloader = dict(
    dataset=dict(
        _delete_=True,
        type='CocoDataset',
        data_root=data_root,
        ann_file='annotations/instances_train2017.json',
        data_prefix=dict(img='train2017/'),
        return_classes=True,
        filter_cfg=dict(filter_empty_gt=False, min_size=32),
        pipeline=train_pipeline))

optim_wrapper = dict(
    _delete_=True,
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
    clip_grad=dict(max_norm=0.1, norm_type=2),
    paramwise_cfg=dict(
        custom_keys={
            'absolute_pos_embed': dict(decay_mult=0.),
            'backbone': dict(lr_mult=0.1),
            'language_model': dict(lr_mult=0.1),
        }))

# learning policy
max_epochs = 12
param_scheduler = [
    dict(
        type='MultiStepLR',
        begin=0,
        end=max_epochs,
        by_epoch=True,
        milestones=[8, 11],
        gamma=0.1)
]
train_cfg = dict(max_epochs=max_epochs, val_interval=1)

default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))

load_from = 'xxxx'  # noqa
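The config above fine-tunes the Objects365-pretrained Swin-L model on COCO for 12 epochs; `load_from` is left as a placeholder for the pretrained checkpoint. A minimal sketch, assuming MMEngine/MMDetection are installed, of how such a config could be loaded and launched; the config filename, checkpoint path, and work_dir below are hypothetical, not part of this commit:

# Minimal launch sketch (assumed paths, not from this diff).
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile('grounding_dino_swin-l_finetune_16xb4_1x_coco.py')  # hypothetical filename
cfg.load_from = '/path/to/obj365_pretrained.pth'  # replaces the 'xxxx' placeholder
cfg.work_dir = './work_dirs/swin-l_finetune_coco'

runner = Runner.from_cfg(cfg)  # builds model, dataloaders, optimizer from the config
runner.train()                 # 12-epoch schedule with LR drops at epochs 8 and 11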
@@ -0,0 +1,26 @@
_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py'

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[256, 512, 1024]),
)
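This config swaps the Swin-T backbone for Swin-B wholesale (`_delete_=True`), so the neck's `in_channels` must list the channel widths of the selected backbone stages. For Swin, stage i outputs `embed_dims * 2**i` channels; a small sketch of that bookkeeping, matching the values in the config above:

# Channel bookkeeping behind the Swin-B override: stage i of a Swin backbone
# outputs embed_dims * 2**i channels, and neck.in_channels must match the
# stages selected by out_indices.
embed_dims = 128           # Swin-B
out_indices = (1, 2, 3)    # last three stages feed the neck

in_channels = [embed_dims * 2**i for i in out_indices]
assert in_channels == [256, 512, 1024]  # matches neck=dict(in_channels=[256, 512, 1024])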
@@ -0,0 +1,3 @@
_base_ = 'grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py'

model = dict(test_cfg=dict(chunked_size=1))
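This one-line variant only tightens `test_cfg.chunked_size` to 1. The semantics assumed here, not stated in this diff, are that `chunked_size` controls how many text prompts (descriptions/categories) are scored per forward pass at inference, so 1 trades throughput for lower peak memory and per-prompt scoring. A hedged sketch of achieving the same override at load time instead of via a separate config file:

# Hedged sketch: override chunked_size on the base config at load time.
from mmengine.config import Config

cfg = Config.fromfile('grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py')
cfg.model.test_cfg.chunked_size = 1  # assumed meaning: one description per forward pass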
@@ -0,0 +1,31 @@
_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py'

num_levels = 5
model = dict(
    num_feature_levels=num_levels,
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        # Please only add indices that will be used in the FPN,
        # otherwise some parameters will not be used
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels),
    encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))),
    decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels))))
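The Swin-L variant additionally raises the number of feature levels to 5: all four backbone stages are exported (`out_indices=(0, 1, 2, 3)`), the neck consumes their widths and emits `num_outs=5` maps (one extra downsampled level), and the encoder/decoder attention layers must be told the same `num_levels`. A small consistency sketch of those relationships, using only values from the config above:

# Consistency sketch for the 5-level Swin-L variant: backbone stage widths
# feed the neck, the neck adds one extra (downsampled) output to reach
# num_levels, and the deformable attention must use the same num_levels.
embed_dims = 192            # Swin-L
out_indices = (0, 1, 2, 3)  # export all four stages
num_levels = 5

in_channels = [embed_dims * 2**i for i in out_indices]
assert in_channels == [192, 384, 768, 1536]           # matches neck.in_channels
assert num_levels == len(in_channels) + 1             # one extra level from the neck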
@@ -0,0 +1,3 @@
_base_ = 'grounding_dino_swin-l_pretrain_zeroshot_concat_dod.py'

model = dict(test_cfg=dict(chunked_size=1))
@@ -0,0 +1,25 @@
_base_ = 'grounding_dino_swin-t-pretrain_flickr30k.py'

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[256, 512, 1024]),
)
@@ -0,0 +1,30 @@
_base_ = 'grounding_dino_swin-t-pretrain_flickr30k.py'

num_levels = 5
model = dict(
    num_feature_levels=num_levels,
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        # Please only add indices that will be used in the FPN,
        # otherwise some parameters will not be used
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels),
    encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))),
    decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels))))
@@ -0,0 +1,25 @@
_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_lvis.py'

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[256, 512, 1024]),
)
@@ -0,0 +1,25 @@
_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py'

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[256, 512, 1024]),
)
@@ -0,0 +1,30 @@
_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_lvis.py'

num_levels = 5
model = dict(
    num_feature_levels=num_levels,
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        # Please only add indices that will be used in the FPN,
        # otherwise some parameters will not be used
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels),
    encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))),
    decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels))))
@@ -0,0 +1,30 @@
_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py'

num_levels = 5
model = dict(
    num_feature_levels=num_levels,
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        # Please only add indices that will be used in the FPN,
        # otherwise some parameters will not be used
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels),
    encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))),
    decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels))))
@@ -0,0 +1,25 @@
_base_ = 'grounding_dino_swin-t_pretrain_odinw13.py'  # noqa

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[256, 512, 1024]),
)
@@ -0,0 +1,25 @@
_base_ = 'grounding_dino_swin-t_pretrain_odinw35.py'  # noqa

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[256, 512, 1024]),
)
