Commit 606b88b

update swinb & swinl results
1 parent 892e8ec commit 606b88b

17 files changed: +478 -0 lines changed
@@ -0,0 +1,85 @@
_base_ = '../grounding_dino_swin-l_pretrain_obj365.py'

data_root = 'data/coco/'

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', prob=0.5),
    dict(
        type='RandomChoice',
        transforms=[
            [
                dict(
                    type='RandomChoiceResize',
                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                            (736, 1333), (768, 1333), (800, 1333)],
                    keep_ratio=True)
            ],
            [
                dict(
                    type='RandomChoiceResize',
                    # The aspect ratio of all images in the train dataset is < 7,
                    # following the original implementation
                    scales=[(400, 4200), (500, 4200), (600, 4200)],
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=True),
                dict(
                    type='RandomChoiceResize',
                    scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
                            (608, 1333), (640, 1333), (672, 1333), (704, 1333),
                            (736, 1333), (768, 1333), (800, 1333)],
                    keep_ratio=True)
            ]
        ]),
    dict(
        type='PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'flip', 'flip_direction', 'text',
                   'custom_entities'))
]

train_dataloader = dict(
    dataset=dict(
        _delete_=True,
        type='CocoDataset',
        data_root=data_root,
        ann_file='annotations/instances_train2017.json',
        data_prefix=dict(img='train2017/'),
        return_classes=True,
        filter_cfg=dict(filter_empty_gt=False, min_size=32),
        pipeline=train_pipeline))

optim_wrapper = dict(
    _delete_=True,
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
    clip_grad=dict(max_norm=0.1, norm_type=2),
    paramwise_cfg=dict(
        custom_keys={
            'absolute_pos_embed': dict(decay_mult=0.),
            'backbone': dict(lr_mult=0.1),
            'language_model': dict(lr_mult=0.1),
        }))

# learning policy
max_epochs = 12
param_scheduler = [
    dict(
        type='MultiStepLR',
        begin=0,
        end=max_epochs,
        by_epoch=True,
        milestones=[8, 11],
        gamma=0.1)
]
train_cfg = dict(max_epochs=max_epochs, val_interval=1)

default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto'))

load_from = 'xxxx'  # noqa
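The config above fine-tunes the Objects365-pretrained Swin-L model on COCO for 12 epochs; `load_from` is left as a placeholder for the pretrained checkpoint. A minimal sketch, assuming MMEngine/MMDetection are installed, of how such a config could be loaded and launched; the config filename, checkpoint path, and work_dir below are hypothetical, not part of this commit:

# Minimal launch sketch (assumed paths, not from this diff).
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile('grounding_dino_swin-l_finetune_16xb4_1x_coco.py')  # hypothetical filename
cfg.load_from = '/path/to/obj365_pretrained.pth'  # replaces the 'xxxx' placeholder
cfg.work_dir = './work_dirs/swin-l_finetune_coco'

runner = Runner.from_cfg(cfg)  # builds model, dataloaders, optimizer from the config
runner.train()                 # 12-epoch schedule with LR drops at epochs 8 and 11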
@@ -0,0 +1,26 @@
_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py'

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[256, 512, 1024]),
)
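This config swaps the Swin-T backbone for Swin-B wholesale (`_delete_=True`), so the neck's `in_channels` must list the channel widths of the selected backbone stages. For Swin, stage i outputs `embed_dims * 2**i` channels; a small sketch of that bookkeeping, matching the values in the config above:

# Channel bookkeeping behind the Swin-B override: stage i of a Swin backbone
# outputs embed_dims * 2**i channels, and neck.in_channels must match the
# stages selected by out_indices.
embed_dims = 128           # Swin-B
out_indices = (1, 2, 3)    # last three stages feed the neck

in_channels = [embed_dims * 2**i for i in out_indices]
assert in_channels == [256, 512, 1024]  # matches neck=dict(in_channels=[256, 512, 1024])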
@@ -0,0 +1,3 @@
_base_ = 'grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py'

model = dict(test_cfg=dict(chunked_size=1))
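This one-line variant only tightens `test_cfg.chunked_size` to 1. The semantics assumed here, not stated in this diff, are that `chunked_size` controls how many text prompts (descriptions/categories) are scored per forward pass at inference, so 1 trades throughput for lower peak memory and per-prompt scoring. A hedged sketch of achieving the same override at load time instead of via a separate config file:

# Hedged sketch: override chunked_size on the base config at load time.
from mmengine.config import Config

cfg = Config.fromfile('grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py')
cfg.model.test_cfg.chunked_size = 1  # assumed meaning: one description per forward pass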
@@ -0,0 +1,31 @@
_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py'

num_levels = 5
model = dict(
    num_feature_levels=num_levels,
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        # Please only add indices that will be used in the FPN,
        # otherwise some parameters will not be used
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels),
    encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))),
    decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels))))
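The Swin-L variant additionally raises the number of feature levels to 5: all four backbone stages are exported (`out_indices=(0, 1, 2, 3)`), the neck consumes their widths and emits `num_outs=5` maps (one extra downsampled level), and the encoder/decoder attention layers must be told the same `num_levels`. A small consistency sketch of those relationships, using only values from the config above:

# Consistency sketch for the 5-level Swin-L variant: backbone stage widths
# feed the neck, the neck adds one extra (downsampled) output to reach
# num_levels, and the deformable attention must use the same num_levels.
embed_dims = 192            # Swin-L
out_indices = (0, 1, 2, 3)  # export all four stages
num_levels = 5

in_channels = [embed_dims * 2**i for i in out_indices]
assert in_channels == [192, 384, 768, 1536]           # matches neck.in_channels
assert num_levels == len(in_channels) + 1             # one extra level from the neck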
@@ -0,0 +1,3 @@
_base_ = 'grounding_dino_swin-l_pretrain_zeroshot_concat_dod.py'

model = dict(test_cfg=dict(chunked_size=1))
@@ -0,0 +1,25 @@
_base_ = 'grounding_dino_swin-t-pretrain_flickr30k.py'

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[256, 512, 1024]),
)
@@ -0,0 +1,30 @@
_base_ = 'grounding_dino_swin-t-pretrain_flickr30k.py'

num_levels = 5
model = dict(
    num_feature_levels=num_levels,
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        # Please only add indices that will be used in the FPN,
        # otherwise some parameters will not be used
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels),
    encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))),
    decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels))))
@@ -0,0 +1,25 @@
_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_lvis.py'

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[256, 512, 1024]),
)
@@ -0,0 +1,25 @@
_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py'

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[256, 512, 1024]),
)
@@ -0,0 +1,30 @@
_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_lvis.py'

num_levels = 5
model = dict(
    num_feature_levels=num_levels,
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        # Please only add indices that will be used in the FPN,
        # otherwise some parameters will not be used
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels),
    encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))),
    decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels))))
@@ -0,0 +1,30 @@
_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py'

num_levels = 5
model = dict(
    num_feature_levels=num_levels,
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.2,
        patch_norm=True,
        out_indices=(0, 1, 2, 3),
        # Please only add indices that will be used in the FPN,
        # otherwise some parameters will not be used
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels),
    encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))),
    decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels))))
@@ -0,0 +1,25 @@
_base_ = 'grounding_dino_swin-t_pretrain_odinw13.py'  # noqa

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[256, 512, 1024]),
)
@@ -0,0 +1,25 @@
_base_ = 'grounding_dino_swin-t_pretrain_odinw35.py'  # noqa

model = dict(
    backbone=dict(
        _delete_=True,
        type='SwinTransformer',
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=0.3,
        patch_norm=True,
        out_indices=(1, 2, 3),
        with_cp=True,
        convert_weights=True,
        frozen_stages=-1,
        init_cfg=None),
    neck=dict(in_channels=[256, 512, 1024]),
)
