recogni
diff --git a/configs/queryinst/queryinst_r50_fpn_1x_coco.py
+32-15 b/configs/queryinst/queryinst_r50_fpn_1x_coco.py
+32-15
diff --git a/configs/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
+30-37 b/configs/queryinst/queryinst_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
+30-37
diff --git a/configs/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco.py
+26-15 b/configs/queryinst/queryinst_r50_fpn_mstrain_480-800_3x_coco.py
+26-15
diff --git a/configs/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py
+15-10 b/configs/sparse_rcnn/sparse_rcnn_r50_fpn_1x_coco.py
+15-10
diff --git a/configs/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
+30-37 b/configs/sparse_rcnn/sparse_rcnn_r50_fpn_300_proposals_crop_mstrain_480-800_3x_coco.py
+30-37
diff --git a/configs/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py
+26-15 b/configs/sparse_rcnn/sparse_rcnn_r50_fpn_mstrain_480-800_3x_coco.py
+26-15
diff --git a/mmdet/models/dense_heads/base_dense_head.py
+6-3 b/mmdet/models/dense_heads/base_dense_head.py
+6-3
@@ -6,6 +6,13 @@
 num_proposals = 100
 model = dict(
     type='QueryInst',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_mask=True,
+        pad_size_divisor=32),
     backbone=dict(
         type='ResNet',
         depth=50,
@@ -111,10 +118,11 @@
             dict(
                 assigner=dict(
                     type='HungarianAssigner',
-                    cls_cost=dict(type='FocalLossCost', weight=2.0),
-                    reg_cost=dict(type='BBoxL1Cost', weight=5.0),
-                    iou_cost=dict(type='IoUCost', iou_mode='giou',
-                                  weight=2.0)),
+                    match_costs=[
+                        dict(type='FocalLossCost', weight=2.0),
+                        dict(type='BBoxL1Cost', weight=5.0, box_format='xyxy'),
+                        dict(type='IoUCost', iou_mode='giou', weight=2.0)
+                    ]),
                 sampler=dict(type='PseudoSampler'),
                 pos_weight=1,
                 mask_size=28,
@@ -124,15 +132,24 @@
         rpn=None, rcnn=dict(max_per_img=num_proposals, mask_thr_binary=0.5)))
 
 # optimizer
-optimizer = dict(
-    _delete_=True,
-    type='AdamW',
-    lr=0.0001,
-    weight_decay=0.0001,
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(
+        _delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001),
     paramwise_cfg=dict(
-        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}))
-optimizer_config = dict(
-    _delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
-# learning policy
-lr_config = dict(policy='step', step=[8, 11], warmup_iters=1000)
-runner = dict(type='EpochBasedRunner', max_epochs=12)
+        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
+    clip_grad=dict(max_norm=0.1, norm_type=2))
+
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
@@ -6,49 +6,42 @@
         _delete_=True,
         rpn=None,
         rcnn=dict(max_per_img=num_proposals, mask_thr_binary=0.5)))
-img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
 
 # augmentation strategy originates from DETR.
 train_pipeline = [
-    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadImageFromFile',
+        file_client_args={{_base_.file_client_args}}),
     dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
-    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='RandomFlip', prob=0.5),
     dict(
-        type='AutoAugment',
-        policies=[[
+        type='RandomChoice',
+        transforms=[[
             dict(
-                type='Resize',
-                img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
-                           (608, 1333), (640, 1333), (672, 1333), (704, 1333),
-                           (736, 1333), (768, 1333), (800, 1333)],
-                multiscale_mode='value',
+                type='RandomChoiceResize',
+                scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                        (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                        (736, 1333), (768, 1333), (800, 1333)],
                 keep_ratio=True)
         ],
-                  [
-                      dict(
-                          type='Resize',
-                          img_scale=[(400, 1333), (500, 1333), (600, 1333)],
-                          multiscale_mode='value',
-                          keep_ratio=True),
-                      dict(
-                          type='RandomCrop',
-                          crop_type='absolute_range',
-                          crop_size=(384, 600),
-                          allow_negative_crop=True),
-                      dict(
-                          type='Resize',
-                          img_scale=[(480, 1333), (512, 1333), (544, 1333),
-                                     (576, 1333), (608, 1333), (640, 1333),
-                                     (672, 1333), (704, 1333), (736, 1333),
-                                     (768, 1333), (800, 1333)],
-                          multiscale_mode='value',
-                          override=True,
-                          keep_ratio=True)
-                  ]]),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='Pad', size_divisor=32),
-    dict(type='DefaultFormatBundle'),
-    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+                    [
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(400, 1333), (500, 1333), (600, 1333)],
+                            keep_ratio=True),
+                        dict(
+                            type='RandomCrop',
+                            crop_type='absolute_range',
+                            crop_size=(384, 600),
+                            allow_negative_crop=True),
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(480, 1333), (512, 1333), (544, 1333),
+                                    (576, 1333), (608, 1333), (640, 1333),
+                                    (672, 1333), (704, 1333), (736, 1333),
+                                    (768, 1333), (800, 1333)],
+                            keep_ratio=True)
+                    ]]),
+    dict(type='PackDetInputs')
 ]
-data = dict(train=dict(pipeline=train_pipeline))
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
@@ -1,23 +1,34 @@
 _base_ = './queryinst_r50_fpn_1x_coco.py'
 
-img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
-min_values = (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
 train_pipeline = [
-    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadImageFromFile',
+        file_client_args={{_base_.file_client_args}}),
     dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
     dict(
-        type='Resize',
-        img_scale=[(1333, value) for value in min_values],
-        multiscale_mode='value',
+        type='RandomChoiceResize',
+        scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                (736, 1333), (768, 1333), (800, 1333)],
         keep_ratio=True),
-    dict(type='RandomFlip', flip_ratio=0.5),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='Pad', size_divisor=32),
-    dict(type='DefaultFormatBundle'),
-    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
 ]
 
-data = dict(train=dict(pipeline=train_pipeline))
-lr_config = dict(policy='step', step=[27, 33])
-runner = dict(type='EpochBasedRunner', max_epochs=36)
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# learning policy
+max_epochs = 36
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=max_epochs)
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[27, 33],
+        gamma=0.1)
+]
@@ -6,6 +6,12 @@
 num_proposals = 100
 model = dict(
     type='SparseRCNN',
+    data_preprocessor=dict(
+        type='DetDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=32),
     backbone=dict(
         type='ResNet',
         depth=50,
@@ -78,19 +84,18 @@
             dict(
                 assigner=dict(
                     type='HungarianAssigner',
-                    # TODO update
-                    cls_cost=dict(type='FocalLossCost', weight=2.0),
-                    reg_cost=dict(type='BBoxL1Cost', weight=5.0),
-                    iou_cost=dict(type='IoUCost', iou_mode='giou',
-                                  weight=2.0)),
+                    match_costs=[
+                        dict(type='FocalLossCost', weight=2.0),
+                        dict(type='BBoxL1Cost', weight=5.0, box_format='xyxy'),
+                        dict(type='IoUCost', iou_mode='giou', weight=2.0)
+                    ]),
                 sampler=dict(type='PseudoSampler'),
                 pos_weight=1) for _ in range(num_stages)
         ]),
     test_cfg=dict(rpn=None, rcnn=dict(max_per_img=num_proposals)))
 
 # optimizer
-optimizer = dict(_delete_=True, type='AdamW', lr=0.000025, weight_decay=0.0001)
-optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=1, norm_type=2))
-# learning policy
-lr_config = dict(policy='step', step=[8, 11])
-runner = dict(type='EpochBasedRunner', max_epochs=12)
+optim_wrapper = dict(
+    optimizer=dict(
+        _delete_=True, type='AdamW', lr=0.000025, weight_decay=0.0001),
+    clip_grad=dict(max_norm=1, norm_type=2))
@@ -4,49 +4,42 @@
     rpn_head=dict(num_proposals=num_proposals),
     test_cfg=dict(
         _delete_=True, rpn=None, rcnn=dict(max_per_img=num_proposals)))
-img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
 
 # augmentation strategy originates from DETR.
 train_pipeline = [
-    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadImageFromFile',
+        file_client_args={{_base_.file_client_args}}),
     dict(type='LoadAnnotations', with_bbox=True),
-    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='RandomFlip', prob=0.5),
     dict(
-        type='AutoAugment',
-        policies=[[
+        type='RandomChoice',
+        transforms=[[
             dict(
-                type='Resize',
-                img_scale=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
-                           (608, 1333), (640, 1333), (672, 1333), (704, 1333),
-                           (736, 1333), (768, 1333), (800, 1333)],
-                multiscale_mode='value',
+                type='RandomChoiceResize',
+                scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                        (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                        (736, 1333), (768, 1333), (800, 1333)],
                 keep_ratio=True)
         ],
-                  [
-                      dict(
-                          type='Resize',
-                          img_scale=[(400, 1333), (500, 1333), (600, 1333)],
-                          multiscale_mode='value',
-                          keep_ratio=True),
-                      dict(
-                          type='RandomCrop',
-                          crop_type='absolute_range',
-                          crop_size=(384, 600),
-                          allow_negative_crop=True),
-                      dict(
-                          type='Resize',
-                          img_scale=[(480, 1333), (512, 1333), (544, 1333),
-                                     (576, 1333), (608, 1333), (640, 1333),
-                                     (672, 1333), (704, 1333), (736, 1333),
-                                     (768, 1333), (800, 1333)],
-                          multiscale_mode='value',
-                          override=True,
-                          keep_ratio=True)
-                  ]]),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='Pad', size_divisor=32),
-    dict(type='DefaultFormatBundle'),
-    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+                    [
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(400, 1333), (500, 1333), (600, 1333)],
+                            keep_ratio=True),
+                        dict(
+                            type='RandomCrop',
+                            crop_type='absolute_range',
+                            crop_size=(384, 600),
+                            allow_negative_crop=True),
+                        dict(
+                            type='RandomChoiceResize',
+                            scales=[(480, 1333), (512, 1333), (544, 1333),
+                                    (576, 1333), (608, 1333), (640, 1333),
+                                    (672, 1333), (704, 1333), (736, 1333),
+                                    (768, 1333), (800, 1333)],
+                            keep_ratio=True)
+                    ]]),
+    dict(type='PackDetInputs')
 ]
-data = dict(train=dict(pipeline=train_pipeline))
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
@@ -1,23 +1,34 @@
 _base_ = './sparse_rcnn_r50_fpn_1x_coco.py'
 
-img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
-min_values = (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
 train_pipeline = [
-    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadImageFromFile',
+        file_client_args={{_base_.file_client_args}}),
     dict(type='LoadAnnotations', with_bbox=True),
     dict(
-        type='Resize',
-        img_scale=[(1333, value) for value in min_values],
-        multiscale_mode='value',
+        type='RandomChoiceResize',
+        scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333),
+                (608, 1333), (640, 1333), (672, 1333), (704, 1333),
+                (736, 1333), (768, 1333), (800, 1333)],
         keep_ratio=True),
-    dict(type='RandomFlip', flip_ratio=0.5),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='Pad', size_divisor=32),
-    dict(type='DefaultFormatBundle'),
-    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackDetInputs')
 ]
 
-data = dict(train=dict(pipeline=train_pipeline))
-lr_config = dict(policy='step', step=[27, 33])
-runner = dict(type='EpochBasedRunner', max_epochs=36)
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+
+# learning policy
+max_epochs = 36
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=max_epochs)
+
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[27, 33],
+        gamma=0.1)
+]
@@ -265,10 +265,13 @@ def predict_by_feat(self,
 
         for img_id in range(len(batch_img_metas)):
             img_meta = batch_img_metas[img_id]
-            cls_score_list = select_single_mlvl(cls_scores, img_id)
-            bbox_pred_list = select_single_mlvl(bbox_preds, img_id)
+            cls_score_list = select_single_mlvl(
+                cls_scores, img_id, detach=True)
+            bbox_pred_list = select_single_mlvl(
+                bbox_preds, img_id, detach=True)
             if with_score_factors:
-                score_factor_list = select_single_mlvl(score_factors, img_id)
+                score_factor_list = select_single_mlvl(
+                    score_factors, img_id, detach=True)
             else:
                 score_factor_list = [None for _ in range(num_levels)]