
Commit ceabef4

Added AnnotatedObjectsCOCO 🌆

1 parent 9d17ea6

14 files changed: +953 -5 lines

Diff for: configs/coco_cond_stage.yaml (+2 -2)

@@ -30,7 +30,7 @@ model:
       codebook_weight: 1.0
 
 data:
-  target: cutlit.DataModuleFromConfig
+  target: main.DataModuleFromConfig
   params:
     batch_size: 12
     train:
@@ -41,7 +41,7 @@ data:
        onehot_segmentation: true
        use_stuffthing: true
    validation:
-      target: taming.data.coco.CocoImagesAndCaptionsTrain
+      target: taming.data.coco.CocoImagesAndCaptionsValidation
      params:
        size: 256
        crop_size: 256
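
Note: the target strings in these configs are resolved at runtime by the instantiate_from_config helper in main.py, which is why the stale cutlit.DataModuleFromConfig reference (a leftover module name) fails to import, and why the validation split must point at CocoImagesAndCaptionsValidation rather than the train dataset class. A minimal sketch of that resolution pattern (not part of this diff; reconstructed from the codebase's usual convention):

import importlib

def get_obj_from_str(string: str):
    # "taming.data.coco.CocoImagesAndCaptionsValidation" -> class object
    module, cls = string.rsplit(".", 1)
    return getattr(importlib.import_module(module), cls)

def instantiate_from_config(config: dict):
    # Every config node with a `target` key becomes an object;
    # its `params` are passed through as keyword arguments.
    if "target" not in config:
        raise KeyError("Expected key `target` to instantiate.")
    return get_obj_from_str(config["target"])(**config.get("params", dict()))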

Diff for: configs/coco_scene_images_transformer.yaml (+77, new file)

@@ -0,0 +1,77 @@
+model:
+  base_learning_rate: 4.5e-06
+  target: taming.models.cond_transformer.Net2NetTransformer
+  params:
+    cond_stage_key: objects_bbox
+    transformer_config:
+      target: taming.modules.transformer.mingpt.GPT
+      params:
+        vocab_size: 8192
+        block_size: 348 # = 256 + 92 = dim(vqgan_latent_space,16x16) + dim(conditional_builder.embedding_dim)
+        n_layer: 32
+        n_head: 16
+        n_embd: 912
+    first_stage_config:
+      target: taming.models.vqgan.VQModel
+      params:
+        ckpt_path: /path/to/coco_epoch117.ckpt # https://heibox.uni-heidelberg.de/f/78dea9589974474c97c1/
+        embed_dim: 256
+        n_embed: 8192
+        ddconfig:
+          double_z: false
+          z_channels: 256
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 1
+          - 2
+          - 2
+          - 4
+          num_res_blocks: 2
+          attn_resolutions:
+          - 16
+          dropout: 0.0
+        lossconfig:
+          target: taming.modules.losses.DummyLoss
+    cond_stage_config:
+      target: taming.models.dummy_cond_stage.DummyCondStage
+      params:
+        conditional_key: objects_bbox
+
+data:
+  target: main.DataModuleFromConfig
+  params:
+    batch_size: 24
+    train:
+      target: taming.data.annotated_objects_coco.AnnotatedObjectsCoco
+      params:
+        data_path: data/coco
+        split: train
+        keys: [image, objects_bbox, file_name]
+        no_tokens: 8192
+        target_image_size: 256
+        min_object_area: 0.00001
+        min_objects_per_image: 2
+        max_objects_per_image: 30
+        crop_method: random-1d
+        random_flip: true
+        use_group_parameter: true
+        encode_crop: true
+    validation:
+      target: taming.data.annotated_objects_coco.AnnotatedObjectsCoco
+      params:
+        data_path: data/coco
+        split: validation
+        keys: [image, objects_bbox, file_name]
+        no_tokens: 8192
+        target_image_size: 256
+        min_object_area: 0.00001
+        min_objects_per_image: 2
+        max_objects_per_image: 30
+        crop_method: random-1d
+        random_flip: true
+        use_group_parameter: true
+        encode_crop: true
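
The block_size of 348 budgets 256 tokens for the 16x16 VQGAN latent grid plus 92 conditioning tokens. Given max_objects_per_image: 30, the 92 plausibly decomposes as 30 objects x 3 tokens each (category plus two corner coordinates) plus 2 crop tokens from encode_crop, though that breakdown is an inference from this config, not stated in the diff. A sketch of how such a config is typically consumed (usage assumed from the repo's conventions, not part of this commit; requires OmegaConf and an instantiate_from_config helper like the one sketched above):

from omegaconf import OmegaConf
from main import instantiate_from_config  # assumed helper, see sketch above

# Build the Net2NetTransformer and the data module straight from the YAML.
config = OmegaConf.load("configs/coco_scene_images_transformer.yaml")
model = instantiate_from_config(config.model)
data = instantiate_from_config(config.data)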

Diff for: environment.yaml (+1)

@@ -20,5 +20,6 @@ dependencies:
     - test-tube>=0.7.5
     - streamlit>=0.73.1
     - einops==0.3.0
+    - more-itertools>=8.0.0
     - transformers==4.3.1
     - -e .

Diff for: main.py (+1 -1)

@@ -278,7 +278,7 @@ def log_img(self, pl_module, batch, batch_idx, split="train"):
         pl_module.eval()
 
         with torch.no_grad():
-            images = pl_module.log_images(batch, split=split)
+            images = pl_module.log_images(batch, split=split, pl_module=pl_module)
 
         for k in images:
             N = min(images[k].shape[0], self.max_images)
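
The image logger now threads the LightningModule itself into log_images, presumably so that object-conditioned models can render their bounding-box conditioning with access to the model. A minimal sketch of why this call-site change stays backward compatible (the models' actual signatures are an assumption, not shown in this diff): any log_images implementation that accepts **kwargs simply ignores the new argument.

import torch

class SomeModel:  # hypothetical model, for illustration only
    @torch.no_grad()
    def log_images(self, batch, split="train", **kwargs):
        # Models that need the module (e.g. to draw conditioning info)
        # can read kwargs["pl_module"]; all others silently ignore it.
        pl_module = kwargs.get("pl_module")
        return {"inputs": batch["image"]}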

Diff for: taming/data/annotated_objects_coco.py (+141, new file)

@@ -0,0 +1,141 @@
+import json
+from itertools import chain
+from pathlib import Path
+from typing import Iterable, Dict, List, Callable, Any
+from collections import defaultdict
+
+from tqdm import tqdm
+
+from taming.data.annotated_objects_dataset import AnnotatedObjectsDataset
+from taming.data.helper_types import Annotation, ImageDescription, Category
+
+COCO_PATH_STRUCTURE = {
+    'train': {
+        'top_level': '',
+        'person_annotations': 'annotations/person_keypoints_train2017.json',
+        'instances_annotations': 'annotations/instances_train2017.json',
+        'stuff_annotations': 'annotations/stuff_train2017.json',
+        'files': 'train2017'
+    },
+    'validation': {
+        'top_level': '',
+        'person_annotations': 'annotations/person_keypoints_val2017.json',
+        'instances_annotations': 'annotations/instances_val2017.json',
+        'stuff_annotations': 'annotations/stuff_val2017.json',
+        'files': 'val2017'
+    }
+}
+
+
+def load_image_descriptions(description_json: List[Dict]) -> Dict[str, ImageDescription]:
+    return {
+        str(img['id']): ImageDescription(
+            id=img['id'],
+            license=img.get('license'),
+            file_name=img['file_name'],
+            coco_url=img['coco_url'],
+            original_size=(img['width'], img['height']),
+            date_captured=img.get('date_captured'),
+            flickr_url=img.get('flickr_url')
+        )
+        for img in description_json
+    }
+
+
+def load_categories(category_json: Iterable) -> Dict[str, Category]:
+    return {str(cat['id']): Category(id=str(cat['id']), super_category=cat['supercategory'], name=cat['name'])
+            for cat in category_json if cat['name'] != 'other'}
+
+
+def load_annotations(annotations_json: List[Dict], image_descriptions: Dict[str, ImageDescription],
+                     category_no_for_id: Callable[[str], int], split: str) -> Dict[str, List[Annotation]]:
+    annotations = defaultdict(list)
+    total = sum(len(a) for a in annotations_json)
+    for ann in tqdm(chain(*annotations_json), f'Loading {split} annotations', total=total):
+        image_id = str(ann['image_id'])
+        if image_id not in image_descriptions:
+            raise ValueError(f'image_id [{image_id}] has no image description.')
+        category_id = ann['category_id']
+        try:
+            category_no = category_no_for_id(str(category_id))
+        except KeyError:
+            continue
+
+        width, height = image_descriptions[image_id].original_size
+        bbox = (ann['bbox'][0] / width, ann['bbox'][1] / height, ann['bbox'][2] / width, ann['bbox'][3] / height)
+
+        annotations[image_id].append(
+            Annotation(
+                id=ann['id'],
+                area=bbox[2]*bbox[3],  # use bbox area
+                is_group_of=ann['iscrowd'],
+                image_id=ann['image_id'],
+                bbox=bbox,
+                category_id=str(category_id),
+                category_no=category_no
+            )
+        )
+    return dict(annotations)
+
+
+class AnnotatedObjectsCoco(AnnotatedObjectsDataset):
+    def __init__(self, use_things: bool = True, use_stuff: bool = True, **kwargs):
+        """
+        @param data_path: is the path to the following folder structure:
+                          coco/
+                          ├── annotations
+                          │   ├── instances_train2017.json
+                          │   ├── instances_val2017.json
+                          │   ├── stuff_train2017.json
+                          │   └── stuff_val2017.json
+                          ├── train2017
+                          │   ├── 000000000009.jpg
+                          │   ├── 000000000025.jpg
+                          │   └── ...
+                          ├── val2017
+                          │   ├── 000000000139.jpg
+                          │   ├── 000000000285.jpg
+                          │   └── ...
+        @param split: one of 'train' or 'validation'
+        @param target_image_size: desired image size (yields square images)
+        """
+        super().__init__(**kwargs)
+        self.use_things = use_things
+        self.use_stuff = use_stuff
+
+        with open(self.paths['instances_annotations']) as f:
+            inst_data_json = json.load(f)
+        with open(self.paths['stuff_annotations']) as f:
+            stuff_data_json = json.load(f)
+
+        category_jsons = []
+        annotation_jsons = []
+        if self.use_things:
+            category_jsons.append(inst_data_json['categories'])
+            annotation_jsons.append(inst_data_json['annotations'])
+        if self.use_stuff:
+            category_jsons.append(stuff_data_json['categories'])
+            annotation_jsons.append(stuff_data_json['annotations'])
+
+        self.categories = load_categories(chain(*category_jsons))
+        self.filter_categories()
+        self.setup_category_id_and_number()
+
+        self.image_descriptions = load_image_descriptions(inst_data_json['images'])
+        annotations = load_annotations(annotation_jsons, self.image_descriptions, self.get_category_number, self.split)
+        self.annotations = self.filter_object_number(annotations, self.min_object_area,
+                                                     self.min_objects_per_image, self.max_objects_per_image)
+        self.image_ids = list(self.annotations.keys())
+        self.clean_up_annotations_and_image_descriptions()
+
+    def get_path_structure(self) -> Dict[str, str]:
+        if self.split not in COCO_PATH_STRUCTURE:
+            raise ValueError(f'Split [{self.split}] does not exist for COCO data.')
+        return COCO_PATH_STRUCTURE[self.split]
+
+    def get_image_path(self, image_id: str) -> Path:
+        return self.paths['files'].joinpath(self.image_descriptions[str(image_id)].file_name)
+
+    def get_image_description(self, image_id: str) -> Dict[str, Any]:
+        # noinspection PyProtectedMember
+        return self.image_descriptions[image_id]._asdict()
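
A hypothetical usage sketch of the new dataset class (the keyword arguments are forwarded to AnnotatedObjectsDataset, whose full signature is not part of this commit; the values below mirror the validation section of the transformer config above):

from taming.data.annotated_objects_coco import AnnotatedObjectsCoco

dataset = AnnotatedObjectsCoco(
    data_path="data/coco",  # folder structure as in the class docstring
    split="validation",
    keys=["image", "objects_bbox", "file_name"],
    no_tokens=8192,
    target_image_size=256,
    min_object_area=0.00001,
    min_objects_per_image=2,
    max_objects_per_image=30,
    crop_method="random-1d",
    random_flip=True,
    use_group_parameter=True,
    encode_crop=True,
)
example = dataset[0]  # dict containing the requested keys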
