
Commit 81e402e

Create split tool (#2)
* Change to use NotImplementedError
* Pass dataset argument to split
* Create ratio extraction and error handling
* Create pipeline to create the new data
* Install pandas
* Split images successfully
* Create exporting functionality
* Finish split tool
* Clean up the path management
* Clean up based on PR review
* Fix map object not a list issue
* Make image distribution consistent
1 parent fceb762 commit 81e402e

7 files changed, +165 -8 lines


.vscode/settings.json (+3)

@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "venv/bin/python"
+}

coco_tools/__init__.py (+1)

@@ -1,2 +1,3 @@
 from coco_tools.merge import merge
 from coco_tools.split import split
+from coco_tools.error import COCOToolsError

coco_tools/error.py (+2)

@@ -0,0 +1,2 @@
+class COCOToolsError(BaseException):
+    pass

coco_tools/merge.py (+1 -1)

@@ -1,2 +1,2 @@
 def merge():
-    print("Unimplemented!")
+    raise NotImplementedError()

coco_tools/split.py (+129 -2)

@@ -1,2 +1,129 @@
-def split():
-    print("Unimplemented!")
+import json
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from coco_tools.error import COCOToolsError
+
+
+def split(dataset_path, ratio, names):
+    """Splits the dataset into multiple parts based on the given ratio.
+
+    Within the dataset, one image can have multiple annotations. `split` splits
+    the dataset by the number of images, and splits the annotations based on the
+    images that they belong to.
+
+    For example, in a dataset of 1000 images, a ratio of `70:20:10` would split
+    the dataset into three datasets containing `700`, `200` and `100` images
+    respectively.
+    """
+
+    # Extract and validate the inputs.
+    dataset_path = Path(dataset_path)
+    ratio = __extract_ratio(ratio)
+    names = __extract_names(names)
+
+    # Some additional input validation.
+    if len(ratio) != len(names):
+        raise COCOToolsError("ratio and names should be of same length")
+
+    # Load the dataset from `dataset_path`.
+    raw_data = None
+    try:
+        with open(str(dataset_path), "r") as dataset_file:
+            raw_data = json.load(dataset_file)
+    except FileNotFoundError:
+        raise COCOToolsError(f"file \"{dataset_path}\" not found")
+
+    # Extract `images` and `annotations`.
+    images = raw_data.pop("images")
+    annotations = raw_data.pop("annotations")
+
+    # Initialize the new datas.
+    new_datas = [raw_data.copy() for _ in ratio]
+
+    # Split the data.
+    __split_data(new_datas, ratio, images, annotations)
+
+    # Output the results to the corresponding files.
+    for (i, new_data) in enumerate(new_datas):
+        with open(__derive_path(dataset_path, names[i]), "w") as output_file:
+            json.dump(new_data, output_file)
+
+
+def __split_data(datas, ratio, images, annotations):
+    """Sets `images` and `annotations` on the `datas` based on `ratio`.
+
+    Take note that this method mutates `datas`. It is done this way because
+    `datas` should contain the additional data as part of a COCO dataset.
+
+    `pandas` is used here to perform the splitting/partitioning.
+    """
+
+    # Create data frames.
+    images = pd.DataFrame(images)
+    annotations = pd.DataFrame(annotations)
+
+    # Create the base mask.
+    base_mask = np.arange(0, 1, 1 / len(images))
+    np.random.shuffle(base_mask)
+
+    # Track the current sum of ratios. This is used when finding the range to
+    # compare to.
+    ratio_sum = 0
+
+    # Iterate through each ratio and split the data.
+    for (i, ration) in enumerate(ratio):
+        data = datas[i]
+
+        # Create the mask.
+        mask = (base_mask >= ratio_sum) & (base_mask < ratio_sum + ration)
+        ratio_sum += ration
+
+        # Set the images on the data.
+        data["images"] = images[mask].to_dict("records")
+
+        # Set the annotations on the data.
+        common = images[mask].merge(
+            annotations, left_on="id", right_on="image_id", how="inner")
+        data["annotations"] = annotations[annotations.image_id.isin(
+            common.image_id)].to_dict("records")
+
+
+def __derive_path(dataset_path, name):
+    """Derives the output path given `dataset_path` and `name`.
+    """
+
+    output_filename = Path(f"{str(dataset_path.stem)}_{name}.json")
+    output_path = dataset_path.parent / output_filename
+    return output_path
+
+
+def __extract_ratio(ratio):
+    """Splits, verifies and normalizes the ratio.
+
+    For example, a ratio of `70:20:30` will become `[0.58, 0.17, 0.25]`. The
+    total does not need to add up to `100`.
+    """
+
+    # Split and strip.
+    ratio = [ration.strip() for ration in ratio.split(":")]
+
+    # Verify length of ratio.
+    if len(ratio) != 3:
+        raise COCOToolsError("ratio should have length 3")
+
+    # Parse, and hence, verify.
+    try:
+        ratio = list(map(float, ratio))
+    except ValueError:
+        raise COCOToolsError("ratio should be floats")
+
+    # Normalize based on sum.
+    return list(map(lambda ration: ration / sum(ratio), ratio))
+
+
+def __extract_names(names):
+    """Splits the names.
+    """
+
+    return [name.strip() for name in names.split(":")]
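
For reference, a minimal sketch of driving the new tool directly from Python (rather than through the CLI) is shown below. The input file name `data.json` is only an example; `split` derives the output paths from the input path via `__derive_path`, so this call would write `data_train.json`, `data_validation.json` and `data_test.json` next to the input file.

from coco_tools import split

# Hypothetical input: a COCO-style JSON file containing "images" and
# "annotations" keys, as expected by split().
split("data.json", "70:20:10", "train:validation:test")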

main.py (+16 -5)

@@ -1,21 +1,32 @@
 from argparse import ArgumentParser
-import coco_tools
+from coco_tools import COCOToolsError, split


 def main():
     parser = ArgumentParser(description="Useful operations for COCO datasets")
-    subparsers = parser.add_subparsers(help="Possible operations")
+    subparsers = parser.add_subparsers(
+        help="Possible operations", dest="command")

     split_parser = subparsers.add_parser("split", help="Splits a dataset")
-    split_parser.add_argument("dataset", help="The dataset to split")
+    split_parser.add_argument(
+        "-i", "--dataset", help="The dataset to split", default="data.json")
+    split_parser.add_argument(
+        "-r", "--ratio", help="The ratio to split by (e.g. 70:20:10)", default="70:20:10")
+    split_parser.add_argument(
+        "-n", "--names", help="The names for each split (e.g. train:validation:test)", default="train:validation:test")

     merge_parser = subparsers.add_parser("merge", help="Merges datasets")
-    merge_parser.add_argument("datasets", nargs="+",
+    merge_parser.add_argument("--input", nargs="+",
                               help="The datasets to merge")

     args = parser.parse_args()

-    print(args)
+    try:
+        if args.command == "split":
+            split(args.dataset, args.ratio, args.names)
+    except COCOToolsError as e:
+        print(f'error: {e}')
+        exit(1)


 if __name__ == "__main__":
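
With the defaults above, running `python main.py split` with no flags would split a `data.json` in the working directory using a 70:20:10 ratio. An explicit example invocation (the file name is only illustrative, and assumes the dependencies from requirements.txt are installed) would be `python main.py split -i data.json -r 70:20:10 -n train:validation:test`.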

requirements.txt (+13)

@@ -0,0 +1,13 @@
+astroid==2.0.4
+autopep8==1.4.1
+isort==4.3.4
+lazy-object-proxy==1.3.1
+mccabe==0.6.1
+numpy==1.15.3
+pandas==0.23.4
+pycodestyle==2.4.0
+pylint==2.1.1
+python-dateutil==2.7.3
+pytz==2018.5
+six==1.11.0
+wrapt==1.10.11
