From d55428949de74940a70690cfcb70ebea283710e2 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 11 Aug 2023 17:41:08 +0000
Subject: [PATCH 1/7] add center_method focus

---
 nerfstudio/cameras/camera_utils.py            | 71 ++++++++++++++++---
 .../data/dataparsers/sdfstudio_dataparser.py  |  6 ++--
 2 files changed, 63 insertions(+), 14 deletions(-)

diff --git a/nerfstudio/cameras/camera_utils.py b/nerfstudio/cameras/camera_utils.py
index 2adaffd6..ae380b90 100644
--- a/nerfstudio/cameras/camera_utils.py
+++ b/nerfstudio/cameras/camera_utils.py
@@ -17,12 +17,15 @@
 """

 import math
-from typing import List, Optional, Tuple
+from typing import List, Literal, Optional, Tuple

 import numpy as np
 import torch
+from jaxtyping import Float
+from numpy.typing import NDArray
 from torchtyping import TensorType
 from typing_extensions import Literal
+from torch import Tensor

 _EPS = np.finfo(float).eps * 4.0

@@ -406,10 +409,46 @@ def rotation_matrix(a: TensorType[3], b: TensorType[3]) -> TensorType[3, 3]:
     )
     return torch.eye(3) + skew_sym_mat + skew_sym_mat @ skew_sym_mat * ((1 - c) / (s**2 + 1e-8))

+def focus_of_attention(poses: Float[Tensor, "*num_poses 4 4"], initial_focus: Float[Tensor, "3"]) -> Float[Tensor, "3"]:
+    """Compute the focus of attention of a set of cameras. Only cameras
+    that have the focus of attention in front of them are considered.
+
+    Args:
+        poses: The poses to orient.
+        initial_focus: The initial 3D point used to decide which cameras start out active.
+
+    Returns:
+        The 3D position of the focus of attention.
+    """
+    # References to the same method in third-party code:
+    # https://github.com/google-research/multinerf/blob/1c8b1c552133cdb2de1c1f3c871b2813f6662265/internal/camera_utils.py#L145
+    # https://github.com/bmild/nerf/blob/18b8aebda6700ed659cb27a0c348b737a5f6ab60/load_llff.py#L197
+    active_directions = -poses[:, :3, 2:3]
+    active_origins = poses[:, :3, 3:4]
+    # initial value for testing if the focus_pt is in front or behind
+    focus_pt = initial_focus
+    # Prune cameras which currently have the focus_pt behind them.
+    active = torch.sum(active_directions.squeeze(-1) * (focus_pt - active_origins.squeeze(-1)), dim=-1) > 0
+    done = False
+    # We need at least two active cameras, else we fall back on the previous solution.
+    # This may be the "poses" solution if no cameras are active on the first iteration, e.g.
+    # when they are in an outward-looking configuration.
+    while torch.sum(active.int()) > 1 and not done:
+        active_directions = active_directions[active]
+        active_origins = active_origins[active]
+        # https://en.wikipedia.org/wiki/Line–line_intersection#In_more_than_two_dimensions
+        m = torch.eye(3) - active_directions * torch.transpose(active_directions, -2, -1)
+        mt_m = torch.transpose(m, -2, -1) @ m
+        focus_pt = torch.linalg.inv(mt_m.mean(0)) @ (mt_m @ active_origins).mean(0)[:, 0]
+        active = torch.sum(active_directions.squeeze(-1) * (focus_pt - active_origins.squeeze(-1)), dim=-1) > 0
+        if active.all():
+            # the set of active cameras did not change, so we are done
+            done = True
+    return focus_pt
+

 def auto_orient_and_center_poses(
-    poses: TensorType["num_poses":..., 4, 4], method: Literal["pca", "up", "none"] = "up", center_poses: bool = True
-) -> TensorType["num_poses":..., 3, 4]:
+    poses: TensorType["num_poses":..., 4, 4], method: Literal["pca", "up", "none"] = "up", center_method: Literal["poses", "focus", "none"] = "poses",
+) -> Tuple[Float[Tensor, "*num_poses 3 4"], Float[Tensor, "3 4"]]:
     """Orients and centers the poses. We provide two methods for orientation: pca and up.

     pca: Orient the poses so that the principal component of the points is aligned with the axes.
@@ -417,25 +456,35 @@ def rotation_matrix(a: TensorType[3], b: TensorType[3]) -> TensorType[3, 3]:
     up: Orient the poses so that the average up vector is aligned with the z axis.
         This method works well when images are not at arbitrary angles.

+    There are two centering methods:
+    poses: The poses are centered around the origin.
+    focus: The origin is set to the focus of attention of all cameras (the
+        closest point to the cameras' optical axes). Recommended for inward-looking
+        camera configurations.
+
     Args:
         poses: The poses to orient.
         method: The method to use for orientation.
-        center_poses: If True, the poses are centered around the origin.
+        center_method: The method to use to center the poses.

     Returns:
-        The oriented poses.
+        Tuple of the oriented poses and the transform matrix.
     """
-    translation = poses[..., :3, 3]
+    origin = poses[..., :3, 3]

-    mean_translation = torch.mean(translation, dim=0)
-    translation_diff = translation - mean_translation
+    mean_origin = torch.mean(origin, dim=0)
+    translation_diff = origin - mean_origin

-    if center_poses:
-        translation = mean_translation
+    if center_method == "poses":
+        translation = mean_origin
+    elif center_method == "focus":
+        translation = focus_of_attention(poses, mean_origin)
+    elif center_method == "none":
+        translation = torch.zeros_like(mean_origin)
     else:
-        translation = torch.zeros_like(mean_translation)
+        raise ValueError(f"Unknown value for center_method: {center_method}")

     if method == "pca":
         _, eigvec = torch.linalg.eigh(translation_diff.T @ translation_diff)
diff --git a/nerfstudio/data/dataparsers/sdfstudio_dataparser.py b/nerfstudio/data/dataparsers/sdfstudio_dataparser.py
index 393867bb..717f37f5 100644
--- a/nerfstudio/data/dataparsers/sdfstudio_dataparser.py
+++ b/nerfstudio/data/dataparsers/sdfstudio_dataparser.py
@@ -157,7 +157,7 @@ class SDFStudioDataParserConfig(DataParserConfig):
     # """How much to downscale images. If not set, images are chosen such that the max dimension is <1600px."""
     orientation_method: Literal["up", "none"] = "up"
     """The method to use for orientation."""
-    center_poses: bool = False
-    """Whether to center the poses."""
+    center_method: Literal["focus", "none"] = "focus"
+    """The method to use to center the poses."""
     auto_scale_poses: bool = False
     """Whether to automatically scale the poses to fit in +/- 1 bounding box."""
@@ -310,7 +310,7 @@ def _generate_dataparser_outputs(self, split="train"):  # pylint: disable=unused
         camera_to_worlds, transform = camera_utils.auto_orient_and_center_poses(
             camera_to_worlds,
             method=orientation_method,
-            center_poses=self.config.center_poses,
+            center_method=self.config.center_method,
         )

         # we should also transform normal accordingly
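
Note: the least-squares step in focus_of_attention above is the standard closed form for the point nearest to a set of lines: for ray origins o_i and unit directions d_i, the projector P_i = I - d_i d_i^T measures displacement perpendicular to ray i, and the minimizer of sum_i ||P_i (p - o_i)||^2 solves mean_i(P_i^T P_i) p = mean_i(P_i^T P_i o_i). A standalone toy sketch of that step (illustrative only, not part of the patch):

    import torch

    # Two toy rays starting on the x and y axes, both pointing at the origin.
    origins = torch.tensor([[2.0, 0.0, 0.0], [0.0, 2.0, 0.0]]).unsqueeze(-1)  # [n, 3, 1]
    directions = -origins / torch.linalg.norm(origins, dim=1, keepdim=True)   # [n, 3, 1] unit vectors

    # Same algebra as the loop body above: P_i = I - d_i d_i^T, then solve the normal equations.
    m = torch.eye(3) - directions * torch.transpose(directions, -2, -1)       # [n, 3, 3]
    mt_m = torch.transpose(m, -2, -1) @ m
    focus = torch.linalg.inv(mt_m.mean(0)) @ (mt_m @ origins).mean(0)[:, 0]
    print(focus)  # ~[0., 0., 0.]: both rays pass through the origin
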
From e5f445104d54ea8ec26e2ce93d8353c2729fd880 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 11 Aug 2023 18:51:15 +0000
Subject: [PATCH 2/7] replace torch._six.string_classes with str

---
 nerfstudio/data/utils/nerfstudio_collate.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/nerfstudio/data/utils/nerfstudio_collate.py b/nerfstudio/data/utils/nerfstudio_collate.py
index 65917d85..b7a9a63f 100644
--- a/nerfstudio/data/utils/nerfstudio_collate.py
+++ b/nerfstudio/data/utils/nerfstudio_collate.py
@@ -23,7 +23,6 @@

 import torch
 import torch.utils.data
-from torch._six import string_classes

 from nerfstudio.cameras.cameras import Cameras
 from nerfstudio.utils.images import BasicImages
@@ -120,7 +119,7 @@ def nerfstudio_collate(
         return torch.tensor(batch, dtype=torch.float64)
     elif isinstance(elem, int):
         return torch.tensor(batch)
-    elif isinstance(elem, string_classes):
+    elif isinstance(elem, str):
         return batch
     elif isinstance(elem, collections.abc.Mapping):
         try:

Note: a hypothetical call showing the dispatch above (this assumes the rest of nerfstudio_collate mirrors PyTorch's default_collate, as the surrounding branches suggest; illustrative only, not part of the patch):

    import torch
    from nerfstudio.data.utils.nerfstudio_collate import nerfstudio_collate

    # Strings hit the isinstance(elem, str) branch and pass through as a list,
    # while tensors inside the mapping are collated (stacked) recursively.
    batch = nerfstudio_collate(
        [
            {"name": "frame_000", "rays": torch.zeros(3)},
            {"name": "frame_001", "rays": torch.ones(3)},
        ]
    )
    # batch["name"] == ["frame_000", "frame_001"]; batch["rays"].shape == (2, 3)

From ced06064756eb0e56b3cf546f7f3fb03e8974ab4 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 11 Aug 2023 21:28:14 +0000
Subject: [PATCH 3/7] implement center_method in nerfstudio-data

---
 nerfstudio/data/dataparsers/nerfstudio_dataparser.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
index c88f4275..b8c08f3f 100644
--- a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
+++ b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
@@ -56,8 +56,8 @@ class NerfstudioDataParserConfig(DataParserConfig):
     """How much to scale the region of interest by."""
     orientation_method: Literal["pca", "up", "none"] = "up"
     """The method to use for orientation."""
-    center_poses: bool = True
-    """Whether to center the poses."""
+    center_method: Literal["poses", "focus", "none"] = "poses"
+    """The method to use to center the poses."""
     auto_scale_poses: bool = True
     """Whether to automatically scale the poses to fit in +/- 1 bounding box."""
     train_split_percentage: float = 0.9
@@ -189,7 +189,7 @@ def _generate_dataparser_outputs(self, split="train"):
         poses, transform_matrix = camera_utils.auto_orient_and_center_poses(
             poses,
             method=orientation_method,
-            center_poses=self.config.center_poses,
+            center_method=self.config.center_method,
         )

         # Scale poses
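
Note: a toy call exercising the new option end to end (shapes follow the signature introduced in patch 1; the look_at helper and all pose values are hypothetical, illustrative only and not part of the patch):

    import torch
    from nerfstudio.cameras import camera_utils

    def look_at(position, target):
        # build a camera-to-world matrix whose -z axis points from position toward target
        forward = torch.nn.functional.normalize(target - position, dim=-1)
        right = torch.nn.functional.normalize(torch.linalg.cross(forward, torch.tensor([0.0, 0.0, 1.0])), dim=-1)
        up = torch.linalg.cross(right, forward)
        c2w = torch.eye(4)
        c2w[:3, 0] = right
        c2w[:3, 1] = up
        c2w[:3, 2] = -forward  # camera looks down its -z axis
        c2w[:3, 3] = position
        return c2w

    # three inward-looking cameras; their optical axes all pass through the origin
    target = torch.zeros(3)
    positions = torch.tensor([[3.0, 0.0, 1.0], [0.0, 3.0, 1.0], [-3.0, 0.0, 1.0]])
    poses = torch.stack([look_at(p, target) for p in positions])  # [3, 4, 4]

    oriented, transform = camera_utils.auto_orient_and_center_poses(
        poses, method="up", center_method="focus"
    )
    print(oriented.shape, transform.shape)  # expected: torch.Size([3, 3, 4]) torch.Size([3, 4])
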
From 5663636044fd45c897701989aa96c56723de7727 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 11 Aug 2023 21:44:02 +0000
Subject: [PATCH 4/7] add foreground_mask to meta_data.json

---
 scripts/datasets/process_nerfstudio_to_sdfstudio.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/scripts/datasets/process_nerfstudio_to_sdfstudio.py b/scripts/datasets/process_nerfstudio_to_sdfstudio.py
index 95a5e129..994a7218 100644
--- a/scripts/datasets/process_nerfstudio_to_sdfstudio.py
+++ b/scripts/datasets/process_nerfstudio_to_sdfstudio.py
@@ -199,6 +199,9 @@ def main(args):
             frame["mono_depth_path"] = rgb_path.replace("_rgb.png", "_depth.npy")
             frame["mono_normal_path"] = rgb_path.replace("_rgb.png", "_normal.npy")

+        if args.foreground_mask:
+            frame["foreground_mask"] = rgb_path.replace("_rgb.png", "_mask.png")
+
         frames.append(frame)
         out_index += 1

@@ -209,7 +212,7 @@ def main(args):
         "width": tar_w,
         "has_mono_prior": args.mono_prior,
         "has_sensor_depth": args.sensor_depth,
-        "has_foreground_mask": False,
+        "has_foreground_mask": args.foreground_mask,
         "pairs": None,
         "worldtogt": scale_mat.tolist(),
         "scene_box": scene_box,
@@ -261,6 +264,8 @@ def main(args):
     parser.add_argument("--mono-prior", dest="mono_prior", action="store_true",
                         help="Whether to generate mono-prior depths and normals. "
                              "If enabled, the images will be cropped to 384*384")
+    parser.add_argument("--foreground-mask", dest="foreground_mask", action="store_true",
+                        help="Whether to add foreground masks to the json file")
     parser.add_argument("--crop-mult", dest="crop_mult", type=int, default=1,
                         help="image size will be resized to crop_mult*384, only take effect when enabling mono-prior")
     parser.add_argument("--omnidata-path", dest="omnidata_path",
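
Note: with --foreground-mask set, each frame entry written to meta_data.json gains a "foreground_mask" path derived from the rgb filename, and the top-level "has_foreground_mask" flag mirrors the CLI switch. An illustrative frame entry (filenames hypothetical; other keys exactly as the script already writes them):

    frame = {
        "rgb_path": "000000_rgb.png",
        "mono_depth_path": "000000_depth.npy",
        "mono_normal_path": "000000_normal.npy",
        "foreground_mask": "000000_mask.png",
        # camera intrinsics/extrinsics keys unchanged from the existing script
    }
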
From 0f5cd6ddc525a7f76a4d642b65c5b92c995cb2d2 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sat, 12 Aug 2023 15:33:48 +0000
Subject: [PATCH 5/7] add support for mini-omnidata to generate depth and normals at any image size

---
 .../process_nerfstudio_to_sdfstudio.py | 62 ++++++++++++------
 1 file changed, 41 insertions(+), 21 deletions(-)

diff --git a/scripts/datasets/process_nerfstudio_to_sdfstudio.py b/scripts/datasets/process_nerfstudio_to_sdfstudio.py
index 994a7218..c2118c0b 100644
--- a/scripts/datasets/process_nerfstudio_to_sdfstudio.py
+++ b/scripts/datasets/process_nerfstudio_to_sdfstudio.py
@@ -123,9 +123,10 @@ def main(args):

     # === Resize the images and intrinsics ===
     # Only resize the images when we want to use mono prior
+    # Skip if we're using mini-omnidata
     sample_img = cv2.imread(str(image_paths[0]))
     h, w, _ = sample_img.shape
-    if args.mono_prior:
+    if args.mono_prior and not args.mini_omnidata:
         # get smallest side to generate square crop
         target_crop = min(h, w)
         tar_h = tar_w = 384 * args.crop_mult
@@ -223,25 +224,42 @@ def main(args):

     # === Generate mono priors using omnidata ===
     if args.mono_prior:
-        assert os.path.exists(args.pretrained_models), "Pretrained model path not found"
-        assert os.path.exists(args.omnidata_path), "omnidata l path not found"
-        # generate mono depth and normal
-        print("Generating mono depth...")
-        os.system(
-            f"python scripts/datasets/extract_monocular_cues.py \
-            --omnidata_path {args.omnidata_path} \
-            --pretrained_model {args.pretrained_models} \
-            --img_path {output_dir} --output_path {output_dir} \
-            --task depth"
-        )
-        print("Generating mono normal...")
-        os.system(
-            f"python scripts/datasets/extract_monocular_cues.py \
-            --omnidata_path {args.omnidata_path} \
-            --pretrained_model {args.pretrained_models} \
-            --img_path {output_dir} --output_path {output_dir} \
-            --task normal"
-        )
+        if args.mini_omnidata:
+            assert os.path.exists(args.omnidata_path), "omnidata path not found"
+            print("Generating mono normal with mini-omnidata...")
+            os.system(
+                f"python {args.omnidata_path}/run.py \
+                --images_dir {output_dir} --output_dir {output_dir} \
+                --task normal --model_path {args.omnidata_path}/pretrained_models/omnidata_dpt_normal_v2.ckpt --dtu_format"
+            )
+
+            print("Generating mono depth with mini-omnidata...")
+            os.system(
+                f"python {args.omnidata_path}/run.py \
+                --images_dir {output_dir} --output_dir {output_dir} \
+                --task depth --model_path {args.omnidata_path}/pretrained_models/omnidata_dpt_depth_v2.ckpt --dtu_format"
+            )
+
+        else:
+            assert os.path.exists(args.pretrained_models), "Pretrained model path not found"
+            assert os.path.exists(args.omnidata_path), "omnidata path not found"
+            # generate mono depth and normal
+            print("Generating mono depth...")
+            os.system(
+                f"python scripts/datasets/extract_monocular_cues.py \
+                --omnidata_path {args.omnidata_path} \
+                --pretrained_model {args.pretrained_models} \
+                --img_path {output_dir} --output_path {output_dir} \
+                --task depth"
+            )
+            print("Generating mono normal...")
+            os.system(
+                f"python scripts/datasets/extract_monocular_cues.py \
+                --omnidata_path {args.omnidata_path} \
+                --pretrained_model {args.pretrained_models} \
+                --img_path {output_dir} --output_path {output_dir} \
+                --task normal"
+            )

     print(f"Done! The processed data has been saved in {output_dir}")
@@ -270,10 +288,12 @@ def main(args):
                         help="image size will be resized to crop_mult*384, only take effect when enabling mono-prior")
     parser.add_argument("--omnidata-path", dest="omnidata_path",
                         default="/omnidata/omnidata_tools/torch",
-                        help="path to omnidata model")
+                        help="path to the omnidata torch directory, or to the mini-omnidata root directory when --mini-omnidata is set")
     parser.add_argument("--pretrained-models", dest="pretrained_models",
                         default="/omnidata_tools/torch/pretrained_models/",
                         help="path to pretrained models")
+    parser.add_argument("--mini-omnidata", dest="mini_omnidata", action="store_true",
+                        help="use mini-omnidata to generate normals and depths at any image size; requires --omnidata-path to point to the mini-omnidata directory")

     args = parser.parse_args()

From e157d3affaf9a8e33ee8454f3ed125cb892d83d3 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Sat, 12 Aug 2023 15:42:02 +0000
Subject: [PATCH 6/7] add create_masked_img to data_utils

---
 nerfstudio/data/utils/data_utils.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/nerfstudio/data/utils/data_utils.py b/nerfstudio/data/utils/data_utils.py
index cba91453..973f22af 100644
--- a/nerfstudio/data/utils/data_utils.py
+++ b/nerfstudio/data/utils/data_utils.py
@@ -51,3 +51,28 @@ def get_semantics_and_mask_tensors_from_path(
     semantics = torch.from_numpy(np.array(pil_image, dtype="int64"))[..., None]
     mask = torch.sum(semantics == mask_indices, dim=-1, keepdim=True) == 0
     return semantics, mask
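
Note: an illustrative invocation of the new path (the flags shown are the ones added in patches 4 and 5; the script's other required input/output arguments are unchanged and elided here, and the omnidata path is a placeholder):

    python scripts/datasets/process_nerfstudio_to_sdfstudio.py \
        --mono-prior --mini-omnidata \
        --omnidata-path /path/to/mini-omnidata \
        --foreground-mask

Because --mini-omnidata skips the 384-crop resize branch, the images keep their original resolution and run.py receives the dataset directory via --images_dir.
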
+ """ + img = np.array(Image.open(img_filepath), dtype=np.float32) + mask = np.array(Image.open(mask_filepath), dtype=np.float32) / 255.0 + assert len(img.shape) == 3 + if img.shape[-1] == 4: + img = img[:, :, :3] + + # in case the mask comes with alpha channel + if mask.shape[-1] == 4: + mask = mask[:, :, :3] + + if len(mask.shape) == 2: + mask = mask[..., np.newaxis] + + masked_image = Image.fromarray((img * mask).astype(np.uint8)) + masked_image_filename = output_dir / (img_filepath.stem + "_masked" + img_filepath.suffix) + masked_image.save(masked_image_filename) + + return masked_image_filename From 07dfe7dc51ed2f0cbd848a813c522da1a3bc780b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 14 Aug 2023 14:00:53 +0000 Subject: [PATCH 7/7] added foreground mask training to sdfstudio-data --- .../data/dataparsers/sdfstudio_dataparser.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/nerfstudio/data/dataparsers/sdfstudio_dataparser.py b/nerfstudio/data/dataparsers/sdfstudio_dataparser.py index 717f37f5..4e74a7d1 100644 --- a/nerfstudio/data/dataparsers/sdfstudio_dataparser.py +++ b/nerfstudio/data/dataparsers/sdfstudio_dataparser.py @@ -15,6 +15,7 @@ """Data parser for friends dataset""" from __future__ import annotations +import os from dataclasses import dataclass, field from pathlib import Path from typing import Dict, Optional, Type @@ -34,6 +35,7 @@ DataparserOutputs, ) from nerfstudio.data.scene_box import SceneBox +from nerfstudio.data.utils.data_utils import create_masked_img from nerfstudio.utils.images import BasicImages from nerfstudio.utils.io import load_from_json @@ -176,6 +178,12 @@ class SDFStudioDataParserConfig(DataParserConfig): """automatically orient the scene such that the up direction is the same as the viewer's up direction""" load_dtu_highres: bool = False """load high resolution images from DTU dataset, should only be used for the preprocessed DTU dataset""" + train_with_masked_imgs: bool = False + """whether or not to mask out objects using foreground masks and train with masked images""" + sample_from_mask: bool = False + """if true, pixels are sampled only from masked regions""" + masked_img_dir: str = "masked_images" + """name of the folder where masked images are stored if train_with_masked_imgs is true""" def filter_list(list_to_filter, indices): @@ -209,6 +217,7 @@ def _generate_dataparser_outputs(self, split="train"): # pylint: disable=unused image_filenames = [] depth_images = [] + mask_filenames = [] normal_images = [] sensor_depth_images = [] foreground_mask_images = [] @@ -221,6 +230,35 @@ def _generate_dataparser_outputs(self, split="train"): # pylint: disable=unused for i, frame in enumerate(meta["frames"]): image_filename = self.config.data / frame["rgb_path"] + + if ( + self.config.train_with_masked_imgs + or self.config.include_foreground_mask + or self.config.sample_from_mask + ): + assert meta["has_foreground_mask"] + mask_filename = self.config.data / frame["foreground_mask"] + mask = np.array(Image.open(mask_filename), dtype=np.float32) / 255.0 + if len(mask.shape) == 3: + mask = mask[..., 0] + if self.config.train_with_masked_imgs or self.config.sample_from_mask: + masked_img_dir_path = self.config.data / self.config.masked_img_dir + os.makedirs(str(masked_img_dir_path), exist_ok=True) + + if self.config.train_with_masked_imgs: + image_filename = create_masked_img(image_filename, mask_filename, masked_img_dir_path) + + if self.config.include_foreground_mask: + foreground_mask = mask[..., None] + 
From 07dfe7dc51ed2f0cbd848a813c522da1a3bc780b Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Mon, 14 Aug 2023 14:00:53 +0000
Subject: [PATCH 7/7] add foreground mask training to sdfstudio-data

---
 .../data/dataparsers/sdfstudio_dataparser.py | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/nerfstudio/data/dataparsers/sdfstudio_dataparser.py b/nerfstudio/data/dataparsers/sdfstudio_dataparser.py
index 717f37f5..4e74a7d1 100644
--- a/nerfstudio/data/dataparsers/sdfstudio_dataparser.py
+++ b/nerfstudio/data/dataparsers/sdfstudio_dataparser.py
@@ -15,6 +15,7 @@
 """Data parser for friends dataset"""
 from __future__ import annotations

+import os
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Dict, Optional, Type
@@ -34,6 +35,7 @@
     DataparserOutputs,
 )
 from nerfstudio.data.scene_box import SceneBox
+from nerfstudio.data.utils.data_utils import create_masked_img
 from nerfstudio.utils.images import BasicImages
 from nerfstudio.utils.io import load_from_json
@@ -176,6 +178,12 @@ class SDFStudioDataParserConfig(DataParserConfig):
     """automatically orient the scene such that the up direction is the same as the viewer's up direction"""
     load_dtu_highres: bool = False
     """load high resolution images from DTU dataset, should only be used for the preprocessed DTU dataset"""
+    train_with_masked_imgs: bool = False
+    """whether to mask out the background using the foreground masks and train on the masked images"""
+    sample_from_mask: bool = False
+    """if true, pixels are sampled only from the foreground (masked) regions"""
+    masked_img_dir: str = "masked_images"
+    """name of the folder where masked images are stored when train_with_masked_imgs is true"""


 def filter_list(list_to_filter, indices):
@@ -209,6 +217,7 @@ def _generate_dataparser_outputs(self, split="train"):  # pylint: disable=unused
         image_filenames = []
         depth_images = []
+        mask_filenames = []
         normal_images = []
         sensor_depth_images = []
         foreground_mask_images = []
@@ -221,6 +230,35 @@ def _generate_dataparser_outputs(self, split="train"):  # pylint: disable=unused
         for i, frame in enumerate(meta["frames"]):
             image_filename = self.config.data / frame["rgb_path"]
+
+            if (
+                self.config.train_with_masked_imgs
+                or self.config.include_foreground_mask
+                or self.config.sample_from_mask
+            ):
+                assert meta["has_foreground_mask"]
+                mask_filename = self.config.data / frame["foreground_mask"]
+                mask = np.array(Image.open(mask_filename), dtype=np.float32) / 255.0
+                if len(mask.shape) == 3:
+                    mask = mask[..., 0]
+                if self.config.train_with_masked_imgs or self.config.sample_from_mask:
+                    masked_img_dir_path = self.config.data / self.config.masked_img_dir
+                    os.makedirs(str(masked_img_dir_path), exist_ok=True)
+
+                if self.config.train_with_masked_imgs:
+                    image_filename = create_masked_img(image_filename, mask_filename, masked_img_dir_path)
+
+                if self.config.include_foreground_mask:
+                    foreground_mask = mask[..., None]
+                    foreground_mask_images.append(torch.from_numpy(foreground_mask).float())
+
+                if self.config.sample_from_mask:
+                    # nerfstudio's pixel sampler requires single-channel masks
+                    mask_img = Image.fromarray((255.0 * mask).astype(np.uint8))
+                    mask_filename = masked_img_dir_path / mask_filename.name
+                    mask_img.save(mask_filename)
+                    mask_filenames.append(mask_filename)
+
             intrinsics = torch.tensor(frame["intrinsics"])
             camtoworld = torch.tensor(frame["camtoworld"])
@@ -236,6 +274,8 @@ def _generate_dataparser_outputs(self, split="train"):  # pylint: disable=unused
             assert meta["has_mono_prior"]
             # load mono depth
             depth = np.load(self.config.data / frame["mono_depth_path"])
+            if self.config.train_with_masked_imgs:
+                depth = depth * mask
             depth_images.append(torch.from_numpy(depth).float())

             # load mono normal
@@ -258,6 +298,9 @@ def _generate_dataparser_outputs(self, split="train"):  # pylint: disable=unused
             assert meta["has_sensor_depth"]
             # load sensor depth
             sensor_depth = np.load(self.config.data / frame["sensor_depth_path"])
+            if self.config.train_with_masked_imgs:
+                # TODO: maybe set the background depth to a very large value instead of 0?
+                sensor_depth = sensor_depth * mask
             sensor_depth_images.append(torch.from_numpy(sensor_depth).float())

             if self.config.include_foreground_mask:
@@ -418,6 +461,7 @@ def _generate_dataparser_outputs(self, split="train"):  # pylint: disable=unused
         dataparser_outputs = DataparserOutputs(
             image_filenames=filter_list(image_filenames, indices),
             cameras=cameras,
+            mask_filenames=filter_list(mask_filenames, indices) if self.config.sample_from_mask else None,
             scene_box=scene_box,
             additional_inputs=additional_inputs_dict,
             depths=filter_list(depth_images, indices),