From e661439f461efde668fd4d4e98546d8d30d260c9 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Fri, 14 Feb 2025 11:46:04 -0800 Subject: [PATCH 01/10] Minor edits to instructions, fuzzy and semantic Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/README.md | 15 +++++++++++++-- tutorials/dapt-curation/code/main.py | 8 ++++++-- tutorials/dapt-curation/code/requirements.txt | 2 ++ tutorials/dapt-curation/code/utils.py | 4 +++- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index 55ddb2b5c..98ddb92f3 100755 --- a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -44,12 +44,23 @@ The tutorial follows the steps below:<br> ## Usage -After installing the NeMo Curator package, install the dependencies and run: +Please follow the instructions in the README on using docker image and installing the NeMo Curator package. Then, install the following dependencies for running the dapt tutorial: ```bash +cd NeMo-Curator/tutorials/dapt-curation/code/ +apt update +apt-get install poppler-utils +apt-get install tesseract-ocr +apt install libtesseract-dev +pip install -r requirements.txt +pip uninstall --yes $(pip list --format=freeze | grep opencv) +rm -rf /usr/local/lib/python3.10/dist-packages/cv2/ +pip install opencv-python-headless cd code pip install -r requirements.txt python main.py --device "gpu" ``` -This will download chip-design related datasets and begin the data curation pipeline. Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require the GPU. +This will download chip-design related datasets and begin the data curation pipeline. + +Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require the GPU. diff --git a/tutorials/dapt-curation/code/main.py b/tutorials/dapt-curation/code/main.py index 5f51ead85..f1941fda1 100755 --- a/tutorials/dapt-curation/code/main.py +++ b/tutorials/dapt-curation/code/main.py @@ -119,7 +119,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: jsonl_dir (str): Directory path where the JSONL files are stored. """ # Initialize the Dask cluster. 
- client = get_client(**ArgumentHelper.parse_client_args(args)) + client = get_client(**ArgumentHelper.parse_client_args(args),set_torch_to_use_rmm=True) # Define data curation steps for text and pdf files curation_steps_text = Sequential( @@ -171,6 +171,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: dataset_text = curation_steps_text(orig_dataset_text) dataset_code = curation_steps_code(orig_dataset_code) + print('********************* Generating Statistics *********************') print(f"Original dataset length for text files: {len(orig_dataset_text.df)}") print(f"After dataprep for text files: {len(dataset_text.df)}") print(f"Original dataset length for code files: {len(orig_dataset_code.df)}") @@ -194,6 +195,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: semantic_dataset_text = DocumentDataset( gpu_dataset_text.df[gpu_dataset_text.df.id.isin(unique_ids)] ) + print('********************* Generating Statistics *********************') print(f"After semantic dedupe for text files: {len(semantic_dataset_text.df)}") print("Executing the fuzzy dedupe pipeline...") @@ -208,8 +210,9 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: dataset_text.df = fuzzy_dataset_text.df.to_backend("pandas") dataset_code.df = fuzzy_dataset_code.df.to_backend("pandas") + print('********************* Generating Statistics *********************') print(f"After fuzzy dedupe for text files: {len(dataset_text.df)}") - print(f"After fuzzy dedupe: {len(dataset_code.df)}") + print(f"After fuzzy dedupe for code files: {len(dataset_code.df)}") final_dataset_text = dataset_text.persist() final_dataset_code = dataset_code.persist() @@ -274,6 +277,7 @@ def main(): args = ArgumentHelper(parser).add_distributed_args().parse_args() # Limit the total number of workers to ensure we don't run out of memory. args.n_workers = min(args.n_workers, 8) + args.device = "gpu" print("Args: ", args) # Download all the sources and get the list of text and code files. 
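Note on the `set_torch_to_use_rmm=True` change above: it lets PyTorch allocate through the RAPIDS Memory Manager, so the embedding model and cuDF draw from one GPU memory pool. A minimal sketch of bringing up the client this way, assuming NeMo Curator's `get_client` accepts a `cluster_type` keyword (that keyword name is an assumption; `set_torch_to_use_rmm` comes from the hunk itself):

```python
# Sketch: GPU Dask client for the dedupe stages (assumes a CUDA-capable host).
from nemo_curator.utils.distributed_utils import get_client

client = get_client(
    cluster_type="gpu",         # assumed keyword, mirroring --device "gpu"
    set_torch_to_use_rmm=True,  # route torch allocations through RMM, per the hunk above
)
print(client.dashboard_link)    # useful for watching the dedupe shuffles
```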
diff --git a/tutorials/dapt-curation/code/requirements.txt b/tutorials/dapt-curation/code/requirements.txt index 481f5b0a3..31dff6d71 100755 --- a/tutorials/dapt-curation/code/requirements.txt +++ b/tutorials/dapt-curation/code/requirements.txt @@ -5,3 +5,5 @@ nltk==3.8.1 poppler-utils unstructured[all-docs]==0.14.5 unstructured[pdf] +tesseract-ocr +qgrid \ No newline at end of file diff --git a/tutorials/dapt-curation/code/utils.py b/tutorials/dapt-curation/code/utils.py index c81637941..2d601688e 100755 --- a/tutorials/dapt-curation/code/utils.py +++ b/tutorials/dapt-curation/code/utils.py @@ -300,12 +300,14 @@ def fuzzy_dedupe(dataset: DocumentDataset, cache: str) -> DocumentDataset: id_field="id", text_field="text", seed=42, - char_ngrams=24, + char_ngrams=20, num_buckets=20, hashes_per_bucket=13, use_64_bit_hash=False, buckets_per_shuffle=5, false_positive_check=False, + num_anchors=2, + jaccard_threshold=0.8, ) fuzzy_dup = FuzzyDuplicates(config=fuzzy_dedup_config) duplicates = fuzzy_dup(dataset) From dab3789a2a8941a6a9a3c757f5ac7fd2abb78872 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Fri, 14 Feb 2025 14:45:59 -0800 Subject: [PATCH 02/10] Update tutorials/dapt-curation/README.md Co-authored-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index 98ddb92f3..ad2b1a930 100755 --- a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -44,7 +44,7 @@ The tutorial follows the steps below:<br> ## Usage -Please follow the instructions in the README on using docker image and installing the NeMo Curator package. Then, install the following dependencies for running the dapt tutorial: +Please follow the instructions in the README on using docker image and installing the NeMo Curator package. Then, install the following dependencies for running the DAPT tutorial: ```bash cd NeMo-Curator/tutorials/dapt-curation/code/ From 8e86a1aa04636513f627ff603a998de1c462d2b9 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Fri, 14 Feb 2025 14:46:08 -0800 Subject: [PATCH 03/10] Update tutorials/dapt-curation/code/main.py Co-authored-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/code/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tutorials/dapt-curation/code/main.py b/tutorials/dapt-curation/code/main.py index f1941fda1..25a1e729b 100755 --- a/tutorials/dapt-curation/code/main.py +++ b/tutorials/dapt-curation/code/main.py @@ -277,7 +277,6 @@ def main(): args = ArgumentHelper(parser).add_distributed_args().parse_args() # Limit the total number of workers to ensure we don't run out of memory. args.n_workers = min(args.n_workers, 8) - args.device = "gpu" print("Args: ", args) # Download all the sources and get the list of text and code files. 
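For reference, PATCH 01's `utils.py` hunk leaves the fuzzy-dedupe step roughly as sketched below (a reconstruction, not the full file; `cache` is whatever cache directory the caller passes in). Note that `num_anchors` and `jaccard_threshold` only take effect when `false_positive_check=True`, which is presumably why PATCH 05 later reverts them along with `char_ngrams`:

```python
# Sketch of utils.py's fuzzy_dedupe() after PATCH 01 (PATCH 05 below restores
# char_ngrams to 24 and drops the two Jaccard-only knobs again).
from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig
from nemo_curator.datasets import DocumentDataset

def fuzzy_dedupe(dataset: DocumentDataset, cache: str) -> DocumentDataset:
    config = FuzzyDuplicatesConfig(
        cache_dir=cache,
        id_field="id",
        text_field="text",
        seed=42,
        char_ngrams=20,              # PATCH 01 value; PATCH 05 restores 24
        num_buckets=20,
        hashes_per_bucket=13,
        use_64_bit_hash=False,
        buckets_per_shuffle=5,
        false_positive_check=False,  # MinHash/LSH matches are trusted as-is,
        num_anchors=2,               # so these two settings are inert unless
        jaccard_threshold=0.8,       # the false-positive check is enabled
    )
    # Returns the duplicate groups found; main.py removes them from the dataset.
    return FuzzyDuplicates(config=config)(dataset)
```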
From 517a2199b7d2cdd054bb380143167e067b3879ed Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Fri, 14 Feb 2025 14:46:34 -0800 Subject: [PATCH 04/10] Update tutorials/dapt-curation/code/requirements.txt Co-authored-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/code/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/dapt-curation/code/requirements.txt b/tutorials/dapt-curation/code/requirements.txt index 31dff6d71..95cf1dc67 100755 --- a/tutorials/dapt-curation/code/requirements.txt +++ b/tutorials/dapt-curation/code/requirements.txt @@ -6,4 +6,4 @@ poppler-utils unstructured[all-docs]==0.14.5 unstructured[pdf] tesseract-ocr -qgrid \ No newline at end of file +qgrid From bec0bfe5b1248d535cf8685fd7f7cebdc528f851 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Wed, 19 Feb 2025 19:51:04 -0800 Subject: [PATCH 05/10] Addressing PR comments Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/code/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tutorials/dapt-curation/code/utils.py b/tutorials/dapt-curation/code/utils.py index 2d601688e..c81637941 100755 --- a/tutorials/dapt-curation/code/utils.py +++ b/tutorials/dapt-curation/code/utils.py @@ -300,14 +300,12 @@ def fuzzy_dedupe(dataset: DocumentDataset, cache: str) -> DocumentDataset: id_field="id", text_field="text", seed=42, - char_ngrams=20, + char_ngrams=24, num_buckets=20, hashes_per_bucket=13, use_64_bit_hash=False, buckets_per_shuffle=5, false_positive_check=False, - num_anchors=2, - jaccard_threshold=0.8, ) fuzzy_dup = FuzzyDuplicates(config=fuzzy_dedup_config) duplicates = fuzzy_dup(dataset) From 6b142496f267adc8a263a9f2788c296734f5567d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Feb 2025 17:37:23 +0000 Subject: [PATCH 06/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tutorials/dapt-curation/README.md | 4 ++-- tutorials/dapt-curation/code/main.py | 10 ++++++---- tutorials/dapt-curation/code/requirements.txt | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index ad2b1a930..e55b3a775 100755 --- a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -49,7 +49,7 @@ Please follow the instructions in the README on using docker image and installin ```bash cd NeMo-Curator/tutorials/dapt-curation/code/ apt update -apt-get install poppler-utils +apt-get install poppler-utils apt-get install tesseract-ocr apt install libtesseract-dev pip install -r requirements.txt @@ -61,6 +61,6 @@ pip install -r requirements.txt python main.py --device "gpu" ``` -This will download chip-design related datasets and begin the data curation pipeline. +This will download chip-design related datasets and begin the data curation pipeline. Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require the GPU. 
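Since the README leans on `--device "gpu"`, a quick sketch of how that flag reaches the pipeline (names mirror the `main.py` hunks above; the explicit argv list is purely illustrative). PATCH 03's removal of the hard-coded `args.device = "gpu"` is what makes the CLI flag authoritative again:

```python
# Sketch: parsing the distributed args the way main.py does.
import argparse

from nemo_curator.utils.script_utils import ArgumentHelper

parser = argparse.ArgumentParser(description="DAPT curation sketch")
# Equivalent to running: python main.py --device "gpu"
args = ArgumentHelper(parser).add_distributed_args().parse_args(["--device", "gpu"])
args.n_workers = min(args.n_workers, 8)  # cap workers to avoid running out of memory
print("Args: ", args)
```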
diff --git a/tutorials/dapt-curation/code/main.py b/tutorials/dapt-curation/code/main.py index 25a1e729b..80042bb29 100755 --- a/tutorials/dapt-curation/code/main.py +++ b/tutorials/dapt-curation/code/main.py @@ -119,7 +119,9 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: jsonl_dir (str): Directory path where the JSONL files are stored. """ # Initialize the Dask cluster. - client = get_client(**ArgumentHelper.parse_client_args(args),set_torch_to_use_rmm=True) + client = get_client( + **ArgumentHelper.parse_client_args(args), set_torch_to_use_rmm=True + ) # Define data curation steps for text and pdf files curation_steps_text = Sequential( @@ -171,7 +173,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: dataset_text = curation_steps_text(orig_dataset_text) dataset_code = curation_steps_code(orig_dataset_code) - print('********************* Generating Statistics *********************') + print("********************* Generating Statistics *********************") print(f"Original dataset length for text files: {len(orig_dataset_text.df)}") print(f"After dataprep for text files: {len(dataset_text.df)}") print(f"Original dataset length for code files: {len(orig_dataset_code.df)}") @@ -195,7 +197,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: semantic_dataset_text = DocumentDataset( gpu_dataset_text.df[gpu_dataset_text.df.id.isin(unique_ids)] ) - print('********************* Generating Statistics *********************') + print("********************* Generating Statistics *********************") print(f"After semantic dedupe for text files: {len(semantic_dataset_text.df)}") print("Executing the fuzzy dedupe pipeline...") @@ -210,7 +212,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: dataset_text.df = fuzzy_dataset_text.df.to_backend("pandas") dataset_code.df = fuzzy_dataset_code.df.to_backend("pandas") - print('********************* Generating Statistics *********************') + print("********************* Generating Statistics *********************") print(f"After fuzzy dedupe for text files: {len(dataset_text.df)}") print(f"After fuzzy dedupe for code files: {len(dataset_code.df)}") diff --git a/tutorials/dapt-curation/code/requirements.txt b/tutorials/dapt-curation/code/requirements.txt index 95cf1dc67..2fc55c6c7 100755 --- a/tutorials/dapt-curation/code/requirements.txt +++ b/tutorials/dapt-curation/code/requirements.txt @@ -3,7 +3,7 @@ arxiv-downloader cchardet nltk==3.8.1 poppler-utils +qgrid +tesseract-ocr unstructured[all-docs]==0.14.5 unstructured[pdf] -tesseract-ocr -qgrid From 31a62daeba97e64c698e27e551af63786be94143 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Wed, 26 Feb 2025 14:55:48 -0800 Subject: [PATCH 07/10] Update tutorials/dapt-curation/README.md Co-authored-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index e55b3a775..28c72e488 100755 --- a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -44,7 +44,7 @@ The tutorial follows the steps below:<br> ## Usage -Please follow the instructions in the README on using docker image and installing the NeMo Curator package. 
Then, install the following dependencies for running the DAPT tutorial: +Please follow the instructions in NeMo Curator's [README](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#nemo-framework-container) to run the NeMo Framework Container and install the NeMo Curator package. Then, install the following dependencies for running the DAPT tutorial: ```bash cd NeMo-Curator/tutorials/dapt-curation/code/ From 6cb179f69dd3f58410453bba2069b066e7033176 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Tue, 11 Mar 2025 14:33:52 -0700 Subject: [PATCH 08/10] edits to ensure recent PR changes Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/README.md | 6 +++--- .../code/configs/text_semantic_dedupe_config.yaml | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index 28c72e488..7686eff84 100755 --- a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -47,7 +47,7 @@ The tutorial follows the steps below:<br> Please follow the instructions in NeMo Curator's [README](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#nemo-framework-container) to run the NeMo Framework Container and install the NeMo Curator package. Then, install the following dependencies for running the DAPT tutorial: ```bash -cd NeMo-Curator/tutorials/dapt-curation/code/ +cd /opt/NeMo-Curator/tutorials/dapt-curation/code/ apt update apt-get install poppler-utils apt-get install tesseract-ocr @@ -56,8 +56,8 @@ pip install -r requirements.txt pip uninstall --yes $(pip list --format=freeze | grep opencv) rm -rf /usr/local/lib/python3.10/dist-packages/cv2/ pip install opencv-python-headless -cd code -pip install -r requirements.txt +python -c "import nltk; nltk.download('punkt_tab')" +python -c "import nltk; nltk.download('averaged_perceptron_tagger_eng')" python main.py --device "gpu" ``` diff --git a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml index 52291f52d..e93a388eb 100644 --- a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml +++ b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml @@ -13,14 +13,13 @@ write_to_filename: false # Clustering configuration max_iter: 100 -n_clusters: 20 +n_clusters: 15 clustering_save_loc: "clustering_results" -random_state: 1234 sim_metric: "cosine" which_to_keep: "hard" sort_clusters: true kmeans_with_cos_dist: false -clustering_input_partition_size: "2gb" +partition_size: "2gb" # Extract dedup configuration eps_thresholds: @@ -28,4 +27,4 @@ eps_thresholds: - 0.01 # Which threshold to use for extracting deduped data -eps_to_extract: 0.1 +eps_to_extract: 0.1 \ No newline at end of file From 8026d7d0caed9921b9bc313e8a1794c492976db4 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Wed, 12 Mar 2025 11:16:02 -0700 Subject: [PATCH 09/10] Update tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml Co-authored-by: Vibhu Jawa <vibhujawa@gmail.com> Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- .../dapt-curation/code/configs/text_semantic_dedupe_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml index a8ff08bb7..2a0f72776 100644 --- 
a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml +++ b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml @@ -20,7 +20,7 @@ which_to_keep: "hard" batched_cosine_similarity: 1024 sort_clusters: true kmeans_with_cos_dist: false -partition_size: "2gb" +clustering_input_partition_size: "2gb" # Extract dedup configuration eps_thresholds: From 063f5ed2994cd1ebff92bf93e9df86511ebb1f69 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Wed, 12 Mar 2025 13:20:32 -0700 Subject: [PATCH 10/10] Addressing PR comment to add new line, partition size key in configs Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- .../code/configs/text_semantic_dedupe_config.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml index 2a0f72776..fda7df41f 100644 --- a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml +++ b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml @@ -20,7 +20,7 @@ which_to_keep: "hard" batched_cosine_similarity: 1024 sort_clusters: true kmeans_with_cos_dist: false -clustering_input_partition_size: "2gb" +partition_size: "2gb" # Extract dedup configuration eps_thresholds: @@ -28,4 +28,5 @@ eps_thresholds: - 0.01 # Which threshold to use for extracting deduped data -eps_to_extract: 0.1 \ No newline at end of file +eps_to_extract: 0.1 +
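The last three patches churn on a single YAML key (`partition_size` vs `clustering_input_partition_size`), which has to match the field name of the `SemDedupConfig` in the installed NeMo Curator release: if the loader builds the config as `cls(**yaml_dict)`, an unknown key fails loudly. A sketch of how the edited file might be consumed, assuming a `SemDedupConfig.from_yaml` helper and an `id_column` argument (both assumptions about NeMo Curator's API):

```python
# Sketch: wiring the edited YAML into the semantic-dedupe stage.
from nemo_curator import SemDedup, SemDedupConfig
from nemo_curator.datasets import DocumentDataset

def semantic_dedupe(dataset: DocumentDataset, yaml_path: str) -> DocumentDataset:
    # A mismatched key (e.g. partition_size) would raise here, which is
    # presumably what drove the renames in PATCH 08 through PATCH 10.
    config = SemDedupConfig.from_yaml(yaml_path)
    semdedup = SemDedup(config=config, id_column="id")  # id_column assumed
    # main.py then keeps only the ids present in this result, via the
    # gpu_dataset_text.df.id.isin(unique_ids) filter shown in PATCH 01.
    return semdedup(dataset)
```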