Minor edits to instructions in readme, requirements, fuzzy and semantic dedupe flags #548

Open
wants to merge 16 commits into base: main

17 changes: 14 additions & 3 deletions tutorials/dapt-curation/README.md
@@ -44,12 +44,23 @@ The tutorial follows the steps below:<br>

## Usage

After installing the NeMo Curator package, install the dependencies and run:
Please follow the instructions in NeMo Curator's [README](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#nemo-framework-container) to run the NeMo Framework Container and install the NeMo Curator package. Then, install the following dependencies for running the DAPT tutorial:

```bash
cd code
cd /opt/NeMo-Curator/tutorials/dapt-curation/code/
apt update
apt-get install poppler-utils
apt-get install tesseract-ocr
apt install libtesseract-dev
pip install -r requirements.txt
pip uninstall --yes $(pip list --format=freeze | grep opencv)
rm -rf /usr/local/lib/python3.10/dist-packages/cv2/
pip install opencv-python-headless
python -c "import nltk; nltk.download('punkt_tab')"
python -c "import nltk; nltk.download('averaged_perceptron_tagger_eng')"
python main.py --device "gpu"
```
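
As an optional sanity check after the steps above, you can confirm that the headless OpenCV build and the NLTK data installed cleanly (these commands are an editorial suggestion, not part of the tutorial):

```bash
# Should print a version string rather than raise an import error.
python -c "import cv2; print(cv2.__version__)"
# Raises LookupError if the punkt_tab tokenizer data is missing.
python -c "import nltk; nltk.data.find('tokenizers/punkt_tab')"
```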

This will download chip-design related datasets and begin the data curation pipeline. Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require the GPU.
This will download chip-design related datasets and begin the data curation pipeline.

Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require the GPU.
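
If no GPU is available, the same pipeline can presumably run with both dedupe stages skipped (the `"cpu"` value is an assumption inferred from the flag, not verified in this PR):

```bash
# Assumed CPU-only invocation; semantic and fuzzy dedupe require --device "gpu".
python main.py --device "cpu"
```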
tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml
@@ -13,15 +13,14 @@ write_to_filename: false

# Clustering configuration
max_iter: 100
n_clusters: 20
n_clusters: 15
clustering_save_loc: "clustering_results"
random_state: 1234
sim_metric: "cosine"
which_to_keep: "hard"
batched_cosine_similarity: 1024
sort_clusters: true
kmeans_with_cos_dist: false
clustering_input_partition_size: "2gb"
partition_size: "2gb"
Collaborator @praateekmahajan commented on Mar 12, 2025:
Why has this key changed? I believe this key is still called `clustering_input_partition_size`:

clustering_input_partition_size: str = "2gb"

Contributor Author @ruchaa-apte replied:

Reverted the change

Contributor Author @ruchaa-apte commented on Mar 12, 2025:

@VibhuJawa / @praateekmahajan - I tested this change in the 25.02 image; however, `clustering_input_partition_size: "2gb"` gave me a key error.
When I reverted it to `partition_size: "2gb"`, I was able to run the code to completion.

Contributor Author @ruchaa-apte replied:

Here is the error that pops up upon changing the line to `clustering_input_partition_size: "2gb"`:

```
Traceback (most recent call last):
  File "/opt/NeMo-Curator/tutorials/dapt-curation/code/main.py", line 301, in <module>
    main()
  File "/opt/NeMo-Curator/tutorials/dapt-curation/code/main.py", line 282, in main
    run_curation_pipeline(args, text_files, code_files)
  File "/opt/NeMo-Curator/tutorials/dapt-curation/code/main.py", line 189, in run_curation_pipeline
    duplicates = semantic_dedupe(
  File "/opt/NeMo-Curator/tutorials/dapt-curation/code/utils.py", line 343, in semantic_dedupe
    semdedup_config = SemDedupConfig.from_yaml(sem_dedupe_config_yaml_path)
  File "/opt/NeMo-Curator/nemo_curator/modules/config.py", line 27, in from_yaml
    yaml_dict = yaml.safe_load(file)
  File "/usr/local/lib/python3.12/dist-packages/yaml/__init__.py", line 125, in safe_load
    return load(stream, SafeLoader)
  File "/usr/local/lib/python3.12/dist-packages/yaml/__init__.py", line 81, in load
    return loader.get_single_data()
  File "/usr/local/lib/python3.12/dist-packages/yaml/constructor.py", line 49, in get_single_data
    node = self.get_single_node()
  File "/usr/local/lib/python3.12/dist-packages/yaml/composer.py", line 36, in get_single_node
    document = self.compose_document()
  File "/usr/local/lib/python3.12/dist-packages/yaml/composer.py", line 55, in compose_document
    node = self.compose_node(None, None)
  File "/usr/local/lib/python3.12/dist-packages/yaml/composer.py", line 84, in compose_node
    node = self.compose_mapping_node(anchor)
  File "/usr/local/lib/python3.12/dist-packages/yaml/composer.py", line 127, in compose_mapping_node
    while not self.check_event(MappingEndEvent):
  File "/usr/local/lib/python3.12/dist-packages/yaml/parser.py", line 98, in check_event
    self.current_event = self.state()
  File "/usr/local/lib/python3.12/dist-packages/yaml/parser.py", line 428, in parse_block_mapping_key
    if self.check_token(KeyToken):
  File "/usr/local/lib/python3.12/dist-packages/yaml/scanner.py", line 116, in check_token
    self.fetch_more_tokens()
  File "/usr/local/lib/python3.12/dist-packages/yaml/scanner.py", line 223, in fetch_more_tokens
    return self.fetch_value()
  File "/usr/local/lib/python3.12/dist-packages/yaml/scanner.py", line 577, in fetch_value
    raise ScannerError(None, None,
yaml.scanner.ScannerError: mapping values are not allowed here
  in "/opt/NeMo-Curator/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml", line 46, column 28
```

Collaborator @VibhuJawa commented on Mar 13, 2025:

Interesting. We added `clustering_input_partition_size` in https://github.com/NVIDIA/NeMo-Curator/pull/564/files.

I wonder if you could just remove this line/config entirely, since both versions should then hit the default (which you are using anyway).
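
As a side note, here is a minimal sketch (class and field names are illustrative, not NeMo Curator's actual `config.py`) of how a dataclass-backed `from_yaml` can reject a YAML key that the installed release does not recognize, which is the kind of version mismatch discussed above:

```python
from dataclasses import dataclass, fields

import yaml


@dataclass
class SemDedupConfigSketch:
    # Illustrative fields only; not the real SemDedupConfig schema.
    n_clusters: int = 20
    clustering_input_partition_size: str = "2gb"

    @classmethod
    def from_yaml(cls, path: str) -> "SemDedupConfigSketch":
        with open(path) as f:
            yaml_dict = yaml.safe_load(f)
        known = {fld.name for fld in fields(cls)}
        unknown = set(yaml_dict) - known
        if unknown:
            # A release that predates (or postdates) a renamed key lands here.
            raise KeyError(f"Unrecognized config keys: {sorted(unknown)}")
        return cls(**yaml_dict)
```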


# Extract dedup configuration
eps_thresholds:
@@ -30,3 +29,4 @@ eps_thresholds:

# Which threshold to use for extracting deduped data
eps_to_extract: 0.1

9 changes: 7 additions & 2 deletions tutorials/dapt-curation/code/main.py
@@ -119,7 +119,9 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
jsonl_dir (str): Directory path where the JSONL files are stored.
"""
# Initialize the Dask cluster.
client = get_client(**ArgumentHelper.parse_client_args(args))
client = get_client(
**ArgumentHelper.parse_client_args(args), set_torch_to_use_rmm=True
)
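
For context on this hunk: a minimal standalone sketch of the pattern it adopts, where `get_client` builds the Dask cluster and `set_torch_to_use_rmm=True` points PyTorch's allocator at RMM so cuDF and torch share one GPU memory pool (the `cluster_type` argument below is an assumption about the parsed args, not shown in this hunk):

```python
# Sketch only; argument values are illustrative.
from nemo_curator.utils.distributed_utils import get_client

client = get_client(cluster_type="gpu", set_torch_to_use_rmm=True)
print(client.dashboard_link)  # Dask dashboard URL for monitoring the pipeline
client.close()
```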

# Define data curation steps for text and pdf files
curation_steps_text = Sequential(
@@ -171,6 +173,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
dataset_text = curation_steps_text(orig_dataset_text)
dataset_code = curation_steps_code(orig_dataset_code)

print("********************* Generating Statistics *********************")
print(f"Original dataset length for text files: {len(orig_dataset_text.df)}")
print(f"After dataprep for text files: {len(dataset_text.df)}")
print(f"Original dataset length for code files: {len(orig_dataset_code.df)}")
@@ -193,6 +196,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
semantic_dataset_text = DocumentDataset(
gpu_dataset_text.df[gpu_dataset_text.df.id.isin(unique_ids)]
)
print("********************* Generating Statistics *********************")
print(f"After semantic dedupe for text files: {len(semantic_dataset_text.df)}")

print("Executing the fuzzy dedupe pipeline...")
@@ -207,8 +211,9 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:

dataset_text.df = fuzzy_dataset_text.df.to_backend("pandas")
dataset_code.df = fuzzy_dataset_code.df.to_backend("pandas")
print("********************* Generating Statistics *********************")
print(f"After fuzzy dedupe for text files: {len(dataset_text.df)}")
print(f"After fuzzy dedupe: {len(dataset_code.df)}")
print(f"After fuzzy dedupe for code files: {len(dataset_code.df)}")

final_dataset_text = dataset_text.persist()
final_dataset_code = dataset_code.persist()
2 changes: 2 additions & 0 deletions tutorials/dapt-curation/code/requirements.txt
@@ -3,5 +3,7 @@ arxiv-downloader
cchardet
nltk==3.8.1
poppler-utils
qgrid
tesseract-ocr
Comment on lines +6 to +7
Collaborator commented:

We recently discussed this; let's pin the versions here so that we don't see breakage with subsequent releases. cc @ayushdg @ryantwolf

unstructured[all-docs]==0.14.5
unstructured[pdf]
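
On the version-pinning suggestion above, one common approach is to capture exact versions from an environment where the tutorial ran to completion (an illustrative command, not part of this PR; the output file name is made up):

```bash
# Record exact versions of the loosely pinned packages for requirements.txt.
pip freeze | grep -iE 'nltk|qgrid|unstructured' > pinned-versions.txt
```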