From e661439f461efde668fd4d4e98546d8d30d260c9 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Fri, 14 Feb 2025 11:46:04 -0800 Subject: [PATCH 01/10] Minor edits to instructions, fuzzy and semantic Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/README.md | 15 +++++++++++++-- tutorials/dapt-curation/code/main.py | 8 ++++++-- tutorials/dapt-curation/code/requirements.txt | 2 ++ tutorials/dapt-curation/code/utils.py | 4 +++- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index 55ddb2b5c..98ddb92f3 100755 --- a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -44,12 +44,23 @@ The tutorial follows the steps below:<br> ## Usage -After installing the NeMo Curator package, install the dependencies and run: +Please follow the instructions in the README on using docker image and installing the NeMo Curator package. Then, install the following dependencies for running the dapt tutorial: ```bash +cd NeMo-Curator/tutorials/dapt-curation/code/ +apt update +apt-get install poppler-utils +apt-get install tesseract-ocr +apt install libtesseract-dev +pip install -r requirements.txt +pip uninstall --yes $(pip list --format=freeze | grep opencv) +rm -rf /usr/local/lib/python3.10/dist-packages/cv2/ +pip install opencv-python-headless cd code pip install -r requirements.txt python main.py --device "gpu" ``` -This will download chip-design related datasets and begin the data curation pipeline. Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require the GPU. +This will download chip-design related datasets and begin the data curation pipeline. + +Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require the GPU. diff --git a/tutorials/dapt-curation/code/main.py b/tutorials/dapt-curation/code/main.py index 5f51ead85..f1941fda1 100755 --- a/tutorials/dapt-curation/code/main.py +++ b/tutorials/dapt-curation/code/main.py @@ -119,7 +119,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: jsonl_dir (str): Directory path where the JSONL files are stored. """ # Initialize the Dask cluster. 
- client = get_client(**ArgumentHelper.parse_client_args(args)) + client = get_client(**ArgumentHelper.parse_client_args(args),set_torch_to_use_rmm=True) # Define data curation steps for text and pdf files curation_steps_text = Sequential( @@ -171,6 +171,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: dataset_text = curation_steps_text(orig_dataset_text) dataset_code = curation_steps_code(orig_dataset_code) + print('********************* Generating Statistics *********************') print(f"Original dataset length for text files: {len(orig_dataset_text.df)}") print(f"After dataprep for text files: {len(dataset_text.df)}") print(f"Original dataset length for code files: {len(orig_dataset_code.df)}") @@ -194,6 +195,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: semantic_dataset_text = DocumentDataset( gpu_dataset_text.df[gpu_dataset_text.df.id.isin(unique_ids)] ) + print('********************* Generating Statistics *********************') print(f"After semantic dedupe for text files: {len(semantic_dataset_text.df)}") print("Executing the fuzzy dedupe pipeline...") @@ -208,8 +210,9 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: dataset_text.df = fuzzy_dataset_text.df.to_backend("pandas") dataset_code.df = fuzzy_dataset_code.df.to_backend("pandas") + print('********************* Generating Statistics *********************') print(f"After fuzzy dedupe for text files: {len(dataset_text.df)}") - print(f"After fuzzy dedupe: {len(dataset_code.df)}") + print(f"After fuzzy dedupe for code files: {len(dataset_code.df)}") final_dataset_text = dataset_text.persist() final_dataset_code = dataset_code.persist() @@ -274,6 +277,7 @@ def main(): args = ArgumentHelper(parser).add_distributed_args().parse_args() # Limit the total number of workers to ensure we don't run out of memory. args.n_workers = min(args.n_workers, 8) + args.device = "gpu" print("Args: ", args) # Download all the sources and get the list of text and code files. 
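Note on the `set_torch_to_use_rmm=True` change above: it lets PyTorch allocate through the RAPIDS Memory Manager, so the embedding model and cuDF draw from one GPU memory pool. A minimal sketch of bringing up the client this way, assuming NeMo Curator's `get_client` accepts a `cluster_type` keyword (that keyword name is an assumption; `set_torch_to_use_rmm` comes from the hunk itself):

```python
# Sketch: GPU Dask client for the dedupe stages (assumes a CUDA-capable host).
from nemo_curator.utils.distributed_utils import get_client

client = get_client(
    cluster_type="gpu",         # assumed keyword, mirroring --device "gpu"
    set_torch_to_use_rmm=True,  # route torch allocations through RMM, per the hunk above
)
print(client.dashboard_link)    # useful for watching the dedupe shuffles
```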
diff --git a/tutorials/dapt-curation/code/requirements.txt b/tutorials/dapt-curation/code/requirements.txt index 481f5b0a3..31dff6d71 100755 --- a/tutorials/dapt-curation/code/requirements.txt +++ b/tutorials/dapt-curation/code/requirements.txt @@ -5,3 +5,5 @@ nltk==3.8.1 poppler-utils unstructured[all-docs]==0.14.5 unstructured[pdf] +tesseract-ocr +qgrid \ No newline at end of file diff --git a/tutorials/dapt-curation/code/utils.py b/tutorials/dapt-curation/code/utils.py index c81637941..2d601688e 100755 --- a/tutorials/dapt-curation/code/utils.py +++ b/tutorials/dapt-curation/code/utils.py @@ -300,12 +300,14 @@ def fuzzy_dedupe(dataset: DocumentDataset, cache: str) -> DocumentDataset: id_field="id", text_field="text", seed=42, - char_ngrams=24, + char_ngrams=20, num_buckets=20, hashes_per_bucket=13, use_64_bit_hash=False, buckets_per_shuffle=5, false_positive_check=False, + num_anchors=2, + jaccard_threshold=0.8, ) fuzzy_dup = FuzzyDuplicates(config=fuzzy_dedup_config) duplicates = fuzzy_dup(dataset) From dab3789a2a8941a6a9a3c757f5ac7fd2abb78872 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Fri, 14 Feb 2025 14:45:59 -0800 Subject: [PATCH 02/10] Update tutorials/dapt-curation/README.md Co-authored-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index 98ddb92f3..ad2b1a930 100755 --- a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -44,7 +44,7 @@ The tutorial follows the steps below:<br> ## Usage -Please follow the instructions in the README on using docker image and installing the NeMo Curator package. Then, install the following dependencies for running the dapt tutorial: +Please follow the instructions in the README on using docker image and installing the NeMo Curator package. Then, install the following dependencies for running the DAPT tutorial: ```bash cd NeMo-Curator/tutorials/dapt-curation/code/ From 8e86a1aa04636513f627ff603a998de1c462d2b9 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Fri, 14 Feb 2025 14:46:08 -0800 Subject: [PATCH 03/10] Update tutorials/dapt-curation/code/main.py Co-authored-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/code/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tutorials/dapt-curation/code/main.py b/tutorials/dapt-curation/code/main.py index f1941fda1..25a1e729b 100755 --- a/tutorials/dapt-curation/code/main.py +++ b/tutorials/dapt-curation/code/main.py @@ -277,7 +277,6 @@ def main(): args = ArgumentHelper(parser).add_distributed_args().parse_args() # Limit the total number of workers to ensure we don't run out of memory. args.n_workers = min(args.n_workers, 8) - args.device = "gpu" print("Args: ", args) # Download all the sources and get the list of text and code files. 
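For reference, PATCH 01's `utils.py` hunk leaves the fuzzy-dedupe step roughly as sketched below (a reconstruction, not the full file; `cache` is whatever cache directory the caller passes in). Note that `num_anchors` and `jaccard_threshold` only take effect when `false_positive_check=True`, which is presumably why PATCH 05 later reverts them along with `char_ngrams`:

```python
# Sketch of utils.py's fuzzy_dedupe() after PATCH 01 (PATCH 05 below restores
# char_ngrams to 24 and drops the two Jaccard-only knobs again).
from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig
from nemo_curator.datasets import DocumentDataset

def fuzzy_dedupe(dataset: DocumentDataset, cache: str) -> DocumentDataset:
    config = FuzzyDuplicatesConfig(
        cache_dir=cache,
        id_field="id",
        text_field="text",
        seed=42,
        char_ngrams=20,              # PATCH 01 value; PATCH 05 restores 24
        num_buckets=20,
        hashes_per_bucket=13,
        use_64_bit_hash=False,
        buckets_per_shuffle=5,
        false_positive_check=False,  # MinHash/LSH matches are trusted as-is,
        num_anchors=2,               # so these two settings are inert unless
        jaccard_threshold=0.8,       # the false-positive check is enabled
    )
    # Returns the duplicate groups found; main.py removes them from the dataset.
    return FuzzyDuplicates(config=config)(dataset)
```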
From 517a2199b7d2cdd054bb380143167e067b3879ed Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Fri, 14 Feb 2025 14:46:34 -0800 Subject: [PATCH 04/10] Update tutorials/dapt-curation/code/requirements.txt Co-authored-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/code/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/dapt-curation/code/requirements.txt b/tutorials/dapt-curation/code/requirements.txt index 31dff6d71..95cf1dc67 100755 --- a/tutorials/dapt-curation/code/requirements.txt +++ b/tutorials/dapt-curation/code/requirements.txt @@ -6,4 +6,4 @@ poppler-utils unstructured[all-docs]==0.14.5 unstructured[pdf] tesseract-ocr -qgrid \ No newline at end of file +qgrid From bec0bfe5b1248d535cf8685fd7f7cebdc528f851 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Wed, 19 Feb 2025 19:51:04 -0800 Subject: [PATCH 05/10] Addressing PR comments Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/code/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tutorials/dapt-curation/code/utils.py b/tutorials/dapt-curation/code/utils.py index 2d601688e..c81637941 100755 --- a/tutorials/dapt-curation/code/utils.py +++ b/tutorials/dapt-curation/code/utils.py @@ -300,14 +300,12 @@ def fuzzy_dedupe(dataset: DocumentDataset, cache: str) -> DocumentDataset: id_field="id", text_field="text", seed=42, - char_ngrams=20, + char_ngrams=24, num_buckets=20, hashes_per_bucket=13, use_64_bit_hash=False, buckets_per_shuffle=5, false_positive_check=False, - num_anchors=2, - jaccard_threshold=0.8, ) fuzzy_dup = FuzzyDuplicates(config=fuzzy_dedup_config) duplicates = fuzzy_dup(dataset) From 6b142496f267adc8a263a9f2788c296734f5567d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Feb 2025 17:37:23 +0000 Subject: [PATCH 06/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tutorials/dapt-curation/README.md | 4 ++-- tutorials/dapt-curation/code/main.py | 10 ++++++---- tutorials/dapt-curation/code/requirements.txt | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index ad2b1a930..e55b3a775 100755 --- a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -49,7 +49,7 @@ Please follow the instructions in the README on using docker image and installin ```bash cd NeMo-Curator/tutorials/dapt-curation/code/ apt update -apt-get install poppler-utils +apt-get install poppler-utils apt-get install tesseract-ocr apt install libtesseract-dev pip install -r requirements.txt @@ -61,6 +61,6 @@ pip install -r requirements.txt python main.py --device "gpu" ``` -This will download chip-design related datasets and begin the data curation pipeline. +This will download chip-design related datasets and begin the data curation pipeline. Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require the GPU. 
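Since the README leans on `--device "gpu"`, a quick sketch of how that flag reaches the pipeline (names mirror the `main.py` hunks above; the explicit argv list is purely illustrative). PATCH 03's removal of the hard-coded `args.device = "gpu"` is what makes the CLI flag authoritative again:

```python
# Sketch: parsing the distributed args the way main.py does.
import argparse

from nemo_curator.utils.script_utils import ArgumentHelper

parser = argparse.ArgumentParser(description="DAPT curation sketch")
# Equivalent to running: python main.py --device "gpu"
args = ArgumentHelper(parser).add_distributed_args().parse_args(["--device", "gpu"])
args.n_workers = min(args.n_workers, 8)  # cap workers to avoid running out of memory
print("Args: ", args)
```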
diff --git a/tutorials/dapt-curation/code/main.py b/tutorials/dapt-curation/code/main.py index 25a1e729b..80042bb29 100755 --- a/tutorials/dapt-curation/code/main.py +++ b/tutorials/dapt-curation/code/main.py @@ -119,7 +119,9 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: jsonl_dir (str): Directory path where the JSONL files are stored. """ # Initialize the Dask cluster. - client = get_client(**ArgumentHelper.parse_client_args(args),set_torch_to_use_rmm=True) + client = get_client( + **ArgumentHelper.parse_client_args(args), set_torch_to_use_rmm=True + ) # Define data curation steps for text and pdf files curation_steps_text = Sequential( @@ -171,7 +173,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: dataset_text = curation_steps_text(orig_dataset_text) dataset_code = curation_steps_code(orig_dataset_code) - print('********************* Generating Statistics *********************') + print("********************* Generating Statistics *********************") print(f"Original dataset length for text files: {len(orig_dataset_text.df)}") print(f"After dataprep for text files: {len(dataset_text.df)}") print(f"Original dataset length for code files: {len(orig_dataset_code.df)}") @@ -195,7 +197,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: semantic_dataset_text = DocumentDataset( gpu_dataset_text.df[gpu_dataset_text.df.id.isin(unique_ids)] ) - print('********************* Generating Statistics *********************') + print("********************* Generating Statistics *********************") print(f"After semantic dedupe for text files: {len(semantic_dataset_text.df)}") print("Executing the fuzzy dedupe pipeline...") @@ -210,7 +212,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: dataset_text.df = fuzzy_dataset_text.df.to_backend("pandas") dataset_code.df = fuzzy_dataset_code.df.to_backend("pandas") - print('********************* Generating Statistics *********************') + print("********************* Generating Statistics *********************") print(f"After fuzzy dedupe for text files: {len(dataset_text.df)}") print(f"After fuzzy dedupe for code files: {len(dataset_code.df)}") diff --git a/tutorials/dapt-curation/code/requirements.txt b/tutorials/dapt-curation/code/requirements.txt index 95cf1dc67..2fc55c6c7 100755 --- a/tutorials/dapt-curation/code/requirements.txt +++ b/tutorials/dapt-curation/code/requirements.txt @@ -3,7 +3,7 @@ arxiv-downloader cchardet nltk==3.8.1 poppler-utils +qgrid +tesseract-ocr unstructured[all-docs]==0.14.5 unstructured[pdf] -tesseract-ocr -qgrid From 31a62daeba97e64c698e27e551af63786be94143 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Wed, 26 Feb 2025 14:55:48 -0800 Subject: [PATCH 07/10] Update tutorials/dapt-curation/README.md Co-authored-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index e55b3a775..28c72e488 100755 --- a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -44,7 +44,7 @@ The tutorial follows the steps below:<br> ## Usage -Please follow the instructions in the README on using docker image and installing the NeMo Curator package. 
Then, install the following dependencies for running the DAPT tutorial: +Please follow the instructions in NeMo Curator's [README](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#nemo-framework-container) to run the NeMo Framework Container and install the NeMo Curator package. Then, install the following dependencies for running the DAPT tutorial: ```bash cd NeMo-Curator/tutorials/dapt-curation/code/ From 6cb179f69dd3f58410453bba2069b066e7033176 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Tue, 11 Mar 2025 14:33:52 -0700 Subject: [PATCH 08/10] edits to ensure recent PR changes Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- tutorials/dapt-curation/README.md | 6 +++--- .../code/configs/text_semantic_dedupe_config.yaml | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index 28c72e488..7686eff84 100755 --- a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -47,7 +47,7 @@ The tutorial follows the steps below:<br> Please follow the instructions in NeMo Curator's [README](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#nemo-framework-container) to run the NeMo Framework Container and install the NeMo Curator package. Then, install the following dependencies for running the DAPT tutorial: ```bash -cd NeMo-Curator/tutorials/dapt-curation/code/ +cd /opt/NeMo-Curator/tutorials/dapt-curation/code/ apt update apt-get install poppler-utils apt-get install tesseract-ocr @@ -56,8 +56,8 @@ pip install -r requirements.txt pip uninstall --yes $(pip list --format=freeze | grep opencv) rm -rf /usr/local/lib/python3.10/dist-packages/cv2/ pip install opencv-python-headless -cd code -pip install -r requirements.txt +python -c "import nltk; nltk.download('punkt_tab')" +python -c "import nltk; nltk.download('averaged_perceptron_tagger_eng')" python main.py --device "gpu" ``` diff --git a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml index 52291f52d..e93a388eb 100644 --- a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml +++ b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml @@ -13,14 +13,13 @@ write_to_filename: false # Clustering configuration max_iter: 100 -n_clusters: 20 +n_clusters: 15 clustering_save_loc: "clustering_results" -random_state: 1234 sim_metric: "cosine" which_to_keep: "hard" sort_clusters: true kmeans_with_cos_dist: false -clustering_input_partition_size: "2gb" +partition_size: "2gb" # Extract dedup configuration eps_thresholds: @@ -28,4 +27,4 @@ eps_thresholds: - 0.01 # Which threshold to use for extracting deduped data -eps_to_extract: 0.1 +eps_to_extract: 0.1 \ No newline at end of file From 8026d7d0caed9921b9bc313e8a1794c492976db4 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Wed, 12 Mar 2025 11:16:02 -0700 Subject: [PATCH 09/10] Update tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml Co-authored-by: Vibhu Jawa <vibhujawa@gmail.com> Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- .../dapt-curation/code/configs/text_semantic_dedupe_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml index a8ff08bb7..2a0f72776 100644 --- 
a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml +++ b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml @@ -20,7 +20,7 @@ which_to_keep: "hard" batched_cosine_similarity: 1024 sort_clusters: true kmeans_with_cos_dist: false -partition_size: "2gb" +clustering_input_partition_size: "2gb" # Extract dedup configuration eps_thresholds: From 063f5ed2994cd1ebff92bf93e9df86511ebb1f69 Mon Sep 17 00:00:00 2001 From: Rucha Apte <ruchaa@nvidia.com> Date: Wed, 12 Mar 2025 13:20:32 -0700 Subject: [PATCH 10/10] Addressing PR comment to add new line, partition size key in configs Signed-off-by: Rucha Apte <ruchaa@nvidia.com> --- .../code/configs/text_semantic_dedupe_config.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml index 2a0f72776..fda7df41f 100644 --- a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml +++ b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml @@ -20,7 +20,7 @@ which_to_keep: "hard" batched_cosine_similarity: 1024 sort_clusters: true kmeans_with_cos_dist: false -clustering_input_partition_size: "2gb" +partition_size: "2gb" # Extract dedup configuration eps_thresholds: @@ -28,4 +28,5 @@ eps_thresholds: - 0.01 # Which threshold to use for extracting deduped data -eps_to_extract: 0.1 \ No newline at end of file +eps_to_extract: 0.1 +
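The last three patches churn on a single YAML key (`partition_size` vs `clustering_input_partition_size`), which has to match the field name of the `SemDedupConfig` in the installed NeMo Curator release: if the loader builds the config as `cls(**yaml_dict)`, an unknown key fails loudly. A sketch of how the edited file might be consumed, assuming a `SemDedupConfig.from_yaml` helper and an `id_column` argument (both assumptions about NeMo Curator's API):

```python
# Sketch: wiring the edited YAML into the semantic-dedupe stage.
from nemo_curator import SemDedup, SemDedupConfig
from nemo_curator.datasets import DocumentDataset

def semantic_dedupe(dataset: DocumentDataset, yaml_path: str) -> DocumentDataset:
    # A mismatched key (e.g. partition_size) would raise here, which is
    # presumably what drove the renames in PATCH 08 through PATCH 10.
    config = SemDedupConfig.from_yaml(yaml_path)
    semdedup = SemDedup(config=config, id_column="id")  # id_column assumed
    # main.py then keeps only the ids present in this result, via the
    # gpu_dataset_text.df.id.isin(unique_ids) filter shown in PATCH 01.
    return semdedup(dataset)
```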