diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md
index 55ddb2b5c..7686eff84 100755
--- a/tutorials/dapt-curation/README.md
+++ b/tutorials/dapt-curation/README.md
@@ -44,12 +44,23 @@ The tutorial follows the steps below:
 
 ## Usage
 
-After installing the NeMo Curator package, install the dependencies and run:
+Please follow the instructions in NeMo Curator's [README](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#nemo-framework-container) to run the NeMo Framework Container and install the NeMo Curator package. Then, install the following dependencies for running the DAPT tutorial:
 
 ```bash
-cd code
+cd /opt/NeMo-Curator/tutorials/dapt-curation/code/
+apt update
+apt-get install poppler-utils
+apt-get install tesseract-ocr
+apt install libtesseract-dev
 pip install -r requirements.txt
+pip uninstall --yes $(pip list --format=freeze | grep opencv)
+rm -rf /usr/local/lib/python3.10/dist-packages/cv2/
+pip install opencv-python-headless
+python -c "import nltk; nltk.download('punkt_tab')"
+python -c "import nltk; nltk.download('averaged_perceptron_tagger_eng')"
 python main.py --device "gpu"
 ```
 
-This will download chip-design related datasets and begin the data curation pipeline. Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require the GPU.
+This will download chip-design related datasets and begin the data curation pipeline.
+
+Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require the GPU.
diff --git a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml
index afa97d504..fda7df41f 100644
--- a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml
+++ b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml
@@ -13,15 +13,14 @@ write_to_filename: false
 
 # Clustering configuration
 max_iter: 100
-n_clusters: 20
+n_clusters: 15
 clustering_save_loc: "clustering_results"
-random_state: 1234
 sim_metric: "cosine"
 which_to_keep: "hard"
 batched_cosine_similarity: 1024
 sort_clusters: true
 kmeans_with_cos_dist: false
-clustering_input_partition_size: "2gb"
+partition_size: "2gb"
 
 # Extract dedup configuration
 eps_thresholds:
@@ -30,3 +29,4 @@ eps_thresholds:
 
 # Which threshold to use for extracting deduped data
 eps_to_extract: 0.1
+
diff --git a/tutorials/dapt-curation/code/main.py b/tutorials/dapt-curation/code/main.py
index 44679f9bd..71cd27942 100755
--- a/tutorials/dapt-curation/code/main.py
+++ b/tutorials/dapt-curation/code/main.py
@@ -119,7 +119,9 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
         jsonl_dir (str): Directory path where the JSONL files are stored.
     """
     # Initialize the Dask cluster.
-    client = get_client(**ArgumentHelper.parse_client_args(args))
+    client = get_client(
+        **ArgumentHelper.parse_client_args(args), set_torch_to_use_rmm=True
+    )
 
     # Define data curation steps for text and pdf files
     curation_steps_text = Sequential(
@@ -171,6 +173,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
     dataset_text = curation_steps_text(orig_dataset_text)
     dataset_code = curation_steps_code(orig_dataset_code)
 
+    print("********************* Generating Statistics *********************")
     print(f"Original dataset length for text files: {len(orig_dataset_text.df)}")
     print(f"After dataprep for text files: {len(dataset_text.df)}")
     print(f"Original dataset length for code files: {len(orig_dataset_code.df)}")
@@ -193,6 +196,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
         semantic_dataset_text = DocumentDataset(
             gpu_dataset_text.df[gpu_dataset_text.df.id.isin(unique_ids)]
         )
+        print("********************* Generating Statistics *********************")
         print(f"After semantic dedupe for text files: {len(semantic_dataset_text.df)}")
 
     print("Executing the fuzzy dedupe pipeline...")
@@ -207,8 +211,9 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
         dataset_text.df = fuzzy_dataset_text.df.to_backend("pandas")
         dataset_code.df = fuzzy_dataset_code.df.to_backend("pandas")
 
+        print("********************* Generating Statistics *********************")
         print(f"After fuzzy dedupe for text files: {len(dataset_text.df)}")
-        print(f"After fuzzy dedupe: {len(dataset_code.df)}")
+        print(f"After fuzzy dedupe for code files: {len(dataset_code.df)}")
 
     final_dataset_text = dataset_text.persist()
     final_dataset_code = dataset_code.persist()
diff --git a/tutorials/dapt-curation/code/requirements.txt b/tutorials/dapt-curation/code/requirements.txt
index 481f5b0a3..2fc55c6c7 100755
--- a/tutorials/dapt-curation/code/requirements.txt
+++ b/tutorials/dapt-curation/code/requirements.txt
@@ -3,5 +3,7 @@ arxiv-downloader
 cchardet
 nltk==3.8.1
 poppler-utils
+qgrid
+tesseract-ocr
 unstructured[all-docs]==0.14.5
 unstructured[pdf]
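
For orientation, the sketch below shows how the two non-README changes above might be exercised together: an RMM-aware Dask client as in the `main.py` hunk, and the revised `text_semantic_dedupe_config.yaml`. It is a minimal illustration rather than tutorial code; the `cluster_type` keyword, the `SemDedupConfig.from_yaml` helper, and the attribute names are assumptions based on NeMo Curator's semantic dedup examples and may differ between releases. Only `set_torch_to_use_rmm=True` is taken directly from the diff.

```python
# Illustrative sketch only -- verify names against your installed NeMo Curator release.
from nemo_curator.modules.config import SemDedupConfig  # assumed import path
from nemo_curator.utils.distributed_utils import get_client

# Mirrors the updated call in main.py: route torch allocations on GPU workers
# through RMM, which the embedding step of semantic dedupe relies on.
client = get_client(cluster_type="gpu", set_torch_to_use_rmm=True)  # cluster_type is assumed

# Load the revised config; the renamed key (partition_size) and the new
# n_clusters value are what this file now provides.
config = SemDedupConfig.from_yaml(
    "/opt/NeMo-Curator/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml"
)
print(config.n_clusters, config.eps_to_extract)

client.close()
```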