@@ -58,7 +58,7 @@ You could read, filter the dataset, and write it using the following methods
Let's walk through this code line by line.
* ``files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")`` This retrieves a list of all files in the given directory, then filters the list to include only files ending with ".jsonl".
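
A minimal sketch of the surrounding read, filter, and write flow, reusing ``get_all_files_paths_under`` as described above. The specific filter (``WordCountFilter`` with an 80-word threshold) and the ``filtered_books/`` output path are illustrative assumptions, not part of the original snippet:

.. code-block:: python

    from nemo_curator import ScoreFilter
    from nemo_curator.datasets import DocumentDataset
    from nemo_curator.filters import WordCountFilter
    from nemo_curator.utils.file_utils import get_all_files_paths_under

    # Collect only the ".jsonl" shards under the dataset directory
    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")

    # Read the shards into a DocumentDataset, keeping each record's source filename
    # so the output can be re-sharded the same way
    books = DocumentDataset.read_json(files, add_filename=True)

    # Illustrative filter: keep documents with at least 80 words
    filtered_books = ScoreFilter(WordCountFilter(min_words=80))(books)

    # Write the filtered documents back out, one JSONL file per input shard
    filtered_books.to_json("filtered_books/", write_to_filename=True)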

docs/user-guide/download.rst
@@ -36,41 +36,103 @@ By "extraction", we typically mean the process of converting a data format from
Common Crawl has an S3 bucket and a direct HTTPS endpoint. If you want to use the S3 bucket, ensure you have properly set up your credentials with `s5cmd <https://github.com/peak/s5cmd>`_.
Otherwise, the HTTPS endpoints will be used with ``wget``. Here is a small example of how to use it:

.. code-block:: python

    import os

    from nemo_curator import get_client
    from nemo_curator.download import download_common_crawl
    from nemo_curator.datasets import DocumentDataset


    def main():
        # Initialize a distributed Dask client
        client = get_client(cluster_type="cpu")

        # Parameters for downloading Common Crawl data.
        # - output_folder: directory for temporary download/extraction files
        # - start_snapshot and end_snapshot define the range to fetch
        # - output_type: specifies file format for the extracted data (e.g., "jsonl")
        output_folder = "/extracted/output/folder"
        start_snapshot = "2020-50"
        end_snapshot = "2021-04"
        output_type = "jsonl"
        os.makedirs(output_folder, exist_ok=True)

        # Download and extract the Common Crawl data.
        # The function returns a DocumentDataset that contains the extracted documents.
        # Note: The output folder and output type are passed here to store intermediate files
        # and check if the data has already been downloaded. They should match the final location
        # and file format of the extracted output.
        common_crawl_dataset = download_common_crawl(
            output_folder,
            start_snapshot,
            end_snapshot,
            output_type=output_type,
        )


    if __name__ == "__main__":
        main()

* ``"/extracted/output/folder"`` is the path to on your local filesystem where the final extracted files will be placed.
79
+
* ``"2020-50"`` is the first common crawl snapshot that will be included in the download. **Note:** Not every year and week has a snapshot. Ensure that your range includes at least one valid Common Crawl snapshot. A list of valid Common Crawl snapshots can be found `here <https://data.commoncrawl.org/>`_.
80
+
* ``"2021-04"`` is the last common crawl snapshot that will be included in the download.
81
+
* ``output_type="jsonl"`` is the file format that will be used for storing the data on disk. Currently ``"jsonl"`` and ``"parquet"`` are supported.
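
Because ``download_common_crawl`` writes its extracted shards as plain ``.jsonl`` files, a later run can also load them directly instead of downloading again. A minimal sketch, reusing the helpers shown earlier (the folder is the same illustrative path as above):

.. code-block:: python

    from nemo_curator.datasets import DocumentDataset
    from nemo_curator.utils.file_utils import get_all_files_paths_under

    # Reload previously extracted shards without re-downloading
    files = get_all_files_paths_under("/extracted/output/folder", keep_extensions="jsonl")
    common_crawl_dataset = DocumentDataset.read_json(files, add_filename=True)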
You can choose to modify the HTML text extraction algorithm used in ``download_common_crawl``. See an example below.

.. code-block:: python

    import os

    from nemo_curator import get_client
    from nemo_curator.download import (
        ResiliparseExtractor,
        download_common_crawl,
    )
    from nemo_curator.datasets import DocumentDataset


    def main():
        # Initialize a distributed Dask client
        client = get_client(cluster_type="cpu")

        # Parameters for downloading Common Crawl data.
        # - output_folder: directory for temporary download/extraction files
        # - start_snapshot and end_snapshot define the range to fetch
        # - output_type: specifies file format for the extracted data (e.g., "jsonl")
        output_folder = "/extracted/output/folder"
        start_snapshot = "2020-50"
        end_snapshot = "2021-04"
        output_type = "jsonl"
        os.makedirs(output_folder, exist_ok=True)

        # Change the extraction algorithm to use ResiliparseExtractor
        extraction_algorithm = ResiliparseExtractor()

        # Download and extract the Common Crawl data using the Resiliparse extraction algorithm.
        # The function returns a DocumentDataset that contains the extracted documents.
        common_crawl_dataset = download_common_crawl(
            output_folder,
            start_snapshot,
            end_snapshot,
            output_type=output_type,
            algorithm=extraction_algorithm,
        )

        # Write the extracted dataset to JSON format.
        # The 'to_json' method writes one JSON document per line,
        # preserving the original shard information if write_to_filename is True.
        common_crawl_dataset.to_json(output_folder, write_to_filename=True)


    if __name__ == "__main__":
        main()

Above, we changed the extraction algorithm from the default ``JusTextExtractor``.

The return value ``common_crawl_dataset`` will be in NeMo Curator's standard ``DocumentDataset`` format. Check out the function's docstring for more parameters you can use.
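
For a quick look at the result, the underlying Dask dataframe can be reached through the dataset's ``.df`` attribute; this is a small sketch, assuming the snippet above has already populated ``common_crawl_dataset``:

.. code-block:: python

    # Peek at a few extracted records; .head() triggers a small computation
    print(common_crawl_dataset.df.head())

    # Count the extracted documents across all partitions
    print(len(common_crawl_dataset.df))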
NeMo Curator's Common Crawl extraction process looks like this under the hood:
1. Decode the HTML within the record from binary to text.
2. If the HTML can be properly decoded, then with `pyCLD2 <https://github.com/aboSamoor/pycld2>`_, perform language detection on the input HTML.