
Commit a0d491e

Authored by srstevenson and Saaketh Narayan
Fix a few typos (#843)
Co-authored-by: Saaketh Narayan <[email protected]>
1 parent 69304c5 · commit a0d491e

File tree: 11 files changed, +12 −12 lines

CONTRIBUTING.md (+1 −1)

@@ -75,7 +75,7 @@ pytest -vv -s . # run all the unittests
 cd docs && make clean && make doctest # run doctests
 ```

-6\. [Optional] Compile and visualize the documentation locally. If you have a documentation changes, running the below commands is mandatory.
+6\. [Optional] Compile and visualize the documentation locally. If you have documentation changes, running the below commands is mandatory.

 <!--pytest.mark.skip-->
 ```bash

Makefile (+1 −1)

@@ -1,7 +1,7 @@
 # several pytest settings
 PYTHON ?= python # Python command
 PYTEST ?= pytest # Pytest command
-PYRIGHT ?= pyright # Pyright command. Pyright must be installed seperately -- e.g. `node install -g pyright`
+PYRIGHT ?= pyright # Pyright command. Pyright must be installed separately -- e.g. `node install -g pyright`
 EXTRA_ARGS ?= # extra arguments for pytest

 dirs := streaming tests docs

docs/source/_templates/base.html (+1 −1)

@@ -99,7 +99,7 @@
 version = fragments[1].split("/")[0]

 // NOTE: The version string will resolve to the PR number for RTD sites.
-// Checking whether first charater is a number.
+// Checking whether first character is a number.
 if (version[0] >= '0' && version[0] <= '9') {
     version = undefined
 }

docs/source/dataset_configuration/shuffling.md (+1 −1)

@@ -70,4 +70,4 @@ Samples within each shard are shuffled both before and after shards are split am
 Globally shuffles all samples. This is useful for single-node training on small data, where you want the most random shuffle possible, but is the least download-efficient of all shuffle algorithms. Training throughput is often much lower when using the `naive` shuffling algorithm.

-If you are having trouble with throughput, network downloads, or shuffle quality, please refer to the [perfomance tuning page](../distributed_training/performance_tuning.md).
+If you are having trouble with throughput, network downloads, or shuffle quality, please refer to the [performance tuning page](../distributed_training/performance_tuning.md).
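For context, the `naive` algorithm described in this hunk is selected through `StreamingDataset`'s `shuffle_algo` argument. Below is a minimal sketch (not part of this commit), assuming the argument names match the current `streaming` API; the bucket and local paths are hypothetical:

```python
from streaming import StreamingDataset

# Small single-node dataset: trade download efficiency for the most
# random shuffle possible by picking the `naive` algorithm.
dataset = StreamingDataset(
    remote='s3://my-bucket/my-dataset',  # hypothetical remote path
    local='/tmp/my-dataset',             # hypothetical local cache dir
    shuffle=True,
    shuffle_algo='naive',
)
```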

docs/source/distributed_training/performance_tuning.md (+1 −1)

@@ -23,7 +23,7 @@ $$L = 2 \cdot S \cdot \lceil\frac{C}{P}\rceil $$
 Where $L$ is the required minimum cache limit per node, in MB, $S$ is the average shard size, in MB, $C$ is the number of canonical nodes (see [here](../dataset_configuration/shuffling.md#how-shuffling-works) and [here](../distributed_training/elastic_determinism.md#requirements)), and $P$ is the number of physical nodes. This is because only a single shard, plus a potentially predownloaded subsequent shard, needs to be resident per canonical node to make progress during training.

-If using a shuffle-block-based algorithm such as [`'py1e'`](../dataset_configuration/shuffling.md#py1e-default) or [`'py1br'`](../dataset_configuration/shuffling.md#py1br), the required minumum cache limit per node will be approximately:
+If using a shuffle-block-based algorithm such as [`'py1e'`](../dataset_configuration/shuffling.md#py1e-default) or [`'py1br'`](../dataset_configuration/shuffling.md#py1br), the required minimum cache limit per node will be approximately:

 $$L = k \cdot S \lceil \frac{B}{Q} \rceil \cdot \lceil\frac{C}{P}\rceil $$
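The two cache-limit formulas quoted above are easy to sanity-check numerically. The sketch below is illustrative only and not part of the commit; the meanings of $k$, $B$, and $Q$ (an algorithm-dependent constant, the shuffle block size in samples, and the average samples per shard) are assumptions based on the surrounding performance tuning page.

```python
import math

def min_cache_limit_mb(shard_size_mb: float, canonical_nodes: int,
                       physical_nodes: int) -> float:
    """L = 2 * S * ceil(C / P): one resident shard plus one predownloaded
    shard per canonical node served by this physical node."""
    return 2 * shard_size_mb * math.ceil(canonical_nodes / physical_nodes)

def min_cache_limit_shuffle_block_mb(shard_size_mb: float, canonical_nodes: int,
                                     physical_nodes: int, k: float,
                                     block_size_samples: int,
                                     samples_per_shard: int) -> float:
    """L = k * S * ceil(B / Q) * ceil(C / P), for shuffle-block-based
    algorithms such as 'py1e' or 'py1br' (symbol meanings assumed, see above)."""
    return (k * shard_size_mb
            * math.ceil(block_size_samples / samples_per_shard)
            * math.ceil(canonical_nodes / physical_nodes))

# Example: 64 MB shards, 8 canonical nodes spread over 2 physical nodes.
print(min_cache_limit_mb(64, 8, 2))  # 2 * 64 * ceil(8/2) = 512 MB per node
```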

scripts/samples/bench_and_plot.py (+2 −2)

@@ -237,7 +237,7 @@ def bench(args: Namespace, bench_name: str, desc: str, generate: Callable,
         args (Namespace): Command-line arguments.
         bench_name (str): What to call this benchmark.
         desc (str): Brief description of the data.
-        generate (Callable): Method to genereate the dataset.
+        generate (Callable): Method to generate the dataset.
         formats (List[str]): List of shard formats to benchmark this data in.
     """
     print(f'Bench: {bench_name}')

@@ -373,7 +373,7 @@ def bench(args: Namespace, bench_name: str, desc: str, generate: Callable,
     y *= args.plot_bins
     y = y.astype(np.int64)

-    # Truncate the higest ``args.truncate_highest_frac`` timings because they get further
+    # Truncate the highest ``args.truncate_highest_frac`` timings because they get further
     # and further spaced as you ascend, which would ruin the plot.
     y = y[np.nonzero(y < args.plot_bins)[0]]

simulation/core/utils.py (+1 −1)

@@ -20,7 +20,7 @@ def get_batches_epochs(dataset: SimulationDataset, max_duration: Time) -> tuple[
     Returns:
         Tuple[int, int, int]: batches per epoch, epochs, and the total batches.
     """
-    # get epochs, batches_per_epoch, and total_batches from a Time obect
+    # get epochs, batches_per_epoch, and total_batches from a Time object
    dataset_batches = dataset.get_num_batches()
    batches_per_epoch = dataset_batches
    epochs = 1

streaming/base/batching/stratified.py (+1 −1)

@@ -115,7 +115,7 @@ def generate_work_stratified_batching(dataset: StreamingDataset, world: World, e
                 f'Number of samples for stream {stream_id} is {batch_portion} because the portion '
                 +
                 f'of this stream in the global batch, which is of size {global_batch_size}, is ' +
-                f'too low. Please increase the global batch size or increase the porportion of ' +
+                f'too low. Please increase the global batch size or increase the proportion of ' +
                 f'total samples that come from stream {stream_id}.')

     # We now merge the partitions from each stream to get our final partition over all
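For context on the error this hunk touches: with stratified batching, each stream's per-batch sample count is, in effect, its proportion of the global batch rounded to an integer, so a very small stream can round down to zero. A toy illustration of the arithmetic (not the library's actual code):

```python
global_batch_size = 512
stream_proportion = 0.001  # stream contributes 0.1% of all samples

# The stream's share of each global batch, truncated to whole samples.
batch_portion = int(stream_proportion * global_batch_size)  # int(0.512) == 0
print(batch_portion)  # 0 -> would trigger the ValueError quoted above
```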

streaming/text/convert/enwiki/mds/merge_shard_groups.py (+1 −1)

@@ -11,7 +11,7 @@
 def parse_args() -> Namespace:
-    """Parse commmand-line arguments.
+    """Parse command-line arguments.

     Returns:
         Namespace: Command-line arguments.

streaming/text/convert/enwiki/tfrecord/pick_eval_samples.py (+1 −1)

@@ -1,4 +1,4 @@
-"""Script for picking certain number of sampels.
+"""Script for picking certain number of samples.
 """

 import argparse

tests/test_streaming.py (+1 −1)

@@ -512,7 +512,7 @@ def test_stratified_batching_Exception(local_remote_dir: tuple[str, str], stream
     with pytest.raises(ValueError, match=f'Number of samples for stream*'):
         # When we iterate through the dataloader, the samples will be partitioned.
-        # This should thow ValueError since stream 2 is too small to be included in each batch.
+        # This should throw ValueError since stream 2 is too small to be included in each batch.
         for _ in dataloader:
             continue
