Updated test tolerances for H100 (linkedin#55)

shimizust · web-flow · commit 2c43eba98bb7 · 2024-08-23T10:42:12.000-07:00
## Summary
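- Updated test tolerances so the test suite passes on H100:
  - `test/transformers/test_cross_entropy.py`: the `(10.0, torch.bfloat16, ...)` case, `1e-8` → `1e-7`
  - `test/transformers/test_rms_norm.py`: the `torch.float32` case, `rtol` `1e-7` → `1e-6`
- Added `.venv` to the flake8 `exclude` list in `.flake8`
- Formatting-only changes in `examples/medusa/train.py` and `setup.py`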

## Testing Done

- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence

```
$ make checkstyle
flake8 . --exclude=.venv; flake8_status=$?; \
isort .; isort_status=$?; \
black .; black_status=$?; \
if [ $flake8_status -ne 0 ] || [ $isort_status -ne 0 ] || [ $black_status -ne 0 ]; then \
        exit 1; \
fi
Skipped 2 files
All done! ✨ 🍰 ✨
51 files left unchanged.

$ make test
pytest --disable-warnings test/ --ignore=test/convergence
================================================================================ test session starts ================================================================================
platform linux -- Python 3.10.14, pytest-7.1.2, pluggy-1.0.0
rootdir: /home/jobuser/Liger-Kernel
plugins: lipy-config-base-32.0.27, lipy-fabric-35.2.19, lipy-test-8.0.66, datadir-1.3.1
collected 114 items                                                                                                                                                                 

test/transformers/test_cross_entropy.py ..........................................................                                                                            [ 50%]
test/transformers/test_fused_linear_cross_entropy.py ......                                                                                                                   [ 56%]
test/transformers/test_geglu.py ........                                                                                                                                      [ 63%]
test/transformers/test_rms_norm.py ................                                                                                                                           [ 77%]
test/transformers/test_rope.py ............                                                                                                                                   [ 87%]
test/transformers/test_swiglu.py ........                                                                                                                                     [ 94%]
test/transformers/test_trainer_integration.py ...                                                                                                                             [ 97%]
test/transformers/test_transformers_monkey_patch.py .                                                                                                                         [ 98%]
test/triton/test_triton_monkey_patch.py ..                                                                                                                                    [100%]

=============================================================================== 114 passed in 35.25s ================================================================================
(.venv) jobuser [ ~/Liger-Kernel ]$ make test-convergence
HF_DATASETS_OFFLINE=1 pytest --disable-warnings test/convergence
================================================================================ test session starts ================================================================================
platform linux -- Python 3.10.14, pytest-7.1.2, pluggy-1.0.0
rootdir: /home/jobuser/Liger-Kernel
plugins: lipy-config-base-32.0.27, lipy-fabric-35.2.19, lipy-test-8.0.66, datadir-1.3.1
collected 6 items                                                                                                                                                                   

test/convergence/test_mini_models.py ....                                                                                                                                     [ 66%]
test/convergence/test_mini_models_no_logits.py ..                                                                                                                             [100%]

================================================================================ 6 passed in 23.04s =================================================================================
```
diff --git a/.flake8 b/.flake8
@@ -4,6 +4,7 @@ max-line-length = 120
 exclude = 
     .git,
     __pycache__,
-    benchmark_internal/others
+    benchmark_internal/others,
+    .venv
 # E203: https://github.com/psf/black/issues/315
 extend-ignore=E501,B006,E731,A002,E203
diff --git a/examples/medusa/train.py b/examples/medusa/train.py
@@ -37,9 +37,7 @@
 
 @dataclass
 class ModelArguments:
-    model_name_or_path: Optional[str] = field(
-        default="meta-llama/Meta-Llama-3-8B"
-    )
+    model_name_or_path: Optional[str] = field(default="meta-llama/Meta-Llama-3-8B")
 
 
 @dataclass
diff --git a/setup.py b/setup.py
@@ -13,17 +13,17 @@
     package_dir={"": "src"},
     packages=find_namespace_packages(where="src"),
     classifiers=[
-        'Development Status :: 4 - Beta',
-        'Intended Audience :: Developers',
-        'Intended Audience :: Science/Research',
-        'Intended Audience :: Education',
-        'License :: OSI Approved :: BSD License',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.8',
-        'Programming Language :: Python :: 3.9',
-        'Programming Language :: Python :: 3.10',
-        'Topic :: Software Development :: Libraries',
-        'Topic :: Scientific/Engineering :: Artificial Intelligence',
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "Intended Audience :: Education",
+        "License :: OSI Approved :: BSD License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Topic :: Software Development :: Libraries",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
     keywords="triton,kernels,LLM training,deep learning,Hugging Face,PyTorch,GPU optimization",
     include_package_data=True,
diff --git a/test/transformers/test_cross_entropy.py b/test/transformers/test_cross_entropy.py
@@ -101,7 +101,7 @@ def _test_correctness_not_last_layer_once(
     [
         (0.1, torch.bfloat16, 1e-8, 5e-2),
         (1.0, torch.bfloat16, 1e-8, 5e-2),
-        (10.0, torch.bfloat16, 1e-8, 5e-2),
+        (10.0, torch.bfloat16, 1e-7, 5e-2),
         (0.1, torch.float32, 1e-8, 1e-6),
         (1.0, torch.float32, 1e-8, 1e-6),
         (10.0, torch.float32, 1e-8, 1e-6),
diff --git a/test/transformers/test_rms_norm.py b/test/transformers/test_rms_norm.py
@@ -41,7 +41,7 @@ def forward(self, hidden_states):
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
-        (torch.float32, 1e-4, 1e-7),
+        (torch.float32, 1e-4, 1e-6),
         (torch.bfloat16, 5.0, 1e-5),
     ],
 )

Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,7 @@ def _test_correctness_not_last_layer_once(`
`101`	`101`	`[`
`102`	`102`	`(0.1, torch.bfloat16, 1e-8, 5e-2),`
`103`	`103`	`(1.0, torch.bfloat16, 1e-8, 5e-2),`
`104`		`- (10.0, torch.bfloat16, 1e-8, 5e-2),`
	`104`	`+ (10.0, torch.bfloat16, 1e-7, 5e-2),`
`105`	`105`	`(0.1, torch.float32, 1e-8, 1e-6),`
`106`	`106`	`(1.0, torch.float32, 1e-8, 1e-6),`
`107`	`107`	`(10.0, torch.float32, 1e-8, 1e-6),`
Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ def forward(self, hidden_states):`
`41`	`41`	`@pytest.mark.parametrize(`
`42`	`42`	`"dtype, atol, rtol",`
`43`	`43`	`[`
`44`		`- (torch.float32, 1e-4, 1e-7),`
	`44`	`+ (torch.float32, 1e-4, 1e-6),`
`45`	`45`	`(torch.bfloat16, 5.0, 1e-5),`
`46`	`46`	`],`
`47`	`47`	`)`