Bump Transformer v4.49.0 #1735

Merged
53 commits merged into main from bump-transformer-v4.49.0 on Mar 12, 2025
Changes shown are from 40 of the 53 commits.

Commits
a0a1eaa - bump transformer (Mar 5, 2025)
24a7ddc - test (Mar 6, 2025)
9d50606 - Merge branch 'main' into bump-transformer-v4.49.0 (KuuCi, Mar 6, 2025)
f7a3286 - test (Mar 6, 2025)
9497b7d - cpu tests (Mar 6, 2025)
6a36a51 - typo (Mar 6, 2025)
0793be0 - cpu test (Mar 6, 2025)
fabcd99 - rm debug code (Mar 6, 2025)
b07e13d - rm debug code (Mar 6, 2025)
a6f8fed - gpu test 1 (Mar 6, 2025)
fb67ad5 - clip (Mar 6, 2025)
c707e8e - fix (Mar 7, 2025)
b222487 - test (Mar 7, 2025)
c1310de - test (Mar 7, 2025)
c0a56fc - test (Mar 7, 2025)
fa642c6 - fix (Mar 7, 2025)
eecdf6b - fix (Mar 7, 2025)
7fcd88e - fix (Mar 7, 2025)
7fb5d46 - test (Mar 7, 2025)
c022016 - test (Mar 7, 2025)
4f94d8e - test (Mar 7, 2025)
52c2205 - test (Mar 8, 2025)
8b93963 - test (Mar 9, 2025)
83bcc90 - flash (Mar 10, 2025)
b5282f7 - typo (Mar 10, 2025)
dae4572 - precommit (Mar 10, 2025)
8bcce0a - test (Mar 10, 2025)
6d2d5cb - precisionchanges (Mar 10, 2025)
26dc16e - pyright (Mar 10, 2025)
f0d6650 - clean check_hf_tokenizer_equivalence (Mar 10, 2025)
78b2d59 - precommit (Mar 11, 2025)
80a909b - more patches (Mar 11, 2025)
834c668 - test (Mar 11, 2025)
2d5f897 - precommit (Mar 11, 2025)
7525f6f - additional precision tests (Mar 11, 2025)
0036564 - clean (Mar 11, 2025)
bd1488b - clean (Mar 11, 2025)
567f882 - rm duplicate (Mar 11, 2025)
0c4ba54 - update yamls (Mar 11, 2025)
b616b8f - no longer support mpt :( (Mar 11, 2025)
fc11ced - llama 3.1 (Mar 11, 2025)
c160580 - update error (Mar 11, 2025)
1884078 - precommit (Mar 11, 2025)
363566c - composer (Mar 11, 2025)
0af94af - Update llmfoundry/models/hf/hf_base.py (dakinggg, Mar 11, 2025)
6ea80d3 - 8b, lol (Mar 11, 2025)
7203209 - precommit (Mar 11, 2025)
3a2ec93 - typo (Mar 11, 2025)
348b0bf - update path (Mar 11, 2025)
f0aa50b - update docs (Mar 11, 2025)
4d05bc5 - revert test (Mar 12, 2025)
59d65fe - fix test (Mar 12, 2025)
61e2ede - fix (Mar 12, 2025)
2 changes: 1 addition & 1 deletion llmfoundry/data/text_data.py
@@ -197,7 +197,7 @@ def __init__(

# How to tokenize a text sample to a token sample
def _tokenize(self, text_sample: Mapping) -> dict[str, list[int]]:
- if self.tokenizer._pad_token is None:
+ if self.tokenizer.pad_token is None:
# Some tokenizers (e.g. GPT2 tokenizer) have no padding token which causes bugs
raise RuntimeError(
'If tokenizing on-the-fly, tokenizer must have a pad_token_id',
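Note on this change: the check now goes through the public `pad_token` accessor rather than the private `_pad_token` attribute, presumably because transformers v4.49 changed the underscore-prefixed internals. A minimal sketch of the same check outside foundry, assuming GPT-2's tokenizer (which ships without a padding token):

```python
# Sketch only: `pad_token` is the public accessor and returns None when unset.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    # Common workaround for GPT-2-style tokenizers: reuse EOS as the pad token.
    tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.pad_token_id)
```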
2 changes: 1 addition & 1 deletion llmfoundry/data/utils.py
@@ -211,7 +211,7 @@ def get_text_collator(
collate_fn = transformers.DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=mlm_probability is not None,
- mlm_probability=mlm_probability,
+ mlm_probability=mlm_probability if mlm_probability else 0,
)

if (eos_token_id is not None) or (bos_token_id is not None):
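The `else 0` fallback presumably exists because newer collator versions expect a numeric `mlm_probability` even when `mlm=False`. A small sketch of the resulting call pattern (the GPT-2 tokenizer here is only an illustration, not the foundry configuration):

```python
# Sketch: for causal LM the probability is unused, but pass a number rather than None.
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

collate_fn = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,            # causal LM: labels are a copy of input_ids
    mlm_probability=0.0,  # numeric placeholder instead of None
)
batch = collate_fn([tokenizer('hello world'), tokenizer('foundry test')])
print(batch['input_ids'].shape)
```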
6 changes: 2 additions & 4 deletions llmfoundry/models/hf/hf_base.py
@@ -228,13 +228,11 @@ def build_inner_model(
Returns:
Union[PreTrainedModel, 'PeftModel']: The built inner model.
"""
- if not trust_remote_code and pretrained_model_name_or_path.startswith(
+ if pretrained_model_name_or_path.startswith(
'mosaicml/mpt',
):
raise ValueError(
- 'trust_remote_code must be set to True for MPT models. Without this, the MPT model code will come from the transformers library, '
- +
- 'which is significantly slower and not compatible with the LLM foundry training code, rather than the code release by MosaicML.',
+ 'MPT models are no longer supported by LLM Foundry due to transformer version v4.49.0 incompatibilities'
)
# Resolve "mixed" init device to either "cpu" or "meta"
resolved_init_device = hf_get_init_device(init_device)
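For context, the guard now fails fast for any `mosaicml/mpt*` checkpoint instead of falling back to the transformers implementation. A hedged, self-contained sketch of the behavior (this is not the actual `build_inner_model` signature, just the guard in isolation):

```python
# Illustrative only; mirrors the guard added in the diff.
def assert_supported(pretrained_model_name_or_path: str) -> None:
    if pretrained_model_name_or_path.startswith('mosaicml/mpt'):
        raise ValueError(
            'MPT models are no longer supported by LLM Foundry due to '
            'transformer version v4.49.0 incompatibilities',
        )

assert_supported('meta-llama/Llama-2-7b')  # passes silently
try:
    assert_supported('mosaicml/mpt-7b')
except ValueError as err:
    print(err)                              # raised for any MPT checkpoint
```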
9 changes: 5 additions & 4 deletions mcli/mcli-hf-eval.yaml
@@ -28,19 +28,20 @@ parameters:

models:
-
- model_name: mosaicml/mpt-7b-instruct
+ model_name: meta-llama/Llama-2-7b
# Tokenizer
tokenizer:
- name: EleutherAI/gpt-neox-20b
+ name: meta-llama/Llama-2-7b
kwargs:
model_max_length: ${max_seq_len}

model:
name: hf_causal_lm
- pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
+ pretrained_model_name_or_path: meta-llama/Llama-2-7b
init_device: mixed
pretrained: true
- use_auth_token: false
+ # Note: you must have set the HF_TOKEN environment variable and have access to the llama2 models
+ use_auth_token: true

# FSDP config for model sharding
fsdp_config:
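Since the example configs now point at gated Llama 2 weights, the run needs Hugging Face credentials. A minimal sketch of one way to supply them before launching (the `HF_TOKEN` variable is the one referenced in the YAML comment above; `huggingface_hub` is assumed to be installed and the Llama 2 license accepted on the Hub):

```python
# Sketch: authenticate against the Hub so `use_auth_token: true` can resolve the gated repo.
import os
from huggingface_hub import login

login(token=os.environ['HF_TOKEN'])  # or simply export HF_TOKEN in the job environment
```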
7 changes: 4 additions & 3 deletions scripts/eval/yamls/hf_eval.yaml
@@ -23,16 +23,17 @@ models:
model_max_length: ${variables.max_seq_len}
# # if you are evaluating more than one model, list them all as YAML blocks without variable interpolation
# -
- # model_name: mosaicml/mpt-7b
+ # model_name: meta-llama/Llama-2-7b
# model:
# name: hf_causal_lm
- # pretrained_model_name_or_path: mosaicml/mpt-7b
+ # pretrained_model_name_or_path: meta-llama/Llama-2-7b
# init_device: cpu
# pretrained: true
# config_overrides:
# max_seq_len: ${variables.max_seq_len}
+ # use_auth_token: true
# tokenizer:
- # name: mosaicml/mpt-7b
+ # name: meta-llama/Llama-2-7b
# kwargs:
# model_max_length: ${variables.max_seq_len}

(file name not captured)
@@ -7,18 +7,20 @@ run_name: # If left blank, will be read from env var $COMPOSER_RUN_NAME
# Model
model:
name: hf_causal_lm
- pretrained_model_name_or_path: mosaicml/mpt-7b
+ pretrained_model_name_or_path: meta-llama/Llama-2-7b
pretrained: true # false: only use the architecture; true: initialize with pretrained weights
config_overrides:
max_seq_len: ${max_seq_len}
attn_config:
attn_impl: flash
# Set this to `true` if using `train_loader.dataset.packing_ratio` below
attn_uses_sequence_id: false
+ # Note: you must have set the HF_TOKEN environment variable and have access to the llama2 models
+ use_auth_token: true

# Tokenizer
tokenizer:
- name: mosaicml/mpt-7b
+ name: meta-llama/Llama-2-7b
kwargs:
model_max_length: ${max_seq_len}

(file name not captured)
@@ -12,17 +12,19 @@ run_name: ${variables.run_name}
model:
name: hf_causal_lm
pretrained: true
- pretrained_model_name_or_path: mosaicml/mpt-7b
+ pretrained_model_name_or_path: meta-llama/Llama-2-7b
config_overrides:
max_seq_len: ${variables.max_seq_len}
attn_config:
attn_impl: flash
# Set this to `true` if using `train_loader.dataset.packing_ratio` below
attn_uses_sequence_id: false
+ # Note: you must have set the HF_TOKEN environment variable and have access to the llama2 models
+ use_auth_token: true

# Tokenizer
tokenizer:
- name: mosaicml/mpt-7b
+ name: meta-llama/Llama-2-7b
kwargs:
model_max_length: ${variables.max_seq_len}

(file name not captured)
@@ -15,16 +15,18 @@ run_name: # If left blank, will be read from env var $COMPOSER_RUN_NAME
model:
name: hf_causal_lm
pretrained: true
- pretrained_model_name_or_path: mosaicml/mpt-7b
+ pretrained_model_name_or_path: meta-llama/Llama-2-7b
config_overrides:
max_seq_len: ${variables.max_seq_len}
attn_config:
attn_impl: flash
attn_uses_sequence_id: false
+ # Note: you must have set the HF_TOKEN environment variable and have access to the llama2 models
+ use_auth_token: true

# Tokenizer
tokenizer:
- name: mosaicml/mpt-7b
+ name: meta-llama/Llama-2-7b
kwargs:
model_max_length: ${variables.max_seq_len}

2 changes: 1 addition & 1 deletion setup.py
@@ -55,7 +55,7 @@
'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.29.0,<0.30',
'mlflow>=2.14.1,<2.19',
'accelerate>=0.25,<1.4', # for HF inference `device_map`
- 'transformers>=4.43.2,<4.47',
+ 'transformers>=v4.49.0,<4.50',
'mosaicml-streaming>=0.11.0,<0.12',
'torch>=2.5.1,<2.5.2',
'datasets>=3.3.2,<3.4',
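With the pin moved to `transformers>=v4.49.0,<4.50`, a quick sanity check of the installed version can save a confusing failure later. A small sketch, assuming the `packaging` helper is available in the environment:

```python
# Illustrative check against the new lower bound from setup.py.
import transformers
from packaging import version

installed = version.parse(transformers.__version__)
assert installed >= version.parse('4.49.0'), (
    f'transformers {installed} is older than the 4.49.0 pin in setup.py'
)
```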
126 changes: 109 additions & 17 deletions tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import contextlib
+ import glob
import json
import math
import os
@@ -220,11 +221,43 @@ def check_hf_tokenizer_equivalence(
if attr1 is None and attr2 is None:
continue

- attr_value1 = attr1 if isinstance(attr1, str) else attr1.content
- attr_value2 = attr2 if isinstance(attr2, str) else attr2.content
+ # Handle the case when the attribute is an AddedToken object
+ attr_value1 = attr1 if isinstance(
+ attr1,
+ str,
+ ) else attr1.content if hasattr(attr1, 'content') else str(attr1)
+ attr_value2 = attr2 if isinstance(
+ attr2,
+ str,
+ ) else attr2.content if hasattr(attr2, 'content') else str(attr2)
assert attr_value1 == attr_value2

- assert tokenizer1.__dict__ == tokenizer2.__dict__
+ # Ignore 'extra_special_tokens' as it was added by the transformers library during save/load
+ if 'extra_special_tokens' in tokenizer2.init_kwargs and 'extra_special_tokens' not in tokenizer1.init_kwargs:
+ tokenizer2.init_kwargs.pop('extra_special_tokens')
+
+ # Process special tokens map and added tokens decoder
+ for dict_map_key in ['_special_tokens_map', '_added_tokens_decoder']:
+ if dict_map_key in tokenizer1.__dict__ and dict_map_key in tokenizer2.__dict__:
+ # Get the nested dictionaries
+ token_map1 = tokenizer1.__dict__[dict_map_key]
+ token_map2 = tokenizer2.__dict__[dict_map_key]
+
+ # Process values in the first tokenizer's map
+ for key in list(token_map1.keys()):
+ if hasattr(token_map1[key], 'content'):
+ token_map1[key] = token_map1[key].content
+
+ # Process values in the second tokenizer's map
+ for key in list(token_map2.keys()):
+ if hasattr(token_map2[key], 'content'):
+ token_map2[key] = token_map2[key].content
+
+ # Final comparison of dictionaries
+ t1_dict = tokenizer1.__dict__
+ t2_dict = tokenizer2.__dict__
+
+ assert t1_dict == t2_dict, 'Tokenizer dictionaries are not equal'


def remove_moe_world_size(config: MPTConfig):
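The extra `hasattr(..., 'content')` handling above exists because, after a save/load round trip, special-token attributes may come back as `AddedToken` objects rather than plain strings. A small sketch of the normalization the test now applies (assuming the `tokenizers` package that transformers depends on):

```python
# Sketch: normalize AddedToken objects and plain strings to comparable values.
from tokenizers import AddedToken

def token_str(tok):
    # Mirrors the test's normalization: use .content when present, else str().
    return tok.content if hasattr(tok, 'content') else str(tok)

assert token_str(AddedToken('<|endoftext|>')) == token_str('<|endoftext|>')
```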
@@ -274,6 +307,52 @@ def check_hf_model_equivalence(
assert torch.equal(p1.cpu(), p2.cpu())


+ def check_safetensors_precision(
+ model_path: str,
+ model: torch.nn.Module,
+ expected_precision: torch.dtype,
+ tolerance: float = 0.2,
+ ):
+ """Verify that the safetensors files in model_path have the expected size.
+
+ Args:
+ model_path: Path to the directory containing the safetensors files
+ model: The original model to count parameters from
+ expected_precision: The expected precision (torch.float32, torch.bfloat16, etc.)
+ tolerance: Allowed deviation from expected file size (as a ratio)
+
+ Returns:
+ bool: True if the safetensors files have the expected size, False otherwise
+ """
+ total_params = sum(p.numel() for p in model.parameters())
+ # Determine expected bytes per parameter based on precision
+ bytes_per_param = {
+ torch.float32: 4,
+ torch.float16: 2,
+ torch.bfloat16: 2,
+ torch.int8: 1,
+ }.get(expected_precision)
+ assert bytes_per_param
+
+ expected_size = total_params * bytes_per_param
+
+ safetensors_files = glob.glob(os.path.join(model_path, '*.safetensors'))
+ if not safetensors_files:
+ # If no safetensors files found, check pytorch_model.bin
+ safetensors_files = glob.glob(
+ os.path.join(model_path, 'pytorch_model*.bin'),
+ )
+
+ if not safetensors_files:
+ return False
+
+ total_size = sum(os.path.getsize(f) for f in safetensors_files)
+ size_ratio = total_size / expected_size
+
+ is_correct_size = (1.0 - tolerance) <= size_ratio <= (1.0 + tolerance)
+ return is_correct_size
+
+
# TODO(GRT-2435): Change to fixture
def delete_transformers_cache():
# Only delete the files on local rank 0, otherwise race conditions are created
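The helper's size check boils down to parameter count times bytes per dtype, compared against the files on disk within a tolerance. A tiny worked example of that arithmetic (the Linear layer is purely illustrative):

```python
# Sketch of the expected-size arithmetic used by check_safetensors_precision.
import torch.nn as nn

model = nn.Linear(1024, 1024)                  # ~1.05M parameters (weights + bias)
n_params = sum(p.numel() for p in model.parameters())
expected_bytes = n_params * 2                  # bfloat16/float16 -> 2 bytes per param
print(f'{n_params} params -> ~{expected_bytes / 1e6:.1f} MB expected on disk')
```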
@@ -578,15 +657,27 @@ def test_huggingface_conversion_callback_interval(
assert len(normal_checkpoints) == expected_normal_checkpoints
assert len(huggingface_checkpoints) == expected_hf_checkpoints

+ # Get path to the last checkpoint
+ checkpoint_path = os.path.join(
+ tmp_path,
+ 'checkpoints',
+ 'huggingface',
+ f'ba{batches_per_epoch}',
+ )
+
+ # Verify the safetensors file size matches the expected precision
+ is_size_correct = check_safetensors_precision(
+ model_path=checkpoint_path,
+ model=trainer.state.model.model,
+ expected_precision=precision,
+ )
+ assert is_size_correct, f"Safetensors file size doesn't match expected precision {precision_str}"
+
# Load the last huggingface checkpoint
loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
- os.path.join(
- tmp_path,
- 'checkpoints',
- 'huggingface',
- f'ba{batches_per_epoch}',
- ),
+ checkpoint_path,
trust_remote_code=True,
+ torch_dtype=precision,
)

# Check that the loaded model has the correct precision, and then set it back
@@ -603,15 +694,16 @@
loaded_model.config.init_device = original_model.model.config.init_device

loaded_tokenizer = transformers.AutoTokenizer.from_pretrained(
- os.path.join(
- tmp_path,
- 'checkpoints',
- 'huggingface',
- f'ba{batches_per_epoch}',
- ),
+ checkpoint_path,
trust_remote_code=True,
)

+ # Also check that at least one parameter has the expected precision
+ for param_name, param in loaded_model.named_parameters():
+ assert param.dtype == precision, \
+ f'Parameter {param_name} has dtype {param.dtype}, expected {precision}'
+ break

check_hf_model_equivalence(
trainer.state.model.model.to(precision),
loaded_model,
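The new `torch_dtype=precision` argument and the per-parameter dtype assertion both rely on `from_pretrained` materializing weights in the requested dtype. A short sketch of that pattern with a public model (gpt2 here is only a stand-in for the exported checkpoint directory):

```python
# Sketch: torch_dtype controls the dtype the checkpoint weights are loaded in.
import torch
import transformers

loaded = transformers.AutoModelForCausalLM.from_pretrained(
    'gpt2',                        # stand-in for the exported HF checkpoint path
    torch_dtype=torch.bfloat16,
)
assert next(loaded.parameters()).dtype == torch.bfloat16
```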
@@ -873,6 +965,7 @@ def _assert_checkpoint_equivalence(
loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
checkpoint_path,
trust_remote_code=True,
+ torch_dtype=precision,
)

# Check that the loaded model has the correct precision, and then set it back
@@ -1426,7 +1519,6 @@ def test_mptmoe_huggingface_conversion_callback(
device_batch_size = 1
dataset_size = 2
precision_str = 'float32'
- precision = torch.float32
batches_per_epoch = math.ceil(dataset_size / (device_batch_size * 2))

checkpointer_callback = HuggingFaceCheckpointer(
@@ -1617,7 +1709,7 @@ def test_mptmoe_huggingface_conversion_callback(

# Check that the loaded model has the correct precision, and then set it back
# to the original for the equivalence check
- assert loaded_model.config.torch_dtype == precision
+ assert loaded_model.config.torch_dtype == precision_str
loaded_model.config.torch_dtype = original_model.model.config.torch_dtype

loaded_tokenizer = transformers.AutoTokenizer.from_pretrained(
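The last assertion now compares against `precision_str` rather than the `torch.dtype` object, presumably because the config round-trips the dtype as a plain string in this path. The distinction is easy to see in isolation:

```python
# Sketch: a torch.dtype never compares equal to its string name.
import torch

assert torch.float32 != 'float32'
assert str(torch.float32) == 'torch.float32'
```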