
Pbinder/auto resume #766

Draft · wants to merge 7 commits into main

Conversation

polinabinder1
Collaborator

PEFT checkpointing and inference for esm2.

Signed-off-by: Polina Binder <[email protected]>
@codecov-commenter

❌ 2 Tests Failed:

Tests completed | Failed | Passed | Skipped
825             | 2      | 823    | 12
View the top 2 failed test(s) by shortest run time
sub-packages/bionemo-esm2/tests/bionemo/esm2/model/finetune/test_finetune.py::sub-packages.bionemo-esm2.tests.bionemo.esm2.model.finetune.test_finetune
Stack Traces | 0s run time
ImportError while importing test module '.../model/finetune/test_finetune.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
.../usr/lib/python3.12/importlib/__init__.py:90: in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
.../model/finetune/test_finetune.py:38: in <module>
    from bionemo.esm2.model.finetune.finetune_regressor import (
E   ModuleNotFoundError: No module named 'bionemo.esm2.model.finetune.finetune_regressor'
sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py::test_finetune_geneformer_with_peft
Stack Traces | 60.4s run time
tmpdir = local('.../pytest-of-root/pytest-4/test_finetune_geneformer_with_0')
geneformer_config = GeneformerConfig(tensor_model_parallel_size=1, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size...tion'>, train_metric=None, valid_metric=None, model_cls=<class 'bionemo.llm.model.biobert.model.MegatronBioBertModel'>)
n_layers_test = 3, n_steps_train = 100, batch_size = 16

    @pytest.mark.needs_gpu
    def test_finetune_geneformer_with_peft(
        tmpdir, geneformer_config: GeneformerConfig, n_layers_test: int = 3, n_steps_train: int = 100, batch_size: int = 16
    ):
        base_geneformer_config = io.reinit(geneformer_config)  # generate a new copy by calling the cached init.
    
        # Modify both the variable and associated saved init hyper-param by calling config.mutate(...)
        base_geneformer_config.set_hparam("return_only_hidden_states", False)
        base_geneformer_config.set_hparam("nemo1_ckpt_path", None)
        base_geneformer_config.set_hparam("num_layers", n_layers_test)  # set to 3 layers
        base_geneformer_config.set_hparam("hidden_size", 128)
        base_geneformer_config.set_hparam("ffn_hidden_size", 256)
        # Turn off dropout for this quick test
        base_geneformer_config.set_hparam("attention_dropout", 0.0)
        base_geneformer_config.set_hparam("hidden_dropout", 0.0)
        # Re-initialize after manually updating hidden_size/ffn_hidden_size since so many other parameters
        #  are based off of these parameters and modified in post_init of the transformer config.
        base_geneformer_config = io.reinit(base_geneformer_config)
        assert base_geneformer_config.num_layers == n_layers_test
        assert base_geneformer_config.nemo1_ckpt_path is None
        assert not base_geneformer_config.return_only_hidden_states
        with megatron_parallel_state_utils.distributed_model_parallel_state(32):
            ckpt_path, initial_metrics, initial_trainer = _train_model_get_ckpt(
                name="test_experiment",
                root_dir=tmpdir / "pretrain",
                config=base_geneformer_config,
                n_steps_train=n_steps_train,
                batch_size=batch_size,
                lr=5e-4,
            )
            weights_ckpt = ckpt_path / "weights"
            assert weights_ckpt.exists()
            assert weights_ckpt.is_dir()
            assert io.is_distributed_ckpt(weights_ckpt)
            assert initial_trainer.model.config.num_layers == n_layers_test
            assert sum(initial_metrics.collection_train["loss"][:10]) > sum(initial_metrics.collection_train["loss"][-10:])
        with megatron_parallel_state_utils.distributed_model_parallel_state(43):
            ft_geneformer_config = FineTuneSeqLenBioBertConfig(
            # All other hparams will be pulled from this checkpoint, aside from those in `override_parent_fields`
                initial_ckpt_path=str(ckpt_path),
            )
            peft = LoRAForGeneFormerTokenRegressor()
            simple_ft_checkpoint, simple_ft_metrics, ft_trainer = _train_model_get_ckpt(
                name="finetune_new_head",
                root_dir=tmpdir / "finetune_new_head",  # new checkpoint will land in a subdir of this
                config=ft_geneformer_config,  # same config as before since we are just continuing training
                n_steps_train=n_steps_train,
                batch_size=batch_size,
                peft=peft,
                lr=5e-3,
            )
            weights_ckpt = simple_ft_checkpoint / "weights"
            assert weights_ckpt.exists()
            assert weights_ckpt.is_dir()
            assert io.is_distributed_ckpt(weights_ckpt)
            assert ft_trainer.model.config.num_layers == n_layers_test
            assert sum(simple_ft_metrics.collection_train["loss"][:10]) > sum(
                simple_ft_metrics.collection_train["loss"][-10:]
            )
    
            model = ft_trainer.model[0].module.module.module
            assert all(not p.requires_grad for p in model.embedding.parameters())
            assert all(not p.requires_grad for name, p in model.encoder.named_parameters() if "adapter" not in name)
            assert all(p.requires_grad for name, p in model.encoder.named_parameters() if "adapter" in name)
            assert all(p.requires_grad for p in model.regression_head.parameters())
>           assert False
E           assert False

.../bionemo/geneformer/test_model.py:1085: AssertionError


if args.lora_checkpoint_path and not args.lora_finetune:
    parser.error("Argument --lora-checkpoint-path cannot be set unless --lora-finetune is enabled.")

with megatron_parallel_state_utils.distributed_model_parallel_state(43):

should this seed be surfaced somewhere?
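
One way to surface it, sketched under the assumption that a module-level constant (or a CLI/config value) is acceptable; the name DEFAULT_PARALLEL_STATE_SEED is hypothetical and megatron_parallel_state_utils is the utility already imported by the test module.

# Hypothetical: lift the magic number into a named constant (or a CLI/config value)
# so the seed is visible to callers and the run stays reproducible.
DEFAULT_PARALLEL_STATE_SEED = 43

with megatron_parallel_state_utils.distributed_model_parallel_state(DEFAULT_PARALLEL_STATE_SEED):
    ...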

@trvachov
Collaborator

The resulting model finetuned with LoRA has this file...do you know what it's supposed to have in it?

cat /results/lora_model/checkpoint-step\=999-consumed_samples\=64000.0-last/weights/adapter_metadata.json
{"model_ckpt_path": "None"}
