From baae75e077ab2fcff995c2dffb3ef4aa5e484c10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Z=C3=BCgner?= Date: Thu, 23 Jan 2025 11:40:51 +0100 Subject: [PATCH] Lazy lfs download (#38) --------- Co-authored-by: Daniel Zuegner --- .lfsconfig | 2 ++ README.md | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 .lfsconfig diff --git a/.lfsconfig b/.lfsconfig new file mode 100644 index 0000000..2cf1bce --- /dev/null +++ b/.lfsconfig @@ -0,0 +1,2 @@ +[lfs] + fetchexclude = * diff --git a/README.md b/README.md index 35d4402..93c2259 100644 --- a/README.md +++ b/README.md @@ -65,11 +65,10 @@ export PYTORCH_ENABLE_MPS_FALLBACK=1 # required to run MatterGen on Apple Silic ### Install Git LFS -If Git LFS was not installed before you cloned this repo, you can install it and download the missing files via: +If Git LFS was not installed before you cloned this repo, you can install it via: ```bash sudo apt install git-lfs git lfs install -git lfs pull # this downloads the missing files ``` ## Get started with a pre-trained model @@ -94,6 +93,7 @@ To sample from the pre-trained base model, run the following command. ```bash export MODEL_PATH=checkpoints/mattergen_base # Or provide your own model export RESULTS_PATH=results/ # Samples will be written to this directory +git lfs pull -I $MODEL_PATH --exclude="" # first download the checkpoint file from Git LFS # generate batch_size * num_batches samples python scripts/generate.py $RESULTS_PATH $MODEL_PATH --batch_size=16 --num_batches 1 @@ -111,6 +111,7 @@ For example, to sample from the model trained on magnetic density, you can run t export MODEL_NAME=dft_mag_density export MODEL_PATH="checkpoints/$MODEL_NAME" # Or provide your own model export RESULTS_PATH="results/$MODEL_NAME/" # Samples will be written to this directory, e.g., `results/dft_mag_density` +git lfs pull -I $MODEL_PATH --exclude="" # first download the checkpoint file from Git LFS # Generate conditional samples with a target magnetic density of 0.15 python scripts/generate.py $RESULTS_PATH $MODEL_PATH --batch_size=16 --checkpoint_epoch=last --properties_to_condition_on="{'dft_mag_density': 0.15}" --diffusion_guidance_factor=2.0 @@ -125,12 +126,14 @@ Adapt the following command to your specific needs: export MODEL_NAME=chemical_system_energy_above_hull export MODEL_PATH="checkpoints/$MODEL_NAME" # Or provide your own model export RESULTS_PATH="results/$MODEL_NAME/" # Samples will be written to this directory, e.g., `results/dft_mag_density` +git lfs pull -I $MODEL_PATH --exclude="" # first download the checkpoint file from Git LFS python scripts/generate.py $RESULTS_PATH $MODEL_PATH --batch_size=16 --checkpoint_epoch=last --properties_to_condition_on="{'energy_above_hull': 0.05, 'chemical_system': 'Li-O'}" --diffusion_guidance_factor=2.0 ``` ## Evaluation Once you have generated a list of structures contained in `$RESULTS_PATH` (either using MatterGen or another method), you can relax the structures using the default MatterSim machine learning force field (see [repository](https://github.com/microsoft/mattersim)) and compute novelty, uniqueness, stability (using energy estimated by MatterSim), and other metrics via the following command: ```bash +git lfs pull -I data-release/alex-mp/reference_MP2020correction.gz --exclude="" # first download the reference dataset from Git LFS python scripts/evaluate.py --structures_path=$RESULTS_PATH --relax=True --structure_matcher='disordered' --save_as="$RESULTS_PATH/metrics.json" ``` This script will write `metrics.json` containing the metric results to `$RESULTS_PATH` and will print it to your console. @@ -143,6 +146,7 @@ This script will write `metrics.json` containing the metric results to `$RESULTS If, instead, you have relaxed the structures and obtained the relaxed total energies via another mean (e.g., DFT), you can evaluate the metrics via: ```bash +git lfs pull -I data-release/alex-mp/reference_MP2020correction.gz --exclude="" # first download the reference dataset from Git LFS python scripts/evaluate.py --structures_path=$RESULTS_PATH --energies_path='energies.npy' --relax=False --structure_matcher='disordered' --save_as='metrics' ``` This script will try to read structures from disk in the following precedence order: @@ -158,6 +162,8 @@ Before we can train MatterGen from scratch, we have to unpack and preprocess the You can run the following command for `mp_20`: ```bash +# Download file from LFS +git lfs pull -I data-release/mp-20/ --exclude="" unzip data-release/mp-20/mp_20.zip -d datasets python scripts/csv_to_dataset.py --csv-folder datasets/mp_20/ --dataset-name mp_20 --cache-folder datasets/cache ``` @@ -165,6 +171,8 @@ You will get preprocessed data files in `datasets/cache/mp_20`. To preprocess our larger `alex_mp_20` dataset, run: ```bash +# Download file from LFS +git lfs pull -I data-release/alex-mp/alex_mp_20.zip --exclude="" unzip data-release/alex-mp/alex_mp_20.zip -d datasets python scripts/csv_to_dataset.py --csv-folder datasets/alex_mp_20/ --dataset-name alex_mp_20 --cache-folder datasets/cache ``` @@ -209,6 +217,7 @@ Assume that you have a MatterGen base model at `$MODEL_PATH` (e.g., `checkpoints ```bash export PROPERTY=dft_mag_density export MODEL_PATH=checkpoints/mattergen_base +git lfs pull -I $MODEL_PATH --exclude="" # first download the checkpoint file from Git LFS python scripts/finetune.py adapter.model_path=$MODEL_PATH data_module=mp_20 +lightning_module/diffusion_module/model/property_embeddings@adapter.adapter.property_embeddings_adapt.$PROPERTY=$PROPERTY ~trainer.logger data_module.properties=["$PROPERTY"] ``` `dft_mag_density` denotes the target property for fine-tuning. @@ -226,6 +235,7 @@ You can also fine-tune MatterGen on multiple properties. For instance, to fine-t export PROPERTY1=dft_mag_density export PROPERTY2=dft_band_gap export MODEL_PATH=checkpoints/mattergen_base +git lfs pull -I $MODEL_PATH --exclude="" # first download the checkpoint file from Git LFS python scripts/finetune.py adapter.model_path=$MODEL_PATH data_module=mp_20 +lightning_module/diffusion_module/model/property_embeddings@adapter.adapter.property_embeddings_adapt.$PROPERTY1=$PROPERTY1 +lightning_module/diffusion_module/model/property_embeddings@adapter.adapter.property_embeddings_adapt.$PROPERTY2=$PROPERTY2 ~trainer.logger data_module.properties=["$PROPERTY1","$PROPERTY2"] ``` > [!TIP]