From baae75e077ab2fcff995c2dffb3ef4aa5e484c10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Z=C3=BCgner?= <daniel.zuegner@gmail.com>
Date: Thu, 23 Jan 2025 11:40:51 +0100
Subject: [PATCH] Lazy lfs download (#38)

---------

Co-authored-by: Daniel Zuegner <dzuegner@microsoft.com>
---
 .lfsconfig |  2 ++
 README.md  | 14 ++++++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)
 create mode 100644 .lfsconfig

diff --git a/.lfsconfig b/.lfsconfig
new file mode 100644
index 0000000..2cf1bce
--- /dev/null
+++ b/.lfsconfig
@@ -0,0 +1,2 @@
+[lfs]
+	fetchexclude = *
diff --git a/README.md b/README.md
index 35d4402..93c2259 100644
--- a/README.md
+++ b/README.md
@@ -65,11 +65,10 @@ export PYTORCH_ENABLE_MPS_FALLBACK=1  # required to run MatterGen on Apple Silic
 
 
 ### Install Git LFS
-If Git LFS was not installed before you cloned this repo, you can install it and download the missing files via:
+If Git LFS was not installed before you cloned this repo, you can install it via:
 ```bash
 sudo apt install git-lfs
 git lfs install
-git lfs pull  # this downloads the missing files
 ```
 
 ## Get started with a pre-trained model
@@ -94,6 +93,7 @@ To sample from the pre-trained base model, run the following command.
 ```bash
 export MODEL_PATH=checkpoints/mattergen_base  # Or provide your own model
 export RESULTS_PATH=results/  # Samples will be written to this directory
+git lfs pull -I $MODEL_PATH --exclude=""  # first download the checkpoint file from Git LFS
 
 # generate batch_size * num_batches samples
 python scripts/generate.py $RESULTS_PATH $MODEL_PATH --batch_size=16 --num_batches 1
@@ -111,6 +111,7 @@ For example, to sample from the model trained on magnetic density, you can run t
 export MODEL_NAME=dft_mag_density
 export MODEL_PATH="checkpoints/$MODEL_NAME"  # Or provide your own model
 export RESULTS_PATH="results/$MODEL_NAME/"  # Samples will be written to this directory, e.g., `results/dft_mag_density`
+git lfs pull -I $MODEL_PATH --exclude=""  # first download the checkpoint file from Git LFS
 
 # Generate conditional samples with a target magnetic density of 0.15
 python scripts/generate.py $RESULTS_PATH $MODEL_PATH --batch_size=16 --checkpoint_epoch=last --properties_to_condition_on="{'dft_mag_density': 0.15}" --diffusion_guidance_factor=2.0
@@ -125,12 +126,14 @@ Adapt the following command to your specific needs:
 export MODEL_NAME=chemical_system_energy_above_hull
 export MODEL_PATH="checkpoints/$MODEL_NAME"  # Or provide your own model
 export RESULTS_PATH="results/$MODEL_NAME/"  # Samples will be written to this directory, e.g., `results/dft_mag_density`
+git lfs pull -I $MODEL_PATH --exclude=""  # first download the checkpoint file from Git LFS
 python scripts/generate.py $RESULTS_PATH $MODEL_PATH --batch_size=16 --checkpoint_epoch=last --properties_to_condition_on="{'energy_above_hull': 0.05, 'chemical_system': 'Li-O'}" --diffusion_guidance_factor=2.0
 ```
 ## Evaluation
 
 Once you have generated a list of structures contained in `$RESULTS_PATH` (either using MatterGen or another method), you can relax the structures using the default MatterSim machine learning force field (see [repository](https://github.com/microsoft/mattersim)) and compute novelty, uniqueness, stability (using energy estimated by MatterSim), and other metrics via the following command:
 ```bash
+git lfs pull -I data-release/alex-mp/reference_MP2020correction.gz --exclude=""  # first download the reference dataset from Git LFS
 python scripts/evaluate.py --structures_path=$RESULTS_PATH --relax=True --structure_matcher='disordered' --save_as="$RESULTS_PATH/metrics.json"
 ```
 This script will write `metrics.json` containing the metric results to `$RESULTS_PATH` and will print it to your console.
@@ -143,6 +146,7 @@ This script will write `metrics.json` containing the metric results to `$RESULTS
 
 If, instead, you have relaxed the structures and obtained the relaxed total energies via another mean (e.g., DFT), you can evaluate the metrics via:
 ```bash
+git lfs pull -I data-release/alex-mp/reference_MP2020correction.gz --exclude=""  # first download the reference dataset from Git LFS
 python scripts/evaluate.py --structures_path=$RESULTS_PATH --energies_path='energies.npy' --relax=False --structure_matcher='disordered' --save_as='metrics'
 ```
 This script will try to read structures from disk in the following precedence order:
@@ -158,6 +162,8 @@ Before we can train MatterGen from scratch, we have to unpack and preprocess the
 
 You can run the following command for `mp_20`:
 ```bash
+# First download the mp_20 dataset from Git LFS
+git lfs pull -I data-release/mp-20/ --exclude=""
 unzip data-release/mp-20/mp_20.zip -d datasets
 python scripts/csv_to_dataset.py --csv-folder datasets/mp_20/ --dataset-name mp_20 --cache-folder datasets/cache
 ```
@@ -165,6 +171,8 @@ You will get preprocessed data files in `datasets/cache/mp_20`.
 
 To preprocess our larger `alex_mp_20` dataset, run:
 ```bash
+# First download the alex_mp_20 dataset archive from Git LFS
+git lfs pull -I data-release/alex-mp/alex_mp_20.zip --exclude=""
 unzip data-release/alex-mp/alex_mp_20.zip -d datasets
 python scripts/csv_to_dataset.py --csv-folder datasets/alex_mp_20/ --dataset-name alex_mp_20 --cache-folder datasets/cache
 ```
@@ -209,6 +217,7 @@ Assume that you have a MatterGen base model at `$MODEL_PATH` (e.g., `checkpoints
 ```bash
 export PROPERTY=dft_mag_density
 export MODEL_PATH=checkpoints/mattergen_base
+git lfs pull -I $MODEL_PATH --exclude=""  # first download the checkpoint file from Git LFS
 python scripts/finetune.py adapter.model_path=$MODEL_PATH data_module=mp_20 +lightning_module/diffusion_module/model/property_embeddings@adapter.adapter.property_embeddings_adapt.$PROPERTY=$PROPERTY ~trainer.logger data_module.properties=["$PROPERTY"]
 ```
 `dft_mag_density` denotes the target property for fine-tuning. 
@@ -226,6 +235,7 @@ You can also fine-tune MatterGen on multiple properties. For instance, to fine-t
 export PROPERTY1=dft_mag_density
 export PROPERTY2=dft_band_gap 
 export MODEL_PATH=checkpoints/mattergen_base
+git lfs pull -I $MODEL_PATH --exclude=""  # first download the checkpoint file from Git LFS
 python scripts/finetune.py adapter.model_path=$MODEL_PATH data_module=mp_20 +lightning_module/diffusion_module/model/property_embeddings@adapter.adapter.property_embeddings_adapt.$PROPERTY1=$PROPERTY1 +lightning_module/diffusion_module/model/property_embeddings@adapter.adapter.property_embeddings_adapt.$PROPERTY2=$PROPERTY2 ~trainer.logger data_module.properties=["$PROPERTY1","$PROPERTY2"]
 ```
 > [!TIP]