# llama-2-7b-arc-easy--gpu.yaml
max_seq_len: 2048
global_seed: 17

# Run Name
run_name:  # If left blank, will be read from env var $COMPOSER_RUN_NAME

# Model
model:
  name: hf_causal_lm
  pretrained_model_name_or_path: meta-llama/Llama-2-7b-hf
  pretrained: true  # false: only use the architecture; true: initialize with pretrained weights
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_impl: flash
      # Set this to `true` if using `train_loader.dataset.packing_ratio` below
      attn_uses_sequence_id: false
  # Note: you must have set the HF_TOKEN environment variable and have access to the llama2 models
  use_auth_token: true
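  # (example, not part of the original file) e.g. run `export HF_TOKEN=<your Hugging Face access token>`
  # in the shell that launches training so the gated weights can be downloaded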

# Tokenizer
tokenizer:
  name: meta-llama/Llama-2-7b-hf
  kwargs:
    model_max_length: ${max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    ############
    hf_name: json
    hf_kwargs:
      # Note: absolute paths for data_dir are more reliable;
      # relative paths will be interpreted relative to whatever your
      # working directory is when you run `train.py`
      data_dir: finetune_example
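      # e.g. data_dir: /abs/path/to/llm-foundry/scripts/train/finetune_example  # hypothetical absolute path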
    # Note: `scripts/train` will be the working directory when resolving
    # the preprocessing_fn import path
    preprocessing_fn: finetune_example.preprocessing:multiple_choice
    split: train
    ############
    shuffle: true
    max_seq_len: ${max_seq_len}
    decoder_only_format: true
    # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with
    # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion
    # # of the dataset.
    # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
    # # to profile this run's optimal packing_ratio as it depends on GPU count,
    # # batch size, sequence length
    # packing_ratio:
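    # e.g. packing_ratio: auto  # (example) enables the automatic profiling described above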
  drop_last: true
  num_workers: 8

# Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 100ba
  alpha_f: 0.1

optimizer:
  name: decoupled_adamw
  lr: 6.0e-4
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  weight_decay: 0.0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 1ep
eval_interval: 1
eval_first: false
eval_subset_num_batches: -1
global_train_batch_size: 8

# System
seed: ${global_seed}
device_eval_batch_size: 1
device_train_microbatch_size: 1
# device_train_microbatch_size: auto
precision: amp_bf16

# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}

# loggers:
#   wandb: {}
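# # (example sketch, not part of the original file) e.g. to log to a specific W&B project:
# loggers:
#   wandb:
#     project: my-project  # hypothetical project name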

# Checkpoint to local filesystem or remote object store
# save_interval: 500ba
# save_num_checkpoints_to_keep: 1  # Important, this cleans up checkpoints saved to DISK
# save_folder: ./{run_name}/checkpoints
# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints

# Load from local filesystem or remote object store
# load_path: ./gpt-125m/checkpoints/latest-rank{rank}.pt
# load_path: s3://my-bucket/my-folder/gpt-125m/checkpoints/latest-rank{rank}.pt
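
# Example launch (a sketch, not part of the original config): from llm-foundry's
# `scripts/train` directory, something like
#   composer train.py finetune_example/llama-2-7b-arc-easy--gpu.yaml
# adjusting the YAML path and GPU count to your setup.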