experiment_setup:
  executable: 'code_transformer/experiments/code_transformer/code_summarization.py'

data_setup:
  language: 'python,javascript,ruby,go' # The dataset (language) to use
  filter_language: python # Only relevant for multi-language datasets. Lets only snippets of the specified language through (used for fine-tuning)
  use_validation: True
  num_sub_tokens: 5 # Number of sub-tokens that input tokens should be split into
  num_subtokens_output: 6 # Number of sub-tokens of the method name to predict
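  # Illustrative example (token not from this config): with num_sub_tokens=5, an
  # input token such as getUserName is split into [get, user, name] and padded
  # to length 5; predicted method names are capped at num_subtokens_output=6.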
| 10 | + use_only_ast: False # Whether to use the only-ast ablation |
| 11 | + mask_all_tokens: False # Only relevant if use_only_ast=True. Replaces all input tokens with a dummy token |
| 12 | + use_no_punctuation: True # Whether to drop punctuation tokens before feeding the snippets into the model |
| 13 | + use_pointer_network: True # Whether to use a pointer network in the decoder |
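  # Note: the pointer network lets the decoder copy sub-tokens verbatim from the
  # input snippet (e.g., parts of rare identifiers) instead of generating them
  # from the closed output vocabulary.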
  sort_by_length: False # Whether to sort loaded slices by number of tokens. Useful to minimize the amount of zero-padding needed
  shuffle: False # Whether the load order of snippets should be randomized
  chunk_size: 32 # Only relevant if shuffle=True and sort_by_length=True. Snippets will be grouped into chunks of `chunk_size`, which are then randomly shuffled.
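  # Illustrative: sorting by length and then shuffling fixed-size chunks yields
  # batches of similar-length snippets (little padding) while still randomizing
  # the order in which chunks are seen.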

data_transforms:
  max_distance_mask: None # Mask nodes if their relative distances exceed a certain threshold
  # max_distance_mask:
  #   shortest_paths: 5
  #   ppr: -1
  #   sibling_sp: -1
  #   ancestor_sp: -1

  relative_distances: # Which relative distances to use (have to be pre-computed in stage 2 preprocessing)
    - ppr
    - ancestor_sp
    - sibling_sp
    - shortest_paths

  distance_binning: # Distance binning for dealing with real-valued distances
    type: 'exponential' # "exponential" or "equal". Exponential binning has more diversified (smaller) bins for smaller distances
    growth_factor: 1.3
    n_fixed_bins: 9
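  # Illustrative (assuming bin widths grow geometrically by growth_factor): with
  # growth_factor=1.3, successive bin widths scale roughly as 1.0, 1.3, 1.69,
  # 2.20, ..., so small distances are resolved more finely than large ones.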

transfer_learning: # Load a pretrained model for fine-tuning
  use_pretrained_model: False
  model_type: 'ct_code_summarization'
  run_id: CT-23
  snapshot_iteration: 10
  cpu: False
  freeze_encoder_layers: None # None, "all" or an integer. Whether and how many encoder layers to keep frozen during fine-tuning
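  # e.g., to keep only the two lowest encoder layers frozen (assumed semantics
  # of the integer option):
  # freeze_encoder_layers: 2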

model:
  with_cuda: True # Run model on GPU
  label_smoothing: 0.1 # Apply label smoothing to ground truth
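  # Illustrative (standard label smoothing; the exact formulation lives in the
  # loss implementation): with label_smoothing=0.1 and output vocabulary size V,
  # the true sub-token receives target probability 0.9 and the remaining 0.1 is
  # spread uniformly over the other V - 1 entries.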
  lm_encoder: # Hyperparameters of the encoder
    input_nonlinearity: 'tanh'
    num_languages: None # Only relevant for multi-language datasets. How many different languages have been fused together
    transformer: # CodeTransformer hyperparameters
      num_layers: 1
      encoder_layer:
        d_model: 1024 # Internal embedding dimension
        nhead: 8 # Number of attention heads
        dim_feedforward: 2048 # Dimension of feed-forward layer
        dropout: 0.2
        activation: 'gelu'
        use_content_content: True # Whether to use the content-content term in attention computation
        use_content_pos: True # Whether to use the content-position term in attention computation
        use_pos_content: True # Whether to use the position-content term in attention computation
        use_pos_pos: True # Whether to use the position-position term in attention computation
        use_token_distances: True # Whether to also compute the simple hop-distance between the input tokens
  lm_decoder: # Hyperparameters of the decoder
    output_nonlinearity: None
    n_layers: 1
    decoder_dropout: 0
    decoder_nhead: 8
    decoder_dim_feedforward: 2048
    decoder_activation: 'gelu'
    use_teacher_forcing: True # Whether to use teacher forcing during training (the label is fed into the decoder instead of the prediction for the previous position)
    pointer_attention_type: 'additive' # Attention type in the pointer network. "scaled_dot_product", "multihead" or "additive"
    use_pointer_query_self_attention: False # Whether to use self-attention between pointer query and decoder input
    concat_query_and_pointer: True # Whether to also use the query-stream of the encoder output to guide the pointer query
    attend_cls_token: False # Whether other tokens may attend to the CLS token (False masks it out in attention)

optimizer:
  optimizer: 'Adam'
  learning_rate: 8e-5
  reg_scale: 3e-5

  # scheduler: 'OneCycleLR'
  # scheduler_params:
  #   max_lr: 1e-4
  #   steps_per_epoch: 4000 # ~500000 samples / simulated_batch_size 128, rounded up
  #   epochs: 30
  #   pct_start: 0.3

  # scheduler: 'MultiStepLR'
  # scheduler_params:
  #   milestones: [1500, 5000]
  #   gamma: 0.1

training:
  random_seed: 456
  batch_size: 8
  simulated_batch_size: 128 # Gradient accumulation. After how many samples the model parameters should be updated
  simulated_batch_size_valid: 1280 # Over how many samples validation metrics should be calculated
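  # Illustrative arithmetic: with batch_size=8, gradients are accumulated over
  # 128 / 8 = 16 loop iterations per optimizer step, and validation metrics are
  # aggregated over 1280 / 8 = 160 batches.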
  accumulate_tokens_batch: False
  validate_every: 100 # Counted in number of parameter updates (simulated_batch_size). How often approximate evaluation should be done
  persistent_snapshot_every: 10000 # Counted in loop iterations (batch_size). Also starts a full evaluation on the validation set to guide early stopping
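  # Illustrative arithmetic: validate_every=100 parameter updates corresponds to
  # 100 * 128 = 12,800 training samples; persistent_snapshot_every=10000 loop
  # iterations corresponds to 10000 * 8 = 80,000 samples.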
  early_stopping_patience: 20 # How many consecutive full evaluations may be worse than the current best before training is stopped
  max_validation_samples: 50000 # Limit number of samples for full evaluation on larger datasets to speed up training
  metrics: # Which metrics to log
    - top1_accuracy
    - top5_accuracy
    - non_trivial_accuracy
    - precision
    - recall
    - f1_score
    - micro_f1_score
    - rouge_2
    - rouge_l