
Commit 491a75a ("initial")
Parent: 30e4771

File tree: 247 files changed, +28665 -1 lines


.gitignore (+7)

# Python cache
__pycache__
*.py[cod]

# IntelliJ/Jupyter
.idea
.ipynb_checkpoints

LICENSE (+21)

MIT License

Copyright (c) 2021 Daniel Zügner and Tobias Kirschstein, Technical University of Munich

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md (+400 -1)

Large diffs are not rendered by default.
New file:
@@ -0,0 +1,7 @@

from enum import Enum


class AttentionType(Enum):
    SCALED_DOT_PRODUCT = "scaled_dot_product"
    ADDITIVE = "additive"
    MULTIHEAD = "multihead"
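The enum's string values match the strings that appear in the YAML configuration further below, so configuration strings can be coerced directly into enum members; the decoder configuration in this commit does exactly that for its pointer_attention_type argument. A minimal illustrative sketch, not part of the diff:

from code_transformer.configuration.attention import AttentionType

# A configuration string maps onto the corresponding enum member.
attention = AttentionType("additive")
assert attention is AttentionType.ADDITIVE
assert attention.value == "additive"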
New file:
@@ -0,0 +1,48 @@

from code_transformer.configuration.configuration_utils import ModelConfiguration


class CodeTransformerLayerConfig(ModelConfiguration):
    def __init__(self,
                 d_model=256,
                 nhead=8,
                 dim_feedforward=1024,
                 activation="gelu",
                 dropout=0.1,

                 num_relative_distances=1,
                 use_token_distances=False,
                 use_edge_embeddings=False,
                 use_content_content=True,
                 use_content_pos=True,
                 use_pos_content=True,
                 use_pos_pos=True):
        super(CodeTransformerLayerConfig, self).__init__()
        self.d_model = d_model
        self.nhead = nhead
        self.dim_feedforward = dim_feedforward
        self.activation = activation
        self.dropout = dropout
        self.num_relative_distances = num_relative_distances
        self.use_token_distances = use_token_distances
        self.use_edge_embeddings = use_edge_embeddings
        self.use_content_content = use_content_content
        self.use_content_pos = use_content_pos
        self.use_pos_content = use_pos_content
        self.use_pos_pos = use_pos_pos


class CodeTransformerCoreConfig(ModelConfiguration):
    def __init__(self,
                 encoder_layer: CodeTransformerLayerConfig,
                 num_layers: int,
                 positional_encoding=None,
                 norm=None):
        super(CodeTransformerCoreConfig, self).__init__()
        if isinstance(encoder_layer, CodeTransformerLayerConfig):
            self.encoder_layer = CodeTransformerLayerConfig(**encoder_layer)
        else:
            self.encoder_layer = encoder_layer
        self.num_layers = num_layers
        self.norm = norm
        self.positional_encoding = positional_encoding
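For orientation, the two classes nest into each other. An illustrative sketch only, assuming both classes are importable from this new module (its path is not rendered on this page), with values chosen to match the code_summarization YAML at the end of the commit:

# Hypothetical usage; hyperparameter values mirror the YAML below.
layer_config = CodeTransformerLayerConfig(d_model=1024, nhead=8, dim_feedforward=2048, dropout=0.2)
core_config = CodeTransformerCoreConfig(encoder_layer=layer_config, num_layers=1)

# ModelConfiguration is a DotDict, so nested values stay dot-accessible.
assert core_config.encoder_layer.d_model == 1024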
New file:
@@ -0,0 +1,39 @@

class DotDict(dict):
    """
    Simple extension of Python's dict to support dot access.
    """

    def __init__(self, *args, **kwargs):
        super(DotDict, self).__init__(*args, **kwargs)
        for arg in args:
            if isinstance(arg, dict):
                for k, v in arg.items():
                    self[k] = v

        if kwargs:
            for k, v in kwargs.items():
                if isinstance(v, dict):
                    self[k] = DotDict(**v)
                else:
                    self[k] = v

    def __getattr__(self, attr):
        return self[attr]

    def __setattr__(self, key, value):
        self.__setitem__(key, value)

    def __setitem__(self, key, value):
        super(DotDict, self).__setitem__(key, value)
        self.__dict__.update({key: value})

    def __delattr__(self, item):
        self.__delitem__(item)

    def __delitem__(self, key):
        super(DotDict, self).__delitem__(key)
        del self.__dict__[key]


class ModelConfiguration(DotDict):
    pass
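A short usage sketch of the dot access this utility provides (illustrative only, not part of the diff):

cfg = DotDict(model={"d_model": 256}, nhead=8)
assert cfg.nhead == 8               # keys can be read as attributes
assert cfg.model.d_model == 256     # dict-valued keyword arguments become nested DotDicts
cfg.dropout = 0.1                   # attribute writes go through __setitem__
assert cfg["dropout"] == 0.1
del cfg.dropout                     # __delattr__ removes the key again
assert "dropout" not in cfg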
New file:
@@ -0,0 +1,44 @@

from code_transformer.configuration.configuration_utils import ModelConfiguration


class GreatTransformerConfig(ModelConfiguration):
    def __init__(self,
                 num_layers: int,
                 positional_encoding=None,
                 embed_dim=256,
                 num_heads=8,
                 ff_dim=1024,
                 dropout_rate=0.1,
                 is_encoder_decoder=False):
        super(GreatTransformerConfig, self).__init__()

        self.num_layers = num_layers
        self.positional_encoding = positional_encoding

        self.embed_dim = embed_dim
        self.hidden_dim = embed_dim
        self.attention_dim = embed_dim
        self.bias_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate
        self.is_encoder_decoder = is_encoder_decoder


class GreatEncoderConfig(ModelConfiguration):

    def __init__(self,
                 transformer_config: GreatTransformerConfig,
                 vocab_size=32000,
                 num_node_types=None,
                 subtokens_per_token=5,
                 num_languages=None):
        super(GreatEncoderConfig, self).__init__()

        self.transformer_config = transformer_config
        self.vocab_size = vocab_size
        self.num_node_types = num_node_types
        self.subtokens_per_token = subtokens_per_token
        self.num_languages = num_languages
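As with the CodeTransformer configs, the GreatTransformer configuration is built in two nested steps. A hedged sketch with placeholder values (num_node_types=30 in particular is hypothetical and not taken from this commit):

great_config = GreatTransformerConfig(num_layers=4, embed_dim=512, num_heads=8)
encoder_config = GreatEncoderConfig(great_config, vocab_size=32000, num_node_types=30)

# The inner config is stored as-is and remains dot-accessible;
# hidden_dim, attention_dim and bias_dim are all tied to embed_dim.
assert encoder_config.transformer_config.hidden_dim == 512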
New file:
@@ -0,0 +1,64 @@

from torch import nn

from code_transformer.configuration.configuration_utils import ModelConfiguration
from code_transformer.configuration.attention import AttentionType


class TransformerLMDecoderConfig(ModelConfiguration):

    def __init__(self,
                 lm_encoder,  #: Union[TransformerLMEncoder, TransformerLMEncoderConfig],
                 sos_id: int,
                 unk_id=0,
                 n_layers=1,
                 decoder_dropout=0,
                 decoder_nhead=8,
                 decoder_dim_feedforward=2048,
                 decoder_activation="gelu",
                 use_teacher_forcing=False,
                 output_subtokens_per_token=5,
                 output_nonlinearity=None,
                 loss_fct=nn.CrossEntropyLoss(ignore_index=-1),
                 use_pointer_network=False,
                 use_pointer_query_linear=False,
                 use_pointer_query_self_attention=False,
                 concat_query_and_pointer=True,
                 attend_cls_token=False,
                 pointer_attention_type=AttentionType.MULTIHEAD,
                 target_vocab_size: int = None):
        r"""
        :param lm_encoder: The encoder on which the decoder should be built.
        :param sos_id: The ID of the SOS token in the underlying vocabulary. Initially, the decoder sequence will be
            populated with a single SOS token to have an input for the LSTM decoder.
        :param n_layers: How many layers the decoder should have.
        :param decoder_dropout: Whether dropout should be applied in the decoder.
        :param use_teacher_forcing: If set, the previous label will be fed into the decoder instead of the
            previous prediction during training. Usually speeds up training but also introduces a gap between
            training and evaluation.
        :param target_vocab_size: If given, the model assumes separate vocabularies for input tokens and label tokens.
            The value specified here indicates the output distribution domain that is to be predicted.
        """
        super(TransformerLMDecoderConfig, self).__init__()
        self.lm_encoder = lm_encoder
        self.sos_id = sos_id
        self.unk_id = unk_id
        self.n_layers = n_layers
        self.decoder_dropout = decoder_dropout
        self.decoder_nhead = decoder_nhead
        self.decoder_dim_feedforward = decoder_dim_feedforward
        self.decoder_activation = decoder_activation
        self.use_teacher_forcing = use_teacher_forcing
        self.output_subtokens_per_token = output_subtokens_per_token
        self.output_nonlinearity = output_nonlinearity
        self.loss_fct = loss_fct
        self.use_pointer_network = use_pointer_network
        self.pointer_attention_type = pointer_attention_type if isinstance(pointer_attention_type, AttentionType) \
            else AttentionType(pointer_attention_type)
        self.use_pointer_query_linear = use_pointer_query_linear
        self.use_pointer_query_self_attention = use_pointer_query_self_attention
        self.concat_query_and_pointer = concat_query_and_pointer
        self.attend_cls_token = attend_cls_token
        assert not (use_pointer_query_self_attention and use_pointer_query_linear), \
            "Cannot set both query linear and query self attention"
        self.target_vocab_size = target_vocab_size
New file:
@@ -0,0 +1,22 @@

from code_transformer.configuration.configuration_utils import ModelConfiguration


class TransformerLMEncoderConfig(ModelConfiguration):

    def __init__(self,
                 transformer,  #: Union[CodeTransformer, CodeTransformerCoreConfig],
                 vocab_size=32000,
                 num_node_types=None,
                 num_token_types=None,
                 subtokens_per_token=5,
                 input_nonlinearity=None,
                 num_languages=None):
        super(TransformerLMEncoderConfig, self).__init__()

        self.transformer = transformer
        self.vocab_size = vocab_size
        self.num_token_types = num_token_types
        self.num_node_types = num_node_types
        self.subtokens_per_token = subtokens_per_token
        self.input_nonlinearity = input_nonlinearity
        self.num_languages = num_languages
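Putting the configuration classes from this commit together: the encoder config wraps the core transformer config and is in turn handed to the decoder config. An illustrative sketch, assuming all four classes and AttentionType are in scope (their module paths are not all rendered here); sos_id and the other values are placeholders, chosen to match the YAML below where possible:

layer = CodeTransformerLayerConfig(d_model=1024, nhead=8, dim_feedforward=2048, dropout=0.2)
core = CodeTransformerCoreConfig(encoder_layer=layer, num_layers=1)
encoder_cfg = TransformerLMEncoderConfig(core, vocab_size=32000, subtokens_per_token=5,
                                         input_nonlinearity='tanh')
decoder_cfg = TransformerLMDecoderConfig(lm_encoder=encoder_cfg,
                                         sos_id=1,  # placeholder vocabulary id
                                         output_subtokens_per_token=6,
                                         use_pointer_network=True,
                                         pointer_attention_type='additive')  # string is coerced to AttentionType.ADDITIVE

assert decoder_cfg.pointer_attention_type is AttentionType.ADDITIVE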
New file:
@@ -0,0 +1,20 @@

from code_transformer.experiments.experiment import ExperimentSetup, ex
from code_transformer.experiments.mixins.code_summarization import CTCodeSummarizationMixin
from code_transformer.experiments.mixins.code_trans_transformer import CodeTransformerDecoderMixin


class CodeTransDecoderExperimentSetup(CodeTransformerDecoderMixin,
                                      CTCodeSummarizationMixin,
                                      ExperimentSetup):
    pass


@ex.automain
def main():
    experiment = CodeTransDecoderExperimentSetup()
    experiment.train()


@ex.command(unobserved=True)
def recreate_experiment():
    return CodeTransDecoderExperimentSetup()
New file:
@@ -0,0 +1,113 @@

experiment_setup:
  executable: 'code_transformer/experiments/code_transformer/code_summarization.py'

data_setup:
  language: 'python,javascript,ruby,go'  # The dataset (language) to use
  filter_language: python  # Only for multi-language datasets. Let only snippets of the specified language through (fine-tuning)
  use_validation: True
  num_sub_tokens: 5  # Number of sub-tokens that input tokens should be split into
  num_subtokens_output: 6  # Number of sub-tokens of the method name to predict
  use_only_ast: False  # Whether to use the only-ast ablation
  mask_all_tokens: False  # Only relevant if use_only_ast=True. Replaces all input tokens with a dummy token
  use_no_punctuation: True  # Whether to drop punctuation tokens before feeding the snippets into the model
  use_pointer_network: True  # Whether to use a pointer network in the decoder
  sort_by_length: False  # Whether to sort loaded slices by number of tokens. Useful to minimize the amount of zero-padding needed
  shuffle: False  # Whether the load order of snippets should be randomized
  chunk_size: 32  # Only relevant if shuffle=True and sort_by_length=True. Snippets will be chunked into chunks of `chunk_size`, which will then be randomly shuffled.

data_transforms:
  max_distance_mask: None  # Mask nodes if their relative distances exceed a certain threshold
  # max_distance_mask:
  #   shortest_paths: 5
  #   ppr: -1
  #   sibling_sp: -1
  #   ancestor_sp: -1

  relative_distances:  # Which relative distances to use (have to be pre-computed in stage 2 preprocessing)
    - ppr
    - ancestor_sp
    - sibling_sp
    - shortest_paths

  distance_binning:  # Distance binning for dealing with real-valued distances
    type: 'exponential'  # "exponential" or "equal". Exponential binning has more diversified (smaller) bins for smaller distances
    growth_factor: 1.3
    n_fixed_bins: 9

transfer_learning:  # Load a pretrained model for fine-tuning
  use_pretrained_model: False
  model_type: 'ct_code_summarization'
  run_id: CT-23
  snapshot_iteration: 10
  cpu: False
  freeze_encoder_layers: None  # None, "all" or {number}. If and how many encoder layers to keep constant during fine-tuning

model:
  with_cuda: True  # Run model on GPU
  label_smoothing: 0.1  # Apply label smoothing to the ground truth
  lm_encoder:  # Hyperparameters of the encoder
    input_nonlinearity: 'tanh'
    num_languages: None  # Only relevant for multi-language datasets. How many different languages have been fused together
    transformer:  # CodeTransformer hyperparameters
      num_layers: 1
      encoder_layer:
        d_model: 1024  # Internal embedding dimension
        nhead: 8  # Number of attention heads
        dim_feedforward: 2048  # Dimension of the feed-forward layer
        dropout: 0.2
        activation: 'gelu'
        use_content_content: True  # Whether to use the content-content term in attention computation
        use_content_pos: True  # Whether to use the content-position term in attention computation
        use_pos_content: True  # Whether to use the position-content term in attention computation
        use_pos_pos: True  # Whether to use the position-position term in attention computation
        use_token_distances: True  # Whether to also compute the simple hop-distance between the input tokens
  lm_decoder:  # Hyperparameters of the decoder
    output_nonlinearity: None
    n_layers: 1
    decoder_dropout: 0
    decoder_nhead: 8
    decoder_dim_feedforward: 2048
    decoder_activation: 'gelu'
    use_teacher_forcing: True  # Whether to use teacher forcing during training (label is fed into the decoder instead of the prediction for the previous position)
    pointer_attention_type: 'additive'  # Attention type in the pointer network. "scaled_dot_product", "multihead" or "additive"
    use_pointer_query_self_attention: False  # Whether to use self-attention between pointer query and decoder input
    concat_query_and_pointer: True  # Whether to also use the query-stream of the encoder output to guide the pointer query
    attend_cls_token: False  # Whether to mask the CLS token for attention

optimizer:
  optimizer: 'Adam'
  learning_rate: 8e-5
  reg_scale: 3e-5

  #scheduler: 'OneCycleLR'
  #scheduler_params:
  #  max_lr: 1e-4
  #  steps_per_epoch: 4000  # 500000 / 128
  #  epochs: 30
  #  pct_start: 0.3

  #scheduler: 'MultiStepLR'
  #scheduler_params:
  #  milestones: [1500, 5000]
  #  gamma: 0.1

training:
  random_seed: 456
  batch_size: 8
  simulated_batch_size: 128  # Gradient accumulation. After how many samples the model parameters should be updated
  simulated_batch_size_valid: 1280  # Over how many samples validation metrics should be calculated
  accumulate_tokens_batch: False
  validate_every: 100  # Counted in number of parameter updates (simulated_batch_size). How often approximate evaluation should be done
  persistent_snapshot_every: 10000  # Counted in loop iterations (batch_size). Also starts a full evaluation on the validation set to guide early stopping
  early_stopping_patience: 20  # How often the model evaluation can be worse than the current best before training is stopped
  max_validation_samples: 50000  # Limit the number of samples for full evaluation on larger datasets to speed up training
  metrics:  # Which metrics to log
    - top1_accuracy
    - top5_accuracy
    - non_trivial_accuracy
    - precision
    - recall
    - f1_score
    - micro_f1_score
    - rouge_2
    - rouge_l
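The YAML mirrors the nesting of the configuration classes above, so a loaded file can be wrapped into dot-accessible objects almost directly. An illustrative sketch (requires PyYAML; the file path is a placeholder, and the experiment code in this commit may load and override the values differently):

import yaml

from code_transformer.configuration.configuration_utils import ModelConfiguration

# Hypothetical path to the YAML shown above.
with open("code_summarization.yaml") as f:
    raw = yaml.safe_load(f)

config = ModelConfiguration(**raw)
assert config.model.lm_encoder.transformer.encoder_layer.d_model == 1024
assert config.data_setup.num_sub_tokens == 5
# Gradient accumulation: 128 / 8 = 16 forward passes per parameter update.
assert config.training.simulated_batch_size // config.training.batch_size == 16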
