
Commit fb47e14

lhz1029 authored and madisonmay committed Jan 10, 2020
ADD: default context must be explicitly passed in for start and end tokens
1 parent bf6414d commit fb47e14

17 files changed (+201, -110 lines)

‎.gitignore

+2

@@ -146,3 +146,5 @@ spyder_crash.log
 # Rope project folders
 .ropeproject/
 .vscode/
+
+Data/**/*

‎docs/auxiliary.rst

+14 -11

@@ -7,23 +7,26 @@ as long as they describe specific spans of text.
 .. code-block:: python

     # First we define the extra features we will be providing, through a dictionary.
-    # We do this by defining values and the defaults that tokens will receive if they are not explicitly labeled.
-    # Auxiliary info can take the form of strings, booleans, floats, or ints.
-    default = {'capitalized':False, 'part_of_speech':'unknown'}
-
-    # Next we create context tags in a similar format to SequenceLabeling labels, as a list of lists of dictionaries:
+    # Auxiliary info can take the form of booleans, floats, or ints. We currently cannot accept categorical inputs.
+
+    # First we create context tags as a list of lists of dictionaries. Every token should have a context.
     train_text = ['Intelligent process automation']
     train_context = [[
         {'text': 'Intelligent', 'capitalized': True, 'end': 11, 'start': 0, 'part_of_speech': 'ADJ'},
         {'text': 'process automation', 'capitalized': False, 'end': 30, 'start': 12, 'part_of_speech': 'NOUN'},
     ]]

-    # Our input to the model is now a list containing the text, and the context
-    trainX = [train_text, train_context]
-
-    # Examples with no context must have an empty list as their context
-    assert len(train_text) == len(train_context)
+    # We then define the defaults that start and end tokens will receive.
+    default = {'capitalized':False, 'part_of_speech':'unknown'}

     # We indicate to the model that we are including auxiliary info by passing our default dictionary in with the default_context kwarg.
     model = Classifier(default_context=default)
-    model.fit(trainX, trainY)
+    # We finally pass in the context when fitting and predicting with our model.
+    model.fit(trainX, trainY, context=train_context)
+
+    # Note that context format adapts with the text.
+    # For most tasks, the context for a sequence of text is a list of dictionaries.
+    # For comparison and comparison_regressor, where the input X is a list of two text sequences, the context is also a list of two dictionary lists.
+    # For multiple_choice, context must be given to both the question and answers. Specifically, for a given input, the context should be a list of n dictionary lists where the first corresponds to the question and the subsequent n-1 correspond to the answers.
+
+    # See tests/test_auxiliary.py for examples.
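To make the documented flow concrete, here is a minimal end-to-end sketch assembled from the snippet above and the tests added in this commit. The labels in trainY and the use_auxiliary_info/context_dim settings are illustrative assumptions (borrowed from tests/test_auxiliary.py), not something the docs prescribe.

    # Minimal sketch of the new workflow; labels and config values are assumptions for illustration.
    from finetune import Classifier

    train_text = ['Intelligent process automation', 'intelligent process automation']
    train_context = [
        [
            {'token': 'Intelligent', 'start': 0, 'end': 11, 'capitalized': True},
            {'token': 'process automation', 'start': 12, 'end': 30, 'capitalized': False},
        ],
        [
            {'token': 'intelligent', 'start': 0, 'end': 11, 'capitalized': False},
            {'token': 'process automation', 'start': 12, 'end': 30, 'capitalized': False},
        ],
    ]
    trainY = ['A', 'B']  # hypothetical labels

    # Defaults applied to start/end (and other special) tokens that carry no explicit context.
    default = {'capitalized': False}
    model = Classifier(default_context=default, use_auxiliary_info=True, context_dim=1)
    model.fit(train_text, trainY, context=train_context)
    predictions = model.predict(train_text, context=train_context)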

‎finetune/base.py

+4

@@ -139,6 +139,10 @@ def _target_model(
         reuse=None,
         **kwargs
     ):
+        # Overridden by subclass to attach a target model onto the shared base featurizer.
+        raise NotImplementedError
+
+    def _add_context_embed(self, featurizer_state):
         if "context" in featurizer_state:
             context_embed = featurizer_state["context"]
             for key in ['features', 'explain_out']:
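The shape of the refactor in this file: the base _target_model becomes an abstract hook, and the context-concatenation side effect that subclasses previously triggered via super()._target_model(...) is now an explicit _add_context_embed call. A toy sketch of that pattern (stand-in classes, not the real finetune ones):

    # Toy illustration of the hook pattern; these are stand-in classes, not finetune's.
    class ToyBase:
        def _target_model(self, *, featurizer_state, **kwargs):
            # Subclasses attach their own target model on top of the shared featurizer.
            raise NotImplementedError

        def _add_context_embed(self, featurizer_state):
            # Fold the context embedding into the features, if auxiliary info was provided.
            if "context" in featurizer_state:
                featurizer_state["features"] = featurizer_state["features"] + ["context_embed"]

    class ToyClassifier(ToyBase):
        def _target_model(self, *, featurizer_state, **kwargs):
            self._add_context_embed(featurizer_state)  # explicit call replaces super()._target_model(...)
            return featurizer_state["features"]

    print(ToyClassifier()._target_model(featurizer_state={"features": ["pooled"], "context": [0.1]}))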

‎finetune/config.py

+1

@@ -373,6 +373,7 @@ def get_default_config():
         # Auxiliary Information
         use_auxiliary_info=False,
         n_context_embed=32,
+        default_context=None,
         context_dim=None # number of context dimensions to be inserted
     )
     return settings
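default_context rides through get_config like any other setting; the updated test in this commit builds its config the same way. A small sketch (values borrowed from tests/test_auxiliary.py):

    # Sketch: default_context is an ordinary config entry.
    from finetune.config import get_config

    config = get_config(
        use_auxiliary_info=True,
        context_dim=1,
        default_context={'bold': False},
    )
    print(config.default_context)  # {'bold': False}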

‎finetune/encoding/input_encoder.py

+6 -19

@@ -222,37 +222,24 @@ def __setstate__(self, state):
     def __getstate__(self):
         return {"Encoder": None}

-def get_default_context(context_by_char_loc):
-    """ Use mean for numeric values, majority otherwise. """
-    context_values = [c[1] for c in context_by_char_loc]
-    num_keys = len(context_values[0])
-    default_values = []
-    for k in range(num_keys):
-        values = [c[k] for c in context_values]
-        if isinstance(values[0], str) or isinstance(values[0], bool):
-            default_value = Counter(values).most_common(1)[0][0]
-        else:
-            default_value = np.mean(values)
-        default_values.append(default_value)
-    return default_values

-def tokenize_context(context, encoded_output):
+def tokenize_context(context, encoded_output, config):
     """ Tokenize the context corresponding to a single sequence of text """
     seq_len = len(encoded_output.token_ids)
     context_keys = list(k for k in sorted(context[0].keys()) if k not in ['token', 'start', 'end'])
     context_by_char_loc = sorted([(c['end'], [c[k] for k in context_keys]) for c in context], key=lambda c: c[0])
     # default context is the sequence majority
-    default_context = get_default_context(context_by_char_loc)
-    current_context = 0
+    default_context = [config.default_context[k] for k in context_keys]
+    current_char_loc = 0
     tokenized_context = []
     for char_loc in encoded_output.char_locs:
         # Note: this assumes that the tokenization will never lump multiple tokens into one
         if char_loc == -1:
             tokenized_context.append(default_context)
         else:
-            if char_loc > context_by_char_loc[current_context][0]:
-                current_context += 1
-            tokenized_context.append(context_by_char_loc[current_context][1])
+            if char_loc > context_by_char_loc[current_char_loc][0]:
+                current_char_loc += 1
+            tokenized_context.append(context_by_char_loc[current_char_loc][1])
     # padded value doesn't matter since it will be masked out
     expanded_context = np.pad(tokenized_context, ((0, seq_len - len(tokenized_context)), (0, 0)), 'constant')
     assert len(expanded_context) == len(encoded_output.token_ids)
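The filler rows for special tokens (char_loc == -1) now come straight from config.default_context instead of a per-sequence majority/mean. A simplified standalone sketch of the expansion, assuming the same span format the tests use:

    import numpy as np

    def expand_context(context, char_locs, default_context):
        # Simplified sketch of tokenize_context: map each token's end-character location
        # to the covering span's values; special tokens (char_loc == -1) get the defaults.
        keys = sorted(k for k in context[0] if k not in ('token', 'start', 'end'))
        spans = sorted((c['end'], [c[k] for k in keys]) for c in context)
        default = [default_context[k] for k in keys]
        current, rows = 0, []
        for char_loc in char_locs:
            if char_loc == -1:
                rows.append(default)
            else:
                if char_loc > spans[current][0]:
                    current += 1
                rows.append(spans[current][1])
        return np.array(rows)

    context = [
        {'token': 'cheap', 'start': 0, 'end': 5, 'left': 10, 'bold': False},
        {'token': 'flights', 'start': 6, 'end': 13, 'left': 20, 'bold': True},
    ]
    # Rows for [start token, 'cheap', 'flights', end token]; first and last rows are the defaults.
    print(expand_context(context, [-1, 5, 13, -1], {'left': 0, 'bold': False}))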

‎finetune/input_pipeline.py

+7 -7

@@ -121,7 +121,7 @@ def text_to_tokens_mask(self, X, Y=None, context=None):
             if context is None:
                 feats = {"tokens": out.token_ids, "mask": out.mask}
             else:
-                tokenized_context = tokenize_context(context, out)
+                tokenized_context = tokenize_context(context, out, self.config)
                 feats = {"tokens": out.token_ids, "mask": out.mask, "context": tokenized_context}
             if Y is None:
                 yield feats

@@ -151,13 +151,13 @@ def _compute_class_counts(self, encoded_dataset):
         return Counter(self.label_encoder.inverse_transform(target_arrs))

     def _dataset_with_targets(self, Xs, Y, train, context=None):
-        if context:
-            if not callable(Xs) and not callable(Y):
+        if context is not None:
+            if not callable(Xs) and not callable(Y) and not callable(context):
                 dataset = lambda: zip(Xs, Y, context)
-            elif callable(Xs) and callable(Y):
+            elif callable(Xs) and callable(Y) and callable(context):
                 dataset = lambda: zip(Xs(), Y(), context)
             else:
-                raise ValueError( "Either neither or both of Xs and Y should be callable, not a mixture")
+                raise ValueError( "Either none or all of Xs and Y and context should be callable, not a mixture")

             dataset_encoded = lambda: itertools.chain.from_iterable(
                 map(lambda xyc: self.text_to_tokens_mask(*xyc), dataset())

@@ -187,7 +187,7 @@ def _dataset_with_targets(self, Xs, Y, train, context=None):
         )

     def _dataset_without_targets(self, Xs, train, context=None):
-        if context:
+        if context is not None:
             # we assume that X must have known length if we also provide context so this is safe
             if callable(Xs):
                 Xs_ = Xs()

@@ -360,7 +360,7 @@ def get_train_input_fns(self, Xs, Y=None, batch_size=None, val_size=None, contex
             )
         else:
             self._skip_tqdm = 0
-        if context:
+        if context is not None:
             to_shuffle = (Xs, Y, context)

         if self.config.val_size > 0 and self.config.val_set is None:
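Two small behavioural points in this file: `if context:` becomes `if context is not None:` so an explicitly passed (even empty) context still takes the context path, and when targets are supplied, Xs, Y, and context must now be all callable or all concrete. A tiny sketch of that all-or-nothing check (simplified, not the library function):

    # Sketch of the "all callable or none callable" rule enforced above (simplified).
    def check_callable_consistency(Xs, Y, context):
        flags = [callable(Xs), callable(Y), callable(context)]
        if any(flags) and not all(flags):
            raise ValueError("Either none or all of Xs, Y and context should be callable, not a mixture")

    check_callable_consistency([1, 2], [0, 1], [{}, {}])                          # ok: all concrete
    check_callable_consistency(lambda: [1, 2], lambda: [0, 1], lambda: [{}, {}])  # ok: all callable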

‎finetune/nn/auxiliary.py

+11 -11

@@ -4,18 +4,18 @@


 def embed_context(context, featurizer_state, config, train):
-    context_dim = shape_list(context)[-1]
-    context_weight = tf.get_variable(
-        name="ce",
-        shape=[context_dim, config.n_context_embed],
-        initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
-    )
-    context_bias = tf.get_variable(
-        name="ca",
-        shape=[config.n_context_embed],
-        initializer=tf.zeros_initializer(),
-    )
     with tf.variable_scope("context_embedding"):
+        context_dim = shape_list(context)[-1]
+        context_weight = tf.get_variable(
+            name="ce",
+            shape=[context_dim, config.n_context_embed],
+            initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
+        )
+        context_bias = tf.get_variable(
+            name="ca",
+            shape=[config.n_context_embed],
+            initializer=tf.zeros_initializer(),
+        )
         c_embed = tf.add(tf.multiply(context, context_weight), context_bias)
         featurizer_state['context'] = c_embed
     return featurizer_state
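Moving the tf.get_variable calls inside the tf.variable_scope changes only where the weights live (context_embedding/ce and context_embedding/ca); the arithmetic is untouched. With context_dim == 1, as in the tests, the multiply broadcasts out to an n_context_embed-wide embedding per token; here is a numpy stand-in for that shape arithmetic (shapes assumed):

    import numpy as np

    # numpy stand-in for the embedding arithmetic in embed_context (assumed shapes).
    batch, seq_len, context_dim, n_context_embed = 2, 5, 1, 32
    context = np.random.rand(batch, seq_len, context_dim)           # per-token auxiliary values
    context_weight = np.random.randn(context_dim, n_context_embed)  # plays the role of "ce"
    context_bias = np.zeros(n_context_embed)                        # plays the role of "ca"

    # (batch, seq, 1) * (1, n_context_embed) broadcasts to (batch, seq, n_context_embed)
    c_embed = context * context_weight + context_bias
    print(c_embed.shape)  # (2, 5, 32)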

‎finetune/target_models/classifier.py

+2 -4

@@ -15,7 +15,7 @@

 class ClassificationPipeline(BasePipeline):
     def resampling(self, Xs, Y, context=None):
-        if context:
+        if context is not None:
             if self.config.oversample:
                 idxs, Ys, contexts = shuffle(
                     *RandomOverSampler().fit_sample([[i] for i in range(len(Xs))], Y, context)

@@ -131,9 +131,7 @@ def get_eval_fn(cls):
     def _target_model(
         self, *, config, featurizer_state, targets, n_outputs, train=False, reuse=None, **kwargs
     ):
-        super(Classifier, self)._target_model(
-            config=config, featurizer_state=featurizer_state, targets=targets, n_outputs=n_outputs,
-            train=train, reuse=reuse, **kwargs)
+        self._add_context_embed(featurizer_state)
         if "explain_out" in featurizer_state:
             shape = tf.shape(featurizer_state["explain_out"]) # batch, seq, hidden
             flat_explain = tf.reshape(

‎finetune/target_models/comparison.py

+40 -2

@@ -5,7 +5,7 @@
 from finetune.errors import FinetuneError
 from finetune.base import BaseModel
 from finetune.target_models.classifier import Classifier, ClassificationPipeline
-from finetune.encoding.input_encoder import ArrayEncodedOutput
+from finetune.encoding.input_encoder import ArrayEncodedOutput, tokenize_context


 class ComparisonPipeline(ClassificationPipeline):

@@ -32,14 +32,46 @@ def _text_to_ids(self, pair, Y=None, pad_token=None):
         kwargs["mask"] = np.stack([arr_forward.mask, arr_backward.mask], 0)
         yield ArrayEncodedOutput(**kwargs)

+    def text_to_tokens_mask(self, pair, Y=None, context=None):
+        out_gen = self._text_to_ids(pair, pad_token=self.config.pad_token)
+        for i, out in enumerate(out_gen):
+            if context is None:
+                feats = {"tokens": out.token_ids, "mask": out.mask}
+            else:
+                out_forward = ArrayEncodedOutput(
+                    token_ids=out.token_ids[0],
+                    tokens=out.token_ids[0],
+                    labels=None,
+                    char_locs=out.char_locs,
+                    mask=out.mask[0],
+                )
+                out_backward = ArrayEncodedOutput(
+                    token_ids=out.token_ids[1],
+                    tokens=out.token_ids[1],
+                    labels=None,
+                    char_locs=out.char_locs,
+                    mask=out.mask[1],
+                )
+                tokenized_context_forward = tokenize_context(context[0], out_forward, self.config)
+                tokenized_context_backward = tokenize_context(context[1], out_backward, self.config)
+                tokenized_context = [tokenized_context_forward, tokenized_context_backward]
+                feats = {"tokens": out.token_ids, "mask": out.mask, "context": tokenized_context}
+            if Y is None:
+                yield feats
+            else:
+                yield feats, self.label_encoder.transform([Y])[0]
+
     def feed_shape_type_def(self):
         TS = tf.TensorShape
         types = {"tokens": tf.int32, "mask": tf.float32}
         shapes = {
             "tokens": TS([2, self.config.max_length, 2]),
             "mask": TS([None, self.config.max_length]),
         }
-        types, shapes = self._add_context_info_if_present(types, shapes)
+        if self.config.use_auxiliary_info:
+            TS = tf.TensorShape
+            types["context"] = tf.float32
+            shapes["context"] = TS([2, self.config.max_length, self.config.context_dim])
         return (
             (types, tf.float32,),
             (shapes, TS([self.target_dim]),),

@@ -78,12 +110,18 @@ def _target_model(
         **kwargs
     ):
         featurizer_state = featurizer_state.copy()
+        print('features before ', featurizer_state['features'])
         featurizer_state["sequence_features"] = tf.abs(
             tf.reduce_sum(featurizer_state["sequence_features"], 1)
         )
         featurizer_state["features"] = tf.abs(
             tf.reduce_sum(featurizer_state["features"], 1)
         )
+        # to go from [batch, 2, max_length, n_context_embed] -> [batch, max_length, n_context_embed]
+        if 'context' in featurizer_state:
+            featurizer_state["context"] = tf.abs(
+                tf.reduce_sum(featurizer_state["context"], 1)
+            )
         return super(Comparison, self)._target_model(
             config=config,
             featurizer_state=featurizer_state,
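The new context branch mirrors what is already done for 'features' and 'sequence_features': the forward/backward pair axis is collapsed with an order-invariant abs-of-sum before handing off to the classifier head. A numpy illustration of that reduction, using the shapes named in the added comment:

    import numpy as np

    # Collapse [batch, 2, max_length, n_context_embed] -> [batch, max_length, n_context_embed].
    batch, max_length, n_context_embed = 2, 7, 32
    paired_context = np.random.randn(batch, 2, max_length, n_context_embed)

    reduced = np.abs(paired_context.sum(axis=1))
    print(reduced.shape)  # (2, 7, 32)

    # abs(sum) is symmetric in the ordering of the pair, which is the point of the trick.
    swapped = np.abs(paired_context[:, ::-1].sum(axis=1))
    assert np.allclose(reduced, swapped)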

‎finetune/target_models/comparison_regressor.py

+7 -4

@@ -16,7 +16,10 @@ def feed_shape_type_def(self):
         TS = tf.TensorShape
         types = {"tokens": tf.int32, "mask": tf.int32}
         shapes = {"tokens": TS([2, self.config.max_length, 2]), "mask": TS([2, self.config.max_length])}
-        types, shapes = self._add_context_info_if_present(types, shapes)
+        if self.config.use_auxiliary_info:
+            TS = tf.TensorShape
+            types["context"] = tf.float32
+            shapes["context"] = TS([2, self.config.max_length, self.config.context_dim])
         return (
             (types, tf.float32,),
             (shapes, TS([self.target_dim]),),

@@ -36,11 +39,11 @@ def _get_input_pipeline(self):
         return ComparisonRegressionPipeline(self.config)

     def _target_model(self, *, config, featurizer_state, targets, n_outputs, train=False, reuse=None, **kwargs):
-        super(ComparisonRegressor, self)._target_model(
-            config=config, featurizer_state=featurizer_state, targets=targets, n_outputs=n_outputs,
-            train=train, reuse=reuse, **kwargs)
         featurizer_state["sequence_features"] = tf.abs(tf.reduce_sum(featurizer_state["sequence_features"], 1))
         featurizer_state["features"] = tf.abs(tf.reduce_sum(featurizer_state["features"], 1))
+        if 'context' in featurizer_state:
+            featurizer_state["context"] = tf.abs(tf.reduce_sum(featurizer_state["context"], 1))
+        self._add_context_embed(featurizer_state)
         return regressor(
             hidden=featurizer_state['features'],
             targets=targets,

‎finetune/target_models/mtl.py

+1 -3

@@ -252,9 +252,7 @@ def _target_model(
         task_id=None,
         **kwargs
     ):
-        super(MultiTask, self)._target_model(
-            config=config, featurizer_state=featurizer_state, targets=targets, n_outputs=n_outputs,
-            train=train, reuse=reuse, task_id=task_id, **kwargs)
+        self._add_context_embed(featurizer_state)
         pred_fn_pairs = []
         featurizer_state["features"] = tf.cond(
             tf.equal(tf.shape(featurizer_state["features"])[1], 1),

‎finetune/target_models/multi_label_classifier.py

+2 -4

@@ -74,7 +74,7 @@ def predict_proba(self, X, context=None):
         :param X: list or array of text to embed.
         :returns: list of dictionaries. Each dictionary maps from a class label to its assigned class probability.
         """
-        return super().predict_proba(X, context=None)
+        return super().predict_proba(X, context=context)

     def finetune(self, X, Y=None, batch_size=None, context=None):
         """

@@ -86,9 +86,7 @@ def finetune(self, X, Y=None, batch_size=None, context=None):
         return super().finetune(X, Y=Y, batch_size=batch_size, context=context)

     def _target_model(self, *, config, featurizer_state, targets, n_outputs, train=False, reuse=None, **kwargs):
-        super(MultiLabelClassifier, self)._target_model(
-            config=config, featurizer_state=featurizer_state, targets=targets, n_outputs=n_outputs,
-            train=train, reuse=reuse, **kwargs)
+        self._add_context_embed(featurizer_state)
         return multi_classifier(
             hidden=featurizer_state['features'],
             targets=targets,

‎finetune/target_models/multiple_choice.py

+35 -4

@@ -8,6 +8,7 @@

 from finetune.nn.target_blocks import multi_choice_question
 from finetune.util import list_transpose
+from finetune.encoding.input_encoder import tokenize_context


 class MultipleChoicePipeline(BasePipeline):

@@ -35,6 +36,31 @@ def _text_to_ids(self, Xs, Y=None, pad_token=None):
         kwargs["mask"] = np.stack([arr.mask for arr in arrays], 0)
         yield ArrayEncodedOutput(**kwargs)

+    def text_to_tokens_mask(self, pair, Y=None, context=None):
+        out_gen = self._text_to_ids(pair, pad_token=self.config.pad_token)
+        for i, out in enumerate(out_gen):
+            if context is None:
+                feats = {"tokens": out.token_ids, "mask": out.mask}
+            else:
+                num_answers = len(out.tokens)
+                tokenized_context = []
+                for answer_idx in range(num_answers):
+                    out_instance = ArrayEncodedOutput(
+                        token_ids=out.token_ids[answer_idx],
+                        tokens=out.token_ids[answer_idx],
+                        labels=None,
+                        char_locs=out.char_locs,
+                        mask=out.mask[answer_idx],
+                    )
+                    context_instance = context[0] + context[answer_idx + 1]
+                    tokenized_context.append(tokenize_context(context_instance, out_instance, self.config))
+                feats = {"tokens": out.token_ids, "mask": out.mask, "context": tokenized_context}
+            if Y is None:
+                yield feats
+            else:
+                yield feats, self.label_encoder.transform([Y])[0]
+
+
     def _format_for_encoding(self, X):
         return [[field] for field in X]


@@ -45,7 +71,10 @@ def feed_shape_type_def(self):
             "tokens": TS([self.target_dim, self.config.max_length, 2]),
             "mask": TS([self.target_dim, self.config.max_length]),
         }
-        types, shapes = self._add_context_info_if_present(types, shapes)
+        if self.config.use_auxiliary_info:
+            TS = tf.TensorShape
+            types["context"] = tf.float32
+            shapes["context"] = TS([self.target_dim, self.config.max_length, self.config.context_dim])
         return (
             (types, tf.float32,),
             (shapes, TS([]),),

@@ -114,9 +143,11 @@ def finetune(self, questions, answers, correct_answer, fit_lm_only=False, contex
     def _target_model(
         self, *, config, featurizer_state, targets, n_outputs, train=False, reuse=None, **kwargs
     ):
-        super(MultipleChoice, self)._target_model(
-            config=config, featurizer_state=featurizer_state, targets=targets, n_outputs=n_outputs,
-            train=train, reuse=reuse, **kwargs)
+        if "context" in featurizer_state:
+            context_embed = featurizer_state["context"]
+            featurizer_state['features'] = tf.concat(
+                (featurizer_state['features'], tf.reduce_mean(context_embed, 2)), -1
+            )
         return multi_choice_question(
             hidden=featurizer_state["features"],
             targets=targets,
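For multiple choice, an example's context is a list of n dictionary lists: the question's context first, then one list per answer; the pipeline above then concatenates context[0] with context[answer_idx + 1] for each answer before tokenizing. A small sketch of building and pairing that structure (strings and offsets are illustrative):

    # Sketch of the multiple_choice context layout (values are illustrative).
    question_context = [
        {'token': 'i like apples', 'start': 0, 'end': 13, 'bold': False},
    ]
    answer_context = [
        [{'token': 'happy', 'start': 0, 'end': 5, 'bold': False}],
        [{'token': 'sad', 'start': 0, 'end': 3, 'bold': False}],
    ]

    # One entry per example: [question context] + one context list per answer.
    example_context = [question_context] + answer_context

    # The pipeline pairs the question's context with each answer's context in turn.
    for answer_idx in range(len(answer_context)):
        context_instance = example_context[0] + example_context[answer_idx + 1]
        print(context_instance)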

‎finetune/target_models/ordinal_regressor.py

+1 -4

@@ -91,9 +91,7 @@ def finetune(self, X, Y=None, batch_size=None, context=None):
     def _target_model(
         self, *, config, featurizer_state, targets, n_outputs, train=False, reuse=None, **kwargs
     ):
-        super(OrdinalRegressor, self)._target_model(
-            config=config, featurizer_state=featurizer_state, targets=targets, n_outputs=n_outputs,
-            train=train, reuse=reuse, **kwargs)
+        self._add_context_embed(featurizer_state)
         return ordinal_regressor(
             hidden=featurizer_state["features"],
             targets=targets,

@@ -137,7 +135,6 @@ def predict(self, pairs):
     def _get_input_pipeline(self):
         return ComparisonOrdinalRegressionPipeline(self.config)

-    @classmethod
     def _target_model(
         self, *, config, featurizer_state, targets, n_outputs, train=False, reuse=None, **kwargs
     ):

‎finetune/target_models/regressor.py

+1 -3

@@ -75,9 +75,7 @@ def finetune(self, X, Y=None, batch_size=None, context=None):

     @classmethod
     def _target_model(self, *, config, featurizer_state, targets, n_outputs, train=False, reuse=None, **kwargs):
-        super(Regressor, self)._target_model(
-            config=config, featurizer_state=featurizer_state, targets=targets, n_outputs=n_outputs,
-            train=train, reuse=reuse, **kwargs)
+        self._add_context_embed(featurizer_state)
         return regressor(
             hidden=featurizer_state['features'],
             targets=targets,

‎finetune/target_models/sequence_labeling.py

+3 -13

@@ -38,8 +38,8 @@ def text_to_tokens_mask(self, X, Y=None, context=None):
         out_gen = self._text_to_ids(X, Y=Y, pad_token=pad_token)
         for out in out_gen:
             feats = {"tokens": out.token_ids, "mask": out.mask}
-            if context:
-                tokenized_context = tokenize_context(context, out)
+            if context is not None:
+                tokenized_context = tokenize_context(context, out, self.config)
                 feats['context'] = tokenized_context
             if Y is None:
                 yield feats

@@ -210,7 +210,6 @@ def predict(self, X, per_token=False, context=None):
         step_size = chunk_size // 3
         doc_idx = -1
         for position_seq, start_of_doc, end_of_doc, label_seq, proba_seq in self.process_long_sequence(X, context=context):
-            print('position_seq', position_seq)
             start, end = 0, None
             if start_of_doc:
                 # if this is the first chunk in a document, start accumulating from scratch

@@ -281,13 +280,6 @@ def predict(self, X, per_token=False, context=None):
             none_value=self.config.pad_token,
             subtoken_predictions=self.config.subtoken_predictions,
         )
-        print( X,
-            all_subseqs,
-            all_labels,
-            all_probs,
-            all_positions,
-            doc_annotations,
-        )

         if per_token:
             return [

@@ -333,9 +325,7 @@ def predict_proba(self, X, context=None):
     def _target_model(
         self, *, config, featurizer_state, targets, n_outputs, train=False, reuse=None, **kwargs
     ):
-        super(SequenceLabeler, self)._target_model(
-            config=config, featurizer_state=featurizer_state, targets=targets, n_outputs=n_outputs,
-            train=train, reuse=reuse, **kwargs)
+        self._add_context_embed(featurizer_state)
         return sequence_labeler(
             hidden=featurizer_state["sequence_features"],
             targets=targets,

‎tests/test_auxiliary.py

+64 -21

@@ -13,32 +13,22 @@
 from sklearn.metrics import accuracy_score, recall_score
 from sklearn.model_selection import train_test_split

-from finetune import Classifier, SequenceLabeler
+from finetune import Classifier, SequenceLabeler, Comparison, ComparisonRegressor, MultipleChoice
 from finetune.base_models import TextCNN, BERTModelCased, GPT2Model, GPTModel, RoBERTa, GPT
 from finetune.config import get_config
 from finetune.util.metrics import (
     sequence_labeling_token_precision,
     sequence_labeling_token_recall,
 )
 from finetune.datasets.reuters import Reuters
-from finetune.encoding.input_encoder import get_default_context, tokenize_context, ArrayEncodedOutput
+from finetune.encoding.input_encoder import tokenize_context, ArrayEncodedOutput


 # prevent excessive warning logs
 warnings.filterwarnings("ignore")
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

 class TestAuxiliaryTokenization(unittest.TestCase):
-    def test_get_default_context(self):
-        context = [
-            (2, ["single", True, 23.3, 4]),
-            (4, ["double", True, 24.3, 2]),
-            (8, ["single", False, 25.3, 3]),
-        ]
-
-        expected = ["single", True, 24.3, 3]
-        self.assertEqual(get_default_context(context), expected)
-
     def test_tokenize_context(self):
         encoded_output = ArrayEncodedOutput(
             token_ids=[

@@ -60,15 +50,16 @@ def test_tokenize_context(self):
             {'token': "only", 'start': 13, 'end': 17, 'left': 20, 'bold': False},
             {'token': "$80", 'start': 18, 'end': 21, 'left': 30, 'bold': True},
         ]
-        expanded_context = tokenize_context(context, encoded_output)
+        config = get_config(**{'default_context': {'left': 0, 'bold': False}})
+        expanded_context = tokenize_context(context, encoded_output, config)
         expected = [
-            [False, 20],
+            [False, 0],
             [False, 10],
             [False, 10],
             [False, 20],
             [True, 30],
             [True, 30],
-            [False, 20]
+            [False, 0]
         ]
         print(expanded_context)
         np.testing.assert_array_equal(expected, expanded_context)

@@ -119,12 +110,13 @@ def default_config(self, **kwargs):
         defaults = {
             "batch_size": 2,
             "max_length": 256,
-            "n_epochs": 1000,
+            "n_epochs": 1, # we mostly are making sure nothing errors out
             "base_model": self.base_model,
             "val_size": 0,
             "use_auxiliary_info": True,
             "context_dim": 1,
-            "val_set": (self.trainX, self.trainY, self.train_context)
+            "val_set": (self.trainX, self.trainY, self.train_context),
+            "default_context": {'bold': False}
         }
         defaults.update(kwargs)
         return dict(get_config(**defaults))

@@ -178,7 +170,6 @@ def test_sequence_labeler_no_auxiliary(self):
         Ensure model training does not error out
         Ensure model returns reasonable predictions
         """
-
         model = SequenceLabeler(**self.default_config(use_auxiliary_info=False, val_set=(self.trainX, self.trainY)))
         model.fit(self.trainX, self.trainY_seq)
         preds = model.predict(self.trainX)

@@ -190,12 +181,64 @@ def test_sequence_labeler_auxiliary(self):
         Ensure model training does not error out
         Ensure model returns reasonable predictions
         """
-
-        model = SequenceLabeler(**self.default_config())
+        # here we want to make sure we're actually using context
+        model = SequenceLabeler(**self.default_config(n_epochs=1500))
         model.fit(self.trainX, self.trainY_seq, context=self.train_context)
         preds = model.predict(self.trainX, context=self.train_context)
         self._evaluate_sequence_preds(preds, True)
-
+
+    def test_comparison_auxiliary(self):
+        """
+        Ensure model training does not error out
+        Ensure model returns reasonable predictions
+        """
+        model = Comparison(**self.default_config(chunk_long_sequences=False, max_length=50, batch_size=4))
+        trainX = [['i like apples', 'i like apples']] * 4
+        trainY = ['A', 'B', 'C', 'D']
+        train_context = [
+            [self.train_context[i], self.train_context[j]] for i in [0, 1] for j in [0, 1]
+        ]
+        print(train_context)
+        model.fit(trainX, trainY, context=train_context)
+        preds = model.predict(trainX, context=train_context)
+
+    def test_comparison_regressor_auxiliary(self):
+        """
+        Ensure model training does not error out
+        Ensure model returns reasonable predictions
+        """
+        model = ComparisonRegressor(**self.default_config(chunk_long_sequences=False, max_length=50, batch_size=4))
+        trainX = [['i like apples', 'i like apples']] * 4
+        trainY = [0, .5, .5, 1]
+        train_context = [
+            [self.train_context[i], self.train_context[j]] for i in [0, 1] for j in [0, 1]
+        ]
+        print(train_context)
+        model.fit(trainX, trainY, context=train_context)
+        preds = model.predict(trainX, context=train_context)
+
+    def test_multiple_choice_auxiliary(self):
+        """
+        Ensure model training does not error out
+        Ensure model returns reasonable predictions
+        """
+        model = MultipleChoice(**self.default_config(chunk_long_sequences=False, max_length=50, batch_size=4))
+        questions = ['i like apples'] * 2
+        answers = [['happy', 'sad', 'neutral', 'not satisfied'], ['happy', 'sad', 'neutral', 'not satisfied']]
+        correct_answers = ['happy', 'sad']
+        answer_context = [
+            [{'start': 0, 'end': 5, 'token': 'happy', 'bold': False}],
+            [{'start': 0, 'end': 3, 'token': 'sad', 'bold': False}],
+            [{'start': 0, 'end': 7, 'token': 'neutral', 'bold': False}],
+            [{'start': 0, 'end': 3, 'token': 'not', 'bold': False}, {'start': 4, 'end': 13, 'token': 'satisfied', 'bold': False}],
+        ]
+        # context looks like [[{}, {}, {}], [{}], [{}], [{}], [{}, {}]] where the first list is for the question
+        # and the subsequent ones are for each answer
+        train_context = [[self.train_context[0]] + answer_context] + [[self.train_context[1]] + answer_context]
+        print(train_context)
+        model.fit(questions, answers, correct_answers, context=train_context)
+        preds = model.predict(questions, answers, context=train_context)
+
     def test_save_load(self):
         """
         Ensure saving + loading does not cause errors
