39
39
"train-other-500"
40
40
],
41
41
]
42
- _LIBRISPEECH_TEST_DATASETS = [
42
+ _LIBRISPEECH_DEV_DATASETS = [
43
43
[
44
44
"http://www.openslr.org/resources/12/dev-clean.tar.gz" ,
45
45
"dev-clean"
49
49
"dev-other"
50
50
],
51
51
]
52
+ _LIBRISPEECH_TEST_DATASETS = [
53
+ [
54
+ "http://www.openslr.org/resources/12/test-clean.tar.gz" ,
55
+ "test-clean"
56
+ ],
57
+ [
58
+ "http://www.openslr.org/resources/12/test-other.tar.gz" ,
59
+ "test-other"
60
+ ],
61
+ ]
52
62
53
63
54
64
def _collect_data (directory , input_ext , transcription_ext ):
@@ -72,7 +82,7 @@ def _collect_data(directory, input_ext, transcription_ext):
72
82
assert key not in data_files
73
83
media_name = "%s.%s" % (media_base , input_ext )
74
84
media_path = os .path .join (root , media_name )
75
- data_files [key ] = (media_path , label )
85
+ data_files [key ] = (media_base , media_path , label )
76
86
return data_files
77
87
78
88
@@ -82,7 +92,8 @@ class Librispeech(speech_recognition.SpeechRecognitionProblem):
82
92
83
93
# Use all of the data: both the clean and the noisy ("other") subsets
84
94
TRAIN_DATASETS = _LIBRISPEECH_TRAIN_DATASETS
85
- DEV_DATASETS = _LIBRISPEECH_TEST_DATASETS
95
+ DEV_DATASETS = _LIBRISPEECH_DEV_DATASETS
96
+ TEST_DATASETS = _LIBRISPEECH_TEST_DATASETS
86
97
87
98
@property
88
99
def num_shards (self ):
@@ -96,6 +107,10 @@ def use_subword_tokenizer(self):
96
107
def num_dev_shards (self ):
97
108
return 1
98
109
110
@property
def num_test_shards(self):
  """Number of shards requested when building the test-set file paths."""
  return 1
113
+
99
114
@property
100
115
def use_train_shards_for_dev (self ):
101
116
"""If true, we only generate training data and hold out shards for dev."""
@@ -127,20 +142,31 @@ def generator(self, data_dir, tmp_dir, datasets,
127
142
audio_encoder = encoders ["waveforms" ]
128
143
text_encoder = encoders ["targets" ]
129
144
130
- for media_file , text_data in sorted (data_pairs )[start_from :]:
145
+ for utt_id , media_file , text_data in sorted (data_pairs )[start_from :]:
131
146
if how_many > 0 and i == how_many :
132
147
return
133
148
i += 1
149
+ wav_data = audio_encoder .encode (media_file )
150
+ spk_id , unused_book_id , _ = utt_id .split ("-" )
134
151
yield {
135
- "waveforms" : audio_encoder .encode (media_file ),
136
- "targets" : text_encoder .encode (text_data )
152
+ "waveforms" : wav_data ,
153
+ "waveform_lens" : [len (wav_data )],
154
+ "targets" : text_encoder .encode (text_data ),
155
+ "raw_transcript" : [text_data ],
156
+ "utt_id" : [utt_id ],
157
+ "spk_id" : [spk_id ],
137
158
}
138
159
139
160
def generate_data (self , data_dir , tmp_dir , task_id = - 1 ):
140
161
train_paths = self .training_filepaths (
141
162
data_dir , self .num_shards , shuffled = False )
142
163
dev_paths = self .dev_filepaths (
143
164
data_dir , self .num_dev_shards , shuffled = False )
165
+ test_paths = self .test_filepaths (
166
+ data_dir , self .num_test_shards , shuffled = True )
167
+
168
+ generator_utils .generate_files (
169
+ self .generator (data_dir , tmp_dir , self .TEST_DATASETS ), test_paths )
144
170
145
171
if self .use_train_shards_for_dev :
146
172
all_paths = train_paths + dev_paths
@@ -153,22 +179,51 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1):
153
179
self .generator (data_dir , tmp_dir , self .DEV_DATASETS ), dev_paths )
154
180
155
181
182
@registry.register_problem()
class LibrispeechTrainFullTestClean(Librispeech):
  """Problem to train on full 960h, but evaluate on clean data only.

  This problem generates no data of its own (`generate_data` always raises).
  Its file-path lookups are delegated to the `Librispeech` implementation for
  training and to the `LibrispeechClean` implementation for dev and test.
  """

  # NOTE: the parent implementations are reached through the class rather
  # than through `super()`, so they are plain functions here and `self` has
  # to be passed explicitly. Without it `data_dir` would be bound as `self`
  # and the call would fail with a TypeError for the missing `shuffled`.

  def training_filepaths(self, data_dir, num_shards, shuffled):
    """Returns training file paths via `Librispeech.training_filepaths`."""
    return Librispeech.training_filepaths(
        self, data_dir, num_shards, shuffled)

  def dev_filepaths(self, data_dir, num_shards, shuffled):
    """Returns dev file paths via `LibrispeechClean.dev_filepaths`."""
    return LibrispeechClean.dev_filepaths(
        self, data_dir, num_shards, shuffled)

  def test_filepaths(self, data_dir, num_shards, shuffled):
    """Returns test file paths via `LibrispeechClean.test_filepaths`."""
    return LibrispeechClean.test_filepaths(
        self, data_dir, num_shards, shuffled)

  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Always raises; this problem does not generate data itself.

    Raises:
      Exception: always, pointing the user at the `librispeech` and
        `librispeech_clean` problems whose data should be generated instead.
    """
    raise Exception("Generate librispeech and librispeech_clean data.")
197
+
198
+
156
199
@registry.register_problem()
class LibrispeechCleanSmall(Librispeech):
  """Problem spec for Librispeech using 100h clean train and clean eval data."""

  # Select only the clean data: keep just the first entry of each list.
  # Dev and test use separate lists, so the dev split is not the test data.
  TRAIN_DATASETS = _LIBRISPEECH_TRAIN_DATASETS[:1]
  DEV_DATASETS = _LIBRISPEECH_DEV_DATASETS[:1]
  TEST_DATASETS = _LIBRISPEECH_TEST_DATASETS[:1]
163
207
164
208
165
209
@registry.register_problem()
class LibrispeechClean(Librispeech):
  """Problem spec for Librispeech using 460h clean train and clean eval data."""

  # Select only the clean data: the first two training entries and the first
  # entry of the dev and test lists. Dev and test use separate lists, so the
  # dev split is not the test data.
  TRAIN_DATASETS = _LIBRISPEECH_TRAIN_DATASETS[:2]
  DEV_DATASETS = _LIBRISPEECH_DEV_DATASETS[:1]
  TEST_DATASETS = _LIBRISPEECH_TEST_DATASETS[:1]
217
+
218
+
219
@registry.register_problem()
class LibrispeechNoisy(Librispeech):
  """Problem spec for Librispeech using ~500h noisy train and noisy eval data.

  The hour count follows from the sibling problems in this file: 960h for the
  full training set minus 460h for the clean part.
  """

  # Select only the noisy ("other") data: everything after the clean entries
  # of each list, i.e. the complement of the slices used by LibrispeechClean.
  TRAIN_DATASETS = _LIBRISPEECH_TRAIN_DATASETS[2:]
  DEV_DATASETS = _LIBRISPEECH_DEV_DATASETS[1:]
  TEST_DATASETS = _LIBRISPEECH_TEST_DATASETS[1:]
172
227
173
228
174
229
# TODO(lukaszkaiser): clean up hparams or remove from here.
0 commit comments