@@ -15,23 +15,17 @@ def _export_datasets(output_prefix):
15
15
16
16
# export all bliss corpora
17
17
for audio_format in ["flac" , "ogg" , "wav" ]:
18
- bliss_corpus_dict = get_bliss_corpus_dict (
19
- audio_format = audio_format , output_prefix = output_prefix
20
- )
18
+ bliss_corpus_dict = get_bliss_corpus_dict (audio_format = audio_format , output_prefix = output_prefix )
21
19
for name , bliss_corpus in bliss_corpus_dict .items ():
22
20
tk .register_output (
23
- os .path .join (
24
- output_prefix , "LibriSpeech" , "%s-%s.xml.gz" % (name , audio_format )
25
- ),
21
+ os .path .join (output_prefix , "LibriSpeech" , "%s-%s.xml.gz" % (name , audio_format )),
26
22
bliss_corpus ,
27
23
)
28
24
29
25
# export all ogg zip corpora
30
26
ogg_corpus_dict = get_ogg_zip_dict (output_prefix = output_prefix )
31
27
for name , ogg_corpus in ogg_corpus_dict .items ():
32
- tk .register_output (
33
- os .path .join (output_prefix , "LibriSpeech" , "%s.ogg.zip" % name ), ogg_corpus
34
- )
28
+ tk .register_output (os .path .join (output_prefix , "LibriSpeech" , "%s.ogg.zip" % name ), ogg_corpus )
35
29
36
30
37
31
def _export_lm_data (output_prefix ):
@@ -57,37 +51,27 @@ def _export_lexicon_and_vocab(output_prefix):
57
51
lexicon_output_prefix = os .path .join (output_prefix , "LibriSpeech" , "lexicon" )
58
52
59
53
# folded / without stress marker
60
- bliss_lexicon = get_bliss_lexicon (
61
- output_prefix = output_prefix , use_stress_marker = False
62
- )
54
+ bliss_lexicon = get_bliss_lexicon (output_prefix = output_prefix , use_stress_marker = False )
63
55
tk .register_output (
64
56
os .path .join (lexicon_output_prefix , "librispeech.lexicon.folded.xml.gz" ),
65
57
bliss_lexicon ,
66
58
)
67
59
68
- g2p_lexicon_dict = get_g2p_augmented_bliss_lexicon_dict (
69
- use_stress_marker = True , output_prefix = output_prefix
70
- )
60
+ g2p_lexicon_dict = get_g2p_augmented_bliss_lexicon_dict (use_stress_marker = True , output_prefix = output_prefix )
71
61
for k , lexicon in g2p_lexicon_dict .items ():
72
62
tk .register_output (
73
- os .path .join (
74
- lexicon_output_prefix , "%s.lexicon_with_g2p.folded.xml.gz" % k
75
- ),
63
+ os .path .join (lexicon_output_prefix , "%s.lexicon_with_g2p.folded.xml.gz" % k ),
76
64
lexicon ,
77
65
)
78
66
79
67
# with stress marker
80
- bliss_lexicon = get_bliss_lexicon (
81
- output_prefix = output_prefix , use_stress_marker = True
82
- )
68
+ bliss_lexicon = get_bliss_lexicon (output_prefix = output_prefix , use_stress_marker = True )
83
69
tk .register_output (
84
70
os .path .join (lexicon_output_prefix , "librispeech.lexicon.xml.gz" ),
85
71
bliss_lexicon ,
86
72
)
87
73
88
- g2p_lexicon_dict = get_g2p_augmented_bliss_lexicon_dict (
89
- use_stress_marker = False , output_prefix = output_prefix
90
- )
74
+ g2p_lexicon_dict = get_g2p_augmented_bliss_lexicon_dict (use_stress_marker = False , output_prefix = output_prefix )
91
75
for k , lexicon in g2p_lexicon_dict .items ():
92
76
tk .register_output (
93
77
os .path .join (lexicon_output_prefix , "%s.lexicon_with_g2p.xml.gz" % k ),
@@ -102,12 +86,8 @@ def _export_legacy_bpe(output_prefix):
102
86
:param str output_prefix
103
87
"""
104
88
lexicon_output_prefix = os .path .join (output_prefix , "LibriSpeech" , "bpe" )
105
- ls960_bpe_settings = get_subword_nmt_bpe (
106
- corpus_key = "train-other-960" , bpe_size = 10000 , output_prefix = output_prefix
107
- )
108
- ls100_bpe_settings = get_subword_nmt_bpe (
109
- corpus_key = "train-clean-100" , bpe_size = 2000 , output_prefix = output_prefix
110
- )
89
+ ls960_bpe_settings = get_subword_nmt_bpe (corpus_key = "train-other-960" , bpe_size = 10000 , output_prefix = output_prefix )
90
+ ls100_bpe_settings = get_subword_nmt_bpe (corpus_key = "train-clean-100" , bpe_size = 2000 , output_prefix = output_prefix )
111
91
tk .register_output (
112
92
os .path .join (lexicon_output_prefix , "train-other-960" , "bpe_10k.codes" ),
113
93
ls960_bpe_settings .bpe_codes ,
0 commit comments