Yurii Paniv committed on
Commit
03f568d
1 Parent(s): 369ee40

Update and rename converter.py to import_ukrainian.py

Files changed (2)
  1. scripts/converter.py +0 -97
  2. scripts/import_ukrainian.py +240 -0
scripts/converter.py DELETED
@@ -1,97 +0,0 @@
- import os
- from random import shuffle
- from shutil import copyfile
-
- # file template needed for import script
- template = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}"
- # structure example below
- # client_id path sentence up_votes down_votes age gender accent locale segment
- structure = template.format("client_id", "path", "sentence", "up_votes",
-                             "down_votes", "age", "gender", "accent", "locale", "segment")
-
- iterator = 1
- speaker_iterator = 1
-
-
- def write_dataset(path, name, data):
-     """
-     Function to write converted data list
-     """
-     global iterator
-     global speaker_iterator
-     file_path = os.path.join(path, name)
-     clip_path = os.path.join(os.path.dirname(path), "wav")
-     result = open(file_path, mode="w", encoding="utf-8")
-     result.write(structure)
-     result.write("\n")
-     for row in data:
-         file_name = row[0]
-         if file_name.endswith(".wav"):
-             pass
-         elif file_name.endswith(".mp3"):
-             pass
-         elif file_name.find(".") == -1:
-             file_name += ".wav"
-         parted_name = file_name.split(".")
-
-         new_file_name = f"{iterator}." + parted_name[1]
-
-         old_file_path = os.path.join(clip_path, file_name)
-         new_file_path = os.path.join("clips", new_file_name)
-         if os.path.exists(old_file_path):
-             copyfile(old_file_path,
-                      new_file_path)
-             result.write(template.format(
-                 speaker_iterator, new_file_name, row[1], "", "", "", "", "", "uk", "\n"))
-             speaker_iterator += 1
-             iterator += 1
-         else:
-             print("File not found", old_file_path)
-     result.close()
-
-
- if not os.path.exists("clips"):
-     os.makedirs("clips")  # create folder to contain processed clips
-
- # iterate over all data lists and write converted version near them
- for subdir, dirs, files in os.walk(os.path.abspath(os.path.curdir)):
-     print(subdir)
-     for file in files:
-         if file == "txt.final.data":
-             file_path = os.path.join(subdir, file)
-             file = open(file_path, mode="r")
-             data = [row.replace(" \n", "").split(" ", 1)
-                     for row in file.readlines()]
-             file.close()
-
-             shuffle(data)
-
-             dataset_size = len(data)
-             train_point = int(dataset_size * 0.8)
-             dev_point = int(train_point + (dataset_size - train_point) / 2)
-             # split dataset
-             write_dataset(subdir, "train.tsv", data[:train_point])
-             write_dataset(subdir, "dev.tsv", data[train_point:dev_point])
-             write_dataset(subdir, "test.tsv", data[dev_point:])
-
- # write dataset splits into single files
- final_files = {
-     "train.tsv": open("train.tsv", mode="w", encoding="utf-8"),
-     "dev.tsv": open("dev.tsv", mode="w", encoding="utf-8"),
-     "test.tsv": open("test.tsv", mode="w", encoding="utf-8")
- }
- for file in final_files.values():
-     file.write(structure)
-     file.write("\n")
-
- for subdir, dirs, files in os.walk(os.path.curdir):
-     for file in files:
-         if file in ["train.tsv", "dev.tsv", "test.tsv"]:
-             input_file = open(os.path.join(subdir, file))
-             data = [row for row in input_file.readlines()][1::]
-             input_file.close()
-             for row in data:
-                 final_files[file].write(row)
-
- for file in final_files.values():
-     file.close()
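
For reference, the deleted script shuffled each txt.final.data list and split it roughly 80/10/10 across train/dev/test. A minimal sketch of that split arithmetic, assuming a 100-row list purely for illustration:

    dataset_size = 100                                               # assumed size
    train_point = int(dataset_size * 0.8)                            # 80
    dev_point = int(train_point + (dataset_size - train_point) / 2)  # 90
    # train: data[:80], dev: data[80:90], test: data[90:]
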
scripts/import_ukrainian.py ADDED
@@ -0,0 +1,240 @@
+ #!/usr/bin/env python
+ """
+ This script transforms a custom dataset, gathered from the Internet, into
+ a DeepSpeech-ready .csv file.
+ Use "python3 import_ukrainian.py -h" for help.
+ """
+ import csv
+ import os
+ import subprocess
+ import unicodedata
+ from multiprocessing import Pool
+
+ import progressbar
+ import sox
+
+ from deepspeech_training.util.downloader import SIMPLE_BAR
+ from deepspeech_training.util.importers import (
+     get_counter,
+     get_imported_samples,
+     get_importers_parser,
+     get_validate_label,
+     print_import_report,
+ )
+ from ds_ctcdecoder import Alphabet
+
+ FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
+ SAMPLE_RATE = 16000
+ CHANNELS = 1
+ MAX_SECS = 10
+ PARAMS = None
+ FILTER_OBJ = None
+ AUDIO_DIR = None
+
+
+ class LabelFilter:
+     def __init__(self, normalize, alphabet, validate_fun):
+         self.normalize = normalize
+         self.alphabet = alphabet
+         self.validate_fun = validate_fun
+
+     def filter(self, label):
+         if self.normalize:
+             label = unicodedata.normalize("NFKD", label.strip()).encode(
+                 "ascii", "ignore").decode("ascii", "ignore")
+         label = self.validate_fun(label)
+         if self.alphabet and label and not self.alphabet.CanEncode(label):
+             label = None
+         return label
+
+
+ def init_worker(params):
+     global FILTER_OBJ  # pylint: disable=global-statement
+     global AUDIO_DIR  # pylint: disable=global-statement
+     AUDIO_DIR = params.audio_dir if params.audio_dir else os.path.join(
+         params.tsv_dir, "clips")
+     validate_label = get_validate_label(params)
+     alphabet = Alphabet(
+         params.filter_alphabet) if params.filter_alphabet else None
+     FILTER_OBJ = LabelFilter(params.normalize, alphabet, validate_label)
+
+
+ def one_sample(sample):
+     """ Take an audio file, and optionally convert it to 16kHz WAV """
+     global AUDIO_DIR
+     source_filename = sample[0]
+     if not os.path.splitext(source_filename.lower())[1] == ".wav":
+         source_filename += ".wav"
+     # Converted wav files are written into AUDIO_DIR, named by sample index
+     output_filename = f"{sample[2]}.wav"
+     output_filepath = os.path.join(AUDIO_DIR, output_filename)
+     _maybe_convert_wav(source_filename, output_filepath)
+     file_size = -1
+     frames = 0
+     if os.path.exists(output_filepath):
+         file_size = os.path.getsize(output_filepath)
+         if file_size == 0:
+             frames = 0
+         else:
+             frames = int(
+                 subprocess.check_output(
+                     ["soxi", "-s", output_filepath], stderr=subprocess.STDOUT
+                 )
+             )
+     label = FILTER_OBJ.filter(sample[1])
+     rows = []
+     counter = get_counter()
+     if file_size == -1:
+         # Excluding samples that failed upon conversion
+         counter["failed"] += 1
+     elif label is None:
+         # Excluding samples that failed on label validation
+         counter["invalid_label"] += 1
+     # the + 1 helps filter a surnames dataset that contains very short audio clips
+     elif int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(label)) + 1:
+         # Excluding samples that are too short to fit the transcript
+         counter["too_short"] += 1
+     elif frames / SAMPLE_RATE > MAX_SECS:
+         # Excluding very long samples to keep a reasonable batch-size
+         counter["too_long"] += 1
+     else:
+         # This one is good - keep it for the target CSV
+         rows.append((os.path.split(output_filename)[-1],
+                      file_size, label, sample[2]))
+         counter["imported_time"] += frames
+     counter["all"] += 1
+     counter["total_time"] += frames
+
+     return (counter, rows)
+
+
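The too_short check above converts the clip length into a count of 20 ms feature windows (seconds * 1000 / 10 / 2) and drops the sample when there are fewer windows than transcript characters plus one. A minimal worked example with assumed numbers (not from the commit):

    frames = 32000                                 # a 2-second clip at 16 kHz
    windows = int(frames / 16000 * 1000 / 10 / 2)  # 100 twenty-ms windows
    label = "приклад транскрипту"                  # 19-character transcript
    too_short = windows < len(label) + 1           # False -> the clip is kept
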
+ def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None):
+     # Collect (audio path, transcript, index) samples from every txt.final.data list
+     speaker_iterator = 1
+
+     samples = []
+     for subdir, dirs, files in os.walk(dataset_dir):
+         for file in files:
+             # Each txt.final.data row holds an audio file name and its transcript
+             if file == "txt.final.data":
+                 file_path = os.path.join(subdir, file)
+                 file = open(file_path, mode="r")
+                 data = []
+                 for row in file.readlines():
+                     file_name, transcript = row.replace(
+                         " \n", "").split(" ", 1)
+
+                     if file_name.endswith(".wav"):
+                         pass
+                     elif file_name.endswith(".mp3"):
+                         pass
+                     elif file_name.find(".") == -1:
+                         file_name += ".wav"
+                     file_name = os.path.join(
+                         os.path.dirname(subdir), "wav", file_name)
+                     data.append(
+                         (file_name, transcript, speaker_iterator))
+                     speaker_iterator += 1
+
+                 file.close()
+
+                 samples += data
+
+     if rows is None:
+         rows = []
+     counter = get_counter()
+     num_samples = len(samples)
+     print("Importing dataset files...")
+     pool = Pool(initializer=init_worker, initargs=(PARAMS,))
+     bar = progressbar.ProgressBar(
+         max_value=num_samples, widgets=SIMPLE_BAR)
+     for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
+         counter += processed[0]
+         rows += processed[1]
+         bar.update(i)
+     bar.update(num_samples)
+     pool.close()
+     pool.join()
+
+     imported_samples = get_imported_samples(counter)
+     assert counter["all"] == num_samples
+     assert len(rows) == imported_samples
+     print_import_report(counter, SAMPLE_RATE, MAX_SECS)
+
+     output_csv = os.path.join(os.path.abspath(audio_dir), "train.csv")
+     print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)
+     with open(output_csv, "w", encoding="utf-8", newline="") as output_csv_file:
+         print("Writing CSV file for DeepSpeech.py as: ", output_csv)
+         writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
+         writer.writeheader()
+         bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
+         for filename, file_size, transcript, speaker in bar(rows):
+             if space_after_every_character:
+                 writer.writerow(
+                     {
+                         "wav_filename": filename,
+                         "wav_filesize": file_size,
+                         "transcript": " ".join(transcript),
+                     }
+                 )
+             else:
+                 writer.writerow(
+                     {
+                         "wav_filename": filename,
+                         "wav_filesize": file_size,
+                         "transcript": transcript,
+                     }
+                 )
+     return rows
+
+
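With --space_after_every_character, the transcript is written with a space between every character, which Python's str.join handles directly; a quick illustration with an assumed word:

    " ".join("привіт")  # -> "п р и в і т"
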
+ def _preprocess_data(tsv_dir, audio_dir, space_after_every_character=False):
+     # space_after_every_character is passed by keyword so it is not taken as filter_obj
+     _maybe_convert_set(tsv_dir, audio_dir, None,
+                        space_after_every_character=space_after_every_character)
+
+
+ def _maybe_convert_wav(mp3_filename, wav_filename):
+     if not os.path.exists(wav_filename):
+         transformer = sox.Transformer()
+         transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS)
+         try:
+             transformer.build(mp3_filename, wav_filename)
+         except sox.core.SoxError:
+             pass
+
+
+ def parse_args():
+     parser = get_importers_parser(
+         description="Import a custom Ukrainian corpus")
+     parser.add_argument("tsv_dir", help="Directory containing tsv files")
+     parser.add_argument(
+         "--audio_dir",
+         help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"',
+     )
+     parser.add_argument(
+         "--filter_alphabet",
+         help="Exclude samples with characters not in provided alphabet",
+     )
+     parser.add_argument(
+         "--normalize",
+         action="store_true",
+         help="Converts diacritic characters to their base ones",
+     )
+     parser.add_argument(
+         "--space_after_every_character",
+         action="store_true",
+         help="To help transcript join by white space",
+     )
+     return parser.parse_args()
+
+
+ def main():
+     audio_dir = PARAMS.audio_dir if PARAMS.audio_dir else os.path.join(
+         PARAMS.tsv_dir, "clips")
+     _preprocess_data(PARAMS.tsv_dir, audio_dir,
+                      PARAMS.space_after_every_character)
+
+
+ if __name__ == "__main__":
+     PARAMS = parse_args()
+     main()
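
Assuming a working DeepSpeech training environment (deepspeech_training and ds_ctcdecoder importable), the new script would be invoked roughly as `python3 import_ukrainian.py /path/to/dataset --filter_alphabet alphabet.txt`, where both paths are placeholders; see parse_args above for the full option list, or run it with -h as the docstring suggests.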