Yurii Paniv committed on
Commit
7756f2b
1 Parent(s): 2a5583f

Update script for new dataset

Browse files
Files changed (1) hide show
  1. scripts/import_ukrainian.py +20 -5
scripts/import_ukrainian.py CHANGED
@@ -22,6 +22,7 @@ from deepspeech_training.util.importers import (
22
  print_import_report,
23
  )
24
  from ds_ctcdecoder import Alphabet
 
25
 
26
  FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
27
  SAMPLE_RATE = 16000
@@ -108,6 +109,12 @@ def one_sample(sample):
108
  return (counter, rows)
109
 
110
 
 
 
 
 
 
 
111
  def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None):
112
  # iterate over all data lists and write converted version near them
113
  speaker_iterator = 1
@@ -124,8 +131,13 @@ def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_cha
124
  os.path.dirname(subdir), "wav")
125
  file_dict = dict()
126
  for row in file.readlines():
127
- file_name, transcript = row.replace(
128
- " \n", "").split(" ", 1)
 
 
 
 
 
129
  if file_name.endswith(".wav"):
130
  pass
131
  elif file_name.endswith(".mp3"):
@@ -133,8 +145,10 @@ def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_cha
133
  elif file_name.find(".") == -1:
134
  file_name += ".wav"
135
 
136
- file_name = os.path.join(file_folder, file_name)
137
- file_dict[file_name] = transcript
 
 
138
 
139
  file.close()
140
 
@@ -176,7 +190,8 @@ def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_cha
176
  print("Writing CSV file for DeepSpeech.py as: ", output_csv)
177
  writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
178
  writer.writeheader()
179
- bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
 
180
  for filename, file_size, transcript, speaker in bar(rows):
181
  if space_after_every_character:
182
  writer.writerow(
 
22
  print_import_report,
23
  )
24
  from ds_ctcdecoder import Alphabet
25
+ import re
26
 
27
  FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
28
  SAMPLE_RATE = 16000
 
109
  return (counter, rows)
110
 
111
 
112
def convert_transcript(transcript):
    """Normalize a raw transcript string.

    Three steps are applied in order:

    1. A straight apostrophe (') that sits directly between two lowercase
       Cyrillic letters is replaced with the typographic apostrophe (’).
       The surrounding letters are kept.
    2. Every hyphen is replaced with a single space.
    3. Leading and trailing whitespace is stripped.

    :param transcript: transcript text to normalize
    :return: the normalized transcript
    """
    # Lookbehind/lookahead match only the apostrophe itself, so the
    # neighbouring letters are not consumed by the substitution.
    # "а-я" spans U+0430..U+044F and does not contain the Ukrainian letters
    # і, ї, є, ґ, so they are listed explicitly.
    transcript = re.sub(r"(?<=[а-яіїєґ])'(?=[а-яіїєґ])", "’", transcript)
    transcript = transcript.replace("-", " ")
    return transcript.strip()
116
+
117
+
118
  def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None):
119
  # iterate over all data lists and write converted version near them
120
  speaker_iterator = 1
 
131
  os.path.dirname(subdir), "wav")
132
  file_dict = dict()
133
  for row in file.readlines():
134
+ if row.isspace():
135
+ continue
136
+ splitted_row = row.replace("\n", "").replace(
137
+ " wav ", ".wav ").split(" ", 1)
138
+ if len(splitted_row) != 2:
139
+ continue
140
+ file_name, transcript = splitted_row
141
  if file_name.endswith(".wav"):
142
  pass
143
  elif file_name.endswith(".mp3"):
 
145
  elif file_name.find(".") == -1:
146
  file_name += ".wav"
147
 
148
+ if file_name.startswith("/"):
149
+ file_name = file_name[1::]
150
+ file_name = os.path.join(dataset_dir, file_name)
151
+ file_dict[file_name] = convert_transcript(transcript)
152
 
153
  file.close()
154
 
 
190
  print("Writing CSV file for DeepSpeech.py as: ", output_csv)
191
  writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
192
  writer.writeheader()
193
+ bar = progressbar.ProgressBar(
194
+ max_value=len(rows), widgets=SIMPLE_BAR)
195
  for filename, file_size, transcript, speaker in bar(rows):
196
  if space_after_every_character:
197
  writer.writerow(