Yurii Paniv committed on
Commit
2a5583f
1 Parent(s): 4b03cbe

Improve file parsing from folder

Browse files
Files changed (1) hide show
  1. scripts/import_ukrainian.py +19 -10
scripts/import_ukrainian.py CHANGED
@@ -111,34 +111,43 @@ def one_sample(sample):
111
  def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None):
112
  # iterate over all data lists and write converted version near them
113
  speaker_iterator = 1
114
-
115
  samples = []
 
116
  for subdir, dirs, files in os.walk(dataset_dir):
117
  for file in files:
118
  # Get audiofile path and transcript for each sentence in tsv
119
- if file == "txt.final.data":
120
  file_path = os.path.join(subdir, file)
121
  file = open(file_path, mode="r")
122
  data = []
 
 
 
123
  for row in file.readlines():
124
  file_name, transcript = row.replace(
125
  " \n", "").split(" ", 1)
126
-
127
  if file_name.endswith(".wav"):
128
  pass
129
  elif file_name.endswith(".mp3"):
130
  pass
131
  elif file_name.find(".") == -1:
132
  file_name += ".wav"
133
- file_name = os.path.join(os.path.join(
134
- os.path.dirname(subdir), "wav"), file_name)
135
- data.append(
136
- (file_name, transcript, speaker_iterator))
137
- speaker_iterator += 1
138
 
139
  file.close()
140
 
141
- samples += data
 
 
 
 
 
 
 
 
 
142
 
143
  if rows is None:
144
  rows = []
@@ -199,7 +208,7 @@ def _maybe_convert_wav(mp3_filename, wav_filename):
199
  transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS)
200
  try:
201
  transformer.build(mp3_filename, wav_filename)
202
- except Exception as e: # TODO: improve exception handling
203
  pass
204
 
205
 
 
111
  def _maybe_convert_set(dataset_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None):
112
  # iterate over all data lists and write converted version near them
113
  speaker_iterator = 1
 
114
  samples = []
115
+ total_file_dict = dict()
116
  for subdir, dirs, files in os.walk(dataset_dir):
117
  for file in files:
118
  # Get audiofile path and transcript for each sentence in tsv
119
+ if file.endswith(".data"):
120
  file_path = os.path.join(subdir, file)
121
  file = open(file_path, mode="r")
122
  data = []
123
+ file_folder = os.path.join(
124
+ os.path.dirname(subdir), "wav")
125
+ file_dict = dict()
126
  for row in file.readlines():
127
  file_name, transcript = row.replace(
128
  " \n", "").split(" ", 1)
 
129
  if file_name.endswith(".wav"):
130
  pass
131
  elif file_name.endswith(".mp3"):
132
  pass
133
  elif file_name.find(".") == -1:
134
  file_name += ".wav"
135
+
136
+ file_name = os.path.join(file_folder, file_name)
137
+ file_dict[file_name] = transcript
 
 
138
 
139
  file.close()
140
 
141
+ for wav_subdir, wav_dirs, wav_files in os.walk(file_folder):
142
+ for wav_file in wav_files:
143
+ wav_file_path = os.path.join(wav_subdir, wav_file)
144
+ if file_dict.get(wav_file_path) is not None:
145
+ total_file_dict[wav_file_path] = file_dict[wav_file_path]
146
+
147
+ for key in total_file_dict.keys():
148
+ samples.append((key, total_file_dict[key], speaker_iterator))
149
+ speaker_iterator += 1
150
+ del(total_file_dict)
151
 
152
  if rows is None:
153
  rows = []
 
208
  transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS)
209
  try:
210
  transformer.build(mp3_filename, wav_filename)
211
+ except Exception as e: # TODO: improve exception handling
212
  pass
213
 
214