marinone94 committed on
Commit
cd904f4
1 Parent(s): 71e9ea9

add decoding to get correct Swedish chars

Browse files
Files changed (1) hide show
  1. run_speech_recognition_ctc.py +2 -2
run_speech_recognition_ctc.py CHANGED
@@ -521,9 +521,9 @@ def main():
521
 
522
  def remove_special_characters(batch):
523
  if chars_to_ignore_regex is not None:
524
- batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).replace("\\\\Punkt").replace("\\\\Komma").lower() + " "
525
  else:
526
- batch["target_text"] = batch[text_column_name].replace("\\\\Punkt").replace("\\\\Komma").lower() + " "
527
  return batch
528
 
529
  num_workers = data_args.preprocessing_num_workers
 
521
 
522
  def remove_special_characters(batch):
523
  if chars_to_ignore_regex is not None:
524
+ batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).replace("\\\\Punkt").replace("\\\\Komma").lower().decode("utf-8") + " "
525
  else:
526
+ batch["target_text"] = batch[text_column_name].replace("\\\\Punkt").replace("\\\\Komma").lower().decode("utf-8") + " "
527
  return batch
528
 
529
  num_workers = data_args.preprocessing_num_workers