marinone94 committed on
Commit
393fd68
•
1 Parent(s): 9be1ce7

correct filtering column

Browse files
Files changed (2) hide show
  1. run.sh +1 -1
  2. run_speech_recognition_ctc.py +3 -3
run.sh CHANGED
@@ -17,7 +17,7 @@ python run_speech_recognition_ctc.py \
17
  --evaluation_strategy="epoch" \
18
  --save_strategy="epoch" \
19
  --text_column_name="sentence" \
20
- --chars_to_ignore , ? . ! \- \; \: \" β€œ % β€˜ ” οΏ½ β€” ’ … – / \\ \
21
  --logging_steps="100" \
22
  --layerdrop="0.0" \
23
  --activation_dropout="0.1" \
 
17
  --evaluation_strategy="epoch" \
18
  --save_strategy="epoch" \
19
  --text_column_name="sentence" \
20
+ --chars_to_ignore , ? . ! \- \; \: \" β€œ % β€˜ ” οΏ½ β€” ’ … – / \
21
  --logging_steps="100" \
22
  --layerdrop="0.0" \
23
  --activation_dropout="0.1" \
run_speech_recognition_ctc.py CHANGED
@@ -521,9 +521,9 @@ def main():
521
 
522
  def remove_special_characters(batch):
523
  if chars_to_ignore_regex is not None:
524
- batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).replace("\\\\Punkt", "").replace("\\\\Komma", "").lower().decode("utf-8") + " "
525
  else:
526
- batch["target_text"] = batch[text_column_name].replace("\\\\Punkt", "").replace("\\\\Komma", "").lower().decode("utf-8") + " "
527
  return batch
528
 
529
  num_workers = data_args.preprocessing_num_workers
@@ -537,7 +537,7 @@ def main():
537
  raw_datasets = raw_datasets.filter(
538
  is_text_valid,
539
  num_proc=num_workers,
540
- input_columns=["input_length"],
541
  desc="remove single words, single chars and 'W O R D S'",
542
  )
543
 
 
521
 
522
  def remove_special_characters(batch):
523
  if chars_to_ignore_regex is not None:
524
+ batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[text_column_name]).replace("\\\\Punkt", "").replace("\\\\Komma", "").lower() + " "
525
  else:
526
+ batch["target_text"] = batch[text_column_name].replace("\\\\Punkt", "").replace("\\\\Komma", "").lower() + " "
527
  return batch
528
 
529
  num_workers = data_args.preprocessing_num_workers
 
537
  raw_datasets = raw_datasets.filter(
538
  is_text_valid,
539
  num_proc=num_workers,
540
+ input_columns=["target_text"],
541
  desc="remove single words, single chars and 'W O R D S'",
542
  )
543