tomascufaro committed on
Commit
58b2225
1 Parent(s): 0a897e8

evaluated without lm

Browse files
eval.py CHANGED
@@ -46,12 +46,25 @@ def log_results(result: Dataset, args: Dict[str, str]):
46
 
47
  result.map(write_to_file, with_indices=True)
48
 
 
 
 
 
49
 
 
 
 
 
 
 
 
 
 
50
  def normalize_text(text: str) -> str:
51
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
52
 
53
  chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
54
-
55
  text = re.sub(chars_to_ignore_regex, "", text.lower())
56
 
57
  # In addition, we can normalize the target text, e.g. removing new lines characters etc...
@@ -60,7 +73,10 @@ def normalize_text(text: str) -> str:
60
 
61
  for t in token_sequences_to_ignore:
62
  text = " ".join(text.split(t))
63
-
 
 
 
64
  return text
65
 
66
 
@@ -69,8 +85,14 @@ def main(args):
69
  dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
70
 
71
  # for testing: only process the first two examples as a test
72
- # dataset = dataset.select(range(10))
73
-
 
 
 
 
 
 
74
  # load processor
75
  feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
76
  sampling_rate = feature_extractor.sampling_rate
@@ -88,14 +110,12 @@ def main(args):
88
  prediction = asr(
89
  batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
90
  )
91
-
92
  batch["prediction"] = prediction["text"]
93
  batch["target"] = normalize_text(batch["sentence"])
94
  return batch
95
 
96
  # run inference on all examples
97
  result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
98
-
99
  # compute and log_results
100
  # do not change function below
101
  log_results(result, args)
46
 
47
  result.map(write_to_file, with_indices=True)
48
 
49
def clean_batch(text):
    """Lowercase ``text`` and strip every character outside the kept alphabet.

    The kept alphabet is ASCII ``a``-``z``, the space character, and the
    Latin-1 accented letters ``à``-``ý`` with ``æ`` and ``ð`` excluded.
    Everything else (digits, punctuation, ``ß``, ``þ``, ``ÿ``, the ``×`` and
    ``÷`` signs, non-Latin scripts, ...) is deleted, not replaced.

    Args:
        text: The string to clean.

    Returns:
        The lowercased string containing only characters of the kept alphabet.
    """
    # Lowercase BEFORE filtering so upper- and lowercase input are treated
    # identically. Filtering first with the range ``À-ú`` kept ``Ü`` but
    # deleted ``ü``, which silently removed the Spanish dieresis from
    # already-lowercased text.
    text = text.lower()
    # The explicit sub-ranges skip æ (U+00E6), ð (U+00F0), ÷ (U+00F7),
    # þ (U+00FE) and ÿ (U+00FF); ß (U+00DF) and × (U+00D7) lie below ``à``.
    return re.sub("[^a-zà-åç-ïñ-öø-ý ]", "", text)
53
 
54
def homologate_accents(text):
    """Fold a fixed set of accented lowercase letters to their ASCII base letter.

    Mappings applied: ``â ã ä å à`` -> ``a``; ``é ê ë`` -> ``e``;
    ``ì î ï`` -> ``i``; ``ö õ ô ò ø`` -> ``o``; ``ù`` -> ``u``; ``ç`` -> ``c``.
    Every other character, including uppercase letters, is returned unchanged.

    Args:
        text: The string to fold.

    Returns:
        ``text`` with the characters listed above replaced.
    """
    # A translation table replaces exactly the listed characters. The earlier
    # regex character classes such as "[â|ã|ä|å|à]" also contained a literal
    # "|", so every pipe in the input was rewritten to "a".
    # NOTE(review): "é" is folded while "á í ó ú" are left alone and "è" is
    # not handled -- this looks like a typo for "è"; confirm against the
    # model vocabulary before changing it.
    folding = str.maketrans(
        {
            "â": "a", "ã": "a", "ä": "a", "å": "a", "à": "a",
            "é": "e", "ê": "e", "ë": "e",
            "ì": "i", "î": "i", "ï": "i",
            "ö": "o", "õ": "o", "ô": "o", "ò": "o", "ø": "o",
            "ù": "u",
            "ç": "c",
        }
    )
    return text.translate(folding)
62
+
63
  def normalize_text(text: str) -> str:
64
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
65
 
66
  chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
67
+ text = text.lower()
68
  text = re.sub(chars_to_ignore_regex, "", text.lower())
69
 
70
  # In addition, we can normalize the target text, e.g. removing new lines characters etc...
73
 
74
  for t in token_sequences_to_ignore:
75
  text = " ".join(text.split(t))
76
+
77
+ #added functions
78
+ text = homologate_accents(text)
79
+ text = clean_batch(text)
80
  return text
81
 
82
 
85
  dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
86
 
87
  # for testing: only process the first two examples as a test
88
+ #dataset = dataset.select(range(15))
89
+ # vocab = [character for character in "aábcdeéfghiíjklmnñoópqrstuúüvwxyz·-."]
90
+
91
+ # dataset = dataset.filter(
92
+ # lambda example: not any((c not in vocab) for c in example),
93
+ # input_columns='sentence',
94
+ # desc="remove examples with weird characters"
95
+ # )
96
  # load processor
97
  feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
98
  sampling_rate = feature_extractor.sampling_rate
110
  prediction = asr(
111
  batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
112
  )
 
113
  batch["prediction"] = prediction["text"]
114
  batch["target"] = normalize_text(batch["sentence"])
115
  return batch
116
 
117
  # run inference on all examples
118
  result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
 
119
  # compute and log_results
120
  # do not change function below
121
  log_results(result, args)
mozilla-foundation_common_voice_8_0_es_test_eval_results.txt ADDED
@@ -0,0 +1,2 @@
 
 
1
+ WER: 0.12618083227750462
2
+ CER: 0.035028395923434555
mozilla-foundation_common_voice_8_0_es_validation_eval_results.txt ADDED
@@ -0,0 +1,2 @@
 
 
1
+ WER: 0.10670647680293982
2
+ CER: 0.0284079393233586