Harveenchadha committed on
Commit
249704b
•
1 Parent(s): c0377ce

Update evaluations/eval.py

Browse files
Files changed (1) hide show
  1. evaluations/eval.py +21 -13
evaluations/eval.py CHANGED
@@ -1,14 +1,22 @@
1
  #!/usr/bin/env python3
 
 
 
 
 
 
2
  import argparse
3
  import re
4
  from typing import Dict
5
 
6
- import torch
7
  from datasets import Audio, Dataset, load_dataset, load_metric
8
 
9
  from transformers import AutoFeatureExtractor, pipeline
10
 
11
 
 
 
 
12
  def log_results(result: Dataset, args: Dict[str, str]):
13
  """DO NOT CHANGE. This function computes and logs the result metrics."""
14
 
@@ -50,9 +58,9 @@ def log_results(result: Dataset, args: Dict[str, str]):
50
  def normalize_text(text: str) -> str:
51
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
52
 
53
- chars_to_ignore_regex = '[,?.!\-\;\:"β€œ%β€˜β€οΏ½β€”β€™β€¦β€“]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
54
 
55
- text = re.sub(chars_to_ignore_regex, "", text.lower())
56
 
57
  # In addition, we can normalize the target text, e.g. removing new lines characters etc...
58
  # note that order is important here!
@@ -63,6 +71,12 @@ def normalize_text(text: str) -> str:
63
 
64
  return text
65
 
 
 
 
 
 
 
66
 
67
  def main(args):
68
  # load dataset
@@ -79,9 +93,7 @@ def main(args):
79
  dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
80
 
81
  # load eval pipeline
82
- if args.device is None:
83
- args.device = 0 if torch.cuda.is_available() else -1
84
- asr = pipeline("automatic-speech-recognition", model=args.model_id, device=args.device)
85
 
86
  # map function to decode audio
87
  def map_to_pred(batch):
@@ -89,8 +101,10 @@ def main(args):
89
  batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
90
  )
91
 
92
- batch["prediction"] = prediction["text"].replace('<s>','')
93
  batch["target"] = normalize_text(batch["sentence"])
 
 
94
  return batch
95
 
96
  # run inference on all examples
@@ -126,12 +140,6 @@ if __name__ == "__main__":
126
  parser.add_argument(
127
  "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
128
  )
129
- parser.add_argument(
130
- "--device",
131
- type=int,
132
- default=None,
133
- help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
134
- )
135
  args = parser.parse_args()
136
 
137
  main(args)
 
1
  #!/usr/bin/env python3
2
+
3
+ #pip install indic-nlp-library
4
+ from indicnlp.tokenize.indic_tokenize import trivial_tokenize
5
+ from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
6
+
7
+
8
  import argparse
9
  import re
10
  from typing import Dict
11
 
 
12
  from datasets import Audio, Dataset, load_dataset, load_metric
13
 
14
  from transformers import AutoFeatureExtractor, pipeline
15
 
16
 
17
+ indic_normalizer_factory = IndicNormalizerFactory()
18
+ indic_normalizer = indic_normalizer_factory.get_normalizer('hi')
19
+
20
  def log_results(result: Dataset, args: Dict[str, str]):
21
  """DO NOT CHANGE. This function computes and logs the result metrics."""
22
 
 
58
  def normalize_text(text: str) -> str:
59
  """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
60
 
61
+ chars_to_ignore_regex = '[ΰ₯€,?.!\-\;\:"β€œ%β€˜β€οΏ½β€”β€™β€¦β€“]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
62
 
63
+ text = re.sub(chars_to_ignore_regex, "", text.lower().strip())
64
 
65
  # In addition, we can normalize the target text, e.g. removing new lines characters etc...
66
  # note that order is important here!
 
71
 
72
  return text
73
 
74
+ def normalize_text_indic(text:str) -> str:
75
+ lang='hi'
76
+ normalized = indic_normalizer.normalize(text)
77
+ processed = ' '.join(trivial_tokenize(normalized, lang))
78
+ return processed
79
+
80
 
81
  def main(args):
82
  # load dataset
 
93
  dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
94
 
95
  # load eval pipeline
96
+ asr = pipeline("automatic-speech-recognition", model=args.model_id)
 
 
97
 
98
  # map function to decode audio
99
  def map_to_pred(batch):
 
101
  batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
102
  )
103
 
104
+ batch["prediction"] = prediction["text"]
105
  batch["target"] = normalize_text(batch["sentence"])
106
+ batch["prediction"] = normalize_text_indic(batch["prediction"] )
107
+ batch["target"] = normalize_text_indic(batch["target"] )
108
  return batch
109
 
110
  # run inference on all examples
 
140
  parser.add_argument(
141
  "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
142
  )
 
 
 
 
 
 
143
  args = parser.parse_args()
144
 
145
  main(args)