Abid committed on
Commit
ff0d701
·
1 Parent(s): 1c78367
Files changed (1) hide show
  1. eval.py +8 -4
eval.py CHANGED
@@ -61,8 +61,6 @@ def normalize_text(text: str) -> str:
61
  text = re.sub("['ّ]", '', text)
62
  text = re.sub("['ٔ]", '', text)
63
  text = re.sub("['ٰ]", '', text)
64
- # batch["sentence"] = re.sub("[ء]", '', batch["sentence"])
65
- # batch["sentence"] = re.sub("[آ]", 'ا', batch["sentence"])
66
  text = re.sub("[ۂ]", 'ہ', text)
67
  text = re.sub("[ي]", "ی",text)
68
  text = re.sub("[ؤ]", "و", text)
@@ -74,15 +72,20 @@ def normalize_text(text: str) -> str:
74
  # note that order is important here!
75
  token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
76
 
 
77
  for t in token_sequences_to_ignore:
78
  text = " ".join(text.split(t))
79
 
80
  return text
81
 
 
 
 
82
 
83
  def main(args):
84
  # load dataset
85
  dataset = load_dataset(args.dataset, args.config,delimiter="\t",split=args.split, use_auth_token=True)
 
86
 
87
  # for testing: only process the first two examples as a test
88
  # dataset = dataset.select(range(10))
@@ -92,7 +95,8 @@ def main(args):
92
  sampling_rate = feature_extractor.sampling_rate
93
 
94
  # resample audio
95
- dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
 
96
 
97
  # load eval pipeline
98
  if args.device is None:
@@ -102,7 +106,7 @@ def main(args):
102
  # map function to decode audio
103
  def map_to_pred(batch):
104
  prediction = asr(
105
- batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
106
  )
107
 
108
  batch["prediction"] = prediction["text"]
 
61
  text = re.sub("['ّ]", '', text)
62
  text = re.sub("['ٔ]", '', text)
63
  text = re.sub("['ٰ]", '', text)
 
 
64
  text = re.sub("[ۂ]", 'ہ', text)
65
  text = re.sub("[ي]", "ی",text)
66
  text = re.sub("[ؤ]", "و", text)
 
72
  # note that order is important here!
73
  token_sequences_to_ignore = ["\n\n", "\n", " ", " "]
74
 
75
+
76
  for t in token_sequences_to_ignore:
77
  text = " ".join(text.split(t))
78
 
79
  return text
80
 
81
+ def path_adjust(batch):
82
+ batch["path"] = "Data/ur/clips/"+str(batch["path"])
83
+ return batch
84
 
85
  def main(args):
86
  # load dataset
87
  dataset = load_dataset(args.dataset, args.config,delimiter="\t",split=args.split, use_auth_token=True)
88
+
89
 
90
  # for testing: only process the first two examples as a test
91
  # dataset = dataset.select(range(10))
 
95
  sampling_rate = feature_extractor.sampling_rate
96
 
97
  # resample audio
98
+ dataset = dataset.cast_column("path", path_adjust())
99
+ dataset = dataset.cast_column("path", Audio(sampling_rate=sampling_rate))
100
 
101
  # load eval pipeline
102
  if args.device is None:
 
106
  # map function to decode audio
107
  def map_to_pred(batch):
108
  prediction = asr(
109
+ batch["path"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
110
  )
111
 
112
  batch["prediction"] = prediction["text"]