Charalampos committed on
Commit
2de8373
1 Parent(s): 280cb76

added fleurs

Browse files
run_speech_recognition_seq2seq_streaming.py CHANGED
@@ -358,14 +358,13 @@ def main():
358
  common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
359
  common_voice = common_voice.remove_columns(set(common_voice.features.keys()) - set(["audio", "sentence"]))
360
 
361
- #fleurs = load_maybe_streaming_dataset("google/fleurs", "el_gr", split="train+validation+test")
362
- #fleurs = fleurs.cast_column("audio", Audio(sampling_rate=16000))
363
- #fleurs = fleurs.rename_column("raw_transcription", "sentence")
364
- #fleurs = fleurs.remove_columns(set(fleurs.features.keys()) - set(["audio", "sentence"]))
365
 
366
- #all_datasets = [common_voice, fleurs]
367
- #raw_datasets["train"] = interleave_datasets(all_datasets, stopping_strategy="all_exhausted")
368
- raw_datasets["train"] = common_voice
369
 
370
  """
371
  raw_datasets["train"] = load_maybe_streaming_dataset(
 
358
  common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
359
  common_voice = common_voice.remove_columns(set(common_voice.features.keys()) - set(["audio", "sentence"]))
360
 
361
+ fleurs = load_maybe_streaming_dataset("google/fleurs", "el_gr", split="train+validation+test")
362
+ fleurs = fleurs.cast_column("audio", Audio(sampling_rate=16000))
363
+ fleurs = fleurs.rename_column("raw_transcription", "sentence")
364
+ fleurs = fleurs.remove_columns(set(fleurs.features.keys()) - set(["audio", "sentence"]))
365
 
366
+ all_datasets = [common_voice, fleurs]
367
+ raw_datasets["train"] = interleave_datasets(all_datasets, stopping_strategy="all_exhausted")
 
368
 
369
  """
370
  raw_datasets["train"] = load_maybe_streaming_dataset(