Spaces:
Sleeping
Sleeping
from datasets import load_dataset | |
from datasets import Audio | |
class Dataset: | |
def __init__(self, n:int = 100): | |
self.n = n | |
self.options = ['LibriSpeech Clean', 'LibriSpeech Other', 'Common Voice', 'VoxPopuli', 'TEDLIUM', 'GigaSpeech', 'SPGISpeech', 'AMI', 'OWN'] | |
self.selected = None | |
self.dataset = None | |
self.text = None | |
def get_options(self): | |
return self.options | |
def _check_text(self): | |
sample = next(iter(self.dataset)) | |
print(sample) | |
self._get_text(sample) | |
def _get_text(self, sample): | |
if "text" in sample: | |
self.text = "text" | |
return sample["text"] | |
elif "sentence" in sample: | |
self.text = "sentence" | |
return sample["sentence"] | |
elif "normalized_text" in sample: | |
self.text = "normalized_text" | |
return sample["normalized_text"] | |
elif "transcript" in sample: | |
self.text = "transcript" | |
return sample["transcript"] | |
else: | |
raise ValueError(f"Sample: {sample.keys()} has no transcript.") | |
def filter(self, input_column:str = None): | |
if input_column is None: | |
if self.text is not None: | |
input_column = self.text | |
else: | |
input_column = self._check_text() | |
def is_target_text_in_range(ref): | |
if ref.strip() == "ignore time segment in scoring": | |
return False | |
else: | |
return ref.strip() != "" | |
self.dataset = self.dataset.filter(is_target_text_in_range, input_columns=[input_column]) | |
return self.dataset | |
def normalised(self, normalise): | |
self.dataset = self.dataset.map(normalise) | |
def _select(self, option:str): | |
if option not in self.options: | |
raise ValueError(f"This value is not an option, please see: {self.options}") | |
self.selected = option | |
def _preprocess(self): | |
self.dataset = self.dataset.take(self.n) | |
self.dataset = self.dataset.cast_column("audio", Audio(sampling_rate=16000)) | |
def load(self, option:str = None): | |
self._select(option) | |
if option == "OWN": | |
pass | |
elif option == "LibriSpeech Clean": | |
self.dataset = load_dataset("librispeech_asr", "all", split="test.clean", streaming=True) | |
elif option == "LibriSpeech Other": | |
self.dataset = load_dataset("librispeech_asr", "all", split="test.other", streaming=True) | |
elif option == "Common Voice": | |
self.dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", revision="streaming", split="test", streaming=True, token=True, trust_remote_code=True) | |
elif option == "VoxPopuli": | |
self.dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True) | |
elif option == "TEDLIUM": | |
self.dataset = load_dataset("LIUM/tedlium", "release3", split="test", streaming=True, trust_remote_code=True) | |
elif option == "GigaSpeech": | |
self.dataset = load_dataset("speechcolab/gigaspeech", "xs", split="test", streaming=True, token=True, trust_remote_code=True) | |
elif option == "SPGISpeech": | |
self.dataset = load_dataset("kensho/spgispeech", "S", split="test", streaming=True, token=True, trust_remote_code=True) | |
elif option == "AMI": | |
self.dataset = load_dataset("edinburghcstr/ami", "ihm", split="test", streaming=True, trust_remote_code=True) | |
self._preprocess() |