from datasets import DatasetDict, load_dataset
import pandas as pd
import torchaudio
import os

# Smoke test for the local CommonVoiceWalkie Russian corpus: confirm that one
# known clip is on disk and can be loaded, preview the test manifest, then load
# the "test" split through the local dataset directory and print its first row.

# A single clip used both for the existence check and for the audio load.
sample_clip = "/workspace/datasets/CommonVoiceWalkie/cv-corpus-6.1-2020-12-11/ru/clips/common_voice_ru_28718278.mp3"

clip_on_disk = os.path.exists(sample_clip)
print(clip_on_disk)

# torchaudio.load returns the audio data together with its sampling rate.
array, sampling_rate = torchaudio.load(sample_clip, format="mp3")

# The test manifest is tab-separated; show the first five entries of "path".
test_df = pd.read_csv(
    "/workspace/datasets/CommonVoiceWalkie/cv-corpus-6.1-2020-12-11/ru/test.tsv",
    sep="\t",
)
first_paths = test_df["path"].values[:5]
print(first_paths)

# Load only the "test" split of the "ru" configuration, caching under ./cache.
datasets = DatasetDict(
    {
        "test": load_dataset(
            "/workspace/datasets/CommonVoiceWalkie",
            "ru",
            cache_dir="cache",
            split="test",
            use_auth_token=False,
        ),
    }
)

# Pull the first sample of the split right away and display it.
first_sample = next(iter(datasets["test"]))
print(first_sample)