"""Sanity-check a local Common Voice (Russian) corpus.

The script runs four checks in order and prints the results:

1. whether one known MP3 clip exists on disk,
2. that ``torchaudio`` can decode that clip,
3. the first five ``path`` entries of ``test.tsv``,
4. the first example of the ``test`` split loaded via ``datasets``.
"""
import os

from datasets import DatasetDict, load_dataset
import pandas as pd
import torchaudio

# Root of the local dataset; also the path handed to `load_dataset`.
CORPUS_ROOT = "/workspace/datasets/CommonVoiceWalkie"
# Directory holding the Russian release (TSV manifests plus `clips/`).
RU_DIR = os.path.join(CORPUS_ROOT, "cv-corpus-6.1-2020-12-11", "ru")
# A single clip used to probe both the filesystem and MP3 decoding.
SAMPLE_CLIP = os.path.join(RU_DIR, "clips", "common_voice_ru_28718278.mp3")

print(os.path.exists(SAMPLE_CLIP))

# The decoded audio is not used afterwards; the call itself raises if the
# clip is missing or cannot be decoded as MP3.
array, sampling_rate = torchaudio.load(SAMPLE_CLIP, format="mp3")

test_df = pd.read_csv(os.path.join(RU_DIR, "test.tsv"), sep='\t')
print(test_df["path"].values[:5])

datasets = DatasetDict()
# NOTE(review): `use_auth_token` is deprecated in newer `datasets` releases
# in favour of `token` -- confirm against the installed version before
# changing it.
datasets["test"] = load_dataset(
    CORPUS_ROOT,
    "ru",
    cache_dir="cache",
    split="test",
    use_auth_token=False,
)

# Pull only the first example instead of iterating the whole split.
print(next(iter(datasets["test"])))