# File size: 826 Bytes
# 5c88922
from datasets import DatasetDict, load_dataset
import pandas as pd
import torchaudio
import os
# Sanity-check that one known clip is on disk and can be decoded as MP3.
sample_clip_path = "/workspace/datasets/CommonVoiceWalkie/cv-corpus-6.1-2020-12-11/ru/clips/common_voice_ru_28718278.mp3"
clip_is_present = os.path.exists(sample_clip_path)
print(clip_is_present)
array, sampling_rate = torchaudio.load(sample_clip_path, format="mp3")

# Peek at the first five "path" entries of the tab-separated test manifest.
test_manifest_path = "/workspace/datasets/CommonVoiceWalkie/cv-corpus-6.1-2020-12-11/ru/test.tsv"
test_df = pd.read_csv(test_manifest_path, sep="\t")
leading_paths = test_df["path"].values[:5]
print(leading_paths)
# Load the "test" split of the "ru" configuration from the dataset at the
# local CommonVoiceWalkie path and store it under the "test" key.
datasets = DatasetDict()
datasets["test"] = load_dataset(
    "/workspace/datasets/CommonVoiceWalkie",
    "ru",
    cache_dir="cache",
    split="test",
    # NOTE(review): newer `datasets` releases deprecate `use_auth_token` in
    # favour of `token` — confirm against the installed version.
    use_auth_token=False,
)
# Print the first sample of the test split as a quick smoke test.
print(next(iter(datasets["test"])))