Shiry's picture
Training in progress, step 1000
5c88922
raw
history blame
826 Bytes
from datasets import DatasetDict, load_dataset
import pandas as pd
import torchaudio
import os
# Sanity-check script: verify that one audio clip is present and decodable,
# peek at the test split's TSV, then load the "ru" test split and print its
# first sample.
CLIP_PATH = "/workspace/datasets/CommonVoiceWalkie/cv-corpus-6.1-2020-12-11/ru/clips/common_voice_ru_28718278.mp3"
TEST_TSV_PATH = "/workspace/datasets/CommonVoiceWalkie/cv-corpus-6.1-2020-12-11/ru/test.tsv"

# Report whether the sample clip exists on disk, then load it as MP3.
# The load runs regardless of the existence check, as a decode smoke test;
# its results are not used further below.
clip_found = os.path.exists(CLIP_PATH)
print(clip_found)
array, sampling_rate = torchaudio.load(CLIP_PATH, format="mp3")

# Show the first five values of the "path" column from the tab-separated
# test split file.
test_df = pd.read_csv(TEST_TSV_PATH, sep='\t')
first_paths = test_df["path"].values[:5]
print(first_paths)

# Load the "test" split of the "ru" configuration from the local dataset
# directory, caching under ./cache, and store it under the "test" key.
test_split = load_dataset(
    "/workspace/datasets/CommonVoiceWalkie",
    "ru",
    cache_dir="cache",
    split="test",
    use_auth_token=False,
)
datasets = DatasetDict()
datasets["test"] = test_split

# Pull the first sample of the test split straight away and print it.
first_sample = next(iter(datasets["test"]))
print(first_sample)