|
import os |
|
import soundfile as sf |
|
import csv |
|
from datasets import load_dataset |
|
|
|
|
|
# Load the "italian" configuration of Multilingual LibriSpeech from the Hub
# (served from the local cache when it has already been downloaded).
dataset = load_dataset("facebook/multilingual_librispeech", name="italian")

# Root folder that receives one sub-folder per exported split.
output_dir = "multilingual_librispeech_italian"
os.makedirs(output_dir, exist_ok=True)
|
|
|
def save_split(split_name, dry_run=False):
    """Export one split of the module-level ``dataset`` to disk.

    For every example the audio is written to
    ``<output_dir>/<split_name>/wavs/<index>.wav`` and a row
    ``wav_path|transcript`` is appended to
    ``<output_dir>/<split_name>/metadata.csv`` (pipe-delimited, no header).

    Args:
        split_name: Key of the split to export, looked up in ``dataset``.
        dry_run: When true, print the untouched split and the first example
            after column pruning, then return without creating any
            directory or file.
    """
    split = dataset[split_name]

    # Only these columns are needed for the export. Names that are not
    # actual columns of the split are harmless here because the set
    # difference below simply ignores them.
    columns_to_keep = ["transcript", "audio", "sampling_rate"]

    # remove_columns is documented to take a str or a list of str, so hand it
    # a (deterministically ordered) list rather than a set.
    columns_to_remove = sorted(set(split.column_names) - set(columns_to_keep))
    pruned = split.remove_columns(columns_to_remove)

    if dry_run:
        # Inspection only: show the split as loaded and one pruned example.
        print(split)
        print(pruned[0])
        return

    # Create the output folders only when something is actually written;
    # makedirs on the nested path also creates the split folder itself.
    split_dir = os.path.join(output_dir, split_name)
    wavs_dir = os.path.join(split_dir, "wavs")
    os.makedirs(wavs_dir, exist_ok=True)

    metadata_path = os.path.join(split_dir, "metadata.csv")

    # newline='' lets the csv module control line endings itself.
    with open(metadata_path, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file, delimiter="|")

        for i, example in enumerate(pruned):
            # Each audio entry carries the decoded samples and their rate.
            audio = example["audio"]
            audio_path = os.path.join(wavs_dir, f"{i}.wav")

            sf.write(audio_path, audio["array"], audio["sampling_rate"])
            writer.writerow([audio_path, example["transcript"]])
|
|
|
|
|
# Export the "9_hours" split (WAV files plus metadata.csv) when the script runs.
save_split(split_name="9_hours")
|
|