#!/usr/bin/env python3
"""Build a small combined evaluation set from several KoelLabs speech corpora.

For each corpus listed in ``testsets`` the script shuffles the chosen split
with a fixed seed, keeps at most ``SAMPLE_SIZE`` rows, tags every row with the
corpus name in a ``dataset`` column, drops every column other than ``audio``,
``ipa`` and ``dataset``, and finally saves the concatenation of all samples
to ``app/data/test``.
"""

import os
import warnings

from datasets import load_dataset, concatenate_datasets, Dataset

SEED = 42  # seed passed to Dataset.shuffle for every corpus
SAMPLE_SIZE = 100  # upper bound on the number of rows taken from each corpus
KEEP_COLUMNS = ("audio", "ipa", "dataset")  # columns retained in the output

# (display name, split to sample from) for every corpus in the combined set.
# NOTE(review): ISLE is read from its "train" split while the others use
# "test" -- presumably ISLE publishes no test split; confirm.
testsets: list[tuple[str, Dataset]] = [
    ("TIMIT", load_dataset("KoelLabs/TIMIT")["test"]),
    ("EpaDB", load_dataset("KoelLabs/EpaDB")["test"]),
    ("PSST", load_dataset("KoelLabs/PSST")["test"]),
    ("SpeechOcean", load_dataset("KoelLabs/SpeechOceanNoTH")["test"]),
    ("ISLE", load_dataset("KoelLabs/ISLE")["train"]),
]  # type: ignore

all_datasets = []
for name, test_ds in testsets:
    shuffled_ds = test_ds.shuffle(seed=SEED)

    # Selecting indices past the end of a split raises IndexError, so clamp
    # the sample to the rows that actually exist and report the shortfall.
    sample_count = min(SAMPLE_SIZE, len(shuffled_ds))
    if sample_count < SAMPLE_SIZE:
        warnings.warn(
            f"{name} has only {sample_count} rows; "
            f"sampling all of them instead of {SAMPLE_SIZE}"
        )
    sample_ds = shuffled_ds.select(range(sample_count))

    # Record which corpus each row came from before the corpora are merged.
    sample_ds = sample_ds.add_column("dataset", [name] * len(sample_ds))  # type: ignore

    # Reduce every corpus to the same set of columns before concatenation.
    sample_ds = sample_ds.remove_columns(
        [col for col in sample_ds.column_names if col not in KEEP_COLUMNS]
    )
    all_datasets.append(sample_ds)

combined_ds: Dataset = concatenate_datasets(all_datasets)

os.makedirs(os.path.join("app", "data"), exist_ok=True)
combined_ds.save_to_disk(os.path.join("app", "data", "test"))