In [1]:
# Ensure datasets is installed from main. Uncomment the following line if you face issues running this script:
# !pip install git+https://github.com/huggingface/datasets

In [2]:
from datasets import Audio, interleave_datasets, IterableDataset, load_dataset
from typing import List, Optional

### Define the dataset attributes

In this example, we'll show how to combine the Common Voice 11, VoxPopuli, Multilingual LibriSpeech and FLEURS datasets for Spanish, giving a training corpus equal to the sum of the individual datasets. This is particularly beneficial in low-resource settings, where any one of the datasets alone might have insufficient data to train a model.

We need to specify the dataset names on the Hub, the corresponding configs and finally the text column names for the transcriptions:

In [3]:
# The three lists below are parallel: entry k of each list describes the same dataset.

# Dataset identifiers on the Hugging Face Hub
dataset_names = [
    "mozilla-foundation/common_voice_11_0",
    "facebook/voxpopuli",
    "facebook/multilingual_librispeech",
    "google/fleurs",
]

# Config name for each dataset, in the same order as `dataset_names`
dataset_config_names = [
    "es",
    "es",
    "spanish",
    "es_419",
]

# Column holding the transcription in each dataset, in the same order as `dataset_names`
text_column_names = [
    "sentence",
    "normalized_text",
    "text",
    "transcription",
]

### Define the merging function

We define a function, `load_multiple_streaming_datasets`, that takes as argument a list of datasets, configs, splits (optional) and text column names (optional). It sets them to a specified sampling rate and interleaves them together, giving one merged dataset. This is all 
done in _streaming mode_: as we iterate over the merged dataset we load samples one-by-one on the fly. No data is
saved to disk.

We can also specify our strategy for interleaving datasets. The default strategy, `all_exhausted`, is an oversampling 
strategy. In this case, the dataset construction is stopped as soon as every sample in every dataset 
has been added at least once. In practice, this means that if a dataset is exhausted, it will return to the 
beginning of this dataset until the stop criterion has been reached. You can specify `stopping_strategy="first_exhausted"` 
for a subsampling strategy, i.e. the dataset construction is stopped as soon as one of the datasets runs out of samples. 

In [4]:
def load_multiple_streaming_datasets(
    dataset_names: List,
    dataset_config_names: List,
    splits: Optional[List] = None,
    text_column_names: Optional[List] = None,
    sampling_rate: Optional[int] = 16000,
    stopping_strategy: Optional[str] = "all_exhausted",
    **kwargs
) -> IterableDataset:
    """Open several datasets in streaming mode and interleave them into one.

    Every dataset is loaded with ``streaming=True``, has its ``"audio"`` column cast
    to ``Audio(sampling_rate)``, has its text column renamed to ``"sentence"`` and
    has all other columns removed, so that each source exposes only the columns
    ``"audio"`` and ``"sentence"`` before interleaving.

    Args:
        dataset_names: Dataset identifiers, each passed to ``load_dataset``.
        dataset_config_names: One config name per entry of ``dataset_names``.
        splits: One split per dataset. When ``None``, ``"train"`` is used for all.
        text_column_names: Name of the transcription column of each dataset.
            When ``None``, ``"text"`` is used for all.
        sampling_rate: Value handed to ``Audio`` when casting the audio column.
        stopping_strategy: Forwarded unchanged to ``interleave_datasets``.
        **kwargs: Extra keyword arguments forwarded to every ``load_dataset`` call.

    Returns:
        The dataset produced by ``interleave_datasets`` over the prepared sources.

    Raises:
        ValueError: If ``dataset_config_names``, ``splits`` or ``text_column_names``
            does not contain exactly one entry per dataset.
    """
    num_datasets = len(dataset_names)

    # --- argument validation: every per-dataset list must line up with dataset_names ---
    if num_datasets != len(dataset_config_names):
        raise ValueError(
            f"Ensure one config is passed for each dataset, got {len(dataset_names)} datasets and"
            f" {len(dataset_config_names)} configs."
        )

    if splits is not None and len(splits) != num_datasets:
        raise ValueError(
            f"Ensure one split is passed for each dataset, got {len(dataset_names)} datasets and {len(splits)} splits."
        )

    if text_column_names is not None and len(text_column_names) != num_datasets:
        raise ValueError(
            f"Ensure one text column name is passed for each dataset, got {len(dataset_names)} datasets and"
            f" {len(text_column_names)} text column names."
        )

    # --- fill in defaults for the optional per-dataset lists ---
    if splits is None:
        splits = ["train"] * num_datasets
    if text_column_names is None:
        text_column_names = ["text"] * num_datasets

    kept_columns = {"audio", "sentence"}
    prepared = []
    # Lengths were validated above, so zipping visits exactly one tuple per dataset.
    for name, config, split, text_column in zip(dataset_names, dataset_config_names, splits, text_column_names):
        streamed = load_dataset(name, config, split=split, streaming=True, **kwargs)
        # resample to the requested sampling rate
        streamed = streamed.cast_column("audio", Audio(sampling_rate))
        # normalise columns to ["audio", "sentence"]
        if text_column != "sentence":
            streamed = streamed.rename_column(text_column, "sentence")
        surplus_columns = set(streamed.features.keys()) - kept_columns
        streamed = streamed.remove_columns(surplus_columns)
        prepared.append(streamed)

    return interleave_datasets(prepared, stopping_strategy=stopping_strategy)

Let's apply this function to load and merge our four datasets:

In [5]:
# `use_auth_token=True` is not a named parameter of the helper: it travels through
# **kwargs and is handed to every underlying `load_dataset` call.
ds = load_multiple_streaming_datasets(
    dataset_names,
    dataset_config_names=dataset_config_names,
    text_column_names=text_column_names,
    use_auth_token=True,
)

### Iterate over the dataset

We iterate over the dataset, loading and merging samples on the fly. Let's print the transcriptions for the first 10 samples of our merged dataset:

In [6]:
# Pull only the first 10 samples from the stream and show their transcriptions.
# Pairing with range(10) ends the loop after ten items without an explicit break.
for i, sample in zip(range(10), ds):
    print(i, sample["sentence"])

Reading metadata...: 230467it [00:41, 5545.80it/s]


0 ¿ Qué tal a tres de cinco ?
1 y desde luego esa razón no puede tener que ver con la explicación surrealista que hemos escuchado más de una vez de que se trata de una conspiración izquierdista.
2 para exclamar con voz de acción de gracias y para contar todas tus maravillas jehová la habitación de tu casa he amado y el lugar del tabernáculo de tu gloria no juntes con los pecadores mi alma ni con los hombres de sangres mi vida
3 el uso de internet y de la red informática mundial permite que los estudiantes tengan acceso a la información en todo momento
4 vamos , quiero decir , que no soy de citas especiales .
5 si bien esta lista no es perfecta sí que resulta necesario que las entidades financieras refuercen sus controles.
6 oye oh jehová mi voz con que á ti clamo y ten misericordia de mí respóndeme mi corazón ha dicho de ti buscad mi rostro tu rostro buscaré oh jehová
7 los deportes de nieve en descenso como el esquí y la tablanieve son disciplinas populares que consisten en deslizarse

We can see that the transcriptions take several different formats. Those from Common Voice 11 are cased and punctuated. Those from VoxPopuli are punctuated only. Those from Multilingual LibriSpeech and FLEURS are neither cased nor punctuated. We need to normalise the transcriptions to a uniform format before training our model. 

The following code cell is lifted from the Whisper training notebook: https://github.com/huggingface/community-events/blob/main/whisper-fine-tuning-event/fine-tune-whisper-streaming.ipynb

In [7]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# Text normaliser imported from the Whisper module of transformers; it is applied
# to transcriptions when `do_remove_punctuation` is enabled.
normalizer = BasicTextNormalizer()

# Switches read by the transcription pre-processing function
do_lower_case = True  # lower-case each transcription
do_remove_punctuation = True  # run each transcription through `normalizer`



Now we define a function to normalise our transcriptions:

In [8]:
def normalize_transcriptions(batch):
    """Normalise ``batch["sentence"]`` in place and return ``batch``.

    The steps are controlled by the module-level flags ``do_lower_case`` and
    ``do_remove_punctuation``: the first lower-cases the text, the second passes it
    through the module-level ``normalizer`` and strips surrounding whitespace.

    NOTE(review): ``normalizer`` may itself lower-case its input, in which case
    ``do_lower_case=False`` alone would not preserve casing. Confirm against the
    ``BasicTextNormalizer`` implementation.

    Args:
        batch: Mapping with a string stored under the ``"sentence"`` key.

    Returns:
        The same ``batch`` object with ``"sentence"`` replaced by the normalised text.
    """
    text = batch["sentence"]
    # optional pre-processing steps, each gated by its module-level switch
    text = text.lower() if do_lower_case else text
    text = normalizer(text).strip() if do_remove_punctuation else text
    batch["sentence"] = text
    return batch

Let's apply the data pre-processing steps to our dataset and view the first 10 samples again:

In [9]:
# Register the normalisation on the streaming dataset
ds = ds.map(normalize_transcriptions)

# Show the first 10 transcriptions again, now in their normalised form.
# Pairing with range(10) ends the loop after ten items without an explicit break.
for i, sample in zip(range(10), ds):
    print(i, sample["sentence"])

Reading metadata...: 230467it [00:32, 6984.59it/s] 


0 qué tal a tres de cinco 
1 y desde luego esa razón no puede tener que ver con la explicación surrealista que hemos escuchado más de una vez de que se trata de una conspiración izquierdista 
2 para exclamar con voz de acción de gracias y para contar todas tus maravillas jehová la habitación de tu casa he amado y el lugar del tabernáculo de tu gloria no juntes con los pecadores mi alma ni con los hombres de sangres mi vida
3 el uso de internet y de la red informática mundial permite que los estudiantes tengan acceso a la información en todo momento
4 vamos quiero decir que no soy de citas especiales 
5 si bien esta lista no es perfecta sí que resulta necesario que las entidades financieras refuercen sus controles 
6 oye oh jehová mi voz con que á ti clamo y ten misericordia de mí respóndeme mi corazón ha dicho de ti buscad mi rostro tu rostro buscaré oh jehová
7 los deportes de nieve en descenso como el esquí y la tablanieve son disciplinas populares que consisten en deslizarse con esq

This time the transcriptions are in a consistent format. We can use this data to fine-tune our Whisper model. Note that since we've removed punctuation and casing, the Whisper model won't learn to predict these features.