#!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import argparse import logging from pathlib import Path import shutil from tempfile import NamedTemporaryFile import pandas as pd from examples.speech_to_text.data_utils import ( create_zip, extract_fbank_features, gen_config_yaml, gen_vocab, get_zip_manifest, save_df_to_tsv, ) from torchaudio.datasets import LIBRISPEECH from tqdm import tqdm log = logging.getLogger(__name__) SPLITS = [ "train-clean-100", "train-clean-360", "train-other-500", "dev-clean", "dev-other", "test-clean", "test-other", ] MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker"] def process(args): out_root = Path(args.output_root).absolute() out_root.mkdir(exist_ok=True) # Extract features feature_root = out_root / "fbank80" feature_root.mkdir(exist_ok=True) for split in SPLITS: print(f"Fetching split {split}...") dataset = LIBRISPEECH(out_root.as_posix(), url=split, download=True) print("Extracting log mel filter bank features...") for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset): sample_id = f"{spk_id}-{chapter_no}-{utt_no}" extract_fbank_features( wav, sample_rate, feature_root / f"{sample_id}.npy" ) # Pack features into ZIP zip_path = out_root / "fbank80.zip" print("ZIPing features...") create_zip(feature_root, zip_path) print("Fetching ZIP manifest...") audio_paths, audio_lengths = get_zip_manifest(zip_path) # Generate TSV manifest print("Generating manifest...") train_text = [] for split in SPLITS: manifest = {c: [] for c in MANIFEST_COLUMNS} dataset = LIBRISPEECH(out_root.as_posix(), url=split) for _, _, utt, spk_id, chapter_no, utt_no in tqdm(dataset): sample_id = f"{spk_id}-{chapter_no}-{utt_no}" manifest["id"].append(sample_id) manifest["audio"].append(audio_paths[sample_id]) manifest["n_frames"].append(audio_lengths[sample_id]) manifest["tgt_text"].append(utt.lower()) manifest["speaker"].append(spk_id) save_df_to_tsv( pd.DataFrame.from_dict(manifest), out_root / f"{split}.tsv" ) if split.startswith("train"): train_text.extend(manifest["tgt_text"]) # Generate vocab vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}" with NamedTemporaryFile(mode="w") as f: for t in train_text: f.write(t + "\n") gen_vocab( Path(f.name), out_root / spm_filename_prefix, args.vocab_type, args.vocab_size, ) # Generate config YAML gen_config_yaml( out_root, spm_filename=spm_filename_prefix + ".model", specaugment_policy="ld" ) # Clean up shutil.rmtree(feature_root) def main(): parser = argparse.ArgumentParser() parser.add_argument("--output-root", "-o", required=True, type=str) parser.add_argument( "--vocab-type", default="unigram", required=True, type=str, choices=["bpe", "unigram", "char"], ), parser.add_argument("--vocab-size", default=10000, type=int) args = parser.parse_args() process(args) if __name__ == "__main__": main()