#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
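#
# Example invocation (LS_ROOT is the directory where LibriSpeech and the
# generated artifacts should live):
#   python examples/speech_to_text/prep_librispeech_data.py \
#     --output-root ${LS_ROOT} --vocab-type unigram --vocab-size 10000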

import argparse
import logging
import shutil
from pathlib import Path
from tempfile import NamedTemporaryFile

import pandas as pd
from examples.speech_to_text.data_utils import (
    create_zip,
    extract_fbank_features,
    gen_config_yaml,
    gen_vocab,
    get_zip_manifest,
    save_df_to_tsv,
)
from torchaudio.datasets import LIBRISPEECH
from tqdm import tqdm


log = logging.getLogger(__name__)
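
# All LibriSpeech subsets: the three train-* splits (100 + 360 + 500 hours,
# ~960 hours total) plus the standard dev and test splits.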
SPLITS = [
    "train-clean-100",
    "train-clean-360",
    "train-other-500",
    "dev-clean",
    "dev-other",
    "test-clean",
    "test-other",
]
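
# Manifest columns: utterance id, location of the features inside the zip,
# number of feature frames, lowercased transcript, and speaker id.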
MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker"]


def process(args):
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)
    # Extract 80-dimensional log-mel filterbank features (hence "fbank80")
    feature_root = out_root / "fbank80"
    feature_root.mkdir(exist_ok=True)
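    # Each split is downloaded via torchaudio if not already present, and
    # features are computed per utterance; sample ids follow LibriSpeech's
    # <speaker>-<chapter>-<utterance> naming convention.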
    for split in SPLITS:
        print(f"Fetching split {split}...")
        dataset = LIBRISPEECH(out_root.as_posix(), url=split, download=True)
        print("Extracting log mel filter bank features...")
        for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            extract_fbank_features(
                wav, sample_rate, feature_root / f"{sample_id}.npy"
            )
    # Pack features into ZIP
    zip_path = out_root / "fbank80.zip"
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    audio_paths, audio_lengths = get_zip_manifest(zip_path)
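    # audio_paths maps each utterance id to its byte range inside the zip;
    # audio_lengths records the number of feature frames per utterance.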
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(out_root.as_posix(), url=split)
        for _, _, utt, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            manifest["id"].append(sample_id)
            manifest["audio"].append(audio_paths[sample_id])
            manifest["n_frames"].append(audio_lengths[sample_id])
            manifest["tgt_text"].append(utt.lower())
            manifest["speaker"].append(spk_id)
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest), out_root / f"{split}.tsv"
        )
        if split.startswith("train"):
            train_text.extend(manifest["tgt_text"])
    # Generate vocab (character vocabularies carry no size suffix in the name)
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
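    # Train a SentencePiece model on the pooled training transcripts; they are
    # staged in a temporary file that gen_vocab reads by path, so the write
    # buffer must be flushed before training starts.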
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        f.flush()  # ensure all transcripts are on disk before gen_vocab reads them
        gen_vocab(
            Path(f.name),
            out_root / spm_filename_prefix,
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
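    # "ld" selects the LibriSpeech-Double (LD) SpecAugment policy from the
    # SpecAugment paper; the YAML also points training at the SentencePiece
    # model generated above.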
    gen_config_yaml(
        out_root,
        spm_filename=spm_filename_prefix + ".model",
        specaugment_policy="ld",
    )
    # Clean up: the per-utterance .npy files are already packed into the zip
    shutil.rmtree(feature_root)
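    # Note: the raw LibriSpeech download (out_root / "LibriSpeech") is kept,
    # so the script can be re-run without fetching the archives again.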


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-root", "-o", required=True, type=str)
    parser.add_argument(
        "--vocab-type",
        required=True,
        type=str,
        choices=["bpe", "unigram", "char"],
    )
    parser.add_argument("--vocab-size", default=10000, type=int)
    args = parser.parse_args()
    process(args)


if __name__ == "__main__":
    main()