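"""Prepare LibriSpeech for speech-to-text model training: download each split,
extract log-mel filter bank features, pack them into a ZIP, write per-split
TSV manifests, train a SentencePiece vocabulary on the training transcripts,
and generate a config YAML."""
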
import argparse
import logging
from pathlib import Path
import shutil
from tempfile import NamedTemporaryFile

import pandas as pd
from examples.speech_to_text.data_utils import (
    create_zip,
    extract_fbank_features,
    gen_config_yaml,
    gen_vocab,
    get_zip_manifest,
    save_df_to_tsv,
)
from torchaudio.datasets import LIBRISPEECH
from tqdm import tqdm


log = logging.getLogger(__name__)

SPLITS = [
    "train-clean-100",
    "train-clean-360",
    "train-other-500",
    "dev-clean",
    "dev-other",
    "test-clean",
    "test-other",
]

MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker"]


def process(args):
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)
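    # Extract 80-dimensional log-mel filter bank features per utterance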
    feature_root = out_root / "fbank80"
    feature_root.mkdir(exist_ok=True)
    for split in SPLITS:
        print(f"Fetching split {split}...")
        dataset = LIBRISPEECH(out_root.as_posix(), url=split, download=True)
        print("Extracting log mel filter bank features...")
        for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            extract_fbank_features(
                wav, sample_rate, feature_root / f"{sample_id}.npy"
            )
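    # Pack the per-utterance feature files into a single ZIP and index it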
    zip_path = out_root / "fbank80.zip"
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    audio_paths, audio_lengths = get_zip_manifest(zip_path)
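    # Write one TSV manifest per split (id, audio, n_frames, tgt_text, speaker)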
    print("Generating manifest...")
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(out_root.as_posix(), url=split)
        for _, _, utt, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            manifest["id"].append(sample_id)
            manifest["audio"].append(audio_paths[sample_id])
            manifest["n_frames"].append(audio_lengths[sample_id])
            manifest["tgt_text"].append(utt.lower())
            manifest["speaker"].append(spk_id)
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest), out_root / f"{split}.tsv"
        )
        if split.startswith("train"):
            train_text.extend(manifest["tgt_text"])
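    # Train a SentencePiece vocabulary on the lowercased training transcripts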
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        f.flush()  # ensure buffered text reaches disk before it is read back
        gen_vocab(
            Path(f.name),
            out_root / spm_filename_prefix,
            args.vocab_type,
            args.vocab_size,
        )
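    # Generate the config YAML pointing at the trained SentencePiece model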
    gen_config_yaml(
        out_root,
        spm_filename=spm_filename_prefix + ".model",
        specaugment_policy="ld",
    )
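    # Clean up the unpacked feature files now that everything is in the ZIP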
    shutil.rmtree(feature_root)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-root", "-o", required=True, type=str)
    parser.add_argument(
        "--vocab-type",
        default="unigram",
        type=str,
        choices=["bpe", "unigram", "char"],
    )
    parser.add_argument("--vocab-size", default=10000, type=int)
    args = parser.parse_args()

    process(args)


if __name__ == "__main__":
    main()