wr committed on Aug 18, 2022

Commit

31ad50e

1 Parent(s): f9fe32e

set .tsv and .txt to large file

Files changed (23) hide show

.gitattributes +2 -0
README.md +40 -0
manifest/TTS_examples.txt +3 -0
manifest/dev-clean.tsv +3 -0
manifest/dev-clean.txt +3 -0
manifest/dict.txt +3 -0
manifest/spm_char.model +3 -0
manifest/test-clean-200.tsv +3 -0
manifest/test-clean-200.txt +3 -0
manifest/test-clean.tsv +3 -0
manifest/test-clean.txt +3 -0
manifest/train-clean-100.tsv +3 -0
manifest/train-clean-100.txt +3 -0
manifest/train-clean-360.tsv +3 -0
manifest/train-clean-360.txt +3 -0
manifest/utils/libritts_manifest.py +120 -0
manifest/utils/make_tsv_txt.sh +13 -0
manifest/utils/prep_libritts_spkemb.py +72 -0
manifest/utils/resample_libritts.py +33 -0
manifest/utils/spec2wav.sh +8 -0
pretrained_vocoder/train_nodev_clean_libritts_hifigan.v1/config.yml +192 -0
pretrained_vocoder/train_nodev_clean_libritts_hifigan.v1/hifigan-libritts-1930000steps.pkl +3 -0
pretrained_vocoder/train_nodev_clean_libritts_hifigan.v1/stats.npy +3 -0

.gitattributes CHANGED Viewed

@@ -29,3 +29,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.txt filter=lfs diff=lfs merge=lfs -text
+*.tsv filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,43 @@
 ---
 license: mit
 ---

 ---
 license: mit
+tags:
+- speech
+- text
+- cross-modal
+- unified model
+- self-supervised learning
+- SpeechT5
+datasets:
+- LibriTTS
 ---
+## SpeechT5 TTS Manifest
+| [**Github**](https://github.com/microsoft/SpeechT5) | [**Huggingface**](https://huggingface.co/mechanicalsea/speecht5-tts) |
+This manifest is an attempt to recreate the Text-to-Speech recipe used for training [SpeechT5](https://aclanthology.org/2022.acl-long.393). It was constructed from the [LibriTTS](http://www.openslr.org/60/) clean subsets: train-clean-100 and train-clean-360 for training, dev-clean for validation, and test-clean for evaluation. The test-clean-200 manifest contains 200 utterance IDs used for the mean opinion score (MOS) and comparison mean opinion score (CMOS) evaluations.
+### Requirements
+- [SpeechBrain](https://github.com/speechbrain/speechbrain), used to extract speaker embeddings.
+- [Parallel WaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN), which provides the vocoder implementation.
+### Tools
+- [manifest/utils](./manifest/utils/) contains the scripts used to downsample waveforms, extract speaker embeddings, generate the manifests, and apply the vocoder.
+- [pretrained_vocoder](./pretrained_vocoder/) provides the pre-trained vocoder.
+### Reference
+If you find our work useful in your research, please cite the following paper:
+```bibtex
+@inproceedings{ao-etal-2022-speecht5,
+    title = {{S}peech{T}5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing},
+    author = {Ao, Junyi and Wang, Rui and Zhou, Long and Wang, Chengyi and Ren, Shuo and Wu, Yu and Liu, Shujie and Ko, Tom and Li, Qing and Zhang, Yu and Wei, Zhihua and Qian, Yao and Li, Jinyu and Wei, Furu},
+    booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+    month = {May},
+    year = {2022},
+    pages={5723--5738},
+}
+```

manifest/TTS_examples.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8e2db9c6294f35bd8952435aa506ebe38d5e7b5aebf01dee3e086f4d4f9685f
+size 8018

manifest/dev-clean.tsv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6cf77f21f3dab7dc8ca5e8470ee45f2ed1907304b05f1245f21febda73ea7d7
+size 635339

manifest/dev-clean.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ec6d57b715e17da05dc462846d9fd1309e2f10c844cf2cc8566807741905ccd7
+size 548224

manifest/dict.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:036438c7cb5fc860b1d1066a3b111542515b1d4ac1f5a79a15a2322e8f79f402
+size 309

manifest/spm_char.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560
+size 238473

manifest/test-clean-200.tsv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b22354b2f305ba791d7efb72246a8ddb01cc832fcd1dcd123245faa9aa0a7931
+size 22150

manifest/test-clean-200.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:39431d3e311a3a47935411d819c94c4f28161022cdc426f0b7f3d9dc0be9c569
+size 22526

manifest/test-clean.tsv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:979bb2256a8138cf0492e2aa07628b815891bd0d81ac6a98d9d5d6889a176291
+size 535922

manifest/test-clean.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2c4470877fc16c4135723c4bfe0784d47f0211bf6b12088ec6d293bbf5e4fac1
+size 508964

manifest/train-clean-100.tsv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c93390d311316c02d6e7da4bf5ab0b93cb922f80b075f6dfc30ff14c33b33bf0
+size 3864578

manifest/train-clean-100.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e40a9a117e7f588390bcb188ffad54830c37621a38d1e6e1f3f3f4e13885d863
+size 3180343

manifest/train-clean-360.tsv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d14e7dfea4e60753aa6b882ee64472cf340174ff707c1e0f69e590b4373676ba
+size 13582849

manifest/train-clean-360.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c03d42d6310f67293b3010ee207da940e1ba03adf1924f1e5b959d9370f73037
+size 11483749

manifest/utils/libritts_manifest.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import argparse
+import os
+from typing import Tuple
+from scipy.io import wavfile
+from torchaudio.datasets import LIBRITTS
+from tqdm import tqdm
+def load_libritts_item(
+    fileid: str,
+    path: str,
+    ext_audio: str,
+    ext_original_txt: str,
+    ext_normalized_txt: str,
+) -> Tuple[int, int, str, str, int, int, str]:
+    speaker_id, chapter_id, segment_id, utterance_id = fileid.split("_")
+    utterance_id = fileid
+    normalized_text = utterance_id + ext_normalized_txt
+    normalized_text = os.path.join(path, speaker_id, chapter_id, normalized_text)
+    original_text = utterance_id + ext_original_txt
+    original_text = os.path.join(path, speaker_id, chapter_id, original_text)
+    file_audio = utterance_id + ext_audio
+    file_audio = os.path.join(path, speaker_id, chapter_id, file_audio)
+    # Load audio
+    sample_rate, wav = wavfile.read(file_audio)
+    n_frames = wav.shape[0]
+    # Load original text
+    # with open(original_text) as ft:
+    #     original_text = ft.readline()
+    # Load normalized text
+    with open(normalized_text, "r") as ft:
+        normalized_text = ft.readline()
+    return (
+        n_frames,
+        sample_rate,
+        None,
+        normalized_text,
+        int(speaker_id),
+        int(chapter_id),
+        utterance_id,
+    )
+class LIBRITTS_16K(LIBRITTS):
+    # LIBRITTS subclass whose items are produced by load_libritts_item, i.e.
+    # they carry the waveform length instead of the waveform itself.
+    def __getitem__(self, n: int) -> Tuple[int, int, None, str, int, int, str]:
+        """Load the metadata of the n-th sample from the dataset.
+        Args:
+            n (int): The index of the sample to be loaded
+        Returns:
+            (int, int, None, str, int, int, str):
+            ``(waveform_length, sample_rate, original_text, normalized_text, speaker_id, chapter_id, utterance_id)``
+            where ``original_text`` is always ``None`` (it is not loaded).
+        """
+        fileid = self._walker[n]
+        return load_libritts_item(
+            fileid,
+            self._path,
+            self._ext_audio,
+            self._ext_original_txt,
+            self._ext_normalized_txt,
+        )
+def get_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "root", metavar="DIR", help="root directory containing wav files to index"
+    )
+    parser.add_argument(
+        "--dest", default=".", type=str, metavar="DIR", help="output directory"
+    )
+    parser.add_argument(
+        "--split", required=True, type=str, help="dataset splits"
+    )
+    parser.add_argument(
+        "--wav-root", default=None, type=str, metavar="DIR", help="saved waveform root directory for tsv"
+    )
+    parser.add_argument(
+        "--spkemb-npy-dir", required=True, type=str, help="speaker embedding directory"
+    )
+    return parser
+def main(args):
+    dest_dir = args.dest
+    wav_root = args.wav_root
+    if not os.path.exists(dest_dir):
+        os.makedirs(dest_dir)
+    dataset = LIBRITTS_16K(os.path.dirname(args.root), url=args.split, folder_in_archive=os.path.basename(args.root))
+    tsv_f = open(os.path.join(dest_dir, f"{args.split}.tsv"), "w")
+    txt_f = open(os.path.join(dest_dir, f"{args.split}.txt"), "w")
+    print(wav_root, file=tsv_f)
+    for n_frames, sr, ori_text, norm_text, spk_id, chap_id, utt_id in tqdm(dataset, desc="tsv/txt/wav"):
+        assert sr == 16000, f"sampling rate {sr} != 16000"
+        utt_file = os.path.join(args.split, f"{spk_id}", f"{chap_id}", f"{utt_id}.wav")
+        spk_file = os.path.join(args.spkemb_npy_dir, f"{spk_id}-{chap_id}-{utt_id}.npy")
+        assert os.path.exists(os.path.join(wav_root, utt_file))
+        assert os.path.exists(os.path.join(wav_root, spk_file))
+        print(f"{utt_file}\t{n_frames}\t{spk_file}", file=tsv_f)
+        print(norm_text, file=txt_f)
+    tsv_f.close()
+    txt_f.close()
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+    main(args)

manifest/utils/make_tsv_txt.sh ADDED Viewed

	@@ -0,0 +1,13 @@

+#!/bin/bash
+# bash utils/make_tsv_txt.sh /mnt/bn/wangrui2022/wangrui2022/libritts/LibriTTS_16k /opt/tiger/libritts_finetuning_meta /opt/tiger/LibriTTS_16k
+root=$1
+dest=$2
+wav_root=$3
+spkemb_split=$4
+if [ -z ${spkemb_split} ]; then
+    spkemb_split=spkrec-xvect
+fi
+for split in dev-clean test-clean train-clean-100 train-clean-360; do
+    echo "making ${split}.tsv and ${split}.txt ..."
+    python utils/libritts_manifest.py ${root} --dest ${dest} --split ${split} --wav-root ${wav_root} --spkemb-npy-dir ${spkemb_split}
+done

manifest/utils/prep_libritts_spkemb.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import os
+import glob
+import numpy
+import argparse
+import torchaudio
+from speechbrain.pretrained import EncoderClassifier
+import torch
+from tqdm import tqdm
+import torch.nn.functional as F
+import torchaudio.transforms as T
+# Supported speaker-embedding models, mapped to the embedding size that
+# f2embed asserts on the extracted vector.
+spk_model = {
+    "speechbrain/spkrec-xvect-voxceleb": 512,
+    "speechbrain/spkrec-ecapa-voxceleb": 192,
+}
+def f2embed(wav_file, classifier, size_embed, resampler=None):
+    signal, fs =torchaudio.load(wav_file)
+    if fs != 16000 and fs is not None:
+        assert fs == 24000, fs
+        signal = resampler(signal)
+        fs = 16000
+    assert fs == 16000, fs
+    with torch.no_grad():
+        embeddings = classifier.encode_batch(signal)
+        embeddings = F.normalize(embeddings, dim=2)
+        embeddings = embeddings.squeeze().cpu().numpy()
+    assert embeddings.shape[0] == size_embed, embeddings.shape[0]
+    return embeddings
+def process(args):
+    wavlst = []
+    for split in args.splits.split(","):
+        wav_dir = os.path.join(args.libritts_root, split)
+        wavlst_split = glob.glob(os.path.join(wav_dir, "*", "*", "*.wav"))
+        print(f"{split} {len(wavlst_split)} utterances.")
+        wavlst.extend(wavlst_split)
+    spkemb_root = args.output_root
+    if not os.path.exists(spkemb_root):
+        print(f"Create speaker embedding directory: {spkemb_root}")
+        os.mkdir(spkemb_root)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    classifier = EncoderClassifier.from_hparams(source=args.speaker_embed, run_opts={"device": device}, savedir='/tmp')
+    size_embed = spk_model[args.speaker_embed]
+    resampler = T.Resample(24000, 16000)
+    for utt_i in tqdm(wavlst, total=len(wavlst), desc="Extract"):
+        utt_id = "-".join(utt_i.split("/")[-3:]).replace(".wav", "")
+        utt_emb = f2embed(utt_i, classifier, size_embed, resampler)
+        numpy.save(os.path.join(spkemb_root, f"{utt_id}.npy"), utt_emb)
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--libritts-root", "-i", required=True, type=str, help="LibriTTS root directory.")
+    parser.add_argument("--output-root", "-o", required=True, type=str, help="Output directory.")
+    parser.add_argument("--speaker-embed", "-s", type=str, required=True, choices=["speechbrain/spkrec-xvect-voxceleb", "speechbrain/spkrec-ecapa-voxceleb"],
+                        help="Pretrained model for extracting speaker emebdding.")
+    parser.add_argument("--splits", default="train-clean-100,train-clean-360,dev-clean,test-clean", type=str,
+                        help="Split of train,dev,test seperate by comma.")
+    args = parser.parse_args()
+    print(f"Loading utterances from {args.libritts_root}/{args.splits}, "
+        + f"Save speaker embedding 'npy' to {args.output_root}, "
+        + f"Using speaker model {args.speaker_embed} with {spk_model[args.speaker_embed]} size.")
+    process(args)
+if __name__ == "__main__":
+    """
+    python examples/text_to_speech/prep_libritts_spkemb.py \
+        -i /mnt/default/v-junyiao/dataset/Original/LibriTTS \
+        -o /mnt/default/v-junyiao/dataset/Original/LibriTTS/spkrec-ecapa \
+        -s speechbrain/spkrec-ecapa-voxceleb
+    """
+    main()

manifest/utils/resample_libritts.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from pathlib import Path
+from shutil import copyfile
+import soundfile as sf
+import librosa
+import os
+#LibriTTS
+# 1.6G    /root/data/libritts/LibriTTS/dev-clean
+# 1.5G    /root/data/libritts/LibriTTS/test-clean
+# 9.1G    /root/data/libritts/LibriTTS/train-clean-100
+# 33G     /root/data/libritts/LibriTTS/train-clean-360
+# 44G     /root/data/libritts/LibriTTS
+#LibriTTS_16k
+# The pattern "**" means all subdirectories recursively,
+# with "*.wav" meaning all files with any name ending in ".wav".
+dest_dir = Path("/root/data/libritts/LibriTTS_16k")
+dest_dir.mkdir(exist_ok=True)
+for file in Path("/root/data/libritts/LibriTTS").glob("**/*"):
+    if not file.is_file():  # Skip directories
+        continue
+    file = str(file)
+    new_path = Path(file.replace('LibriTTS', 'LibriTTS_16k'))
+    os.system('mkdir -p ' + str(new_path.parent))
+    if file.endswith('wav'):
+        audio, fs = sf.read(file)
+        x = librosa.resample(audio, fs, 16000)
+        sf.write(str(new_path), x, 16000)
+        # librosa.output.write_wav(str(new_path), x, 16000)
+    else:
+        copyfile(file, file.replace('LibriTTS', 'LibriTTS_16k'))

manifest/utils/spec2wav.sh ADDED Viewed

	@@ -0,0 +1,8 @@

+feats_root=$1
+wav_root=`dirname ${feats_root}`/gen_wav
+parallel-wavegan-decode \
+    --checkpoint train_nodev_clean_libritts_hifigan.v1/hifigan-libritts-1930000steps.pkl \
+    --dumpdir ${feats_root} \
+    --outdir ${wav_root} \
+    --normalize-before

pretrained_vocoder/train_nodev_clean_libritts_hifigan.v1/config.yml ADDED Viewed

	@@ -0,0 +1,192 @@

+allow_cache: false
+batch_max_steps: 8192
+batch_size: 16
+config: conf/hifigan.v1.yaml
+dev_dumpdir: dump/dev_clean/norm
+dev_feats_scp: null
+dev_segments: null
+dev_wav_scp: null
+discriminator_adv_loss_params:
+  average_by_discriminators: false
+discriminator_grad_norm: -1
+discriminator_optimizer_params:
+  betas:
+  - 0.5
+  - 0.9
+  lr: 0.0002
+  weight_decay: 0.0
+discriminator_optimizer_type: Adam
+discriminator_params:
+  follow_official_norm: true
+  period_discriminator_params:
+    bias: true
+    channels: 32
+    downsample_scales:
+    - 3
+    - 3
+    - 3
+    - 3
+    - 1
+    in_channels: 1
+    kernel_sizes:
+    - 5
+    - 3
+    max_downsample_channels: 1024
+    nonlinear_activation: LeakyReLU
+    nonlinear_activation_params:
+      negative_slope: 0.1
+    out_channels: 1
+    use_spectral_norm: false
+    use_weight_norm: true
+  periods:
+  - 2
+  - 3
+  - 5
+  - 7
+  - 11
+  scale_discriminator_params:
+    bias: true
+    channels: 128
+    downsample_scales:
+    - 4
+    - 4
+    - 4
+    - 4
+    - 1
+    in_channels: 1
+    kernel_sizes:
+    - 15
+    - 41
+    - 5
+    - 3
+    max_downsample_channels: 1024
+    max_groups: 16
+    nonlinear_activation: LeakyReLU
+    nonlinear_activation_params:
+      negative_slope: 0.1
+    out_channels: 1
+  scale_downsample_pooling: AvgPool1d
+  scale_downsample_pooling_params:
+    kernel_size: 4
+    padding: 2
+    stride: 2
+  scales: 3
+discriminator_scheduler_params:
+  gamma: 0.5
+  milestones:
+  - 200000
+  - 400000
+  - 600000
+  - 800000
+discriminator_scheduler_type: MultiStepLR
+discriminator_train_start_steps: 0
+discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
+distributed: true
+eval_interval_steps: 1000
+feat_match_loss_params:
+  average_by_discriminators: false
+  average_by_layers: false
+  include_final_outputs: false
+fft_size: 1024
+fmax: 7600
+fmin: 80
+format: npy
+generator_adv_loss_params:
+  average_by_discriminators: false
+generator_grad_norm: -1
+generator_optimizer_params:
+  betas:
+  - 0.5
+  - 0.9
+  lr: 0.0002
+  weight_decay: 0.0
+generator_optimizer_type: Adam
+generator_params:
+  bias: true
+  channels: 512
+  in_channels: 80
+  kernel_size: 7
+  nonlinear_activation: LeakyReLU
+  nonlinear_activation_params:
+    negative_slope: 0.1
+  out_channels: 1
+  resblock_dilations:
+  - - 1
+    - 3
+    - 5
+  - - 1
+    - 3
+    - 5
+  - - 1
+    - 3
+    - 5
+  resblock_kernel_sizes:
+  - 3
+  - 7
+  - 11
+  upsample_kernal_sizes:
+  - 8
+  - 8
+  - 8
+  - 8
+  upsample_scales:
+  - 4
+  - 4
+  - 4
+  - 4
+  use_additional_convs: true
+  use_weight_norm: true
+generator_scheduler_params:
+  gamma: 0.5
+  milestones:
+  - 200000
+  - 400000
+  - 600000
+  - 800000
+generator_scheduler_type: MultiStepLR
+generator_train_start_steps: 1
+generator_type: HiFiGANGenerator
+global_gain_scale: 1.0
+hop_size: 256
+lambda_adv: 1.0
+lambda_aux: 45.0
+lambda_feat_match: 2.0
+log_interval_steps: 100
+mel_loss_params:
+  fft_size: 1024
+  fmax: 7600
+  fmin: 80
+  fs: 16000
+  hop_size: 256
+  log_base: null
+  num_mels: 80
+  win_length: 1024
+  window: hann
+num_mels: 80
+num_save_intermediate_results: 4
+num_workers: 4
+outdir: exp/train_nodev_clean_libritts_hifigan.v1
+pin_memory: true
+pretrain: ''
+rank: 1
+remove_short_samples: false
+resume: /mnt/default/v-junyiao/libritts_vocoder2/train_nodev_clean_libritts_hifigan.v1/checkpoint-50000steps.pkl
+sampling_rate: 16000
+save_interval_steps: 10000
+train_dumpdir: dump/train_nodev_clean/norm
+train_feats_scp: null
+train_max_steps: 2500000
+train_segments: null
+train_wav_scp: null
+trim_frame_size: 1024
+trim_hop_size: 256
+trim_silence: false
+trim_threshold_in_db: 20
+use_feat_match_loss: true
+use_mel_loss: true
+use_stft_loss: false
+verbose: 1
+version: 0.5.1
+win_length: 1024
+window: hann
+world_size: 2

pretrained_vocoder/train_nodev_clean_libritts_hifigan.v1/hifigan-libritts-1930000steps.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0b119deddc85a78061bed39aaa5c2f9a8093e2701c46d9a0f9a25b2ac52457e4
+size 333645593

pretrained_vocoder/train_nodev_clean_libritts_hifigan.v1/stats.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c1a72747d543205699e741ae3092d83b233b30e4974fe1991d553d11e895c535
+size 768

set *.tsv and *.txt to large file

set .tsv and .txt to large file