Hecheng0625 committed
Commit 7ee3434 • Parent: c968fc3

Upload 61 files

This view is limited to 50 files because it contains too many changes.
Files changed (50):
  1. .gitattributes +1 -0
  2. app.py +355 -0
  3. text/__init__.py +79 -0
  4. text/cleaners.py +98 -0
  5. text/cmudict.py +145 -0
  6. text/g2p.py +38 -0
  7. text/g2p_module.py +230 -0
  8. text/lexicon/librispeech-lexicon.txt +0 -0
  9. text/lexicon/pinyin-lexicon-r.txt +4120 -0
  10. text/numbers.py +77 -0
  11. text/pinyin.py +218 -0
  12. text/symbol_table.py +292 -0
  13. text/symbols.py +34 -0
  14. text/text_token_collation.py +123 -0
  15. utils/HyperParams/__init__.py +6 -0
  16. utils/HyperParams/hps.py +43 -0
  17. utils/__init__.py +0 -0
  18. utils/audio.py +74 -0
  19. utils/audio_slicer.py +476 -0
  20. utils/cut_by_vad.py +105 -0
  21. utils/data_utils.py +588 -0
  22. utils/distribution.py +270 -0
  23. utils/dsp.py +97 -0
  24. utils/duration.py +86 -0
  25. utils/f0.py +275 -0
  26. utils/hparam.py +659 -0
  27. utils/hubert.py +155 -0
  28. utils/io.py +182 -0
  29. utils/io_optim.py +123 -0
  30. utils/mel.py +280 -0
  31. utils/mert.py +139 -0
  32. utils/mfa_prepare.py +116 -0
  33. utils/model_summary.py +74 -0
  34. utils/prompt_preparer.py +68 -0
  35. utils/ssim.py +80 -0
  36. utils/stft.py +278 -0
  37. utils/symbol_table.py +317 -0
  38. utils/tokenizer.py +150 -0
  39. utils/topk_sampling.py +89 -0
  40. utils/trainer_utils.py +16 -0
  41. utils/util.py +687 -0
  42. utils/whisper_transcription.py +122 -0
  43. utils/world.py +92 -0
  44. visualization/SingVisio/System_Introduction_of_SingVisio_V2.pdf +3 -0
  45. visualization/SingVisio/webpage/Dockerfile +23 -0
  46. visualization/SingVisio/webpage/README.md +126 -0
  47. visualization/SingVisio/webpage/config/default.json +407 -0
  48. visualization/SingVisio/webpage/img/difference_bar.jpg +0 -0
  49. visualization/SingVisio/webpage/img/syllable.png +0 -0
  50. visualization/SingVisio/webpage/index.html +390 -0
.gitattributes CHANGED
@@ -37,3 +37,4 @@ imgs/vocoder/gan/MSSBCQTD.png filter=lfs diff=lfs merge=lfs -text
  models/codec/facodec/modules/JDC/bst.t7 filter=lfs diff=lfs merge=lfs -text
  models/tts/maskgct/g2p/sources/chinese_lexicon.txt filter=lfs diff=lfs merge=lfs -text
  models/tts/maskgct/wav/prompt.wav filter=lfs diff=lfs merge=lfs -text
+ visualization/SingVisio/System_Introduction_of_SingVisio_V2.pdf filter=lfs diff=lfs merge=lfs -text
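
These patterns route large binaries through Git LFS; the new rule covers the SingVisio system-introduction PDF shipped in this commit. Rules of this form are typically generated with `git lfs track "visualization/SingVisio/System_Introduction_of_SingVisio_V2.pdf"`, which appends the corresponding filter=lfs line to .gitattributes.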
app.py ADDED
@@ -0,0 +1,355 @@
+ import gradio as gr
+ import torch
+ import safetensors.torch  # import the torch submodule so safetensors.torch.load_model is available
+ from huggingface_hub import hf_hub_download
+ import soundfile as sf
+ import os  # needed by inference() below for os.makedirs
+
+ import numpy as np
+ import librosa
+ from models.codec.kmeans.repcodec_model import RepCodec
+ from models.tts.maskgct.maskgct_s2a import MaskGCT_S2A
+ from models.tts.maskgct.maskgct_t2s import MaskGCT_T2S
+ from models.codec.amphion_codec.codec import CodecEncoder, CodecDecoder
+ from transformers import Wav2Vec2BertModel
+ from utils.util import load_config
+ from models.tts.maskgct.g2p.g2p_generation import g2p, chn_eng_g2p
+
+ from transformers import SeamlessM4TFeatureExtractor
+
+ processor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ def g2p_(text, language):
+     if language in ["zh", "en"]:
+         return chn_eng_g2p(text)
+     else:
+         return g2p(text, sentence=None, language=language)
+
+
+ def build_t2s_model(cfg, device):
+     t2s_model = MaskGCT_T2S(cfg=cfg)
+     t2s_model.eval()
+     t2s_model.to(device)
+     return t2s_model
+
+
+ def build_s2a_model(cfg, device):
+     soundstorm_model = MaskGCT_S2A(cfg=cfg)
+     soundstorm_model.eval()
+     soundstorm_model.to(device)
+     return soundstorm_model
+
+
+ def build_semantic_model(device):
+     semantic_model = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0")
+     semantic_model.eval()
+     semantic_model.to(device)
+     stat_mean_var = torch.load("./models/tts/maskgct/ckpt/wav2vec2bert_stats.pt")
+     semantic_mean = stat_mean_var["mean"]
+     semantic_std = torch.sqrt(stat_mean_var["var"])
+     semantic_mean = semantic_mean.to(device)
+     semantic_std = semantic_std.to(device)
+     return semantic_model, semantic_mean, semantic_std
+
+
+ def build_semantic_codec(cfg, device):
+     semantic_codec = RepCodec(cfg=cfg)
+     semantic_codec.eval()
+     semantic_codec.to(device)
+     return semantic_codec
+
+
+ def build_acoustic_codec(cfg, device):
+     codec_encoder = CodecEncoder(cfg=cfg.encoder)
+     codec_decoder = CodecDecoder(cfg=cfg.decoder)
+     codec_encoder.eval()
+     codec_decoder.eval()
+     codec_encoder.to(device)
+     codec_decoder.to(device)
+     return codec_encoder, codec_decoder
+
+
+ @torch.no_grad()
+ def extract_features(speech, processor):
+     inputs = processor(speech, sampling_rate=16000, return_tensors="pt")
+     input_features = inputs["input_features"][0]
+     attention_mask = inputs["attention_mask"][0]
+     return input_features, attention_mask
+
+
+ @torch.no_grad()
+ def extract_semantic_code(semantic_mean, semantic_std, input_features, attention_mask):
+     vq_emb = semantic_model(
+         input_features=input_features,
+         attention_mask=attention_mask,
+         output_hidden_states=True,
+     )
+     feat = vq_emb.hidden_states[17]  # (B, T, C)
+     feat = (feat - semantic_mean.to(feat)) / semantic_std.to(feat)
+
+     semantic_code, rec_feat = semantic_codec.quantize(feat)  # (B, T)
+     return semantic_code, rec_feat
+
+
+ @torch.no_grad()
+ def extract_acoustic_code(speech):
+     vq_emb = codec_encoder(speech.unsqueeze(1))
+     _, vq, _, _, _ = codec_decoder.quantizer(vq_emb)
+     acoustic_code = vq.permute(1, 2, 0)
+     return acoustic_code
+
+
+ @torch.no_grad()
+ def text2semantic(
+     device,
+     prompt_speech,
+     prompt_text,
+     prompt_language,
+     target_text,
+     target_language,
+     target_len=None,
+     n_timesteps=50,
+     cfg=2.5,
+     rescale_cfg=0.75,
+ ):
+     prompt_phone_id = g2p_(prompt_text, prompt_language)[1]
+
+     target_phone_id = g2p_(target_text, target_language)[1]
+
+     if target_len is None:
+         target_len = int(
+             (len(prompt_speech) * len(target_phone_id) / len(prompt_phone_id))
+             / 16000
+             * 50
+         )
+     else:
+         target_len = int(target_len * 50)
+
+     prompt_phone_id = torch.tensor(prompt_phone_id, dtype=torch.long).to(device)
+     target_phone_id = torch.tensor(target_phone_id, dtype=torch.long).to(device)
+
+     phone_id = torch.cat([prompt_phone_id, target_phone_id])
+
+     input_features, attention_mask = extract_features(prompt_speech, processor)
+     input_features = input_features.unsqueeze(0).to(device)
+     attention_mask = attention_mask.unsqueeze(0).to(device)
+     semantic_code, _ = extract_semantic_code(
+         semantic_mean, semantic_std, input_features, attention_mask
+     )
+
+     predict_semantic = t2s_model.reverse_diffusion(
+         semantic_code[:, :],
+         target_len,
+         phone_id.unsqueeze(0),
+         n_timesteps=n_timesteps,
+         cfg=cfg,
+         rescale_cfg=rescale_cfg,
+     )
+
+     combine_semantic_code = torch.cat([semantic_code[:, :], predict_semantic], dim=-1)
+     prompt_semantic_code = semantic_code
+
+     return combine_semantic_code, prompt_semantic_code
+
+
+ @torch.no_grad()
+ def semantic2acoustic(
+     device,
+     combine_semantic_code,
+     acoustic_code,
+     n_timesteps=[25, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+     cfg=2.5,
+     rescale_cfg=0.75,
+ ):
+     semantic_code = combine_semantic_code
+
+     cond = s2a_model_1layer.cond_emb(semantic_code)
+     prompt = acoustic_code[:, :, :]
+     predict_1layer = s2a_model_1layer.reverse_diffusion(
+         cond=cond,
+         prompt=prompt,
+         temp=1.5,
+         filter_thres=0.98,
+         n_timesteps=n_timesteps[:1],
+         cfg=cfg,
+         rescale_cfg=rescale_cfg,
+     )
+
+     cond = s2a_model_full.cond_emb(semantic_code)
+     prompt = acoustic_code[:, :, :]
+     predict_full = s2a_model_full.reverse_diffusion(
+         cond=cond,
+         prompt=prompt,
+         temp=1.5,
+         filter_thres=0.98,
+         n_timesteps=n_timesteps,
+         cfg=cfg,
+         rescale_cfg=rescale_cfg,
+         gt_code=predict_1layer,
+     )
+
+     vq_emb = codec_decoder.vq2emb(predict_full.permute(2, 0, 1), n_quantizers=12)
+     recovered_audio = codec_decoder(vq_emb)
+     prompt_vq_emb = codec_decoder.vq2emb(prompt.permute(2, 0, 1), n_quantizers=12)
+     recovered_prompt_audio = codec_decoder(prompt_vq_emb)
+     recovered_prompt_audio = recovered_prompt_audio[0][0].cpu().numpy()
+     recovered_audio = recovered_audio[0][0].cpu().numpy()
+     combine_audio = np.concatenate([recovered_prompt_audio, recovered_audio])
+
+     return combine_audio, recovered_audio
+
+
+ # Load the model and checkpoints
+ def load_models():
+     cfg_path = "./models/tts/maskgct/config/maskgct.json"
+
+     cfg = load_config(cfg_path)
+     semantic_model, semantic_mean, semantic_std = build_semantic_model(device)
+     semantic_codec = build_semantic_codec(cfg.model.semantic_codec, device)
+     codec_encoder, codec_decoder = build_acoustic_codec(
+         cfg.model.acoustic_codec, device
+     )
+     t2s_model = build_t2s_model(cfg.model.t2s_model, device)
+     s2a_model_1layer = build_s2a_model(cfg.model.s2a_model.s2a_1layer, device)
+     s2a_model_full = build_s2a_model(cfg.model.s2a_model.s2a_full, device)
+
+     # Download checkpoints
+     semantic_code_ckpt = hf_hub_download(
+         "amphion/MaskGCT", filename="semantic_codec/model.safetensors"
+     )
+     codec_encoder_ckpt = hf_hub_download(
+         "amphion/MaskGCT", filename="acoustic_codec/model.safetensors"
+     )
+     codec_decoder_ckpt = hf_hub_download(
+         "amphion/MaskGCT", filename="acoustic_codec/model_1.safetensors"
+     )
+     t2s_model_ckpt = hf_hub_download(
+         "amphion/MaskGCT", filename="t2s_model/model.safetensors"
+     )
+     s2a_1layer_ckpt = hf_hub_download(
+         "amphion/MaskGCT", filename="s2a_model/s2a_model_1layer/model.safetensors"
+     )
+     s2a_full_ckpt = hf_hub_download(
+         "amphion/MaskGCT", filename="s2a_model/s2a_model_full/model.safetensors"
+     )
+
+     safetensors.torch.load_model(semantic_codec, semantic_code_ckpt)
+     safetensors.torch.load_model(codec_encoder, codec_encoder_ckpt)
+     safetensors.torch.load_model(codec_decoder, codec_decoder_ckpt)
+     safetensors.torch.load_model(t2s_model, t2s_model_ckpt)
+     safetensors.torch.load_model(s2a_model_1layer, s2a_1layer_ckpt)
+     safetensors.torch.load_model(s2a_model_full, s2a_full_ckpt)
+
+     return (
+         semantic_model,
+         semantic_mean,
+         semantic_std,
+         semantic_codec,
+         codec_encoder,
+         codec_decoder,
+         t2s_model,
+         s2a_model_1layer,
+         s2a_model_full,
+     )
+
+
+ # The inference functions above reference these as module-level globals,
+ # so build and load everything once at startup.
+ (
+     semantic_model,
+     semantic_mean,
+     semantic_std,
+     semantic_codec,
+     codec_encoder,
+     codec_decoder,
+     t2s_model,
+     s2a_model_1layer,
+     s2a_model_full,
+ ) = load_models()
+
+
+ @torch.no_grad()
+ def maskgct_inference(
+     prompt_speech_path,
+     prompt_text,
+     target_text,
+     language="en",
+     target_language="en",
+     target_len=None,
+     n_timesteps=25,
+     cfg=2.5,
+     rescale_cfg=0.75,
+     n_timesteps_s2a=[25, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+     cfg_s2a=2.5,
+     rescale_cfg_s2a=0.75,
+     device=torch.device("cuda:5"),  # callers below override this default
+ ):
+     speech_16k = librosa.load(prompt_speech_path, sr=16000)[0]
+     speech = librosa.load(prompt_speech_path, sr=24000)[0]
+
+     combine_semantic_code, _ = text2semantic(
+         device,
+         speech_16k,
+         prompt_text,
+         language,
+         target_text,
+         target_language,
+         target_len,
+         n_timesteps,
+         cfg,
+         rescale_cfg,
+     )
+     acoustic_code = extract_acoustic_code(torch.tensor(speech).unsqueeze(0).to(device))
+     _, recovered_audio = semantic2acoustic(
+         device,
+         combine_semantic_code,
+         acoustic_code,
+         n_timesteps=n_timesteps_s2a,
+         cfg=cfg_s2a,
+         rescale_cfg=rescale_cfg_s2a,
+     )
+
+     return recovered_audio
+
+
+ @torch.no_grad()
+ def inference(
+     prompt_wav,
+     prompt_text,
+     target_text,
+     target_len,
+     n_timesteps,
+     language,
+     target_language,
+ ):
+     save_path = "./output/output.wav"
+     os.makedirs("./output", exist_ok=True)
+     recovered_audio = maskgct_inference(
+         prompt_wav,
+         prompt_text,
+         target_text,
+         language,
+         target_language,
+         target_len=target_len,
+         n_timesteps=int(n_timesteps),
+         device=device,
+     )
+     sf.write(save_path, recovered_audio, 24000)
+     return save_path
+
+
+ # Language list
+ language_list = ["en", "zh", "ja", "ko", "fr", "de"]
+
+ # Gradio interface
+ iface = gr.Interface(
+     fn=inference,
+     inputs=[
+         gr.Audio(label="Upload Prompt Wav", type="filepath"),
+         gr.Textbox(label="Prompt Text"),
+         gr.Textbox(label="Target Text"),
+         gr.Number(
+             label="Target Duration (in seconds)", value=None
+         ),  # Removed 'optional=True'
+         gr.Slider(
+             label="Number of Timesteps", minimum=15, maximum=100, value=25, step=1
+         ),
+         gr.Dropdown(label="Language", choices=language_list, value="en"),
+         gr.Dropdown(label="Target Language", choices=language_list, value="en"),
+     ],
+     outputs=gr.Audio(label="Generated Audio"),
+     title="MaskGCT TTS Demo",
+     description="Generate speech from text using the MaskGCT model.",
+ )
+
+ # Launch the interface
+ iface.launch(allowed_paths=["./output"])
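
For reference, the same pipeline can be driven without the Gradio UI. A minimal sketch, assuming the checkpoints above have downloaded and the models are loaded as in load_models(); the texts are placeholders, and the prompt wav is the one tracked in .gitattributes:

    # Hypothetical direct call into the pipeline defined in app.py
    # (texts are placeholders; models must already be loaded).
    audio = maskgct_inference(
        prompt_speech_path="./models/tts/maskgct/wav/prompt.wav",
        prompt_text="This is the transcript of the prompt audio.",
        target_text="MaskGCT generates speech in the prompt speaker's voice.",
        language="en",
        target_language="en",
        n_timesteps=25,
        device=device,
    )
    sf.write("./output/direct_demo.wav", audio, 24000)  # the codec decodes at 24 kHz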
text/__init__.py ADDED
@@ -0,0 +1,79 @@
+ # Copyright (c) 2023 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """ This code is modified from https://github.com/keithito/tacotron """
+ import re
+ from text import cleaners
+ from text.symbols import symbols
+
+
+ # Mappings from symbol to numeric ID and vice versa:
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+ _id_to_symbol = {i: s for i, s in enumerate(symbols)}
+
+ # Regular expression matching text enclosed in curly braces:
+ _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
+
+
+ def text_to_sequence(text, cleaner_names):
+     """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+
+     The text can optionally have ARPAbet sequences enclosed in curly braces embedded
+     in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
+
+     Args:
+         text: string to convert to a sequence
+         cleaner_names: names of the cleaner functions to run the text through
+
+     Returns:
+         List of integers corresponding to the symbols in the text
+     """
+     sequence = []
+
+     # Check for curly braces and treat their contents as ARPAbet:
+     while len(text):
+         m = _curly_re.match(text)
+
+         if not m:
+             sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
+             break
+         sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
+         sequence += _arpabet_to_sequence(m.group(2))
+         text = m.group(3)
+     return sequence
+
+
+ def sequence_to_text(sequence):
+     """Converts a sequence of IDs back to a string"""
+     result = ""
+     for symbol_id in sequence:
+         if symbol_id in _id_to_symbol:
+             s = _id_to_symbol[symbol_id]
+             # Enclose ARPAbet back in curly braces:
+             if len(s) > 1 and s[0] == "@":
+                 s = "{%s}" % s[1:]
+             result += s
+     return result.replace("}{", " ")
+
+
+ def _clean_text(text, cleaner_names):
+     for name in cleaner_names:
+         cleaner = getattr(cleaners, name)
+         if not cleaner:
+             raise Exception("Unknown cleaner: %s" % name)
+         text = cleaner(text)
+     return text
+
+
+ def _symbols_to_sequence(symbols):
+     return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
+
+
+ def _arpabet_to_sequence(text):
+     return _symbols_to_sequence(["@" + s for s in text.split()])
+
+
+ def _should_keep_symbol(s):
+     return s in _symbol_to_id and s != "_" and s != "~"
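
A quick usage sketch of the two entry points, reusing the ARPAbet example from the docstring (assumes the symbol table and cleaners shipped in this package):

    from text import text_to_sequence, sequence_to_text

    ids = text_to_sequence("Turn left on {HH AW1 S S T AH0 N} Street.", ["english_cleaners"])
    print(ids)                    # list of integer symbol IDs
    print(sequence_to_text(ids))  # round-trip; ARPAbet spans come back in braces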
text/cleaners.py ADDED
@@ -0,0 +1,98 @@
+ # Copyright (c) 2023 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """ This code is modified from https://github.com/keithito/tacotron """
+
+ """
+ Cleaners are transformations that run over the input text at both training and eval time.
+
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+     1. "english_cleaners" for English text
+     2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+        the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+     3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+        the symbols in symbols.py to match your data).
+ """
+
+
+ # Regular expression matching whitespace:
+ import re
+ from unidecode import unidecode
+ from .numbers import normalize_numbers
+
+ _whitespace_re = re.compile(r"\s+")
+
+ # List of (regular expression, replacement) pairs for abbreviations:
+ _abbreviations = [
+     (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+     for x in [
+         ("mrs", "misess"),
+         ("mr", "mister"),
+         ("dr", "doctor"),
+         ("st", "saint"),
+         ("co", "company"),
+         ("jr", "junior"),
+         ("maj", "major"),
+         ("gen", "general"),
+         ("drs", "doctors"),
+         ("rev", "reverend"),
+         ("lt", "lieutenant"),
+         ("hon", "honorable"),
+         ("sgt", "sergeant"),
+         ("capt", "captain"),
+         ("esq", "esquire"),
+         ("ltd", "limited"),
+         ("col", "colonel"),
+         ("ft", "fort"),
+     ]
+ ]
+
+
+ def expand_abbreviations(text):
+     for regex, replacement in _abbreviations:
+         text = re.sub(regex, replacement, text)
+     return text
+
+
+ def expand_numbers(text):
+     return normalize_numbers(text)
+
+
+ def lowercase(text):
+     return text.lower()
+
+
+ def collapse_whitespace(text):
+     return re.sub(_whitespace_re, " ", text)
+
+
+ def convert_to_ascii(text):
+     return unidecode(text)
+
+
+ def basic_cleaners(text):
+     """Basic pipeline that lowercases and collapses whitespace without transliteration."""
+     text = lowercase(text)
+     text = collapse_whitespace(text)
+     return text
+
+
+ def transliteration_cleaners(text):
+     """Pipeline for non-English text that transliterates to ASCII."""
+     text = convert_to_ascii(text)
+     text = lowercase(text)
+     text = collapse_whitespace(text)
+     return text
+
+
+ def english_cleaners(text):
+     """Pipeline for English text, including number and abbreviation expansion."""
+     text = convert_to_ascii(text)
+     text = lowercase(text)
+     text = expand_numbers(text)
+     text = expand_abbreviations(text)
+     text = collapse_whitespace(text)
+     return text
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+
8
+ import re
9
+
10
+
11
+ valid_symbols = [
12
+ "AA",
13
+ "AA0",
14
+ "AA1",
15
+ "AA2",
16
+ "AE",
17
+ "AE0",
18
+ "AE1",
19
+ "AE2",
20
+ "AH",
21
+ "AH0",
22
+ "AH1",
23
+ "AH2",
24
+ "AO",
25
+ "AO0",
26
+ "AO1",
27
+ "AO2",
28
+ "AW",
29
+ "AW0",
30
+ "AW1",
31
+ "AW2",
32
+ "AY",
33
+ "AY0",
34
+ "AY1",
35
+ "AY2",
36
+ "B",
37
+ "CH",
38
+ "D",
39
+ "DH",
40
+ "EH",
41
+ "EH0",
42
+ "EH1",
43
+ "EH2",
44
+ "ER",
45
+ "ER0",
46
+ "ER1",
47
+ "ER2",
48
+ "EY",
49
+ "EY0",
50
+ "EY1",
51
+ "EY2",
52
+ "F",
53
+ "G",
54
+ "HH",
55
+ "IH",
56
+ "IH0",
57
+ "IH1",
58
+ "IH2",
59
+ "IY",
60
+ "IY0",
61
+ "IY1",
62
+ "IY2",
63
+ "JH",
64
+ "K",
65
+ "L",
66
+ "M",
67
+ "N",
68
+ "NG",
69
+ "OW",
70
+ "OW0",
71
+ "OW1",
72
+ "OW2",
73
+ "OY",
74
+ "OY0",
75
+ "OY1",
76
+ "OY2",
77
+ "P",
78
+ "R",
79
+ "S",
80
+ "SH",
81
+ "T",
82
+ "TH",
83
+ "UH",
84
+ "UH0",
85
+ "UH1",
86
+ "UH2",
87
+ "UW",
88
+ "UW0",
89
+ "UW1",
90
+ "UW2",
91
+ "V",
92
+ "W",
93
+ "Y",
94
+ "Z",
95
+ "ZH",
96
+ ]
97
+
98
+ _valid_symbol_set = set(valid_symbols)
99
+
100
+
101
+ class CMUDict:
102
+ """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
103
+
104
+ def __init__(self, file_or_path, keep_ambiguous=True):
105
+ if isinstance(file_or_path, str):
106
+ with open(file_or_path, encoding="latin-1") as f:
107
+ entries = _parse_cmudict(f)
108
+ else:
109
+ entries = _parse_cmudict(file_or_path)
110
+ if not keep_ambiguous:
111
+ entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
112
+ self._entries = entries
113
+
114
+ def __len__(self):
115
+ return len(self._entries)
116
+
117
+ def lookup(self, word):
118
+ """Returns list of ARPAbet pronunciations of the given word."""
119
+ return self._entries.get(word.upper())
120
+
121
+
122
+ _alt_re = re.compile(r"\([0-9]+\)")
123
+
124
+
125
+ def _parse_cmudict(file):
126
+ cmudict = {}
127
+ for line in file:
128
+ if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
129
+ parts = line.split(" ")
130
+ word = re.sub(_alt_re, "", parts[0])
131
+ pronunciation = _get_pronunciation(parts[1])
132
+ if pronunciation:
133
+ if word in cmudict:
134
+ cmudict[word].append(pronunciation)
135
+ else:
136
+ cmudict[word] = [pronunciation]
137
+ return cmudict
138
+
139
+
140
+ def _get_pronunciation(s):
141
+ parts = s.strip().split(" ")
142
+ for part in parts:
143
+ if part not in _valid_symbol_set:
144
+ return None
145
+ return " ".join(parts)
text/g2p.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import re
7
+ from g2p_en import G2p
8
+ from string import punctuation
9
+
10
+
11
+ def read_lexicon(lex_path):
12
+ lexicon = {}
13
+ with open(lex_path) as f:
14
+ for line in f:
15
+ temp = re.split(r"\s+", line.strip("\n"))
16
+ word = temp[0]
17
+ phones = temp[1:]
18
+ if word.lower() not in lexicon:
19
+ lexicon[word.lower()] = phones
20
+ return lexicon
21
+
22
+
23
+ def preprocess_english(text, lexicon):
24
+ text = text.rstrip(punctuation)
25
+
26
+ g2p = G2p()
27
+ phones = []
28
+ words = re.split(r"([,;.\-\?\!\s+])", text)
29
+ for w in words:
30
+ if w.lower() in lexicon:
31
+ phones += lexicon[w.lower()]
32
+ else:
33
+ phones += list(filter(lambda p: p != " ", g2p(w)))
34
+ phones = "}{".join(phones)
35
+ phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
36
+ phones = phones.replace("}{", " ")
37
+
38
+ return phones
text/g2p_module.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import re
8
+ from g2p_en import G2p
9
+ from string import punctuation
10
+ from typing import Any, Dict, List, Optional, Pattern, Union
11
+
12
+ from phonemizer.backend import EspeakBackend
13
+ from phonemizer.backend.espeak.language_switch import LanguageSwitch
14
+ from phonemizer.backend.espeak.words_mismatch import WordMismatch
15
+ from phonemizer.punctuation import Punctuation
16
+ from phonemizer.separator import Separator
17
+
18
+ try:
19
+ from pypinyin import Style, pinyin
20
+ from pypinyin.style._utils import get_finals, get_initials
21
+ except Exception:
22
+ pass
23
+
24
+
25
+ # This code is modified from
26
+ # https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/data/tokenizer.py
27
+
28
+
29
+ class PypinyinBackend:
30
+ """PypinyinBackend for Chinese. Most codes is referenced from espnet.
31
+ There are two types pinyin or initials_finals, one is
32
+ just like "ni1 hao3", the other is like "n i1 h ao3".
33
+ """
34
+
35
+ def __init__(
36
+ self,
37
+ backend="initials_finals",
38
+ punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
39
+ ) -> None:
40
+ self.backend = backend
41
+ self.punctuation_marks = punctuation_marks
42
+
43
+ def phonemize(
44
+ self, text: List[str], separator: Separator, strip=True, njobs=1
45
+ ) -> List[str]:
46
+ assert isinstance(text, List)
47
+ phonemized = []
48
+ for _text in text:
49
+ _text = re.sub(" +", " ", _text.strip())
50
+ _text = _text.replace(" ", separator.word)
51
+ phones = []
52
+ if self.backend == "pypinyin":
53
+ for n, py in enumerate(
54
+ pinyin(_text, style=Style.TONE3, neutral_tone_with_five=True)
55
+ ):
56
+ if all([c in self.punctuation_marks for c in py[0]]):
57
+ if len(phones):
58
+ assert phones[-1] == separator.syllable
59
+ phones.pop(-1)
60
+
61
+ phones.extend(list(py[0]))
62
+ else:
63
+ phones.extend([py[0], separator.syllable])
64
+ elif self.backend == "pypinyin_initials_finals":
65
+ for n, py in enumerate(
66
+ pinyin(_text, style=Style.TONE3, neutral_tone_with_five=True)
67
+ ):
68
+ if all([c in self.punctuation_marks for c in py[0]]):
69
+ if len(phones):
70
+ assert phones[-1] == separator.syllable
71
+ phones.pop(-1)
72
+ phones.extend(list(py[0]))
73
+ else:
74
+ if py[0][-1].isalnum():
75
+ initial = get_initials(py[0], strict=False)
76
+ if py[0][-1].isdigit():
77
+ final = get_finals(py[0][:-1], strict=False) + py[0][-1]
78
+ else:
79
+ final = get_finals(py[0], strict=False)
80
+ phones.extend(
81
+ [
82
+ initial,
83
+ separator.phone,
84
+ final,
85
+ separator.syllable,
86
+ ]
87
+ )
88
+ else:
89
+ assert ValueError
90
+ else:
91
+ raise NotImplementedError
92
+ phonemized.append(
93
+ "".join(phones).rstrip(f"{separator.word}{separator.syllable}")
94
+ )
95
+ return phonemized
96
+
97
+
98
+ class G2PModule:
99
+ """Phonemize Text."""
100
+
101
+ # We support espeak to extract IPA (International Phonetic Alphabet), which supports 100 languages,
102
+ # https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md
103
+
104
+ def __init__(
105
+ self,
106
+ language="en-us",
107
+ backend="espeak",
108
+ separator=Separator(word="_", syllable="-", phone="|"),
109
+ preserve_punctuation=True,
110
+ punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
111
+ with_stress: bool = False,
112
+ tie: Union[bool, str] = False,
113
+ language_switch: LanguageSwitch = "keep-flags",
114
+ words_mismatch: WordMismatch = "ignore",
115
+ ) -> None:
116
+ self.separator = separator
117
+ self.backend = self._initialize_backend(
118
+ backend,
119
+ language,
120
+ punctuation_marks,
121
+ preserve_punctuation,
122
+ with_stress,
123
+ tie,
124
+ language_switch,
125
+ words_mismatch,
126
+ )
127
+
128
+ def _initialize_backend(
129
+ self,
130
+ backend,
131
+ language,
132
+ punctuation_marks,
133
+ preserve_punctuation,
134
+ with_stress,
135
+ tie,
136
+ language_switch,
137
+ words_mismatch,
138
+ ):
139
+ if backend == "espeak":
140
+ return EspeakBackend(
141
+ language,
142
+ punctuation_marks=punctuation_marks,
143
+ preserve_punctuation=preserve_punctuation,
144
+ with_stress=with_stress,
145
+ tie=tie,
146
+ language_switch=language_switch,
147
+ words_mismatch=words_mismatch,
148
+ )
149
+ elif backend in ["pypinyin", "pypinyin_initials_finals"]:
150
+ if language != "cmn":
151
+ raise ValueError(
152
+ f"{language} is not supported for pypinyin and pypinyin_initials_finals."
153
+ )
154
+ return PypinyinBackend(
155
+ backend=backend,
156
+ punctuation_marks=punctuation_marks + self.separator.word,
157
+ )
158
+ else:
159
+ raise NotImplementedError(f"{backend}")
160
+
161
+ def to_list(self, phonemized: str) -> List[str]:
162
+ fields = []
163
+ for word in phonemized.split(self.separator.word):
164
+ pp = re.findall(r"\w+|[^\w\s]", word, re.UNICODE)
165
+ fields.extend(
166
+ [p for p in pp if p != self.separator.phone] + [self.separator.word]
167
+ )
168
+ assert len("".join(fields[:-1])) == len(phonemized) - phonemized.count(
169
+ self.separator.phone
170
+ )
171
+ return fields[:-1]
172
+
173
+ def phonemization(self, text, strip=True) -> List[List[str]]:
174
+ if isinstance(text, str):
175
+ text = [text]
176
+
177
+ phonemized = self.backend.phonemize(
178
+ text, separator=self.separator, strip=strip, njobs=1
179
+ )
180
+ phonemes = [self.to_list(p) for p in phonemized]
181
+ return phonemes
182
+
183
+ def g2p_conversion(self, text: str) -> List[str]:
184
+ phonemes = self.phonemization([text.strip()])
185
+ return phonemes[0]
186
+
187
+
188
+ class LexiconModule:
189
+ def __init__(self, lex_path, language="en-us") -> None:
190
+ # todo: check lexicon derivation, merge with G2PModule?
191
+ lexicon = {}
192
+ with open(lex_path) as f:
193
+ for line in f:
194
+ temp = re.split(r"\s+", line.strip("\n"))
195
+ word = temp[0]
196
+ phones = temp[1:]
197
+ if word.lower() not in lexicon:
198
+ lexicon[word.lower()] = phones
199
+ self.lexicon = lexicon
200
+ self.language = language
201
+ self.lang2g2p = {"en-us": G2p()}
202
+
203
+ def g2p_conversion(self, text):
204
+ phone = None
205
+
206
+ # todo: preprocess with other languages
207
+ if self.language == "en-us":
208
+ phone = self.preprocess_english(text)
209
+ else:
210
+ print("No support to", self.language)
211
+ raise
212
+
213
+ return phone
214
+
215
+ def preprocess_english(self, text):
216
+ text = text.rstrip(punctuation)
217
+
218
+ g2p = self.lang2g2p["en-us"]
219
+ phones = []
220
+ words = re.split(r"([,;.\-\?\!\s+])", text)
221
+ for w in words:
222
+ if w.lower() in self.lexicon:
223
+ phones += self.lexicon[w.lower()]
224
+ else:
225
+ phones += list(filter(lambda p: p != " ", g2p(w)))
226
+ phones = "}{".join(phones)
227
+ phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
228
+ phones = phones.replace("}{", " ")
229
+
230
+ return phones
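
Usage sketch for the espeak-backed phonemizer (requires the phonemizer package and an espeak-ng install; output is a list of IPA symbols and separators):

    from text.g2p_module import G2PModule

    g2p_module = G2PModule(language="en-us", backend="espeak")
    print(g2p_module.g2p_conversion("Hello world"))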
text/lexicon/librispeech-lexicon.txt ADDED
The diff for this file is too large to render.
text/lexicon/pinyin-lexicon-r.txt ADDED
@@ -0,0 +1,4120 @@
+ a1 a1
+ a2 a2
+ a3 a3
+ a4 a4
+ a5 a5
+ ai1 ai1
+ ai2 ai2
+ ai3 ai3
+ ai4 ai4
+ ai5 ai5
+ an1 an1
+ an2 an2
+ an3 an3
+ an4 an4
+ an5 an5
+ ang1 ang1
+ ang2 ang2
+ ang3 ang3
+ ang4 ang4
+ ang5 ang5
+ ao1 ao1
+ ao2 ao2
+ ao3 ao3
+ ao4 ao4
+ ao5 ao5
+ ba1 b a1
+ ba2 b a2
+ ba3 b a3
+ ba4 b a4
+ ba5 b a5
+ bai1 b ai1
+ bai2 b ai2
+ bai3 b ai3
+ bai4 b ai4
+ bai5 b ai5
+ ban1 b an1
+ ban2 b an2
+ ban3 b an3
+ ban4 b an4
+ ban5 b an5
+ bang1 b ang1
+ bang2 b ang2
+ bang3 b ang3
+ bang4 b ang4
+ bang5 b ang5
+ bao1 b ao1
+ bao2 b ao2
+ bao3 b ao3
+ bao4 b ao4
+ bao5 b ao5
+ bei1 b ei1
+ bei2 b ei2
+ bei3 b ei3
+ bei4 b ei4
+ bei5 b ei5
+ ben1 b en1
+ ben2 b en2
+ ben3 b en3
+ ben4 b en4
+ ben5 b en5
+ beng1 b eng1
+ beng2 b eng2
+ beng3 b eng3
+ beng4 b eng4
+ beng5 b eng5
+ bi1 b i1
+ bi2 b i2
+ bi3 b i3
+ bi4 b i4
+ bi5 b i5
+ bian1 b ian1
+ bian2 b ian2
+ bian3 b ian3
+ bian4 b ian4
+ bian5 b ian5
+ biao1 b iao1
+ biao2 b iao2
+ biao3 b iao3
+ biao4 b iao4
+ biao5 b iao5
+ bie1 b ie1
+ bie2 b ie2
+ bie3 b ie3
+ bie4 b ie4
+ bie5 b ie5
+ bin1 b in1
+ bin2 b in2
+ bin3 b in3
+ bin4 b in4
+ bin5 b in5
+ bing1 b ing1
+ bing2 b ing2
+ bing3 b ing3
+ bing4 b ing4
+ bing5 b ing5
+ bo1 b o1
+ bo2 b o2
+ bo3 b o3
+ bo4 b o4
+ bo5 b o5
+ bu1 b u1
+ bu2 b u2
+ bu3 b u3
+ bu4 b u4
+ bu5 b u5
+ ca1 c a1
+ ca2 c a2
+ ca3 c a3
+ ca4 c a4
+ ca5 c a5
+ cai1 c ai1
+ cai2 c ai2
+ cai3 c ai3
+ cai4 c ai4
+ cai5 c ai5
+ can1 c an1
+ can2 c an2
+ can3 c an3
+ can4 c an4
+ can5 c an5
+ cang1 c ang1
+ cang2 c ang2
+ cang3 c ang3
+ cang4 c ang4
+ cang5 c ang5
+ cao1 c ao1
+ cao2 c ao2
+ cao3 c ao3
+ cao4 c ao4
+ cao5 c ao5
+ ce1 c e1
+ ce2 c e2
+ ce3 c e3
+ ce4 c e4
+ ce5 c e5
+ cen1 c en1
+ cen2 c en2
+ cen3 c en3
+ cen4 c en4
+ cen5 c en5
+ ceng1 c eng1
+ ceng2 c eng2
+ ceng3 c eng3
+ ceng4 c eng4
+ ceng5 c eng5
+ cha1 ch a1
+ cha2 ch a2
+ cha3 ch a3
+ cha4 ch a4
+ cha5 ch a5
+ chai1 ch ai1
+ chai2 ch ai2
+ chai3 ch ai3
+ chai4 ch ai4
+ chai5 ch ai5
+ chan1 ch an1
+ chan2 ch an2
+ chan3 ch an3
+ chan4 ch an4
+ chan5 ch an5
+ chang1 ch ang1
+ chang2 ch ang2
+ chang3 ch ang3
+ chang4 ch ang4
+ chang5 ch ang5
+ chao1 ch ao1
+ chao2 ch ao2
+ chao3 ch ao3
+ chao4 ch ao4
+ chao5 ch ao5
+ che1 ch e1
+ che2 ch e2
+ che3 ch e3
+ che4 ch e4
+ che5 ch e5
+ chen1 ch en1
+ chen2 ch en2
+ chen3 ch en3
+ chen4 ch en4
+ chen5 ch en5
+ cheng1 ch eng1
+ cheng2 ch eng2
+ cheng3 ch eng3
+ cheng4 ch eng4
+ cheng5 ch eng5
+ chi1 ch iii1
+ chi2 ch iii2
+ chi3 ch iii3
+ chi4 ch iii4
+ chi5 ch iii5
+ chong1 ch ong1
+ chong2 ch ong2
+ chong3 ch ong3
+ chong4 ch ong4
+ chong5 ch ong5
+ chou1 ch ou1
+ chou2 ch ou2
+ chou3 ch ou3
+ chou4 ch ou4
+ chou5 ch ou5
+ chu1 ch u1
+ chu2 ch u2
+ chu3 ch u3
+ chu4 ch u4
+ chu5 ch u5
+ chuai1 ch uai1
+ chuai2 ch uai2
+ chuai3 ch uai3
+ chuai4 ch uai4
+ chuai5 ch uai5
+ chuan1 ch uan1
+ chuan2 ch uan2
+ chuan3 ch uan3
+ chuan4 ch uan4
+ chuan5 ch uan5
+ chuang1 ch uang1
+ chuang2 ch uang2
+ chuang3 ch uang3
+ chuang4 ch uang4
+ chuang5 ch uang5
+ chui1 ch uei1
+ chui2 ch uei2
+ chui3 ch uei3
+ chui4 ch uei4
+ chui5 ch uei5
+ chun1 ch uen1
+ chun2 ch uen2
+ chun3 ch uen3
+ chun4 ch uen4
+ chun5 ch uen5
+ chuo1 ch uo1
+ chuo2 ch uo2
+ chuo3 ch uo3
+ chuo4 ch uo4
+ chuo5 ch uo5
+ ci1 c ii1
+ ci2 c ii2
+ ci3 c ii3
+ ci4 c ii4
+ ci5 c ii5
+ cong1 c ong1
+ cong2 c ong2
+ cong3 c ong3
+ cong4 c ong4
+ cong5 c ong5
+ cou1 c ou1
+ cou2 c ou2
+ cou3 c ou3
+ cou4 c ou4
+ cou5 c ou5
+ cu1 c u1
+ cu2 c u2
+ cu3 c u3
+ cu4 c u4
+ cu5 c u5
+ cuan1 c uan1
+ cuan2 c uan2
+ cuan3 c uan3
+ cuan4 c uan4
+ cuan5 c uan5
+ cui1 c uei1
+ cui2 c uei2
+ cui3 c uei3
+ cui4 c uei4
+ cui5 c uei5
+ cun1 c uen1
+ cun2 c uen2
+ cun3 c uen3
+ cun4 c uen4
+ cun5 c uen5
+ cuo1 c uo1
+ cuo2 c uo2
+ cuo3 c uo3
+ cuo4 c uo4
+ cuo5 c uo5
+ da1 d a1
+ da2 d a2
+ da3 d a3
+ da4 d a4
+ da5 d a5
+ dai1 d ai1
+ dai2 d ai2
+ dai3 d ai3
+ dai4 d ai4
+ dai5 d ai5
+ dan1 d an1
+ dan2 d an2
+ dan3 d an3
+ dan4 d an4
+ dan5 d an5
+ dang1 d ang1
+ dang2 d ang2
+ dang3 d ang3
+ dang4 d ang4
+ dang5 d ang5
+ dao1 d ao1
+ dao2 d ao2
+ dao3 d ao3
+ dao4 d ao4
+ dao5 d ao5
+ de1 d e1
+ de2 d e2
+ de3 d e3
+ de4 d e4
+ de5 d e5
+ dei1 d ei1
+ dei2 d ei2
+ dei3 d ei3
+ dei4 d ei4
+ dei5 d ei5
+ den1 d en1
+ den2 d en2
+ den3 d en3
+ den4 d en4
+ den5 d en5
+ deng1 d eng1
+ deng2 d eng2
+ deng3 d eng3
+ deng4 d eng4
+ deng5 d eng5
+ di1 d i1
+ di2 d i2
+ di3 d i3
+ di4 d i4
+ di5 d i5
+ dia1 d ia1
+ dia2 d ia2
+ dia3 d ia3
+ dia4 d ia4
+ dia5 d ia5
+ dian1 d ian1
+ dian2 d ian2
+ dian3 d ian3
+ dian4 d ian4
+ dian5 d ian5
+ diao1 d iao1
+ diao2 d iao2
+ diao3 d iao3
+ diao4 d iao4
+ diao5 d iao5
+ die1 d ie1
+ die2 d ie2
+ die3 d ie3
+ die4 d ie4
+ die5 d ie5
+ ding1 d ing1
+ ding2 d ing2
+ ding3 d ing3
+ ding4 d ing4
+ ding5 d ing5
+ diu1 d iou1
+ diu2 d iou2
+ diu3 d iou3
+ diu4 d iou4
+ diu5 d iou5
+ dong1 d ong1
+ dong2 d ong2
+ dong3 d ong3
+ dong4 d ong4
+ dong5 d ong5
+ dou1 d ou1
+ dou2 d ou2
+ dou3 d ou3
+ dou4 d ou4
+ dou5 d ou5
+ du1 d u1
+ du2 d u2
+ du3 d u3
+ du4 d u4
+ du5 d u5
+ duan1 d uan1
+ duan2 d uan2
+ duan3 d uan3
+ duan4 d uan4
+ duan5 d uan5
+ dui1 d uei1
+ dui2 d uei2
+ dui3 d uei3
+ dui4 d uei4
+ dui5 d uei5
+ dun1 d uen1
+ dun2 d uen2
+ dun3 d uen3
+ dun4 d uen4
+ dun5 d uen5
+ duo1 d uo1
+ duo2 d uo2
+ duo3 d uo3
+ duo4 d uo4
+ duo5 d uo5
+ e1 e1
+ e2 e2
+ e3 e3
+ e4 e4
+ e5 e5
+ ei1 ei1
+ ei2 ei2
+ ei3 ei3
+ ei4 ei4
+ ei5 ei5
+ en1 en1
+ en2 en2
+ en3 en3
+ en4 en4
+ en5 en5
+ eng1 eng1
+ eng2 eng2
+ eng3 eng3
+ eng4 eng4
+ eng5 eng5
+ r1 er1
+ r2 er2
+ r3 er3
+ r4 er4
+ r5 er5
+ er1 er1
+ er2 er2
+ er3 er3
+ er4 er4
+ er5 er5
+ fa1 f a1
+ fa2 f a2
+ fa3 f a3
+ fa4 f a4
+ fa5 f a5
+ fan1 f an1
+ fan2 f an2
+ fan3 f an3
+ fan4 f an4
+ fan5 f an5
+ fang1 f ang1
+ fang2 f ang2
+ fang3 f ang3
+ fang4 f ang4
+ fang5 f ang5
+ fei1 f ei1
+ fei2 f ei2
+ fei3 f ei3
+ fei4 f ei4
+ fei5 f ei5
+ fen1 f en1
+ fen2 f en2
+ fen3 f en3
+ fen4 f en4
+ fen5 f en5
+ feng1 f eng1
+ feng2 f eng2
+ feng3 f eng3
+ feng4 f eng4
+ feng5 f eng5
+ fo1 f o1
+ fo2 f o2
+ fo3 f o3
+ fo4 f o4
+ fo5 f o5
+ fou1 f ou1
+ fou2 f ou2
+ fou3 f ou3
+ fou4 f ou4
+ fou5 f ou5
+ fu1 f u1
+ fu2 f u2
+ fu3 f u3
+ fu4 f u4
+ fu5 f u5
+ ga1 g a1
+ ga2 g a2
+ ga3 g a3
+ ga4 g a4
+ ga5 g a5
+ gai1 g ai1
+ gai2 g ai2
+ gai3 g ai3
+ gai4 g ai4
+ gai5 g ai5
+ gan1 g an1
+ gan2 g an2
+ gan3 g an3
+ gan4 g an4
+ gan5 g an5
+ gang1 g ang1
+ gang2 g ang2
+ gang3 g ang3
+ gang4 g ang4
+ gang5 g ang5
+ gao1 g ao1
+ gao2 g ao2
+ gao3 g ao3
+ gao4 g ao4
+ gao5 g ao5
+ ge1 g e1
+ ge2 g e2
+ ge3 g e3
+ ge4 g e4
+ ge5 g e5
+ gei1 g ei1
+ gei2 g ei2
+ gei3 g ei3
+ gei4 g ei4
+ gei5 g ei5
+ gen1 g en1
+ gen2 g en2
+ gen3 g en3
+ gen4 g en4
+ gen5 g en5
+ geng1 g eng1
+ geng2 g eng2
+ geng3 g eng3
+ geng4 g eng4
+ geng5 g eng5
+ gong1 g ong1
+ gong2 g ong2
+ gong3 g ong3
+ gong4 g ong4
+ gong5 g ong5
+ gou1 g ou1
+ gou2 g ou2
+ gou3 g ou3
+ gou4 g ou4
+ gou5 g ou5
+ gu1 g u1
+ gu2 g u2
+ gu3 g u3
+ gu4 g u4
+ gu5 g u5
+ gua1 g ua1
+ gua2 g ua2
+ gua3 g ua3
+ gua4 g ua4
+ gua5 g ua5
+ guai1 g uai1
+ guai2 g uai2
+ guai3 g uai3
+ guai4 g uai4
+ guai5 g uai5
+ guan1 g uan1
+ guan2 g uan2
+ guan3 g uan3
+ guan4 g uan4
+ guan5 g uan5
+ guang1 g uang1
+ guang2 g uang2
+ guang3 g uang3
+ guang4 g uang4
+ guang5 g uang5
+ gui1 g uei1
+ gui2 g uei2
+ gui3 g uei3
+ gui4 g uei4
+ gui5 g uei5
+ gun1 g uen1
+ gun2 g uen2
+ gun3 g uen3
+ gun4 g uen4
+ gun5 g uen5
+ guo1 g uo1
+ guo2 g uo2
+ guo3 g uo3
+ guo4 g uo4
+ guo5 g uo5
+ ha1 h a1
+ ha2 h a2
+ ha3 h a3
+ ha4 h a4
+ ha5 h a5
+ hai1 h ai1
+ hai2 h ai2
+ hai3 h ai3
+ hai4 h ai4
+ hai5 h ai5
+ han1 h an1
+ han2 h an2
+ han3 h an3
+ han4 h an4
+ han5 h an5
+ hang1 h ang1
+ hang2 h ang2
+ hang3 h ang3
+ hang4 h ang4
+ hang5 h ang5
+ hao1 h ao1
+ hao2 h ao2
+ hao3 h ao3
+ hao4 h ao4
+ hao5 h ao5
+ he1 h e1
+ he2 h e2
+ he3 h e3
+ he4 h e4
+ he5 h e5
+ hei1 h ei1
+ hei2 h ei2
+ hei3 h ei3
+ hei4 h ei4
+ hei5 h ei5
+ hen1 h en1
+ hen2 h en2
+ hen3 h en3
+ hen4 h en4
+ hen5 h en5
+ heng1 h eng1
+ heng2 h eng2
+ heng3 h eng3
+ heng4 h eng4
+ heng5 h eng5
+ hong1 h ong1
+ hong2 h ong2
+ hong3 h ong3
+ hong4 h ong4
+ hong5 h ong5
+ hou1 h ou1
+ hou2 h ou2
+ hou3 h ou3
+ hou4 h ou4
+ hou5 h ou5
+ hu1 h u1
+ hu2 h u2
+ hu3 h u3
+ hu4 h u4
+ hu5 h u5
+ hua1 h ua1
+ hua2 h ua2
+ hua3 h ua3
+ hua4 h ua4
+ hua5 h ua5
+ huai1 h uai1
+ huai2 h uai2
+ huai3 h uai3
+ huai4 h uai4
+ huai5 h uai5
+ huan1 h uan1
+ huan2 h uan2
+ huan3 h uan3
+ huan4 h uan4
+ huan5 h uan5
+ huang1 h uang1
+ huang2 h uang2
+ huang3 h uang3
+ huang4 h uang4
+ huang5 h uang5
+ hui1 h uei1
+ hui2 h uei2
+ hui3 h uei3
+ hui4 h uei4
+ hui5 h uei5
+ hun1 h uen1
+ hun2 h uen2
+ hun3 h uen3
+ hun4 h uen4
+ hun5 h uen5
+ huo1 h uo1
+ huo2 h uo2
+ huo3 h uo3
+ huo4 h uo4
+ huo5 h uo5
+ ji1 j i1
+ ji2 j i2
+ ji3 j i3
+ ji4 j i4
+ ji5 j i5
+ jia1 j ia1
+ jia2 j ia2
+ jia3 j ia3
+ jia4 j ia4
+ jia5 j ia5
+ jian1 j ian1
+ jian2 j ian2
+ jian3 j ian3
+ jian4 j ian4
+ jian5 j ian5
+ jiang1 j iang1
+ jiang2 j iang2
+ jiang3 j iang3
+ jiang4 j iang4
+ jiang5 j iang5
+ jiao1 j iao1
+ jiao2 j iao2
+ jiao3 j iao3
+ jiao4 j iao4
+ jiao5 j iao5
+ jie1 j ie1
+ jie2 j ie2
+ jie3 j ie3
+ jie4 j ie4
+ jie5 j ie5
+ jin1 j in1
+ jin2 j in2
+ jin3 j in3
+ jin4 j in4
+ jin5 j in5
+ jing1 j ing1
+ jing2 j ing2
+ jing3 j ing3
+ jing4 j ing4
+ jing5 j ing5
+ jiong1 j iong1
+ jiong2 j iong2
+ jiong3 j iong3
+ jiong4 j iong4
+ jiong5 j iong5
+ jiu1 j iou1
+ jiu2 j iou2
+ jiu3 j iou3
+ jiu4 j iou4
+ jiu5 j iou5
+ ju1 j v1
+ ju2 j v2
+ ju3 j v3
+ ju4 j v4
+ ju5 j v5
+ juan1 j van1
+ juan2 j van2
+ juan3 j van3
+ juan4 j van4
+ juan5 j van5
+ jue1 j ve1
+ jue2 j ve2
+ jue3 j ve3
+ jue4 j ve4
+ jue5 j ve5
+ jun1 j vn1
+ jun2 j vn2
+ jun3 j vn3
+ jun4 j vn4
+ jun5 j vn5
+ ka1 k a1
+ ka2 k a2
+ ka3 k a3
+ ka4 k a4
+ ka5 k a5
+ kai1 k ai1
+ kai2 k ai2
+ kai3 k ai3
+ kai4 k ai4
+ kai5 k ai5
+ kan1 k an1
+ kan2 k an2
+ kan3 k an3
+ kan4 k an4
+ kan5 k an5
+ kang1 k ang1
+ kang2 k ang2
+ kang3 k ang3
+ kang4 k ang4
+ kang5 k ang5
+ kao1 k ao1
+ kao2 k ao2
+ kao3 k ao3
+ kao4 k ao4
+ kao5 k ao5
+ ke1 k e1
+ ke2 k e2
+ ke3 k e3
+ ke4 k e4
+ ke5 k e5
+ kei1 k ei1
+ kei2 k ei2
+ kei3 k ei3
+ kei4 k ei4
+ kei5 k ei5
+ ken1 k en1
+ ken2 k en2
+ ken3 k en3
+ ken4 k en4
+ ken5 k en5
+ keng1 k eng1
+ keng2 k eng2
+ keng3 k eng3
+ keng4 k eng4
+ keng5 k eng5
+ kong1 k ong1
+ kong2 k ong2
+ kong3 k ong3
+ kong4 k ong4
+ kong5 k ong5
+ kou1 k ou1
+ kou2 k ou2
+ kou3 k ou3
+ kou4 k ou4
+ kou5 k ou5
+ ku1 k u1
+ ku2 k u2
+ ku3 k u3
+ ku4 k u4
+ ku5 k u5
+ kua1 k ua1
+ kua2 k ua2
+ kua3 k ua3
+ kua4 k ua4
+ kua5 k ua5
+ kuai1 k uai1
+ kuai2 k uai2
+ kuai3 k uai3
+ kuai4 k uai4
+ kuai5 k uai5
+ kuan1 k uan1
+ kuan2 k uan2
+ kuan3 k uan3
+ kuan4 k uan4
+ kuan5 k uan5
+ kuang1 k uang1
+ kuang2 k uang2
+ kuang3 k uang3
+ kuang4 k uang4
+ kuang5 k uang5
+ kui1 k uei1
+ kui2 k uei2
+ kui3 k uei3
+ kui4 k uei4
+ kui5 k uei5
+ kun1 k uen1
+ kun2 k uen2
+ kun3 k uen3
+ kun4 k uen4
+ kun5 k uen5
+ kuo1 k uo1
+ kuo2 k uo2
+ kuo3 k uo3
+ kuo4 k uo4
+ kuo5 k uo5
+ la1 l a1
+ la2 l a2
+ la3 l a3
+ la4 l a4
+ la5 l a5
+ lai1 l ai1
+ lai2 l ai2
+ lai3 l ai3
+ lai4 l ai4
+ lai5 l ai5
+ lan1 l an1
+ lan2 l an2
+ lan3 l an3
+ lan4 l an4
+ lan5 l an5
+ lang1 l ang1
+ lang2 l ang2
+ lang3 l ang3
+ lang4 l ang4
+ lang5 l ang5
+ lao1 l ao1
+ lao2 l ao2
+ lao3 l ao3
+ lao4 l ao4
+ lao5 l ao5
+ le1 l e1
+ le2 l e2
+ le3 l e3
+ le4 l e4
+ le5 l e5
+ lei1 l ei1
+ lei2 l ei2
+ lei3 l ei3
+ lei4 l ei4
+ lei5 l ei5
+ leng1 l eng1
+ leng2 l eng2
+ leng3 l eng3
+ leng4 l eng4
+ leng5 l eng5
+ li1 l i1
+ li2 l i2
+ li3 l i3
+ li4 l i4
+ li5 l i5
+ lia1 l ia1
+ lia2 l ia2
+ lia3 l ia3
+ lia4 l ia4
+ lia5 l ia5
+ lian1 l ian1
+ lian2 l ian2
+ lian3 l ian3
+ lian4 l ian4
+ lian5 l ian5
+ liang1 l iang1
+ liang2 l iang2
+ liang3 l iang3
+ liang4 l iang4
+ liang5 l iang5
+ liao1 l iao1
+ liao2 l iao2
+ liao3 l iao3
+ liao4 l iao4
+ liao5 l iao5
+ lie1 l ie1
+ lie2 l ie2
+ lie3 l ie3
+ lie4 l ie4
+ lie5 l ie5
+ lin1 l in1
+ lin2 l in2
+ lin3 l in3
+ lin4 l in4
+ lin5 l in5
+ ling1 l ing1
+ ling2 l ing2
+ ling3 l ing3
+ ling4 l ing4
+ ling5 l ing5
+ liu1 l iou1
+ liu2 l iou2
+ liu3 l iou3
+ liu4 l iou4
+ liu5 l iou5
+ lo1 l o1
+ lo2 l o2
+ lo3 l o3
+ lo4 l o4
+ lo5 l o5
+ long1 l ong1
+ long2 l ong2
+ long3 l ong3
+ long4 l ong4
+ long5 l ong5
+ lou1 l ou1
+ lou2 l ou2
+ lou3 l ou3
+ lou4 l ou4
+ lou5 l ou5
+ lu1 l u1
+ lu2 l u2
+ lu3 l u3
+ lu4 l u4
+ lu5 l u5
+ luan1 l uan1
+ luan2 l uan2
+ luan3 l uan3
+ luan4 l uan4
+ luan5 l uan5
+ lue1 l ve1
+ lue2 l ve2
+ lue3 l ve3
+ lue4 l ve4
+ lue5 l ve5
+ lve1 l ve1
+ lve2 l ve2
+ lve3 l ve3
+ lve4 l ve4
+ lve5 l ve5
+ lun1 l uen1
+ lun2 l uen2
+ lun3 l uen3
+ lun4 l uen4
+ lun5 l uen5
+ luo1 l uo1
+ luo2 l uo2
+ luo3 l uo3
+ luo4 l uo4
+ luo5 l uo5
+ lv1 l v1
+ lv2 l v2
+ lv3 l v3
+ lv4 l v4
+ lv5 l v5
+ ma1 m a1
+ ma2 m a2
+ ma3 m a3
+ ma4 m a4
+ ma5 m a5
+ mai1 m ai1
+ mai2 m ai2
+ mai3 m ai3
+ mai4 m ai4
+ mai5 m ai5
+ man1 m an1
+ man2 m an2
+ man3 m an3
+ man4 m an4
+ man5 m an5
+ mang1 m ang1
+ mang2 m ang2
+ mang3 m ang3
+ mang4 m ang4
+ mang5 m ang5
+ mao1 m ao1
+ mao2 m ao2
+ mao3 m ao3
+ mao4 m ao4
+ mao5 m ao5
+ me1 m e1
+ me2 m e2
+ me3 m e3
+ me4 m e4
+ me5 m e5
+ mei1 m ei1
+ mei2 m ei2
+ mei3 m ei3
+ mei4 m ei4
+ mei5 m ei5
+ men1 m en1
+ men2 m en2
+ men3 m en3
+ men4 m en4
+ men5 m en5
+ meng1 m eng1
+ meng2 m eng2
+ meng3 m eng3
+ meng4 m eng4
+ meng5 m eng5
+ mi1 m i1
+ mi2 m i2
+ mi3 m i3
+ mi4 m i4
+ mi5 m i5
+ mian1 m ian1
+ mian2 m ian2
+ mian3 m ian3
+ mian4 m ian4
+ mian5 m ian5
+ miao1 m iao1
+ miao2 m iao2
+ miao3 m iao3
+ miao4 m iao4
+ miao5 m iao5
+ mie1 m ie1
+ mie2 m ie2
+ mie3 m ie3
+ mie4 m ie4
+ mie5 m ie5
+ min1 m in1
+ min2 m in2
+ min3 m in3
+ min4 m in4
+ min5 m in5
+ ming1 m ing1
+ ming2 m ing2
+ ming3 m ing3
+ ming4 m ing4
+ ming5 m ing5
+ miu1 m iou1
+ miu2 m iou2
+ miu3 m iou3
+ miu4 m iou4
+ miu5 m iou5
+ mo1 m o1
+ mo2 m o2
+ mo3 m o3
+ mo4 m o4
+ mo5 m o5
+ mou1 m ou1
+ mou2 m ou2
+ mou3 m ou3
+ mou4 m ou4
+ mou5 m ou5
+ mu1 m u1
+ mu2 m u2
+ mu3 m u3
+ mu4 m u4
+ mu5 m u5
+ na1 n a1
+ na2 n a2
+ na3 n a3
+ na4 n a4
+ na5 n a5
+ nai1 n ai1
+ nai2 n ai2
+ nai3 n ai3
+ nai4 n ai4
+ nai5 n ai5
+ nan1 n an1
+ nan2 n an2
+ nan3 n an3
+ nan4 n an4
+ nan5 n an5
+ nang1 n ang1
+ nang2 n ang2
+ nang3 n ang3
+ nang4 n ang4
+ nang5 n ang5
+ nao1 n ao1
+ nao2 n ao2
+ nao3 n ao3
+ nao4 n ao4
+ nao5 n ao5
+ ne1 n e1
+ ne2 n e2
+ ne3 n e3
+ ne4 n e4
+ ne5 n e5
+ nei1 n ei1
+ nei2 n ei2
+ nei3 n ei3
+ nei4 n ei4
+ nei5 n ei5
+ nen1 n en1
+ nen2 n en2
+ nen3 n en3
+ nen4 n en4
+ nen5 n en5
+ neng1 n eng1
+ neng2 n eng2
+ neng3 n eng3
+ neng4 n eng4
+ neng5 n eng5
+ ni1 n i1
+ ni2 n i2
+ ni3 n i3
+ ni4 n i4
+ ni5 n i5
+ nian1 n ian1
+ nian2 n ian2
+ nian3 n ian3
+ nian4 n ian4
+ nian5 n ian5
+ niang1 n iang1
+ niang2 n iang2
+ niang3 n iang3
+ niang4 n iang4
+ niang5 n iang5
+ niao1 n iao1
+ niao2 n iao2
+ niao3 n iao3
+ niao4 n iao4
+ niao5 n iao5
+ nie1 n ie1
+ nie2 n ie2
+ nie3 n ie3
+ nie4 n ie4
+ nie5 n ie5
+ nin1 n in1
+ nin2 n in2
+ nin3 n in3
+ nin4 n in4
+ nin5 n in5
+ ning1 n ing1
+ ning2 n ing2
+ ning3 n ing3
+ ning4 n ing4
+ ning5 n ing5
+ ning5 n ing5
1131
+ niu1 n iou1
1132
+ niu2 n iou2
1133
+ niu3 n iou3
1134
+ niu4 n iou4
1135
+ niu5 n iou5
1136
+ nong1 n ong1
1137
+ nong2 n ong2
1138
+ nong3 n ong3
1139
+ nong4 n ong4
1140
+ nong5 n ong5
1141
+ nou1 n ou1
1142
+ nou2 n ou2
1143
+ nou3 n ou3
1144
+ nou4 n ou4
1145
+ nou5 n ou5
1146
+ nu1 n u1
1147
+ nu2 n u2
1148
+ nu3 n u3
1149
+ nu4 n u4
1150
+ nu5 n u5
1151
+ nuan1 n uan1
1152
+ nuan2 n uan2
1153
+ nuan3 n uan3
1154
+ nuan4 n uan4
1155
+ nuan5 n uan5
1156
+ nue1 n ve1
1157
+ nue2 n ve2
1158
+ nue3 n ve3
1159
+ nue4 n ve4
1160
+ nue5 n ve5
1161
+ nve1 n ve1
1162
+ nve2 n ve2
1163
+ nve3 n ve3
1164
+ nve4 n ve4
1165
+ nve5 n ve5
1166
+ nuo1 n uo1
1167
+ nuo2 n uo2
1168
+ nuo3 n uo3
1169
+ nuo4 n uo4
1170
+ nuo5 n uo5
1171
+ nv1 n v1
1172
+ nv2 n v2
1173
+ nv3 n v3
1174
+ nv4 n v4
1175
+ nv5 n v5
1176
+ o1 o1
1177
+ o2 o2
1178
+ o3 o3
1179
+ o4 o4
1180
+ o5 o5
1181
+ ou1 ou1
1182
+ ou2 ou2
1183
+ ou3 ou3
1184
+ ou4 ou4
1185
+ ou5 ou5
1186
+ pa1 p a1
1187
+ pa2 p a2
1188
+ pa3 p a3
1189
+ pa4 p a4
1190
+ pa5 p a5
1191
+ pai1 p ai1
1192
+ pai2 p ai2
1193
+ pai3 p ai3
1194
+ pai4 p ai4
1195
+ pai5 p ai5
1196
+ pan1 p an1
1197
+ pan2 p an2
1198
+ pan3 p an3
1199
+ pan4 p an4
1200
+ pan5 p an5
1201
+ pang1 p ang1
1202
+ pang2 p ang2
1203
+ pang3 p ang3
1204
+ pang4 p ang4
1205
+ pang5 p ang5
1206
+ pao1 p ao1
1207
+ pao2 p ao2
1208
+ pao3 p ao3
1209
+ pao4 p ao4
1210
+ pao5 p ao5
1211
+ pei1 p ei1
1212
+ pei2 p ei2
1213
+ pei3 p ei3
1214
+ pei4 p ei4
1215
+ pei5 p ei5
1216
+ pen1 p en1
1217
+ pen2 p en2
1218
+ pen3 p en3
1219
+ pen4 p en4
1220
+ pen5 p en5
1221
+ peng1 p eng1
1222
+ peng2 p eng2
1223
+ peng3 p eng3
1224
+ peng4 p eng4
1225
+ peng5 p eng5
1226
+ pi1 p i1
1227
+ pi2 p i2
1228
+ pi3 p i3
1229
+ pi4 p i4
1230
+ pi5 p i5
1231
+ pian1 p ian1
1232
+ pian2 p ian2
1233
+ pian3 p ian3
1234
+ pian4 p ian4
1235
+ pian5 p ian5
1236
+ piao1 p iao1
1237
+ piao2 p iao2
1238
+ piao3 p iao3
1239
+ piao4 p iao4
1240
+ piao5 p iao5
1241
+ pie1 p ie1
1242
+ pie2 p ie2
1243
+ pie3 p ie3
1244
+ pie4 p ie4
1245
+ pie5 p ie5
1246
+ pin1 p in1
1247
+ pin2 p in2
1248
+ pin3 p in3
1249
+ pin4 p in4
1250
+ pin5 p in5
1251
+ ping1 p ing1
1252
+ ping2 p ing2
1253
+ ping3 p ing3
1254
+ ping4 p ing4
1255
+ ping5 p ing5
1256
+ po1 p o1
1257
+ po2 p o2
1258
+ po3 p o3
1259
+ po4 p o4
1260
+ po5 p o5
1261
+ pou1 p ou1
1262
+ pou2 p ou2
1263
+ pou3 p ou3
1264
+ pou4 p ou4
1265
+ pou5 p ou5
1266
+ pu1 p u1
1267
+ pu2 p u2
1268
+ pu3 p u3
1269
+ pu4 p u4
1270
+ pu5 p u5
1271
+ qi1 q i1
1272
+ qi2 q i2
1273
+ qi3 q i3
1274
+ qi4 q i4
1275
+ qi5 q i5
1276
+ qia1 q ia1
1277
+ qia2 q ia2
1278
+ qia3 q ia3
1279
+ qia4 q ia4
1280
+ qia5 q ia5
1281
+ qian1 q ian1
1282
+ qian2 q ian2
1283
+ qian3 q ian3
1284
+ qian4 q ian4
1285
+ qian5 q ian5
1286
+ qiang1 q iang1
1287
+ qiang2 q iang2
1288
+ qiang3 q iang3
1289
+ qiang4 q iang4
1290
+ qiang5 q iang5
1291
+ qiao1 q iao1
1292
+ qiao2 q iao2
1293
+ qiao3 q iao3
1294
+ qiao4 q iao4
1295
+ qiao5 q iao5
1296
+ qie1 q ie1
1297
+ qie2 q ie2
1298
+ qie3 q ie3
1299
+ qie4 q ie4
1300
+ qie5 q ie5
1301
+ qin1 q in1
1302
+ qin2 q in2
1303
+ qin3 q in3
1304
+ qin4 q in4
1305
+ qin5 q in5
1306
+ qing1 q ing1
1307
+ qing2 q ing2
1308
+ qing3 q ing3
1309
+ qing4 q ing4
1310
+ qing5 q ing5
1311
+ qiong1 q iong1
1312
+ qiong2 q iong2
1313
+ qiong3 q iong3
1314
+ qiong4 q iong4
1315
+ qiong5 q iong5
1316
+ qiu1 q iou1
1317
+ qiu2 q iou2
1318
+ qiu3 q iou3
1319
+ qiu4 q iou4
1320
+ qiu5 q iou5
1321
+ qu1 q v1
1322
+ qu2 q v2
1323
+ qu3 q v3
1324
+ qu4 q v4
1325
+ qu5 q v5
1326
+ quan1 q van1
1327
+ quan2 q van2
1328
+ quan3 q van3
1329
+ quan4 q van4
1330
+ quan5 q van5
1331
+ que1 q ve1
1332
+ que2 q ve2
1333
+ que3 q ve3
1334
+ que4 q ve4
1335
+ que5 q ve5
1336
+ qun1 q vn1
1337
+ qun2 q vn2
1338
+ qun3 q vn3
1339
+ qun4 q vn4
1340
+ qun5 q vn5
1341
+ ran1 r an1
1342
+ ran2 r an2
1343
+ ran3 r an3
1344
+ ran4 r an4
1345
+ ran5 r an5
1346
+ rang1 r ang1
1347
+ rang2 r ang2
1348
+ rang3 r ang3
1349
+ rang4 r ang4
1350
+ rang5 r ang5
1351
+ rao1 r ao1
1352
+ rao2 r ao2
1353
+ rao3 r ao3
1354
+ rao4 r ao4
1355
+ rao5 r ao5
1356
+ re1 r e1
1357
+ re2 r e2
1358
+ re3 r e3
1359
+ re4 r e4
1360
+ re5 r e5
1361
+ ren1 r en1
1362
+ ren2 r en2
1363
+ ren3 r en3
1364
+ ren4 r en4
1365
+ ren5 r en5
1366
+ reng1 r eng1
1367
+ reng2 r eng2
1368
+ reng3 r eng3
1369
+ reng4 r eng4
1370
+ reng5 r eng5
1371
+ ri1 r iii1
1372
+ ri2 r iii2
1373
+ ri3 r iii3
1374
+ ri4 r iii4
1375
+ ri5 r iii5
1376
+ rong1 r ong1
1377
+ rong2 r ong2
1378
+ rong3 r ong3
1379
+ rong4 r ong4
1380
+ rong5 r ong5
1381
+ rou1 r ou1
1382
+ rou2 r ou2
1383
+ rou3 r ou3
1384
+ rou4 r ou4
1385
+ rou5 r ou5
1386
+ ru1 r u1
1387
+ ru2 r u2
1388
+ ru3 r u3
1389
+ ru4 r u4
1390
+ ru5 r u5
1391
+ rua1 r ua1
1392
+ rua2 r ua2
1393
+ rua3 r ua3
1394
+ rua4 r ua4
1395
+ rua5 r ua5
1396
+ ruan1 r uan1
1397
+ ruan2 r uan2
1398
+ ruan3 r uan3
1399
+ ruan4 r uan4
1400
+ ruan5 r uan5
1401
+ rui1 r uei1
1402
+ rui2 r uei2
1403
+ rui3 r uei3
1404
+ rui4 r uei4
1405
+ rui5 r uei5
1406
+ run1 r uen1
1407
+ run2 r uen2
1408
+ run3 r uen3
1409
+ run4 r uen4
1410
+ run5 r uen5
1411
+ ruo1 r uo1
1412
+ ruo2 r uo2
1413
+ ruo3 r uo3
1414
+ ruo4 r uo4
1415
+ ruo5 r uo5
1416
+ sa1 s a1
1417
+ sa2 s a2
1418
+ sa3 s a3
1419
+ sa4 s a4
1420
+ sa5 s a5
1421
+ sai1 s ai1
1422
+ sai2 s ai2
1423
+ sai3 s ai3
1424
+ sai4 s ai4
1425
+ sai5 s ai5
1426
+ san1 s an1
1427
+ san2 s an2
1428
+ san3 s an3
1429
+ san4 s an4
1430
+ san5 s an5
1431
+ sang1 s ang1
1432
+ sang2 s ang2
1433
+ sang3 s ang3
1434
+ sang4 s ang4
1435
+ sang5 s ang5
1436
+ sao1 s ao1
1437
+ sao2 s ao2
1438
+ sao3 s ao3
1439
+ sao4 s ao4
1440
+ sao5 s ao5
1441
+ se1 s e1
1442
+ se2 s e2
1443
+ se3 s e3
1444
+ se4 s e4
1445
+ se5 s e5
1446
+ sen1 s en1
1447
+ sen2 s en2
1448
+ sen3 s en3
1449
+ sen4 s en4
1450
+ sen5 s en5
1451
+ seng1 s eng1
1452
+ seng2 s eng2
1453
+ seng3 s eng3
1454
+ seng4 s eng4
1455
+ seng5 s eng5
1456
+ sha1 sh a1
1457
+ sha2 sh a2
1458
+ sha3 sh a3
1459
+ sha4 sh a4
1460
+ sha5 sh a5
1461
+ shai1 sh ai1
1462
+ shai2 sh ai2
1463
+ shai3 sh ai3
1464
+ shai4 sh ai4
1465
+ shai5 sh ai5
1466
+ shan1 sh an1
1467
+ shan2 sh an2
1468
+ shan3 sh an3
1469
+ shan4 sh an4
1470
+ shan5 sh an5
1471
+ shang1 sh ang1
1472
+ shang2 sh ang2
1473
+ shang3 sh ang3
1474
+ shang4 sh ang4
1475
+ shang5 sh ang5
1476
+ shao1 sh ao1
1477
+ shao2 sh ao2
1478
+ shao3 sh ao3
1479
+ shao4 sh ao4
1480
+ shao5 sh ao5
1481
+ she1 sh e1
1482
+ she2 sh e2
1483
+ she3 sh e3
1484
+ she4 sh e4
1485
+ she5 sh e5
1486
+ shei1 sh ei1
1487
+ shei2 sh ei2
1488
+ shei3 sh ei3
1489
+ shei4 sh ei4
1490
+ shei5 sh ei5
1491
+ shen1 sh en1
1492
+ shen2 sh en2
1493
+ shen3 sh en3
1494
+ shen4 sh en4
1495
+ shen5 sh en5
1496
+ sheng1 sh eng1
1497
+ sheng2 sh eng2
1498
+ sheng3 sh eng3
1499
+ sheng4 sh eng4
1500
+ sheng5 sh eng5
1501
+ shi1 sh iii1
1502
+ shi2 sh iii2
1503
+ shi3 sh iii3
1504
+ shi4 sh iii4
1505
+ shi5 sh iii5
1506
+ shou1 sh ou1
1507
+ shou2 sh ou2
1508
+ shou3 sh ou3
1509
+ shou4 sh ou4
1510
+ shou5 sh ou5
1511
+ shu1 sh u1
1512
+ shu2 sh u2
1513
+ shu3 sh u3
1514
+ shu4 sh u4
1515
+ shu5 sh u5
1516
+ shua1 sh ua1
1517
+ shua2 sh ua2
1518
+ shua3 sh ua3
1519
+ shua4 sh ua4
1520
+ shua5 sh ua5
1521
+ shuai1 sh uai1
1522
+ shuai2 sh uai2
1523
+ shuai3 sh uai3
1524
+ shuai4 sh uai4
1525
+ shuai5 sh uai5
1526
+ shuan1 sh uan1
1527
+ shuan2 sh uan2
1528
+ shuan3 sh uan3
1529
+ shuan4 sh uan4
1530
+ shuan5 sh uan5
1531
+ shuang1 sh uang1
1532
+ shuang2 sh uang2
1533
+ shuang3 sh uang3
1534
+ shuang4 sh uang4
1535
+ shuang5 sh uang5
1536
+ shui1 sh uei1
1537
+ shui2 sh uei2
1538
+ shui3 sh uei3
1539
+ shui4 sh uei4
1540
+ shui5 sh uei5
1541
+ shun1 sh uen1
1542
+ shun2 sh uen2
1543
+ shun3 sh uen3
1544
+ shun4 sh uen4
1545
+ shun5 sh uen5
1546
+ shuo1 sh uo1
1547
+ shuo2 sh uo2
1548
+ shuo3 sh uo3
1549
+ shuo4 sh uo4
1550
+ shuo5 sh uo5
1551
+ si1 s ii1
1552
+ si2 s ii2
1553
+ si3 s ii3
1554
+ si4 s ii4
1555
+ si5 s ii5
1556
+ song1 s ong1
1557
+ song2 s ong2
1558
+ song3 s ong3
1559
+ song4 s ong4
1560
+ song5 s ong5
1561
+ sou1 s ou1
1562
+ sou2 s ou2
1563
+ sou3 s ou3
1564
+ sou4 s ou4
1565
+ sou5 s ou5
1566
+ su1 s u1
1567
+ su2 s u2
1568
+ su3 s u3
1569
+ su4 s u4
1570
+ su5 s u5
1571
+ suan1 s uan1
1572
+ suan2 s uan2
1573
+ suan3 s uan3
1574
+ suan4 s uan4
1575
+ suan5 s uan5
1576
+ sui1 s uei1
1577
+ sui2 s uei2
1578
+ sui3 s uei3
1579
+ sui4 s uei4
1580
+ sui5 s uei5
1581
+ sun1 s uen1
1582
+ sun2 s uen2
1583
+ sun3 s uen3
1584
+ sun4 s uen4
1585
+ sun5 s uen5
1586
+ suo1 s uo1
1587
+ suo2 s uo2
1588
+ suo3 s uo3
1589
+ suo4 s uo4
1590
+ suo5 s uo5
1591
+ ta1 t a1
1592
+ ta2 t a2
1593
+ ta3 t a3
1594
+ ta4 t a4
1595
+ ta5 t a5
1596
+ tai1 t ai1
1597
+ tai2 t ai2
1598
+ tai3 t ai3
1599
+ tai4 t ai4
1600
+ tai5 t ai5
1601
+ tan1 t an1
1602
+ tan2 t an2
1603
+ tan3 t an3
1604
+ tan4 t an4
1605
+ tan5 t an5
1606
+ tang1 t ang1
1607
+ tang2 t ang2
1608
+ tang3 t ang3
1609
+ tang4 t ang4
1610
+ tang5 t ang5
1611
+ tao1 t ao1
1612
+ tao2 t ao2
1613
+ tao3 t ao3
1614
+ tao4 t ao4
1615
+ tao5 t ao5
1616
+ te1 t e1
1617
+ te2 t e2
1618
+ te3 t e3
1619
+ te4 t e4
1620
+ te5 t e5
1621
+ tei1 t ei1
1622
+ tei2 t ei2
1623
+ tei3 t ei3
1624
+ tei4 t ei4
1625
+ tei5 t ei5
1626
+ teng1 t eng1
1627
+ teng2 t eng2
1628
+ teng3 t eng3
1629
+ teng4 t eng4
1630
+ teng5 t eng5
1631
+ ti1 t i1
1632
+ ti2 t i2
1633
+ ti3 t i3
1634
+ ti4 t i4
1635
+ ti5 t i5
1636
+ tian1 t ian1
1637
+ tian2 t ian2
1638
+ tian3 t ian3
1639
+ tian4 t ian4
1640
+ tian5 t ian5
1641
+ tiao1 t iao1
1642
+ tiao2 t iao2
1643
+ tiao3 t iao3
1644
+ tiao4 t iao4
1645
+ tiao5 t iao5
1646
+ tie1 t ie1
1647
+ tie2 t ie2
1648
+ tie3 t ie3
1649
+ tie4 t ie4
1650
+ tie5 t ie5
1651
+ ting1 t ing1
1652
+ ting2 t ing2
1653
+ ting3 t ing3
1654
+ ting4 t ing4
1655
+ ting5 t ing5
1656
+ tong1 t ong1
1657
+ tong2 t ong2
1658
+ tong3 t ong3
1659
+ tong4 t ong4
1660
+ tong5 t ong5
1661
+ tou1 t ou1
1662
+ tou2 t ou2
1663
+ tou3 t ou3
1664
+ tou4 t ou4
1665
+ tou5 t ou5
1666
+ tu1 t u1
1667
+ tu2 t u2
1668
+ tu3 t u3
1669
+ tu4 t u4
1670
+ tu5 t u5
1671
+ tuan1 t uan1
1672
+ tuan2 t uan2
1673
+ tuan3 t uan3
1674
+ tuan4 t uan4
1675
+ tuan5 t uan5
1676
+ tui1 t uei1
1677
+ tui2 t uei2
1678
+ tui3 t uei3
1679
+ tui4 t uei4
1680
+ tui5 t uei5
1681
+ tun1 t uen1
1682
+ tun2 t uen2
1683
+ tun3 t uen3
1684
+ tun4 t uen4
1685
+ tun5 t uen5
1686
+ tuo1 t uo1
1687
+ tuo2 t uo2
1688
+ tuo3 t uo3
1689
+ tuo4 t uo4
1690
+ tuo5 t uo5
1691
+ wa1 w ua1
1692
+ wa2 w ua2
1693
+ wa3 w ua3
1694
+ wa4 w ua4
1695
+ wa5 w ua5
1696
+ wai1 w uai1
1697
+ wai2 w uai2
1698
+ wai3 w uai3
1699
+ wai4 w uai4
1700
+ wai5 w uai5
1701
+ wan1 w uan1
1702
+ wan2 w uan2
1703
+ wan3 w uan3
1704
+ wan4 w uan4
1705
+ wan5 w uan5
1706
+ wang1 w uang1
1707
+ wang2 w uang2
1708
+ wang3 w uang3
1709
+ wang4 w uang4
1710
+ wang5 w uang5
1711
+ wei1 w uei1
1712
+ wei2 w uei2
1713
+ wei3 w uei3
1714
+ wei4 w uei4
1715
+ wei5 w uei5
1716
+ wen1 w uen1
1717
+ wen2 w uen2
1718
+ wen3 w uen3
1719
+ wen4 w uen4
1720
+ wen5 w uen5
1721
+ weng1 w uen1
1722
+ weng2 w uen2
1723
+ weng3 w uen3
1724
+ weng4 w uen4
1725
+ weng5 w uen5
1726
+ wo1 w uo1
1727
+ wo2 w uo2
1728
+ wo3 w uo3
1729
+ wo4 w uo4
1730
+ wo5 w uo5
1731
+ wu1 w u1
1732
+ wu2 w u2
1733
+ wu3 w u3
1734
+ wu4 w u4
1735
+ wu5 w u5
1736
+ xi1 x i1
1737
+ xi2 x i2
1738
+ xi3 x i3
1739
+ xi4 x i4
1740
+ xi5 x i5
1741
+ xia1 x ia1
1742
+ xia2 x ia2
1743
+ xia3 x ia3
1744
+ xia4 x ia4
1745
+ xia5 x ia5
1746
+ xian1 x ian1
1747
+ xian2 x ian2
1748
+ xian3 x ian3
1749
+ xian4 x ian4
1750
+ xian5 x ian5
1751
+ xiang1 x iang1
1752
+ xiang2 x iang2
1753
+ xiang3 x iang3
1754
+ xiang4 x iang4
1755
+ xiang5 x iang5
1756
+ xiao1 x iao1
1757
+ xiao2 x iao2
1758
+ xiao3 x iao3
1759
+ xiao4 x iao4
1760
+ xiao5 x iao5
1761
+ xie1 x ie1
1762
+ xie2 x ie2
1763
+ xie3 x ie3
1764
+ xie4 x ie4
1765
+ xie5 x ie5
1766
+ xin1 x in1
1767
+ xin2 x in2
1768
+ xin3 x in3
1769
+ xin4 x in4
1770
+ xin5 x in5
1771
+ xing1 x ing1
1772
+ xing2 x ing2
1773
+ xing3 x ing3
1774
+ xing4 x ing4
1775
+ xing5 x ing5
1776
+ xiong1 x iong1
1777
+ xiong2 x iong2
1778
+ xiong3 x iong3
1779
+ xiong4 x iong4
1780
+ xiong5 x iong5
1781
+ xiu1 x iou1
1782
+ xiu2 x iou2
1783
+ xiu3 x iou3
1784
+ xiu4 x iou4
1785
+ xiu5 x iou5
1786
+ xu1 x v1
1787
+ xu2 x v2
1788
+ xu3 x v3
1789
+ xu4 x v4
1790
+ xu5 x v5
1791
+ xuan1 x van1
1792
+ xuan2 x van2
1793
+ xuan3 x van3
1794
+ xuan4 x van4
1795
+ xuan5 x van5
1796
+ xue1 x ve1
1797
+ xue2 x ve2
1798
+ xue3 x ve3
1799
+ xue4 x ve4
1800
+ xue5 x ve5
1801
+ xun1 x vn1
1802
+ xun2 x vn2
1803
+ xun3 x vn3
1804
+ xun4 x vn4
1805
+ xun5 x vn5
1806
+ ya1 y ia1
1807
+ ya2 y ia2
1808
+ ya3 y ia3
1809
+ ya4 y ia4
1810
+ ya5 y ia5
1811
+ yan1 y ian1
1812
+ yan2 y ian2
1813
+ yan3 y ian3
1814
+ yan4 y ian4
1815
+ yan5 y ian5
1816
+ yang1 y iang1
1817
+ yang2 y iang2
1818
+ yang3 y iang3
1819
+ yang4 y iang4
1820
+ yang5 y iang5
1821
+ yao1 y iao1
1822
+ yao2 y iao2
1823
+ yao3 y iao3
1824
+ yao4 y iao4
1825
+ yao5 y iao5
1826
+ ye1 y ie1
1827
+ ye2 y ie2
1828
+ ye3 y ie3
1829
+ ye4 y ie4
1830
+ ye5 y ie5
1831
+ yi1 y i1
1832
+ yi2 y i2
1833
+ yi3 y i3
1834
+ yi4 y i4
1835
+ yi5 y i5
1836
+ yin1 y in1
1837
+ yin2 y in2
1838
+ yin3 y in3
1839
+ yin4 y in4
1840
+ yin5 y in5
1841
+ ying1 y ing1
1842
+ ying2 y ing2
1843
+ ying3 y ing3
1844
+ ying4 y ing4
1845
+ ying5 y ing5
1846
+ yo1 y iou1
1847
+ yo2 y iou2
1848
+ yo3 y iou3
1849
+ yo4 y iou4
1850
+ yo5 y iou5
1851
+ yong1 y iong1
1852
+ yong2 y iong2
1853
+ yong3 y iong3
1854
+ yong4 y iong4
1855
+ yong5 y iong5
1856
+ you1 y iou1
1857
+ you2 y iou2
1858
+ you3 y iou3
1859
+ you4 y iou4
1860
+ you5 y iou5
1861
+ yu1 y v1
1862
+ yu2 y v2
1863
+ yu3 y v3
1864
+ yu4 y v4
1865
+ yu5 y v5
1866
+ yuan1 y van1
1867
+ yuan2 y van2
1868
+ yuan3 y van3
1869
+ yuan4 y van4
1870
+ yuan5 y van5
1871
+ yue1 y ve1
1872
+ yue2 y ve2
1873
+ yue3 y ve3
1874
+ yue4 y ve4
1875
+ yue5 y ve5
1876
+ yun1 y vn1
1877
+ yun2 y vn2
1878
+ yun3 y vn3
1879
+ yun4 y vn4
1880
+ yun5 y vn5
1881
+ za1 z a1
1882
+ za2 z a2
1883
+ za3 z a3
1884
+ za4 z a4
1885
+ za5 z a5
1886
+ zai1 z ai1
1887
+ zai2 z ai2
1888
+ zai3 z ai3
1889
+ zai4 z ai4
1890
+ zai5 z ai5
1891
+ zan1 z an1
1892
+ zan2 z an2
1893
+ zan3 z an3
1894
+ zan4 z an4
1895
+ zan5 z an5
1896
+ zang1 z ang1
1897
+ zang2 z ang2
1898
+ zang3 z ang3
1899
+ zang4 z ang4
1900
+ zang5 z ang5
1901
+ zao1 z ao1
1902
+ zao2 z ao2
1903
+ zao3 z ao3
1904
+ zao4 z ao4
1905
+ zao5 z ao5
1906
+ ze1 z e1
1907
+ ze2 z e2
1908
+ ze3 z e3
1909
+ ze4 z e4
1910
+ ze5 z e5
1911
+ zei1 z ei1
1912
+ zei2 z ei2
1913
+ zei3 z ei3
1914
+ zei4 z ei4
1915
+ zei5 z ei5
1916
+ zen1 z en1
1917
+ zen2 z en2
1918
+ zen3 z en3
1919
+ zen4 z en4
1920
+ zen5 z en5
1921
+ zeng1 z eng1
1922
+ zeng2 z eng2
1923
+ zeng3 z eng3
1924
+ zeng4 z eng4
1925
+ zeng5 z eng5
1926
+ zha1 zh a1
1927
+ zha2 zh a2
1928
+ zha3 zh a3
1929
+ zha4 zh a4
1930
+ zha5 zh a5
1931
+ zhai1 zh ai1
1932
+ zhai2 zh ai2
1933
+ zhai3 zh ai3
1934
+ zhai4 zh ai4
1935
+ zhai5 zh ai5
1936
+ zhan1 zh an1
1937
+ zhan2 zh an2
1938
+ zhan3 zh an3
1939
+ zhan4 zh an4
1940
+ zhan5 zh an5
1941
+ zhang1 zh ang1
1942
+ zhang2 zh ang2
1943
+ zhang3 zh ang3
1944
+ zhang4 zh ang4
1945
+ zhang5 zh ang5
1946
+ zhao1 zh ao1
1947
+ zhao2 zh ao2
1948
+ zhao3 zh ao3
1949
+ zhao4 zh ao4
1950
+ zhao5 zh ao5
1951
+ zhe1 zh e1
1952
+ zhe2 zh e2
1953
+ zhe3 zh e3
1954
+ zhe4 zh e4
1955
+ zhe5 zh e5
1956
+ zhei1 zh ei1
1957
+ zhei2 zh ei2
1958
+ zhei3 zh ei3
1959
+ zhei4 zh ei4
1960
+ zhei5 zh ei5
1961
+ zhen1 zh en1
1962
+ zhen2 zh en2
1963
+ zhen3 zh en3
1964
+ zhen4 zh en4
1965
+ zhen5 zh en5
1966
+ zheng1 zh eng1
1967
+ zheng2 zh eng2
1968
+ zheng3 zh eng3
1969
+ zheng4 zh eng4
1970
+ zheng5 zh eng5
1971
+ zhi1 zh iii1
1972
+ zhi2 zh iii2
1973
+ zhi3 zh iii3
1974
+ zhi4 zh iii4
1975
+ zhi5 zh iii5
1976
+ zhong1 zh ong1
1977
+ zhong2 zh ong2
1978
+ zhong3 zh ong3
1979
+ zhong4 zh ong4
1980
+ zhong5 zh ong5
1981
+ zhou1 zh ou1
1982
+ zhou2 zh ou2
1983
+ zhou3 zh ou3
1984
+ zhou4 zh ou4
1985
+ zhou5 zh ou5
1986
+ zhu1 zh u1
1987
+ zhu2 zh u2
1988
+ zhu3 zh u3
1989
+ zhu4 zh u4
1990
+ zhu5 zh u5
1991
+ zhua1 zh ua1
1992
+ zhua2 zh ua2
1993
+ zhua3 zh ua3
1994
+ zhua4 zh ua4
1995
+ zhua5 zh ua5
1996
+ zhuai1 zh uai1
1997
+ zhuai2 zh uai2
1998
+ zhuai3 zh uai3
1999
+ zhuai4 zh uai4
2000
+ zhuai5 zh uai5
2001
+ zhuan1 zh uan1
2002
+ zhuan2 zh uan2
2003
+ zhuan3 zh uan3
2004
+ zhuan4 zh uan4
2005
+ zhuan5 zh uan5
2006
+ zhuang1 zh uang1
2007
+ zhuang2 zh uang2
2008
+ zhuang3 zh uang3
2009
+ zhuang4 zh uang4
2010
+ zhuang5 zh uang5
2011
+ zhui1 zh uei1
2012
+ zhui2 zh uei2
2013
+ zhui3 zh uei3
2014
+ zhui4 zh uei4
2015
+ zhui5 zh uei5
2016
+ zhun1 zh uen1
2017
+ zhun2 zh uen2
2018
+ zhun3 zh uen3
2019
+ zhun4 zh uen4
2020
+ zhun5 zh uen5
2021
+ zhuo1 zh uo1
2022
+ zhuo2 zh uo2
2023
+ zhuo3 zh uo3
2024
+ zhuo4 zh uo4
2025
+ zhuo5 zh uo5
2026
+ zi1 z ii1
2027
+ zi2 z ii2
2028
+ zi3 z ii3
2029
+ zi4 z ii4
2030
+ zi5 z ii5
2031
+ zong1 z ong1
2032
+ zong2 z ong2
2033
+ zong3 z ong3
2034
+ zong4 z ong4
2035
+ zong5 z ong5
2036
+ zou1 z ou1
2037
+ zou2 z ou2
2038
+ zou3 z ou3
2039
+ zou4 z ou4
2040
+ zou5 z ou5
2041
+ zu1 z u1
2042
+ zu2 z u2
2043
+ zu3 z u3
2044
+ zu4 z u4
2045
+ zu5 z u5
2046
+ zuan1 z uan1
2047
+ zuan2 z uan2
2048
+ zuan3 z uan3
2049
+ zuan4 z uan4
2050
+ zuan5 z uan5
2051
+ zui1 z uei1
2052
+ zui2 z uei2
2053
+ zui3 z uei3
2054
+ zui4 z uei4
2055
+ zui5 z uei5
2056
+ zun1 z uen1
2057
+ zun2 z uen2
2058
+ zun3 z uen3
2059
+ zun4 z uen4
2060
+ zun5 z uen5
2061
+ zuo1 z uo1
2062
+ zuo2 z uo2
2063
+ zuo3 z uo3
2064
+ zuo4 z uo4
2065
+ zuo5 z uo5
2066
+ ar1 a1 rr
2067
+ ar2 a2 rr
2068
+ ar3 a3 rr
2069
+ ar4 a4 rr
2070
+ ar5 a5 rr
2071
+ air1 ai1 rr
2072
+ air2 ai2 rr
2073
+ air3 ai3 rr
2074
+ air4 ai4 rr
2075
+ air5 ai5 rr
2076
+ anr1 an1 rr
2077
+ anr2 an2 rr
2078
+ anr3 an3 rr
2079
+ anr4 an4 rr
2080
+ anr5 an5 rr
2081
+ angr1 ang1 rr
2082
+ angr2 ang2 rr
2083
+ angr3 ang3 rr
2084
+ angr4 ang4 rr
2085
+ angr5 ang5 rr
2086
+ aor1 ao1 rr
2087
+ aor2 ao2 rr
2088
+ aor3 ao3 rr
2089
+ aor4 ao4 rr
2090
+ aor5 ao5 rr
2091
+ bar1 b a1 rr
2092
+ bar2 b a2 rr
2093
+ bar3 b a3 rr
2094
+ bar4 b a4 rr
2095
+ bar5 b a5 rr
2096
+ bair1 b ai1 rr
2097
+ bair2 b ai2 rr
2098
+ bair3 b ai3 rr
2099
+ bair4 b ai4 rr
2100
+ bair5 b ai5 rr
2101
+ banr1 b an1 rr
2102
+ banr2 b an2 rr
2103
+ banr3 b an3 rr
2104
+ banr4 b an4 rr
2105
+ banr5 b an5 rr
2106
+ bangr1 b ang1 rr
2107
+ bangr2 b ang2 rr
2108
+ bangr3 b ang3 rr
2109
+ bangr4 b ang4 rr
2110
+ bangr5 b ang5 rr
2111
+ baor1 b ao1 rr
2112
+ baor2 b ao2 rr
2113
+ baor3 b ao3 rr
2114
+ baor4 b ao4 rr
2115
+ baor5 b ao5 rr
2116
+ beir1 b ei1 rr
2117
+ beir2 b ei2 rr
2118
+ beir3 b ei3 rr
2119
+ beir4 b ei4 rr
2120
+ beir5 b ei5 rr
2121
+ benr1 b en1 rr
2122
+ benr2 b en2 rr
2123
+ benr3 b en3 rr
2124
+ benr4 b en4 rr
2125
+ benr5 b en5 rr
2126
+ bengr1 b eng1 rr
2127
+ bengr2 b eng2 rr
2128
+ bengr3 b eng3 rr
2129
+ bengr4 b eng4 rr
2130
+ bengr5 b eng5 rr
2131
+ bir1 b i1 rr
2132
+ bir2 b i2 rr
2133
+ bir3 b i3 rr
2134
+ bir4 b i4 rr
2135
+ bir5 b i5 rr
2136
+ bianr1 b ian1 rr
2137
+ bianr2 b ian2 rr
2138
+ bianr3 b ian3 rr
2139
+ bianr4 b ian4 rr
2140
+ bianr5 b ian5 rr
2141
+ biaor1 b iao1 rr
2142
+ biaor2 b iao2 rr
2143
+ biaor3 b iao3 rr
2144
+ biaor4 b iao4 rr
2145
+ biaor5 b iao5 rr
2146
+ bier1 b ie1 rr
2147
+ bier2 b ie2 rr
2148
+ bier3 b ie3 rr
2149
+ bier4 b ie4 rr
2150
+ bier5 b ie5 rr
2151
+ binr1 b in1 rr
2152
+ binr2 b in2 rr
2153
+ binr3 b in3 rr
2154
+ binr4 b in4 rr
2155
+ binr5 b in5 rr
2156
+ bingr1 b ing1 rr
2157
+ bingr2 b ing2 rr
2158
+ bingr3 b ing3 rr
2159
+ bingr4 b ing4 rr
2160
+ bingr5 b ing5 rr
2161
+ bor1 b o1 rr
2162
+ bor2 b o2 rr
2163
+ bor3 b o3 rr
2164
+ bor4 b o4 rr
2165
+ bor5 b o5 rr
2166
+ bur1 b u1 rr
2167
+ bur2 b u2 rr
2168
+ bur3 b u3 rr
2169
+ bur4 b u4 rr
2170
+ bur5 b u5 rr
2171
+ car1 c a1 rr
2172
+ car2 c a2 rr
2173
+ car3 c a3 rr
2174
+ car4 c a4 rr
2175
+ car5 c a5 rr
2176
+ cair1 c ai1 rr
2177
+ cair2 c ai2 rr
2178
+ cair3 c ai3 rr
2179
+ cair4 c ai4 rr
2180
+ cair5 c ai5 rr
2181
+ canr1 c an1 rr
2182
+ canr2 c an2 rr
2183
+ canr3 c an3 rr
2184
+ canr4 c an4 rr
2185
+ canr5 c an5 rr
2186
+ cangr1 c ang1 rr
2187
+ cangr2 c ang2 rr
2188
+ cangr3 c ang3 rr
2189
+ cangr4 c ang4 rr
2190
+ cangr5 c ang5 rr
2191
+ caor1 c ao1 rr
2192
+ caor2 c ao2 rr
2193
+ caor3 c ao3 rr
2194
+ caor4 c ao4 rr
2195
+ caor5 c ao5 rr
2196
+ cer1 c e1 rr
2197
+ cer2 c e2 rr
2198
+ cer3 c e3 rr
2199
+ cer4 c e4 rr
2200
+ cer5 c e5 rr
2201
+ cenr1 c en1 rr
2202
+ cenr2 c en2 rr
2203
+ cenr3 c en3 rr
2204
+ cenr4 c en4 rr
2205
+ cenr5 c en5 rr
2206
+ cengr1 c eng1 rr
2207
+ cengr2 c eng2 rr
2208
+ cengr3 c eng3 rr
2209
+ cengr4 c eng4 rr
2210
+ cengr5 c eng5 rr
2211
+ char1 ch a1 rr
2212
+ char2 ch a2 rr
2213
+ char3 ch a3 rr
2214
+ char4 ch a4 rr
2215
+ char5 ch a5 rr
2216
+ chair1 ch ai1 rr
2217
+ chair2 ch ai2 rr
2218
+ chair3 ch ai3 rr
2219
+ chair4 ch ai4 rr
2220
+ chair5 ch ai5 rr
2221
+ chanr1 ch an1 rr
2222
+ chanr2 ch an2 rr
2223
+ chanr3 ch an3 rr
2224
+ chanr4 ch an4 rr
2225
+ chanr5 ch an5 rr
2226
+ changr1 ch ang1 rr
2227
+ changr2 ch ang2 rr
2228
+ changr3 ch ang3 rr
2229
+ changr4 ch ang4 rr
2230
+ changr5 ch ang5 rr
2231
+ chaor1 ch ao1 rr
2232
+ chaor2 ch ao2 rr
2233
+ chaor3 ch ao3 rr
2234
+ chaor4 ch ao4 rr
2235
+ chaor5 ch ao5 rr
2236
+ cher1 ch e1 rr
2237
+ cher2 ch e2 rr
2238
+ cher3 ch e3 rr
2239
+ cher4 ch e4 rr
2240
+ cher5 ch e5 rr
2241
+ chenr1 ch en1 rr
2242
+ chenr2 ch en2 rr
2243
+ chenr3 ch en3 rr
2244
+ chenr4 ch en4 rr
2245
+ chenr5 ch en5 rr
2246
+ chengr1 ch eng1 rr
2247
+ chengr2 ch eng2 rr
2248
+ chengr3 ch eng3 rr
2249
+ chengr4 ch eng4 rr
2250
+ chengr5 ch eng5 rr
2251
+ chir1 ch iii1 rr
2252
+ chir2 ch iii2 rr
2253
+ chir3 ch iii3 rr
2254
+ chir4 ch iii4 rr
2255
+ chir5 ch iii5 rr
2256
+ chongr1 ch ong1 rr
2257
+ chongr2 ch ong2 rr
2258
+ chongr3 ch ong3 rr
2259
+ chongr4 ch ong4 rr
2260
+ chongr5 ch ong5 rr
2261
+ chour1 ch ou1 rr
2262
+ chour2 ch ou2 rr
2263
+ chour3 ch ou3 rr
2264
+ chour4 ch ou4 rr
2265
+ chour5 ch ou5 rr
2266
+ chur1 ch u1 rr
2267
+ chur2 ch u2 rr
2268
+ chur3 ch u3 rr
2269
+ chur4 ch u4 rr
2270
+ chur5 ch u5 rr
2271
+ chuair1 ch uai1 rr
2272
+ chuair2 ch uai2 rr
2273
+ chuair3 ch uai3 rr
2274
+ chuair4 ch uai4 rr
2275
+ chuair5 ch uai5 rr
2276
+ chuanr1 ch uan1 rr
2277
+ chuanr2 ch uan2 rr
2278
+ chuanr3 ch uan3 rr
2279
+ chuanr4 ch uan4 rr
2280
+ chuanr5 ch uan5 rr
2281
+ chuangr1 ch uang1 rr
2282
+ chuangr2 ch uang2 rr
2283
+ chuangr3 ch uang3 rr
2284
+ chuangr4 ch uang4 rr
2285
+ chuangr5 ch uang5 rr
2286
+ chuir1 ch uei1 rr
2287
+ chuir2 ch uei2 rr
2288
+ chuir3 ch uei3 rr
2289
+ chuir4 ch uei4 rr
2290
+ chuir5 ch uei5 rr
2291
+ chunr1 ch uen1 rr
2292
+ chunr2 ch uen2 rr
2293
+ chunr3 ch uen3 rr
2294
+ chunr4 ch uen4 rr
2295
+ chunr5 ch uen5 rr
2296
+ chuor1 ch uo1 rr
2297
+ chuor2 ch uo2 rr
2298
+ chuor3 ch uo3 rr
2299
+ chuor4 ch uo4 rr
2300
+ chuor5 ch uo5 rr
2301
+ cir1 c ii1 rr
2302
+ cir2 c ii2 rr
2303
+ cir3 c ii3 rr
2304
+ cir4 c ii4 rr
2305
+ cir5 c ii5 rr
2306
+ congr1 c ong1 rr
2307
+ congr2 c ong2 rr
2308
+ congr3 c ong3 rr
2309
+ congr4 c ong4 rr
2310
+ congr5 c ong5 rr
2311
+ cour1 c ou1 rr
2312
+ cour2 c ou2 rr
2313
+ cour3 c ou3 rr
2314
+ cour4 c ou4 rr
2315
+ cour5 c ou5 rr
2316
+ cur1 c u1 rr
2317
+ cur2 c u2 rr
2318
+ cur3 c u3 rr
2319
+ cur4 c u4 rr
2320
+ cur5 c u5 rr
2321
+ cuanr1 c uan1 rr
2322
+ cuanr2 c uan2 rr
2323
+ cuanr3 c uan3 rr
2324
+ cuanr4 c uan4 rr
2325
+ cuanr5 c uan5 rr
2326
+ cuir1 c uei1 rr
2327
+ cuir2 c uei2 rr
2328
+ cuir3 c uei3 rr
2329
+ cuir4 c uei4 rr
2330
+ cuir5 c uei5 rr
2331
+ cunr1 c uen1 rr
2332
+ cunr2 c uen2 rr
2333
+ cunr3 c uen3 rr
2334
+ cunr4 c uen4 rr
2335
+ cunr5 c uen5 rr
2336
+ cuor1 c uo1 rr
2337
+ cuor2 c uo2 rr
2338
+ cuor3 c uo3 rr
2339
+ cuor4 c uo4 rr
2340
+ cuor5 c uo5 rr
2341
+ dar1 d a1 rr
2342
+ dar2 d a2 rr
2343
+ dar3 d a3 rr
2344
+ dar4 d a4 rr
2345
+ dar5 d a5 rr
2346
+ dair1 d ai1 rr
2347
+ dair2 d ai2 rr
2348
+ dair3 d ai3 rr
2349
+ dair4 d ai4 rr
2350
+ dair5 d ai5 rr
2351
+ danr1 d an1 rr
2352
+ danr2 d an2 rr
2353
+ danr3 d an3 rr
2354
+ danr4 d an4 rr
2355
+ danr5 d an5 rr
2356
+ dangr1 d ang1 rr
2357
+ dangr2 d ang2 rr
2358
+ dangr3 d ang3 rr
2359
+ dangr4 d ang4 rr
2360
+ dangr5 d ang5 rr
2361
+ daor1 d ao1 rr
2362
+ daor2 d ao2 rr
2363
+ daor3 d ao3 rr
2364
+ daor4 d ao4 rr
2365
+ daor5 d ao5 rr
2366
+ der1 d e1 rr
2367
+ der2 d e2 rr
2368
+ der3 d e3 rr
2369
+ der4 d e4 rr
2370
+ der5 d e5 rr
2371
+ deir1 d ei1 rr
2372
+ deir2 d ei2 rr
2373
+ deir3 d ei3 rr
2374
+ deir4 d ei4 rr
2375
+ deir5 d ei5 rr
2376
+ denr1 d en1 rr
2377
+ denr2 d en2 rr
2378
+ denr3 d en3 rr
2379
+ denr4 d en4 rr
2380
+ denr5 d en5 rr
2381
+ dengr1 d eng1 rr
2382
+ dengr2 d eng2 rr
2383
+ dengr3 d eng3 rr
2384
+ dengr4 d eng4 rr
2385
+ dengr5 d eng5 rr
2386
+ dir1 d i1 rr
2387
+ dir2 d i2 rr
2388
+ dir3 d i3 rr
2389
+ dir4 d i4 rr
2390
+ dir5 d i5 rr
2391
+ diar1 d ia1 rr
2392
+ diar2 d ia2 rr
2393
+ diar3 d ia3 rr
2394
+ diar4 d ia4 rr
2395
+ diar5 d ia5 rr
2396
+ dianr1 d ian1 rr
2397
+ dianr2 d ian2 rr
2398
+ dianr3 d ian3 rr
2399
+ dianr4 d ian4 rr
2400
+ dianr5 d ian5 rr
2401
+ diaor1 d iao1 rr
2402
+ diaor2 d iao2 rr
2403
+ diaor3 d iao3 rr
2404
+ diaor4 d iao4 rr
2405
+ diaor5 d iao5 rr
2406
+ dier1 d ie1 rr
2407
+ dier2 d ie2 rr
2408
+ dier3 d ie3 rr
2409
+ dier4 d ie4 rr
2410
+ dier5 d ie5 rr
2411
+ dingr1 d ing1 rr
2412
+ dingr2 d ing2 rr
2413
+ dingr3 d ing3 rr
2414
+ dingr4 d ing4 rr
2415
+ dingr5 d ing5 rr
2416
+ diur1 d iou1 rr
2417
+ diur2 d iou2 rr
2418
+ diur3 d iou3 rr
2419
+ diur4 d iou4 rr
2420
+ diur5 d iou5 rr
2421
+ dongr1 d ong1 rr
2422
+ dongr2 d ong2 rr
2423
+ dongr3 d ong3 rr
2424
+ dongr4 d ong4 rr
2425
+ dongr5 d ong5 rr
2426
+ dour1 d ou1 rr
2427
+ dour2 d ou2 rr
2428
+ dour3 d ou3 rr
2429
+ dour4 d ou4 rr
2430
+ dour5 d ou5 rr
2431
+ dur1 d u1 rr
2432
+ dur2 d u2 rr
2433
+ dur3 d u3 rr
2434
+ dur4 d u4 rr
2435
+ dur5 d u5 rr
2436
+ duanr1 d uan1 rr
2437
+ duanr2 d uan2 rr
2438
+ duanr3 d uan3 rr
2439
+ duanr4 d uan4 rr
2440
+ duanr5 d uan5 rr
2441
+ duir1 d uei1 rr
2442
+ duir2 d uei2 rr
2443
+ duir3 d uei3 rr
2444
+ duir4 d uei4 rr
2445
+ duir5 d uei5 rr
2446
+ dunr1 d uen1 rr
2447
+ dunr2 d uen2 rr
2448
+ dunr3 d uen3 rr
2449
+ dunr4 d uen4 rr
2450
+ dunr5 d uen5 rr
2451
+ duor1 d uo1 rr
2452
+ duor2 d uo2 rr
2453
+ duor3 d uo3 rr
2454
+ duor4 d uo4 rr
2455
+ duor5 d uo5 rr
2456
+ er1 e1 rr
2457
+ er2 e2 rr
2458
+ er3 e3 rr
2459
+ er4 e4 rr
2460
+ er5 e5 rr
2461
+ eir1 ei1 rr
2462
+ eir2 ei2 rr
2463
+ eir3 ei3 rr
2464
+ eir4 ei4 rr
2465
+ eir5 ei5 rr
2466
+ enr1 en1 rr
2467
+ enr2 en2 rr
2468
+ enr3 en3 rr
2469
+ enr4 en4 rr
2470
+ enr5 en5 rr
2471
+ engr1 eng1 rr
2472
+ engr2 eng2 rr
2473
+ engr3 eng3 rr
2474
+ engr4 eng4 rr
2475
+ engr5 eng5 rr
2476
+ far1 f a1 rr
2477
+ far2 f a2 rr
2478
+ far3 f a3 rr
2479
+ far4 f a4 rr
2480
+ far5 f a5 rr
2481
+ fanr1 f an1 rr
2482
+ fanr2 f an2 rr
2483
+ fanr3 f an3 rr
2484
+ fanr4 f an4 rr
2485
+ fanr5 f an5 rr
2486
+ fangr1 f ang1 rr
2487
+ fangr2 f ang2 rr
2488
+ fangr3 f ang3 rr
2489
+ fangr4 f ang4 rr
2490
+ fangr5 f ang5 rr
2491
+ feir1 f ei1 rr
2492
+ feir2 f ei2 rr
2493
+ feir3 f ei3 rr
2494
+ feir4 f ei4 rr
2495
+ feir5 f ei5 rr
2496
+ fenr1 f en1 rr
2497
+ fenr2 f en2 rr
2498
+ fenr3 f en3 rr
2499
+ fenr4 f en4 rr
2500
+ fenr5 f en5 rr
2501
+ fengr1 f eng1 rr
2502
+ fengr2 f eng2 rr
2503
+ fengr3 f eng3 rr
2504
+ fengr4 f eng4 rr
2505
+ fengr5 f eng5 rr
2506
+ for1 f o1 rr
2507
+ for2 f o2 rr
2508
+ for3 f o3 rr
2509
+ for4 f o4 rr
2510
+ for5 f o5 rr
2511
+ four1 f ou1 rr
2512
+ four2 f ou2 rr
2513
+ four3 f ou3 rr
2514
+ four4 f ou4 rr
2515
+ four5 f ou5 rr
2516
+ fur1 f u1 rr
2517
+ fur2 f u2 rr
2518
+ fur3 f u3 rr
2519
+ fur4 f u4 rr
2520
+ fur5 f u5 rr
2521
+ gar1 g a1 rr
2522
+ gar2 g a2 rr
2523
+ gar3 g a3 rr
2524
+ gar4 g a4 rr
2525
+ gar5 g a5 rr
2526
+ gair1 g ai1 rr
2527
+ gair2 g ai2 rr
2528
+ gair3 g ai3 rr
2529
+ gair4 g ai4 rr
2530
+ gair5 g ai5 rr
2531
+ ganr1 g an1 rr
2532
+ ganr2 g an2 rr
2533
+ ganr3 g an3 rr
2534
+ ganr4 g an4 rr
2535
+ ganr5 g an5 rr
2536
+ gangr1 g ang1 rr
2537
+ gangr2 g ang2 rr
2538
+ gangr3 g ang3 rr
2539
+ gangr4 g ang4 rr
2540
+ gangr5 g ang5 rr
2541
+ gaor1 g ao1 rr
2542
+ gaor2 g ao2 rr
2543
+ gaor3 g ao3 rr
2544
+ gaor4 g ao4 rr
2545
+ gaor5 g ao5 rr
2546
+ ger1 g e1 rr
2547
+ ger2 g e2 rr
2548
+ ger3 g e3 rr
2549
+ ger4 g e4 rr
2550
+ ger5 g e5 rr
2551
+ geir1 g ei1 rr
2552
+ geir2 g ei2 rr
2553
+ geir3 g ei3 rr
2554
+ geir4 g ei4 rr
2555
+ geir5 g ei5 rr
2556
+ genr1 g en1 rr
2557
+ genr2 g en2 rr
2558
+ genr3 g en3 rr
2559
+ genr4 g en4 rr
2560
+ genr5 g en5 rr
2561
+ gengr1 g eng1 rr
2562
+ gengr2 g eng2 rr
2563
+ gengr3 g eng3 rr
2564
+ gengr4 g eng4 rr
2565
+ gengr5 g eng5 rr
2566
+ gongr1 g ong1 rr
2567
+ gongr2 g ong2 rr
2568
+ gongr3 g ong3 rr
2569
+ gongr4 g ong4 rr
2570
+ gongr5 g ong5 rr
2571
+ gour1 g ou1 rr
2572
+ gour2 g ou2 rr
2573
+ gour3 g ou3 rr
2574
+ gour4 g ou4 rr
2575
+ gour5 g ou5 rr
2576
+ gur1 g u1 rr
2577
+ gur2 g u2 rr
2578
+ gur3 g u3 rr
2579
+ gur4 g u4 rr
2580
+ gur5 g u5 rr
2581
+ guar1 g ua1 rr
2582
+ guar2 g ua2 rr
2583
+ guar3 g ua3 rr
2584
+ guar4 g ua4 rr
2585
+ guar5 g ua5 rr
2586
+ guair1 g uai1 rr
2587
+ guair2 g uai2 rr
2588
+ guair3 g uai3 rr
2589
+ guair4 g uai4 rr
2590
+ guair5 g uai5 rr
2591
+ guanr1 g uan1 rr
2592
+ guanr2 g uan2 rr
2593
+ guanr3 g uan3 rr
2594
+ guanr4 g uan4 rr
2595
+ guanr5 g uan5 rr
2596
+ guangr1 g uang1 rr
2597
+ guangr2 g uang2 rr
2598
+ guangr3 g uang3 rr
2599
+ guangr4 g uang4 rr
2600
+ guangr5 g uang5 rr
2601
+ guir1 g uei1 rr
2602
+ guir2 g uei2 rr
2603
+ guir3 g uei3 rr
2604
+ guir4 g uei4 rr
2605
+ guir5 g uei5 rr
2606
+ gunr1 g uen1 rr
2607
+ gunr2 g uen2 rr
2608
+ gunr3 g uen3 rr
2609
+ gunr4 g uen4 rr
2610
+ gunr5 g uen5 rr
2611
+ guor1 g uo1 rr
2612
+ guor2 g uo2 rr
2613
+ guor3 g uo3 rr
2614
+ guor4 g uo4 rr
2615
+ guor5 g uo5 rr
2616
+ har1 h a1 rr
2617
+ har2 h a2 rr
2618
+ har3 h a3 rr
2619
+ har4 h a4 rr
2620
+ har5 h a5 rr
2621
+ hair1 h ai1 rr
2622
+ hair2 h ai2 rr
2623
+ hair3 h ai3 rr
2624
+ hair4 h ai4 rr
2625
+ hair5 h ai5 rr
2626
+ hanr1 h an1 rr
2627
+ hanr2 h an2 rr
2628
+ hanr3 h an3 rr
2629
+ hanr4 h an4 rr
2630
+ hanr5 h an5 rr
2631
+ hangr1 h ang1 rr
2632
+ hangr2 h ang2 rr
2633
+ hangr3 h ang3 rr
2634
+ hangr4 h ang4 rr
2635
+ hangr5 h ang5 rr
2636
+ haor1 h ao1 rr
2637
+ haor2 h ao2 rr
2638
+ haor3 h ao3 rr
2639
+ haor4 h ao4 rr
2640
+ haor5 h ao5 rr
2641
+ her1 h e1 rr
2642
+ her2 h e2 rr
2643
+ her3 h e3 rr
2644
+ her4 h e4 rr
2645
+ her5 h e5 rr
2646
+ heir1 h ei1 rr
2647
+ heir2 h ei2 rr
2648
+ heir3 h ei3 rr
2649
+ heir4 h ei4 rr
2650
+ heir5 h ei5 rr
2651
+ henr1 h en1 rr
2652
+ henr2 h en2 rr
2653
+ henr3 h en3 rr
2654
+ henr4 h en4 rr
2655
+ henr5 h en5 rr
2656
+ hengr1 h eng1 rr
2657
+ hengr2 h eng2 rr
2658
+ hengr3 h eng3 rr
2659
+ hengr4 h eng4 rr
2660
+ hengr5 h eng5 rr
2661
+ hongr1 h ong1 rr
2662
+ hongr2 h ong2 rr
2663
+ hongr3 h ong3 rr
2664
+ hongr4 h ong4 rr
2665
+ hongr5 h ong5 rr
2666
+ hour1 h ou1 rr
2667
+ hour2 h ou2 rr
2668
+ hour3 h ou3 rr
2669
+ hour4 h ou4 rr
2670
+ hour5 h ou5 rr
2671
+ hur1 h u1 rr
2672
+ hur2 h u2 rr
2673
+ hur3 h u3 rr
2674
+ hur4 h u4 rr
2675
+ hur5 h u5 rr
2676
+ huar1 h ua1 rr
2677
+ huar2 h ua2 rr
2678
+ huar3 h ua3 rr
2679
+ huar4 h ua4 rr
2680
+ huar5 h ua5 rr
2681
+ huair1 h uai1 rr
2682
+ huair2 h uai2 rr
2683
+ huair3 h uai3 rr
2684
+ huair4 h uai4 rr
2685
+ huair5 h uai5 rr
2686
+ huanr1 h uan1 rr
2687
+ huanr2 h uan2 rr
2688
+ huanr3 h uan3 rr
2689
+ huanr4 h uan4 rr
2690
+ huanr5 h uan5 rr
2691
+ huangr1 h uang1 rr
2692
+ huangr2 h uang2 rr
2693
+ huangr3 h uang3 rr
2694
+ huangr4 h uang4 rr
2695
+ huangr5 h uang5 rr
2696
+ huir1 h uei1 rr
2697
+ huir2 h uei2 rr
2698
+ huir3 h uei3 rr
2699
+ huir4 h uei4 rr
2700
+ huir5 h uei5 rr
2701
+ hunr1 h uen1 rr
2702
+ hunr2 h uen2 rr
2703
+ hunr3 h uen3 rr
2704
+ hunr4 h uen4 rr
2705
+ hunr5 h uen5 rr
2706
+ huor1 h uo1 rr
2707
+ huor2 h uo2 rr
2708
+ huor3 h uo3 rr
2709
+ huor4 h uo4 rr
2710
+ huor5 h uo5 rr
2711
+ jir1 j i1 rr
2712
+ jir2 j i2 rr
2713
+ jir3 j i3 rr
2714
+ jir4 j i4 rr
2715
+ jir5 j i5 rr
2716
+ jiar1 j ia1 rr
2717
+ jiar2 j ia2 rr
2718
+ jiar3 j ia3 rr
2719
+ jiar4 j ia4 rr
2720
+ jiar5 j ia5 rr
2721
+ jianr1 j ian1 rr
2722
+ jianr2 j ian2 rr
2723
+ jianr3 j ian3 rr
2724
+ jianr4 j ian4 rr
2725
+ jianr5 j ian5 rr
2726
+ jiangr1 j iang1 rr
2727
+ jiangr2 j iang2 rr
2728
+ jiangr3 j iang3 rr
2729
+ jiangr4 j iang4 rr
2730
+ jiangr5 j iang5 rr
2731
+ jiaor1 j iao1 rr
2732
+ jiaor2 j iao2 rr
2733
+ jiaor3 j iao3 rr
2734
+ jiaor4 j iao4 rr
2735
+ jiaor5 j iao5 rr
2736
+ jier1 j ie1 rr
2737
+ jier2 j ie2 rr
2738
+ jier3 j ie3 rr
2739
+ jier4 j ie4 rr
2740
+ jier5 j ie5 rr
2741
+ jinr1 j in1 rr
2742
+ jinr2 j in2 rr
2743
+ jinr3 j in3 rr
2744
+ jinr4 j in4 rr
2745
+ jinr5 j in5 rr
2746
+ jingr1 j ing1 rr
2747
+ jingr2 j ing2 rr
2748
+ jingr3 j ing3 rr
2749
+ jingr4 j ing4 rr
2750
+ jingr5 j ing5 rr
2751
+ jiongr1 j iong1 rr
2752
+ jiongr2 j iong2 rr
2753
+ jiongr3 j iong3 rr
2754
+ jiongr4 j iong4 rr
2755
+ jiongr5 j iong5 rr
2756
+ jiur1 j iou1 rr
2757
+ jiur2 j iou2 rr
2758
+ jiur3 j iou3 rr
2759
+ jiur4 j iou4 rr
2760
+ jiur5 j iou5 rr
2761
+ jur1 j v1 rr
2762
+ jur2 j v2 rr
2763
+ jur3 j v3 rr
2764
+ jur4 j v4 rr
2765
+ jur5 j v5 rr
2766
+ juanr1 j van1 rr
2767
+ juanr2 j van2 rr
2768
+ juanr3 j van3 rr
2769
+ juanr4 j van4 rr
2770
+ juanr5 j van5 rr
2771
+ juer1 j ve1 rr
2772
+ juer2 j ve2 rr
2773
+ juer3 j ve3 rr
2774
+ juer4 j ve4 rr
2775
+ juer5 j ve5 rr
2776
+ junr1 j vn1 rr
2777
+ junr2 j vn2 rr
2778
+ junr3 j vn3 rr
2779
+ junr4 j vn4 rr
2780
+ junr5 j vn5 rr
2781
+ kar1 k a1 rr
2782
+ kar2 k a2 rr
2783
+ kar3 k a3 rr
2784
+ kar4 k a4 rr
2785
+ kar5 k a5 rr
2786
+ kair1 k ai1 rr
2787
+ kair2 k ai2 rr
2788
+ kair3 k ai3 rr
2789
+ kair4 k ai4 rr
2790
+ kair5 k ai5 rr
2791
+ kanr1 k an1 rr
2792
+ kanr2 k an2 rr
2793
+ kanr3 k an3 rr
2794
+ kanr4 k an4 rr
2795
+ kanr5 k an5 rr
2796
+ kangr1 k ang1 rr
2797
+ kangr2 k ang2 rr
2798
+ kangr3 k ang3 rr
2799
+ kangr4 k ang4 rr
2800
+ kangr5 k ang5 rr
2801
+ kaor1 k ao1 rr
2802
+ kaor2 k ao2 rr
2803
+ kaor3 k ao3 rr
2804
+ kaor4 k ao4 rr
2805
+ kaor5 k ao5 rr
2806
+ ker1 k e1 rr
2807
+ ker2 k e2 rr
2808
+ ker3 k e3 rr
2809
+ ker4 k e4 rr
2810
+ ker5 k e5 rr
2811
+ keir1 k ei1 rr
2812
+ keir2 k ei2 rr
2813
+ keir3 k ei3 rr
2814
+ keir4 k ei4 rr
2815
+ keir5 k ei5 rr
2816
+ kenr1 k en1 rr
2817
+ kenr2 k en2 rr
2818
+ kenr3 k en3 rr
2819
+ kenr4 k en4 rr
2820
+ kenr5 k en5 rr
2821
+ kengr1 k eng1 rr
2822
+ kengr2 k eng2 rr
2823
+ kengr3 k eng3 rr
2824
+ kengr4 k eng4 rr
2825
+ kengr5 k eng5 rr
2826
+ kongr1 k ong1 rr
2827
+ kongr2 k ong2 rr
2828
+ kongr3 k ong3 rr
2829
+ kongr4 k ong4 rr
2830
+ kongr5 k ong5 rr
2831
+ kour1 k ou1 rr
2832
+ kour2 k ou2 rr
2833
+ kour3 k ou3 rr
2834
+ kour4 k ou4 rr
2835
+ kour5 k ou5 rr
2836
+ kur1 k u1 rr
2837
+ kur2 k u2 rr
2838
+ kur3 k u3 rr
2839
+ kur4 k u4 rr
2840
+ kur5 k u5 rr
2841
+ kuar1 k ua1 rr
2842
+ kuar2 k ua2 rr
2843
+ kuar3 k ua3 rr
2844
+ kuar4 k ua4 rr
2845
+ kuar5 k ua5 rr
2846
+ kuair1 k uai1 rr
2847
+ kuair2 k uai2 rr
2848
+ kuair3 k uai3 rr
2849
+ kuair4 k uai4 rr
2850
+ kuair5 k uai5 rr
2851
+ kuanr1 k uan1 rr
2852
+ kuanr2 k uan2 rr
2853
+ kuanr3 k uan3 rr
2854
+ kuanr4 k uan4 rr
2855
+ kuanr5 k uan5 rr
2856
+ kuangr1 k uang1 rr
2857
+ kuangr2 k uang2 rr
2858
+ kuangr3 k uang3 rr
2859
+ kuangr4 k uang4 rr
2860
+ kuangr5 k uang5 rr
2861
+ kuir1 k uei1 rr
2862
+ kuir2 k uei2 rr
2863
+ kuir3 k uei3 rr
2864
+ kuir4 k uei4 rr
2865
+ kuir5 k uei5 rr
2866
+ kunr1 k uen1 rr
2867
+ kunr2 k uen2 rr
2868
+ kunr3 k uen3 rr
2869
+ kunr4 k uen4 rr
2870
+ kunr5 k uen5 rr
2871
+ kuor1 k uo1 rr
2872
+ kuor2 k uo2 rr
2873
+ kuor3 k uo3 rr
2874
+ kuor4 k uo4 rr
2875
+ kuor5 k uo5 rr
2876
+ lar1 l a1 rr
2877
+ lar2 l a2 rr
2878
+ lar3 l a3 rr
2879
+ lar4 l a4 rr
2880
+ lar5 l a5 rr
2881
+ lair1 l ai1 rr
2882
+ lair2 l ai2 rr
2883
+ lair3 l ai3 rr
2884
+ lair4 l ai4 rr
2885
+ lair5 l ai5 rr
2886
+ lanr1 l an1 rr
2887
+ lanr2 l an2 rr
2888
+ lanr3 l an3 rr
2889
+ lanr4 l an4 rr
2890
+ lanr5 l an5 rr
2891
+ langr1 l ang1 rr
2892
+ langr2 l ang2 rr
2893
+ langr3 l ang3 rr
2894
+ langr4 l ang4 rr
2895
+ langr5 l ang5 rr
2896
+ laor1 l ao1 rr
2897
+ laor2 l ao2 rr
2898
+ laor3 l ao3 rr
2899
+ laor4 l ao4 rr
2900
+ laor5 l ao5 rr
2901
+ ler1 l e1 rr
2902
+ ler2 l e2 rr
2903
+ ler3 l e3 rr
2904
+ ler4 l e4 rr
2905
+ ler5 l e5 rr
2906
+ leir1 l ei1 rr
2907
+ leir2 l ei2 rr
2908
+ leir3 l ei3 rr
2909
+ leir4 l ei4 rr
2910
+ leir5 l ei5 rr
2911
+ lengr1 l eng1 rr
2912
+ lengr2 l eng2 rr
2913
+ lengr3 l eng3 rr
2914
+ lengr4 l eng4 rr
2915
+ lengr5 l eng5 rr
2916
+ lir1 l i1 rr
2917
+ lir2 l i2 rr
2918
+ lir3 l i3 rr
2919
+ lir4 l i4 rr
2920
+ lir5 l i5 rr
2921
+ liar1 l ia1 rr
2922
+ liar2 l ia2 rr
2923
+ liar3 l ia3 rr
2924
+ liar4 l ia4 rr
2925
+ liar5 l ia5 rr
2926
+ lianr1 l ian1 rr
2927
+ lianr2 l ian2 rr
2928
+ lianr3 l ian3 rr
2929
+ lianr4 l ian4 rr
2930
+ lianr5 l ian5 rr
2931
+ liangr1 l iang1 rr
2932
+ liangr2 l iang2 rr
2933
+ liangr3 l iang3 rr
2934
+ liangr4 l iang4 rr
2935
+ liangr5 l iang5 rr
2936
+ liaor1 l iao1 rr
2937
+ liaor2 l iao2 rr
2938
+ liaor3 l iao3 rr
2939
+ liaor4 l iao4 rr
2940
+ liaor5 l iao5 rr
2941
+ lier1 l ie1 rr
2942
+ lier2 l ie2 rr
2943
+ lier3 l ie3 rr
2944
+ lier4 l ie4 rr
2945
+ lier5 l ie5 rr
2946
+ linr1 l in1 rr
2947
+ linr2 l in2 rr
2948
+ linr3 l in3 rr
2949
+ linr4 l in4 rr
2950
+ linr5 l in5 rr
2951
+ lingr1 l ing1 rr
2952
+ lingr2 l ing2 rr
2953
+ lingr3 l ing3 rr
2954
+ lingr4 l ing4 rr
2955
+ lingr5 l ing5 rr
2956
+ liur1 l iou1 rr
2957
+ liur2 l iou2 rr
2958
+ liur3 l iou3 rr
2959
+ liur4 l iou4 rr
2960
+ liur5 l iou5 rr
2961
+ lor1 l o1 rr
2962
+ lor2 l o2 rr
2963
+ lor3 l o3 rr
2964
+ lor4 l o4 rr
2965
+ lor5 l o5 rr
2966
+ longr1 l ong1 rr
2967
+ longr2 l ong2 rr
2968
+ longr3 l ong3 rr
2969
+ longr4 l ong4 rr
2970
+ longr5 l ong5 rr
2971
+ lour1 l ou1 rr
2972
+ lour2 l ou2 rr
2973
+ lour3 l ou3 rr
2974
+ lour4 l ou4 rr
2975
+ lour5 l ou5 rr
2976
+ lur1 l u1 rr
2977
+ lur2 l u2 rr
2978
+ lur3 l u3 rr
2979
+ lur4 l u4 rr
2980
+ lur5 l u5 rr
2981
+ luanr1 l uan1 rr
2982
+ luanr2 l uan2 rr
2983
+ luanr3 l uan3 rr
2984
+ luanr4 l uan4 rr
2985
+ luanr5 l uan5 rr
2986
+ luer1 l ve1 rr
2987
+ luer2 l ve2 rr
2988
+ luer3 l ve3 rr
2989
+ luer4 l ve4 rr
2990
+ luer5 l ve5 rr
2991
+ lver1 l ve1 rr
2992
+ lver2 l ve2 rr
2993
+ lver3 l ve3 rr
2994
+ lver4 l ve4 rr
2995
+ lver5 l ve5 rr
2996
+ lunr1 l uen1 rr
2997
+ lunr2 l uen2 rr
2998
+ lunr3 l uen3 rr
2999
+ lunr4 l uen4 rr
3000
+ lunr5 l uen5 rr
3001
+ luor1 l uo1 rr
3002
+ luor2 l uo2 rr
3003
+ luor3 l uo3 rr
3004
+ luor4 l uo4 rr
3005
+ luor5 l uo5 rr
3006
+ lvr1 l v1 rr
3007
+ lvr2 l v2 rr
3008
+ lvr3 l v3 rr
3009
+ lvr4 l v4 rr
3010
+ lvr5 l v5 rr
3011
+ mar1 m a1 rr
3012
+ mar2 m a2 rr
3013
+ mar3 m a3 rr
3014
+ mar4 m a4 rr
3015
+ mar5 m a5 rr
3016
+ mair1 m ai1 rr
3017
+ mair2 m ai2 rr
3018
+ mair3 m ai3 rr
3019
+ mair4 m ai4 rr
3020
+ mair5 m ai5 rr
3021
+ manr1 m an1 rr
3022
+ manr2 m an2 rr
3023
+ manr3 m an3 rr
3024
+ manr4 m an4 rr
3025
+ manr5 m an5 rr
3026
+ mangr1 m ang1 rr
3027
+ mangr2 m ang2 rr
3028
+ mangr3 m ang3 rr
3029
+ mangr4 m ang4 rr
3030
+ mangr5 m ang5 rr
3031
+ maor1 m ao1 rr
3032
+ maor2 m ao2 rr
3033
+ maor3 m ao3 rr
3034
+ maor4 m ao4 rr
3035
+ maor5 m ao5 rr
3036
+ mer1 m e1 rr
3037
+ mer2 m e2 rr
3038
+ mer3 m e3 rr
3039
+ mer4 m e4 rr
3040
+ mer5 m e5 rr
3041
+ meir1 m ei1 rr
3042
+ meir2 m ei2 rr
3043
+ meir3 m ei3 rr
3044
+ meir4 m ei4 rr
3045
+ meir5 m ei5 rr
3046
+ menr1 m en1 rr
3047
+ menr2 m en2 rr
3048
+ menr3 m en3 rr
3049
+ menr4 m en4 rr
3050
+ menr5 m en5 rr
3051
+ mengr1 m eng1 rr
3052
+ mengr2 m eng2 rr
3053
+ mengr3 m eng3 rr
3054
+ mengr4 m eng4 rr
3055
+ mengr5 m eng5 rr
3056
+ mir1 m i1 rr
3057
+ mir2 m i2 rr
3058
+ mir3 m i3 rr
3059
+ mir4 m i4 rr
3060
+ mir5 m i5 rr
3061
+ mianr1 m ian1 rr
3062
+ mianr2 m ian2 rr
3063
+ mianr3 m ian3 rr
3064
+ mianr4 m ian4 rr
3065
+ mianr5 m ian5 rr
3066
+ miaor1 m iao1 rr
3067
+ miaor2 m iao2 rr
3068
+ miaor3 m iao3 rr
3069
+ miaor4 m iao4 rr
3070
+ miaor5 m iao5 rr
3071
+ mier1 m ie1 rr
3072
+ mier2 m ie2 rr
3073
+ mier3 m ie3 rr
3074
+ mier4 m ie4 rr
3075
+ mier5 m ie5 rr
3076
+ minr1 m in1 rr
3077
+ minr2 m in2 rr
3078
+ minr3 m in3 rr
3079
+ minr4 m in4 rr
3080
+ minr5 m in5 rr
3081
+ mingr1 m ing1 rr
3082
+ mingr2 m ing2 rr
3083
+ mingr3 m ing3 rr
3084
+ mingr4 m ing4 rr
3085
+ mingr5 m ing5 rr
3086
+ miur1 m iou1 rr
3087
+ miur2 m iou2 rr
3088
+ miur3 m iou3 rr
3089
+ miur4 m iou4 rr
3090
+ miur5 m iou5 rr
3091
+ mor1 m o1 rr
3092
+ mor2 m o2 rr
3093
+ mor3 m o3 rr
3094
+ mor4 m o4 rr
3095
+ mor5 m o5 rr
3096
+ mour1 m ou1 rr
3097
+ mour2 m ou2 rr
3098
+ mour3 m ou3 rr
3099
+ mour4 m ou4 rr
3100
+ mour5 m ou5 rr
3101
+ mur1 m u1 rr
3102
+ mur2 m u2 rr
3103
+ mur3 m u3 rr
3104
+ mur4 m u4 rr
3105
+ mur5 m u5 rr
3106
+ nar1 n a1 rr
3107
+ nar2 n a2 rr
3108
+ nar3 n a3 rr
3109
+ nar4 n a4 rr
3110
+ nar5 n a5 rr
3111
+ nair1 n ai1 rr
3112
+ nair2 n ai2 rr
3113
+ nair3 n ai3 rr
3114
+ nair4 n ai4 rr
3115
+ nair5 n ai5 rr
3116
+ nanr1 n an1 rr
3117
+ nanr2 n an2 rr
3118
+ nanr3 n an3 rr
3119
+ nanr4 n an4 rr
3120
+ nanr5 n an5 rr
3121
+ nangr1 n ang1 rr
3122
+ nangr2 n ang2 rr
3123
+ nangr3 n ang3 rr
3124
+ nangr4 n ang4 rr
3125
+ nangr5 n ang5 rr
3126
+ naor1 n ao1 rr
3127
+ naor2 n ao2 rr
3128
+ naor3 n ao3 rr
3129
+ naor4 n ao4 rr
3130
+ naor5 n ao5 rr
3131
+ ner1 n e1 rr
3132
+ ner2 n e2 rr
3133
+ ner3 n e3 rr
3134
+ ner4 n e4 rr
3135
+ ner5 n e5 rr
3136
+ neir1 n ei1 rr
3137
+ neir2 n ei2 rr
3138
+ neir3 n ei3 rr
3139
+ neir4 n ei4 rr
3140
+ neir5 n ei5 rr
3141
+ nenr1 n en1 rr
3142
+ nenr2 n en2 rr
3143
+ nenr3 n en3 rr
3144
+ nenr4 n en4 rr
3145
+ nenr5 n en5 rr
3146
+ nengr1 n eng1 rr
3147
+ nengr2 n eng2 rr
3148
+ nengr3 n eng3 rr
3149
+ nengr4 n eng4 rr
3150
+ nengr5 n eng5 rr
3151
+ nir1 n i1 rr
3152
+ nir2 n i2 rr
3153
+ nir3 n i3 rr
3154
+ nir4 n i4 rr
3155
+ nir5 n i5 rr
3156
+ nianr1 n ian1 rr
3157
+ nianr2 n ian2 rr
3158
+ nianr3 n ian3 rr
3159
+ nianr4 n ian4 rr
3160
+ nianr5 n ian5 rr
3161
+ niangr1 n iang1 rr
3162
+ niangr2 n iang2 rr
3163
+ niangr3 n iang3 rr
3164
+ niangr4 n iang4 rr
3165
+ niangr5 n iang5 rr
3166
+ niaor1 n iao1 rr
3167
+ niaor2 n iao2 rr
3168
+ niaor3 n iao3 rr
3169
+ niaor4 n iao4 rr
3170
+ niaor5 n iao5 rr
3171
+ nier1 n ie1 rr
3172
+ nier2 n ie2 rr
3173
+ nier3 n ie3 rr
3174
+ nier4 n ie4 rr
3175
+ nier5 n ie5 rr
3176
+ ninr1 n in1 rr
3177
+ ninr2 n in2 rr
3178
+ ninr3 n in3 rr
3179
+ ninr4 n in4 rr
3180
+ ninr5 n in5 rr
3181
+ ningr1 n ing1 rr
3182
+ ningr2 n ing2 rr
3183
+ ningr3 n ing3 rr
3184
+ ningr4 n ing4 rr
3185
+ ningr5 n ing5 rr
3186
+ niur1 n iou1 rr
3187
+ niur2 n iou2 rr
3188
+ niur3 n iou3 rr
3189
+ niur4 n iou4 rr
3190
+ niur5 n iou5 rr
3191
+ nongr1 n ong1 rr
3192
+ nongr2 n ong2 rr
3193
+ nongr3 n ong3 rr
3194
+ nongr4 n ong4 rr
3195
+ nongr5 n ong5 rr
3196
+ nour1 n ou1 rr
3197
+ nour2 n ou2 rr
3198
+ nour3 n ou3 rr
3199
+ nour4 n ou4 rr
3200
+ nour5 n ou5 rr
3201
+ nur1 n u1 rr
3202
+ nur2 n u2 rr
3203
+ nur3 n u3 rr
3204
+ nur4 n u4 rr
3205
+ nur5 n u5 rr
3206
+ nuanr1 n uan1 rr
3207
+ nuanr2 n uan2 rr
3208
+ nuanr3 n uan3 rr
3209
+ nuanr4 n uan4 rr
3210
+ nuanr5 n uan5 rr
3211
+ nuer1 n ve1 rr
3212
+ nuer2 n ve2 rr
3213
+ nuer3 n ve3 rr
3214
+ nuer4 n ve4 rr
3215
+ nuer5 n ve5 rr
3216
+ nver1 n ve1 rr
3217
+ nver2 n ve2 rr
3218
+ nver3 n ve3 rr
3219
+ nver4 n ve4 rr
3220
+ nver5 n ve5 rr
3221
+ nuor1 n uo1 rr
3222
+ nuor2 n uo2 rr
3223
+ nuor3 n uo3 rr
3224
+ nuor4 n uo4 rr
3225
+ nuor5 n uo5 rr
3226
+ nvr1 n v1 rr
3227
+ nvr2 n v2 rr
3228
+ nvr3 n v3 rr
3229
+ nvr4 n v4 rr
3230
+ nvr5 n v5 rr
3231
+ or1 o1 rr
3232
+ or2 o2 rr
3233
+ or3 o3 rr
3234
+ or4 o4 rr
3235
+ or5 o5 rr
3236
+ our1 ou1 rr
3237
+ our2 ou2 rr
3238
+ our3 ou3 rr
3239
+ our4 ou4 rr
3240
+ our5 ou5 rr
3241
+ par1 p a1 rr
3242
+ par2 p a2 rr
3243
+ par3 p a3 rr
3244
+ par4 p a4 rr
3245
+ par5 p a5 rr
3246
+ pair1 p ai1 rr
3247
+ pair2 p ai2 rr
3248
+ pair3 p ai3 rr
3249
+ pair4 p ai4 rr
3250
+ pair5 p ai5 rr
3251
+ panr1 p an1 rr
3252
+ panr2 p an2 rr
3253
+ panr3 p an3 rr
3254
+ panr4 p an4 rr
3255
+ panr5 p an5 rr
3256
+ pangr1 p ang1 rr
3257
+ pangr2 p ang2 rr
3258
+ pangr3 p ang3 rr
3259
+ pangr4 p ang4 rr
3260
+ pangr5 p ang5 rr
3261
+ paor1 p ao1 rr
3262
+ paor2 p ao2 rr
3263
+ paor3 p ao3 rr
3264
+ paor4 p ao4 rr
3265
+ paor5 p ao5 rr
3266
+ peir1 p ei1 rr
3267
+ peir2 p ei2 rr
3268
+ peir3 p ei3 rr
3269
+ peir4 p ei4 rr
3270
+ peir5 p ei5 rr
3271
+ penr1 p en1 rr
3272
+ penr2 p en2 rr
3273
+ penr3 p en3 rr
3274
+ penr4 p en4 rr
3275
+ penr5 p en5 rr
3276
+ pengr1 p eng1 rr
3277
+ pengr2 p eng2 rr
3278
+ pengr3 p eng3 rr
3279
+ pengr4 p eng4 rr
3280
+ pengr5 p eng5 rr
3281
+ pir1 p i1 rr
3282
+ pir2 p i2 rr
3283
+ pir3 p i3 rr
3284
+ pir4 p i4 rr
3285
+ pir5 p i5 rr
3286
+ pianr1 p ian1 rr
3287
+ pianr2 p ian2 rr
3288
+ pianr3 p ian3 rr
3289
+ pianr4 p ian4 rr
3290
+ pianr5 p ian5 rr
3291
+ piaor1 p iao1 rr
3292
+ piaor2 p iao2 rr
3293
+ piaor3 p iao3 rr
3294
+ piaor4 p iao4 rr
3295
+ piaor5 p iao5 rr
3296
+ pier1 p ie1 rr
3297
+ pier2 p ie2 rr
3298
+ pier3 p ie3 rr
3299
+ pier4 p ie4 rr
3300
+ pier5 p ie5 rr
3301
+ pinr1 p in1 rr
3302
+ pinr2 p in2 rr
3303
+ pinr3 p in3 rr
3304
+ pinr4 p in4 rr
3305
+ pinr5 p in5 rr
3306
+ pingr1 p ing1 rr
3307
+ pingr2 p ing2 rr
3308
+ pingr3 p ing3 rr
3309
+ pingr4 p ing4 rr
3310
+ pingr5 p ing5 rr
3311
+ por1 p o1 rr
3312
+ por2 p o2 rr
3313
+ por3 p o3 rr
3314
+ por4 p o4 rr
3315
+ por5 p o5 rr
3316
+ pour1 p ou1 rr
3317
+ pour2 p ou2 rr
3318
+ pour3 p ou3 rr
3319
+ pour4 p ou4 rr
3320
+ pour5 p ou5 rr
3321
+ pur1 p u1 rr
3322
+ pur2 p u2 rr
3323
+ pur3 p u3 rr
3324
+ pur4 p u4 rr
3325
+ pur5 p u5 rr
3326
+ qir1 q i1 rr
3327
+ qir2 q i2 rr
3328
+ qir3 q i3 rr
3329
+ qir4 q i4 rr
3330
+ qir5 q i5 rr
3331
+ qiar1 q ia1 rr
3332
+ qiar2 q ia2 rr
3333
+ qiar3 q ia3 rr
3334
+ qiar4 q ia4 rr
3335
+ qiar5 q ia5 rr
3336
+ qianr1 q ian1 rr
3337
+ qianr2 q ian2 rr
3338
+ qianr3 q ian3 rr
3339
+ qianr4 q ian4 rr
3340
+ qianr5 q ian5 rr
3341
+ qiangr1 q iang1 rr
3342
+ qiangr2 q iang2 rr
3343
+ qiangr3 q iang3 rr
3344
+ qiangr4 q iang4 rr
3345
+ qiangr5 q iang5 rr
3346
+ qiaor1 q iao1 rr
3347
+ qiaor2 q iao2 rr
3348
+ qiaor3 q iao3 rr
3349
+ qiaor4 q iao4 rr
3350
+ qiaor5 q iao5 rr
3351
+ qier1 q ie1 rr
3352
+ qier2 q ie2 rr
3353
+ qier3 q ie3 rr
3354
+ qier4 q ie4 rr
3355
+ qier5 q ie5 rr
3356
+ qinr1 q in1 rr
3357
+ qinr2 q in2 rr
3358
+ qinr3 q in3 rr
3359
+ qinr4 q in4 rr
3360
+ qinr5 q in5 rr
3361
+ qingr1 q ing1 rr
3362
+ qingr2 q ing2 rr
3363
+ qingr3 q ing3 rr
3364
+ qingr4 q ing4 rr
3365
+ qingr5 q ing5 rr
3366
+ qiongr1 q iong1 rr
3367
+ qiongr2 q iong2 rr
3368
+ qiongr3 q iong3 rr
3369
+ qiongr4 q iong4 rr
3370
+ qiongr5 q iong5 rr
3371
+ qiur1 q iou1 rr
3372
+ qiur2 q iou2 rr
3373
+ qiur3 q iou3 rr
3374
+ qiur4 q iou4 rr
3375
+ qiur5 q iou5 rr
3376
+ qur1 q v1 rr
3377
+ qur2 q v2 rr
3378
+ qur3 q v3 rr
3379
+ qur4 q v4 rr
3380
+ qur5 q v5 rr
3381
+ quanr1 q van1 rr
3382
+ quanr2 q van2 rr
3383
+ quanr3 q van3 rr
3384
+ quanr4 q van4 rr
3385
+ quanr5 q van5 rr
3386
+ quer1 q ve1 rr
3387
+ quer2 q ve2 rr
3388
+ quer3 q ve3 rr
3389
+ quer4 q ve4 rr
3390
+ quer5 q ve5 rr
3391
+ qunr1 q vn1 rr
3392
+ qunr2 q vn2 rr
3393
+ qunr3 q vn3 rr
3394
+ qunr4 q vn4 rr
3395
+ qunr5 q vn5 rr
3396
+ ranr1 r an1 rr
3397
+ ranr2 r an2 rr
3398
+ ranr3 r an3 rr
3399
+ ranr4 r an4 rr
3400
+ ranr5 r an5 rr
3401
+ rangr1 r ang1 rr
3402
+ rangr2 r ang2 rr
3403
+ rangr3 r ang3 rr
3404
+ rangr4 r ang4 rr
3405
+ rangr5 r ang5 rr
3406
+ raor1 r ao1 rr
3407
+ raor2 r ao2 rr
3408
+ raor3 r ao3 rr
3409
+ raor4 r ao4 rr
3410
+ raor5 r ao5 rr
3411
+ rer1 r e1 rr
3412
+ rer2 r e2 rr
3413
+ rer3 r e3 rr
3414
+ rer4 r e4 rr
3415
+ rer5 r e5 rr
3416
+ renr1 r en1 rr
3417
+ renr2 r en2 rr
3418
+ renr3 r en3 rr
3419
+ renr4 r en4 rr
3420
+ renr5 r en5 rr
3421
+ rengr1 r eng1 rr
3422
+ rengr2 r eng2 rr
3423
+ rengr3 r eng3 rr
3424
+ rengr4 r eng4 rr
3425
+ rengr5 r eng5 rr
3426
+ rir1 r iii1 rr
3427
+ rir2 r iii2 rr
3428
+ rir3 r iii3 rr
3429
+ rir4 r iii4 rr
3430
+ rir5 r iii5 rr
3431
+ rongr1 r ong1 rr
3432
+ rongr2 r ong2 rr
3433
+ rongr3 r ong3 rr
3434
+ rongr4 r ong4 rr
3435
+ rongr5 r ong5 rr
3436
+ rour1 r ou1 rr
3437
+ rour2 r ou2 rr
3438
+ rour3 r ou3 rr
3439
+ rour4 r ou4 rr
3440
+ rour5 r ou5 rr
3441
+ rur1 r u1 rr
3442
+ rur2 r u2 rr
3443
+ rur3 r u3 rr
3444
+ rur4 r u4 rr
3445
+ rur5 r u5 rr
3446
+ ruar1 r ua1 rr
3447
+ ruar2 r ua2 rr
3448
+ ruar3 r ua3 rr
3449
+ ruar4 r ua4 rr
3450
+ ruar5 r ua5 rr
3451
+ ruanr1 r uan1 rr
3452
+ ruanr2 r uan2 rr
3453
+ ruanr3 r uan3 rr
3454
+ ruanr4 r uan4 rr
3455
+ ruanr5 r uan5 rr
3456
+ ruir1 r uei1 rr
3457
+ ruir2 r uei2 rr
3458
+ ruir3 r uei3 rr
3459
+ ruir4 r uei4 rr
3460
+ ruir5 r uei5 rr
3461
+ runr1 r uen1 rr
3462
+ runr2 r uen2 rr
3463
+ runr3 r uen3 rr
3464
+ runr4 r uen4 rr
3465
+ runr5 r uen5 rr
3466
+ ruor1 r uo1 rr
3467
+ ruor2 r uo2 rr
3468
+ ruor3 r uo3 rr
3469
+ ruor4 r uo4 rr
3470
+ ruor5 r uo5 rr
3471
+ sar1 s a1 rr
3472
+ sar2 s a2 rr
3473
+ sar3 s a3 rr
3474
+ sar4 s a4 rr
3475
+ sar5 s a5 rr
3476
+ sair1 s ai1 rr
3477
+ sair2 s ai2 rr
3478
+ sair3 s ai3 rr
3479
+ sair4 s ai4 rr
3480
+ sair5 s ai5 rr
3481
+ sanr1 s an1 rr
3482
+ sanr2 s an2 rr
3483
+ sanr3 s an3 rr
3484
+ sanr4 s an4 rr
3485
+ sanr5 s an5 rr
3486
+ sangr1 s ang1 rr
3487
+ sangr2 s ang2 rr
3488
+ sangr3 s ang3 rr
3489
+ sangr4 s ang4 rr
3490
+ sangr5 s ang5 rr
3491
+ saor1 s ao1 rr
3492
+ saor2 s ao2 rr
3493
+ saor3 s ao3 rr
3494
+ saor4 s ao4 rr
3495
+ saor5 s ao5 rr
3496
+ ser1 s e1 rr
3497
+ ser2 s e2 rr
3498
+ ser3 s e3 rr
3499
+ ser4 s e4 rr
3500
+ ser5 s e5 rr
3501
+ senr1 s en1 rr
3502
+ senr2 s en2 rr
3503
+ senr3 s en3 rr
3504
+ senr4 s en4 rr
3505
+ senr5 s en5 rr
3506
+ sengr1 s eng1 rr
3507
+ sengr2 s eng2 rr
3508
+ sengr3 s eng3 rr
3509
+ sengr4 s eng4 rr
3510
+ sengr5 s eng5 rr
3511
+ shar1 sh a1 rr
3512
+ shar2 sh a2 rr
3513
+ shar3 sh a3 rr
3514
+ shar4 sh a4 rr
3515
+ shar5 sh a5 rr
3516
+ shair1 sh ai1 rr
3517
+ shair2 sh ai2 rr
3518
+ shair3 sh ai3 rr
3519
+ shair4 sh ai4 rr
3520
+ shair5 sh ai5 rr
3521
+ shanr1 sh an1 rr
3522
+ shanr2 sh an2 rr
3523
+ shanr3 sh an3 rr
3524
+ shanr4 sh an4 rr
3525
+ shanr5 sh an5 rr
3526
+ shangr1 sh ang1 rr
3527
+ shangr2 sh ang2 rr
3528
+ shangr3 sh ang3 rr
3529
+ shangr4 sh ang4 rr
3530
+ shangr5 sh ang5 rr
3531
+ shaor1 sh ao1 rr
3532
+ shaor2 sh ao2 rr
3533
+ shaor3 sh ao3 rr
3534
+ shaor4 sh ao4 rr
3535
+ shaor5 sh ao5 rr
3536
+ sher1 sh e1 rr
3537
+ sher2 sh e2 rr
3538
+ sher3 sh e3 rr
3539
+ sher4 sh e4 rr
3540
+ sher5 sh e5 rr
3541
+ sheir1 sh ei1 rr
3542
+ sheir2 sh ei2 rr
3543
+ sheir3 sh ei3 rr
3544
+ sheir4 sh ei4 rr
3545
+ sheir5 sh ei5 rr
3546
+ shenr1 sh en1 rr
3547
+ shenr2 sh en2 rr
3548
+ shenr3 sh en3 rr
3549
+ shenr4 sh en4 rr
3550
+ shenr5 sh en5 rr
3551
+ shengr1 sh eng1 rr
3552
+ shengr2 sh eng2 rr
3553
+ shengr3 sh eng3 rr
3554
+ shengr4 sh eng4 rr
3555
+ shengr5 sh eng5 rr
3556
+ shir1 sh iii1 rr
3557
+ shir2 sh iii2 rr
3558
+ shir3 sh iii3 rr
3559
+ shir4 sh iii4 rr
3560
+ shir5 sh iii5 rr
3561
+ shour1 sh ou1 rr
3562
+ shour2 sh ou2 rr
3563
+ shour3 sh ou3 rr
3564
+ shour4 sh ou4 rr
3565
+ shour5 sh ou5 rr
3566
+ shur1 sh u1 rr
3567
+ shur2 sh u2 rr
3568
+ shur3 sh u3 rr
3569
+ shur4 sh u4 rr
3570
+ shur5 sh u5 rr
3571
+ shuar1 sh ua1 rr
3572
+ shuar2 sh ua2 rr
3573
+ shuar3 sh ua3 rr
3574
+ shuar4 sh ua4 rr
3575
+ shuar5 sh ua5 rr
3576
+ shuair1 sh uai1 rr
3577
+ shuair2 sh uai2 rr
3578
+ shuair3 sh uai3 rr
3579
+ shuair4 sh uai4 rr
3580
+ shuair5 sh uai5 rr
3581
+ shuanr1 sh uan1 rr
3582
+ shuanr2 sh uan2 rr
3583
+ shuanr3 sh uan3 rr
3584
+ shuanr4 sh uan4 rr
3585
+ shuanr5 sh uan5 rr
3586
+ shuangr1 sh uang1 rr
3587
+ shuangr2 sh uang2 rr
3588
+ shuangr3 sh uang3 rr
3589
+ shuangr4 sh uang4 rr
3590
+ shuangr5 sh uang5 rr
3591
+ shuir1 sh uei1 rr
3592
+ shuir2 sh uei2 rr
3593
+ shuir3 sh uei3 rr
3594
+ shuir4 sh uei4 rr
3595
+ shuir5 sh uei5 rr
3596
+ shunr1 sh uen1 rr
3597
+ shunr2 sh uen2 rr
3598
+ shunr3 sh uen3 rr
3599
+ shunr4 sh uen4 rr
3600
+ shunr5 sh uen5 rr
3601
+ shuor1 sh uo1 rr
3602
+ shuor2 sh uo2 rr
3603
+ shuor3 sh uo3 rr
3604
+ shuor4 sh uo4 rr
3605
+ shuor5 sh uo5 rr
3606
+ sir1 s ii1 rr
3607
+ sir2 s ii2 rr
3608
+ sir3 s ii3 rr
3609
+ sir4 s ii4 rr
3610
+ sir5 s ii5 rr
3611
+ songr1 s ong1 rr
3612
+ songr2 s ong2 rr
3613
+ songr3 s ong3 rr
3614
+ songr4 s ong4 rr
3615
+ songr5 s ong5 rr
3616
+ sour1 s ou1 rr
3617
+ sour2 s ou2 rr
3618
+ sour3 s ou3 rr
3619
+ sour4 s ou4 rr
3620
+ sour5 s ou5 rr
3621
+ sur1 s u1 rr
3622
+ sur2 s u2 rr
3623
+ sur3 s u3 rr
3624
+ sur4 s u4 rr
3625
+ sur5 s u5 rr
3626
+ suanr1 s uan1 rr
3627
+ suanr2 s uan2 rr
3628
+ suanr3 s uan3 rr
3629
+ suanr4 s uan4 rr
3630
+ suanr5 s uan5 rr
3631
+ suir1 s uei1 rr
3632
+ suir2 s uei2 rr
3633
+ suir3 s uei3 rr
3634
+ suir4 s uei4 rr
3635
+ suir5 s uei5 rr
3636
+ sunr1 s uen1 rr
3637
+ sunr2 s uen2 rr
3638
+ sunr3 s uen3 rr
3639
+ sunr4 s uen4 rr
3640
+ sunr5 s uen5 rr
3641
+ suor1 s uo1 rr
3642
+ suor2 s uo2 rr
3643
+ suor3 s uo3 rr
3644
+ suor4 s uo4 rr
3645
+ suor5 s uo5 rr
3646
+ tar1 t a1 rr
3647
+ tar2 t a2 rr
3648
+ tar3 t a3 rr
3649
+ tar4 t a4 rr
3650
+ tar5 t a5 rr
3651
+ tair1 t ai1 rr
3652
+ tair2 t ai2 rr
3653
+ tair3 t ai3 rr
3654
+ tair4 t ai4 rr
3655
+ tair5 t ai5 rr
3656
+ tanr1 t an1 rr
3657
+ tanr2 t an2 rr
3658
+ tanr3 t an3 rr
3659
+ tanr4 t an4 rr
3660
+ tanr5 t an5 rr
3661
+ tangr1 t ang1 rr
3662
+ tangr2 t ang2 rr
3663
+ tangr3 t ang3 rr
3664
+ tangr4 t ang4 rr
3665
+ tangr5 t ang5 rr
3666
+ taor1 t ao1 rr
3667
+ taor2 t ao2 rr
3668
+ taor3 t ao3 rr
3669
+ taor4 t ao4 rr
3670
+ taor5 t ao5 rr
3671
+ ter1 t e1 rr
3672
+ ter2 t e2 rr
3673
+ ter3 t e3 rr
3674
+ ter4 t e4 rr
3675
+ ter5 t e5 rr
3676
+ teir1 t ei1 rr
3677
+ teir2 t ei2 rr
3678
+ teir3 t ei3 rr
3679
+ teir4 t ei4 rr
3680
+ teir5 t ei5 rr
3681
+ tengr1 t eng1 rr
3682
+ tengr2 t eng2 rr
3683
+ tengr3 t eng3 rr
3684
+ tengr4 t eng4 rr
3685
+ tengr5 t eng5 rr
3686
+ tir1 t i1 rr
3687
+ tir2 t i2 rr
3688
+ tir3 t i3 rr
3689
+ tir4 t i4 rr
3690
+ tir5 t i5 rr
3691
+ tianr1 t ian1 rr
3692
+ tianr2 t ian2 rr
3693
+ tianr3 t ian3 rr
3694
+ tianr4 t ian4 rr
3695
+ tianr5 t ian5 rr
3696
+ tiaor1 t iao1 rr
3697
+ tiaor2 t iao2 rr
3698
+ tiaor3 t iao3 rr
3699
+ tiaor4 t iao4 rr
3700
+ tiaor5 t iao5 rr
3701
+ tier1 t ie1 rr
3702
+ tier2 t ie2 rr
3703
+ tier3 t ie3 rr
3704
+ tier4 t ie4 rr
3705
+ tier5 t ie5 rr
3706
+ tingr1 t ing1 rr
3707
+ tingr2 t ing2 rr
3708
+ tingr3 t ing3 rr
3709
+ tingr4 t ing4 rr
3710
+ tingr5 t ing5 rr
3711
+ tongr1 t ong1 rr
3712
+ tongr2 t ong2 rr
3713
+ tongr3 t ong3 rr
3714
+ tongr4 t ong4 rr
3715
+ tongr5 t ong5 rr
3716
+ tour1 t ou1 rr
3717
+ tour2 t ou2 rr
3718
+ tour3 t ou3 rr
3719
+ tour4 t ou4 rr
3720
+ tour5 t ou5 rr
3721
+ tur1 t u1 rr
3722
+ tur2 t u2 rr
3723
+ tur3 t u3 rr
3724
+ tur4 t u4 rr
3725
+ tur5 t u5 rr
3726
+ tuanr1 t uan1 rr
3727
+ tuanr2 t uan2 rr
3728
+ tuanr3 t uan3 rr
3729
+ tuanr4 t uan4 rr
3730
+ tuanr5 t uan5 rr
3731
+ tuir1 t uei1 rr
3732
+ tuir2 t uei2 rr
3733
+ tuir3 t uei3 rr
3734
+ tuir4 t uei4 rr
3735
+ tuir5 t uei5 rr
3736
+ tunr1 t uen1 rr
3737
+ tunr2 t uen2 rr
3738
+ tunr3 t uen3 rr
3739
+ tunr4 t uen4 rr
3740
+ tunr5 t uen5 rr
3741
+ tuor1 t uo1 rr
3742
+ tuor2 t uo2 rr
3743
+ tuor3 t uo3 rr
3744
+ tuor4 t uo4 rr
3745
+ tuor5 t uo5 rr
3746
+ war1 w ua1 rr
3747
+ war2 w ua2 rr
3748
+ war3 w ua3 rr
3749
+ war4 w ua4 rr
3750
+ war5 w ua5 rr
3751
+ wair1 w uai1 rr
3752
+ wair2 w uai2 rr
3753
+ wair3 w uai3 rr
3754
+ wair4 w uai4 rr
3755
+ wair5 w uai5 rr
3756
+ wanr1 w uan1 rr
3757
+ wanr2 w uan2 rr
3758
+ wanr3 w uan3 rr
3759
+ wanr4 w uan4 rr
3760
+ wanr5 w uan5 rr
3761
+ wangr1 w uang1 rr
3762
+ wangr2 w uang2 rr
3763
+ wangr3 w uang3 rr
3764
+ wangr4 w uang4 rr
3765
+ wangr5 w uang5 rr
3766
+ weir1 w uei1 rr
3767
+ weir2 w uei2 rr
3768
+ weir3 w uei3 rr
3769
+ weir4 w uei4 rr
3770
+ weir5 w uei5 rr
3771
+ wenr1 w uen1 rr
3772
+ wenr2 w uen2 rr
3773
+ wenr3 w uen3 rr
3774
+ wenr4 w uen4 rr
3775
+ wenr5 w uen5 rr
3776
+ wengr1 w uen1 rr
3777
+ wengr2 w uen2 rr
3778
+ wengr3 w uen3 rr
3779
+ wengr4 w uen4 rr
3780
+ wengr5 w uen5 rr
3781
+ wor1 w uo1 rr
3782
+ wor2 w uo2 rr
3783
+ wor3 w uo3 rr
3784
+ wor4 w uo4 rr
3785
+ wor5 w uo5 rr
3786
+ wur1 w u1 rr
3787
+ wur2 w u2 rr
3788
+ wur3 w u3 rr
3789
+ wur4 w u4 rr
3790
+ wur5 w u5 rr
3791
+ xir1 x i1 rr
3792
+ xir2 x i2 rr
3793
+ xir3 x i3 rr
3794
+ xir4 x i4 rr
3795
+ xir5 x i5 rr
3796
+ xiar1 x ia1 rr
3797
+ xiar2 x ia2 rr
3798
+ xiar3 x ia3 rr
3799
+ xiar4 x ia4 rr
3800
+ xiar5 x ia5 rr
3801
+ xianr1 x ian1 rr
3802
+ xianr2 x ian2 rr
3803
+ xianr3 x ian3 rr
3804
+ xianr4 x ian4 rr
3805
+ xianr5 x ian5 rr
3806
+ xiangr1 x iang1 rr
3807
+ xiangr2 x iang2 rr
3808
+ xiangr3 x iang3 rr
3809
+ xiangr4 x iang4 rr
3810
+ xiangr5 x iang5 rr
3811
+ xiaor1 x iao1 rr
3812
+ xiaor2 x iao2 rr
3813
+ xiaor3 x iao3 rr
3814
+ xiaor4 x iao4 rr
3815
+ xiaor5 x iao5 rr
3816
+ xier1 x ie1 rr
3817
+ xier2 x ie2 rr
3818
+ xier3 x ie3 rr
3819
+ xier4 x ie4 rr
3820
+ xier5 x ie5 rr
3821
+ xinr1 x in1 rr
3822
+ xinr2 x in2 rr
3823
+ xinr3 x in3 rr
3824
+ xinr4 x in4 rr
3825
+ xinr5 x in5 rr
3826
+ xingr1 x ing1 rr
3827
+ xingr2 x ing2 rr
3828
+ xingr3 x ing3 rr
3829
+ xingr4 x ing4 rr
3830
+ xingr5 x ing5 rr
3831
+ xiongr1 x iong1 rr
3832
+ xiongr2 x iong2 rr
3833
+ xiongr3 x iong3 rr
3834
+ xiongr4 x iong4 rr
3835
+ xiongr5 x iong5 rr
3836
+ xiur1 x iou1 rr
3837
+ xiur2 x iou2 rr
3838
+ xiur3 x iou3 rr
3839
+ xiur4 x iou4 rr
3840
+ xiur5 x iou5 rr
3841
+ xur1 x v1 rr
3842
+ xur2 x v2 rr
3843
+ xur3 x v3 rr
3844
+ xur4 x v4 rr
3845
+ xur5 x v5 rr
3846
+ xuanr1 x van1 rr
3847
+ xuanr2 x van2 rr
3848
+ xuanr3 x van3 rr
3849
+ xuanr4 x van4 rr
3850
+ xuanr5 x van5 rr
3851
+ xuer1 x ve1 rr
3852
+ xuer2 x ve2 rr
3853
+ xuer3 x ve3 rr
3854
+ xuer4 x ve4 rr
3855
+ xuer5 x ve5 rr
3856
+ xunr1 x vn1 rr
3857
+ xunr2 x vn2 rr
3858
+ xunr3 x vn3 rr
3859
+ xunr4 x vn4 rr
3860
+ xunr5 x vn5 rr
3861
+ yar1 y ia1 rr
3862
+ yar2 y ia2 rr
3863
+ yar3 y ia3 rr
3864
+ yar4 y ia4 rr
3865
+ yar5 y ia5 rr
3866
+ yanr1 y ian1 rr
3867
+ yanr2 y ian2 rr
3868
+ yanr3 y ian3 rr
3869
+ yanr4 y ian4 rr
3870
+ yanr5 y ian5 rr
3871
+ yangr1 y iang1 rr
3872
+ yangr2 y iang2 rr
3873
+ yangr3 y iang3 rr
3874
+ yangr4 y iang4 rr
3875
+ yangr5 y iang5 rr
3876
+ yaor1 y iao1 rr
3877
+ yaor2 y iao2 rr
3878
+ yaor3 y iao3 rr
3879
+ yaor4 y iao4 rr
3880
+ yaor5 y iao5 rr
3881
+ yer1 y ie1 rr
3882
+ yer2 y ie2 rr
3883
+ yer3 y ie3 rr
3884
+ yer4 y ie4 rr
3885
+ yer5 y ie5 rr
3886
+ yir1 y i1 rr
3887
+ yir2 y i2 rr
3888
+ yir3 y i3 rr
3889
+ yir4 y i4 rr
3890
+ yir5 y i5 rr
3891
+ yinr1 y in1 rr
3892
+ yinr2 y in2 rr
3893
+ yinr3 y in3 rr
3894
+ yinr4 y in4 rr
3895
+ yinr5 y in5 rr
3896
+ yingr1 y ing1 rr
3897
+ yingr2 y ing2 rr
3898
+ yingr3 y ing3 rr
3899
+ yingr4 y ing4 rr
3900
+ yingr5 y ing5 rr
3901
+ yor1 y iou1 rr
3902
+ yor2 y iou2 rr
3903
+ yor3 y iou3 rr
3904
+ yor4 y iou4 rr
3905
+ yor5 y iou5 rr
3906
+ yongr1 y iong1 rr
3907
+ yongr2 y iong2 rr
3908
+ yongr3 y iong3 rr
3909
+ yongr4 y iong4 rr
3910
+ yongr5 y iong5 rr
3911
+ your1 y iou1 rr
3912
+ your2 y iou2 rr
3913
+ your3 y iou3 rr
3914
+ your4 y iou4 rr
3915
+ your5 y iou5 rr
3916
+ yur1 y v1 rr
3917
+ yur2 y v2 rr
3918
+ yur3 y v3 rr
3919
+ yur4 y v4 rr
3920
+ yur5 y v5 rr
3921
+ yuanr1 y van1 rr
3922
+ yuanr2 y van2 rr
3923
+ yuanr3 y van3 rr
3924
+ yuanr4 y van4 rr
3925
+ yuanr5 y van5 rr
3926
+ yuer1 y ve1 rr
3927
+ yuer2 y ve2 rr
3928
+ yuer3 y ve3 rr
3929
+ yuer4 y ve4 rr
3930
+ yuer5 y ve5 rr
3931
+ yunr1 y vn1 rr
3932
+ yunr2 y vn2 rr
3933
+ yunr3 y vn3 rr
3934
+ yunr4 y vn4 rr
3935
+ yunr5 y vn5 rr
3936
+ zar1 z a1 rr
3937
+ zar2 z a2 rr
3938
+ zar3 z a3 rr
3939
+ zar4 z a4 rr
3940
+ zar5 z a5 rr
3941
+ zair1 z ai1 rr
3942
+ zair2 z ai2 rr
3943
+ zair3 z ai3 rr
3944
+ zair4 z ai4 rr
3945
+ zair5 z ai5 rr
3946
+ zanr1 z an1 rr
3947
+ zanr2 z an2 rr
3948
+ zanr3 z an3 rr
3949
+ zanr4 z an4 rr
3950
+ zanr5 z an5 rr
3951
+ zangr1 z ang1 rr
3952
+ zangr2 z ang2 rr
3953
+ zangr3 z ang3 rr
3954
+ zangr4 z ang4 rr
3955
+ zangr5 z ang5 rr
3956
+ zaor1 z ao1 rr
3957
+ zaor2 z ao2 rr
3958
+ zaor3 z ao3 rr
3959
+ zaor4 z ao4 rr
3960
+ zaor5 z ao5 rr
3961
+ zer1 z e1 rr
3962
+ zer2 z e2 rr
3963
+ zer3 z e3 rr
3964
+ zer4 z e4 rr
3965
+ zer5 z e5 rr
3966
+ zeir1 z ei1 rr
3967
+ zeir2 z ei2 rr
3968
+ zeir3 z ei3 rr
3969
+ zeir4 z ei4 rr
3970
+ zeir5 z ei5 rr
3971
+ zenr1 z en1 rr
3972
+ zenr2 z en2 rr
3973
+ zenr3 z en3 rr
3974
+ zenr4 z en4 rr
3975
+ zenr5 z en5 rr
3976
+ zengr1 z eng1 rr
3977
+ zengr2 z eng2 rr
3978
+ zengr3 z eng3 rr
3979
+ zengr4 z eng4 rr
3980
+ zengr5 z eng5 rr
3981
+ zhar1 zh a1 rr
3982
+ zhar2 zh a2 rr
3983
+ zhar3 zh a3 rr
3984
+ zhar4 zh a4 rr
3985
+ zhar5 zh a5 rr
3986
+ zhair1 zh ai1 rr
3987
+ zhair2 zh ai2 rr
3988
+ zhair3 zh ai3 rr
3989
+ zhair4 zh ai4 rr
3990
+ zhair5 zh ai5 rr
3991
+ zhanr1 zh an1 rr
3992
+ zhanr2 zh an2 rr
3993
+ zhanr3 zh an3 rr
3994
+ zhanr4 zh an4 rr
3995
+ zhanr5 zh an5 rr
3996
+ zhangr1 zh ang1 rr
3997
+ zhangr2 zh ang2 rr
3998
+ zhangr3 zh ang3 rr
3999
+ zhangr4 zh ang4 rr
4000
+ zhangr5 zh ang5 rr
4001
+ zhaor1 zh ao1 rr
4002
+ zhaor2 zh ao2 rr
4003
+ zhaor3 zh ao3 rr
4004
+ zhaor4 zh ao4 rr
4005
+ zhaor5 zh ao5 rr
4006
+ zher1 zh e1 rr
4007
+ zher2 zh e2 rr
4008
+ zher3 zh e3 rr
4009
+ zher4 zh e4 rr
4010
+ zher5 zh e5 rr
4011
+ zheir1 zh ei1 rr
4012
+ zheir2 zh ei2 rr
4013
+ zheir3 zh ei3 rr
4014
+ zheir4 zh ei4 rr
4015
+ zheir5 zh ei5 rr
4016
+ zhenr1 zh en1 rr
4017
+ zhenr2 zh en2 rr
4018
+ zhenr3 zh en3 rr
4019
+ zhenr4 zh en4 rr
4020
+ zhenr5 zh en5 rr
4021
+ zhengr1 zh eng1 rr
4022
+ zhengr2 zh eng2 rr
4023
+ zhengr3 zh eng3 rr
4024
+ zhengr4 zh eng4 rr
4025
+ zhengr5 zh eng5 rr
4026
+ zhir1 zh iii1 rr
4027
+ zhir2 zh iii2 rr
4028
+ zhir3 zh iii3 rr
4029
+ zhir4 zh iii4 rr
4030
+ zhir5 zh iii5 rr
4031
+ zhongr1 zh ong1 rr
4032
+ zhongr2 zh ong2 rr
4033
+ zhongr3 zh ong3 rr
4034
+ zhongr4 zh ong4 rr
4035
+ zhongr5 zh ong5 rr
4036
+ zhour1 zh ou1 rr
4037
+ zhour2 zh ou2 rr
4038
+ zhour3 zh ou3 rr
4039
+ zhour4 zh ou4 rr
4040
+ zhour5 zh ou5 rr
4041
+ zhur1 zh u1 rr
4042
+ zhur2 zh u2 rr
4043
+ zhur3 zh u3 rr
4044
+ zhur4 zh u4 rr
4045
+ zhur5 zh u5 rr
4046
+ zhuar1 zh ua1 rr
4047
+ zhuar2 zh ua2 rr
4048
+ zhuar3 zh ua3 rr
4049
+ zhuar4 zh ua4 rr
4050
+ zhuar5 zh ua5 rr
4051
+ zhuair1 zh uai1 rr
4052
+ zhuair2 zh uai2 rr
4053
+ zhuair3 zh uai3 rr
4054
+ zhuair4 zh uai4 rr
4055
+ zhuair5 zh uai5 rr
4056
+ zhuanr1 zh uan1 rr
4057
+ zhuanr2 zh uan2 rr
4058
+ zhuanr3 zh uan3 rr
4059
+ zhuanr4 zh uan4 rr
4060
+ zhuanr5 zh uan5 rr
4061
+ zhuangr1 zh uang1 rr
4062
+ zhuangr2 zh uang2 rr
4063
+ zhuangr3 zh uang3 rr
4064
+ zhuangr4 zh uang4 rr
4065
+ zhuangr5 zh uang5 rr
4066
+ zhuir1 zh uei1 rr
4067
+ zhuir2 zh uei2 rr
4068
+ zhuir3 zh uei3 rr
4069
+ zhuir4 zh uei4 rr
4070
+ zhuir5 zh uei5 rr
4071
+ zhunr1 zh uen1 rr
4072
+ zhunr2 zh uen2 rr
4073
+ zhunr3 zh uen3 rr
4074
+ zhunr4 zh uen4 rr
4075
+ zhunr5 zh uen5 rr
4076
+ zhuor1 zh uo1 rr
4077
+ zhuor2 zh uo2 rr
4078
+ zhuor3 zh uo3 rr
4079
+ zhuor4 zh uo4 rr
4080
+ zhuor5 zh uo5 rr
4081
+ zir1 z ii1 rr
4082
+ zir2 z ii2 rr
4083
+ zir3 z ii3 rr
4084
+ zir4 z ii4 rr
4085
+ zir5 z ii5 rr
4086
+ zongr1 z ong1 rr
4087
+ zongr2 z ong2 rr
4088
+ zongr3 z ong3 rr
4089
+ zongr4 z ong4 rr
4090
+ zongr5 z ong5 rr
4091
+ zour1 z ou1 rr
4092
+ zour2 z ou2 rr
4093
+ zour3 z ou3 rr
4094
+ zour4 z ou4 rr
4095
+ zour5 z ou5 rr
4096
+ zur1 z u1 rr
4097
+ zur2 z u2 rr
4098
+ zur3 z u3 rr
4099
+ zur4 z u4 rr
4100
+ zur5 z u5 rr
4101
+ zuanr1 z uan1 rr
4102
+ zuanr2 z uan2 rr
4103
+ zuanr3 z uan3 rr
4104
+ zuanr4 z uan4 rr
4105
+ zuanr5 z uan5 rr
4106
+ zuir1 z uei1 rr
4107
+ zuir2 z uei2 rr
4108
+ zuir3 z uei3 rr
4109
+ zuir4 z uei4 rr
4110
+ zuir5 z uei5 rr
4111
+ zunr1 z uen1 rr
4112
+ zunr2 z uen2 rr
4113
+ zunr3 z uen3 rr
4114
+ zunr4 z uen4 rr
4115
+ zunr5 z uen5 rr
4116
+ zuor1 z uo1 rr
4117
+ zuor2 z uo2 rr
4118
+ zuor3 z uo3 rr
4119
+ zuor4 z uo4 rr
4120
+ zuor5 z uo5 rr
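
Every entry above follows the same lexicon convention: an erhua ("r"-suffixed) pinyin syllable with a tone digit (1-5), followed by its phone sequence of initial, tonal final, and the retroflex marker `rr`. A minimal parsing sketch (the helper name is hypothetical, not part of this upload):

```python
# Hypothetical helper: parse one pinyin-lexicon-r.txt line into
# (syllable, phones). Fields are whitespace-separated: the first is the
# pinyin syllable, the rest are its phones (initial, tonal final, "rr").
def parse_lexicon_line(line: str):
    fields = line.split()
    return fields[0], fields[1:]

syllable, phones = parse_lexicon_line("zuor5 z uo5 rr")
assert syllable == "zuor5"
assert phones == ["z", "uo5", "rr"]
```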
text/numbers.py ADDED
@@ -0,0 +1,77 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+
8
+ import inflect
9
+ import re
10
+
11
+ _inflect = inflect.engine()
12
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
13
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
14
+ _pounds_re = re.compile(r"Β£([0-9\,]*[0-9]+)")
15
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
16
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
17
+ _number_re = re.compile(r"[0-9]+")
18
+
19
+
20
+ def _remove_commas(m):
21
+ return m.group(1).replace(",", "")
22
+
23
+
24
+ def _expand_decimal_point(m):
25
+ return m.group(1).replace(".", " point ")
26
+
27
+
28
+ def _expand_dollars(m):
29
+ match = m.group(1)
30
+ parts = match.split(".")
31
+ if len(parts) > 2:
32
+ return match + " dollars" # Unexpected format
33
+ dollars = int(parts[0]) if parts[0] else 0
34
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
35
+ if dollars and cents:
36
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
37
+ cent_unit = "cent" if cents == 1 else "cents"
38
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
39
+ elif dollars:
40
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
41
+ return "%s %s" % (dollars, dollar_unit)
42
+ elif cents:
43
+ cent_unit = "cent" if cents == 1 else "cents"
44
+ return "%s %s" % (cents, cent_unit)
45
+ else:
46
+ return "zero dollars"
47
+
48
+
49
+ def _expand_ordinal(m):
50
+ return _inflect.number_to_words(m.group(0))
51
+
52
+
53
+ def _expand_number(m):
54
+ num = int(m.group(0))
55
+ if num > 1000 and num < 3000:
56
+ if num == 2000:
57
+ return "two thousand"
58
+ elif num > 2000 and num < 2010:
59
+ return "two thousand " + _inflect.number_to_words(num % 100)
60
+ elif num % 100 == 0:
61
+ return _inflect.number_to_words(num // 100) + " hundred"
62
+ else:
63
+ return _inflect.number_to_words(
64
+ num, andword="", zero="oh", group=2
65
+ ).replace(", ", " ")
66
+ else:
67
+ return _inflect.number_to_words(num, andword="")
68
+
69
+
70
+ def normalize_numbers(text):
71
+ text = re.sub(_comma_number_re, _remove_commas, text)
72
+ text = re.sub(_pounds_re, r"\1 pounds", text)
73
+ text = re.sub(_dollars_re, _expand_dollars, text)
74
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
75
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
76
+ text = re.sub(_number_re, _expand_number, text)
77
+ return text
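
A quick sanity check of `normalize_numbers` as defined above (an illustrative sketch; it assumes the file is importable as `text.numbers` and that the `inflect` package is installed):

```python
from text.numbers import normalize_numbers

# Commas are stripped, currency is verbalized, then remaining digits
# are spelled out by inflect.
print(normalize_numbers("I paid $3.50 for 2,000 stickers in 1999."))
# -> "I paid three dollars, fifty cents for two thousand stickers in nineteen ninety-nine."
```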
text/pinyin.py ADDED
@@ -0,0 +1,218 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ initials = [
7
+ "b",
8
+ "c",
9
+ "ch",
10
+ "d",
11
+ "f",
12
+ "g",
13
+ "h",
14
+ "j",
15
+ "k",
16
+ "l",
17
+ "m",
18
+ "n",
19
+ "p",
20
+ "q",
21
+ "r",
22
+ "s",
23
+ "sh",
24
+ "t",
25
+ "w",
26
+ "x",
27
+ "y",
28
+ "z",
29
+ "zh",
30
+ ]
31
+ finals = [
32
+ "a1",
33
+ "a2",
34
+ "a3",
35
+ "a4",
36
+ "a5",
37
+ "ai1",
38
+ "ai2",
39
+ "ai3",
40
+ "ai4",
41
+ "ai5",
42
+ "an1",
43
+ "an2",
44
+ "an3",
45
+ "an4",
46
+ "an5",
47
+ "ang1",
48
+ "ang2",
49
+ "ang3",
50
+ "ang4",
51
+ "ang5",
52
+ "ao1",
53
+ "ao2",
54
+ "ao3",
55
+ "ao4",
56
+ "ao5",
57
+ "e1",
58
+ "e2",
59
+ "e3",
60
+ "e4",
61
+ "e5",
62
+ "ei1",
63
+ "ei2",
64
+ "ei3",
65
+ "ei4",
66
+ "ei5",
67
+ "en1",
68
+ "en2",
69
+ "en3",
70
+ "en4",
71
+ "en5",
72
+ "eng1",
73
+ "eng2",
74
+ "eng3",
75
+ "eng4",
76
+ "eng5",
77
+ "er1",
78
+ "er2",
79
+ "er3",
80
+ "er4",
81
+ "er5",
82
+ "i1",
83
+ "i2",
84
+ "i3",
85
+ "i4",
86
+ "i5",
87
+ "ia1",
88
+ "ia2",
89
+ "ia3",
90
+ "ia4",
91
+ "ia5",
92
+ "ian1",
93
+ "ian2",
94
+ "ian3",
95
+ "ian4",
96
+ "ian5",
97
+ "iang1",
98
+ "iang2",
99
+ "iang3",
100
+ "iang4",
101
+ "iang5",
102
+ "iao1",
103
+ "iao2",
104
+ "iao3",
105
+ "iao4",
106
+ "iao5",
107
+ "ie1",
108
+ "ie2",
109
+ "ie3",
110
+ "ie4",
111
+ "ie5",
112
+ "ii1",
113
+ "ii2",
114
+ "ii3",
115
+ "ii4",
116
+ "ii5",
117
+ "iii1",
118
+ "iii2",
119
+ "iii3",
120
+ "iii4",
121
+ "iii5",
122
+ "in1",
123
+ "in2",
124
+ "in3",
125
+ "in4",
126
+ "in5",
127
+ "ing1",
128
+ "ing2",
129
+ "ing3",
130
+ "ing4",
131
+ "ing5",
132
+ "iong1",
133
+ "iong2",
134
+ "iong3",
135
+ "iong4",
136
+ "iong5",
137
+ "iou1",
138
+ "iou2",
139
+ "iou3",
140
+ "iou4",
141
+ "iou5",
142
+ "o1",
143
+ "o2",
144
+ "o3",
145
+ "o4",
146
+ "o5",
147
+ "ong1",
148
+ "ong2",
149
+ "ong3",
150
+ "ong4",
151
+ "ong5",
152
+ "ou1",
153
+ "ou2",
154
+ "ou3",
155
+ "ou4",
156
+ "ou5",
157
+ "u1",
158
+ "u2",
159
+ "u3",
160
+ "u4",
161
+ "u5",
162
+ "ua1",
163
+ "ua2",
164
+ "ua3",
165
+ "ua4",
166
+ "ua5",
167
+ "uai1",
168
+ "uai2",
169
+ "uai3",
170
+ "uai4",
171
+ "uai5",
172
+ "uan1",
173
+ "uan2",
174
+ "uan3",
175
+ "uan4",
176
+ "uan5",
177
+ "uang1",
178
+ "uang2",
179
+ "uang3",
180
+ "uang4",
181
+ "uang5",
182
+ "uei1",
183
+ "uei2",
184
+ "uei3",
185
+ "uei4",
186
+ "uei5",
187
+ "uen1",
188
+ "uen2",
189
+ "uen3",
190
+ "uen4",
191
+ "uen5",
192
+ "uo1",
193
+ "uo2",
194
+ "uo3",
195
+ "uo4",
196
+ "uo5",
197
+ "v1",
198
+ "v2",
199
+ "v3",
200
+ "v4",
201
+ "v5",
202
+ "van1",
203
+ "van2",
204
+ "van3",
205
+ "van4",
206
+ "van5",
207
+ "ve1",
208
+ "ve2",
209
+ "ve3",
210
+ "ve4",
211
+ "ve5",
212
+ "vn1",
213
+ "vn2",
214
+ "vn3",
215
+ "vn4",
216
+ "vn5",
217
+ ]
218
+ valid_symbols = initials + finals + ["rr"]
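
Since every syllable in the lexicon above is spelled with these inventories, validating a lexicon entry is straightforward; a small sketch:

```python
from text.pinyin import valid_symbols

# Phones from the pinyin-lexicon-r.txt entry "shuangr1 sh uang1 rr".
phones = ["sh", "uang1", "rr"]
assert all(p in valid_symbols for p in phones)
```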
text/symbol_table.py ADDED
@@ -0,0 +1,292 @@
1
+ # Copyright 2020 Mobvoi Inc. (authors: Fangjun Kuang)
2
+ #
3
+ # See ../../../LICENSE for clarification regarding multiple authors
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from dataclasses import dataclass
18
+ from dataclasses import field
19
+ from typing import Dict
20
+ from typing import Generic
21
+ from typing import List
22
+ from typing import Optional
23
+ from typing import TypeVar
24
+ from typing import Union
25
+
26
+ Symbol = TypeVar("Symbol")
27
+
28
+ # SymbolTable is copied from
29
+ # https://github.com/k2-fsa/k2/blob/master/k2/python/k2/symbol_table.py
30
+
31
+ """
32
+ SymbolTable: map symbol to id
33
+ """
34
+
35
+
36
+ @dataclass(repr=False)
37
+ class SymbolTable(Generic[Symbol]):
38
+ """SymbolTable that maps symbol IDs, found on the FSA arcs to
39
+ actual objects. These objects can be arbitrary Python objects
40
+ that can serve as keys in a dictionary (i.e. they need to be
41
+ hashable and immutable).
42
+
43
+ The SymbolTable can only be read from/written to disk if the
44
+ symbols are strings.
45
+ """
46
+
47
+ _id2sym: Dict[int, Symbol] = field(default_factory=dict)
48
+ """Map an integer to a symbol.
49
+ """
50
+
51
+ _sym2id: Dict[Symbol, int] = field(default_factory=dict)
52
+ """Map a symbol to an integer.
53
+ """
54
+
55
+ _next_available_id: int = 1
56
+ """A helper internal field that helps adding new symbols
57
+ to the table efficiently.
58
+ """
59
+
60
+ eps: Symbol = "<eps>"
61
+ """Null symbol, always mapped to index 0.
62
+ """
63
+
64
+ def __post_init__(self):
65
+ assert all(self._sym2id[sym] == idx for idx, sym in self._id2sym.items())
66
+ assert all(self._id2sym[idx] == sym for sym, idx in self._sym2id.items())
67
+ assert 0 not in self._id2sym or self._id2sym[0] == self.eps
68
+
69
+ self._next_available_id = max(self._id2sym, default=0) + 1
70
+ self._id2sym.setdefault(0, self.eps)
71
+ self._sym2id.setdefault(self.eps, 0)
72
+
73
+ @staticmethod
74
+ def from_str(s: str) -> "SymbolTable":
75
+ """Build a symbol table from a string.
76
+
77
+ The string consists of lines. Every line has two fields separated
78
+ by space(s), tab(s) or both. The first field is the symbol and the
79
+ second the integer id of the symbol.
80
+
81
+ Args:
82
+ s:
83
+ The input string with the format described above.
84
+ Returns:
85
+ An instance of :class:`SymbolTable`.
86
+ """
87
+ id2sym: Dict[int, str] = dict()
88
+ sym2id: Dict[str, int] = dict()
89
+
90
+ for line in s.split("\n"):
91
+ fields = line.split()
92
+ if len(fields) == 0:
93
+ continue # skip empty lines
94
+ assert (
95
+ len(fields) == 2
96
+ ), f"Expect a line with 2 fields. Given: {len(fields)}"
97
+ sym, idx = fields[0], int(fields[1])
98
+ assert sym not in sym2id, f"Duplicated symbol {sym}"
99
+ assert idx not in id2sym, f"Duplicated id {idx}"
100
+ id2sym[idx] = sym
101
+ sym2id[sym] = idx
102
+
103
+ eps = id2sym.get(0, "<eps>")
104
+
105
+ return SymbolTable(_id2sym=id2sym, _sym2id=sym2id, eps=eps)
106
+
107
+ @staticmethod
108
+ def from_file(filename: str) -> "SymbolTable":
109
+ """Build a symbol table from file.
110
+
111
+ Every line in the symbol table file has two fields separated by
112
+ space(s), tab(s) or both. The following is an example file:
113
+
114
+ .. code-block::
115
+
116
+ <eps> 0
117
+ a 1
118
+ b 2
119
+ c 3
120
+
121
+ Args:
122
+ filename:
123
+ Name of the symbol table file. Its format is documented above.
124
+
125
+ Returns:
126
+ An instance of :class:`SymbolTable`.
127
+
128
+ """
129
+ with open(filename, "r", encoding="utf-8") as f:
130
+ return SymbolTable.from_str(f.read().strip())
131
+
132
+ def to_str(self) -> str:
133
+ """
134
+ Returns:
135
+ Return a string representation of this object. You can pass
136
+ it to the method ``from_str`` to recreate an identical object.
137
+ """
138
+ s = ""
139
+ for idx, symbol in sorted(self._id2sym.items()):
140
+ s += f"{symbol} {idx}\n"
141
+ return s
142
+
143
+ def to_file(self, filename: str):
144
+ """Serialize the SymbolTable to a file.
145
+
146
+ Every line in the symbol table file has two fields separated by
147
+ space(s), tab(s) or both. The following is an example file:
148
+
149
+ .. code-block::
150
+
151
+ <eps> 0
152
+ a 1
153
+ b 2
154
+ c 3
155
+
156
+ Args:
157
+ filename:
158
+ Name of the symbol table file. Its format is documented above.
159
+ """
160
+ with open(filename, "w") as f:
161
+ for idx, symbol in sorted(self._id2sym.items()):
162
+ print(symbol, idx, file=f)
163
+
164
+ def add(self, symbol: Symbol, index: Optional[int] = None) -> int:
165
+ """Add a new symbol to the SymbolTable.
166
+
167
+ Args:
168
+ symbol:
169
+ The symbol to be added.
170
+ index:
171
+ Optional int id to which the symbol should be assigned.
172
+ If it is not available, a ValueError will be raised.
173
+
174
+ Returns:
175
+ The int id to which the symbol has been assigned.
176
+ """
177
+ # Already in the table? Return its ID.
178
+ if symbol in self._sym2id:
179
+ return self._sym2id[symbol]
180
+ # Specific ID not provided - use next available.
181
+ if index is None:
182
+ index = self._next_available_id
183
+ # Specific ID provided but not available.
184
+ if index in self._id2sym:
185
+ raise ValueError(
186
+ f"Cannot assign id '{index}' to '{symbol}' - "
187
+ f"already occupied by {self._id2sym[index]}"
188
+ )
189
+ self._sym2id[symbol] = index
190
+ self._id2sym[index] = symbol
191
+
192
+ # Update next available ID if needed
193
+ if self._next_available_id <= index:
194
+ self._next_available_id = index + 1
195
+
196
+ return index
197
+
198
+ def get(self, k: Union[int, Symbol]) -> Union[Symbol, int]:
199
+ """Get a symbol for an id or get an id for a symbol
200
+
201
+ Args:
202
+ k:
203
+ If it is an id, it tries to find the symbol corresponding
204
+ to the id; if it is a symbol, it tries to find the id
205
+ corresponding to the symbol.
206
+
207
+ Returns:
208
+ An id or a symbol depending on the given `k`.
209
+ """
210
+ if isinstance(k, int):
211
+ return self._id2sym[k]
212
+ else:
213
+ return self._sym2id[k]
214
+
215
+ def merge(self, other: "SymbolTable") -> "SymbolTable":
216
+ """Create a union of two SymbolTables.
217
+ Raises an AssertionError if the same IDs are occupied by
218
+ different symbols.
219
+
220
+ Args:
221
+ other:
222
+ A symbol table to merge with ``self``.
223
+
224
+ Returns:
225
+ A new symbol table.
226
+ """
227
+ self._check_compatible(other)
228
+ return SymbolTable(
229
+ _id2sym={**self._id2sym, **other._id2sym},
230
+ _sym2id={**self._sym2id, **other._sym2id},
231
+ eps=self.eps,
232
+ )
233
+
234
+ def _check_compatible(self, other: "SymbolTable") -> None:
235
+ # Epsilon compatibility
236
+ assert self.eps == other.eps, (
237
+ f"Mismatched epsilon symbol: " f"{self.eps} != {other.eps}"
238
+ )
239
+ # IDs compatibility
240
+ common_ids = set(self._id2sym).intersection(other._id2sym)
241
+ for idx in common_ids:
242
+ assert self[idx] == other[idx], (
243
+ f"ID conflict for id: {idx}, "
244
+ f'self[idx] = "{self[idx]}", '
245
+ f'other[idx] = "{other[idx]}"'
246
+ )
247
+ # Symbols compatibility
248
+ common_symbols = set(self._sym2id).intersection(other._sym2id)
249
+ for sym in common_symbols:
250
+ assert self[sym] == other[sym], (
251
+ f"ID conflict for id: {sym}, "
252
+ f'self[sym] = "{self[sym]}", '
253
+ f'other[sym] = "{other[sym]}"'
254
+ )
255
+
256
+ def __getitem__(self, item: Union[int, Symbol]) -> Union[Symbol, int]:
257
+ return self.get(item)
258
+
259
+ def __contains__(self, item: Union[int, Symbol]) -> bool:
260
+ if isinstance(item, int):
261
+ return item in self._id2sym
262
+ else:
263
+ return item in self._sym2id
264
+
265
+ def __len__(self) -> int:
266
+ return len(self._id2sym)
267
+
268
+ def __eq__(self, other: "SymbolTable") -> bool:
269
+ if len(self) != len(other):
270
+ return False
271
+
272
+ for s in self.symbols:
273
+ if self[s] != other[s]:
274
+ return False
275
+
276
+ return True
277
+
278
+ @property
279
+ def ids(self) -> List[int]:
280
+ """Returns a list of integer IDs corresponding to the symbols."""
281
+ ans = list(self._id2sym.keys())
282
+ ans.sort()
283
+ return ans
284
+
285
+ @property
286
+ def symbols(self) -> List[Symbol]:
287
+ """Returns a list of symbols (e.g., strings) corresponding to
288
+ the integer IDs.
289
+ """
290
+ ans = list(self._sym2id.keys())
291
+ ans.sort()
292
+ return ans
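
A brief usage sketch of the class above; `from_str`, `add` with automatic id assignment, and `merge` behave as documented:

```python
from text.symbol_table import SymbolTable

table = SymbolTable.from_str("<eps> 0\na 1\nb 2")
assert table["a"] == 1 and table[2] == "b"   # symbol <-> id lookup
assert "a" in table and 0 in table

assert table.add("c") == 3                   # next available id
merged = table.merge(SymbolTable.from_str("<eps> 0\na 1"))
assert merged.symbols == ["<eps>", "a", "b", "c"]
```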
text/symbols.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+
8
+ """
9
+ Defines the set of symbols used in text input to the model.
10
+
11
+ The default is a set of ASCII characters that works well for English or
+ text that has been run through Unidecode. For other data, you can modify
+ _letters below. See TRAINING_DATA.md for details.
+ """
12
+
13
+ from text import cmudict, pinyin
14
+
15
+ _pad = "_"
16
+ _punctuation = "!'(),.:;? "
17
+ _special = "-"
18
+ _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
19
+ _silences = ["@sp", "@spn", "@sil"]
20
+
21
+ # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
22
+ _arpabet = ["@" + s for s in cmudict.valid_symbols]
23
+ _pinyin = ["@" + s for s in pinyin.valid_symbols]
24
+
25
+ # Export all symbols:
26
+ symbols = (
27
+ [_pad]
28
+ + list(_special)
29
+ + list(_punctuation)
30
+ + list(_letters)
31
+ + _arpabet
32
+ + _silences
33
+ # + _pinyin # for chinese
34
+ )
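
Downstream text front-ends typically derive the symbol-to-id mapping from this list; a minimal sketch (assuming the `cmudict` module in this upload provides the standard ARPAbet inventory with stress markers):

```python
from text.symbols import symbols

symbol_to_id = {s: i for i, s in enumerate(symbols)}
assert symbol_to_id["_"] == 0        # "_" is the padding symbol
assert "@AA1" in symbol_to_id        # ARPAbet symbols are "@"-prefixed
assert "@sp" in symbol_to_id         # silence tokens come last
```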
text/text_token_collation.py ADDED
@@ -0,0 +1,123 @@
1
+ # Copyright (c) 2023 Amphion.
2
+
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from pathlib import Path
7
+ from typing import List, Tuple
8
+ import os
9
+ import numpy as np
10
+ import torch
11
+ from text.symbol_table import SymbolTable
12
+ from text import text_to_sequence
13
+
14
+
15
+ """
16
+ TextToken: map text to id
17
+ """
18
+
19
+
20
+ # TextTokenCollator is modified from
21
+ # https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/data/collation.py
22
+ class TextTokenCollator:
23
+ def __init__(
24
+ self,
25
+ text_tokens: List[str],
26
+ add_eos: bool = True,
27
+ add_bos: bool = True,
28
+ pad_symbol: str = "<pad>",
29
+ bos_symbol: str = "<bos>",
30
+ eos_symbol: str = "<eos>",
31
+ ):
32
+ self.pad_symbol = pad_symbol
33
+ self.add_eos = add_eos
34
+ self.add_bos = add_bos
35
+ self.bos_symbol = bos_symbol
36
+ self.eos_symbol = eos_symbol
37
+
38
+ unique_tokens = [pad_symbol]
39
+ if add_bos:
40
+ unique_tokens.append(bos_symbol)
41
+ if add_eos:
42
+ unique_tokens.append(eos_symbol)
43
+ unique_tokens.extend(sorted(text_tokens))
44
+
45
+ self.token2idx = {token: idx for idx, token in enumerate(unique_tokens)}
46
+ self.idx2token = unique_tokens
47
+
48
+ def index(self, tokens_list: List[str]) -> Tuple[torch.Tensor, torch.Tensor]:
49
+ seqs, seq_lens = [], []
50
+ for tokens in tokens_list:
51
+ assert all(s in self.token2idx for s in tokens)
52
+ seq = (
53
+ ([self.bos_symbol] if self.add_bos else [])
54
+ + list(tokens)
55
+ + ([self.eos_symbol] if self.add_eos else [])
56
+ )
57
+ seqs.append(seq)
58
+ seq_lens.append(len(seq))
59
+
60
+ max_len = max(seq_lens)
61
+ for k, (seq, seq_len) in enumerate(zip(seqs, seq_lens)):
62
+ seq.extend([self.pad_symbol] * (max_len - seq_len))
63
+
64
+ tokens = torch.from_numpy(
65
+ np.array(
66
+ [[self.token2idx[token] for token in seq] for seq in seqs],
67
+ dtype=np.int64,
68
+ )
69
+ )
70
+ tokens_lens = torch.IntTensor(seq_lens)
71
+
72
+ return tokens, tokens_lens
73
+
74
+ def __call__(self, text):
75
+ tokens_seq = [p for p in text]
76
+ seq = (
77
+ ([self.bos_symbol] if self.add_bos else [])
78
+ + tokens_seq
79
+ + ([self.eos_symbol] if self.add_eos else [])
80
+ )
81
+
82
+ token_ids = [self.token2idx[token] for token in seq]
83
+ token_lens = len(tokens_seq) + self.add_eos + self.add_bos
84
+
85
+ return token_ids, token_lens
86
+
87
+
88
+ def get_text_token_collater(text_tokens_file: str) -> TextTokenCollator:
89
+ text_tokens_path = Path(text_tokens_file)
90
+ unique_tokens = SymbolTable.from_file(text_tokens_path)
91
+ collater = TextTokenCollator(unique_tokens.symbols, add_bos=True, add_eos=True)
92
+ token2idx = collater.token2idx
93
+ return collater, token2idx
94
+
95
+
96
+ class phoneIDCollation:
97
+ def __init__(self, cfg, dataset=None, symbols_dict_file=None) -> None:
98
+ if cfg.preprocess.phone_extractor != "lexicon":
99
+ ### get text token collator
100
+ if symbols_dict_file is None:
101
+ assert dataset is not None
102
+ symbols_dict_file = os.path.join(
103
+ cfg.preprocess.processed_dir, dataset, cfg.preprocess.symbols_dict
104
+ )
105
+ self.text_token_collator, token2idx = get_text_token_collater(
106
+ symbols_dict_file
107
+ )
108
+ # # unique_tokens = SymbolTable.from_file(symbols_dict_path)
109
+ # # text_tokenizer = TextToken(unique_tokens.symbols, add_bos=True, add_eos=True)
110
+
111
+ # # update phone symbols dict file with pad_symbol or optional tokens (add_bos and add_eos) in TextTokenCollator
112
+ # phone_symbol_dict = SymbolTable()
113
+ # for s in sorted(list(set(token2idx.keys()))):
114
+ # phone_symbol_dict.add(s)
115
+ # phone_symbol_dict.to_file(symbols_dict_file)
116
+
117
+ def get_phone_id_sequence(self, cfg, phones_seq):
118
+ if cfg.preprocess.phone_extractor == "lexicon":
119
+ phones_seq = " ".join(phones_seq)
120
+ sequence = text_to_sequence(phones_seq, cfg.preprocess.text_cleaners)
121
+ else:
122
+ sequence, seq_len = self.text_token_collator(phones_seq)
123
+ return sequence
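
A minimal sketch of the collator on a toy token inventory; ids 0-2 go to `<pad>`, `<bos>`, `<eos>`, and the remaining tokens are sorted:

```python
from text.text_token_collation import TextTokenCollator

collator = TextTokenCollator(["a", "b", "c"], add_bos=True, add_eos=True)

# Single sequence: <pad>=0, <bos>=1, <eos>=2, a=3, b=4, c=5
token_ids, token_len = collator(["a", "c"])
assert token_ids == [1, 3, 5, 2] and token_len == 4

# Batched: index() pads every sequence to the longest in the batch.
tokens, tokens_lens = collator.index([["a"], ["a", "b", "c"]])
assert tokens.shape == (2, 5) and tokens_lens.tolist() == [3, 5]
```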
utils/HyperParams/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .hps import HyperParams
utils/HyperParams/hps.py ADDED
@@ -0,0 +1,43 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ class HyperParams:
8
+ """The class to store hyperparameters. The key is case-insensitive.
9
+
10
+ Args:
11
+ **kwargs: key-value pairs of hyperparameters; dict values are
+ converted to nested HyperParams instances.
13
+ """
14
+
15
+ def __init__(self, **kwargs):
16
+ for k, v in kwargs.items():
17
+ if isinstance(v, dict):
18
+ v = HyperParams(**v)
19
+ self[k] = v
20
+
21
+ def keys(self):
22
+ return self.__dict__.keys()
23
+
24
+ def items(self):
25
+ return self.__dict__.items()
26
+
27
+ def values(self):
28
+ return self.__dict__.values()
29
+
30
+ def __len__(self):
31
+ return len(self.__dict__)
32
+
33
+ def __getitem__(self, key):
34
+ return getattr(self, key)
35
+
36
+ def __setitem__(self, key, value):
37
+ return setattr(self, key, value)
38
+
39
+ def __contains__(self, key):
40
+ return key in self.__dict__
41
+
42
+ def __repr__(self):
43
+ return self.__dict__.__repr__()
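
A short usage sketch of the container above:

```python
from utils.HyperParams import HyperParams

hps = HyperParams(lr=1e-4, model={"hidden_size": 256})
assert hps.lr == 1e-4                     # attribute-style access
assert hps["model"].hidden_size == 256    # nested dict -> HyperParams
assert "model" in hps and len(hps) == 2
print(hps)                                # repr of the underlying dict
```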
utils/__init__.py ADDED
File without changes
utils/audio.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import numpy as np
8
+ from numpy import linalg as LA
9
+ import librosa
10
+ import soundfile as sf
11
+ import librosa.filters
12
+
13
+
14
+ def load_audio_torch(wave_file, fs):
15
+ """Load audio data into torch tensor
16
+
17
+ Args:
18
+ wave_file (str): path to wave file
19
+ fs (int): sample rate
20
+
21
+ Returns:
22
+ audio (tensor): audio data in tensor
23
+ fs (int): sample rate
24
+ """
25
+
26
+ audio, sample_rate = librosa.load(wave_file, sr=fs, mono=True)
27
+ # audio: (T,)
28
+ assert len(audio) > 2
29
+
30
+ # Check the audio dtype (for the soundfile loading backend): float, 8-bit, or 16-bit
31
+ if np.issubdtype(audio.dtype, np.integer):
32
+ max_mag = -np.iinfo(audio.dtype).min
33
+ else:
34
+ max_mag = max(np.amax(audio), -np.amin(audio))
35
+ max_mag = (
36
+ (2**31) + 1
37
+ if max_mag > (2**15)
38
+ else ((2**15) + 1 if max_mag > 1.01 else 1.0)
39
+ )
40
+
41
+ # Normalize the audio
42
+ audio = torch.FloatTensor(audio.astype(np.float32)) / max_mag
43
+
44
+ if (torch.isnan(audio) | torch.isinf(audio)).any():
45
+ return [], sample_rate or fs or 48000
46
+
47
+ # Resample the audio to our target samplerate
48
+ if fs is not None and fs != sample_rate:
49
+ audio = torch.from_numpy(
50
+ librosa.core.resample(audio.numpy(), orig_sr=sample_rate, target_sr=fs)
51
+ )
52
+ sample_rate = fs
53
+
54
+ return audio, fs
55
+
56
+
57
+ def _stft(y, cfg):
58
+ return librosa.stft(
59
+ y=y, n_fft=cfg.n_fft, hop_length=cfg.hop_size, win_length=cfg.win_size
60
+ )
61
+
62
+
63
+ def energy(wav, cfg):
64
+ D = _stft(wav, cfg)
65
+ magnitudes = np.abs(D).T # [T, F]
66
+ return LA.norm(magnitudes, axis=1)
67
+
68
+
69
+ def get_energy_from_tacotron(audio, _stft):
70
+ audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
71
+ audio = torch.autograd.Variable(audio, requires_grad=False)
72
+ mel, energy = _stft.mel_spectrogram(audio)
73
+ energy = torch.squeeze(energy, 0).numpy().astype(np.float32)
74
+ return mel, energy
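
A sketch of how these helpers fit together; the file path and the `cfg` stand-in (a hypothetical object with the `n_fft`/`hop_size`/`win_size` fields `_stft` reads) are illustrative assumptions:

```python
from utils.audio import load_audio_torch, energy

class Cfg:  # hypothetical config with the fields _stft() expects
    n_fft, hop_size, win_size = 1024, 256, 1024

audio, fs = load_audio_torch("example.wav", 16000)  # placeholder path
frame_energy = energy(audio.numpy(), Cfg)           # one value per frame
print(frame_energy.shape)
```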
utils/audio_slicer.py ADDED
@@ -0,0 +1,476 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+ import torch
11
+ import torchaudio
12
+
13
+ from utils.io import save_audio
14
+ from utils.audio import load_audio_torch
15
+
16
+
17
+ # This function is obtained from librosa.
18
+ def get_rms(
19
+ y,
20
+ *,
21
+ frame_length=2048,
22
+ hop_length=512,
23
+ pad_mode="constant",
24
+ ):
25
+ padding = (int(frame_length // 2), int(frame_length // 2))
26
+ y = np.pad(y, padding, mode=pad_mode)
27
+
28
+ axis = -1
29
+ # put our new within-frame axis at the end for now
30
+ out_strides = y.strides + tuple([y.strides[axis]])
31
+ # Reduce the shape on the framing axis
32
+ x_shape_trimmed = list(y.shape)
33
+ x_shape_trimmed[axis] -= frame_length - 1
34
+ out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
35
+ xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
36
+ if axis < 0:
37
+ target_axis = axis - 1
38
+ else:
39
+ target_axis = axis + 1
40
+ xw = np.moveaxis(xw, -1, target_axis)
41
+ # Downsample along the target axis
42
+ slices = [slice(None)] * xw.ndim
43
+ slices[axis] = slice(0, None, hop_length)
44
+ x = xw[tuple(slices)]
45
+
46
+ # Calculate power
47
+ power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
48
+
49
+ return np.sqrt(power)
50
+
51
+
52
+ class Slicer:
53
+ """
54
+ Copied from: https://github.com/openvpi/audio-slicer/blob/main/slicer2.py
55
+ """
56
+
57
+ def __init__(
58
+ self,
59
+ sr: int,
60
+ threshold: float = -40.0,
61
+ min_length: int = 5000,
62
+ min_interval: int = 300,
63
+ hop_size: int = 10,
64
+ max_sil_kept: int = 5000,
65
+ ):
66
+ if not min_length >= min_interval >= hop_size:
67
+ raise ValueError(
68
+ "The following condition must be satisfied: min_length >= min_interval >= hop_size"
69
+ )
70
+ if not max_sil_kept >= hop_size:
71
+ raise ValueError(
72
+ "The following condition must be satisfied: max_sil_kept >= hop_size"
73
+ )
74
+ min_interval = sr * min_interval / 1000
75
+ self.threshold = 10 ** (threshold / 20.0)
76
+ self.hop_size = round(sr * hop_size / 1000)
77
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
78
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
79
+ self.min_interval = round(min_interval / self.hop_size)
80
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
81
+
82
+ def _apply_slice(self, waveform, begin, end):
83
+ begin = begin * self.hop_size
84
+ if len(waveform.shape) > 1:
85
+ end = min(waveform.shape[1], end * self.hop_size)
86
+ return waveform[:, begin:end], begin, end
87
+ else:
88
+ end = min(waveform.shape[0], end * self.hop_size)
89
+ return waveform[begin:end], begin, end
90
+
91
+ # @timeit
92
+ def slice(self, waveform, return_chunks_positions=False):
93
+ if len(waveform.shape) > 1:
94
+ # (#channel, wave_len) -> (wave_len)
95
+ samples = waveform.mean(axis=0)
96
+ else:
97
+ samples = waveform
98
+ if samples.shape[0] <= self.min_length:
99
+ # Keep the return shape consistent with the return_chunks_positions flag.
+ return ([waveform], [(0, samples.shape[0])]) if return_chunks_positions else [waveform]
100
+ rms_list = get_rms(
101
+ y=samples, frame_length=self.win_size, hop_length=self.hop_size
102
+ ).squeeze(0)
103
+ sil_tags = []
104
+ silence_start = None
105
+ clip_start = 0
106
+ for i, rms in enumerate(rms_list):
107
+ # Keep looping while frame is silent.
108
+ if rms < self.threshold:
109
+ # Record start of silent frames.
110
+ if silence_start is None:
111
+ silence_start = i
112
+ continue
113
+ # Keep looping while frame is not silent and silence start has not been recorded.
114
+ if silence_start is None:
115
+ continue
116
+ # Clear recorded silence start if interval is not enough or clip is too short
117
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
118
+ need_slice_middle = (
119
+ i - silence_start >= self.min_interval
120
+ and i - clip_start >= self.min_length
121
+ )
122
+ if not is_leading_silence and not need_slice_middle:
123
+ silence_start = None
124
+ continue
125
+ # Need slicing. Record the range of silent frames to be removed.
126
+ if i - silence_start <= self.max_sil_kept:
127
+ pos = rms_list[silence_start : i + 1].argmin() + silence_start
128
+ if silence_start == 0:
129
+ sil_tags.append((0, pos))
130
+ else:
131
+ sil_tags.append((pos, pos))
132
+ clip_start = pos
133
+ elif i - silence_start <= self.max_sil_kept * 2:
134
+ pos = rms_list[
135
+ i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
136
+ ].argmin()
137
+ pos += i - self.max_sil_kept
138
+ pos_l = (
139
+ rms_list[
140
+ silence_start : silence_start + self.max_sil_kept + 1
141
+ ].argmin()
142
+ + silence_start
143
+ )
144
+ pos_r = (
145
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
146
+ + i
147
+ - self.max_sil_kept
148
+ )
149
+ if silence_start == 0:
150
+ sil_tags.append((0, pos_r))
151
+ clip_start = pos_r
152
+ else:
153
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
154
+ clip_start = max(pos_r, pos)
155
+ else:
156
+ pos_l = (
157
+ rms_list[
158
+ silence_start : silence_start + self.max_sil_kept + 1
159
+ ].argmin()
160
+ + silence_start
161
+ )
162
+ pos_r = (
163
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
164
+ + i
165
+ - self.max_sil_kept
166
+ )
167
+ if silence_start == 0:
168
+ sil_tags.append((0, pos_r))
169
+ else:
170
+ sil_tags.append((pos_l, pos_r))
171
+ clip_start = pos_r
172
+ silence_start = None
173
+ # Deal with trailing silence.
174
+ total_frames = rms_list.shape[0]
175
+ if (
176
+ silence_start is not None
177
+ and total_frames - silence_start >= self.min_interval
178
+ ):
179
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
180
+ pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
181
+ sil_tags.append((pos, total_frames + 1))
182
+ # Apply and return slices.
183
+ if len(sil_tags) == 0:
184
+ # Keep the return shape consistent with the return_chunks_positions flag.
+ return ([waveform], [(0, samples.shape[0])]) if return_chunks_positions else [waveform]
185
+ else:
186
+ chunks = []
187
+ chunks_pos_of_waveform = []
188
+
189
+ if sil_tags[0][0] > 0:
190
+ chunk, begin, end = self._apply_slice(waveform, 0, sil_tags[0][0])
191
+ chunks.append(chunk)
192
+ chunks_pos_of_waveform.append((begin, end))
193
+
194
+ for i in range(len(sil_tags) - 1):
195
+ chunk, begin, end = self._apply_slice(
196
+ waveform, sil_tags[i][1], sil_tags[i + 1][0]
197
+ )
198
+ chunks.append(chunk)
199
+ chunks_pos_of_waveform.append((begin, end))
200
+
201
+ if sil_tags[-1][1] < total_frames:
202
+ chunk, begin, end = self._apply_slice(
203
+ waveform, sil_tags[-1][1], total_frames
204
+ )
205
+ chunks.append(chunk)
206
+ chunks_pos_of_waveform.append((begin, end))
207
+
208
+ return (
209
+ chunks
210
+ if not return_chunks_positions
211
+ else (
212
+ chunks,
213
+ chunks_pos_of_waveform,
214
+ )
215
+ )
216
+
217
+
218
+ def split_utterances_from_audio(
219
+ wav_file,
220
+ output_dir,
221
+ max_duration_of_utterance=10.0,
222
+ min_interval=300,
223
+ db_threshold=-40,
224
+ ):
225
+ """
226
+ Split a long audio file into utterances according to silence (VAD).
227
+
228
+ max_duration_of_utterance (second):
229
+ The maximum duration of every utterance (seconds)
230
+ min_interval (millisecond):
231
+ The smaller min_interval is, the more sliced audio clips this script is likely to generate.
232
+ """
233
+ print("File:", wav_file.split("/")[-1])
234
+ waveform, fs = torchaudio.load(wav_file)
235
+
236
+ slicer = Slicer(sr=fs, min_interval=min_interval, threshold=db_threshold)
237
+ chunks, positions = slicer.slice(waveform, return_chunks_positions=True)
238
+
239
+ durations = [(end - begin) / fs for begin, end in positions]
240
+ print(
241
+ "Slicer's min silence part is {}ms, min and max duration of sliced utterances is {}s and {}s".format(
242
+ min_interval, min(durations), max(durations)
243
+ )
244
+ )
245
+
246
+ res_chunks, res_positions = [], []
247
+ for i, chunk in enumerate(chunks):
248
+ if len(chunk.shape) == 1:
249
+ chunk = chunk[None, :]
250
+
251
+ begin, end = positions[i]
252
+ assert end - begin == chunk.shape[-1]
253
+
254
+ max_wav_len = max_duration_of_utterance * fs
255
+ if chunk.shape[-1] <= max_wav_len:
256
+ res_chunks.append(chunk)
257
+ res_positions.append(positions[i])
258
+ else:
259
+ # TODO: preserve overlap between segments and apply fade-in/fade-out
260
+
261
+ # Get segments number
262
+ number = 2
263
+ while chunk.shape[-1] // number >= max_wav_len:
264
+ number += 1
265
+ seg_len = chunk.shape[-1] // number
266
+
267
+ # Split
268
+ for num in range(number):
269
+ s = seg_len * num
270
+ t = min(s + seg_len, chunk.shape[-1])
271
+
272
+ seg_begin = begin + s
273
+ seg_end = begin + t
274
+
275
+ res_chunks.append(chunk[:, s:t])
276
+ res_positions.append((seg_begin, seg_end))
277
+
278
+ # Save utterances
279
+ os.makedirs(output_dir, exist_ok=True)
280
+ res = {"fs": int(fs)}
281
+ for i, chunk in enumerate(res_chunks):
282
+ filename = "{:04d}.wav".format(i)
283
+ res[filename] = [int(p) for p in res_positions[i]]
284
+ save_audio(os.path.join(output_dir, filename), chunk, fs)
285
+
286
+ # Save positions
287
+ with open(os.path.join(output_dir, "positions.json"), "w") as f:
288
+ json.dump(res, f, indent=4, ensure_ascii=False)
289
+ return res
290
+
291
+
292
+ def is_silence(
293
+ wavform,
294
+ fs,
295
+ threshold=-40.0,
296
+ min_interval=300,
297
+ hop_size=10,
298
+ min_length=5000,
299
+ ):
300
+ """
301
+ Detect whether the given wavform is a silence
302
+
303
+ wavform: (T, )
304
+ """
305
+ threshold = 10 ** (threshold / 20.0)
306
+
307
+ hop_size = round(fs * hop_size / 1000)
308
+ win_size = min(round(min_interval), 4 * hop_size)
309
+ min_length = round(fs * min_length / 1000 / hop_size)
310
+
311
+ if wavform.shape[0] <= min_length:
312
+ return True
313
+
314
+ # (#Frame,)
315
+ rms_array = get_rms(y=wavform, frame_length=win_size, hop_length=hop_size).squeeze(
316
+ 0
317
+ )
318
+ return (rms_array < threshold).all()
319
+
320
+
321
+ def split_audio(
322
+ wav_file, target_sr, output_dir, max_duration_of_segment=10.0, overlap_duration=1.0
323
+ ):
324
+ """
325
+ Split a long audio into segments.
326
+
327
+ target_sr:
328
+ The target sampling rate to save the segments.
329
+ max_duration_of_utterance (second):
330
+ The maximum duration of every utterance (second)
331
+ overlap_duration:
332
+ Each segment has "overlap duration" (second) overlap with its previous and next segment
333
+ """
334
+ # (#channel, T) -> (T,)
335
+ waveform, fs = torchaudio.load(wav_file)
336
+ waveform = torchaudio.functional.resample(
337
+ waveform, orig_freq=fs, new_freq=target_sr
338
+ )
339
+ waveform = torch.mean(waveform, dim=0)
340
+
341
+ # waveform, _ = load_audio_torch(wav_file, target_sr)
342
+ assert len(waveform.shape) == 1
343
+
344
+ assert overlap_duration < max_duration_of_segment
345
+ length = int(max_duration_of_segment * target_sr)
346
+ stride = int((max_duration_of_segment - overlap_duration) * target_sr)
347
+ chunks = []
348
+ for i in range(0, len(waveform), stride):
349
+ # (length,)
350
+ chunks.append(waveform[i : i + length])
351
+ if i + length >= len(waveform):
352
+ break
353
+
354
+ # Save segments
355
+ os.makedirs(output_dir, exist_ok=True)
356
+ results = []
357
+ for i, chunk in enumerate(chunks):
358
+ uid = "{:04d}".format(i)
359
+ filename = os.path.join(output_dir, "{}.wav".format(uid))
360
+ results.append(
361
+ {"Uid": uid, "Path": filename, "Duration": len(chunk) / target_sr}
362
+ )
363
+ save_audio(
364
+ filename,
365
+ chunk,
366
+ target_sr,
367
+ turn_up=not is_silence(chunk, target_sr),
368
+ add_silence=False,
369
+ )
370
+
371
+ return results
372
+
373
+
374
+ def merge_segments_torchaudio(wav_files, fs, output_path, overlap_duration=1.0):
375
+ """Merge the given wav_files (may have overlaps) into a long audio
376
+
377
+ fs:
378
+ The sampling rate of the wav files.
379
+ output_path:
380
+ The output path to save the merged audio.
381
+ overlap_duration (float, optional):
382
+ Each segment has "overlap duration" (second) overlap with its previous and next segment. Defaults to 1.0.
383
+ """
384
+
385
+ waveforms = []
386
+ for file in wav_files:
387
+ # (T,)
388
+ waveform, _ = load_audio_torch(file, fs)
389
+ waveforms.append(waveform)
390
+
391
+ if len(waveforms) == 1:
392
+ save_audio(output_path, waveforms[0], fs, add_silence=False, turn_up=False)
393
+ return
394
+
395
+ overlap_len = int(overlap_duration * fs)
396
+ fade_out = torchaudio.transforms.Fade(fade_out_len=overlap_len)
397
+ fade_in = torchaudio.transforms.Fade(fade_in_len=overlap_len)
398
+ fade_in_and_out = torchaudio.transforms.Fade(fade_in_len=overlap_len, fade_out_len=overlap_len)
399
+
400
+ segments_lens = [len(wav) for wav in waveforms]
401
+ merged_waveform_len = sum(segments_lens) - overlap_len * (len(waveforms) - 1)
402
+ merged_waveform = torch.zeros(merged_waveform_len)
403
+
404
+ start = 0
405
+ for index, wav in enumerate(
406
+ tqdm(waveforms, desc="Merge for {}".format(output_path))
407
+ ):
408
+ wav_len = len(wav)
409
+
410
+ if index == 0:
411
+ wav = fade_out(wav)
412
+ elif index == len(waveforms) - 1:
413
+ wav = fade_in(wav)
414
+ else:
415
+ wav = fade_in_and_out(wav)
416
+
417
+ merged_waveform[start : start + wav_len] = wav
418
+ start += wav_len - overlap_len
419
+
420
+ save_audio(output_path, merged_waveform, fs, add_silence=False, turn_up=True)
421
+
422
+
423
+ def merge_segments_encodec(wav_files, fs, output_path, overlap_duration=1.0):
424
+ """Merge the given wav_files (may have overlaps) into a long audio
425
+
426
+ fs:
427
+ The sampling rate of the wav files.
428
+ output_path:
429
+ The output path to save the merged audio.
430
+ overlap_duration (float, optional):
431
+ Each segment has "overlap duration" (second) overlap with its previous and next segment. Defaults to 1.0.
432
+ """
433
+
434
+ waveforms = []
435
+ for file in wav_files:
436
+ # (T,)
437
+ waveform, _ = load_audio_torch(file, fs)
438
+ waveforms.append(waveform)
439
+
440
+ if len(waveforms) == 1:
441
+ save_audio(output_path, waveforms[0], fs, add_silence=False, turn_up=False)
442
+ return
443
+
444
+ device = waveforms[0].device
445
+ dtype = waveforms[0].dtype
446
+ shape = waveforms[0].shape[:-1]
447
+
448
+ overlap_len = int(overlap_duration * fs)
449
+ segments_lens = [len(wav) for wav in waveforms]
450
+ merged_waveform_len = sum(segments_lens) - overlap_len * (len(waveforms) - 1)
451
+
452
+ sum_weight = torch.zeros(merged_waveform_len, device=device, dtype=dtype)
453
+ out = torch.zeros(*shape, merged_waveform_len, device=device, dtype=dtype)
454
+ offset = 0
455
+
456
+ for frame in waveforms:
457
+ frame_length = frame.size(-1)
458
+ t = torch.linspace(0, 1, frame_length + 2, device=device, dtype=torch.float32)[
459
+ 1:-1
460
+ ]
461
+ weight = 0.5 - (t - 0.5).abs()
462
+ weighted_frame = frame * weight
463
+
464
+ cur = out[..., offset : offset + frame_length]
465
+ cur += weighted_frame[..., : cur.size(-1)]
466
+ out[..., offset : offset + frame_length] = cur
467
+
468
+ cur = sum_weight[offset : offset + frame_length]
469
+ cur += weight[..., : cur.size(-1)]
470
+ sum_weight[offset : offset + frame_length] = cur
471
+
472
+ offset += frame_length - overlap_len
473
+
474
+ assert sum_weight.min() > 0
475
+ merged_waveform = out / sum_weight
476
+ save_audio(output_path, merged_waveform, fs, add_silence=False, turn_up=True)
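
A sketch of the Slicer on a long recording (the path is a placeholder); with `return_chunks_positions=True`, each chunk comes with its (begin, end) sample offsets in the original waveform:

```python
import torchaudio
from utils.audio_slicer import Slicer

waveform, fs = torchaudio.load("long_recording.wav")  # placeholder path
slicer = Slicer(sr=fs, threshold=-40.0, min_interval=300)
chunks, positions = slicer.slice(waveform, return_chunks_positions=True)
for chunk, (begin, end) in zip(chunks, positions):
    print(f"{begin / fs:6.2f}s - {end / fs:6.2f}s  shape={tuple(chunk.shape)}")
```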
utils/cut_by_vad.py ADDED
@@ -0,0 +1,105 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/facebookresearch/libri-light/blob/main/data_preparation/cut_by_vad.py"""
7
+ import pathlib
8
+ import soundfile as sf
9
+ import numpy as np
10
+ import json
11
+ import multiprocessing
12
+ import tqdm
13
+
14
+
15
+ def save(seq, fname, index, extension):
16
+ """save audio sequences to file"""
17
+ output = np.hstack(seq)
18
+ file_name = fname.parent / (fname.stem + f"_{index:04}{extension}")
19
+ fname.parent.mkdir(exist_ok=True, parents=True)
20
+ sf.write(file_name, output, samplerate=16000)
21
+
22
+
23
+ def cut_sequence(path, vad, path_out, target_len_sec, out_extension):
24
+ """cut audio sequences based on VAD"""
25
+ data, samplerate = sf.read(path)
26
+
27
+ assert len(data.shape) == 1
28
+ assert samplerate == 16000
29
+
30
+ to_stitch = []
31
+ length_accumulated = 0.0
32
+
33
+ i = 0
34
+ # Iterate over VAD segments
35
+ for start, end in vad:
36
+ start_index = int(start * samplerate)
37
+ end_index = int(end * samplerate)
38
+ segment = data[start_index:end_index]  # avoid shadowing the built-in slice
39
+
40
+ # Save slices that exceed the target length or if there's already accumulated audio
41
+ if (
42
+ length_accumulated + (end - start) > target_len_sec
43
+ and length_accumulated > 0
44
+ ):
45
+ save(to_stitch, path_out, i, out_extension)
46
+ to_stitch = []
47
+ i += 1
48
+ length_accumulated = 0
49
+
50
+ # Add the current slice to the list to be stitched
51
+ to_stitch.append(segment)
52
+ length_accumulated += end - start
53
+
54
+ # Save any remaining slices
55
+ if to_stitch:
56
+ save(to_stitch, path_out, i, out_extension)
57
+
58
+
59
+ def cut_book(task):
60
+ """process each book in the dataset"""
61
+ path_book, root_out, target_len_sec, extension = task
62
+
63
+ speaker = pathlib.Path(path_book.parent.name)
64
+
65
+ for i, meta_file_path in enumerate(path_book.glob("*.json")):
66
+ with open(meta_file_path, "r") as f:
67
+ meta = json.loads(f.read())
68
+ book_id = meta["book_meta"]["id"]
69
+ vad = meta["voice_activity"]
70
+
71
+ sound_file = meta_file_path.parent / (meta_file_path.stem + ".flac")
72
+
73
+ path_out = root_out / speaker / book_id / (meta_file_path.stem)
74
+ cut_sequence(sound_file, vad, path_out, target_len_sec, extension)
75
+
76
+
77
+ def cut_segments(
78
+ input_dir, output_dir, target_len_sec=30, n_process=32, out_extension=".wav"
79
+ ):
80
+ """Main function to cut segments from audio files"""
81
+
82
+ pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)
83
+ list_dir = pathlib.Path(input_dir).glob("*/*")
84
+ list_dir = [x for x in list_dir if x.is_dir()]
85
+
86
+ print(f"{len(list_dir)} directories detected")
87
+ print(f"Launching {n_process} processes")
88
+
89
+ # Create tasks for multiprocessing
90
+ tasks = [
91
+ (path_book, output_dir, target_len_sec, out_extension) for path_book in list_dir
92
+ ]
93
+
94
+ # Process tasks in parallel using multiprocessing
95
+ with multiprocessing.Pool(processes=n_process) as pool:
96
+ for _ in tqdm.tqdm(pool.imap_unordered(cut_book, tasks), total=len(tasks)):
97
+ pass
98
+
99
+
100
+ if __name__ == "__main__":
101
+ input_dir = "/path/to/input_dir"
102
+ output_dir = "/path/to/output_dir"
103
+ target_len_sec = 10
104
+ n_process = 16
105
+ cut_segments(input_dir, output_dir, target_len_sec, n_process)
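
The JSON metadata consumed above follows the libri-light layout: `meta["voice_activity"]` is a list of `[start_sec, end_sec]` voiced spans for the sibling `.flac`. A toy call to `cut_sequence` (all paths are placeholders):

```python
from pathlib import Path
from utils.cut_by_vad import cut_sequence

vad = [(0.0, 4.2), (5.1, 12.8), (13.5, 20.0)]  # voiced spans in seconds
cut_sequence(
    "speaker/book/utt0001.flac",       # 16 kHz mono input (placeholder)
    vad,
    Path("out/speaker/book/utt0001"),  # stem for _0000.wav, _0001.wav, ...
    target_len_sec=10,
    out_extension=".wav",
)
```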
utils/data_utils.py ADDED
@@ -0,0 +1,588 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+
9
+ import numpy as np
10
+ from scipy.interpolate import interp1d
11
+ from tqdm import tqdm
12
+ from sklearn.preprocessing import StandardScaler
13
+
14
+
15
+ def intersperse(lst, item):
16
+ """
17
+ Insert an item between any two consecutive elements of the given list, as well as at the beginning and end of the list
18
+
19
+ Example:
20
+ >>> intersperse([1, 74, 5, 31], 0)
21
+ [0, 1, 0, 74, 0, 5, 0, 31, 0]
22
+ """
23
+ result = [item] * (len(lst) * 2 + 1)
24
+ result[1::2] = lst
25
+ return result
26
+
27
+
28
+ def load_content_feature_path(meta_data, processed_dir, feat_dir):
29
+ utt2feat_path = {}
30
+ for utt_info in meta_data:
31
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
32
+ feat_path = os.path.join(
33
+ processed_dir, utt_info["Dataset"], feat_dir, f'{utt_info["Uid"]}.npy'
34
+ )
35
+ utt2feat_path[utt] = feat_path
36
+
37
+ return utt2feat_path
38
+
39
+
40
+ def load_source_content_feature_path(meta_data, feat_dir):
41
+ utt2feat_path = {}
42
+ for utt in meta_data:
43
+ feat_path = os.path.join(feat_dir, f"{utt}.npy")
44
+ utt2feat_path[utt] = feat_path
45
+
46
+ return utt2feat_path
47
+
48
+
49
+ def get_spk_map(spk2id_path, utt2spk_path):
50
+ utt2spk = {}
51
+ with open(spk2id_path, "r") as spk2id_file:
52
+ spk2id = json.load(spk2id_file)
53
+ with open(utt2spk_path, encoding="utf-8") as f:
54
+ for line in f.readlines():
55
+ utt, spk = line.strip().split("\t")
56
+ utt2spk[utt] = spk
57
+ return spk2id, utt2spk
58
+
59
+
60
+ def get_target_f0_median(f0_dir):
61
+ total_f0 = []
62
+ for utt in os.listdir(f0_dir):
63
+ if not utt.endswith(".npy"):
64
+ continue
65
+ f0_feat_path = os.path.join(f0_dir, utt)
66
+ f0 = np.load(f0_feat_path)
67
+ total_f0 += f0.tolist()
68
+
69
+ total_f0 = np.array(total_f0)
70
+ voiced_position = np.where(total_f0 != 0)
71
+ return np.median(total_f0[voiced_position])
72
+
73
+
74
+ def get_conversion_f0_factor(source_f0, target_median, source_median=None):
75
+ """Align the median between source f0 and target f0
76
+
77
+ Note: the alignment is multiplicative, with factor = target_median / source_median
78
+
79
+ Reference: Frequency and pitch interval
80
+ http://blog.ccyg.studio/article/be12c2ee-d47c-4098-9782-ca76da3035e4/
81
+ """
82
+ if source_median is None:
83
+ voiced_position = np.where(source_f0 != 0)
84
+ source_median = np.median(source_f0[voiced_position])
85
+ factor = target_median / source_median
86
+ return source_median, factor
87
+
88
+
89
+ def transpose_key(frame_pitch, trans_key):
90
+ # Transpose by user's argument
91
+ print("Transpose key = {} ...\n".format(trans_key))
92
+
93
+ transed_pitch = frame_pitch * 2 ** (trans_key / 12)
94
+ return transed_pitch
95
+
96
+
97
+ def pitch_shift_to_target(frame_pitch, target_pitch_median, source_pitch_median=None):
98
+ # Loading F0 Base (median) and shift
99
+ source_pitch_median, factor = get_conversion_f0_factor(
100
+ frame_pitch, target_pitch_median, source_pitch_median
101
+ )
102
+ print(
103
+ "Auto transposing: source f0 median = {:.1f}, target f0 median = {:.1f}, factor = {:.2f}".format(
104
+ source_pitch_median, target_pitch_median, factor
105
+ )
106
+ )
107
+ transed_pitch = frame_pitch * factor
108
+ return transed_pitch
109
+
110
+
111
+ def load_frame_pitch(
112
+ meta_data,
113
+ processed_dir,
114
+ pitch_dir,
115
+ use_log_scale=False,
116
+ return_norm=False,
117
+ interoperate=False,
118
+ utt2spk=None,
119
+ ):
120
+ utt2pitch = {}
121
+ utt2uv = {}
122
+ if utt2spk is None:
123
+ pitch_scaler = StandardScaler()
124
+ for utt_info in meta_data:
125
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
126
+ pitch_path = os.path.join(
127
+ processed_dir, utt_info["Dataset"], pitch_dir, f'{utt_info["Uid"]}.npy'
128
+ )
129
+ pitch = np.load(pitch_path)
130
+ assert len(pitch) > 0
131
+ uv = pitch != 0
132
+ utt2uv[utt] = uv
133
+ if use_log_scale:
134
+ nonzero_idxes = np.where(pitch != 0)[0]
135
+ pitch[nonzero_idxes] = np.log(pitch[nonzero_idxes])
136
+ utt2pitch[utt] = pitch
137
+ pitch_scaler.partial_fit(pitch.reshape(-1, 1))
138
+
139
+ mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
140
+ if return_norm:
141
+ for utt_info in meta_data:
142
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
143
+ pitch = utt2pitch[utt]
144
+ normalized_pitch = (pitch - mean) / std
145
+ utt2pitch[utt] = normalized_pitch
146
+ pitch_statistic = {"mean": mean, "std": std}
147
+ else:
148
+ spk2utt = {}
149
+ pitch_statistic = []
150
+ for utt_info in meta_data:
151
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
152
+ if utt2spk[utt] not in spk2utt:
153
+ spk2utt[utt2spk[utt]] = []
154
+ spk2utt[utt2spk[utt]].append(utt)
155
+
156
+ for spk in spk2utt:
157
+ pitch_scaler = StandardScaler()
158
+ for utt in spk2utt[spk]:
159
+ dataset = utt.split("_")[0]
160
+ uid = "_".join(utt.split("_")[1:])
161
+ pitch_path = os.path.join(
162
+ processed_dir, dataset, pitch_dir, f"{uid}.npy"
163
+ )
164
+ pitch = np.load(pitch_path)
165
+ assert len(pitch) > 0
166
+ uv = pitch != 0
167
+ utt2uv[utt] = uv
168
+ if use_log_scale:
169
+ nonzero_idxes = np.where(pitch != 0)[0]
170
+ pitch[nonzero_idxes] = np.log(pitch[nonzero_idxes])
171
+ utt2pitch[utt] = pitch
172
+ pitch_scaler.partial_fit(pitch.reshape(-1, 1))
173
+
174
+ mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
175
+ if return_norm:
176
+ for utt in spk2utt[spk]:
177
+ pitch = utt2pitch[utt]
178
+ normalized_pitch = (pitch - mean) / std
179
+ utt2pitch[utt] = normalized_pitch
180
+ pitch_statistic.append({"spk": spk, "mean": mean, "std": std})
181
+
182
+ return utt2pitch, utt2uv, pitch_statistic
183
+
184
+
185
+ # discard
186
+ def load_phone_pitch(
187
+ meta_data,
188
+ processed_dir,
189
+ pitch_dir,
190
+ utt2dur,
191
+ use_log_scale=False,
192
+ return_norm=False,
193
+ interoperate=True,
194
+ utt2spk=None,
195
+ ):
196
+ print("Load Phone Pitch")
197
+ utt2pitch = {}
198
+ utt2uv = {}
199
+ if utt2spk is None:
200
+ pitch_scaler = StandardScaler()
201
+ for utt_info in tqdm(meta_data):
202
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
203
+ pitch_path = os.path.join(
204
+ processed_dir, utt_info["Dataset"], pitch_dir, f'{utt_info["Uid"]}.npy'
205
+ )
206
+ frame_pitch = np.load(pitch_path)
207
+ assert len(frame_pitch) > 0
208
+ uv = frame_pitch != 0
209
+ utt2uv[utt] = uv
210
+ phone_pitch = phone_average_pitch(frame_pitch, utt2dur[utt], interoperate)
211
+ if use_log_scale:
212
+ nonzero_idxes = np.where(phone_pitch != 0)[0]
213
+ phone_pitch[nonzero_idxes] = np.log(phone_pitch[nonzero_idxes])
214
+ utt2pitch[utt] = phone_pitch
215
+ pitch_scaler.partial_fit(remove_outlier(phone_pitch).reshape(-1, 1))
216
+
217
+ mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
218
+ max_value = np.finfo(np.float64).min
219
+ min_value = np.finfo(np.float64).max
220
+ if return_norm:
221
+ for utt_info in meta_data:
222
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
223
+ pitch = utt2pitch[utt]
224
+ normalized_pitch = (pitch - mean) / std
225
+ max_value = max(max_value, max(normalized_pitch))
226
+ min_value = min(min_value, min(normalized_pitch))
227
+ utt2pitch[utt] = normalized_pitch
228
+ phone_normalized_pitch_path = os.path.join(
229
+ processed_dir,
230
+ utt_info["Dataset"],
231
+ "phone_level_" + pitch_dir,
232
+ f'{utt_info["Uid"]}.npy',
233
+ )
234
+ pitch_statistic = {
235
+ "mean": mean,
236
+ "std": std,
237
+ "min_value": min_value,
238
+ "max_value": max_value,
239
+ }
240
+ else:
241
+ spk2utt = {}
242
+ pitch_statistic = []
243
+ for utt_info in tqdm(meta_data):
244
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
245
+ if utt2spk[utt] not in spk2utt:
246
+ spk2utt[utt2spk[utt]] = []
247
+ spk2utt[utt2spk[utt]].append(utt)
248
+
249
+ for spk in spk2utt:
250
+ pitch_scaler = StandardScaler()
251
+ for utt in spk2utt[spk]:
252
+ dataset = utt.split("_")[0]
253
+ uid = "_".join(utt.split("_")[1:])
254
+ pitch_path = os.path.join(
255
+ processed_dir, dataset, pitch_dir, f"{uid}.npy"
256
+ )
257
+ frame_pitch = np.load(pitch_path)
258
+ assert len(frame_pitch) > 0
259
+ uv = frame_pitch != 0
260
+ utt2uv[utt] = uv
261
+ phone_pitch = phone_average_pitch(
262
+ frame_pitch, utt2dur[utt], interoperate
263
+ )
264
+ if use_log_scale:
265
+ nonzero_idxes = np.where(phone_pitch != 0)[0]
266
+ phone_pitch[nonzero_idxes] = np.log(phone_pitch[nonzero_idxes])
267
+ utt2pitch[utt] = phone_pitch
268
+ pitch_scaler.partial_fit(remove_outlier(phone_pitch).reshape(-1, 1))
269
+
270
+ mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
271
+ max_value = np.finfo(np.float64).min
272
+ min_value = np.finfo(np.float64).max
273
+
274
+ if return_norm:
275
+ for utt in spk2utt[spk]:
276
+ pitch = utt2pitch[utt]
277
+ normalized_pitch = (pitch - mean) / std
278
+ max_value = max(max_value, max(normalized_pitch))
279
+ min_value = min(min_value, min(normalized_pitch))
280
+ utt2pitch[utt] = normalized_pitch
281
+ pitch_statistic.append(
282
+ {
283
+ "spk": spk,
284
+ "mean": mean,
285
+ "std": std,
286
+ "min_value": min_value,
287
+ "max_value": max_value,
288
+ }
289
+ )
290
+
291
+ return utt2pitch, utt2uv, pitch_statistic
292
+
293
+
294
+ def phone_average_pitch(pitch, dur, interoperate=False):  # "interoperate" (sic) toggles interpolation of unvoiced frames
295
+ pos = 0
296
+
297
+ if interoperate:
298
+ nonzero_ids = np.where(pitch != 0)[0]
299
+ interp_fn = interp1d(
300
+ nonzero_ids,
301
+ pitch[nonzero_ids],
302
+ fill_value=(pitch[nonzero_ids[0]], pitch[nonzero_ids[-1]]),
303
+ bounds_error=False,
304
+ )
305
+ pitch = interp_fn(np.arange(0, len(pitch)))
306
+ phone_pitch = np.zeros(len(dur))
307
+
308
+ for i, d in enumerate(dur):
309
+ d = int(d)
310
+ if d > 0 and pos < len(pitch):
311
+ phone_pitch[i] = np.mean(pitch[pos : pos + d])
312
+ else:
313
+ phone_pitch[i] = 0
314
+ pos += d
315
+ return phone_pitch
316
+
317
+
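As a quick sanity check of phone_average_pitch: given a frame-level pitch track and per-phone durations in frames, it returns one averaged value per phone (a toy example with made-up numbers):

import numpy as np

frame_pitch = np.array([100.0, 110.0, 200.0, 210.0, 220.0, 0.0])
durations = [2, 3, 1]                          # frames per phone
print(phone_average_pitch(frame_pitch, durations))
# [105. 210.   0.]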
318
+ def load_energy(
319
+ meta_data,
320
+ processed_dir,
321
+ energy_dir,
322
+ use_log_scale=False,
323
+ return_norm=False,
324
+ utt2spk=None,
325
+ ):
326
+ utt2energy = {}
327
+ if utt2spk is None:
328
+ for utt_info in meta_data:
329
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
330
+ energy_path = os.path.join(
331
+ processed_dir, utt_info["Dataset"], energy_dir, f'{utt_info["Uid"]}.npy'
332
+ )
333
+ if not os.path.exists(energy_path):
334
+ continue
335
+ energy = np.load(energy_path)
336
+ assert len(energy) > 0
337
+
338
+ if use_log_scale:
339
+ nonzero_idxes = np.where(energy != 0)[0]
340
+ energy[nonzero_idxes] = np.log(energy[nonzero_idxes])
341
+ utt2energy[utt] = energy
342
+
343
+ if return_norm:
344
+ with open(
345
+ os.path.join(
346
+ processed_dir, utt_info["Dataset"], energy_dir, "statistics.json"
347
+ )
348
+ ) as f:
349
+ stats = json.load(f)
350
+ mean, std = (
351
+ stats[utt_info["Dataset"] + "_" + utt_info["Singer"]][
352
+ "voiced_positions"
353
+ ]["mean"],
354
+ stats["LJSpeech_LJSpeech"]["voiced_positions"]["std"],
355
+ )
356
+ for utt in utt2energy.keys():
357
+ energy = utt2energy[utt]
358
+ normalized_energy = (energy - mean) / std
359
+ utt2energy[utt] = normalized_energy
360
+
361
+ energy_statistic = {"mean": mean, "std": std}
362
+ else:
363
+ spk2utt = {}
364
+ energy_statistic = []
365
+ for utt_info in meta_data:
366
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
367
+ if utt2spk[utt] not in spk2utt:
368
+ spk2utt[utt2spk[utt]] = []
369
+ spk2utt[utt2spk[utt]].append(utt)
370
+
371
+ for spk in spk2utt:
372
+ energy_scaler = StandardScaler()
373
+ for utt in spk2utt[spk]:
374
+ dataset = utt.split("_")[0]
375
+ uid = "_".join(utt.split("_")[1:])
376
+ energy_path = os.path.join(
377
+ processed_dir, dataset, energy_dir, f"{uid}.npy"
378
+ )
379
+ if not os.path.exists(energy_path):
380
+ continue
381
+ frame_energy = np.load(energy_path)
382
+ assert len(frame_energy) > 0
383
+
384
+ if use_log_scale:
385
+ nonzero_idxes = np.where(frame_energy != 0)[0]
386
+ frame_energy[nonzero_idxes] = np.log(frame_energy[nonzero_idxes])
387
+ utt2energy[utt] = frame_energy
388
+ energy_scaler.partial_fit(frame_energy.reshape(-1, 1))
389
+
390
+ mean, std = energy_scaler.mean_[0], energy_scaler.scale_[0]
391
+ if return_norm:
392
+ for utt in spk2utt[spk]:
393
+ energy = utt2energy[utt]
394
+ normalized_energy = (energy - mean) / std
395
+ utt2energy[utt] = normalized_energy
396
+ energy_statistic.append({"spk": spk, "mean": mean, "std": std})
397
+
398
+ return utt2energy, energy_statistic
399
+
400
+
401
+ def load_frame_energy(
402
+ meta_data,
403
+ processed_dir,
404
+ energy_dir,
405
+ use_log_scale=False,
406
+ return_norm=False,
407
+ interoperate=False,
408
+ utt2spk=None,
409
+ ):
410
+ utt2energy = {}
411
+ if utt2spk is None:
412
+ energy_scaler = StandardScaler()
413
+ for utt_info in meta_data:
414
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
415
+ energy_path = os.path.join(
416
+ processed_dir, utt_info["Dataset"], energy_dir, f'{utt_info["Uid"]}.npy'
417
+ )
418
+ frame_energy = np.load(energy_path)
419
+ assert len(frame_energy) > 0
420
+
421
+ if use_log_scale:
422
+ nonzero_idxes = np.where(frame_energy != 0)[0]
423
+ frame_energy[nonzero_idxes] = np.log(frame_energy[nonzero_idxes])
424
+ utt2energy[utt] = frame_energy
425
+ energy_scaler.partial_fit(frame_energy.reshape(-1, 1))
426
+
427
+ mean, std = energy_scaler.mean_[0], energy_scaler.scale_[0]
428
+ if return_norm:
429
+ for utt_info in meta_data:
430
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
431
+ energy = utt2energy[utt]
432
+ normalized_energy = (energy - mean) / std
433
+ utt2energy[utt] = normalized_energy
434
+ energy_statistic = {"mean": mean, "std": std}
435
+
436
+ else:
437
+ spk2utt = {}
438
+ energy_statistic = []
439
+ for utt_info in meta_data:
440
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
441
+ if utt2spk[utt] not in spk2utt:
442
+ spk2utt[utt2spk[utt]] = []
443
+ spk2utt[utt2spk[utt]].append(utt)
444
+
445
+ for spk in spk2utt:
446
+ energy_scaler = StandardScaler()
447
+ for utt in spk2utt[spk]:
448
+ dataset = utt.split("_")[0]
449
+ uid = "_".join(utt.split("_")[1:])
450
+ energy_path = os.path.join(
451
+ processed_dir, dataset, energy_dir, f"{uid}.npy"
452
+ )
453
+ frame_energy = np.load(energy_path)
454
+ assert len(frame_energy) > 0
455
+
456
+ if use_log_scale:
457
+ nonzero_idxes = np.where(frame_energy != 0)[0]
458
+ frame_energy[nonzero_idxes] = np.log(frame_energy[nonzero_idxes])
459
+ utt2energy[utt] = frame_energy
460
+ energy_scaler.partial_fit(frame_energy.reshape(-1, 1))
461
+
462
+ mean, std = energy_scaler.mean_[0], energy_scaler.scale_[0]
463
+ if return_norm:
464
+ for utt in spk2utt[spk]:
465
+ energy = utt2energy[utt]
466
+ normalized_energy = (energy - mean) / std
467
+ utt2energy[utt] = normalized_energy
468
+ energy_statistic.append({"spk": spk, "mean": mean, "std": std})
469
+
470
+ return utt2energy, energy_statistic
471
+
472
+
473
+ def align_length(feature, target_len, pad_value=0.0):
474
+ feature_len = feature.shape[-1]
475
+ dim = len(feature.shape)
476
+ # align 2-D data (e.g. a (n_mels, frame_len) mel)
477
+ if dim == 2:
478
+ if target_len > feature_len:
479
+ feature = np.pad(
480
+ feature,
481
+ ((0, 0), (0, target_len - feature_len)),
482
+ constant_values=pad_value,
483
+ )
484
+ else:
485
+ feature = feature[:, :target_len]
486
+ # align 1-D data (e.g. a (frame_len,) f0 track)
487
+ elif dim == 1:
488
+ if target_len > feature_len:
489
+ feature = np.pad(
490
+ feature, (0, target_len - feature_len), constant_values=pad_value
491
+ )
492
+ else:
493
+ feature = feature[:target_len]
494
+ else:
495
+ raise NotImplementedError
496
+ return feature
497
+
498
+
499
+ def align_whisper_feauture_length(  # (sic) "feauture": name kept to match existing call sites
500
+ feature, target_len, fast_mapping=True, source_hop=320, target_hop=256
501
+ ):
502
+ factor = np.gcd(source_hop, target_hop)
503
+ source_hop //= factor
504
+ target_hop //= factor
505
+ # print(
506
+ # "Mapping source's {} frames => target's {} frames".format(
507
+ # target_hop, source_hop
508
+ # )
509
+ # )
510
+
511
+ max_source_len = 1500
512
+ target_len = min(target_len, max_source_len * source_hop // target_hop)
513
+
514
+ width = feature.shape[-1]
515
+
516
+ if fast_mapping:
517
+ source_len = target_len * target_hop // source_hop + 1
518
+ feature = feature[:source_len]
519
+
520
+ else:
521
+ source_len = max_source_len
522
+
523
+ # const ~= target_len * target_hop
524
+ const = source_len * source_hop // target_hop * target_hop
525
+
526
+ # (source_len * source_hop, dim)
527
+ up_sampling_feats = np.repeat(feature, source_hop, axis=0)
528
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
529
+ down_sampling_feats = np.average(
530
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
531
+ )
532
+ assert len(down_sampling_feats) >= target_len
533
+
534
+ # (target_len, dim)
535
+ feat = down_sampling_feats[:target_len]
536
+
537
+ return feat
538
+
539
+
540
+ def align_content_feature_length(feature, target_len, source_hop=320, target_hop=256):
541
+ factor = np.gcd(source_hop, target_hop)
542
+ source_hop //= factor
543
+ target_hop //= factor
544
+ # print(
545
+ # "Mapping source's {} frames => target's {} frames".format(
546
+ # target_hop, source_hop
547
+ # )
548
+ # )
549
+
550
+ # (source_len, 256)
551
+ source_len, width = feature.shape
552
+
553
+ # const ~= target_len * target_hop
554
+ const = source_len * source_hop // target_hop * target_hop
555
+
556
+ # (source_len * source_hop, dim)
557
+ up_sampling_feats = np.repeat(feature, source_hop, axis=0)
558
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
559
+ down_sampling_feats = np.average(
560
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
561
+ )
562
+
563
+ err = abs(target_len - len(down_sampling_feats))
564
+ if err > 4:  # tolerate a small rounding mismatch; the 4-frame threshold is empirical
565
+ print("target_len:", target_len)
566
+ print("raw feature:", feature.shape)
567
+ print("up_sampling:", up_sampling_feats.shape)
568
+ print("down_sampling_feats:", down_sampling_feats.shape)
569
+ exit()
570
+ if len(down_sampling_feats) < target_len:
571
+ # (1, dim) -> (err, dim)
572
+ end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
573
+ down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)
574
+
575
+ # (target_len, dim)
576
+ feat = down_sampling_feats[:target_len]
577
+
578
+ return feat
579
+
580
+
581
+ def remove_outlier(values):
582
+ values = np.array(values)
583
+ p25 = np.percentile(values, 25)
584
+ p75 = np.percentile(values, 75)
585
+ lower = p25 - 1.5 * (p75 - p25)
586
+ upper = p75 + 1.5 * (p75 - p25)
587
+ normal_indices = np.logical_and(values > lower, values < upper)
588
+ return values[normal_indices]
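A few of the small helpers above can be sanity-checked in isolation; a minimal sketch (values chosen for illustration):

import numpy as np

print(intersperse([1, 74, 5, 31], 0))
# [0, 1, 0, 74, 0, 5, 0, 31, 0]

feat = np.ones((80, 100))                      # (n_mels, n_frames)
print(align_length(feat, 120).shape)           # zero-padded to (80, 120)
print(align_length(feat, 90).shape)            # trimmed to (80, 90)

print(remove_outlier([1, 2, 3, 2, 1, 100]))    # 100 lies outside 1.5 * IQR -> [1 2 3 2 1]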
utils/distribution.py ADDED
@@ -0,0 +1,270 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ from torch.distributions import Normal
11
+
12
+
13
+ def log_sum_exp(x):
14
+ """numerically stable log_sum_exp implementation that prevents overflow"""
15
+ # TF ordering
16
+ axis = len(x.size()) - 1
17
+ m, _ = torch.max(x, dim=axis)
18
+ m2, _ = torch.max(x, dim=axis, keepdim=True)
19
+ return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis))
20
+
21
+
22
+ def discretized_mix_logistic_loss(
23
+ y_hat, y, num_classes=256, log_scale_min=-7.0, reduce=True
24
+ ):
25
+ """Discretized mixture of logistic distributions loss
26
+
27
+ Note that it is assumed that input is scaled to [-1, 1].
28
+
29
+ Args:
30
+ y_hat (Tensor): Predicted output (B x C x T)
31
+ y (Tensor): Target (B x T x 1).
32
+ num_classes (int): Number of classes
33
+ log_scale_min (float): Log scale minimum value
34
+ reduce (bool): If True, the losses are averaged or summed for each
35
+ minibatch.
36
+
37
+ Returns
38
+ Tensor: loss
39
+ """
40
+ assert y_hat.dim() == 3
41
+ assert y_hat.size(1) % 3 == 0
42
+ nr_mix = y_hat.size(1) // 3
43
+
44
+ # (B x T x C)
45
+ y_hat = y_hat.transpose(1, 2)
46
+
47
+ # unpack parameters. (B, T, num_mixtures) x 3
48
+ logit_probs = y_hat[:, :, :nr_mix]
49
+ means = y_hat[:, :, nr_mix : 2 * nr_mix]
50
+ log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min)
51
+
52
+ # B x T x 1 -> B x T x num_mixtures
53
+ y = y.expand_as(means)
54
+
55
+ centered_y = y - means
56
+ inv_stdv = torch.exp(-log_scales)
57
+ plus_in = inv_stdv * (centered_y + 1.0 / (num_classes - 1))
58
+ cdf_plus = torch.sigmoid(plus_in)
59
+ min_in = inv_stdv * (centered_y - 1.0 / (num_classes - 1))
60
+ cdf_min = torch.sigmoid(min_in)
61
+
62
+ # log probability for edge case of 0 (before scaling)
63
+ # equivalent: torch.log(torch.sigmoid(plus_in))
64
+ log_cdf_plus = plus_in - F.softplus(plus_in)
65
+
66
+ # log probability for edge case of 255 (before scaling)
67
+ # equivalent: (1 - torch.sigmoid(min_in)).log()
68
+ log_one_minus_cdf_min = -F.softplus(min_in)
69
+
70
+ # probability for all other cases
71
+ cdf_delta = cdf_plus - cdf_min
72
+
73
+ mid_in = inv_stdv * centered_y
74
+ # log probability in the center of the bin, to be used in extreme cases
75
+ # (not actually used in our code)
76
+ log_pdf_mid = mid_in - log_scales - 2.0 * F.softplus(mid_in)
77
+
78
+ # tf equivalent
79
+ """
80
+ log_probs = tf.where(x < -0.999, log_cdf_plus,
81
+ tf.where(x > 0.999, log_one_minus_cdf_min,
82
+ tf.where(cdf_delta > 1e-5,
83
+ tf.log(tf.maximum(cdf_delta, 1e-12)),
84
+ log_pdf_mid - np.log(127.5))))
85
+ """
86
+ # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
87
+ # for num_classes=65536 case? 1e-7? not sure..
88
+ inner_inner_cond = (cdf_delta > 1e-5).float()
89
+
90
+ inner_inner_out = inner_inner_cond * torch.log(
91
+ torch.clamp(cdf_delta, min=1e-12)
92
+ ) + (1.0 - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
93
+ inner_cond = (y > 0.999).float()
94
+ inner_out = (
95
+ inner_cond * log_one_minus_cdf_min + (1.0 - inner_cond) * inner_inner_out
96
+ )
97
+ cond = (y < -0.999).float()
98
+ log_probs = cond * log_cdf_plus + (1.0 - cond) * inner_out
99
+
100
+ log_probs = log_probs + F.log_softmax(logit_probs, -1)
101
+
102
+ if reduce:
103
+ return -torch.sum(log_sum_exp(log_probs))
104
+ else:
105
+ return -log_sum_exp(log_probs).unsqueeze(-1)
106
+
107
+
108
+ def to_one_hot(tensor, n, fill_with=1.0):
109
+ # we perform one hot encore with respect to the last axis
110
+ one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_()
111
+ if tensor.is_cuda:
112
+ one_hot = one_hot.cuda()
113
+ one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
114
+ return one_hot
115
+
116
+
117
+ def sample_from_discretized_mix_logistic(y, log_scale_min=-7.0, clamp_log_scale=False):
118
+ """
119
+ Sample from discretized mixture of logistic distributions
120
+
121
+ Args:
122
+ y (Tensor): B x C x T
123
+ log_scale_min (float): Log scale minimum value
124
+
125
+ Returns:
126
+ Tensor: sample in range of [-1, 1].
127
+ """
128
+ assert y.size(1) % 3 == 0
129
+ nr_mix = y.size(1) // 3
130
+
131
+ # B x T x C
132
+ y = y.transpose(1, 2)
133
+ logit_probs = y[:, :, :nr_mix]
134
+
135
+ # sample mixture indicator from softmax
136
+ temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
137
+ temp = logit_probs.data - torch.log(-torch.log(temp))
138
+ _, argmax = temp.max(dim=-1)
139
+
140
+ # (B, T) -> (B, T, nr_mix)
141
+ one_hot = to_one_hot(argmax, nr_mix)
142
+ # select logistic parameters
143
+ means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1)
144
+ log_scales = torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1)
145
+ if clamp_log_scale:
146
+ log_scales = torch.clamp(log_scales, min=log_scale_min)
147
+ # sample from logistic & clip to interval
148
+ # we don't actually round to the nearest 8bit value when sampling
149
+ u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5)
150
+ x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1.0 - u))
151
+
152
+ x = torch.clamp(torch.clamp(x, min=-1.0), max=1.0)
153
+
154
+ return x
155
+
156
+
157
+ # A discretized version of the Gaussian loss could be defined analogously;
158
+ # here we use the continuous version, as in https://clarinet-demo.github.io/
159
+ def mix_gaussian_loss(y_hat, y, log_scale_min=-7.0, reduce=True):
160
+ """Mixture of continuous gaussian distributions loss
161
+
162
+ Note that it is assumed that input is scaled to [-1, 1].
163
+
164
+ Args:
165
+ y_hat (Tensor): Predicted output (B x C x T)
166
+ y (Tensor): Target (B x T x 1).
167
+ log_scale_min (float): Log scale minimum value
168
+ reduce (bool): If True, the losses are averaged or summed for each
169
+ minibatch.
170
+ Returns
171
+ Tensor: loss
172
+ """
173
+ assert y_hat.dim() == 3
174
+ C = y_hat.size(1)
175
+ if C == 2:
176
+ nr_mix = 1
177
+ else:
178
+ assert y_hat.size(1) % 3 == 0
179
+ nr_mix = y_hat.size(1) // 3
180
+
181
+ # (B x T x C)
182
+ y_hat = y_hat.transpose(1, 2)
183
+
184
+ # unpack parameters.
185
+ if C == 2:
186
+ # special case for C == 2, just for compatibility
187
+ logit_probs = None
188
+ means = y_hat[:, :, 0:1]
189
+ log_scales = torch.clamp(y_hat[:, :, 1:2], min=log_scale_min)
190
+ else:
191
+ # (B, T, num_mixtures) x 3
192
+ logit_probs = y_hat[:, :, :nr_mix]
193
+ means = y_hat[:, :, nr_mix : 2 * nr_mix]
194
+ log_scales = torch.clamp(
195
+ y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min
196
+ )
197
+
198
+ # B x T x 1 -> B x T x num_mixtures
199
+ y = y.expand_as(means)
200
+
201
+ centered_y = y - means
202
+ dist = Normal(loc=0.0, scale=torch.exp(log_scales))
203
+ # do we need to add a trick to avoid log(0)?
204
+ log_probs = dist.log_prob(centered_y)
205
+
206
+ if nr_mix > 1:
207
+ log_probs = log_probs + F.log_softmax(logit_probs, -1)
208
+
209
+ if reduce:
210
+ if nr_mix == 1:
211
+ return -torch.sum(log_probs)
212
+ else:
213
+ return -torch.sum(log_sum_exp(log_probs))
214
+ else:
215
+ if nr_mix == 1:
216
+ return -log_probs
217
+ else:
218
+ return -log_sum_exp(log_probs).unsqueeze(-1)
219
+
220
+
221
+ def sample_from_mix_gaussian(y, log_scale_min=-7.0):
222
+ """
223
+ Sample from (discretized) mixture of gaussian distributions
224
+ Args:
225
+ y (Tensor): B x C x T
226
+ log_scale_min (float): Log scale minimum value
227
+ Returns:
228
+ Tensor: sample in range of [-1, 1].
229
+ """
230
+ C = y.size(1)
231
+ if C == 2:
232
+ nr_mix = 1
233
+ else:
234
+ assert y.size(1) % 3 == 0
235
+ nr_mix = y.size(1) // 3
236
+
237
+ # B x T x C
238
+ y = y.transpose(1, 2)
239
+
240
+ if C == 2:
241
+ logit_probs = None
242
+ else:
243
+ logit_probs = y[:, :, :nr_mix]
244
+
245
+ if nr_mix > 1:
246
+ # sample mixture indicator from softmax
247
+ temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
248
+ temp = logit_probs.data - torch.log(-torch.log(temp))
249
+ _, argmax = temp.max(dim=-1)
250
+
251
+ # (B, T) -> (B, T, nr_mix)
252
+ one_hot = to_one_hot(argmax, nr_mix)
253
+
254
+ # Select means and log scales
255
+ means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1)
256
+ log_scales = torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1)
257
+ else:
258
+ if C == 2:
259
+ means, log_scales = y[:, :, 0], y[:, :, 1]
260
+ elif C == 3:
261
+ means, log_scales = y[:, :, 1], y[:, :, 2]
262
+ else:
263
+ assert False, "shouldn't happen"
264
+
265
+ scales = torch.exp(log_scales)
266
+ dist = Normal(loc=means, scale=scales)
267
+ x = dist.sample()
268
+
269
+ x = torch.clamp(x, min=-1.0, max=1.0)
270
+ return x
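Both the mixture losses and the samplers share the (B, C, T) layout, with C = 3 * num_mixtures packing [logit_probs | means | log_scales]. A smoke test with random tensors (shapes are arbitrary):

import torch

B, T, nr_mix = 2, 100, 10
y_hat = torch.randn(B, 3 * nr_mix, T)          # mixture parameters
y = torch.rand(B, T, 1) * 2 - 1                # targets scaled to [-1, 1]

loss = discretized_mix_logistic_loss(y_hat, y, num_classes=256)
sample = sample_from_discretized_mix_logistic(y_hat)   # (B, T), values in [-1, 1]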
utils/dsp.py ADDED
@@ -0,0 +1,97 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+ # ZERO = 1e-12
10
+
11
+
12
+ def gaussian_normalize_mel_channel(mel, mu, sigma):
13
+ """
14
+ Shift to the standard normal distribution
15
+
16
+ Args:
17
+ mel: (n_mels, frame_len)
18
+ mu: (n_mels,), mean value
19
+ sigma: (n_mels,), sd value
20
+ Return:
21
+ Tensor like mel
22
+ """
23
+ mu = np.expand_dims(mu, -1)
24
+ sigma = np.expand_dims(sigma, -1)
25
+ return (mel - mu) / sigma
26
+
27
+
28
+ def de_gaussian_normalize_mel_channel(mel, mu, sigma):
29
+ """
30
+ Invert gaussian_normalize_mel_channel: map a standardized mel back to its original scale.
+
31
+ Args:
32
+ mel: (n_mels, frame_len)
33
+ mu: (n_mels,), mean value
34
+ sigma: (n_mels,), sd value
35
+ Return:
36
+ Tensor like mel
37
+ """
38
+ mu = np.expand_dims(mu, -1)
39
+ sigma = np.expand_dims(sigma, -1)
40
+ return sigma * mel + mu
41
+
42
+
43
+ def decompress(audio_compressed, bits):
44
+ mu = 2**bits - 1
45
+ audio = np.sign(audio_compressed) / mu * ((1 + mu) ** np.abs(audio_compressed) - 1)
46
+ return audio
47
+
48
+
49
+ def compress(audio, bits):
50
+ mu = 2**bits - 1
51
+ audio_compressed = np.sign(audio) * np.log(1 + mu * np.abs(audio)) / np.log(mu + 1)
52
+ return audio_compressed
53
+
54
+
55
+ def label_to_audio(quant, bits):
56
+ classes = 2**bits
57
+ audio = 2 * quant / (classes - 1.0) - 1.0
58
+ return audio
59
+
60
+
61
+ def audio_to_label(audio, bits):
62
+ """Normalized audio data tensor to digit array
63
+
64
+ Args:
65
+ audio (tensor): audio data
66
+ bits (int): data bits
67
+
68
+ Returns:
69
+ array<int>: digit array of audio data
70
+ """
71
+ classes = 2**bits
72
+ # initialize an increasing array with values from -1 to 1
73
+ bins = np.linspace(-1, 1, classes)
74
+ # change value in audio tensor to digits
75
+ quant = np.digitize(audio, bins) - 1
76
+ return quant
77
+
78
+
79
+ def label_to_onehot(x, bits):
80
+ """Converts a class vector (integers) to binary class matrix.
81
+ Args:
82
+ x: class vector to be converted into a matrix
83
+ (integers from 0 to num_classes).
84
+ bits: number of bits; the total number of classes is 2**bits.
85
+ Returns:
86
+ A binary matrix representation of the input. The classes axis
87
+ is placed last.
88
+ """
89
+ classes = 2**bits
90
+
91
+ result = torch.zeros((x.shape[0], classes), dtype=torch.float32)
92
+ for i in range(x.shape[0]):
93
+ result[i, x[i]] = 1
94
+
95
+ output_shape = x.shape + (classes,)
96
+ output = torch.reshape(result, output_shape)
97
+ return output
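compress/decompress form a mu-law companding pair with mu = 2**bits - 1, and audio_to_label/label_to_audio quantize to and from 2**bits classes; a round-trip sketch:

import numpy as np

audio = np.sin(np.linspace(0, 4 * np.pi, 16000))   # waveform in [-1, 1]

compressed = compress(audio, bits=8)
restored = decompress(compressed, bits=8)
print(np.max(np.abs(audio - restored)))            # ~0: companding is invertible

labels = audio_to_label(compressed, bits=8)        # ints in [0, 255]
approx = label_to_audio(labels, bits=8)            # back to [-1, 1], now quantized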
utils/duration.py ADDED
@@ -0,0 +1,86 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import os
8
+ import tgt
9
+
10
+
11
+ def get_alignment(tier, cfg):
12
+ sample_rate = cfg["sample_rate"]
13
+ hop_size = cfg["hop_size"]
14
+
15
+ sil_phones = ["sil", "sp", "spn"]
16
+
17
+ phones = []
18
+ durations = []
19
+ start_time = 0
20
+ end_time = 0
21
+ end_idx = 0
22
+
23
+ for t in tier._objects:
24
+ s, e, p = t.start_time, t.end_time, t.text
25
+
26
+ # Trim leading silences
27
+ if phones == []:
28
+ if p in sil_phones:
29
+ continue
30
+ else:
31
+ start_time = s
32
+
33
+ if p not in sil_phones:
34
+ # For ordinary phones
35
+ phones.append(p)
36
+ end_time = e
37
+ end_idx = len(phones)
38
+ else:
39
+ # For silent phones
40
+ phones.append(p)
41
+
42
+ durations.append(
43
+ int(
44
+ np.round(e * sample_rate / hop_size)
45
+ - np.round(s * sample_rate / hop_size)
46
+ )
47
+ )
48
+
49
+ # Trim trailing silences
50
+ phones = phones[:end_idx]
51
+ durations = durations[:end_idx]
52
+
53
+ return phones, durations, start_time, end_time
54
+
55
+
56
+ def get_duration(utt, wav, cfg):
57
+ speaker = utt["Singer"]
58
+ basename = utt["Uid"]
59
+ dataset = utt["Dataset"]
60
+ sample_rate = cfg["sample_rate"]
61
+
62
+ # print(cfg.processed_dir, dataset, speaker, basename)
63
+ wav_path = os.path.join(
64
+ cfg.processed_dir, dataset, "raw_data", speaker, "{}.wav".format(basename)
65
+ )
66
+ text_path = os.path.join(
67
+ cfg.processed_dir, dataset, "raw_data", speaker, "{}.lab".format(basename)
68
+ )
69
+ tg_path = os.path.join(
70
+ cfg.processed_dir, dataset, "TextGrid", speaker, "{}.TextGrid".format(basename)
71
+ )
72
+
73
+ # Read raw text
74
+ with open(text_path, "r") as f:
75
+ raw_text = f.readline().strip("\n")
76
+
77
+ # Get alignments
78
+ textgrid = tgt.io.read_textgrid(tg_path)
79
+ phone, duration, start, end = get_alignment(
80
+ textgrid.get_tier_by_name("phones"), cfg
81
+ )
82
+ text = "{" + " ".join(phone) + "}"
83
+ if start >= end:
84
+ return None
85
+
86
+ return duration, text, int(sample_rate * start), int(sample_rate * end)
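get_alignment only touches tier._objects (each interval exposing start_time, end_time, text) and the cfg mapping, so its trimming and frame-rounding behavior can be illustrated with a stand-in tier (the intervals below are made up):

from types import SimpleNamespace

def iv(s, e, p):
    return SimpleNamespace(start_time=s, end_time=e, text=p)

tier = SimpleNamespace(_objects=[iv(0.0, 0.3, "sp"), iv(0.3, 0.45, "HH"),
                                 iv(0.45, 0.7, "AH"), iv(0.7, 1.0, "sil")])
cfg = {"sample_rate": 16000, "hop_size": 200}      # 80 frames per second

phones, durations, start, end = get_alignment(tier, cfg)
# phones    -> ['HH', 'AH']   (leading/trailing silences trimmed)
# durations -> [12, 20]       (frame counts)
# start,end -> 0.3, 0.7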
utils/f0.py ADDED
@@ -0,0 +1,275 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import librosa
7
+ import numpy as np
8
+ import torch
9
+ import parselmouth
10
+ import torchcrepe
11
+ import pyworld as pw
12
+
13
+
14
+ def f0_to_coarse(f0, pitch_bin, f0_min, f0_max):
15
+ """
16
+ Convert f0 (Hz) to pitch (mel scale), and then quantize the mel-scale pitch to the
17
+ range from [1, 2, 3, ..., pitch_bin-1]
18
+
19
+ Reference: https://en.wikipedia.org/wiki/Mel_scale
20
+
21
+ Args:
22
+ f0 (array or Tensor): Hz
23
+ pitch_bin (int): the vocabulary size
24
+ f0_min (int): the minimum f0 (Hz)
25
+ f0_max (int): the maximum f0 (Hz)
26
+
27
+ Returns:
28
+ quantized f0 (array or Tensor)
29
+ """
30
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
31
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
32
+
33
+ is_torch = isinstance(f0, torch.Tensor)
34
+ f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
35
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (pitch_bin - 2) / (
36
+ f0_mel_max - f0_mel_min
37
+ ) + 1
38
+
39
+ f0_mel[f0_mel <= 1] = 1
40
+ f0_mel[f0_mel > pitch_bin - 1] = pitch_bin - 1
41
+ f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int32)
42
+ assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
43
+ f0_coarse.max(),
44
+ f0_coarse.min(),
45
+ )
46
+ return f0_coarse
47
+
48
+
49
+ def interpolate(f0):
50
+ """Interpolate the unvoiced part. Thus the f0 can be passed to a subtractive synthesizer.
51
+ Args:
52
+ f0: A numpy array of shape (seq_len,)
53
+ Returns:
54
+ f0: Interpolated f0 of shape (seq_len,)
55
+ uv: Unvoiced part of shape (seq_len,)
56
+ """
57
+ uv = f0 == 0
58
+ if len(f0[~uv]) > 0:
59
+ # interpolate the unvoiced f0
60
+ f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
61
+ uv = uv.astype("float")
62
+ uv = np.min(np.array([uv[:-2], uv[1:-1], uv[2:]]), axis=0)
63
+ uv = np.pad(uv, (1, 1))
64
+ return f0, uv
65
+
66
+
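For example, zeros in the track are filled by linear interpolation between the neighboring voiced frames (edges take the nearest voiced value):

import numpy as np

f0 = np.array([0.0, 100.0, 0.0, 0.0, 130.0, 0.0])
f0_filled, uv = interpolate(f0.copy())
# f0_filled -> [100. 100. 110. 120. 130. 130.]
# uv is the unvoiced mask, eroded by one frame on each side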
67
+ def get_log_f0(f0):
68
+ f0[np.where(f0 == 0)] = 1
69
+ log_f0 = np.log(f0)
70
+ return log_f0
71
+
72
+
73
+ def get_f0_features_using_pyin(audio, cfg):
74
+ """Using pyin to extract the f0 feature.
75
+ Args:
76
+ audio
77
+ fs
78
+ win_length
79
+ hop_length
80
+ f0_min
81
+ f0_max
82
+ Returns:
83
+ f0: numpy array of shape (frame_len,)
84
+ """
85
+ f0, voiced_flag, voiced_probs = librosa.pyin(
86
+ y=audio,
87
+ fmin=cfg.f0_min,
88
+ fmax=cfg.f0_max,
89
+ sr=cfg.sample_rate,
90
+ win_length=cfg.win_size,
91
+ hop_length=cfg.hop_size,
92
+ )
93
+ # Set nan to 0
94
+ f0[~voiced_flag] = 0
95
+ return f0
96
+
97
+
98
+ def get_f0_features_using_parselmouth(audio, cfg, speed=1):
99
+ """Using parselmouth to extract the f0 feature.
100
+ Args:
101
+ audio
102
+ mel_len
103
+ hop_length
104
+ fs
105
+ f0_min
106
+ f0_max
107
+ speed(default=1)
108
+ Returns:
109
+ f0: numpy array of shape (frame_len,)
110
+ pitch_coarse: numpy array of shape (frame_len,)
111
+ """
112
+ hop_size = int(np.round(cfg.hop_size * speed))
113
+
114
+ # Calculate the time step for pitch extraction
115
+ time_step = hop_size / cfg.sample_rate * 1000
116
+
117
+ f0 = (
118
+ parselmouth.Sound(audio, cfg.sample_rate)
119
+ .to_pitch_ac(
120
+ time_step=time_step / 1000,
121
+ voicing_threshold=0.6,
122
+ pitch_floor=cfg.f0_min,
123
+ pitch_ceiling=cfg.f0_max,
124
+ )
125
+ .selected_array["frequency"]
126
+ )
127
+ return f0
128
+
129
+
130
+ def get_f0_features_using_dio(audio, cfg):
131
+ """Using dio to extract the f0 feature.
132
+ Args:
133
+ audio
134
+ mel_len
135
+ fs
136
+ hop_length
137
+ f0_min
138
+ f0_max
139
+ Returns:
140
+ f0: numpy array of shape (frame_len,)
141
+ """
142
+ # Get the raw f0
143
+ _f0, t = pw.dio(
144
+ audio.astype("double"),
145
+ cfg.sample_rate,
146
+ f0_floor=cfg.f0_min,
147
+ f0_ceil=cfg.f0_max,
148
+ channels_in_octave=2,
149
+ frame_period=(1000 * cfg.hop_size / cfg.sample_rate),
150
+ )
151
+ # Get the f0
152
+ f0 = pw.stonemask(audio.astype("double"), _f0, t, cfg.sample_rate)
153
+ return f0
154
+
155
+
156
+ def get_f0_features_using_harvest(audio, mel_len, fs, hop_length, f0_min, f0_max):
157
+ """Using harvest to extract the f0 feature.
158
+ Args:
159
+ audio
160
+ mel_len
161
+ fs
162
+ hop_length
163
+ f0_min
164
+ f0_max
165
+ Returns:
166
+ f0: numpy array of shape (frame_len,)
167
+ """
168
+ f0, _ = pw.harvest(
169
+ audio.astype("double"),
170
+ fs,
171
+ f0_floor=f0_min,
172
+ f0_ceil=f0_max,
173
+ frame_period=(1000 * hop_length / fs),
174
+ )
175
+ f0 = f0.astype("float")[:mel_len]
176
+ return f0
177
+
178
+
179
+ def get_f0_features_using_crepe(
180
+ audio, mel_len, fs, hop_length, hop_length_new, f0_min, f0_max, threshold=0.3
181
+ ):
182
+ """Using torchcrepe to extract the f0 feature.
183
+ Args:
184
+ audio
185
+ mel_len
186
+ fs
187
+ hop_length
188
+ hop_length_new
189
+ f0_min
190
+ f0_max
191
+ threshold(default=0.3)
192
+ Returns:
193
+ f0: numpy array of shape (frame_len,)
194
+ """
195
+ # Currently, crepe only supports 16khz audio
196
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
197
+ audio_16k = librosa.resample(audio, orig_sr=fs, target_sr=16000)
198
+ audio_16k_torch = torch.FloatTensor(audio_16k).unsqueeze(0).to(device)
199
+
200
+ # Get the raw pitch
201
+ f0, pd = torchcrepe.predict(
202
+ audio_16k_torch,
203
+ 16000,
204
+ hop_length_new,
205
+ f0_min,
206
+ f0_max,
207
+ pad=True,
208
+ model="full",
209
+ batch_size=1024,
210
+ device=device,
211
+ return_periodicity=True,
212
+ )
213
+
214
+ # Filter, de-silence, set up threshold for unvoiced part
215
+ pd = torchcrepe.filter.median(pd, 3)
216
+ pd = torchcrepe.threshold.Silence(-60.0)(pd, audio_16k_torch, 16000, hop_length_new)
217
+ f0 = torchcrepe.threshold.At(threshold)(f0, pd)
218
+ f0 = torchcrepe.filter.mean(f0, 3)
219
+
220
+ # Convert unvoiced part to 0hz
221
+ f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)
222
+
223
+ # Interpolate f0
224
+ nzindex = torch.nonzero(f0[0]).squeeze()
225
+ f0 = torch.index_select(f0[0], dim=0, index=nzindex).cpu().numpy()
226
+ time_org = 0.005 * nzindex.cpu().numpy()
227
+ time_frame = np.arange(mel_len) * hop_length / fs
228
+ f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
229
+ return f0
230
+
231
+
232
+ def get_f0(audio, cfg, use_interpolate=False, return_uv=False):
233
+ if cfg.pitch_extractor == "dio":
234
+ f0 = get_f0_features_using_dio(audio, cfg)
235
+ elif cfg.pitch_extractor == "pyin":
236
+ f0 = get_f0_features_using_pyin(audio, cfg)
237
+ elif cfg.pitch_extractor == "parselmouth":
238
+ f0 = get_f0_features_using_parselmouth(audio, cfg)
+ else:
+ raise ValueError("Unsupported pitch_extractor: {}".format(cfg.pitch_extractor))
239
+
240
+ if use_interpolate:
241
+ f0, uv = interpolate(f0)
242
+ else:
243
+ uv = f0 == 0
244
+
245
+ if return_uv:
246
+ return f0, uv
247
+
248
+ return f0
249
+
250
+
251
+ def get_cents(f0_hz):
252
+ """
253
+ F_{cent} = 1200 * log2 (F/440)
254
+
255
+ Reference:
256
+ APSIPA'17, Perceptual Evaluation of Singing Quality
257
+ """
258
+ voiced_f0 = f0_hz[f0_hz != 0]
259
+ return 1200 * np.log2(voiced_f0 / 440)
260
+
261
+
262
+ def get_pitch_derivatives(f0_hz):
263
+ """
264
+ f0_hz: (,T)
265
+ """
266
+ f0_cent = get_cents(f0_hz)
267
+ return f0_cent[1:] - f0_cent[:-1]
268
+
269
+
270
+ def get_pitch_sub_median(f0_hz):
271
+ """
272
+ f0_hz: (,T)
273
+ """
274
+ f0_cent = get_cents(f0_hz)
275
+ return f0_cent - np.median(f0_cent)
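get_cents places voiced f0 on a cent scale relative to A4 (440 Hz), so one octave spans 1200 cents, and f0_to_coarse buckets f0 into integer bins for embedding lookup; a quick check (parameter values are illustrative):

import numpy as np

f0 = np.array([0.0, 220.0, 440.0, 880.0])
print(get_cents(f0))                                   # [-1200.    0.  1200.]
print(f0_to_coarse(f0, pitch_bin=256, f0_min=50, f0_max=1100))
# integer bins in [1, 255]; unvoiced (0 Hz) frames map to bin 1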
utils/hparam.py ADDED
@@ -0,0 +1,659 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/training/python/training/hparam.py pylint: disable=line-too-long
7
+ """Hyperparameter values."""
8
+ from __future__ import absolute_import
9
+ from __future__ import division
10
+ from __future__ import print_function
11
+
12
+ import json
13
+ import numbers
14
+ import re
15
+ import six
16
+
17
+ # Define the regular expression for parsing a single clause of the input
18
+ # (delimited by commas). A legal clause looks like:
19
+ # <variable name>[<index>]? = <rhs>
20
+ # where <rhs> is either a single token or [] enclosed list of tokens.
21
+ # For example: "var[1] = a" or "x = [1,2,3]"
22
+ PARAM_RE = re.compile(
23
+ r"""
24
+ (?P<name>[a-zA-Z][\w\.]*) # variable name: "var" or "x"
25
+ (\[\s*(?P<index>\d+)\s*\])? # (optional) index: "1" or None
26
+ \s*=\s*
27
+ ((?P<val>[^,\[]*) # single value: "a" or None
28
+ |
29
+ \[(?P<vals>[^\]]*)\]) # list of values: None or "1,2,3"
30
+ ($|,\s*)""",
31
+ re.VERBOSE,
32
+ )
33
+
34
+
35
+ def _parse_fail(name, var_type, value, values):
36
+ """Helper function for raising a value error for bad assignment."""
37
+ raise ValueError(
38
+ "Could not parse hparam '%s' of type '%s' with value '%s' in %s"
39
+ % (name, var_type.__name__, value, values)
40
+ )
41
+
42
+
43
+ def _reuse_fail(name, values):
44
+ """Helper function for raising a value error for reuse of name."""
45
+ raise ValueError("Multiple assignments to variable '%s' in %s" % (name, values))
46
+
47
+
48
+ def _process_scalar_value(name, parse_fn, var_type, m_dict, values, results_dictionary):
49
+ """Update results_dictionary with a scalar value.
50
+
51
+ Used to update the results_dictionary to be returned by parse_values when
52
+ encountering a clause with a scalar RHS (e.g. "s=5" or "arr[0]=5".)
53
+
54
+ Mutates results_dictionary.
55
+
56
+ Args:
57
+ name: Name of variable in assignment ("s" or "arr").
58
+ parse_fn: Function for parsing the actual value.
59
+ var_type: Type of named variable.
60
+ m_dict: Dictionary constructed from regex parsing.
61
+ m_dict['val']: RHS value (scalar)
62
+ m_dict['index']: List index value (or None)
63
+ values: Full expression being parsed
64
+ results_dictionary: The dictionary being updated for return by the parsing
65
+ function.
66
+
67
+ Raises:
68
+ ValueError: If the name has already been used.
69
+ """
70
+ try:
71
+ parsed_value = parse_fn(m_dict["val"])
72
+ except ValueError:
73
+ _parse_fail(name, var_type, m_dict["val"], values)
74
+
75
+ # If no index is provided
76
+ if not m_dict["index"]:
77
+ if name in results_dictionary:
78
+ _reuse_fail(name, values)
79
+ results_dictionary[name] = parsed_value
80
+ else:
81
+ if name in results_dictionary:
82
+ # If the name has already been used as a scalar, it
83
+ # will be in this dictionary and map to a non-dictionary.
84
+ if not isinstance(results_dictionary.get(name), dict):
85
+ _reuse_fail(name, values)
86
+ else:
87
+ results_dictionary[name] = {}
88
+
89
+ index = int(m_dict["index"])
90
+ # Make sure the index position hasn't already been assigned a value.
91
+ if index in results_dictionary[name]:
92
+ _reuse_fail("{}[{}]".format(name, index), values)
93
+ results_dictionary[name][index] = parsed_value
94
+
95
+
96
+ def _process_list_value(name, parse_fn, var_type, m_dict, values, results_dictionary):
97
+ """Update results_dictionary from a list of values.
98
+
99
+ Used to update results_dictionary to be returned by parse_values when
100
+ encountering a clause with a list RHS (e.g. "arr=[1,2,3]".)
101
+
102
+ Mutates results_dictionary.
103
+
104
+ Args:
105
+ name: Name of variable in assignment ("arr").
106
+ parse_fn: Function for parsing individual values.
107
+ var_type: Type of named variable.
108
+ m_dict: Dictionary constructed from regex parsing.
109
+ m_dict['val']: RHS value (scalar)
110
+ values: Full expression being parsed
111
+ results_dictionary: The dictionary being updated for return by the parsing
112
+ function.
113
+
114
+ Raises:
115
+ ValueError: If the name has an index or the values cannot be parsed.
116
+ """
117
+ if m_dict["index"] is not None:
118
+ raise ValueError("Assignment of a list to a list index.")
119
+ elements = filter(None, re.split("[ ,]", m_dict["vals"]))
120
+ # Make sure the name hasn't already been assigned a value
121
+ if name in results_dictionary:
122
+ _reuse_fail(name, values)  # _reuse_fail raises ValueError itself
123
+ try:
124
+ results_dictionary[name] = [parse_fn(e) for e in elements]
125
+ except ValueError:
126
+ _parse_fail(name, var_type, m_dict["vals"], values)
127
+
128
+
129
+ def _cast_to_type_if_compatible(name, param_type, value):
130
+ """Cast hparam to the provided type, if compatible.
131
+
132
+ Args:
133
+ name: Name of the hparam to be cast.
134
+ param_type: The type of the hparam.
135
+ value: The value to be cast, if compatible.
136
+
137
+ Returns:
138
+ The result of casting `value` to `param_type`.
139
+
140
+ Raises:
141
+ ValueError: If the type of `value` is not compatible with param_type.
142
+ * If `param_type` is a string type, but `value` is not.
143
+ * If `param_type` is a boolean, but `value` is not, or vice versa.
144
+ * If `param_type` is an integer type, but `value` is not.
145
+ * If `param_type` is a float type, but `value` is not a numeric type.
146
+ """
147
+ fail_msg = "Could not cast hparam '%s' of type '%s' from value %r" % (
148
+ name,
149
+ param_type,
150
+ value,
151
+ )
152
+
153
+ # Some callers use None, for which we can't do any casting/checking. :(
154
+ if issubclass(param_type, type(None)):
155
+ return value
156
+
157
+ # Avoid converting a non-string type to a string.
158
+ if issubclass(param_type, (six.string_types, six.binary_type)) and not isinstance(
159
+ value, (six.string_types, six.binary_type)
160
+ ):
161
+ raise ValueError(fail_msg)
162
+
163
+ # Avoid converting a number or string type to a boolean or vice versa.
164
+ if issubclass(param_type, bool) != isinstance(value, bool):
165
+ raise ValueError(fail_msg)
166
+
167
+ # Avoid converting float to an integer (the reverse is fine).
168
+ if issubclass(param_type, numbers.Integral) and not isinstance(
169
+ value, numbers.Integral
170
+ ):
171
+ raise ValueError(fail_msg)
172
+
173
+ # Avoid converting a non-numeric type to a numeric type.
174
+ if issubclass(param_type, numbers.Number) and not isinstance(value, numbers.Number):
175
+ raise ValueError(fail_msg)
176
+
177
+ return param_type(value)
178
+
179
+
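The casting rules are asymmetric: an int may widen to float, but a float never narrows to int, and bool/str never mix with numerics. For instance:

_cast_to_type_if_compatible("lr", float, 1)       # 1.0  (int widens to float)
_cast_to_type_if_compatible("steps", int, 1.5)    # raises ValueError
_cast_to_type_if_compatible("use_gpu", bool, 1)   # raises ValueError (1 is not a bool)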
180
+ def parse_values(values, type_map, ignore_unknown=False):
181
+ """Parses hyperparameter values from a string into a python map.
182
+
183
+ `values` is a string containing comma-separated `name=value` pairs.
184
+ For each pair, the value of the hyperparameter named `name` is set to
185
+ `value`.
186
+
187
+ If a hyperparameter name appears multiple times in `values`, a ValueError
188
+ is raised (e.g. 'a=1,a=2', 'a[1]=1,a[1]=2').
189
+
190
+ If a hyperparameter name in both an index assignment and scalar assignment,
191
+ a ValueError is raised. (e.g. 'a=[1,2,3],a[0] = 1').
192
+
193
+ The hyperparameter name may contain '.' symbols, which will result in an
194
+ attribute name that is only accessible through the getattr and setattr
195
+ functions. (And must first be explicitly added through add_hparam.)
196
+
197
+ WARNING: Use of '.' in your variable names is allowed, but is not well
198
+ supported and not recommended.
199
+
200
+ The `value` in `name=value` must follow the syntax according to the
201
+ type of the parameter:
202
+
203
+ * Scalar integer: A Python-parsable integer value. E.g.: 1,
204
+ 100, -12.
205
+ * Scalar float: A Python-parsable floating point value. E.g.: 1.0,
206
+ -.54e89.
207
+ * Boolean: Either true or false.
208
+ * Scalar string: A non-empty sequence of characters, excluding comma,
209
+ spaces, and square brackets. E.g.: foo, bar_1.
210
+ * List: A comma separated list of scalar values of the parameter type
211
+ enclosed in square brackets. E.g.: [1,2,3], [1.0,1e-12], [high,low].
212
+
213
+ When index assignment is used, the corresponding type_map key should be the
214
+ list name. E.g. for "arr[1]=0" the type_map must have the key "arr" (not
215
+ "arr[1]").
216
+
217
+ Args:
218
+ values: String. Comma separated list of `name=value` pairs where
219
+ 'value' must follow the syntax described above.
220
+ type_map: A dictionary mapping hyperparameter names to types. Note every
221
+ parameter name in values must be a key in type_map. The values must
222
+ conform to the types indicated, where a value V is said to conform to a
223
+ type T if either V has type T, or V is a list of elements of type T.
224
+ Hence, for a multidimensional parameter 'x' taking float values,
225
+ 'x=[0.1,0.2]' will parse successfully if type_map['x'] = float.
226
+ ignore_unknown: Bool. Whether values that are missing a type in type_map
227
+ should be ignored. If set to True, a ValueError will not be raised for
228
+ unknown hyperparameter type.
229
+
230
+ Returns:
231
+ A python map mapping each name to either:
232
+ * A scalar value.
233
+ * A list of scalar values.
234
+ * A dictionary mapping index numbers to scalar values.
235
+ (e.g. "x=5,L=[1,2],arr[1]=3" results in {'x':5,'L':[1,2],'arr':{1:3}}")
236
+
237
+ Raises:
238
+ ValueError: If there is a problem with input.
239
+ * If `values` cannot be parsed.
240
+ * If a list is assigned to a list index (e.g. 'a[1] = [1,2,3]').
241
+ * If the same rvalue is assigned two different values (e.g. 'a=1,a=2',
242
+ 'a[1]=1,a[1]=2', or 'a=1,a=[1]')
243
+ """
244
+ results_dictionary = {}
245
+ pos = 0
246
+ while pos < len(values):
247
+ m = PARAM_RE.match(values, pos)
248
+ if not m:
249
+ raise ValueError("Malformed hyperparameter value: %s" % values[pos:])
250
+ # Check that there is a comma between parameters and move past it.
251
+ pos = m.end()
252
+ # Parse the values.
253
+ m_dict = m.groupdict()
254
+ name = m_dict["name"]
255
+ if name not in type_map:
256
+ if ignore_unknown:
257
+ continue
258
+ raise ValueError("Unknown hyperparameter type for %s" % name)
259
+ type_ = type_map[name]
260
+
261
+ # Set up correct parsing function (depending on whether type_ is a bool)
262
+ if type_ == bool:
263
+
264
+ def parse_bool(value):
265
+ if value in ["true", "True"]:
266
+ return True
267
+ elif value in ["false", "False"]:
268
+ return False
269
+ else:
270
+ try:
271
+ return bool(int(value))
272
+ except ValueError:
273
+ _parse_fail(name, type_, value, values)
274
+
275
+ parse = parse_bool
276
+ else:
277
+ parse = type_
278
+
279
+ # If a single value is provided
280
+ if m_dict["val"] is not None:
281
+ _process_scalar_value(
282
+ name, parse, type_, m_dict, values, results_dictionary
283
+ )
284
+
285
+ # If the assigned value is a list:
286
+ elif m_dict["vals"] is not None:
287
+ _process_list_value(name, parse, type_, m_dict, values, results_dictionary)
288
+
289
+ else: # Not assigned a list or value
290
+ _parse_fail(name, type_, "", values)
291
+
292
+ return results_dictionary
293
+
294
+
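Tying the pieces together, parse_values maps an override string to a plain dict, with type_map supplying the per-name parser; for example (names are made up):

type_map = {"lr": float, "layers": int, "dims": int, "arr": int}
print(parse_values("lr=0.3,layers=4,dims=[128,256],arr[1]=7", type_map))
# {'lr': 0.3, 'layers': 4, 'dims': [128, 256], 'arr': {1: 7}}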
295
+ class HParams(object):
296
+ """Class to hold a set of hyperparameters as name-value pairs.
297
+
298
+ A `HParams` object holds hyperparameters used to build and train a model,
299
+ such as the number of hidden units in a neural net layer or the learning rate
300
+ to use when training.
301
+
302
+ You first create a `HParams` object by specifying the names and values of the
303
+ hyperparameters.
304
+
305
+ To make them easily accessible the parameter names are added as direct
306
+ attributes of the class. A typical usage is as follows:
307
+
308
+ ```python
309
+ # Create a HParams object specifying names and values of the model
310
+ # hyperparameters:
311
+ hparams = HParams(learning_rate=0.1, num_hidden_units=100)
312
+
313
+ # The hyperparameters are available as attributes of the HParams object:
314
+ hparams.learning_rate ==> 0.1
315
+ hparams.num_hidden_units ==> 100
316
+ ```
317
+
318
+ Hyperparameters have type, which is inferred from the type of their value
319
+ passed at construction time. The currently supported types are: integer,
320
+ float, boolean, string, and list of integer, float, boolean, or string.
321
+
322
+ You can override hyperparameter values by calling the
323
+ [`parse()`](#HParams.parse) method, passing a string of comma separated
324
+ `name=value` pairs. This is intended to make it possible to override
325
+ any hyperparameter values from a single command-line flag to which
326
+ the user passes 'hyper-param=value' pairs. It avoids having to define
327
+ one flag for each hyperparameter.
328
+
329
+ The syntax expected for each value depends on the type of the parameter.
330
+ See `parse()` for a description of the syntax.
331
+
332
+ Example:
333
+
334
+ ```python
335
+ # Define a command line flag to pass name=value pairs.
336
+ # For example using argparse:
337
+ import argparse
338
+ parser = argparse.ArgumentParser(description='Train my model.')
339
+ parser.add_argument('--hparams', type=str,
340
+ help='Comma separated list of "name=value" pairs.')
341
+ args = parser.parse_args()
342
+ ...
343
+ def my_program():
344
+ # Create a HParams object specifying the names and values of the
345
+ # model hyperparameters:
346
+ hparams = tf.HParams(learning_rate=0.1, num_hidden_units=100,
347
+ activations=['relu', 'tanh'])
348
+
349
+ # Override hyperparameters values by parsing the command line
350
+ hparams.parse(args.hparams)
351
+
352
+ # If the user passed `--hparams=learning_rate=0.3` on the command line
353
+ # then 'hparams' has the following attributes:
354
+ hparams.learning_rate ==> 0.3
355
+ hparams.num_hidden_units ==> 100
356
+ hparams.activations ==> ['relu', 'tanh']
357
+
358
+ # If the hyperparameters are in json format use parse_json:
359
+ hparams.parse_json('{"learning_rate": 0.3, "activations": "relu"}')
360
+ ```
361
+ """
362
+
363
+ _HAS_DYNAMIC_ATTRIBUTES = True # Required for pytype checks.
364
+
365
+ def __init__(self, model_structure=None, **kwargs):
366
+ """Create an instance of `HParams` from keyword arguments.
367
+
368
+ The keyword arguments specify name-value pairs for the hyperparameters.
369
+ The parameter types are inferred from the type of the values passed.
370
+
371
+ The parameter names are added as attributes of the `HParams` object, so they
372
+ can be accessed directly with the dot notation `hparams._name_`.
373
+
374
+ Example:
375
+
376
+ ```python
377
+ # Define 3 hyperparameters: 'learning_rate' is a float parameter,
378
+ # 'num_hidden_units' an integer parameter, and 'activation' a string
379
+ # parameter.
380
+ hparams = HParams(
381
+ learning_rate=0.1, num_hidden_units=100, activation='relu')
382
+
383
+ hparams.activation ==> 'relu'
384
+ ```
385
+
386
+ Note that a few names are reserved and cannot be used as hyperparameter
387
+ names. If you use one of the reserved names, the constructor raises a
388
+ `ValueError`.
389
+
390
+ Args:
391
+ model_structure: An instance of ModelStructure, defining the feature
392
+ crosses to be used in the Trial.
393
+ **kwargs: Key-value pairs where the key is the hyperparameter name and
394
+ the value is the value for the parameter.
395
+
396
+ Raises:
397
+ ValueError: If both `hparam_def` and initialization values are provided,
398
+ or if one of the arguments is invalid.
399
+
400
+ """
401
+ # Register the hyperparameters and their type in _hparam_types.
402
+ # This simplifies the implementation of parse().
403
+ # _hparam_types maps the parameter name to a tuple (type, bool).
404
+ # The type value is the type of the parameter for scalar hyperparameters,
405
+ # or the type of the list elements for multidimensional hyperparameters.
406
+ # The bool value is True if the value is a list, False otherwise.
407
+ self._hparam_types = {}
408
+ self._model_structure = model_structure
409
+ for name, value in six.iteritems(kwargs):
410
+ self.add_hparam(name, value)
411
+
412
+ def add_hparam(self, name, value):
413
+ """Adds {name, value} pair to hyperparameters.
414
+
415
+ Args:
416
+ name: Name of the hyperparameter.
417
+ value: Value of the hyperparameter. Can be one of the following types:
418
+ int, float, string, int list, float list, or string list.
419
+
420
+ Raises:
421
+ ValueError: if one of the arguments is invalid.
422
+ """
423
+ # Keys in kwargs are unique, but 'name' could be the name of a pre-existing
424
+ # attribute of this object. In that case we refuse to use it as a
425
+ # hyperparameter name.
426
+ if getattr(self, name, None) is not None:
427
+ raise ValueError("Hyperparameter name is reserved: %s" % name)
428
+ if isinstance(value, (list, tuple)):
429
+ if not value:
430
+ raise ValueError(
431
+ "Multi-valued hyperparameters cannot be empty: %s" % name
432
+ )
433
+ self._hparam_types[name] = (type(value[0]), True)
434
+ else:
435
+ self._hparam_types[name] = (type(value), False)
436
+ setattr(self, name, value)
437
+
438
+ def set_hparam(self, name, value):
439
+ """Set the value of an existing hyperparameter.
440
+
441
+ This function verifies that the type of the value matches the type of the
442
+ existing hyperparameter.
443
+
444
+ Args:
445
+ name: Name of the hyperparameter.
446
+ value: New value of the hyperparameter.
447
+
448
+ Raises:
449
+ KeyError: If the hyperparameter doesn't exist.
450
+ ValueError: If there is a type mismatch.
451
+ """
452
+ param_type, is_list = self._hparam_types[name]
453
+ if isinstance(value, list):
454
+ if not is_list:
455
+ raise ValueError(
456
+ "Must not pass a list for single-valued parameter: %s" % name
457
+ )
458
+ setattr(
459
+ self,
460
+ name,
461
+ [_cast_to_type_if_compatible(name, param_type, v) for v in value],
462
+ )
463
+ else:
464
+ if is_list:
465
+ raise ValueError(
466
+ "Must pass a list for multi-valued parameter: %s." % name
467
+ )
468
+ setattr(self, name, _cast_to_type_if_compatible(name, param_type, value))
469
+
470
+ def del_hparam(self, name):
471
+ """Removes the hyperparameter with key 'name'.
472
+
473
+ Does nothing if it isn't present.
474
+
475
+ Args:
476
+ name: Name of the hyperparameter.
477
+ """
478
+ if hasattr(self, name):
479
+ delattr(self, name)
480
+ del self._hparam_types[name]
481
+
482
+ def parse(self, values):
483
+ """Override existing hyperparameter values, parsing new values from a string.
484
+
485
+ See parse_values for more detail on the allowed format for values.
486
+
487
+ Args:
488
+ values: String. Comma separated list of `name=value` pairs where 'value'
489
+ must follow the syntax described above.
490
+
491
+ Returns:
492
+ The `HParams` instance.
493
+
494
+ Raises:
495
+ ValueError: If `values` cannot be parsed or a hyperparameter in `values`
496
+ doesn't exist.
497
+ """
498
+ type_map = {}
499
+ for name, t in self._hparam_types.items():
500
+ param_type, _ = t
501
+ type_map[name] = param_type
502
+
503
+ values_map = parse_values(values, type_map)
504
+ return self.override_from_dict(values_map)
505
+
506
+ def override_from_dict(self, values_dict):
507
+ """Override existing hyperparameter values, parsing new values from a dictionary.
508
+
509
+ Args:
510
+ values_dict: Dictionary of name:value pairs.
511
+
512
+ Returns:
513
+ The `HParams` instance.
514
+
515
+ Raises:
516
+ KeyError: If a hyperparameter in `values_dict` doesn't exist.
517
+ ValueError: If `values_dict` cannot be parsed.
518
+ """
519
+ for name, value in values_dict.items():
520
+ self.set_hparam(name, value)
521
+ return self
522
+
523
+ def set_model_structure(self, model_structure):
524
+ self._model_structure = model_structure
525
+
526
+ def get_model_structure(self):
527
+ return self._model_structure
528
+
529
+ def to_json(self, indent=None, separators=None, sort_keys=False):
530
+ """Serializes the hyperparameters into JSON.
531
+
532
+ Args:
533
+ indent: If a non-negative integer, JSON array elements and object members
534
+ will be pretty-printed with that indent level. An indent level of 0, or
535
+ negative, will only insert newlines. `None` (the default) selects the
536
+ most compact representation.
537
+ separators: Optional `(item_separator, key_separator)` tuple. Default is
538
+ `(', ', ': ')`.
539
+ sort_keys: If `True`, the output dictionaries will be sorted by key.
540
+
541
+ Returns:
542
+ A JSON string.
543
+ """
544
+
545
+ def remove_callables(x):
546
+ """Omit callable elements from input with arbitrary nesting."""
547
+ if isinstance(x, dict):
548
+ return {
549
+ k: remove_callables(v)
550
+ for k, v in six.iteritems(x)
551
+ if not callable(v)
552
+ }
553
+ elif isinstance(x, list):
554
+ return [remove_callables(i) for i in x if not callable(i)]
555
+ return x
556
+
557
+ return json.dumps(
558
+ remove_callables(self.values()),
559
+ indent=indent,
560
+ separators=separators,
561
+ sort_keys=sort_keys,
562
+ )
563
+
564
+ def parse_json(self, values_json):
565
+ """Override existing hyperparameter values, parsing new values from a json object.
566
+
567
+ Args:
568
+ values_json: String containing a json object of name:value pairs.
569
+
570
+ Returns:
571
+ The `HParams` instance.
572
+
573
+ Raises:
574
+ KeyError: If a hyperparameter in `values_json` doesn't exist.
575
+ ValueError: If `values_json` cannot be parsed.
576
+ """
577
+ values_map = json.loads(values_json)
578
+ return self.override_from_dict(values_map)
579
+
580
+ def values(self):
581
+ """Return the hyperparameter values as a Python dictionary.
582
+
583
+ Returns:
584
+ A dictionary with hyperparameter names as keys. The values are the
585
+ hyperparameter values.
586
+ """
587
+ return {n: getattr(self, n) for n in self._hparam_types.keys()}
588
+
589
+ def get(self, key, default=None):
590
+ """Returns the value of `key` if it exists, else `default`."""
591
+ if key in self._hparam_types:
592
+ # Ensure that default is compatible with the parameter type.
593
+ if default is not None:
594
+ param_type, is_param_list = self._hparam_types[key]
595
+ type_str = "list<%s>" % param_type if is_param_list else str(param_type)
596
+ fail_msg = (
597
+ "Hparam '%s' of type '%s' is incompatible with "
598
+ "default=%s" % (key, type_str, default)
599
+ )
600
+
601
+ is_default_list = isinstance(default, list)
602
+ if is_param_list != is_default_list:
603
+ raise ValueError(fail_msg)
604
+
605
+ try:
606
+ if is_default_list:
607
+ for value in default:
608
+ _cast_to_type_if_compatible(key, param_type, value)
609
+ else:
610
+ _cast_to_type_if_compatible(key, param_type, default)
611
+ except ValueError as e:
612
+ raise ValueError("%s. %s" % (fail_msg, e))
613
+
614
+ return getattr(self, key)
615
+
616
+ return default
617
+
618
+ def __contains__(self, key):
619
+ return key in self._hparam_types
620
+
621
+ def __str__(self):
622
+ return str(sorted(self.values().items()))
623
+
624
+ def __repr__(self):
625
+ return "%s(%s)" % (type(self).__name__, self.__str__())
626
+
627
+ @staticmethod
628
+ def _get_kind_name(param_type, is_list):
629
+ """Returns the field name given parameter type and is_list.
630
+
631
+ Args:
632
+ param_type: Data type of the hparam.
633
+ is_list: Whether this is a list.
634
+
635
+ Returns:
636
+ A string representation of the field name.
637
+
638
+ Raises:
639
+ ValueError: If parameter type is not recognized.
640
+ """
641
+ if issubclass(param_type, bool):
642
+ # This check must happen before issubclass(param_type, six.integer_types),
643
+ # since Python considers bool to be a subclass of int.
644
+ typename = "bool"
645
+ elif issubclass(param_type, six.integer_types):
646
+ # Setting 'int' and 'long' types to be 'int64' to ensure the type is
647
+ # compatible with both Python2 and Python3.
648
+ typename = "int64"
649
+ elif issubclass(param_type, (six.string_types, six.binary_type)):
650
+ # Setting 'string' and 'bytes' types to be 'bytes' to ensure the type is
651
+ # compatible with both Python2 and Python3.
652
+ typename = "bytes"
653
+ elif issubclass(param_type, float):
654
+ typename = "float"
655
+ else:
656
+ raise ValueError("Unsupported parameter type: %s" % str(param_type))
657
+
658
+ suffix = "list" if is_list else "value"
659
+ return "_".join([typename, suffix])
utils/hubert.py ADDED
@@ -0,0 +1,155 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from https://github.com/svc-develop-team/so-vits-svc/blob/4.0/preprocess_hubert_f0.py
7
+
8
+ import os
9
+ import librosa
10
+ import torch
11
+ import numpy as np
12
+ from fairseq import checkpoint_utils
13
+ from tqdm import tqdm
14
+ import torch
15
+
16
+
17
+ def load_hubert_model(hps):
18
+ # Load model
19
+ ckpt_path = hps.hubert_file
20
+ print("Load Hubert Model...")
21
+
22
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
23
+ [ckpt_path],
24
+ suffix="",
25
+ )
26
+ model = models[0]
27
+ model.eval()
28
+
29
+ if torch.cuda.is_available():
30
+ model = model.cuda()
31
+
32
+ return model
33
+
34
+
35
+ def get_hubert_content(hmodel, wav_16k_tensor):
36
+ feats = wav_16k_tensor
37
+ if feats.dim() == 2: # double channels
38
+ feats = feats.mean(-1)
39
+ assert feats.dim() == 1, feats.dim()
40
+ feats = feats.view(1, -1)
41
+ padding_mask = torch.BoolTensor(feats.shape).fill_(False)
42
+ inputs = {
43
+ "source": feats.to(wav_16k_tensor.device),
44
+ "padding_mask": padding_mask.to(wav_16k_tensor.device),
45
+ "output_layer": 9, # layer 9
46
+ }
47
+ with torch.no_grad():
48
+ logits = hmodel.extract_features(**inputs)
49
+ feats = hmodel.final_proj(logits[0]).squeeze(0)
50
+
51
+ return feats
52
+
53
+
54
+ def content_vector_encoder(model, audio_path, default_sampling_rate=16000):
55
+ """
56
+ # content vector default sr: 16000
57
+ """
58
+
59
+ wav16k, sr = librosa.load(audio_path, sr=default_sampling_rate)
60
+ device = next(model.parameters()).device
61
+ wav16k = torch.from_numpy(wav16k).to(device)
62
+
63
+ # (1, 256, frame_len)
64
+ content_feature = get_hubert_content(model, wav_16k_tensor=wav16k)
65
+
66
+ return content_feature.cpu().detach().numpy()
67
+
68
+
69
+ def repeat_expand_2d(content, target_len):
70
+ """
71
+ content : [hubert_dim(256), src_len]
72
+ target: [hubert_dim(256), target_len]
73
+ """
74
+ src_len = content.shape[-1]
75
+ target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(
76
+ content.device
77
+ )
78
+ temp = torch.arange(src_len + 1) * target_len / src_len
79
+ current_pos = 0
80
+ for i in range(target_len):
81
+ if i < temp[current_pos + 1]:
82
+ target[:, i] = content[:, current_pos]
83
+ else:
84
+ current_pos += 1
85
+ target[:, i] = content[:, current_pos]
86
+
87
+ return target
88
+
89
+
90
+ def get_mapped_features(raw_content_features, mapping_features):
91
+ """
92
+ Content Vector: frameshift = 20ms, hop_size = 480 in 24k
93
+
94
+ Now it's only used for mapping to bigvgan's mels (sr = 24k, hop_size = 256, frameshift ~= 10.7 ms)
95
+ """
96
+ source_hop = 480
97
+ target_hop = 256
98
+
99
+ factor = np.gcd(source_hop, target_hop)
100
+ source_hop //= factor
101
+ target_hop //= factor
102
+ print(
103
+ "Mapping source's {} frames => target's {} frames".format(
104
+ target_hop, source_hop
105
+ )
106
+ )
107
+
108
+ results = []
109
+ for index, mapping_feat in enumerate(tqdm(mapping_features)):
110
+ # mappping_feat: (mels_frame_len, n_mels)
111
+ target_len = len(mapping_feat)
112
+
113
+ # (source_len, 256)
114
+ raw_feats = raw_content_features[index][0].cpu().numpy().T
115
+ source_len, width = raw_feats.shape
116
+
117
+ # const ~= target_len * target_hop
118
+ const = source_len * source_hop // target_hop * target_hop
119
+
120
+ # (source_len * source_hop, dim)
121
+ up_sampling_feats = np.repeat(raw_feats, source_hop, axis=0)
122
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
123
+ down_sampling_feats = np.average(
124
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
125
+ )
126
+
127
+ err = abs(target_len - len(down_sampling_feats))
128
+ if err > 3:
129
+ print("index:", index)
130
+ print("mels:", mapping_feat.shape)
131
+ print("raw content vector:", raw_feats.shape)
132
+ print("up_sampling:", up_sampling_feats.shape)
133
+ print("down_sampling_feats:", down_sampling_feats.shape)
134
+ exit()
135
+ if len(down_sampling_feats) < target_len:
136
+ # (1, dim) -> (err, dim)
137
+ end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
138
+ down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)
139
+
140
+ # (target_len, dim)
141
+ feats = down_sampling_feats[:target_len]
142
+ results.append(feats)
143
+
144
+ return results
145
+
146
+
147
+ def extract_hubert_features_of_dataset(datasets, model, out_dir):
148
+ for utt in tqdm(datasets):
149
+ uid = utt["Uid"]
150
+ audio_path = utt["Path"]
151
+
152
+ content_vector_feature = content_vector_encoder(model, audio_path) # (T, 256)
153
+
154
+ save_path = os.path.join(out_dir, uid + ".npy")
155
+ np.save(save_path, content_vector_feature)
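The hop-size mapping in `get_mapped_features` above boils down to repeat-then-average resampling; a small self-contained sketch of the same arithmetic on dummy data (shapes are illustrative):

```python
import numpy as np

# gcd(480, 256) = 16, so the hops reduce to 30 (source) and 16 (target).
source_hop, target_hop = 480, 256
factor = np.gcd(source_hop, target_hop)
source_hop, target_hop = source_hop // factor, target_hop // factor

raw_feats = np.random.randn(10, 256)           # (source_len, dim)
up = np.repeat(raw_feats, source_hop, axis=0)  # (source_len * 30, dim)
const = len(up) // target_hop * target_hop
down = np.average(up[:const].reshape(-1, target_hop, raw_feats.shape[1]), axis=1)
print(down.shape)  # (18, 256): ~ source_len * 30 / 16 frames at the target rate
```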
utils/io.py ADDED
@@ -0,0 +1,182 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import numpy as np
8
+ import torch
9
+ import torchaudio
10
+
11
+
12
+ def save_feature(process_dir, feature_dir, item, feature, overrides=True):
13
+ """Save features to path
14
+
15
+ Args:
16
+ process_dir (str): directory to store features
17
+ feature_dir (_type_): directory to store one type of features (mel, energy, ...)
18
+ item (str): uid
19
+ feature (tensor): feature tensor
20
+ overrides (bool, optional): whether to override existing files. Defaults to True.
21
+ """
22
+ process_dir = os.path.join(process_dir, feature_dir)
23
+ os.makedirs(process_dir, exist_ok=True)
24
+ out_path = os.path.join(process_dir, item + ".npy")
25
+
26
+ if os.path.exists(out_path):
27
+ if overrides:
28
+ np.save(out_path, feature)
29
+ else:
30
+ np.save(out_path, feature)
31
+
32
+
33
+ def save_txt(process_dir, feature_dir, item, feature, overrides=True):
34
+ process_dir = os.path.join(process_dir, feature_dir)
35
+ os.makedirs(process_dir, exist_ok=True)
36
+ out_path = os.path.join(process_dir, item + ".txt")
37
+
38
+ if os.path.exists(out_path):
39
+ if overrides:
40
+ f = open(out_path, "w")
41
+ f.writelines(feature)
42
+ f.close()
43
+ else:
44
+ f = open(out_path, "w")
45
+ f.writelines(feature)
46
+ f.close()
47
+
48
+
49
+ def save_audio(path, waveform, fs, add_silence=False, turn_up=False, volume_peak=0.9):
50
+ """Save audio to path with processing (turn up volume, add silence)
51
+ Args:
52
+ path (str): path to save audio
53
+ waveform (numpy array): waveform to save
54
+ fs (int): sampling rate
55
+ add_silence (bool, optional): whether to add silence to beginning and end. Defaults to False.
56
+ turn_up (bool, optional): whether to turn up volume. Defaults to False.
57
+ volume_peak (float, optional): volume peak. Defaults to 0.9.
58
+ """
59
+ if turn_up:
60
+ # continue to turn up to volume_peak
61
+ ratio = volume_peak / max(waveform.max(), abs(waveform.min()))
62
+ waveform = waveform * ratio
63
+
64
+ if add_silence:
65
+ silence_len = fs // 20
66
+ silence = np.zeros((silence_len,), dtype=waveform.dtype)
67
+ result = np.concatenate([silence, waveform, silence])
68
+ waveform = result
69
+
70
+ waveform = torch.as_tensor(waveform, dtype=torch.float32, device="cpu")
71
+ if len(waveform.size()) == 1:
72
+ waveform = waveform[None, :]
73
+ elif waveform.size(0) != 1:
74
+ # Stereo to mono
75
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
76
+ torchaudio.save(path, waveform, fs, encoding="PCM_S", bits_per_sample=16)
77
+
78
+
79
+ def save_torch_audio(process_dir, feature_dir, item, wav_torch, fs, overrides=True):
80
+ """Save torch audio to path without processing
81
+ Args:
82
+ process_dir (str): directory to store features
83
+ feature_dir (_type_): directory to store one type of features (mel, energy, ...)
84
+ item (str): uid
85
+ wav_torch (tensor): feature tensor
86
+ fs (int): sampling rate
87
+ overrides (bool, optional): whether to override existing files. Defaults to True.
88
+ """
89
+ if wav_torch.dim() != 2:
90
+ wav_torch = wav_torch.unsqueeze(0)
91
+
92
+ process_dir = os.path.join(process_dir, feature_dir)
93
+ os.makedirs(process_dir, exist_ok=True)
94
+ out_path = os.path.join(process_dir, item + ".wav")
95
+
96
+ torchaudio.save(out_path, wav_torch, fs)
97
+
98
+
99
+ async def async_load_audio(path, sample_rate: int = 24000):
100
+ r"""
101
+ Args:
102
+ path: The source loading path.
103
+ sample_rate: The target sample rate, will automatically resample if necessary.
104
+
105
+ Returns:
106
+ waveform: The waveform object. Should be [1 x sequence_len].
107
+ """
108
+
109
+ async def use_torchaudio_load(path):
110
+ return torchaudio.load(path)
111
+
112
+ waveform, sr = await use_torchaudio_load(path)
113
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
114
+
115
+ if sr != sample_rate:
116
+ waveform = torchaudio.functional.resample(waveform, sr, sample_rate)
117
+
118
+ if torch.any(torch.isnan(waveform)) or torch.any(torch.isinf(waveform)):
119
+ raise ValueError("NaN or Inf found in waveform.")
120
+ return waveform
121
+
122
+
123
+ async def async_save_audio(
124
+ path,
125
+ waveform,
126
+ sample_rate: int = 24000,
127
+ add_silence: bool = False,
128
+ volume_peak: float = 0.9,
129
+ ):
130
+ r"""
131
+ Args:
132
+ path: The target saving path.
133
+ waveform: The waveform object. Should be [n_channel x sequence_len].
134
+ sample_rate: Sample rate.
135
+ add_silence: If ``true``, concat 0.05s silence to beginning and end.
136
+ volume_peak: Turn up volume for larger number, vice versa.
137
+ """
138
+
139
+ async def use_torchaudio_save(path, waveform, sample_rate):
140
+ torchaudio.save(
141
+ path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16
142
+ )
143
+
144
+ waveform = torch.as_tensor(waveform, device="cpu", dtype=torch.float32)
145
+ shape = waveform.size()[:-1]
146
+
147
+ ratio = abs(volume_peak) / max(waveform.max(), abs(waveform.min()))
148
+ waveform = waveform * ratio
149
+
150
+ if add_silence:
151
+ silence_len = sample_rate // 20
152
+ silence = torch.zeros((*shape, silence_len), dtype=waveform.dtype)
153
+ waveform = torch.concatenate((silence, waveform, silence), dim=-1)
154
+
155
+ if waveform.dim() == 1:
156
+ waveform = waveform[None]
157
+
158
+ await use_torchaudio_save(path, waveform, sample_rate)
159
+
160
+
161
+ def load_mel_extrema(cfg, dataset_name, split):
162
+ dataset_dir = os.path.join(
163
+ cfg.OUTPUT_PATH,
164
+ "preprocess/{}_version".format(cfg.data.process_version),
165
+ dataset_name,
166
+ )
167
+
168
+ min_file = os.path.join(
169
+ dataset_dir,
170
+ "mel_min_max",
171
+ split.split("_")[-1],
172
+ "mel_min.npy",
173
+ )
174
+ max_file = os.path.join(
175
+ dataset_dir,
176
+ "mel_min_max",
177
+ split.split("_")[-1],
178
+ "mel_max.npy",
179
+ )
180
+ mel_min = np.load(min_file)
181
+ mel_max = np.load(max_file)
182
+ return mel_min, mel_max
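A hedged sketch of the async helpers above; the file paths are placeholders:

```python
import asyncio

# Hypothetical round trip through async_load_audio / async_save_audio.
async def roundtrip(in_path="in.wav", out_path="out.wav"):
    waveform = await async_load_audio(in_path, sample_rate=24000)  # [1, T]
    await async_save_audio(out_path, waveform, sample_rate=24000, add_silence=True)

asyncio.run(roundtrip())
```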
utils/io_optim.py ADDED
@@ -0,0 +1,123 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torchaudio
8
+ import json
9
+ import os
10
+ import numpy as np
11
+ import librosa
12
+ import whisper
13
+ from torch.nn.utils.rnn import pad_sequence
14
+
15
+
16
+ class TorchaudioDataset(torch.utils.data.Dataset):
17
+ def __init__(self, cfg, dataset, sr, accelerator=None, metadata=None):
18
+ """
19
+ Args:
20
+ cfg: config
21
+ dataset: dataset name
22
+
23
+ """
24
+ assert isinstance(dataset, str)
25
+
26
+ self.sr = sr
27
+ self.cfg = cfg
28
+
29
+ if metadata is None:
30
+ self.train_metadata_path = os.path.join(
31
+ cfg.preprocess.processed_dir, dataset, cfg.preprocess.train_file
32
+ )
33
+ self.valid_metadata_path = os.path.join(
34
+ cfg.preprocess.processed_dir, dataset, cfg.preprocess.valid_file
35
+ )
36
+ self.metadata = self.get_metadata()
37
+ else:
38
+ self.metadata = metadata
39
+
40
+ if accelerator is not None:
41
+ self.device = accelerator.device
42
+ elif torch.cuda.is_available():
43
+ self.device = torch.device("cuda")
44
+ else:
45
+ self.device = torch.device("cpu")
46
+
47
+ def get_metadata(self):
48
+ metadata = []
49
+ with open(self.train_metadata_path, "r", encoding="utf-8") as t:
50
+ metadata.extend(json.load(t))
51
+ with open(self.valid_metadata_path, "r", encoding="utf-8") as v:
52
+ metadata.extend(json.load(v))
53
+ return metadata
54
+
55
+ def __len__(self):
56
+ return len(self.metadata)
57
+
58
+ def __getitem__(self, index):
59
+ utt_info = self.metadata[index]
60
+ wav_path = utt_info["Path"]
61
+
62
+ wav, sr = torchaudio.load(wav_path)
63
+
64
+ # resample
65
+ if sr != self.sr:
66
+ wav = torchaudio.functional.resample(wav, sr, self.sr)
67
+ # downmixing
68
+ if wav.shape[0] > 1:
69
+ wav = torch.mean(wav, dim=0, keepdim=True)
70
+ assert wav.shape[0] == 1
71
+ wav = wav.squeeze(0)
72
+ # record the length of wav without padding
73
+ length = wav.shape[0]
74
+ # wav: (T)
75
+ return utt_info, wav, length
76
+
77
+
78
+ class LibrosaDataset(TorchaudioDataset):
79
+ def __init__(self, cfg, dataset, sr, accelerator=None, metadata=None):
80
+ super().__init__(cfg, dataset, sr, accelerator, metadata)
81
+
82
+ def __getitem__(self, index):
83
+ utt_info = self.metadata[index]
84
+ wav_path = utt_info["Path"]
85
+
86
+ wav, _ = librosa.load(wav_path, sr=self.sr)
87
+ # wav: (T)
88
+ wav = torch.from_numpy(wav)
89
+
90
+ # record the length of wav without padding
91
+ length = wav.shape[0]
92
+ return utt_info, wav, length
93
+
94
+
95
+ class FFmpegDataset(TorchaudioDataset):
96
+ def __init__(self, cfg, dataset, sr, accelerator=None, metadata=None):
97
+ super().__init__(cfg, dataset, sr, accelerator, metadata)
98
+
99
+ def __getitem__(self, index):
100
+ utt_info = self.metadata[index]
101
+ wav_path = utt_info["Path"]
102
+
103
+ # wav: (T,)
104
+ wav = whisper.load_audio(wav_path, sr=16000) # sr = 16000
105
+ # convert to torch tensor
106
+ wav = torch.from_numpy(wav)
107
+ # record the length of wav without padding
108
+ length = wav.shape[0]
109
+
110
+ return utt_info, wav, length
111
+
112
+
113
+ def collate_batch(batch_list):
114
+ """
115
+ Args:
116
+ batch_list: list of (metadata, wav, length)
117
+ """
118
+ metadata = [item[0] for item in batch_list]
119
+ # wavs: (B, T)
120
+ wavs = pad_sequence([item[1] for item in batch_list], batch_first=True)
121
+ lens = [item[2] for item in batch_list]
122
+
123
+ return metadata, wavs, lens
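These dataset classes are meant to feed a standard `DataLoader` through `collate_batch`; a sketch, assuming a `cfg` whose `preprocess.*` fields match what `__init__` reads and a hypothetical dataset name:

```python
from torch.utils.data import DataLoader

# "my_dataset" and cfg are assumptions for illustration only.
dataset = TorchaudioDataset(cfg, "my_dataset", sr=24000)
loader = DataLoader(dataset, batch_size=8, collate_fn=collate_batch, num_workers=4)

for metadata, wavs, lens in loader:
    # wavs: (B, T_max) zero-padded; lens: per-item lengths before padding
    ...
```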
utils/mel.py ADDED
@@ -0,0 +1,280 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ from librosa.filters import mel as librosa_mel_fn
8
+
9
+
10
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
11
+ # Min value: ln(1e-5) = -11.5129
12
+ return torch.log(torch.clamp(x, min=clip_val) * C)
13
+
14
+
15
+ def spectral_normalize_torch(magnitudes):
16
+ output = dynamic_range_compression_torch(magnitudes)
17
+ return output
18
+
19
+
20
+ def extract_linear_features(y, cfg, center=False):
21
+ if torch.min(y) < -1.0:
22
+ print("min value is ", torch.min(y))
23
+ if torch.max(y) > 1.0:
24
+ print("max value is ", torch.max(y))
25
+
26
+ global hann_window
27
+ hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)
28
+
29
+ y = torch.nn.functional.pad(
30
+ y.unsqueeze(1),
31
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
32
+ mode="reflect",
33
+ )
34
+ y = y.squeeze(1)
35
+
36
+ # complex tensor as default, then use view_as_real for future pytorch compatibility
37
+ spec = torch.stft(
38
+ y,
39
+ cfg.n_fft,
40
+ hop_length=cfg.hop_size,
41
+ win_length=cfg.win_size,
42
+ window=hann_window[str(y.device)],
43
+ center=center,
44
+ pad_mode="reflect",
45
+ normalized=False,
46
+ onesided=True,
47
+ return_complex=True,
48
+ )
49
+ spec = torch.view_as_real(spec)
50
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
51
+ spec = torch.squeeze(spec, 0)
52
+ return spec
53
+
54
+
55
+ def mel_spectrogram_torch(y, cfg, center=False):
56
+ """
57
+ TODO: to merge this funtion with the extract_mel_features below
58
+ """
59
+ if torch.min(y) < -1.0:
60
+ print("min value is ", torch.min(y))
61
+ if torch.max(y) > 1.0:
62
+ print("max value is ", torch.max(y))
63
+
64
+ global mel_basis, hann_window
65
+ if str(cfg.fmax) + "_" + str(y.device) not in mel_basis:
66
+ mel = librosa_mel_fn(
67
+ sr=cfg.sample_rate,
68
+ n_fft=cfg.n_fft,
69
+ n_mels=cfg.n_mel,
70
+ fmin=cfg.fmin,
71
+ fmax=cfg.fmax,
72
+ )
73
+ mel_basis[str(cfg.fmax) + "_" + str(y.device)] = (
74
+ torch.from_numpy(mel).float().to(y.device)
75
+ )
76
+ hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)
77
+
78
+ y = torch.nn.functional.pad(
79
+ y.unsqueeze(1),
80
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
81
+ mode="reflect",
82
+ )
83
+ y = y.squeeze(1)
84
+
85
+ spec = torch.stft(
86
+ y,
87
+ cfg.n_fft,
88
+ hop_length=cfg.hop_size,
89
+ win_length=cfg.win_size,
90
+ window=hann_window[str(y.device)],
91
+ center=center,
92
+ pad_mode="reflect",
93
+ normalized=False,
94
+ onesided=True,
95
+ return_complex=True,
96
+ )
97
+
98
+ spec = torch.view_as_real(spec)
99
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
100
+
101
+ spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
102
+ spec = spectral_normalize_torch(spec)
103
+
104
+ return spec
105
+
106
+
107
+ mel_basis = {}
108
+ hann_window = {}
109
+
110
+
111
+ def extract_mel_features(
112
+ y,
113
+ cfg,
114
+ center=False,
115
+ ):
116
+ """Extract mel features
117
+
118
+ Args:
119
+ y (tensor): audio data in tensor
120
+ cfg (dict): configuration in cfg.preprocess
121
+ center (bool, optional): In STFT, whether t-th frame is centered at time t*hop_length. Defaults to False.
122
+
123
+ Returns:
124
+ tensor: a tensor containing the mel feature calculated based on STFT result
125
+ """
126
+ if torch.min(y) < -1.0:
127
+ print("min value is ", torch.min(y))
128
+ if torch.max(y) > 1.0:
129
+ print("max value is ", torch.max(y))
130
+
131
+ global mel_basis, hann_window
132
+ if str(cfg.fmax) + "_" + str(y.device) not in mel_basis:
133
+ mel = librosa_mel_fn(
134
+ sr=cfg.sample_rate,
135
+ n_fft=cfg.n_fft,
136
+ n_mels=cfg.n_mel,
137
+ fmin=cfg.fmin,
138
+ fmax=cfg.fmax,
139
+ )
140
+ mel_basis[str(cfg.fmax) + "_" + str(y.device)] = (
141
+ torch.from_numpy(mel).float().to(y.device)
142
+ )
143
+ hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)
144
+
145
+ y = torch.nn.functional.pad(
146
+ y.unsqueeze(1),
147
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
148
+ mode="reflect",
149
+ )
150
+ y = y.squeeze(1)
151
+
152
+ # complex tensor as default, then use view_as_real for future pytorch compatibility
153
+ spec = torch.stft(
154
+ y,
155
+ cfg.n_fft,
156
+ hop_length=cfg.hop_size,
157
+ win_length=cfg.win_size,
158
+ window=hann_window[str(y.device)],
159
+ center=center,
160
+ pad_mode="reflect",
161
+ normalized=False,
162
+ onesided=True,
163
+ return_complex=True,
164
+ )
165
+ spec = torch.view_as_real(spec)
166
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
167
+
168
+ spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
169
+ spec = spectral_normalize_torch(spec)
170
+ return spec.squeeze(0)
171
+
172
+
173
+ def extract_mel_features_tts(
174
+ y,
175
+ cfg,
176
+ center=False,
177
+ taco=False,
178
+ _stft=None,
179
+ ):
180
+ """Extract mel features
181
+
182
+ Args:
183
+ y (tensor): audio data in tensor
184
+ cfg (dict): configuration in cfg.preprocess
185
+ center (bool, optional): In STFT, whether t-th frame is centered at time t*hop_length. Defaults to False.
186
+ taco: use tacotron mel
187
+
188
+ Returns:
189
+ tensor: a tensor containing the mel feature calculated based on STFT result
190
+ """
191
+ if not taco:
192
+ if torch.min(y) < -1.0:
193
+ print("min value is ", torch.min(y))
194
+ if torch.max(y) > 1.0:
195
+ print("max value is ", torch.max(y))
196
+
197
+ global mel_basis, hann_window
198
+ if str(cfg.fmax) + "_" + str(y.device) not in mel_basis:
199
+ mel = librosa_mel_fn(
200
+ sr=cfg.sample_rate,
201
+ n_fft=cfg.n_fft,
202
+ n_mels=cfg.n_mel,
203
+ fmin=cfg.fmin,
204
+ fmax=cfg.fmax,
205
+ )
206
+ mel_basis[str(cfg.fmax) + "_" + str(y.device)] = (
207
+ torch.from_numpy(mel).float().to(y.device)
208
+ )
209
+ hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)
210
+
211
+ y = torch.nn.functional.pad(
212
+ y.unsqueeze(1),
213
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
214
+ mode="reflect",
215
+ )
216
+ y = y.squeeze(1)
217
+
218
+ # complex tensor as default, then use view_as_real for future pytorch compatibility
219
+ spec = torch.stft(
220
+ y,
221
+ cfg.n_fft,
222
+ hop_length=cfg.hop_size,
223
+ win_length=cfg.win_size,
224
+ window=hann_window[str(y.device)],
225
+ center=center,
226
+ pad_mode="reflect",
227
+ normalized=False,
228
+ onesided=True,
229
+ return_complex=True,
230
+ )
231
+ spec = torch.view_as_real(spec)
232
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
233
+
234
+ spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
235
+ spec = spectral_normalize_torch(spec)
236
+ else:
237
+ audio = torch.clip(y, -1, 1)
238
+ audio = torch.autograd.Variable(audio, requires_grad=False)
239
+ spec, energy = _stft.mel_spectrogram(audio)
240
+
241
+ return spec.squeeze(0)
242
+
243
+
244
+ def amplitude_phase_spectrum(y, cfg):
245
+ hann_window = torch.hann_window(cfg.win_size).to(y.device)
246
+
247
+ y = torch.nn.functional.pad(
248
+ y.unsqueeze(1),
249
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
250
+ mode="reflect",
251
+ )
252
+ y = y.squeeze(1)
253
+
254
+ stft_spec = torch.stft(
255
+ y,
256
+ cfg.n_fft,
257
+ hop_length=cfg.hop_size,
258
+ win_length=cfg.win_size,
259
+ window=hann_window,
260
+ center=False,
261
+ return_complex=True,
262
+ )
263
+
264
+ stft_spec = torch.view_as_real(stft_spec)
265
+ if stft_spec.size()[0] == 1:
266
+ stft_spec = stft_spec.squeeze(0)
267
+
268
+ if len(list(stft_spec.size())) == 4:
269
+ rea = stft_spec[:, :, :, 0] # [batch_size, n_fft//2+1, frames]
270
+ imag = stft_spec[:, :, :, 1] # [batch_size, n_fft//2+1, frames]
271
+ else:
272
+ rea = stft_spec[:, :, 0] # [n_fft//2+1, frames]
273
+ imag = stft_spec[:, :, 1] # [n_fft//2+1, frames]
274
+
275
+ log_amplitude = torch.log(
276
+ torch.abs(torch.sqrt(torch.pow(rea, 2) + torch.pow(imag, 2))) + 1e-5
277
+ ) # [n_fft//2+1, frames]
278
+ phase = torch.atan2(imag, rea) # [n_fft//2+1, frames]
279
+
280
+ return log_amplitude, phase, rea, imag
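A minimal sketch of calling `extract_mel_features`; the config values are typical 24 kHz settings, not pulled from this file:

```python
import torch
from types import SimpleNamespace

# Field names match what extract_mel_features reads; values are illustrative.
cfg = SimpleNamespace(
    sample_rate=24000, n_fft=1024, n_mel=100,
    fmin=0, fmax=12000, hop_size=256, win_size=1024,
)
y = torch.randn(1, 24000).clamp(-1.0, 1.0)  # one second of audio in [-1, 1]
mel = extract_mel_features(y, cfg)          # (n_mel, frames)
print(mel.shape)
```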
utils/mert.py ADDED
@@ -0,0 +1,139 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from https://huggingface.co/m-a-p/MERT-v1-330M
7
+
8
+ import torch
9
+ from tqdm import tqdm
10
+ import numpy as np
11
+
12
+ from transformers import Wav2Vec2FeatureExtractor
13
+ from transformers import AutoModel
14
+ import torchaudio
15
+ import torchaudio.transforms as T
16
+ from sklearn.preprocessing import StandardScaler
17
+
18
+
19
+ def mert_encoder(model, processor, audio_path, hps):
20
+ """
21
+ # mert default sr: 24000
22
+ """
23
+ with torch.no_grad():
24
+ resample_rate = processor.sampling_rate
25
+ device = next(model.parameters()).device
26
+
27
+ input_audio, sampling_rate = torchaudio.load(audio_path)
28
+ input_audio = input_audio.squeeze()
29
+
30
+ if sampling_rate != resample_rate:
31
+ resampler = T.Resample(sampling_rate, resample_rate)
32
+ input_audio = resampler(input_audio)
33
+
34
+ inputs = processor(
35
+ input_audio, sampling_rate=resample_rate, return_tensors="pt"
36
+ ).to(
37
+ device
38
+ ) # {input_values: tensor, attention_mask: tensor}
39
+
40
+ outputs = model(**inputs, output_hidden_states=True) # list: len is 25
41
+
42
+ # [25 layer, Time steps, 1024 feature_dim]
43
+ # all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
44
+ # mert_features.append(all_layer_hidden_states)
45
+
46
+ feature = outputs.hidden_states[
47
+ hps.mert_feature_layer
48
+ ].squeeze() # [1, frame len, 1024] -> [frame len, 1024]
49
+
50
+ return feature.cpu().detach().numpy()
51
+
52
+
53
+ def mert_features_normalization(raw_mert_features):
54
+ normalized_mert_features = list()
55
+
56
+ mert_features = np.array(raw_mert_features)
57
+ scaler = StandardScaler().fit(mert_features)
58
+ for raw_mert_feature in raw_mert_features:
59
+ normalized_mert_feature = scaler.transform(raw_mert_feature)
60
+ normalized_mert_features.append(normalized_mert_feature)
61
+ return normalized_mert_features
62
+
63
+
64
+ def get_mapped_mert_features(raw_mert_features, mapping_features, fast_mapping=True):
65
+ source_hop = 320
66
+ target_hop = 256
67
+
68
+ factor = np.gcd(source_hop, target_hop)
69
+ source_hop //= factor
70
+ target_hop //= factor
71
+ print(
72
+ "Mapping source's {} frames => target's {} frames".format(
73
+ target_hop, source_hop
74
+ )
75
+ )
76
+
77
+ mert_features = []
78
+ for index, mapping_feat in enumerate(tqdm(mapping_features)):
79
+ # mapping_feat: (mels_frame_len, n_mels)
80
+ target_len = mapping_feat.shape[0]
81
+
82
+ # (frame_len, 1024)
83
+ raw_feats = raw_mert_features[index].cpu().numpy()
84
+ source_len, width = raw_feats.shape
85
+
86
+ # const ~= target_len * target_hop
87
+ const = source_len * source_hop // target_hop * target_hop
88
+
89
+ # (source_len * source_hop, dim)
90
+ up_sampling_feats = np.repeat(raw_feats, source_hop, axis=0)
91
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
92
+ down_sampling_feats = np.average(
93
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
94
+ )
95
+
96
+ err = abs(target_len - len(down_sampling_feats))
97
+ if err > 3:
98
+ print("index:", index)
99
+ print("mels:", mapping_feat.shape)
100
+ print("raw mert vector:", raw_feats.shape)
101
+ print("up_sampling:", up_sampling_feats.shape)
102
+ print("const:", const)
103
+ print("down_sampling_feats:", down_sampling_feats.shape)
104
+ exit()
105
+ if len(down_sampling_feats) < target_len:
106
+ # (1, dim) -> (err, dim)
107
+ end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
108
+ down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)
109
+
110
+ # (target_len, dim)
111
+ feats = down_sampling_feats[:target_len]
112
+ mert_features.append(feats)
113
+
114
+ return mert_features
115
+
116
+
117
+ def load_mert_model(hps):
118
+ print("Loading MERT Model: ", hps.mert_model)
119
+
120
+ # Load model
121
+ model_name = hps.mert_model
122
+ model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
123
+
124
+ if torch.cuda.is_available():
125
+ model = model.cuda()
126
+
127
+ # model = model.eval()
128
+
129
+ preprocessor = Wav2Vec2FeatureExtractor.from_pretrained(
130
+ model_name, trust_remote_code=True
131
+ )
132
+ return model, preprocessor
133
+
134
+
135
+ # loading the corresponding preprocessor config
136
+ # def load_preprocessor (model_name="m-a-p/MERT-v1-330M"):
137
+ # print('load_preprocessor...')
138
+ # preprocessor = Wav2Vec2FeatureExtractor.from_pretrained(model_name,trust_remote_code=True)
139
+ # return preprocessor
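A usage sketch for the MERT helpers above, assuming an `hps` carrying the `mert_model` and `mert_feature_layer` fields the functions read (the checkpoint name comes from the comment at the top of the file; the wav path is a placeholder):

```python
from types import SimpleNamespace

hps = SimpleNamespace(mert_model="m-a-p/MERT-v1-330M", mert_feature_layer=-1)

model, processor = load_mert_model(hps)
feature = mert_encoder(model, processor, "example.wav", hps)  # (frame_len, 1024)
```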
utils/mfa_prepare.py ADDED
@@ -0,0 +1,116 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://montreal-forced-aligner.readthedocs.io/en/latest/user_guide/performance.html"""
7
+
8
+ import os
9
+ import subprocess
10
+ from multiprocessing import Pool
11
+ from tqdm import tqdm
12
+ import torchaudio
13
+ from pathlib import Path
14
+
15
+
16
+ def remove_empty_dirs(path):
17
+ """remove empty directories in a given path"""
18
+ # Check if the given path is a directory
19
+ if not os.path.isdir(path):
20
+ print(f"{path} is not a directory")
21
+ return
22
+
23
+ # Walk through all directories and subdirectories
24
+ for root, dirs, _ in os.walk(path, topdown=False):
25
+ for dir in dirs:
26
+ dir_path = os.path.join(root, dir)
27
+ # Check if the directory is empty
28
+ if not os.listdir(dir_path):
29
+ os.rmdir(dir_path) # "Removed empty directory
30
+
31
+
32
+ def process_single_wav_file(task):
33
+ """process a single wav file"""
34
+ wav_file, output_dir = task
35
+ speaker_id, book_name, filename = Path(wav_file).parts[-3:]
36
+
37
+ output_book_dir = Path(output_dir, speaker_id)
38
+ output_book_dir.mkdir(parents=True, exist_ok=True)
39
+ new_filename = f"{speaker_id}_{book_name}_{filename}"
40
+
41
+ new_wav_file = Path(output_book_dir, new_filename)
42
+ command = [
43
+ "ffmpeg",
44
+ "-nostdin",
45
+ "-hide_banner",
46
+ "-loglevel",
47
+ "error",
48
+ "-nostats",
49
+ "-i",
50
+ wav_file,
51
+ "-acodec",
52
+ "pcm_s16le",
53
+ "-ar",
54
+ "16000",
55
+ new_wav_file,
56
+ ]
57
+ subprocess.check_call(
58
+ command
59
+ ) # Run the command to convert the file to 16kHz and 16-bit PCM
60
+ os.remove(wav_file)
61
+
62
+
63
+ def process_wav_files(wav_files, output_dir, n_process):
64
+ """process wav files in parallel"""
65
+ tasks = [(wav_file, output_dir) for wav_file in wav_files]
66
+ print(f"Processing {len(tasks)} files")
67
+ with Pool(processes=n_process) as pool:
68
+ for _ in tqdm(
69
+ pool.imap_unordered(process_single_wav_file, tasks), total=len(tasks)
70
+ ):
71
+ pass
72
+ print("Removing empty directories...")
73
+ remove_empty_dirs(output_dir)
74
+ print("Done!")
75
+
76
+
77
+ def get_wav_files(dataset_path):
78
+ """get all wav files in the dataset"""
79
+ wav_files = []
80
+ for speaker_id in os.listdir(dataset_path):
81
+ speaker_dir = os.path.join(dataset_path, speaker_id)
82
+ if not os.path.isdir(speaker_dir):
83
+ continue
84
+ for book_name in os.listdir(speaker_dir):
85
+ book_dir = os.path.join(speaker_dir, book_name)
86
+ if not os.path.isdir(book_dir):
87
+ continue
88
+ for file in os.listdir(book_dir):
89
+ if file.endswith(".wav"):
90
+ wav_files.append(os.path.join(book_dir, file))
91
+ print("Found {} wav files".format(len(wav_files)))
92
+ return wav_files
93
+
94
+
95
+ def filter_wav_files_by_length(wav_files, max_len_sec=15):
96
+ """filter wav files by length"""
97
+ print("original wav files: {}".format(len(wav_files)))
98
+ filtered_wav_files = []
99
+ for audio_file in wav_files:
100
+ metadata = torchaudio.info(str(audio_file))
101
+ audio_length = metadata.num_frames / metadata.sample_rate
102
+ if audio_length <= max_len_sec:
103
+ filtered_wav_files.append(audio_file)
104
+ else:
105
+ os.remove(audio_file)
106
+ print("filtered wav files: {}".format(len(filtered_wav_files)))
107
+ return filtered_wav_files
108
+
109
+
110
+ if __name__ == "__main__":
111
+ dataset_path = "/path/to/output/directory"
112
+ n_process = 16
113
+ max_len_sec = 15
114
+ wav_files = get_wav_files(dataset_path)
115
+ filtered_wav_files = filter_wav_files_by_length(wav_files, max_len_sec)
116
+ process_wav_files(filtered_wav_files, dataset_path, n_process)
utils/model_summary.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import humanfriendly
7
+ import numpy as np
8
+ import torch
9
+
10
+
11
+ def get_human_readable_count(number: int) -> str:
12
+ """Return human_readable_count
13
+
14
+ Originated from:
15
+ https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pytorch_lightning/core/memory.py
16
+
17
+ Abbreviates an integer number with K, M, B, T for thousands, millions,
18
+ billions and trillions, respectively.
19
+ Examples:
20
+ >>> get_human_readable_count(123)
21
+ '123 '
22
+ >>> get_human_readable_count(1234) # (one thousand)
23
+ '1 K'
24
+ >>> get_human_readable_count(2e6) # (two million)
25
+ '2 M'
26
+ >>> get_human_readable_count(3e9) # (three billion)
27
+ '3 B'
28
+ >>> get_human_readable_count(4e12) # (four trillion)
29
+ '4 T'
30
+ >>> get_human_readable_count(5e15) # (more than trillion)
31
+ '5,000 T'
32
+ Args:
33
+ number: a positive integer number
34
+ Return:
35
+ A string formatted according to the pattern described above.
36
+ """
37
+ assert number >= 0
38
+ labels = [" ", "K", "M", "B", "T"]
39
+ num_digits = int(np.floor(np.log10(number)) + 1 if number > 0 else 1)
40
+ num_groups = int(np.ceil(num_digits / 3))
41
+ num_groups = min(num_groups, len(labels))
42
+ shift = -3 * (num_groups - 1)
43
+ number = number * (10**shift)
44
+ index = num_groups - 1
45
+ return f"{number:.2f} {labels[index]}"
46
+
47
+
48
+ def to_bytes(dtype) -> int:
49
+ return int(str(dtype)[-2:]) // 8
50
+
51
+
52
+ def model_summary(model: torch.nn.Module) -> str:
53
+ message = "Model structure:\n"
54
+ message += str(model)
55
+ tot_params = sum(p.numel() for p in model.parameters())
56
+ num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
57
+ percent_trainable = "{:.1f}".format(num_params * 100.0 / tot_params)
58
+ tot_params = get_human_readable_count(tot_params)
59
+ num_params = get_human_readable_count(num_params)
60
+ message += "\n\nModel summary:\n"
61
+ message += f" Class Name: {model.__class__.__name__}\n"
62
+ message += f" Total Number of model parameters: {tot_params}\n"
63
+ message += (
64
+ f" Number of trainable parameters: {num_params} ({percent_trainable}%)\n"
65
+ )
66
+ num_bytes = humanfriendly.format_size(
67
+ sum(
68
+ p.numel() * to_bytes(p.dtype) for p in model.parameters() if p.requires_grad
69
+ )
70
+ )
71
+ message += f" Size: {num_bytes}\n"
72
+ dtype = next(iter(model.parameters())).dtype
73
+ message += f" Type: {dtype}"
74
+ return message
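For example, summarizing a toy module with the helpers above:

```python
import torch

net = torch.nn.Sequential(
    torch.nn.Linear(80, 256), torch.nn.ReLU(), torch.nn.Linear(256, 80)
)
print(model_summary(net))
# Prints the structure plus class name, total/trainable parameter counts
# (e.g. "41.30 K"), size in bytes, and parameter dtype.
```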
utils/prompt_preparer.py ADDED
@@ -0,0 +1,68 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+
9
+ class PromptPreparer:
10
+ def prepare_prompts(self, y, y_lens, codes, nar_stage, y_prompts_codes):
11
+ if self.prefix_mode == 0:
12
+ y_emb, prefix_len = self._handle_prefix_mode_0(y, codes, nar_stage)
13
+ elif self.prefix_mode == 1:
14
+ y_emb, prefix_len = self._handle_prefix_mode_1(y, y_lens, codes, nar_stage)
15
+ elif self.prefix_mode in [2, 4]:
16
+ y_emb, prefix_len = self._handle_prefix_mode_2_4(
17
+ y, y_lens, codes, nar_stage, y_prompts_codes
18
+ )
19
+ else:
20
+ raise ValueError("Invalid prefix mode")
21
+
22
+ return y_emb, prefix_len
23
+
24
+ def _handle_prefix_mode_0(self, y, codes, nar_stage):
25
+ prefix_len = 0
26
+ y_emb = self.nar_audio_embeddings[0](y)
27
+ for j in range(1, nar_stage):
28
+ y_emb = y_emb + self.nar_audio_embeddings[j](codes[..., j])
29
+ return y_emb, 0
30
+
31
+ def _handle_prefix_mode_1(self, y, y_lens, codes, nar_stage):
32
+ int_low = (0.25 * y_lens.min()).type(torch.int64).item()
33
+ prefix_len = torch.randint(int_low, int_low * 2, size=()).item()
34
+ prefix_len = min(prefix_len, 225)
35
+
36
+ y_prompts = self.nar_audio_embeddings[0](y[:, :prefix_len])
37
+ y_emb = self.nar_audio_embeddings[0](y[:, prefix_len:])
38
+ for j in range(1, self.num_quantizers):
39
+ y_prompts += self.nar_audio_embeddings[j](codes[:, :prefix_len, j])
40
+ if j < nar_stage:
41
+ y_emb += self.nar_audio_embeddings[j](codes[:, prefix_len:, j])
42
+ y_emb = torch.concat([y_prompts, y_emb], axis=1)
43
+ return y_emb, prefix_len
44
+
45
+ def _handle_prefix_mode_2_4(self, y, y_lens, codes, nar_stage, y_prompts_codes):
46
+ if self.prefix_mode == 2:
47
+ prefix_len = min(225, int(0.25 * y_lens.min().item()))
48
+
49
+ y_prompts_codes = []
50
+ for b in range(codes.shape[0]):
51
+ start = self.rng.randint(0, y_lens[b].item() - prefix_len)
52
+ y_prompts_codes.append(
53
+ torch.clone(codes[b, start : start + prefix_len])
54
+ )
55
+ codes[b, start : start + prefix_len, nar_stage] = self.audio_token_num
56
+ y_prompts_codes = torch.stack(y_prompts_codes, dim=0)
57
+ else:
58
+ prefix_len = y_prompts_codes.shape[1]
59
+
60
+ y_prompts = self.nar_audio_embeddings[0](y_prompts_codes[..., 0])
61
+ y_emb = self.nar_audio_embeddings[0](y)
62
+ for j in range(1, self.num_quantizers):
63
+ y_prompts += self.nar_audio_embeddings[j](y_prompts_codes[..., j])
64
+ if j < nar_stage:
65
+ y_emb += self.nar_audio_embeddings[j](codes[..., j])
66
+ y_emb = torch.concat([y_prompts, y_emb], axis=1)
67
+
68
+ return y_emb, prefix_len
utils/ssim.py ADDED
@@ -0,0 +1,80 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from https://github.com/Po-Hsun-Su/pytorch-ssim
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from torch.autograd import Variable
11
+ from math import exp
12
+
13
+
14
+ def gaussian(window_size, sigma):
15
+ gauss = torch.Tensor(
16
+ [
17
+ exp(-((x - window_size // 2) ** 2) / float(2 * sigma**2))
18
+ for x in range(window_size)
19
+ ]
20
+ )
21
+ return gauss / gauss.sum()
22
+
23
+
24
+ def create_window(window_size, channel):
25
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
26
+ _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
27
+ window = Variable(
28
+ _2D_window.expand(channel, 1, window_size, window_size).contiguous()
29
+ )
30
+ return window
31
+
32
+
33
+ def _ssim(img1, img2, window, window_size, channel, size_average=True):
34
+ mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
35
+ mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
36
+
37
+ mu1_sq = mu1.pow(2)
38
+ mu2_sq = mu2.pow(2)
39
+ mu1_mu2 = mu1 * mu2
40
+
41
+ sigma1_sq = (
42
+ F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
43
+ )
44
+ sigma2_sq = (
45
+ F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
46
+ )
47
+ sigma12 = (
48
+ F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel)
49
+ - mu1_mu2
50
+ )
51
+
52
+ C1 = 0.01**2
53
+ C2 = 0.03**2
54
+
55
+ ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / (
56
+ (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
57
+ )
58
+
59
+ if size_average:
60
+ return ssim_map.mean()
61
+ else:
62
+ return ssim_map.mean(1)
63
+
64
+
65
+ class SSIM(torch.nn.Module):
66
+ def __init__(self, window_size=11, size_average=True):
67
+ super(SSIM, self).__init__()
68
+ self.window_size = window_size
69
+ self.size_average = size_average
70
+ self.channel = 1
71
+ self.window = create_window(window_size, self.channel)
72
+
73
+ def forward(self, fake, real, bias=6.0):
74
+ fake = fake[:, None, :, :] + bias # [B, 1, T, n_mels]
75
+ real = real[:, None, :, :] + bias # [B, 1, T, n_mels]
76
+ self.window = self.window.to(dtype=fake.dtype, device=fake.device)
77
+ loss = 1 - _ssim(
78
+ fake, real, self.window, self.window_size, self.channel, self.size_average
79
+ )
80
+ return loss
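A usage sketch for the SSIM loss on mel-shaped tensors `[B, T, n_mels]` (random data, for shape illustration only):

```python
import torch

fake = torch.randn(4, 100, 80)  # [B, T, n_mels]
real = torch.randn(4, 100, 80)
criterion = SSIM(window_size=11)
loss = criterion(fake, real)    # scalar: 1 - mean SSIM of the biased inputs
print(loss.item())
```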
utils/stft.py ADDED
@@ -0,0 +1,278 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ from scipy.signal import get_window
10
+ from librosa.util import pad_center, tiny
11
+ from librosa.filters import mel as librosa_mel_fn
12
+
13
+ import torch
14
+ import numpy as np
15
+ import librosa.util as librosa_util
16
+ from scipy.signal import get_window
17
+
18
+
19
+ def window_sumsquare(
20
+ window,
21
+ n_frames,
22
+ hop_length,
23
+ win_length,
24
+ n_fft,
25
+ dtype=np.float32,
26
+ norm=None,
27
+ ):
28
+ """
29
+ # from librosa 0.6
30
+ Compute the sum-square envelope of a window function at a given hop length.
31
+
32
+ This is used to estimate modulation effects induced by windowing
33
+ observations in short-time fourier transforms.
34
+
35
+ Parameters
36
+ ----------
37
+ window : string, tuple, number, callable, or list-like
38
+ Window specification, as in `get_window`
39
+
40
+ n_frames : int > 0
41
+ The number of analysis frames
42
+
43
+ hop_length : int > 0
44
+ The number of samples to advance between frames
45
+
46
+ win_length : [optional]
47
+ The length of the window function. By default, this matches `n_fft`.
48
+
49
+ n_fft : int > 0
50
+ The length of each analysis frame.
51
+
52
+ dtype : np.dtype
53
+ The data type of the output
54
+
55
+ Returns
56
+ -------
57
+ wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
58
+ The sum-squared envelope of the window function
59
+ """
60
+ if win_length is None:
61
+ win_length = n_fft
62
+
63
+ n = n_fft + hop_length * (n_frames - 1)
64
+ x = np.zeros(n, dtype=dtype)
65
+
66
+ # Compute the squared window at the desired length
67
+ win_sq = get_window(window, win_length, fftbins=True)
68
+ win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
69
+ win_sq = librosa_util.pad_center(win_sq, n_fft)
70
+
71
+ # Fill the envelope
72
+ for i in range(n_frames):
73
+ sample = i * hop_length
74
+ x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
75
+ return x
76
+
77
+
78
+ def griffin_lim(magnitudes, stft_fn, n_iters=30):
79
+ """
80
+ PARAMS
81
+ ------
82
+ magnitudes: spectrogram magnitudes
83
+ stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
84
+ """
85
+
86
+ angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
87
+ angles = angles.astype(np.float32)
88
+ angles = torch.autograd.Variable(torch.from_numpy(angles))
89
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
90
+
91
+ for i in range(n_iters):
92
+ _, angles = stft_fn.transform(signal)
93
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
94
+ return signal
95
+
96
+
97
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
98
+ """
99
+ PARAMS
100
+ ------
101
+ C: compression factor
102
+ """
103
+ return torch.log(torch.clamp(x, min=clip_val) * C)
104
+
105
+
106
+ def dynamic_range_decompression(x, C=1):
107
+ """
108
+ PARAMS
109
+ ------
110
+ C: compression factor used to compress
111
+ """
112
+ return torch.exp(x) / C
113
+
114
+
115
+ class STFT(torch.nn.Module):
116
+ """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
117
+
118
+ def __init__(self, filter_length, hop_length, win_length, window="hann"):
119
+ super(STFT, self).__init__()
120
+ self.filter_length = filter_length
121
+ self.hop_length = hop_length
122
+ self.win_length = win_length
123
+ self.window = window
124
+ self.forward_transform = None
125
+ scale = self.filter_length / self.hop_length
126
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
127
+
128
+ cutoff = int((self.filter_length / 2 + 1))
129
+ fourier_basis = np.vstack(
130
+ [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
131
+ )
132
+
133
+ forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
134
+ inverse_basis = torch.FloatTensor(
135
+ np.linalg.pinv(scale * fourier_basis).T[:, None, :]
136
+ )
137
+
138
+ if window is not None:
139
+ assert filter_length >= win_length
140
+ # get window and zero center pad it to filter_length
141
+ fft_window = get_window(window, win_length, fftbins=True)
142
+ fft_window = pad_center(fft_window, size=filter_length)  # keyword arg for librosa >= 0.10 compatibility
143
+ fft_window = torch.from_numpy(fft_window).float()
144
+
145
+ # window the bases
146
+ forward_basis *= fft_window
147
+ inverse_basis *= fft_window
148
+
149
+ self.register_buffer("forward_basis", forward_basis.float())
150
+ self.register_buffer("inverse_basis", inverse_basis.float())
151
+
152
+ def transform(self, input_data):
153
+ num_batches = input_data.size(0)
154
+ num_samples = input_data.size(1)
155
+
156
+ self.num_samples = num_samples
157
+
158
+ # similar to librosa, reflect-pad the input
159
+ input_data = input_data.view(num_batches, 1, num_samples)
160
+ input_data = F.pad(
161
+ input_data.unsqueeze(1),
162
+ (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
163
+ mode="reflect",
164
+ )
165
+ input_data = input_data.squeeze(1)
166
+
167
+ forward_transform = F.conv1d(
168
+ input_data,
169
+ self.forward_basis,  # registered buffer, so it already lives on the module's device
170
+ stride=self.hop_length,
171
+ padding=0,
172
+ )
173
+
174
+ cutoff = int((self.filter_length / 2) + 1)
175
+ real_part = forward_transform[:, :cutoff, :]
176
+ imag_part = forward_transform[:, cutoff:, :]
177
+
178
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
179
+ phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
180
+
181
+ return magnitude, phase
182
+
183
+ def inverse(self, magnitude, phase):
184
+ recombine_magnitude_phase = torch.cat(
185
+ [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
186
+ )
187
+
188
+ inverse_transform = F.conv_transpose1d(
189
+ recombine_magnitude_phase,
190
+ self.inverse_basis,  # registered buffer; no Variable wrapper needed
191
+ stride=self.hop_length,
192
+ padding=0,
193
+ )
194
+
195
+ if self.window is not None:
196
+ window_sum = window_sumsquare(
197
+ self.window,
198
+ magnitude.size(-1),
199
+ hop_length=self.hop_length,
200
+ win_length=self.win_length,
201
+ n_fft=self.filter_length,
202
+ dtype=np.float32,
203
+ )
204
+ # remove modulation effects
205
+ approx_nonzero_indices = torch.from_numpy(
206
+ np.where(window_sum > tiny(window_sum))[0]
207
+ )
208
+ window_sum = torch.autograd.Variable(
209
+ torch.from_numpy(window_sum), requires_grad=False
210
+ )
211
+ window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
212
+ inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
213
+ approx_nonzero_indices
214
+ ]
215
+
216
+ # scale by hop ratio
217
+ inverse_transform *= float(self.filter_length) / self.hop_length
218
+
219
+ inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
220
+ inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2)]
221
+
222
+ return inverse_transform
223
+
224
+ def forward(self, input_data):
225
+ self.magnitude, self.phase = self.transform(input_data)
226
+ reconstruction = self.inverse(self.magnitude, self.phase)
227
+ return reconstruction
228
+
229
+
230
+ class TacotronSTFT(torch.nn.Module):
231
+ def __init__(
232
+ self,
233
+ filter_length,
234
+ hop_length,
235
+ win_length,
236
+ n_mel_channels,
237
+ sampling_rate,
238
+ mel_fmin,
239
+ mel_fmax,
240
+ ):
241
+ super(TacotronSTFT, self).__init__()
242
+ self.n_mel_channels = n_mel_channels
243
+ self.sampling_rate = sampling_rate
244
+ self.stft_fn = STFT(filter_length, hop_length, win_length)
245
+ mel_basis = librosa_mel_fn(
246
+ sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax
247
+ )
248
+ mel_basis = torch.from_numpy(mel_basis).float()
249
+ self.register_buffer("mel_basis", mel_basis)
250
+
251
+ def spectral_normalize(self, magnitudes):
252
+ output = dynamic_range_compression(magnitudes)
253
+ return output
254
+
255
+ def spectral_de_normalize(self, magnitudes):
256
+ output = dynamic_range_decompression(magnitudes)
257
+ return output
258
+
259
+ def mel_spectrogram(self, y):
260
+ """Computes mel-spectrograms from a batch of waves
261
+ PARAMS
262
+ ------
263
+ y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
264
+
265
+ RETURNS
266
+ -------
267
+ mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
268
+ """
269
+ assert torch.min(y.data) >= -1
270
+ assert torch.max(y.data) <= 1
271
+
272
+ magnitudes, phases = self.stft_fn.transform(y)
273
+ magnitudes = magnitudes.data
274
+ mel_output = torch.matmul(self.mel_basis, magnitudes)
275
+ mel_output = self.spectral_normalize(mel_output)
276
+ energy = torch.norm(magnitudes, dim=1)
277
+
278
+ return mel_output, energy
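
For orientation, a minimal sketch of how the two classes above are typically driven; the frame parameters below are illustrative, not values fixed by this file:

```python
import torch

taco_stft = TacotronSTFT(
    filter_length=1024, hop_length=256, win_length=1024,
    n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0,
)

wav = torch.rand(1, 22050) * 2 - 1            # (B, T) waveform in [-1, 1]
mel, energy = taco_stft.mel_spectrogram(wav)  # mel: (B, 80, T'), energy: (B, T')

# Phase-less reconstruction from magnitudes via Griffin-Lim.
magnitudes, _ = taco_stft.stft_fn.transform(wav)
recon = griffin_lim(magnitudes, taco_stft.stft_fn, n_iters=30)
```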
utils/symbol_table.py ADDED
@@ -0,0 +1,317 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from
7
+ # https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/utils/symbol_table.py
8
+
9
+ from dataclasses import dataclass
10
+ from dataclasses import field
11
+ from typing import Dict
12
+ from typing import Generic
13
+ from typing import List
14
+ from typing import Optional
15
+ from typing import TypeVar
16
+ from typing import Union
17
+
18
+ Symbol = TypeVar("Symbol")
19
+
20
+
21
+ @dataclass(repr=False)
22
+ class SymbolTable(Generic[Symbol]):
23
+ """SymbolTable that maps symbol IDs, found on the FSA arcs to
24
+ actual objects. These objects can be arbitrary Python objects
25
+ that can serve as keys in a dictionary (i.e. they need to be
26
+ hashable and immutable).
27
+
28
+ The SymbolTable can only be written to/read from disk if the
29
+ symbols are strings.
30
+ """
31
+
32
+ _id2sym: Dict[int, Symbol] = field(default_factory=dict)
33
+ """Map an integer to a symbol.
34
+ """
35
+
36
+ _sym2id: Dict[Symbol, int] = field(default_factory=dict)
37
+ """Map a symbol to an integer.
38
+ """
39
+
40
+ _next_available_id: int = 1
41
+ """A helper internal field that helps adding new symbols
42
+ to the table efficiently.
43
+ """
44
+
45
+ eps: Symbol = "<eps>"
46
+ """Null symbol, always mapped to index 0.
47
+ """
48
+
49
+ def __post_init__(self):
50
+ assert all(self._sym2id[sym] == idx for idx, sym in self._id2sym.items())
51
+ assert all(self._id2sym[idx] == sym for sym, idx in self._sym2id.items())
52
+ assert 0 not in self._id2sym or self._id2sym[0] == self.eps
53
+
54
+ self._next_available_id = max(self._id2sym, default=0) + 1
55
+ self._id2sym.setdefault(0, self.eps)
56
+ self._sym2id.setdefault(self.eps, 0)
57
+
58
+ @staticmethod
59
+ def from_str(s: str) -> "SymbolTable":
60
+ """Build a symbol table from a string.
61
+
62
+ The string consists of lines. Every line has two fields separated
63
+ by space(s), tab(s) or both. The first field is the symbol and the
64
+ second the integer id of the symbol.
65
+
66
+ Args:
67
+ s:
68
+ The input string with the format described above.
69
+ Returns:
70
+ An instance of :class:`SymbolTable`.
71
+ """
72
+ id2sym: Dict[int, str] = dict()
73
+ sym2id: Dict[str, int] = dict()
74
+
75
+ for line in s.split("\n"):
76
+ fields = line.split()
77
+ if len(fields) == 0:
78
+ continue # skip empty lines
79
+ assert (
80
+ len(fields) == 2
81
+ ), f"Expect a line with 2 fields. Given: {len(fields)}"
82
+ sym, idx = fields[0], int(fields[1])
83
+ assert sym not in sym2id, f"Duplicated symbol {sym}"
84
+ assert idx not in id2sym, f"Duplicated id {idx}"
85
+ id2sym[idx] = sym
86
+ sym2id[sym] = idx
87
+
88
+ eps = id2sym.get(0, "<eps>")
89
+
90
+ return SymbolTable(_id2sym=id2sym, _sym2id=sym2id, eps=eps)
91
+
92
+ @staticmethod
93
+ def from_file(filename: str) -> "SymbolTable":
94
+ """Build a symbol table from file.
95
+
96
+ Every line in the symbol table file has two fields separated by
97
+ space(s), tab(s) or both. The following is an example file:
98
+
99
+ .. code-block::
100
+
101
+ <eps> 0
102
+ a 1
103
+ b 2
104
+ c 3
105
+
106
+ Args:
107
+ filename:
108
+ Name of the symbol table file. Its format is documented above.
109
+
110
+ Returns:
111
+ An instance of :class:`SymbolTable`.
112
+
113
+ """
114
+ with open(filename, "r", encoding="utf-8") as f:
115
+ return SymbolTable.from_str(f.read().strip())
116
+
117
+ def to_str(self) -> str:
118
+ """
119
+ Returns:
120
+ Return a string representation of this object. You can pass
121
+ it to the method ``from_str`` to recreate an identical object.
122
+ """
123
+ s = ""
124
+ for idx, symbol in sorted(self._id2sym.items()):
125
+ s += f"{symbol} {idx}\n"
126
+ return s
127
+
128
+ def to_file(self, filename: str):
129
+ """Serialize the SymbolTable to a file.
130
+
131
+ Every line in the symbol table file has two fields separated by
132
+ space(s), tab(s) or both. The following is an example file:
133
+
134
+ .. code-block::
135
+
136
+ <eps> 0
137
+ a 1
138
+ b 2
139
+ c 3
140
+
141
+ Args:
142
+ filename:
143
+ Name of the symbol table file. Its format is documented above.
144
+ """
145
+ with open(filename, "w") as f:
146
+ for idx, symbol in sorted(self._id2sym.items()):
147
+ print(symbol, idx, file=f)
148
+
149
+ def add(self, symbol: Symbol, index: Optional[int] = None) -> int:
150
+ """Add a new symbol to the SymbolTable.
151
+
152
+ Args:
153
+ symbol:
154
+ The symbol to be added.
155
+ index:
156
+ Optional int id to which the symbol should be assigned.
157
+ If it is not available, a ValueError will be raised.
158
+
159
+ Returns:
160
+ The int id to which the symbol has been assigned.
161
+ """
162
+ # Already in the table? Return its ID.
163
+ if symbol in self._sym2id:
164
+ return self._sym2id[symbol]
165
+ # Specific ID not provided - use next available.
166
+ if index is None:
167
+ index = self._next_available_id
168
+ # Specific ID provided but not available.
169
+ if index in self._id2sym:
170
+ raise ValueError(
171
+ f"Cannot assign id '{index}' to '{symbol}' - "
172
+ f"already occupied by {self._id2sym[index]}"
173
+ )
174
+ self._sym2id[symbol] = index
175
+ self._id2sym[index] = symbol
176
+
177
+ # Update next available ID if needed
178
+ if self._next_available_id <= index:
179
+ self._next_available_id = index + 1
180
+
181
+ return index
182
+
183
+ def get(self, k: Union[int, Symbol]) -> Union[Symbol, int]:
184
+ """Get a symbol for an id or get an id for a symbol
185
+
186
+ Args:
187
+ k:
188
+ If it is an id, it tries to find the symbol corresponding
189
+ to the id; if it is a symbol, it tries to find the id
190
+ corresponding to the symbol.
191
+
192
+ Returns:
193
+ An id or a symbol depending on the given `k`.
194
+ """
195
+ if isinstance(k, int):
196
+ return self._id2sym[k]
197
+ else:
198
+ return self._sym2id[k]
199
+
200
+ def merge(self, other: "SymbolTable") -> "SymbolTable":
201
+ """Create a union of two SymbolTables.
202
+ Raises an AssertionError if the same IDs are occupied by
203
+ different symbols.
204
+
205
+ Args:
206
+ other:
207
+ A symbol table to merge with ``self``.
208
+
209
+ Returns:
210
+ A new symbol table.
211
+ """
212
+ self._check_compatible(other)
213
+ return SymbolTable(
214
+ _id2sym={**self._id2sym, **other._id2sym},
215
+ _sym2id={**self._sym2id, **other._sym2id},
216
+ eps=self.eps,
217
+ )
218
+
219
+ def _check_compatible(self, other: "SymbolTable") -> None:
220
+ # Epsilon compatibility
221
+ assert self.eps == other.eps, (
222
+ f"Mismatched epsilon symbol: " f"{self.eps} != {other.eps}"
223
+ )
224
+ # IDs compatibility
225
+ common_ids = set(self._id2sym).intersection(other._id2sym)
226
+ for idx in common_ids:
227
+ assert self[idx] == other[idx], (
228
+ f"ID conflict for id: {idx}, "
229
+ f'self[idx] = "{self[idx]}", '
230
+ f'other[idx] = "{other[idx]}"'
231
+ )
232
+ # Symbols compatibility
233
+ common_symbols = set(self._sym2id).intersection(other._sym2id)
234
+ for sym in common_symbols:
235
+ assert self[sym] == other[sym], (
236
+ f"Symbol conflict for symbol: {sym}, "
237
+ f'self[sym] = "{self[sym]}", '
238
+ f'other[sym] = "{other[sym]}"'
239
+ )
240
+
241
+ def __getitem__(self, item: Union[int, Symbol]) -> Union[Symbol, int]:
242
+ return self.get(item)
243
+
244
+ def __contains__(self, item: Union[int, Symbol]) -> bool:
245
+ if isinstance(item, int):
246
+ return item in self._id2sym
247
+ else:
248
+ return item in self._sym2id
249
+
250
+ def __len__(self) -> int:
251
+ return len(self._id2sym)
252
+
253
+ def __eq__(self, other: "SymbolTable") -> bool:
254
+ if len(self) != len(other):
255
+ return False
256
+
257
+ for s in self.symbols:
258
+ if self[s] != other[s]:
259
+ return False
260
+
261
+ return True
262
+
263
+ @property
264
+ def ids(self) -> List[int]:
265
+ """Returns a list of integer IDs corresponding to the symbols."""
266
+ ans = list(self._id2sym.keys())
267
+ ans.sort()
268
+ return ans
269
+
270
+ @property
271
+ def symbols(self) -> List[Symbol]:
272
+ """Returns a list of symbols (e.g., strings) corresponding to
273
+ the integer IDs.
274
+ """
275
+ ans = list(self._sym2id.keys())
276
+ ans.sort()
277
+ return ans
278
+
279
+
280
+ class TextToken:
281
+ def __init__(
282
+ self,
283
+ text_tokens: List[str],
284
+ add_eos: bool = True,
285
+ add_bos: bool = True,
286
+ pad_symbol: str = "<pad>",
287
+ bos_symbol: str = "<bos>",
288
+ eos_symbol: str = "<eos>",
289
+ ):
290
+ self.pad_symbol = pad_symbol
291
+ self.add_eos = add_eos
292
+ self.add_bos = add_bos
293
+ self.bos_symbol = bos_symbol
294
+ self.eos_symbol = eos_symbol
295
+
296
+ unique_tokens = [pad_symbol]
297
+ if add_bos:
298
+ unique_tokens.append(bos_symbol)
299
+ if add_eos:
300
+ unique_tokens.append(eos_symbol)
301
+ unique_tokens.extend(sorted(text_tokens))
302
+
303
+ self.token2idx = {token: idx for idx, token in enumerate(unique_tokens)}
304
+ self.idx2token = unique_tokens
305
+
306
+ def get_token_id_seq(self, text):
307
+ tokens_seq = [p for p in text]
308
+ seq = (
309
+ ([self.bos_symbol] if self.add_bos else [])
310
+ + tokens_seq
311
+ + ([self.eos_symbol] if self.add_eos else [])
312
+ )
313
+
314
+ token_ids = [self.token2idx[token] for token in seq]
315
+ token_lens = len(tokens_seq) + self.add_eos + self.add_bos
316
+
317
+ return token_ids, token_lens
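
A short usage sketch of `SymbolTable`, with made-up symbols, showing the round trip between a table and its string form:

```python
table = SymbolTable.from_str("<eps> 0\na 1\nb 2")
assert table["a"] == 1 and table[2] == "b"   # symbol -> id and id -> symbol
idx = table.add("c")                         # auto-assigns the next free id (3)
assert "c" in table and len(table) == 4

restored = SymbolTable.from_str(table.to_str())
assert restored == table
```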
utils/tokenizer.py ADDED
@@ -0,0 +1,150 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from
7
+ # https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/data/tokenizer.py
8
+
9
+ import re
10
+ from typing import Any, Dict, List, Optional, Pattern, Union
11
+
12
+ import torch
13
+ import torchaudio
14
+ from encodec import EncodecModel
15
+ from encodec.utils import convert_audio
16
+
17
+
18
+ class AudioTokenizer:
19
+ """EnCodec audio tokenizer for encoding and decoding audio.
20
+
21
+ Attributes:
22
+ device: The device on which the codec model is loaded.
23
+ codec: The pretrained EnCodec model.
24
+ sample_rate: Sample rate of the model.
25
+ channels: Number of audio channels in the model.
26
+ """
27
+
28
+ def __init__(self, device: Any = None) -> None:
29
+ model = EncodecModel.encodec_model_24khz()
30
+ model.set_target_bandwidth(6.0)
31
+ remove_encodec_weight_norm(model)
32
+
33
+ if not device:
34
+ device = torch.device("cpu")
35
+ if torch.cuda.is_available():
36
+ device = torch.device("cuda:0")
37
+
38
+ self._device = device
39
+
40
+ self.codec = model.to(device)
41
+ self.sample_rate = model.sample_rate
42
+ self.channels = model.channels
43
+
44
+ @property
45
+ def device(self):
46
+ return self._device
47
+
48
+ def encode(self, wav: torch.Tensor) -> torch.Tensor:
49
+ """Encode the audio waveform.
50
+
51
+ Args:
52
+ wav: A tensor representing the audio waveform.
53
+
54
+ Returns:
55
+ A tensor representing the encoded audio.
56
+ """
57
+ return self.codec.encode(wav.to(self.device))
58
+
59
+ def decode(self, frames: torch.Tensor) -> torch.Tensor:
60
+ """Decode the encoded audio frames.
61
+
62
+ Args:
63
+ frames: A tensor representing the encoded audio frames.
64
+
65
+ Returns:
66
+ A tensor representing the decoded audio waveform.
67
+ """
68
+ return self.codec.decode(frames)
69
+
70
+
71
+ def tokenize_audio(tokenizer: AudioTokenizer, audio_path: str):
72
+ """
73
+ Tokenize the audio waveform using the given AudioTokenizer.
74
+
75
+ Args:
76
+ tokenizer: An instance of AudioTokenizer.
77
+ audio_path: Path to the audio file.
78
+
79
+ Returns:
80
+ A tensor of encoded frames from the audio.
81
+
82
+ Raises:
83
+ FileNotFoundError: If the audio file is not found.
84
+ RuntimeError: If there's an error processing the audio data.
85
+ """
86
+ try:
87
+ # Load and preprocess the audio waveform
88
+ wav, sr = torchaudio.load(audio_path)
89
+ wav = convert_audio(wav, sr, tokenizer.sample_rate, tokenizer.channels)
90
+ wav = wav.unsqueeze(0)
91
+
92
+ # Extract discrete codes from EnCodec
93
+ with torch.no_grad():
94
+ encoded_frames = tokenizer.encode(wav)
95
+ return encoded_frames
96
+
97
+ except FileNotFoundError:
98
+ raise FileNotFoundError(f"Audio file not found at {audio_path}")
99
+ except Exception as e:
100
+ raise RuntimeError(f"Error processing audio data: {e}")
101
+
102
+
103
+ def remove_encodec_weight_norm(model):
104
+ from encodec.modules import SConv1d
105
+ from encodec.modules.seanet import SConvTranspose1d, SEANetResnetBlock
106
+ from torch.nn.utils import remove_weight_norm
107
+
108
+ encoder = model.encoder.model
109
+ for key in encoder._modules:
110
+ if isinstance(encoder._modules[key], SEANetResnetBlock):
111
+ remove_weight_norm(encoder._modules[key].shortcut.conv.conv)
112
+ block_modules = encoder._modules[key].block._modules
113
+ for skey in block_modules:
114
+ if isinstance(block_modules[skey], SConv1d):
115
+ remove_weight_norm(block_modules[skey].conv.conv)
116
+ elif isinstance(encoder._modules[key], SConv1d):
117
+ remove_weight_norm(encoder._modules[key].conv.conv)
118
+
119
+ decoder = model.decoder.model
120
+ for key in decoder._modules:
121
+ if isinstance(decoder._modules[key], SEANetResnetBlock):
122
+ remove_weight_norm(decoder._modules[key].shortcut.conv.conv)
123
+ block_modules = decoder._modules[key].block._modules
124
+ for skey in block_modules:
125
+ if isinstance(block_modules[skey], SConv1d):
126
+ remove_weight_norm(block_modules[skey].conv.conv)
127
+ elif isinstance(decoder._modules[key], SConvTranspose1d):
128
+ remove_weight_norm(decoder._modules[key].convtr.convtr)
129
+ elif isinstance(decoder._modules[key], SConv1d):
130
+ remove_weight_norm(decoder._modules[key].conv.conv)
131
+
132
+
133
+ def extract_encodec_token(wav_path):
134
+ model = EncodecModel.encodec_model_24khz()
135
+ model.set_target_bandwidth(6.0)
136
+
137
+ wav, sr = torchaudio.load(wav_path)
138
+ wav = convert_audio(wav, sr, model.sample_rate, model.channels)
139
+ wav = wav.unsqueeze(0)
140
+ if torch.cuda.is_available():
141
+ model = model.cuda()
142
+ wav = wav.cuda()
143
+ with torch.no_grad():
144
+ encoded_frames = model.encode(wav)
145
+ codes_ = torch.cat(
146
+ [encoded[0] for encoded in encoded_frames], dim=-1
147
+ ) # [B, n_q, T]
148
+ codes = codes_.cpu().numpy()[0, :, :].T # [T, 8]
149
+
150
+ return codes
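
A sketch of the intended call pattern; the wav path here is hypothetical, and the `encodec` package must be installed:

```python
import torch

tokenizer = AudioTokenizer()                        # uses cuda:0 when available
frames = tokenize_audio(tokenizer, "prompt.wav")    # list of (codes, scale) frames
codes = torch.cat([f[0] for f in frames], dim=-1)   # (B, n_q, T) discrete indices
wav_hat = tokenizer.decode(frames)                  # reconstructed 24 kHz waveform

# One-shot helper that skips the tokenizer object:
codes_np = extract_encodec_token("prompt.wav")      # numpy array of shape (T, 8)
```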
utils/topk_sampling.py ADDED
@@ -0,0 +1,89 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+
11
+ # This function is modified from https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py
12
+ def top_k_top_p_filtering(
13
+ logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1
14
+ ):
15
+ """
16
+ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.
17
+
18
+ Args:
19
+ logits (torch.Tensor): Logits distribution with shape (batch size, vocabulary size).
20
+ top_k (int, optional): Keep only top k tokens with highest probability (top-k filtering).
21
+ Set to 0 to disable. Defaults to 0.
22
+ top_p (float, optional): Keep the top tokens with a cumulative probability >= top_p (nucleus filtering).
23
+ Must be between 0 and 1, inclusive. Defaults to 1.0.
24
+ filter_value (float, optional): The value to assign to filtered logits. Defaults to -float('Inf').
25
+ min_tokens_to_keep (int, optional): Ensure that at least this number of tokens are kept per batch example.
26
+ Defaults to 1.
27
+
28
+ Returns:
29
+ torch.Tensor: The filtered logits.
30
+
31
+ Notes:
32
+ Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
33
+ Make sure we keep at least min_tokens_to_keep per batch example in the output
34
+ From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
35
+ """
36
+ if top_k > 0:
37
+ # Apply top-k filtering
38
+ top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))
39
+ indices_to_remove = logits < torch.topk(logits, top_k).values[..., -1, None]
40
+ logits[indices_to_remove] = filter_value
41
+
42
+ if top_p < 1.0:
43
+ # Apply top-p filtering
44
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
45
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
46
+
47
+ # Create a mask to remove tokens with cumulative probability above the top_p threshold
48
+ sorted_indices_to_remove = cumulative_probs > top_p
49
+ if min_tokens_to_keep > 1:
50
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
51
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
52
+ sorted_indices_to_remove[..., 0] = 0
53
+
54
+ # Scatter sorted tensors back to original indexing
55
+ indices_to_remove = sorted_indices_to_remove.scatter(
56
+ 1, sorted_indices, sorted_indices_to_remove
57
+ )
58
+ logits[indices_to_remove] = filter_value
59
+
60
+ return logits
61
+
62
+
63
+ def topk_sampling(logits, top_k=50, top_p=1.0, temperature=1.0):
64
+ """
65
+ Perform top-k and top-p sampling on logits.
66
+
67
+ Args:
68
+ logits (torch.Tensor): The logits to sample from.
69
+ top_k (int, optional): The number of highest probability tokens to keep for top-k filtering.
70
+ Must be a positive integer. Defaults to 50.
71
+ top_p (float, optional): The cumulative probability threshold for nucleus sampling.
72
+ Must be between 0 and 1. Defaults to 1.0.
73
+ temperature (float, optional): The scaling factor to adjust the logits distribution.
74
+ Must be strictly positive. Defaults to 1.0.
75
+
76
+ Returns:
77
+ torch.Tensor: The sampled token.
78
+ """
79
+
80
+ # Adjust logits using temperature
81
+ if temperature != 1.0:
82
+ logits = logits / temperature
83
+
84
+ # Top-p/top-k filtering
85
+ logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
86
+
87
+ # Sample from the filtered distribution
88
+ token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
89
+ return token
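
A minimal sketch of the sampler on dummy logits:

```python
import torch

logits = torch.randn(2, 1000)   # (batch, vocab_size)
token = topk_sampling(logits, top_k=50, top_p=0.9, temperature=0.8)
print(token.shape)              # torch.Size([2, 1]) -- one sampled id per row
```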
utils/trainer_utils.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+
9
+ def check_nan(logger, loss, y_pred, y_gt):
10
+ if torch.any(torch.isnan(loss)):
11
+ logger.info("out has nan: {}".format(torch.any(torch.isnan(y_pred))))
12
+ logger.info("y_gt has nan: {}".format(torch.any(torch.isnan(y_gt))))
13
+ logger.info("out: {}".format(y_pred))
14
+ logger.info("y_gt: {}".format(y_gt))
15
+ logger.info("loss = {:.4f}\n".format(loss.item()))
16
+ exit()
utils/util.py ADDED
@@ -0,0 +1,687 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import collections
8
+ import glob
9
+ import os
10
+ import random
11
+ import time
12
+ import argparse
13
+ from collections import OrderedDict
14
+
15
+ import json5
16
+ import numpy as np
17
+
18
+ from torch.nn import functional as F
19
+
20
+
21
+ try:
22
+ from ruamel.yaml import YAML as yaml
23
+ except ImportError:
24
+ from ruamel_yaml import YAML as yaml
25
+
26
+ import torch
27
+
28
+ from utils.hparam import HParams
29
+ import logging
30
+ from logging import handlers
31
+
32
+
33
+ def str2bool(v):
34
+ """Used in argparse.ArgumentParser.add_argument to indicate
35
+ that a type is a bool type and user can enter
36
+
37
+ - yes, true, t, y, 1, to represent True
38
+ - no, false, f, n, 0, to represent False
39
+
40
+ See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse # noqa
41
+ """
42
+ if isinstance(v, bool):
43
+ return v
44
+ if v.lower() in ("yes", "true", "t", "y", "1"):
45
+ return True
46
+ elif v.lower() in ("no", "false", "f", "n", "0"):
47
+ return False
48
+ else:
49
+ raise argparse.ArgumentTypeError("Boolean value expected.")
50
+
51
+
52
+ def find_checkpoint_of_mapper(mapper_ckpt_dir):
53
+ mapper_ckpts = glob.glob(os.path.join(mapper_ckpt_dir, "ckpts/*.pt"))
54
+
55
+ # Select the max steps
56
+ mapper_ckpts.sort()
57
+ mapper_weights_file = mapper_ckpts[-1]
58
+ return mapper_weights_file
59
+
60
+
61
+ def pad_f0_to_tensors(f0s, batched=None):
62
+ # Initialize
63
+ tensors = []
64
+
65
+ if batched is None:
66
+ # Get the max frame for padding
67
+ size = -1
68
+ for f0 in f0s:
69
+ size = max(size, f0.shape[-1])
70
+
71
+ tensor = torch.zeros(len(f0s), size)
72
+
73
+ for i, f0 in enumerate(f0s):
74
+ tensor[i, : f0.shape[-1]] = f0[:]
75
+
76
+ tensors.append(tensor)
77
+ else:
78
+ start = 0
79
+ while start + batched - 1 < len(f0s):
80
+ end = start + batched - 1
81
+
82
+ # Get the max frame for padding
83
+ size = -1
84
+ for i in range(start, end + 1):
85
+ size = max(size, f0s[i].shape[-1])
86
+
87
+ tensor = torch.zeros(batched, size)
88
+
89
+ for i in range(start, end + 1):
90
+ tensor[i - start, : f0s[i].shape[-1]] = f0s[i][:]
91
+
92
+ tensors.append(tensor)
93
+
94
+ start = start + batched
95
+
96
+ if start != len(f0s):
97
+ end = len(f0s)
98
+
99
+ # Get the max frame for padding
100
+ size = -1
101
+ for i in range(start, end):
102
+ size = max(size, f0s[i].shape[-1])
103
+
104
+ tensor = torch.zeros(len(f0s) - start, size)
105
+
106
+ for i in range(start, end):
107
+ tensor[i - start, : f0s[i].shape[-1]] = f0s[i][:]
108
+
109
+ tensors.append(tensor)
110
+
111
+ return tensors
112
+
113
+
114
+ def pad_mels_to_tensors(mels, batched=None):
115
+ """
116
+ Args:
117
+ mels: A list of mel-specs
118
+ Returns:
119
+ tensors: A list of tensors containing the batched mel-specs
120
+ mel_frames: A list of tensors containing the frames of the original mel-specs
121
+ """
122
+ # Initialize
123
+ tensors = []
124
+ mel_frames = []
125
+
126
+ # Split mel-specs into batches to avoid cuda memory exceed
127
+ if batched is None:
128
+ # Get the max frame for padding
129
+ size = -1
130
+ for mel in mels:
131
+ size = max(size, mel.shape[-1])
132
+
133
+ tensor = torch.zeros(len(mels), mels[0].shape[0], size)
134
+ mel_frame = torch.zeros(len(mels), dtype=torch.int32)
135
+
136
+ for i, mel in enumerate(mels):
137
+ tensor[i, :, : mel.shape[-1]] = mel[:]
138
+ mel_frame[i] = mel.shape[-1]
139
+
140
+ tensors.append(tensor)
141
+ mel_frames.append(mel_frame)
142
+ else:
143
+ start = 0
144
+ while start + batched - 1 < len(mels):
145
+ end = start + batched - 1
146
+
147
+ # Get the max frame for padding
148
+ size = -1
149
+ for i in range(start, end + 1):
150
+ size = max(size, mels[i].shape[-1])
151
+
152
+ tensor = torch.zeros(batched, mels[0].shape[0], size)
153
+ mel_frame = torch.zeros(batched, dtype=torch.int32)
154
+
155
+ for i in range(start, end + 1):
156
+ tensor[i - start, :, : mels[i].shape[-1]] = mels[i][:]
157
+ mel_frame[i - start] = mels[i].shape[-1]
158
+
159
+ tensors.append(tensor)
160
+ mel_frames.append(mel_frame)
161
+
162
+ start = start + batched
163
+
164
+ if start != len(mels):
165
+ end = len(mels)
166
+
167
+ # Get the max frame for padding
168
+ size = -1
169
+ for i in range(start, end):
170
+ size = max(size, mels[i].shape[-1])
171
+
172
+ tensor = torch.zeros(len(mels) - start, mels[0].shape[0], size)
173
+ mel_frame = torch.zeros(len(mels) - start, dtype=torch.int32)
174
+
175
+ for i in range(start, end):
176
+ tensor[i - start, :, : mels[i].shape[-1]] = mels[i][:]
177
+ mel_frame[i - start] = mels[i].shape[-1]
178
+
179
+ tensors.append(tensor)
180
+ mel_frames.append(mel_frame)
181
+
182
+ return tensors, mel_frames
183
+
184
+
185
+ def load_model_config(args):
186
+ """Load model configurations (in args.json under checkpoint directory)
187
+
188
+ Args:
189
+ args (ArgumentParser): arguments to run bins/preprocess.py
190
+
191
+ Returns:
192
+ dict: dictionary that stores model configurations
193
+ """
194
+ if args.checkpoint_dir is None:
195
+ assert args.checkpoint_file is not None
196
+ checkpoint_dir = os.path.split(args.checkpoint_file)[0]
197
+ else:
198
+ checkpoint_dir = args.checkpoint_dir
199
+ config_path = os.path.join(checkpoint_dir, "args.json")
200
+ print("config_path: ", config_path)
201
+
202
+ config = load_config(config_path)
203
+ return config
204
+
205
+
206
+ def remove_and_create(dir):
207
+ if os.path.exists(dir):
208
+ os.system("rm -r {}".format(dir))
209
+ os.makedirs(dir, exist_ok=True)
210
+
211
+
212
+ def has_existed(path, warning=False):
213
+ if not warning:
214
+ return os.path.exists(path)
215
+
216
+ if os.path.exists(path):
217
+ answer = input(
218
+ "The path {} already exists. \nInput 'y' (or hit Enter) to skip it, or input 'n' to overwrite it [y/n]\n".format(
219
+ path
220
+ )
221
+ )
222
+ if not answer == "n":
223
+ return True
224
+
225
+ return False
226
+
227
+
228
+ def remove_older_ckpt(saved_model_name, checkpoint_dir, max_to_keep=5):
229
+ if os.path.exists(os.path.join(checkpoint_dir, "checkpoint")):
230
+ with open(os.path.join(checkpoint_dir, "checkpoint"), "r") as f:
231
+ ckpts = [x.strip() for x in f.readlines()]
232
+ else:
233
+ ckpts = []
234
+ ckpts.append(saved_model_name)
235
+ for item in ckpts[:-max_to_keep]:
236
+ if os.path.exists(os.path.join(checkpoint_dir, item)):
237
+ os.remove(os.path.join(checkpoint_dir, item))
238
+ with open(os.path.join(checkpoint_dir, "checkpoint"), "w") as f:
239
+ for item in ckpts[-max_to_keep:]:
240
+ f.write("{}\n".format(item))
241
+
242
+
243
+ def set_all_random_seed(seed: int):
244
+ random.seed(seed)
245
+ np.random.seed(seed)
246
+ torch.random.manual_seed(seed)
247
+
248
+
249
+ def save_checkpoint(
250
+ args,
251
+ generator,
252
+ g_optimizer,
253
+ step,
254
+ discriminator=None,
255
+ d_optimizer=None,
256
+ max_to_keep=5,
257
+ ):
258
+ saved_model_name = "model.ckpt-{}.pt".format(step)
259
+ checkpoint_path = os.path.join(args.checkpoint_dir, saved_model_name)
260
+
261
+ if discriminator and d_optimizer:
262
+ torch.save(
263
+ {
264
+ "generator": generator.state_dict(),
265
+ "discriminator": discriminator.state_dict(),
266
+ "g_optimizer": g_optimizer.state_dict(),
267
+ "d_optimizer": d_optimizer.state_dict(),
268
+ "global_step": step,
269
+ },
270
+ checkpoint_path,
271
+ )
272
+ else:
273
+ torch.save(
274
+ {
275
+ "generator": generator.state_dict(),
276
+ "g_optimizer": g_optimizer.state_dict(),
277
+ "global_step": step,
278
+ },
279
+ checkpoint_path,
280
+ )
281
+
282
+ print("Saved checkpoint: {}".format(checkpoint_path))
283
+
284
+ if os.path.exists(os.path.join(args.checkpoint_dir, "checkpoint")):
285
+ with open(os.path.join(args.checkpoint_dir, "checkpoint"), "r") as f:
286
+ ckpts = [x.strip() for x in f.readlines()]
287
+ else:
288
+ ckpts = []
289
+ ckpts.append(saved_model_name)
290
+ for item in ckpts[:-max_to_keep]:
291
+ if os.path.exists(os.path.join(args.checkpoint_dir, item)):
292
+ os.remove(os.path.join(args.checkpoint_dir, item))
293
+ with open(os.path.join(args.checkpoint_dir, "checkpoint"), "w") as f:
294
+ for item in ckpts[-max_to_keep:]:
295
+ f.write("{}\n".format(item))
296
+
297
+
298
+ def attempt_to_restore(
299
+ generator, g_optimizer, checkpoint_dir, discriminator=None, d_optimizer=None
300
+ ):
301
+ checkpoint_list = os.path.join(checkpoint_dir, "checkpoint")
302
+ if os.path.exists(checkpoint_list):
303
+ checkpoint_filename = open(checkpoint_list).readlines()[-1].strip()
304
+ checkpoint_path = os.path.join(checkpoint_dir, "{}".format(checkpoint_filename))
305
+ print("Restore from {}".format(checkpoint_path))
306
+ checkpoint = torch.load(checkpoint_path, map_location="cpu")
307
+ if generator:
308
+ if not list(generator.state_dict().keys())[0].startswith("module."):
309
+ raw_dict = checkpoint["generator"]
310
+ clean_dict = OrderedDict()
311
+ for k, v in raw_dict.items():
312
+ if k.startswith("module."):
313
+ clean_dict[k[7:]] = v
314
+ else:
315
+ clean_dict[k] = v
316
+ generator.load_state_dict(clean_dict)
317
+ else:
318
+ generator.load_state_dict(checkpoint["generator"])
319
+ if g_optimizer:
320
+ g_optimizer.load_state_dict(checkpoint["g_optimizer"])
321
+ global_step = 100000
322
+ if discriminator and "discriminator" in checkpoint.keys():
323
+ discriminator.load_state_dict(checkpoint["discriminator"])
324
+ global_step = checkpoint["global_step"]
325
+ print("restore discriminator")
326
+ if d_optimizer and "d_optimizer" in checkpoint.keys():
327
+ d_optimizer.load_state_dict(checkpoint["d_optimizer"])
328
+ print("restore d_optimizer...")
329
+ else:
330
+ global_step = 0
331
+ return global_step
332
+
333
+
334
+ class ExponentialMovingAverage(object):
335
+ def __init__(self, decay):
336
+ self.decay = decay
337
+ self.shadow = {}
338
+
339
+ def register(self, name, val):
340
+ self.shadow[name] = val.clone()
341
+
342
+ def update(self, name, x):
343
+ assert name in self.shadow
344
+ update_delta = self.shadow[name] - x
345
+ self.shadow[name] -= (1.0 - self.decay) * update_delta
346
+
347
+
348
+ def apply_moving_average(model, ema):
349
+ for name, param in model.named_parameters():
350
+ if name in ema.shadow:
351
+ ema.update(name, param.data)
352
+
353
+
354
+ def register_model_to_ema(model, ema):
355
+ for name, param in model.named_parameters():
356
+ if param.requires_grad:
357
+ ema.register(name, param.data)
358
+
359
+
360
+ class YParams(HParams):
361
+ def __init__(self, yaml_file):
362
+ if not os.path.exists(yaml_file):
363
+ raise IOError("yaml file: {} does not exist".format(yaml_file))
364
+ super().__init__()
365
+ self.d = collections.OrderedDict()
366
+ with open(yaml_file) as fp:
367
+ for _, v in yaml().load(fp).items():
368
+ for k1, v1 in v.items():
369
+ try:
370
+ if self.get(k1):
371
+ self.set_hparam(k1, v1)
372
+ else:
373
+ self.add_hparam(k1, v1)
374
+ self.d[k1] = v1
375
+ except Exception:
376
+ import traceback
377
+
378
+ print(traceback.format_exc())
379
+
380
+ # @property
381
+ def get_elements(self):
382
+ return self.d.items()
383
+
384
+
385
+ def override_config(base_config, new_config):
386
+ """Update new configurations in the original dict with the new dict
387
+
388
+ Args:
389
+ base_config (dict): original dict to be overridden
390
+ new_config (dict): dict with new configurations
391
+
392
+ Returns:
393
+ dict: updated configuration dict
394
+ """
395
+ for k, v in new_config.items():
396
+ if type(v) == dict:
397
+ if k not in base_config.keys():
398
+ base_config[k] = {}
399
+ base_config[k] = override_config(base_config[k], v)
400
+ else:
401
+ base_config[k] = v
402
+ return base_config
403
+
404
+
405
+ def get_lowercase_keys_config(cfg):
406
+ """Change all keys in cfg to lower case
407
+
408
+ Args:
409
+ cfg (dict): dictionary that stores configurations
410
+
411
+ Returns:
412
+ dict: dictionary that stores configurations
413
+ """
414
+ updated_cfg = dict()
415
+ for k, v in cfg.items():
416
+ if type(v) == dict:
417
+ v = get_lowercase_keys_config(v)
418
+ updated_cfg[k.lower()] = v
419
+ return updated_cfg
420
+
421
+
422
+ def _load_config(config_fn, lowercase=False):
423
+ """Load configurations into a dictionary
424
+
425
+ Args:
426
+ config_fn (str): path to configuration file
427
+ lowercase (bool, optional): whether changing keys to lower case. Defaults to False.
428
+
429
+ Returns:
430
+ dict: dictionary that stores configurations
431
+ """
432
+ with open(config_fn, "r") as f:
433
+ data = f.read()
434
+ config_ = json5.loads(data)
435
+ if "base_config" in config_:
436
+ # load configurations from new path
437
+ p_config_path = os.path.join(os.getenv("WORK_DIR"), config_["base_config"])
438
+ p_config_ = _load_config(p_config_path)
439
+ config_ = override_config(p_config_, config_)
440
+ if lowercase:
441
+ # change keys in config_ to lower case
442
+ config_ = get_lowercase_keys_config(config_)
443
+ return config_
444
+
445
+
446
+ def load_config(config_fn, lowercase=False):
447
+ """Load configurations into a dictionary
448
+
449
+ Args:
450
+ config_fn (str): path to configuration file
451
+ lowercase (bool, optional): _description_. Defaults to False.
452
+
453
+ Returns:
454
+ JsonHParams: an object that stores configurations
455
+ """
456
+ config_ = _load_config(config_fn, lowercase=lowercase)
457
+ # create an JsonHParams object with configuration dict
458
+ cfg = JsonHParams(**config_)
459
+ return cfg
460
+
461
+
462
+ def save_config(save_path, cfg):
463
+ """Save configurations into a json file
464
+
465
+ Args:
466
+ save_path (str): path to save configurations
467
+ cfg (dict): dictionary that stores configurations
468
+ """
469
+ with open(save_path, "w") as f:
470
+ json5.dump(
471
+ cfg, f, ensure_ascii=False, indent=4, quote_keys=True, sort_keys=True
472
+ )
473
+
474
+
475
+ class JsonHParams:
476
+ def __init__(self, **kwargs):
477
+ for k, v in kwargs.items():
478
+ if type(v) == dict:
479
+ v = JsonHParams(**v)
480
+ self[k] = v
481
+
482
+ def keys(self):
483
+ return self.__dict__.keys()
484
+
485
+ def items(self):
486
+ return self.__dict__.items()
487
+
488
+ def values(self):
489
+ return self.__dict__.values()
490
+
491
+ def __len__(self):
492
+ return len(self.__dict__)
493
+
494
+ def __getitem__(self, key):
495
+ return getattr(self, key)
496
+
497
+ def __setitem__(self, key, value):
498
+ return setattr(self, key, value)
499
+
500
+ def __contains__(self, key):
501
+ return key in self.__dict__
502
+
503
+ def __repr__(self):
504
+ return self.__dict__.__repr__()
505
+
506
+
507
+ class ValueWindow:
508
+ def __init__(self, window_size=100):
509
+ self._window_size = window_size
510
+ self._values = []
511
+
512
+ def append(self, x):
513
+ self._values = self._values[-(self._window_size - 1) :] + [x]
514
+
515
+ @property
516
+ def sum(self):
517
+ return sum(self._values)
518
+
519
+ @property
520
+ def count(self):
521
+ return len(self._values)
522
+
523
+ @property
524
+ def average(self):
525
+ return self.sum / max(1, self.count)
526
+
527
+ def reset(self):
528
+ self._values = []
529
+
530
+
531
+ class Logger(object):
532
+ def __init__(
533
+ self,
534
+ filename,
535
+ level="info",
536
+ when="D",
537
+ backCount=10,
538
+ fmt="%(asctime)s : %(message)s",
539
+ ):
540
+ self.level_relations = {
541
+ "debug": logging.DEBUG,
542
+ "info": logging.INFO,
543
+ "warning": logging.WARNING,
544
+ "error": logging.ERROR,
545
+ "crit": logging.CRITICAL,
546
+ }
547
+ if level == "debug":
548
+ fmt = "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s"
549
+ self.logger = logging.getLogger(filename)
550
+ format_str = logging.Formatter(fmt)
551
+ self.logger.setLevel(self.level_relations.get(level))
552
+ sh = logging.StreamHandler()
553
+ sh.setFormatter(format_str)
554
+ th = handlers.TimedRotatingFileHandler(
555
+ filename=filename, when=when, backupCount=backCount, encoding="utf-8"
556
+ )
557
+ th.setFormatter(format_str)
558
+ self.logger.addHandler(sh)
559
+ self.logger.addHandler(th)
560
+ self.logger.info(
561
+ "==========================New Starting Here=============================="
562
+ )
563
+
564
+
565
+ def init_weights(m, mean=0.0, std=0.01):
566
+ classname = m.__class__.__name__
567
+ if classname.find("Conv") != -1:
568
+ m.weight.data.normal_(mean, std)
569
+
570
+
571
+ def get_padding(kernel_size, dilation=1):
572
+ return int((kernel_size * dilation - dilation) / 2)
573
+
574
+
575
+ def slice_segments(x, ids_str, segment_size=4):
576
+ ret = torch.zeros_like(x[:, :, :segment_size])
577
+ for i in range(x.size(0)):
578
+ idx_str = ids_str[i]
579
+ idx_end = idx_str + segment_size
580
+ ret[i] = x[i, :, idx_str:idx_end]
581
+ return ret
582
+
583
+
584
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
585
+ b, d, t = x.size()
586
+ if x_lengths is None:
587
+ x_lengths = t
588
+ ids_str_max = x_lengths - segment_size + 1
589
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
590
+ ret = slice_segments(x, ids_str, segment_size)
591
+ return ret, ids_str
592
+
593
+
594
+ def subsequent_mask(length):
595
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
596
+ return mask
597
+
598
+
599
+ @torch.jit.script
600
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
601
+ n_channels_int = n_channels[0]
602
+ in_act = input_a + input_b
603
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
604
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
605
+ acts = t_act * s_act
606
+ return acts
607
+
608
+
609
+ def convert_pad_shape(pad_shape):
610
+ l = pad_shape[::-1]
611
+ pad_shape = [item for sublist in l for item in sublist]
612
+ return pad_shape
613
+
614
+
615
+ def sequence_mask(length, max_length=None):
616
+ if max_length is None:
617
+ max_length = length.max()
618
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
619
+ return x.unsqueeze(0) < length.unsqueeze(1)
620
+
621
+
622
+ def generate_path(duration, mask):
623
+ """
624
+ duration: [b, 1, t_x]
625
+ mask: [b, 1, t_y, t_x]
626
+ """
627
+ device = duration.device
628
+
629
+ b, _, t_y, t_x = mask.shape
630
+ cum_duration = torch.cumsum(duration, -1)
631
+
632
+ cum_duration_flat = cum_duration.view(b * t_x)
633
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
634
+ path = path.view(b, t_x, t_y)
635
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
636
+ path = path.unsqueeze(1).transpose(2, 3) * mask
637
+ return path
638
+
639
+
640
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
641
+ if isinstance(parameters, torch.Tensor):
642
+ parameters = [parameters]
643
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
644
+ norm_type = float(norm_type)
645
+ if clip_value is not None:
646
+ clip_value = float(clip_value)
647
+
648
+ total_norm = 0
649
+ for p in parameters:
650
+ param_norm = p.grad.data.norm(norm_type)
651
+ total_norm += param_norm.item() ** norm_type
652
+ if clip_value is not None:
653
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
654
+ total_norm = total_norm ** (1.0 / norm_type)
655
+ return total_norm
656
+
657
+
658
+ def get_current_time():
659
+ pass
660
+
661
+
662
+ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
663
+ """
664
+ Args:
665
+ lengths:
666
+ A 1-D tensor containing sentence lengths.
667
+ max_len:
668
+ The length of masks.
669
+ Returns:
670
+ Return a 2-D bool tensor, where masked positions
671
+ are filled with `True` and non-masked positions are
672
+ filled with `False`.
673
+
674
+ >>> lengths = torch.tensor([1, 3, 2, 5])
675
+ >>> make_pad_mask(lengths)
676
+ tensor([[False, True, True, True, True],
677
+ [False, False, False, True, True],
678
+ [False, False, True, True, True],
679
+ [False, False, False, False, False]])
680
+ """
681
+ assert lengths.ndim == 1, lengths.ndim
682
+ max_len = max(max_len, lengths.max())
683
+ n = lengths.size(0)
684
+ seq_range = torch.arange(0, max_len, device=lengths.device)
685
+ expanded_lengths = seq_range.unsqueeze(0).expand(n, max_len)
686
+
687
+ return expanded_lengths >= lengths.unsqueeze(-1)
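
A small illustration of the two config helpers above, with made-up keys:

```python
# JsonHParams exposes nested dicts as attributes while staying dict-like.
cfg = JsonHParams(**{"model": {"hidden_size": 256}, "lr": 2e-4})
assert cfg.model.hidden_size == 256 and cfg["lr"] == 2e-4

# override_config merges a new dict into a base dict, recursing into sub-dicts.
base = {"train": {"lr": 1e-4, "steps": 100}}
merged = override_config(base, {"train": {"lr": 3e-4}})
assert merged["train"] == {"lr": 3e-4, "steps": 100}
```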
utils/whisper_transcription.py ADDED
@@ -0,0 +1,122 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import pathlib
8
+ import string
9
+ import time
10
+ from multiprocessing import Pool, Value, Lock
11
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
12
+ import torch
13
+ import whisper
14
+
15
+ processed_files_count = Value("i", 0) # count of processed files
16
+ lock = Lock() # lock for the count
17
+
18
+
19
+ def preprocess_text(text):
20
+ """Preprocess text after ASR"""
21
+ return text.lower().translate(str.maketrans("", "", string.punctuation))
22
+
23
+
24
+ def transcribe_audio(model, processor, audio_file, device):
25
+ """Transcribe audio file"""
26
+ audio = whisper.load_audio(audio_file) # load from path
27
+ audio = whisper.pad_or_trim(audio) # default 30 seconds
28
+ inputs = whisper.log_mel_spectrogram(audio).to(
29
+ device=device
30
+ ) # convert to spectrogram
31
+ inputs = inputs.unsqueeze(0).to(dtype=model.dtype)  # add batch dimension; match the model's precision (fp16 on GPU, fp32 on CPU)
32
+
33
+ outputs = model.generate(
34
+ inputs=inputs, max_new_tokens=128
35
+ ) # generate transcription
36
+ transcription = processor.batch_decode(outputs, skip_special_tokens=True)[
37
+ 0
38
+ ] # decode
39
+ transcription_processed = preprocess_text(transcription) # preprocess
40
+ return transcription_processed
41
+
42
+
43
+ def write_transcription(audio_file, transcription):
44
+ """Write transcription to txt file"""
45
+ txt_file = audio_file.with_suffix(".txt")
46
+ with open(txt_file, "w") as file:
47
+ file.write(transcription)
48
+
49
+
50
+ def init_whisper(model_id, device):
51
+ """Initialize whisper model and processor"""
52
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
53
+ print(f"Loading model {model_id}") # model_id = "distil-whisper/distil-large-v2"
54
+ distil_model = AutoModelForSpeechSeq2Seq.from_pretrained(
55
+ model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=False
56
+ )
57
+ distil_model = distil_model.to(device)
58
+ processor = AutoProcessor.from_pretrained(model_id)
59
+ return distil_model, processor
60
+
61
+
62
+ def asr_wav_files(file_list, gpu_id, total_files, model_id):
63
+ """Transcribe wav files in a list"""
64
+ device = f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu"
65
+ whisper_model, processor = init_whisper(model_id, device)
66
+ print(f"Processing on {device} starts")
67
+ start_time = time.time()
68
+ for audio_file in file_list:
69
+ try:
70
+ transcription = transcribe_audio(
71
+ whisper_model, processor, audio_file, device
72
+ )
73
+ write_transcription(audio_file, transcription)
74
+ with lock:
75
+ processed_files_count.value += 1
76
+ if processed_files_count.value % 5 == 0:
77
+ current_time = time.time()
78
+ avg_time_per_file = (current_time - start_time) / (
79
+ processed_files_count.value
80
+ )
81
+ remaining_files = total_files - processed_files_count.value
82
+ estimated_time_remaining = avg_time_per_file * remaining_files
83
+ remaining_time_formatted = time.strftime(
84
+ "%H:%M:%S", time.gmtime(estimated_time_remaining)
85
+ )
86
+ print(
87
+ f"Processed {processed_files_count.value}/{total_files} files, time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}, Estimated time remaining: {remaining_time_formatted}"
88
+ )
89
+ except Exception as e:
90
+ print(f"Error processing file {audio_file}: {e}")
91
+
92
+
93
+ def asr_main(input_dir, num_gpus, model_id):
94
+ """Transcribe wav files in a directory"""
95
+ num_processes = min(num_gpus, os.cpu_count())
96
+ print(f"Using {num_processes} GPUs for transcription")
97
+ wav_files = list(pathlib.Path(input_dir).rglob("*.wav"))
98
+ total_files = len(wav_files)
99
+ print(f"Found {total_files} wav files in {input_dir}")
100
+ files_per_process = len(wav_files) // num_processes
101
+ print(f"Processing {files_per_process} files per process")
102
+ with Pool(num_processes) as p:
103
+ p.starmap(
104
+ asr_wav_files,
105
+ [
106
+ (
107
+ wav_files[i * files_per_process :] if i == num_processes - 1 else wav_files[i * files_per_process : (i + 1) * files_per_process],  # last worker also takes the remainder
108
+ i % num_gpus,
109
+ total_files,
110
+ model_id,
111
+ )
112
+ for i in range(num_processes)
113
+ ],
114
+ )
115
+ print("Done!")
116
+
117
+
118
+ if __name__ == "__main__":
119
+ input_dir = "/path/to/output/directory"
120
+ num_gpus = 2
121
+ model_id = "distil-whisper/distil-large-v2"
122
+ asr_main(input_dir, num_gpus, model_id)
utils/world.py ADDED
@@ -0,0 +1,92 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # 1. Extract WORLD features including F0, AP, SP
7
+ # 2. Transform between SP and MCEP
8
+ import torchaudio
9
+ import pyworld as pw
10
+ import numpy as np
11
+ import torch
12
+ import diffsptk
13
+ import os
14
+ from tqdm import tqdm
15
+ import pickle
16
+
17
+
18
+
19
+ def get_mcep_params(fs):
20
+ """Hyperparameters of transformation between SP and MCEP
21
+
22
+ Reference:
23
+ https://github.com/CSTR-Edinburgh/merlin/blob/master/misc/scripts/vocoder/world_v2/copy_synthesis.sh
24
+
25
+ """
26
+ if fs in [44100, 48000]:
27
+ fft_size = 2048
28
+ alpha = 0.77
29
+ if fs in [16000]:
30
+ fft_size = 1024
31
+ alpha = 0.58
32
+ return fft_size, alpha
33
+
34
+
35
+ def extract_world_features(waveform, fs, frameshift=10):
36
+ # waveform: (1, seq)
37
+ # x: (seq,)
38
+ x = np.array(waveform, dtype=np.double)
39
+
40
+ _f0, t = pw.dio(x, fs, frame_period=frameshift) # raw pitch extractor
41
+ f0 = pw.stonemask(x, _f0, t, fs) # pitch refinement
42
+ sp = pw.cheaptrick(x, f0, t, fs) # extract smoothed spectrogram
43
+ ap = pw.d4c(x, f0, t, fs) # extract aperiodicity
44
+
45
+ return f0, sp, ap, fs
46
+
47
+
48
+ def sp2mcep(x, mcsize, fs):
49
+ fft_size, alpha = get_mcep_params(fs)
50
+ x = torch.as_tensor(x, dtype=torch.float)
51
+
52
+ tmp = diffsptk.ScalarOperation("SquareRoot")(x)
53
+ tmp = diffsptk.ScalarOperation("Multiplication", 32768.0)(tmp)
54
+ mgc = diffsptk.MelCepstralAnalysis(
55
+ cep_order=mcsize - 1, fft_length=fft_size, alpha=alpha, n_iter=1
56
+ )(tmp)
57
+ return mgc.numpy()
58
+
59
+
60
+ def mcep2sp(x, mcsize, fs):
61
+ fft_size, alpha = get_mcep_params(fs)
62
+ x = torch.as_tensor(x, dtype=torch.float)
63
+
64
+ tmp = diffsptk.MelGeneralizedCepstrumToSpectrum(
65
+ alpha=alpha,
66
+ cep_order=mcsize - 1,
67
+ fft_length=fft_size,
68
+ )(x)
69
+ tmp = diffsptk.ScalarOperation("Division", 32768.0)(tmp)
70
+ sp = diffsptk.ScalarOperation("Power", 2)(tmp)
71
+ return sp.double().numpy()
72
+
73
+
74
+ def f0_statistics(f0_features, path):
75
+ print("\nF0 statistics...")
76
+
77
+ total_f0 = []
78
+ for f0 in tqdm(f0_features):
79
+ total_f0 += [f for f in f0 if f != 0]
80
+
81
+ mean = sum(total_f0) / len(total_f0)
82
+ print("Min = {}, Max = {}, Mean = {}".format(min(total_f0), max(total_f0), mean))
83
+
84
+ with open(path, "wb") as f:
85
+ pickle.dump([mean, total_f0], f)
86
+
87
+
88
+ def world_synthesis(f0, sp, ap, fs, frameshift):
89
+ y = pw.synthesize(
90
+ f0, sp, ap, fs, frame_period=frameshift
91
+ ) # synthesize an utterance using the parameters
92
+ return y
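
A minimal analysis-synthesis round trip through the helpers above; the input is synthetic noise, purely for illustration:

```python
import numpy as np

fs = 16000
waveform = np.random.uniform(-0.3, 0.3, fs)   # 1 second of noise at 16 kHz

f0, sp, ap, fs = extract_world_features(waveform, fs, frameshift=10)
mcep = sp2mcep(sp, mcsize=40, fs=fs)          # spectral envelope -> 40-dim MCEP
sp_hat = mcep2sp(mcep, mcsize=40, fs=fs)      # and back to a spectral envelope
y = world_synthesis(f0, sp_hat, ap, fs, frameshift=10)
```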
visualization/SingVisio/System_Introduction_of_SingVisio_V2.pdf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dd205eace26d91a558e70662a61f017e3ca78e89d98cf45a72ee0911c6a64d2
3
+ size 4592895
visualization/SingVisio/webpage/Dockerfile ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ FROM python:3.10
7
+
8
+ WORKDIR /app
9
+
10
+ COPY resources ./resources
11
+ COPY img ./img
12
+ COPY index.html ./index.html
13
+ COPY server.py ./server.py
14
+ COPY config ./config
15
+
16
+ RUN pip install numpy scikit-learn flask flask_cors gunicorn -i https://pypi.tuna.tsinghua.edu.cn/simple
17
+
18
+ EXPOSE 8000
19
+
20
+ ENTRYPOINT ["gunicorn", "-w", "8", "-b", "0.0.0.0:8000", "server:app"]
21
+
22
+ # docker build -t singvisio .
23
+ # docker run -v $(pwd)/data:/app/data -p 8000:8000 singvisio
visualization/SingVisio/webpage/README.md ADDED
@@ -0,0 +1,126 @@
1
+ ## SingVisio Webpage
2
+
3
+ This is the source code for the SingVisio webpage. This README introduces the project and provides an installation guide. For an introduction to SingVisio itself, please see this [README.md](../../../egs/visualization/SingVisio/README.md) file.
4
+
5
+ ### Tech Stack
6
+
7
+ - [Tailwind CSS](https://tailwindcss.com/)
8
+ - [Flowbite](https://flowbite.com/)
9
+ - [D3.js](https://d3js.org/)
10
+ - [Driver.js](https://driverjs.com/)
11
+
12
+ ### Structure
13
+
14
+ - `index.html`: The entry point file.
15
+ - `config`: Contains JSON configuration files loaded by `index.html`.
16
+ - `img`: Image files.
17
+ - `resources`: Contains CSS styles and JavaScript files.
18
+ - `init.js`: Loads the configuration and initializes variables.
19
+ - `function.js`: Houses the functions used in this project.
20
+ - `event.js`: Binds webpage mouse and keyboard events to functions.
21
+ - `Dockerfile`: For building a Docker image if deployment is needed.
22
+
23
+ ### Configuration
24
+
25
+ Before installation, you need to configure the data path in the `config/default.json` file.
26
+
27
+ Note that the shipped configuration pertains to our pre-processed data. If you want to visualize your own data, follow the guide below to set up the system properly.
28
+
29
+ 1. **Update the Data Configuration** in the `config/default.json` file.
30
+
31
+ SingVisio reads the configuration from this JSON file and renders the webpage accordingly. Be aware that any error in the JSON file may prevent the system from loading (a validation sketch follows this section).
32
+
33
+ ```json
34
+ {
35
+ "pathData": {
36
+ "<mode_name>": { // supports multiple modes
37
+ "users": ["basic", "advanced"], // mode choice: "basic" or "advanced"
38
+ "multi": ["<id>"], // song_id, sourcesinger_id, or target_id. Set to false to disable. Enables multiple choices for the configured checkbox.
39
+ "curve": true, // set to true if the metric curve is needed
40
+ "referenceMap": { // configures reference paths when multiple choices are enabled.
41
+ "<sourcesinger_id>": [ // e.g., m4singer_Tenor-6
42
+ "<path_to_wav>", // e.g., Tenor-6_ε―‚ε―žζ²™ζ΄²ε†·_0002
43
+ ]
44
+ },
45
+ "data": [
46
+ { // supports multiple datasets
47
+ "dataset": "<dataset_name>",
48
+ "basePath": "<path_to_the_processed_data>",
49
+ "pathMap": {
50
+ "<sourcesinger_id>": {
51
+ "songs": [
52
+ "<song_id>" // set song ID; supports multiple IDs
53
+ ],
54
+ "targets": [
55
+ "<target_id>" // set target singer ID; supports multiple IDs
56
+ ]
57
+ }
58
+ }
59
+ }
60
+ ]
61
+ }
62
+ },
63
+ "mapToName": {
64
+ "<map_from>": "<map_to>"
65
+ },
66
+ "mapToSong": {
67
+ "<map_from>": "<map_to>"
68
+ },
69
+ "mapToSpace": {
70
+ "<map_from>": "<map_to>"
71
+ },
72
+ "picTypes": [
73
+ "<pic_type>" // supports multiple types
74
+ ],
75
+ "evaluation_data": [
76
+ { // supports multiple data sets
77
+ "target": "<target_id>",
78
+ "sourcesinger": "<sourcesinger_id>",
79
+ "song": "<song_id>",
80
+ "best": [
81
+ "<best_metric>" // activated when clicking the respective metric
82
+ ]
83
+ },
84
+ ],
85
+ "colorList": [
86
+ "<color_hex_code>" // supports multiple colors
87
+ ],
88
+ "histogramData": [
89
+ { // displayed in the top left graph
90
+ "type": "high", // "high" or "low"; "high" means the higher, the better
91
+ "name": "<metric_name>",
92
+ "value": <metric_value>
93
+ }
94
+ ]
95
+ }
96
+ ```
97
+
98
+ 2. **Change the Data Source Path**
99
+
100
+ The total size of our pre-processed data is approximately 60-70 GB. We host it on an online server; the server path (`baseLink`) can be modified on line 15 of the `index.html` file.
101
+
102
+ If you prefer to host the data on your local computer, you can set the `baseLink` value to an empty string as shown below. This will direct the server to read data from your local `data` folder.
103
+
104
+ ```html
105
+ <script>
106
+ const baseLink = ''; // do not end with '/'
107
+ </script>
108
+ ```
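Two practical notes on the configuration above. First, the annotated block in step 1 is a schema illustration; real JSON must not contain `//` comments, so strip them before saving. Since a malformed file can prevent the page from loading, a quick structural check before deploying can help; a minimal sketch (the key names mirror the shipped `config/default.json`):

```python
# Structural sanity check for config/default.json (illustrative, not shipped code).
import json

with open("config/default.json", encoding="utf-8") as f:
    cfg = json.load(f)  # raises json.JSONDecodeError on any syntax error

for key in ("pathData", "picTypes", "colorList", "histogramData"):
    assert key in cfg, f"missing top-level key: {key}"

for mode, spec in cfg["pathData"].items():
    for entry in spec.get("data", []):
        assert "basePath" in entry and "pathMap" in entry, f"incomplete entry in {mode}"

print("config/default.json looks structurally OK")
```

Second, when `baseLink` is empty, the server resolves audio and image requests against the local `data` folder, so its layout must match the `basePath` values in the config (e.g., `data/gd_svcc` in the shipped `config/default.json`).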
109
+
110
+ ### Installation
111
+
112
+ This project does not require a build process. There are multiple ways to run it, but here we introduce the simplest method:
113
+
114
+ 1. Install Python 3.10 and the required packages:
115
+ ```bash
116
+ pip install numpy scikit-learn flask flask_cors gunicorn
117
+ ```
118
+
119
+ 2. Run the following commands to start the HTTP server:
120
+
121
+ ```bash
122
+ cd webpage
123
+ gunicorn -w 8 -b 0.0.0.0:8080 server:app
124
+ ```
125
+
126
+ 3. After starting the HTTP web server, open the following link in your browser: [http://localhost:8080/](http://localhost:8080/)
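Alternatively, the provided `Dockerfile` packages the same server. A sketch, assuming your pre-processed data lives in `webpage/data` (the commands mirror the comments at the end of the Dockerfile; note the container listens on port 8000 rather than 8080):

```bash
cd webpage
docker build -t singvisio .
docker run -v $(pwd)/data:/app/data -p 8000:8000 singvisio
# then open http://localhost:8000/ in your browser
```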
visualization/SingVisio/webpage/config/default.json ADDED
@@ -0,0 +1,407 @@
1
+ {
2
+ "pathData": {
3
+ "Step Comparison": {
4
+ "users": ["basic", "advanced"],
5
+ "multi": false,
6
+ "data": [
7
+ {
8
+ "dataset": "SVCC",
9
+ "basePath": "data/gd_svcc",
10
+ "pathMap": {
11
+ "SF1": {
12
+ "songs": [
13
+ "30001",
14
+ "30002",
15
+ "30003"
16
+ ],
17
+ "targets": [
18
+ "svcc_IDF1",
19
+ "svcc_IDM1",
20
+ "svcc_CDF1",
21
+ "svcc_CDM1"
22
+ ]
23
+ },
24
+ "SM1": {
25
+ "songs": [
26
+ "30001",
27
+ "30002",
28
+ "30003"
29
+ ],
30
+ "targets": [
31
+ "svcc_IDF1",
32
+ "svcc_IDM1",
33
+ "svcc_CDF1",
34
+ "svcc_CDM1"
35
+ ]
36
+ }
37
+ }
38
+ },
39
+ {
40
+ "dataset": "M4Singer",
41
+ "basePath": "data/gd_m4sg",
42
+ "pathMap": {
43
+ "Alto-1": {
44
+ "songs": [
45
+ "ηΎŽι”™_0014"
46
+ ],
47
+ "targets": [
48
+ "opencpop"
49
+ ]
50
+ },
51
+ "Bass-1": {
52
+ "songs": [
53
+ "十年_0008"
54
+ ],
55
+ "targets": [
56
+ "opencpop"
57
+ ]
58
+ },
59
+ "Soprano-2": {
60
+ "songs": [
61
+ "εŒζ‘Œηš„δ½ _0018"
62
+ ],
63
+ "targets": [
64
+ "opencpop"
65
+ ]
66
+ },
67
+ "Tenor-5": {
68
+ "songs": [
69
+ "ηˆ±η¬‘ηš„ηœΌη›_0010"
70
+ ],
71
+ "targets": [
72
+ "opencpop"
73
+ ]
74
+ }
75
+ }
76
+ }
77
+ ]
78
+ },
79
+ "Metric Comparison": {
80
+ "users": ["basic", "advanced"],
81
+ "multi": false,
82
+ "curve": true,
83
+ "data": [
84
+ {
85
+ "dataset": "SVCC",
86
+ "basePath": "data/ev_best",
87
+ "pathMap": {
88
+ "SM1": {
89
+ "songs": [
90
+ "30009"
91
+ ],
92
+ "targets": [
93
+ "svcc_IDM1"
94
+ ]
95
+ },
96
+ "SF1": {
97
+ "songs": [
98
+ "30005",
99
+ "30006",
100
+ "30009",
101
+ "30016",
102
+ "30022",
103
+ "30019"
104
+ ],
105
+ "targets": [
106
+ "svcc_IDF1"
107
+ ]
108
+ }
109
+ }
110
+ }
111
+ ]
112
+ },
113
+ "Source Singer Comparison": {
114
+ "users": ["advanced"],
115
+ "multi": [
116
+ "sourcesinger_id"
117
+ ],
118
+ "referenceMap": {
119
+ "m4singer_Alto-7": [
120
+ "Alto-7_ε―‚ε―žζ²™ζ΄²ε†·_0000",
121
+ "Alto-7_ε―‚ε―žζ²™ζ΄²ε†·_0011"
122
+ ],
123
+ "m4singer_Bass-1": [
124
+ "Bass-1_ε―‚ε―žζ²™ζ΄²ε†·_0002",
125
+ "Bass-1_ε―‚ε―žζ²™ζ΄²ε†·_0021"
126
+ ],
127
+ "m4singer_Tenor-6": [
128
+ "Tenor-6_ε―‚ε―žζ²™ζ΄²ε†·_0002",
129
+ "Tenor-6_ε―‚ε―žζ²™ζ΄²ε†·_0020"
130
+ ],
131
+ "m4singer_Tenor-7": [
132
+ "Tenor-7_ε―‚ε―žζ²™ζ΄²ε†·_0002",
133
+ "Tenor-7_ε―‚ε―žζ²™ζ΄²ε†·_0013",
134
+ "Tenor-7_ε―‚ε―žζ²™ζ΄²ε†·_0023"
135
+ ]
136
+ },
137
+ "indexMode": "number",
138
+ "data": [
139
+ {
140
+ "dataset": "M4Singer",
141
+ "basePath": "data/dc_dss",
142
+ "pathMap": {
143
+ "Alto-7": {
144
+ "songs": [
145
+ "ε―‚ε―žζ²™ζ΄²ε†·_0000",
146
+ "ε―‚ε―žζ²™ζ΄²ε†·_0011"
147
+ ],
148
+ "targets": [
149
+ "m4singer_Tenor-7",
150
+ "m4singer_Alto-7"
151
+ ]
152
+ },
153
+ "Bass-1": {
154
+ "songs": [
155
+ "ε―‚ε―žζ²™ζ΄²ε†·_0002",
156
+ "ε―‚ε―žζ²™ζ΄²ε†·_0021"
157
+ ],
158
+ "targets": [
159
+ "m4singer_Tenor-7",
160
+ "m4singer_Bass-1"
161
+ ]
162
+ },
163
+ "Tenor-6": {
164
+ "songs": [
165
+ "ε―‚ε―žζ²™ζ΄²ε†·_0002",
166
+ "ε―‚ε―žζ²™ζ΄²ε†·_0020"
167
+ ],
168
+ "targets": [
169
+ "m4singer_Tenor-7",
170
+ "m4singer_Tenor-6"
171
+ ]
172
+ },
173
+ "Tenor-7": {
174
+ "songs": [
175
+ "ε―‚ε―žζ²™ζ΄²ε†·_0002",
176
+ "ε―‚ε―žζ²™ζ΄²ε†·_0013"
177
+ ],
178
+ "targets": [
179
+ "m4singer_Alto-7",
180
+ "m4singer_Bass-1",
181
+ "m4singer_Tenor-6"
182
+ ]
183
+ }
184
+ }
185
+ }
186
+ ]
187
+ },
188
+ "Song Comparison": {
189
+ "users": ["advanced"],
190
+ "multi": [
191
+ "song_id"
192
+ ],
193
+ "referenceMap": {
194
+ "m4singer_Tenor-6": [
195
+ "Tenor-6_ε―‚ε―žζ²™ζ΄²ε†·_0002",
196
+ "Tenor-6_ε―‚ε―žζ²™ζ΄²ε†·_0020"
197
+ ],
198
+ "m4singer_Tenor-7": [
199
+ "Tenor-7_ε―‚ε―žζ²™ζ΄²ε†·_0002",
200
+ "Tenor-7_ε―‚ε―žζ²™ζ΄²ε†·_0013"
201
+ ]
202
+ },
203
+ "data": [
204
+ {
205
+ "dataset": "M4Singer",
206
+ "basePath": "data/dc_dss",
207
+ "pathMap": {
208
+ "Tenor-6": {
209
+ "songs": [
210
+ "ε―‚ε―žζ²™ζ΄²ε†·_0002",
211
+ "ε―‚ε―žζ²™ζ΄²ε†·_0020"
212
+ ],
213
+ "targets": [
214
+ "m4singer_Tenor-7",
215
+ "m4singer_Tenor-6"
216
+ ]
217
+ }
218
+ }
219
+ }
220
+ ]
221
+ },
222
+ "Target Singer Comparison": {
223
+ "users": ["advanced"],
224
+ "multi": [
225
+ "song_id",
226
+ "target_id"
227
+ ],
228
+ "referenceMap": {
229
+ "m4singer_Alto-7": [
230
+ "Alto-7_ε―‚ε―žζ²™ζ΄²ε†·_0000",
231
+ "Alto-7_ε―‚ε―žζ²™ζ΄²ε†·_0011"
232
+ ],
233
+ "m4singer_Bass-1": [
234
+ "Bass-1_ε―‚ε―žζ²™ζ΄²ε†·_0002",
235
+ "Bass-1_ε―‚ε―žζ²™ζ΄²ε†·_0021"
236
+ ],
237
+ "m4singer_Tenor-7": [
238
+ "Tenor-7_ε―‚ε―žζ²™ζ΄²ε†·_0002",
239
+ "Tenor-7_ε―‚ε―žζ²™ζ΄²ε†·_0013"
240
+ ],
241
+ "m4singer_Tenor-6": [
242
+ "Tenor-6_ε―‚ε―žζ²™ζ΄²ε†·_0002",
243
+ "Tenor-6_ε―‚ε―žζ²™ζ΄²ε†·_0020"
244
+ ]
245
+ },
246
+ "data": [
247
+ {
248
+ "dataset": "M4Singer",
249
+ "basePath": "data/dc_ssd",
250
+ "pathMap": {
251
+ "Tenor-6": {
252
+ "songs": [
253
+ "ε―‚ε―žζ²™ζ΄²ε†·_0002",
254
+ "ε―‚ε―žζ²™ζ΄²ε†·_0020"
255
+ ],
256
+ "targets": [
257
+ "m4singer_Alto-7",
258
+ "m4singer_Bass-1",
259
+ "m4singer_Tenor-7",
260
+ "m4singer_Tenor-6"
261
+ ]
262
+ }
263
+ }
264
+ }
265
+ ]
266
+ }
267
+ },
268
+ "mapToName": {
269
+ "SF1": "Singer 1",
270
+ "SM1": "Singer 2",
271
+ "CDF1": "Singer 3",
272
+ "CDM1": "Singer 4",
273
+ "IDF1": "Singer 5",
274
+ "IDM1": "Singer 6",
275
+ "svcc_CDF1": "Singer 3",
276
+ "svcc_CDM1": "Singer 4",
277
+ "svcc_IDF1": "Singer 5",
278
+ "svcc_IDM1": "Singer 6",
279
+ "Alto-1": "Singer 7",
280
+ "m4singer_Alto-1": "Singer 7",
281
+ "Alto-7": "Singer 8",
282
+ "m4singer_Alto-7": "Singer 8",
283
+ "Bass-1": "Singer 9",
284
+ "m4singer_Bass-1": "Singer 9",
285
+ "Soprano-2": "Singer 10",
286
+ "m4singer_Soprano-2": "Singer 10",
287
+ "Tenor-5": "Singer 11",
288
+ "m4singer_Tenor-5": "Singer 11",
289
+ "Tenor-6": "Singer 12",
290
+ "m4singer_Tenor-6": "Singer 12",
291
+ "Tenor-7": "Singer 13",
292
+ "m4singer_Tenor-7": "Singer 13",
293
+ "opencpop": "Singer 14"
294
+ },
295
+ "mapToSong": {
296
+ "30001": "Song 1",
297
+ "30002": "Song 2",
298
+ "30003": "Song 3",
299
+ "10001": "Song 4",
300
+ "10030": "Song 5",
301
+ "10120": "Song 6",
302
+ "10140": "Song 7",
303
+ "ηΎŽι”™_0014": "Song 8",
304
+ "十年_0008": "Song 9",
305
+ "εŒζ‘Œηš„δ½ _0018": "Song 10",
306
+ "ηˆ±η¬‘ηš„ηœΌη›_0010": "Song 11",
307
+ "ε―‚ε―žζ²™ζ΄²ε†·_0000": "Song 12",
308
+ "ε―‚ε―žζ²™ζ΄²ε†·_0002": "Song 12",
309
+ "ε―‚ε―žζ²™ζ΄²ε†·_0011": "Song 13",
310
+ "ε―‚ε―žζ²™ζ΄²ε†·_0013": "Song 13",
311
+ "ε―‚ε―žζ²™ζ΄²ε†·_0020": "Song 13",
312
+ "ε―‚ε―žζ²™ζ΄²ε†·_0021": "Song 14",
313
+ "30005": "Song 15",
314
+ "30006": "Song 16",
315
+ "30009": "Song 17",
316
+ "30016": "Song 18",
317
+ "30022": "Song 19",
318
+ "30019": "Song 20"
319
+ },
320
+ "mapToSpace": {
321
+ "encoded_step": "Step (Diffusion step)",
322
+ "noise_step_layer0": "Step + Noise (First layer)",
323
+ "noise_step_layer10": "Step + Noise (Middle layer)",
324
+ "noise_step_layer19": "Step + Noise (Last layer)",
325
+ "noise_step_condition_layer0": "Step + Noise + Condition (First layer)",
326
+ "noise_step_condition_layer10": "Step + Noise + Condition (Middle layer)",
327
+ "noise_step_condition_layer19": "Step + Noise + Condition (Last layer)"
328
+ },
329
+ "picTypes": [
330
+ "encoded_step",
331
+ "noise_step_layer0",
332
+ "noise_step_layer10",
333
+ "noise_step_layer19",
334
+ "noise_step_condition_layer0",
335
+ "noise_step_condition_layer10",
336
+ "noise_step_condition_layer19"
337
+ ],
338
+ "evaluation_data": [
339
+ {
340
+ "target": "svcc_IDM1",
341
+ "sourcesinger": "SM1",
342
+ "song": "30009",
343
+ "best": [
344
+ "MCD"
345
+ ]
346
+ },
347
+ {
348
+ "target": "svcc_IDF1",
349
+ "sourcesinger": "SF1",
350
+ "song": "30016",
351
+ "best": [
352
+ "F0CORR",
353
+ "FAD"
354
+ ]
355
+ },
356
+ {
357
+ "target": "svcc_IDF1",
358
+ "sourcesinger": "SF1",
359
+ "song": "30009",
360
+ "best": [
361
+ "F0RMSE",
362
+ "CER"
363
+ ]
364
+ },
365
+ {
366
+ "target": "svcc_IDF1",
367
+ "sourcesinger": "SF1",
368
+ "song": "30019",
369
+ "best": [
370
+ "Dembed"
371
+ ]
372
+ }
373
+ ],
374
+ "colorList": [
375
+ "#FFA500",
376
+ "#1C64F2",
377
+ "#7E3AF2",
378
+ "#9F580A"
379
+ ],
380
+ "histogramData": [
381
+ {
382
+ "type": "high",
383
+ "name": "F0CORR",
384
+ "value": 0.946698913
385
+ },
386
+ {
387
+ "type": "high",
388
+ "name": "Dembed",
389
+ "value": 0.688410708
390
+ },
391
+ {
392
+ "type": "low",
393
+ "name": "MCD",
394
+ "value": 11.44773471
395
+ },
396
+ {
397
+ "type": "low",
398
+ "name": "F0RMSE",
399
+ "value": 70.81400428
400
+ },
401
+ {
402
+ "type": "low",
403
+ "name": "FAD",
404
+ "value": 10.35121372
405
+ }
406
+ ]
407
+ }
visualization/SingVisio/webpage/img/difference_bar.jpg ADDED
visualization/SingVisio/webpage/img/syllable.png ADDED
visualization/SingVisio/webpage/index.html ADDED
@@ -0,0 +1,390 @@
1
+ <!--
2
+ Copyright (c) 2023 Amphion.
3
+ This source code is licensed under the MIT license found in the
4
+ LICENSE file in the root directory of this source tree.
5
+ -->
6
+
7
+ <!DOCTYPE html>
8
+ <html>
9
+
10
+ <head>
11
+ <meta charset="utf-8">
12
+ <meta name="viewport" content="width=1200px, initial-scale=1.0">
13
+ <title>SingVisio: Visual Analytics of Diffusion Model for Singing Voice Conversion</title>
14
+ <script>
15
+ const baseLink = 'https://dsvc.openmmlab.org.cn'; // do not end with '/'
16
+ </script>
17
+ <!-- Load Tailwind CSS and D3.js -->
18
+ <script src="./resources/tailwind.js"></script>
19
+ <script src="./resources/d3.v4.min.js"></script>
20
+ <script src="./resources/htl.min.js"></script>
21
+ <script src="./resources/d3-scale-chromatic.v1.min.js"></script>
22
+ <script src="./resources/d3-contour.v1.min.js"></script>
23
+ <!-- Load the Guide driver -->
24
+ <script src="./resources/driver.js.iife.min.js"></script>
25
+ <link rel="stylesheet" href="./resources/driver.min.css">
26
+ <!-- Config Tailwind CSS -->
27
+ <script type="module">
28
+ import cfg from "./tailwind.config.js";
29
+ tailwind.config = cfg;
30
+ </script>
31
+ <style type="text/tailwindcss">
32
+ @layer components {
33
+ .btn-small {
34
+ @apply px-3 py-2 text-xs font-medium text-center text-gray-900 focus:outline-none bg-white rounded-lg border border-gray-200 hover:bg-gray-100 hover:text-blue-700 focus:z-10 focus:ring-4 focus:ring-gray-200 dark:focus:ring-gray-700 dark:bg-gray-800 dark:text-gray-400 dark:border-gray-600 dark:hover:text-white dark:hover:bg-gray-700
35
+ }
36
+ .btn {
37
+ @apply text-white bg-blue-700 hover:bg-blue-800 focus:ring-4 focus:ring-blue-300 font-medium rounded-lg text-sm px-5 py-2.5 mr-2 mb-2 dark:bg-blue-600 dark:hover:bg-blue-700 focus:outline-none dark:focus:ring-blue-800;
38
+ }
39
+ .btn-sec {
40
+ @apply py-2.5 px-5 mr-2 mb-2 text-sm font-medium text-gray-900 focus:outline-none bg-white rounded-lg border border-gray-200 hover:bg-gray-100 hover:text-blue-700 focus:z-10 focus:ring-4 focus:ring-gray-200 dark:focus:ring-gray-700 dark:bg-gray-800 dark:text-gray-400 dark:border-gray-600 dark:hover:text-white dark:hover:bg-gray-700
41
+ }
42
+ .select-select {
43
+ @apply my-0 py-2 px-1 bg-gray-50 border border-gray-300 text-gray-900 text-xs rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full dark:bg-gray-700 dark:border-gray-600 dark:placeholder-gray-400 dark:text-white dark:focus:ring-blue-500 dark:focus:border-blue-500
44
+ }
45
+ .select-label {
46
+ @apply block text-sm font-medium text-gray-900 dark:text-white;
47
+ }
48
+ .card {
49
+ @apply p-6 bg-white border border-gray-200 rounded-lg dark:bg-gray-800 dark:border-gray-700
50
+ }
51
+ .card-title {
52
+ @apply mb-2 text-base font-bold tracking-tight text-gray-900 dark:text-white
53
+ }
54
+ .timeline-point {
55
+ @apply absolute w-3 h-3 bg-gray-200 rounded-full mt-1.5 -left-1.5 border border-white dark:border-gray-900 dark:bg-gray-700
56
+ }
57
+ .timeline-title {
58
+ @apply text-lg font-semibold text-gray-900 dark:text-white
59
+ }
60
+ .timeline-subtitle {
61
+ @apply text-base font-normal text-gray-500 dark:text-gray-400
62
+ }
63
+ .small-input {
64
+ @apply block w-full p-2 text-gray-900 border border-gray-300 rounded-lg bg-gray-50 sm:text-xs focus:ring-blue-500 focus:border-blue-500 dark:bg-gray-700 dark:border-gray-600 dark:placeholder-gray-400 dark:text-white dark:focus:ring-blue-500 dark:focus:border-blue-500;
65
+ }
66
+ .checkbox {
67
+ @apply w-4 h-4 text-blue-600 bg-gray-100 border-gray-300 rounded focus:ring-blue-500 dark:focus:ring-blue-600 dark:ring-offset-gray-800 focus:ring-2 dark:bg-gray-700 dark:border-gray-600
68
+ }
69
+ .dropdown_button_text {
70
+ @apply w-full text-xs font-normal text-gray-900 dark:text-white text-left
71
+ }
72
+ .dropdown_button {
73
+ @apply btn-sec text-xs flex items-center w-full px-2 py-2 my-0 disabled:cursor-not-allowed disabled:opacity-50
74
+ }
75
+ }
76
+ </style>
77
+ <style>
78
+ input.step-axis {
79
+ outline: none;
80
+ -webkit-appearance: none;
81
+ background: #0000002b;
82
+ height: 8px;
83
+ }
84
+
85
+ input.step-axis::-webkit-slider-thumb {
86
+ -webkit-appearance: none;
87
+ position: relative;
88
+ width: 18px;
89
+ height: 18px;
90
+ background: url("./img/syllable.png") no-repeat;
91
+ background-size: 18px;
92
+ border-radius: 50%;
93
+ cursor: pointer;
94
+ }
95
+
96
+ audio {
97
+ outline: none;
98
+ height: 34px;
99
+ }
100
+
101
+ /* make a input with two handles */
102
+ .inputs {
103
+ display: block;
104
+ width: 100%;
105
+ height: 10px;
106
+ /* background-color: azure; */
107
+ }
108
+
109
+ .inputs input {
110
+ position: absolute;
111
+ }
112
+
113
+ .inputs input::-webkit-slider-thumb {
114
+ pointer-events: all;
115
+ z-index: 2;
116
+ }
117
+
118
+ .inputs input::-webkit-slider-runnable-track {
119
+ pointer-events: none;
120
+ z-index: 1;
121
+ }
122
+ </style>
123
+ </head>
124
+
125
+ <body class="bg-gray-100 dark:bg-gray-900">
126
+ <div id="alert"
127
+ class="hidden fixed top-0 right-0 left-0 z-50 w-full h-[100vh] bg-black bg-opacity-50 justify-center items-center overflow-y-hidden">
128
+ <div class="card flex flex-col min-w-[400px] max-w-2xl max-h-[80vh] p-0 overflow-hidden">
129
+ <!-- Modal header -->
130
+ <div class="flex items-center justify-between p-4 md:p-5 border-b rounded-t dark:border-gray-600">
131
+ <h3 id="alert_title" class="text-xl font-semibold text-gray-900 dark:text-white">
132
+ Title
133
+ </h3>
134
+ <button id="close_alert" type="button"
135
+ class="text-gray-400 bg-transparent hover:bg-gray-200 hover:text-gray-900 rounded-lg text-sm w-8 h-8 ms-auto inline-flex justify-center items-center dark:hover:bg-gray-600 dark:hover:text-white">
136
+ <svg class="w-3 h-3" aria-hidden="true" xmlns="http://www.w3.org/2000/svg" fill="none"
137
+ viewBox="0 0 14 14">
138
+ <path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2"
139
+ d="m1 1 6 6m0 0 6 6M7 7l6-6M7 7l-6 6" />
140
+ </svg>
141
+ <span class="sr-only">Close modal</span>
142
+ </button>
143
+ </div>
144
+ <!-- Modal body -->
145
+ <div id="alert_text"
146
+ class="p-4 md:p-5 space-y-4 text-base leading-relaxed text-gray-700 dark:text-gray-200 overflow-y-auto">
147
+ Text
148
+ </div>
149
+ <!-- Modal footer -->
150
+ <div class="flex items-center p-4 md:p-5 border-t border-gray-200 rounded-b dark:border-gray-600">
151
+ <button id="finish_alert" type="button"
152
+ class="ml-auto text-white bg-blue-700 hover:bg-blue-800 focus:ring-4 focus:outline-none focus:ring-blue-300 font-medium rounded-lg text-sm px-5 py-2.5 text-center dark:bg-blue-600 dark:hover:bg-blue-700 dark:focus:ring-blue-800">OK</button>
153
+ </div>
154
+ </div>
155
+ </div>
156
+ <div class="bg-white dark:bg-gray-800 w-full py-4 px-6 border border-b border-gray-200 dark:border-gray-600">
157
+ <div class="mx-auto max-w-[1490px] grid grid-cols-6 align-center items-center">
158
+ <!-- <img class="dark:hidden" src="img/cuhksz_logo.png" alt="cuhksz logo" class="h-[40px]">
159
+ <img class="hidden dark:block" src="img/cuhksz_logo_white.png" alt="cuhksz logo" class="h-[40px]"> -->
160
+ <span class="col-span-1"></span>
161
+ <span id="title" class="col-span-4 mx-auto font-[800] text-[20px] dark:text-white">SingVisio: Visual
162
+ Analytics of Diffusion Model for Singing Voice Conversion</span>
163
+ <!-- <span class="ml-auto mr-0 text-sm dark:text-white">Team: <i>Human Language Technology Lab,
164
+ CUHK-Shenzhen</i></span> -->
165
+ <div class="flex">
166
+ <button class="btn-small ml-auto" id="mode_change">Switch to _</button>
167
+ <button class="btn-small ml-2" id="help">Help?</button>
168
+ </div>
169
+ </div>
170
+ </div>
171
+ <div class="max-w-[1500px] m-auto">
172
+ <div class="flex flex-row items-start gap-0.5 py-3 p-1">
173
+ <div class="w-[300px] flex flex-col flex-none">
174
+ <div id="performance" class="card p-2 mb-2 flex flex-col flex-none relative">
175
+ <button class="absolute right-1 top-1 btn-small px-1.5 py-0.5 ml-auto rounded-full"
176
+ id="metrics_help">?</button>
177
+ <div class="flex flex-row">
178
+ <div id="histogram" class="flex-none"></div>
179
+ <div id="histogram2" class="flex-none"></div>
180
+ </div>
181
+ <span class="text-[12px] mx-auto dark:text-white">Metrics</span>
182
+ </div>
183
+
184
+ <div id="touch_map" class="card p-2 relative">
185
+ <button class="absolute right-1 top-1 btn-small px-1.5 py-0.5 ml-auto rounded-full"
186
+ id="projection_help">?</button>
187
+ <div class="flex mb-1 align-center items-center space-between dark:text-white">
188
+ <div class="ml-1 text-sm">Step: <span id="current_step_display_number"></span></div>
189
+ <div class="ml-auto flex mr-2">
190
+ <button class="btn-sec h-9 w-9 p-2.5 mb-0" id="reset_map">
191
+ <svg class="w-3.5 h-3.5" aria-hidden="true" xmlns="http://www.w3.org/2000/svg"
192
+ fill="none" viewBox="0 0 18 20">
193
+ <path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round"
194
+ stroke-width="2"
195
+ d="M16 1v5h-5M2 19v-5h5m10-4a8 8 0 0 1-14.947 3.97M1 10a8 8 0 0 1 14.947-3.97" />
196
+ </svg>
197
+ </button>
198
+ </div>
199
+ </div>
200
+ <div id="dataviz_axisZoom" class="flex flex-wrap border bg-white dark:bg-gray-800 relative"></div>
201
+ </div>
202
+ </div>
203
+ <div class="w-full">
204
+ <div id="step_preview" class="flex min-w-[500px] w-full bg-white dark:bg-gray-800 p-2 card mb-2">
205
+ <div class="mx-auto" id="preview_container">
206
+ </div>
207
+ <div class="mx-auto" id="preview_container2">
208
+ </div>
209
+ <div class="flex flex-col">
210
+ <button class="btn-sec" id="refreshpreview">
211
+ <svg class="w-4 h-4" aria-hidden="true" xmlns="http://www.w3.org/2000/svg" fill="none"
212
+ viewBox="0 0 18 20">
213
+ <path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round"
214
+ stroke-width="2"
215
+ d="M16 1v5h-5M2 19v-5h5m10-4a8 8 0 0 1-14.947 3.97M1 10a8 8 0 0 1 14.947-3.97" />
216
+ </svg>
217
+ </button>
218
+ </div>
219
+ </div>
220
+ <div id="mel_card_container" class="grid grid-cols-3 min-w-[915px] w-full gap-1 justify-items-center">
221
+ </div>
222
+ <div id="tips">
223
+ </div>
224
+ <div id="tooltip" role="tooltip"
225
+ class="invisible absolute z-10 inline-block px-3 py-2 text-sm font-medium text-white bg-gray-900 rounded-lg shadow-sm opacity-[0.9] dark:bg-gray-700">
226
+ Tooltip content
227
+ </div>
228
+ </div>
229
+ <div class="shrink-0 w-[180px]">
230
+
231
+ <div class="card py-2 px-3 relative">
232
+ <button class="absolute right-1 top-1 btn-small px-1.5 py-0.5 ml-auto rounded-full"
233
+ id="control_help">?</button>
234
+ <div class="flex items-center">
235
+ <h5 class="card-title my-1 text-lg">Control Panel</h5>
236
+ </div>
237
+ <div class="flex flex-col w-full rounded-lg gap-0.5" id="control_panel">
238
+ <div>
239
+ <label for="mode_id" class="select-label">Display Mode</label>
240
+ <select id="mode_id" class="select-select"></select>
241
+ </div>
242
+ <div>
243
+ <label for="sourcesinger_id" class="select-label">Source Singer</label>
244
+ <button id="sourcesinger_id" class="dropdown_button" type="button">
245
+ <span class="dropdown_button_text" id="sourcesinger_id_text">Choose Singer</span> <svg
246
+ class="w-2.5 h-2.5" aria-hidden="true" xmlns="http://www.w3.org/2000/svg"
247
+ fill="none" viewBox="0 0 10 6">
248
+ <path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round"
249
+ stroke-width="2" d="m1 1 4 4 4-4" />
250
+ </svg>
251
+ </button>
252
+ <!-- Dropdown menu -->
253
+ <div id="sourcesinger_id_dropdown"
254
+ class="absolute z-10 hidden bg-white divide-y divide-gray-100 rounded-lg shadow w-44 dark:bg-gray-700">
255
+ <ul class="py-2 text-sm text-gray-700 dark:text-gray-200">
256
+ </ul>
257
+ </div>
258
+ </div>
259
+ <div>
260
+ <label for="song_id" class="select-label">Song</label>
261
+ <button id="song_id" class="dropdown_button" type="button">
262
+ <span class="dropdown_button_text" id="song_id_text">Choose Song</span> <svg
263
+ class="w-2.5 h-2.5" aria-hidden="true" xmlns="http://www.w3.org/2000/svg"
264
+ fill="none" viewBox="0 0 10 6">
265
+ <path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round"
266
+ stroke-width="2" d="m1 1 4 4 4-4" />
267
+ </svg>
268
+ </button>
269
+ <!-- Dropdown menu -->
270
+ <div id="song_id_dropdown"
271
+ class="absolute z-10 hidden bg-white divide-y divide-gray-100 rounded-lg shadow w-44 dark:bg-gray-700">
272
+ <ul class="py-2 text-sm text-gray-700 dark:text-gray-200">
273
+ </ul>
274
+ </div>
275
+ </div>
276
+ <div>
277
+ <label for="target_id" class="select-label">Target Singer</label>
278
+ <button id="target_id" class="dropdown_button" type="button">
279
+ <span class="dropdown_button_text" id="target_id_text">Target Singer</span> <svg
280
+ class="w-2.5 h-2.5" aria-hidden="true" xmlns="http://www.w3.org/2000/svg"
281
+ fill="none" viewBox="0 0 10 6">
282
+ <path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round"
283
+ stroke-width="2" d="m1 1 4 4 4-4" />
284
+ </svg>
285
+ </button>
286
+ <!-- Dropdown menu -->
287
+ <div id="target_id_dropdown"
288
+ class="absolute z-10 hidden bg-white divide-y divide-gray-100 rounded-lg shadow w-44 dark:bg-gray-700">
289
+ <ul class="py-2 text-sm text-gray-700 dark:text-gray-200">
290
+ </ul>
291
+ </div>
292
+ </div>
293
+ <div class="relative">
294
+ <label for="pic_id" class="select-label">Projection Embedding</label>
295
+ <select id="pic_id" class="select-select"></select>
296
+ </div>
297
+ <div class="relative" id="components">
298
+ <label for="components" class="select-label">Components</label>
299
+ <div class="flex flex-col gap-0.5">
300
+ <div class="flex items-center">
301
+ <input id="components_pitch" type="checkbox" checked class="checkbox">
302
+ <label for="components_pitch"
303
+ class="ml-1 text-[0.775rem] font-normal text-gray-900 dark:text-gray-300">F0
304
+ contour</label>
305
+ </div>
306
+ <div class="flex items-start">
307
+ <input id="components_frequncy" type="checkbox" checked class="checkbox">
308
+ <div class="flex flex-col gap-0.5 grow">
309
+ <label for="components_frequncy"
310
+ class="ml-1 mb-1 text-[0.775rem] font-normal text-gray-900 dark:text-gray-300">Frequency</label>
311
+ <div class="flex inputs w-full">
312
+ <input id="inputs_min" type="range"
313
+ class="h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer dark:bg-gray-700"
314
+ value="0" min="0">
315
+ <input id="inputs_max" type="range"
316
+ class="h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer dark:bg-gray-700"
317
+ value="100" max="100">
318
+ </div>
319
+ <div class="flex w-full">
320
+ <span id="inputs_left" class="ml-1 mr-auto text-[0.7rem] font-normal text-gray-900 dark:text-white">0</span>
321
+ <span id="inputs_right" class="mr-2 ml-auto text-[0.7rem] font-normal text-gray-900 dark:text-white">100</span>
322
+ </div>
323
+ </div>
324
+ </div>
325
+
326
+ <div class="flex items-start">
327
+ <input id="sampling_steps" type="checkbox" class="checkbox">
328
+ <div class="flex flex-col grow">
329
+ <label for="sampling_steps"
330
+ class="ml-1 text-[0.775rem] font-normal text-gray-900 dark:text-gray-300">Sampling
331
+ steps</label>
332
+ <div class="flex flex-row h-[32px]">
333
+ <span class="my-auto mx-1 text-[0.775rem] font-normal text-gray-900 dark:text-white">Step count:</span>
334
+ <input type="text"
335
+ class="small-input flex-none w-[50px] text-center bg-white dark:bg-gray-800"
336
+ id="sampling_num" value="100">
337
+ </div>
338
+
339
+ </div>
340
+ </div>
341
+ </div>
342
+ </div>
343
+
344
+
345
+ <div id="step_axis">
346
+ <label for="range" class="select-label">Step Axis</label>
347
+
348
+ <div
349
+ class="items-center w-full rounded-lg bg-gray-50 flex flex-row gap-2 px-2 py-0.5 border border-gray-300 dark:border-gray-600 dark:text-white dark:bg-gray-700">
350
+
351
+ <input class="step-axis my-2 w-full" id="range" type="range" min="0" max="999" value="0"
352
+ step="1">
353
+ <button class="btn-small" id="controls">
354
+ <svg id="icon_play" style="display: none" class="w-3 h-3" aria-hidden="true"
355
+ xmlns="http://www.w3.org/2000/svg" fill="currentColor" viewBox="0 0 14 16">
356
+ <path
357
+ d="M0 .984v14.032a1 1 0 0 0 1.506.845l12.006-7.016a.974.974 0 0 0 0-1.69L1.506.139A1 1 0 0 0 0 .984Z" />
358
+ </svg>
359
+ <svg id="icon_stop" class="w-3 h-3" aria-hidden="true"
360
+ xmlns="http://www.w3.org/2000/svg" fill="currentColor" viewBox="0 0 12 16">
361
+ <path
362
+ d="M3 0H2a2 2 0 0 0-2 2v12a2 2 0 0 0 2 2h1a2 2 0 0 0 2-2V2a2 2 0 0 0-2-2Zm7 0H9a2 2 0 0 0-2 2v12a2 2 0 0 0 2 2h1a2 2 0 0 0 2-2V2a2 2 0 0 0-2-2Z" />
363
+ </svg>
364
+ </button>
365
+
366
+ </div>
367
+ <div class="flex gap-1 mt-2">
368
+ <span class="my-auto mr-1 text-sm font-medium text-gray-900 dark:text-white">Step:</span>
369
+ <input type="text"
370
+ class="small-input flex-none w-[60px] text-center bg-white dark:bg-gray-800" id="value">
371
+ <button class="btn-small" id="add_preview">
372
+ Pin
373
+ </button>
374
+ </div>
375
+ </div>
376
+
377
+ </div>
378
+ </div>
379
+ </div>
380
+ </div>
381
+
382
+ </div>
383
+
384
+ <script src="./resources/init.js"></script>
385
+ <script src="./resources/function.js"></script>
386
+ <script src="./resources/event.js"></script>
387
+ <script>
388
+ initConfig('./config/default.json')
389
+ </script>
390
+ </body>