Spaces:

softcatala
/

comparativa-tts-catala-espeak-upstream

Build error

App Files Files Community

ccoreilly committed on Apr 30, 2023

Commit

d2a1ee2

•

1 Parent(s): 5a22996

Import inicial

Browse files

Files changed (17) hide show

Dockerfile +38 -0
README.md +3 -3
app.py +131 -0
engine.py +144 -0
models/bsc/best_model.pth +3 -0
models/bsc/config.json +262 -0
models/bsc/speaker_map.json +10 -0
models/bsc/speakers.pth +3 -0
models/collectivat/catotron-ona-TTS-API-entry.json +10 -0
models/collectivat/fast-speech_best_model.pth +3 -0
models/collectivat/fast-speech_config.json +213 -0
models/collectivat/ljspeech--hifigan_v2_config.json +158 -0
models/collectivat/ljspeech--hifigan_v2_model_file.pth +3 -0
models/piper/MODEL_CARD +15 -0
models/piper/ca-upc_ona-x-low.onnx +3 -0
models/piper/ca-upc_ona-x-low.onnx.json +409 -0
requirements.txt +4 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,38 @@

+FROM python:3.9
+# Toolchain and libraries needed to build espeak-ng from source.
+RUN apt update && apt install -y git make autoconf automake libtool pkg-config gcc libsonic-dev ronn kramdown libpcaudio-dev
+# Build and install the projecte-aina fork of espeak-ng under /usr.
+RUN git clone https://github.com/projecte-aina/espeak-ng
+RUN cd espeak-ng && \
+ ./autogen.sh && \
+ ./configure --prefix=/usr && \
+ make && \
+ make install
+# Run the application as an unprivileged user with uid 1000.
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+	PATH=/home/user/.local/bin:$PATH
+# Set the working directory to the "app" directory inside the user's home
+WORKDIR $HOME/app
+COPY --chown=user requirements.txt .
+COPY --chown=user models models
+RUN pip install -r requirements.txt
+COPY --chown=user engine.py .
+COPY --chown=user app.py .
+# Create the cache directory at the exact path the two ENV lines below point to.
+# (A relative "cache" would land in $HOME/app/cache, which nothing references.)
+RUN mkdir -p $HOME/cache && chmod 777 $HOME/cache
+ENV NUMBA_CACHE_DIR=/home/user/cache
+ENV MPLCONFIGDIR=/home/user/cache
+# Port passed to iface.launch() in app.py.
+EXPOSE 7860
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
-title: Comparativa Tts Catala Espeak Upstream
-emoji: 🌍
 colorFrom: green
-colorTo: blue
 sdk: docker
 pinned: false
 ---

 ---
+title: Síntesi en català
+emoji: 👁
 colorFrom: green
+colorTo: red
 sdk: docker
 pinned: false
 ---

app.py ADDED Viewed

	@@ -0,0 +1,131 @@

+from engine import Piper
+import tempfile
+from typing import Optional
+from TTS.config import load_config
+import gradio as gr
+import numpy as np
+import os
+import json
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+from espeak_phonemizer import Phonemizer
+# Maximum number of input characters; tts() truncates longer text to this length.
+MAX_TXT_LEN = 325
+# Initial speaker list. NOTE(review): SPEAKERS is reassigned further down from the
+# loaded BSC model before it is used, so these literals look unused — confirm.
+SPEAKERS = ['f_cen_05', 'f_cen_81', 'f_occ_31', 'f_occ_de', 'f_sep_31', 'm_cen_08', 'm_occ_44', 'm_val_89']
+# espeak phonemizer for voice "ca"; tts() uses it to build the phoneme string it returns.
+fonemitzador = Phonemizer("ca")
+def carrega_bsc():
+    model_path = os.getcwd() + "/models/bsc/best_model.pth"
+    config_path = os.getcwd() + "/models/bsc/config.json"
+    speakers_file_path = os.getcwd() + "/models/bsc/speakers.pth"
+    vocoder_path = None
+    vocoder_config_path = None
+    synthesizer = Synthesizer(
+        model_path, config_path, speakers_file_path, None, vocoder_path, vocoder_config_path,
+    )
+    return synthesizer
+def carrega_collectivat():
+    model_path = os.getcwd() + "/models/collectivat/fast-speech_best_model.pth"
+    config_path = os.getcwd() + "/models/collectivat/fast-speech_config.json"
+    vocoder_path = os.getcwd() + "/models/collectivat/ljspeech--hifigan_v2_model_file.pth"
+    vocoder_config_path = os.getcwd() + "/models/collectivat/ljspeech--hifigan_v2_config.json"
+    synthesizer = Synthesizer(
+        model_path, config_path, None, None, vocoder_path, vocoder_config_path
+    )
+    return synthesizer
+def carrega_piper():
+    return Piper(os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx")
+# Load the three engines once at import time; tts() reuses these module-level objects.
+model_bsc = carrega_bsc()
+# Replace the hard-coded speaker list with the names reported by the loaded BSC model.
+SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names
+model_collectivat = carrega_collectivat()
+model_piper = carrega_piper()
+def tts(text, speaker_idx):
+    if len(text) > MAX_TXT_LEN:
+        text = text[:MAX_TXT_LEN]
+        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
+    print(text)
+    # synthesize
+    wav_bsc = model_bsc.tts(text, speaker_idx)
+    wav_coll = model_collectivat.tts(text)
+    wav_piper = model_piper.synthesize(text)
+    #return (model_bsc.tts_config.audio["sample_rate"], wav_bsc), (22000, wav_coll), (16000, wav_piper)
+    # return output
+    fp_bsc = ""
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        model_bsc.save_wav(wav_bsc, fp)
+        fp_bsc = fp.name
+    fp_coll = ""
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        model_collectivat.save_wav(wav_coll, fp)
+        fp_coll = fp.name
+    fp_piper = ""
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        fp.write(wav_piper)
+        fp_piper = fp.name
+    fonemes = fonemitzador.phonemize(text, keep_clause_breakers=True)
+    return fonemes, fp_bsc, fp_coll, fp_piper
+# User-facing introduction (in Catalan) passed to gr.Interface as ``description``;
+# it lists the three compared models and links to their sources.
+description="""
+Amb aquesta aplicació podeu sintetitzar text a veu amb els últims models neuronals lliures pel català.
+1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina)
+    https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker
+2. Model Fastspeech entrenat per Col·lectivat
+    https://github.com/CollectivaT-dev/TTS-API
+3. Model VITS entrenat per Piper/Home Assistant
+    https://github.com/rhasspy/piper
+Els dós últims models han estat entrenats amb la veu d'Ona de FestCAT. El primer model ha estat entrenat amb totes les veus de FestCAT, els talls de Common Voice 8 i un altre corpus pel que conté moltes veus de qualitat variable. La veu d'Ona està seleccionada per defecte per la comparativa però podeu provar les altres.
+Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
+https://github.com/projecte-aina/espeak-ng
+NOTA: El model de col·lectivat treballa amb grafemes pel que no fa servir espeak com a fonemitzador.
+"""
+# Empty footer text passed to gr.Interface as ``article``.
+article= ""
+# Gradio UI: a text box and a speaker dropdown feed tts(); its four return values
+# map, in order, onto the Markdown phoneme view and the three audio players.
+iface = gr.Interface(
+    fn=tts,
+    inputs=[
+        gr.Textbox(
+            label="Text",
+            value="L'Èlia i l'Alí a l'aula.  L'oli i l'ou.  Lulú olorava la lila.",
+        ),
+        # NOTE(review): the default "ona" is presumably one of the names in SPEAKERS — confirm.
+        gr.Dropdown(label="Selecciona un parlant pel model VITS multi-parlant del BSC", choices=SPEAKERS, value="ona")
+    ],
+    outputs=[
+        gr.Markdown(label="Fonemes"),
+        # type="filepath": tts() returns paths to WAV files, not raw audio arrays.
+        gr.Audio(label="BSC VITS",type="filepath"),
+        gr.Audio(label="Collectivat Fastspeech",type="filepath"),
+        gr.Audio(label="Piper VITS",type="filepath")
+    ],
+    title="Comparativa de síntesi lliure en català️",
+    description=description,
+    article=article,
+    allow_flagging="never",
+    layout="vertical",
+    live=False
+)
+# Listen on all interfaces, on the port the Dockerfile exposes.
+iface.launch(server_name="0.0.0.0", server_port=7860)

engine.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import io
+import json
+import os
+import wave
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Mapping, Optional, Sequence, Union
+import numpy as np
+import onnxruntime
+from espeak_phonemizer import Phonemizer
+# Special symbols looked up in the voice's phoneme_id_map by Piper.synthesize:
+# _BOS is prepended to the phoneme sequence, _PAD ids are appended after every
+# phoneme, and _EOS ids close the sequence.
+_BOS = "^"
+_EOS = "$"
+_PAD = "_"
+@dataclass
+class PiperConfig:
+    """Settings for one Piper voice, as built by ``load_config`` from the voice's JSON file."""
+    # Read from the top-level "num_symbols" key.
+    num_symbols: int
+    # Read from "num_speakers"; Piper.synthesize compares it with 1 when choosing a speaker id.
+    num_speakers: int
+    # Read from "audio" -> "sample_rate"; used as the frame rate of the WAV Piper.synthesize writes.
+    sample_rate: int
+    # Read from "espeak" -> "voice"; passed to Phonemizer in Piper.__init__.
+    espeak_voice: str
+    # Default inference scales taken from the optional "inference" section
+    # (load_config falls back to 1.0 / 0.667 / 0.8 respectively when a key is absent).
+    # Piper.synthesize uses them whenever the caller passes None for the matching argument.
+    length_scale: float
+    noise_scale: float
+    noise_w: float
+    # Maps each phoneme symbol to the sequence of model input ids that represents it.
+    phoneme_id_map: Mapping[str, Sequence[int]]
+class Piper:
+    def __init__(
+        self,
+        model_path: Union[str, Path],
+        config_path: Optional[Union[str, Path]] = None,
+        use_cuda: bool = False,
+    ):
+        if config_path is None:
+            config_path = f"{model_path}.json"
+        self.config = load_config(config_path)
+        self.phonemizer = Phonemizer(self.config.espeak_voice)
+        self.onnx_options = onnxruntime.SessionOptions()
+        self.onnx_options.intra_op_num_threads = os.cpu_count() - 1
+        self.model = onnxruntime.InferenceSession(
+            str(model_path),
+            sess_options=self.onnx_options,
+            providers=["CPUExecutionProvider"]
+            if not use_cuda
+            else ["CUDAExecutionProvider"],
+        )
+    def synthesize(
+        self,
+        text: str,
+        speaker_id: Optional[int] = None,
+        length_scale: Optional[float] = None,
+        noise_scale: Optional[float] = None,
+        noise_w: Optional[float] = None,
+    ) -> bytes:
+        """Synthesize WAV audio from text."""
+        if length_scale is None:
+            length_scale = self.config.length_scale
+        if noise_scale is None:
+            noise_scale = self.config.noise_scale
+        if noise_w is None:
+            noise_w = self.config.noise_w
+        phonemes_str = self.phonemizer.phonemize(text, keep_clause_breakers=True)
+        phonemes = [_BOS] + list(phonemes_str)
+        phoneme_ids: List[int] = []
+        for phoneme in phonemes:
+            phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
+            phoneme_ids.extend(self.config.phoneme_id_map[_PAD])
+        phoneme_ids.extend(self.config.phoneme_id_map[_EOS])
+        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
+        scales = np.array(
+            [noise_scale, length_scale, noise_w],
+            dtype=np.float32,
+        )
+        if (self.config.num_speakers > 1) and (speaker_id is not None):
+            # Default speaker
+            speaker_id = 0
+        sid = None
+        if speaker_id is not None:
+            sid = np.array([speaker_id], dtype=np.int64)
+        # Synthesize through Onnx
+        audio = self.model.run(
+            None,
+            {
+                "input": phoneme_ids_array,
+                "input_lengths": phoneme_ids_lengths,
+                "scales": scales,
+                "sid": sid,
+            },
+        )[0].squeeze((0, 1))
+        audio = audio_float_to_int16(audio.squeeze())
+        # Convert to WAV
+        with io.BytesIO() as wav_io:
+            wav_file: wave.Wave_write = wave.open(wav_io, "wb")
+            with wav_file:
+                wav_file.setframerate(self.config.sample_rate)
+                wav_file.setsampwidth(2)
+                wav_file.setnchannels(1)
+                wav_file.writeframes(audio.tobytes())
+            return wav_io.getvalue()
+def load_config(config_path: Union[str, Path]) -> PiperConfig:
+    with open(config_path, "r", encoding="utf-8") as config_file:
+        config_dict = json.load(config_file)
+        inference = config_dict.get("inference", {})
+        return PiperConfig(
+            num_symbols=config_dict["num_symbols"],
+            num_speakers=config_dict["num_speakers"],
+            sample_rate=config_dict["audio"]["sample_rate"],
+            espeak_voice=config_dict["espeak"]["voice"],
+            noise_scale=inference.get("noise_scale", 0.667),
+            length_scale=inference.get("length_scale", 1.0),
+            noise_w=inference.get("noise_w", 0.8),
+            phoneme_id_map=config_dict["phoneme_id_map"],
+        )
+def audio_float_to_int16(
+    audio: np.ndarray, max_wav_value: float = 32767.0
+) -> np.ndarray:
+    """Normalize audio and convert to int16 range"""
+    audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
+    audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
+    audio_norm = audio_norm.astype("int16")
+    return audio_norm

models/bsc/best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b15fa7d2052bada1cf421e49d2d03b00e95b49fcd0e42b7af1d92da2880cdecc
+size 1038659133

models/bsc/config.json ADDED Viewed

	@@ -0,0 +1,262 @@

+{
+    "output_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/experiments_from_previous",
+    "logger_uri": null,
+    "run_name": "multispeaker_vits_ca_1e4_1e4_32",
+    "project_name": null,
+    "run_description": "\ud83d\udc38Coqui trainer run.",
+    "print_step": 25,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "tensorboard",
+    "log_model_step": 1000,
+    "save_step": 1000,
+    "save_n_checkpoints": 5,
+    "save_checkpoints": true,
+    "save_all_best": true,
+    "save_best_after": 10000,
+    "target_loss": null,
+    "print_eval": true,
+    "test_delay_epochs": -1,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": false,
+    "epochs": 1000,
+    "batch_size": 16,
+    "eval_batch_size": 8,
+    "grad_clip": [
+        1000.0,
+        1000.0
+    ],
+    "scheduler_after_epoch": true,
+    "lr": 0.001,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": "",
+    "lr_scheduler_params": null,
+    "use_grad_scaler": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "vits",
+    "num_loader_workers": 4,
+    "num_eval_loader_workers": 4,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "sample_rate": 22050,
+        "win_length": 1024,
+        "hop_length": 256,
+        "num_mels": 80,
+        "mel_fmin": 0,
+        "mel_fmax": null
+    },
+    "use_phonemes": true,
+    "phonemizer": "espeak",
+    "phoneme_language": "ca",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/phoneme_cache",
+    "characters": {
+        "characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
+        "vocab_dict": null,
+        "pad": "<PAD>",
+        "eos": "<EOS>",
+        "bos": "<BOS>",
+        "blank": "<BLNK>",
+        "characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
+        "punctuations": "!'(),-.:;? ",
+        "phonemes": null,
+        "is_unique": false,
+        "is_sorted": true
+    },
+    "add_blank": true,
+    "batch_group_size": 5,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": 325,
+    "compute_f0": false,
+    "compute_linear_spec": true,
+    "precompute_num_workers": 0,
+    "start_by_longest": false,
+    "datasets": [
+        {
+            "formatter": "vctk_old",
+            "dataset_name": "vctk_old",
+            "path": "/gpfs/scratch/bsc88/bsc88474/data/multispeaker_ca",
+            "meta_file_train": "",
+            "ignored_speakers": [
+                "uri",
+                "09796",
+                "05450"
+            ],
+            "language": "ca",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [
+        [
+            "Per exemple, dels nostres bancs que inverteixen en armament de les nostres empreses."
+        ],
+        [
+            "Preguntin-se si aix\u00f2 era necessari."
+        ],
+        [
+            "La suposada ocultaci\u00f3 dels informes que advertien de risc s\u00edsmic."
+        ],
+        [
+            "\u00c9s de 633 milions d'euros quan es far\u00e0 la publicaci\u00f3 detallada."
+        ]
+    ],
+    "eval_split_max_size": null,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "model_args": {
+        "num_chars": 131,
+        "out_channels": 513,
+        "spec_segment_size": 32,
+        "hidden_channels": 192,
+        "hidden_channels_ffn_text_encoder": 768,
+        "num_heads_text_encoder": 2,
+        "num_layers_text_encoder": 6,
+        "kernel_size_text_encoder": 3,
+        "dropout_p_text_encoder": 0.1,
+        "dropout_p_duration_predictor": 0.5,
+        "kernel_size_posterior_encoder": 5,
+        "dilation_rate_posterior_encoder": 1,
+        "num_layers_posterior_encoder": 16,
+        "kernel_size_flow": 5,
+        "dilation_rate_flow": 1,
+        "num_layers_flow": 4,
+        "resblock_type_decoder": "1",
+        "resblock_kernel_sizes_decoder": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes_decoder": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates_decoder": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel_decoder": 512,
+        "upsample_kernel_sizes_decoder": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "periods_multi_period_discriminator": [
+            2,
+            3,
+            5,
+            7,
+            11
+        ],
+        "use_sdp": true,
+        "noise_scale": 1.0,
+        "inference_noise_scale": 0.667,
+        "length_scale": 1.0,
+        "noise_scale_dp": 1.0,
+        "inference_noise_scale_dp": 1.0,
+        "max_inference_len": null,
+        "init_discriminator": true,
+        "use_spectral_norm_disriminator": false,
+        "use_speaker_embedding": true,
+        "num_speakers": 257,
+        "speakers_file": "/home/user/app/models/bsc/speakers.pth",
+        "d_vector_file": null,
+        "speaker_embedding_channels": 256,
+        "use_d_vector_file": false,
+        "d_vector_dim": 0,
+        "detach_dp_input": true,
+        "use_language_embedding": false,
+        "embedded_language_dim": 4,
+        "num_languages": 0,
+        "language_ids_file": null,
+        "use_speaker_encoder_as_loss": false,
+        "speaker_encoder_config_path": "",
+        "speaker_encoder_model_path": "",
+        "condition_dp_on_speaker": true,
+        "freeze_encoder": false,
+        "freeze_DP": false,
+        "freeze_PE": false,
+        "freeze_flow_decoder": false,
+        "freeze_waveform_decoder": false,
+        "encoder_sample_rate": null,
+        "interpolate_z": true,
+        "reinit_DP": false,
+        "reinit_text_encoder": false
+    },
+    "lr_gen": 0.0001,
+    "lr_disc": 0.0001,
+    "lr_scheduler_gen": "ExponentialLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "lr_scheduler_disc": "ExponentialLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "kl_loss_alpha": 1.0,
+    "disc_loss_alpha": 1.0,
+    "gen_loss_alpha": 1.0,
+    "feat_loss_alpha": 1.0,
+    "mel_loss_alpha": 45.0,
+    "dur_loss_alpha": 1.0,
+    "speaker_encoder_loss_alpha": 1.0,
+    "return_wav": true,
+    "use_weighted_sampler": false,
+    "weighted_sampler_attrs": null,
+    "weighted_sampler_multipliers": null,
+    "r": 1,
+    "num_speakers": 257,
+    "use_speaker_embedding": true,
+    "speakers_file": "/home/user/app/models/bsc/speakers.pth",
+    "speaker_embedding_channels": 256,
+    "language_ids_file": null,
+    "use_language_embedding": false,
+    "use_d_vector_file": false,
+    "d_vector_file": null,
+    "d_vector_dim": 0
+}

models/bsc/speaker_map.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "f_cen_05": "05739",
+    "f_cen_81": "8162d651b6211f06f655a69cd7fdd383d6b4287e9ba132b9898ef9ac8687349e777626333d23bed93f9264aae965efb14ed650cb64fd0ad90494aff903eaef11",
+    "f_occ_31": "31535cb2ece4710d08fdbeefb6f8f75ed093fee4cf8573bd601d960f8c6156f0fd0a85712761691e86e31160b993ee0eacb10c4c8aed000cc394cf7c7d207a7e",
+    "f_occ_de": "dee065b956b99b10db4763759d64c41791af1a7e77f1864f90a2b0847a12633dcf9bc108db7eaf73cc8d0e750f5c37383a56cd77cc2276d3960104c6bebe6346",
+    "f_sep_31": "31e6f3a011661320b2e59b6f8be43f6db2243e9feabc2b9787c1413788e13eb0e5810bed983bf7ff66e46417d183a91ed50b3b9be9d89e4f51aada72293b9881",
+    "m_cen_08": "08935",
+    "m_occ_44": "30b1f81c579755895581259d79a8a5a3ca45b908b0bd14ad1c6418f39aa1e2f47cb4749c69b5440cdb92e3bafb772e19e7bc2b16d196b061addd173a1309e491",
+    "m_val_89": "896256329fbeb5b8116349c31d8a39a7d36d5f970d48558e1db5417d611e240e4dbf473f6e49137f7aa6116394b7deabb0bbec4a014896cdc9484ee91458117d"
+}

models/bsc/speakers.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6dacda0b8dd3e111c5072f8f33c08b4a29b92ac79aaf22ceca912d01e7deb905
+size 30191

models/collectivat/catotron-ona-TTS-API-entry.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "voice": "ona-fast-hifigan",
+    "lang": "ca",
+    "model_type": "coqui",
+    "tts_config_path": "fast-speech_config.json",
+    "tts_model_path": "fast-speech_best_model.pth",
+    "vocoder_config_path": "ljspeech--hifigan_v2_config.json",
+    "vocoder_model_path": "ljspeech--hifigan_v2_model_file.pth",
+    "load": true
+}

models/collectivat/fast-speech_best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a5aefb9f49f6172e34b816e1de8f5234012f0a9a05747973f6610e40869983f
+size 457921637

models/collectivat/fast-speech_config.json ADDED Viewed

	@@ -0,0 +1,213 @@

+{
+    "output_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron",
+    "logger_uri": null,
+    "run_name": "fast_pitch_ljspeech",
+    "project_name": null,
+    "run_description": "\ud83d\udc38Coqui trainer run.",
+    "print_step": 50,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "tensorboard",
+    "log_model_step": null,
+    "save_step": 10000,
+    "save_n_checkpoints": 5,
+    "save_checkpoints": true,
+    "save_all_best": false,
+    "save_best_after": 1000,
+    "target_loss": null,
+    "print_eval": false,
+    "test_delay_epochs": -1,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": false,
+    "epochs": 1000,
+    "batch_size": 16,
+    "eval_batch_size": 16,
+    "grad_clip": 5.0,
+    "scheduler_after_epoch": true,
+    "lr": 0.0001,
+    "optimizer": "Adam",
+    "optimizer_params": {
+        "betas": [
+            0.9,
+            0.998
+        ],
+        "weight_decay": 1e-06
+    },
+    "lr_scheduler": "NoamLR",
+    "lr_scheduler_params": {
+        "warmup_steps": 4000
+    },
+    "use_grad_scaler": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "fast_pitch",
+    "num_loader_workers": 8,
+    "num_eval_loader_workers": 4,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "win_length": 1024,
+        "hop_length": 256,
+        "frame_shift_ms": null,
+        "frame_length_ms": null,
+        "stft_pad_mode": "reflect",
+        "sample_rate": 22050,
+        "resample": false,
+        "preemphasis": 0.0,
+        "ref_level_db": 20,
+        "do_sound_norm": false,
+        "log_func": "np.log",
+        "do_trim_silence": true,
+        "trim_db": 60.0,
+        "do_rms_norm": false,
+        "db_level": null,
+        "power": 1.5,
+        "griffin_lim_iters": 60,
+        "num_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": 8000,
+        "spec_gain": 1.0,
+        "do_amp_to_db_linear": true,
+        "do_amp_to_db_mel": true,
+        "pitch_fmax": 640.0,
+        "pitch_fmin": 0.0,
+        "signal_norm": false,
+        "min_level_db": -100,
+        "symmetric_norm": true,
+        "max_norm": 4.0,
+        "clip_norm": true,
+        "stats_path": null
+    },
+    "use_phonemes": false,
+    "phonemizer": null,
+    "phoneme_language": "ca-es",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": null,
+    "characters": {
+        "characters_class": "TTS.tts.utils.text.characters.Graphemes",
+        "vocab_dict": null,
+        "pad": "_",
+        "eos": "*",
+        "bos": "^",
+        "blank": null,
+        "characters": "A\u00c0\u00c1BC\u00c7DE\u00c9\u00c8FGHI\u00cd\u00cfJKLMNO\u00d3\u00d2PQRSTU\u00dc\u00daVWXYZa\u00e0\u00e1bc\u00e7de\u00e9\u00e8fghi\u00ed\u00efjklmno\u00f3\u00f2pqrstu\u00fc\u00favwxyz",
+        "punctuations": "!'(),-.:;?\u00b7 ",
+        "phonemes": "",
+        "is_unique": true,
+        "is_sorted": true
+    },
+    "add_blank": false,
+    "batch_group_size": 0,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": Infinity,
+    "compute_f0": true,
+    "compute_linear_spec": false,
+    "precompute_num_workers": 4,
+    "start_by_longest": false,
+    "datasets": [
+        {
+            "name": "custom_turkish",
+            "path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
+            "meta_file_train": "upc_ona_train.txt",
+            "ignored_speakers": null,
+            "language": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "name": "custom_turkish",
+            "path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
+            "meta_file_train": "upc_ona_val.txt",
+            "ignored_speakers": null,
+            "language": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [
+        "Hola Barcelona!",
+        "Escriviu al text."
+    ],
+    "eval_split_max_size": null,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "base_model": "forward_tts",
+    "model_args": {
+        "num_chars": 89,
+        "out_channels": 80,
+        "hidden_channels": 384,
+        "use_aligner": true,
+        "use_pitch": true,
+        "pitch_predictor_hidden_channels": 256,
+        "pitch_predictor_kernel_size": 3,
+        "pitch_predictor_dropout_p": 0.1,
+        "pitch_embedding_kernel_size": 3,
+        "duration_predictor_hidden_channels": 256,
+        "duration_predictor_kernel_size": 3,
+        "duration_predictor_dropout_p": 0.1,
+        "positional_encoding": true,
+        "poisitonal_encoding_use_scale": true,
+        "length_scale": 1,
+        "encoder_type": "fftransformer",
+        "encoder_params": {
+            "hidden_channels_ffn": 1024,
+            "num_heads": 1,
+            "num_layers": 6,
+            "dropout_p": 0.1
+        },
+        "decoder_type": "fftransformer",
+        "decoder_params": {
+            "hidden_channels_ffn": 1024,
+            "num_heads": 1,
+            "num_layers": 6,
+            "dropout_p": 0.1
+        },
+        "detach_duration_predictor": false,
+        "max_duration": 75,
+        "num_speakers": 1,
+        "use_speaker_embedding": false,
+        "speakers_file": null,
+        "use_d_vector_file": false,
+        "d_vector_dim": null,
+        "d_vector_file": null
+    },
+    "num_speakers": 0,
+    "speakers_file": null,
+    "use_speaker_embedding": false,
+    "use_d_vector_file": false,
+    "d_vector_file": false,
+    "d_vector_dim": 0,
+    "spec_loss_type": "mse",
+    "duration_loss_type": "mse",
+    "use_ssim_loss": true,
+    "ssim_loss_alpha": 1.0,
+    "spec_loss_alpha": 1.0,
+    "aligner_loss_alpha": 1.0,
+    "pitch_loss_alpha": 0.1,
+    "dur_loss_alpha": 0.1,
+    "binary_align_loss_alpha": 0.1,
+    "binary_loss_warmup_epochs": 150,
+    "min_seq_len": 13,
+    "max_seq_len": 500000,
+    "r": 1,
+    "f0_cache_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/f0_cache",
+    "restore_path": "/home/twbgmy/.local/share/tts/tts_models--en--ljspeech--fast_pitch/model_file.pth",
+    "github_branch": "* dev"
+}

models/collectivat/ljspeech--hifigan_v2_config.json ADDED Viewed

	@@ -0,0 +1,158 @@

+{
+    "run_name": "hifigan",
+    "run_description": "universal hifigan trained on LibriTTS with no spectrogram normalization and using log() for scaling instead of log10()",
+    // AUDIO PARAMETERS
+    "audio":{
+        "fft_size": 1024,         // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,      // stft window length in ms.
+        "hop_length": 256,       // stft window hop-length in ms.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+        // Audio processing parameters
+        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
+        "preemphasis": 0.0,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+        "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
+        "log_func": "np.log",
+        // Silence trimming
+        "do_trim_silence": false,// enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "trim_db": 60,          // threshold for trimming silence. Set this according to your dataset.
+        // MelSpectrogram parameters
+        "num_mels": 80,         // size of the mel spec frame.
+        "mel_fmin": 0.0,        // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 8000.0,     // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 1.0,         // scaler value applied after log transform of spectrogram.
+        // Normalization parameters
+        "signal_norm": false,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+        "min_level_db": -100,   // lower bound for normalization
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true,      // clip normalized values into the range.
+        "stats_path": null    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    },
+    // DISTRIBUTED TRAINING
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54324"
+    },
+    // MODEL PARAMETERS
+    "use_pqmf": false,
+    // LOSS PARAMETERS
+    "use_stft_loss": false,
+    "use_subband_stft_loss": false,
+    "use_mse_gan_loss": true,
+    "use_hinge_gan_loss": false,
+    "use_feat_match_loss": true,  // use only with melgan discriminators
+    "use_l1_spec_loss": true,
+    // loss weights
+    "stft_loss_weight": 0,
+    "subband_stft_loss_weight": 0,
+    "mse_G_loss_weight": 1,
+    "hinge_G_loss_weight": 0,
+    "feat_match_loss_weight": 10,
+    "l1_spec_loss_weight": 45,
+    // multiscale stft loss parameters
+    // "stft_loss_params": {
+    //     "n_ffts": [1024, 2048, 512],
+    //     "hop_lengths": [120, 240, 50],
+    //     "win_lengths": [600, 1200, 240]
+    // },
+    "l1_spec_loss_params": {
+        "use_mel": true,
+        "sample_rate": 16000,
+        "n_fft": 1024,
+        "hop_length": 256,
+        "win_length": 1024,
+        "n_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null
+    },
+    "target_loss": "avg_G_loss",  // loss value to pick the best model to save after each epoch
+    // DISCRIMINATOR
+    "discriminator_model": "hifigan_discriminator",
+    //"discriminator_model_params":{
+    //    "peroids": [2, 3, 5, 7, 11],
+    //    "base_channels": 16,
+    //    "max_channels":512,
+    //    "downsample_factors":[4, 4, 4]
+    //},
+    "steps_to_start_discriminator": 0,      // steps required to start GAN training.
+    // GENERATOR
+    "generator_model": "hifigan_generator",
+    "generator_model_params": {
+        "resblock_type": "1",
+        "upsample_factors": [8,8,2,2],
+        "upsample_kernel_sizes": [16,16,4,4],
+        "upsample_initial_channel": 128,
+        "resblock_kernel_sizes": [3,7,11],
+        "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]]
+    },
+    // DATASET
+    "data_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/vo_voice_quality_transformation/",
+    "feature_path": null,
+    // "feature_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA/",
+    "seq_len": 8192,
+    "pad_short": 2000,
+    "conv_pad": 0,
+    "use_noise_augment": false,
+    "use_cache": true,
+    "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+    // TRAINING
+    "batch_size": 16,       // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
+    // VALIDATION
+    "run_eval": true,
+    "test_delay_epochs": 10,  //Until attention is aligned, testing only wastes computation time.
+    "test_sentences_file": null,  // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+    // OPTIMIZER
+    "epochs": 10000,                // total number of epochs to train.
+    "wd": 0.0,                // Weight decay weight.
+    "gen_clip_grad": -1,      // Generator gradient clipping threshold. Apply gradient clipping if > 0
+    "disc_clip_grad": -1,     // Discriminator gradient clipping threshold.
+    // "lr_scheduler_gen": "ExponentialLR",   // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+    // "lr_scheduler_gen_params": {
+    //    "gamma": 0.999,
+        // "last_epoch": -1
+    // },
+    // "lr_scheduler_disc": "ExponentialLR",   // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+    // "lr_scheduler_disc_params": {
+    	//   "gamma": 0.999,
+        // "last_epoch": -1
+    // },
+    "lr_gen": 0.00001,                  // Initial learning rate. If Noam decay is active, maximum learning rate.
+    "lr_disc": 0.00001,
+    // TENSORBOARD and LOGGING
+    "print_step": 25,       // Number of steps to log training on console.
+    "print_eval": false,     // If True, it prints loss values for each step in eval run.
+    "save_step": 25000,      // Number of training steps expected to plot training stats on TB and save model checkpoints.
+    "checkpoint": true,     // If true, it saves checkpoints per "save_step"
+    "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+    // DATA LOADING
+    "num_loader_workers": 8,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 4,    // number of evaluation data loader processes.
+    "eval_split_size": 10,
+    // PATHS
+    "output_path": "/home/erogol/gdrive/Trainings/sam/"
+}

models/collectivat/ljspeech--hifigan_v2_model_file.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4047e93886faa1aba11948efa71f59dcb0ec9117e286660e59b91892ef98d129
+size 3794153

models/piper/MODEL_CARD ADDED Viewed

	@@ -0,0 +1,15 @@

+# Model card for upc_ona (x-low)
+* Language: ca (Catalan)
+* Speakers: 1
+* Quality: x-low
+* Samplerate: 16,000Hz
+## Dataset
+* URL: https://collectivat.cat/asr#upc-festcat-tts-corpora
+* License: CC BY-SA 3.0 ES
+## Training
+Trained from scratch.

models/piper/ca-upc_ona-x-low.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13661d26423e0c791823823a5971f4e1aaf644a62e65e0e94d299c0e70560e14
+size 20628813

models/piper/ca-upc_ona-x-low.onnx.json ADDED Viewed

	@@ -0,0 +1,409 @@

+{
+    "audio": {
+        "sample_rate": 16000
+    },
+    "espeak": {
+        "voice": "ca"
+    },
+    "inference": {
+        "noise_scale": 0.667,
+        "length_scale": 1,
+        "noise_w": 0.8
+    },
+    "phoneme_map": {},
+    "phoneme_id_map": {
+        "_": [
+            0
+        ],
+        "^": [
+            1
+        ],
+        "$": [
+            2
+        ],
+        " ": [
+            3
+        ],
+        "!": [
+            4
+        ],
+        "'": [
+            5
+        ],
+        "(": [
+            6
+        ],
+        ")": [
+            7
+        ],
+        ",": [
+            8
+        ],
+        "-": [
+            9
+        ],
+        ".": [
+            10
+        ],
+        ":": [
+            11
+        ],
+        ";": [
+            12
+        ],
+        "?": [
+            13
+        ],
+        "a": [
+            14
+        ],
+        "b": [
+            15
+        ],
+        "c": [
+            16
+        ],
+        "d": [
+            17
+        ],
+        "e": [
+            18
+        ],
+        "f": [
+            19
+        ],
+        "h": [
+            20
+        ],
+        "i": [
+            21
+        ],
+        "j": [
+            22
+        ],
+        "k": [
+            23
+        ],
+        "l": [
+            24
+        ],
+        "m": [
+            25
+        ],
+        "n": [
+            26
+        ],
+        "o": [
+            27
+        ],
+        "p": [
+            28
+        ],
+        "q": [
+            29
+        ],
+        "r": [
+            30
+        ],
+        "s": [
+            31
+        ],
+        "t": [
+            32
+        ],
+        "u": [
+            33
+        ],
+        "v": [
+            34
+        ],
+        "w": [
+            35
+        ],
+        "x": [
+            36
+        ],
+        "y": [
+            37
+        ],
+        "z": [
+            38
+        ],
+        "æ": [
+            39
+        ],
+        "ç": [
+            40
+        ],
+        "ð": [
+            41
+        ],
+        "ø": [
+            42
+        ],
+        "ħ": [
+            43
+        ],
+        "ŋ": [
+            44
+        ],
+        "œ": [
+            45
+        ],
+        "ǀ": [
+            46
+        ],
+        "ǁ": [
+            47
+        ],
+        "ǂ": [
+            48
+        ],
+        "ǃ": [
+            49
+        ],
+        "ɐ": [
+            50
+        ],
+        "ɑ": [
+            51
+        ],
+        "ɒ": [
+            52
+        ],
+        "ɓ": [
+            53
+        ],
+        "ɔ": [
+            54
+        ],
+        "ɕ": [
+            55
+        ],
+        "ɖ": [
+            56
+        ],
+        "ɗ": [
+            57
+        ],
+        "ɘ": [
+            58
+        ],
+        "ə": [
+            59
+        ],
+        "ɚ": [
+            60
+        ],
+        "ɛ": [
+            61
+        ],
+        "ɜ": [
+            62
+        ],
+        "ɞ": [
+            63
+        ],
+        "ɟ": [
+            64
+        ],
+        "ɠ": [
+            65
+        ],
+        "ɡ": [
+            66
+        ],
+        "ɢ": [
+            67
+        ],
+        "ɣ": [
+            68
+        ],
+        "ɤ": [
+            69
+        ],
+        "ɥ": [
+            70
+        ],
+        "ɦ": [
+            71
+        ],
+        "ɧ": [
+            72
+        ],
+        "ɨ": [
+            73
+        ],
+        "ɪ": [
+            74
+        ],
+        "ɫ": [
+            75
+        ],
+        "ɬ": [
+            76
+        ],
+        "ɭ": [
+            77
+        ],
+        "ɮ": [
+            78
+        ],
+        "ɯ": [
+            79
+        ],
+        "ɰ": [
+            80
+        ],
+        "ɱ": [
+            81
+        ],
+        "ɲ": [
+            82
+        ],
+        "ɳ": [
+            83
+        ],
+        "ɴ": [
+            84
+        ],
+        "ɵ": [
+            85
+        ],
+        "ɶ": [
+            86
+        ],
+        "ɸ": [
+            87
+        ],
+        "ɹ": [
+            88
+        ],
+        "ɺ": [
+            89
+        ],
+        "ɻ": [
+            90
+        ],
+        "ɽ": [
+            91
+        ],
+        "ɾ": [
+            92
+        ],
+        "ʀ": [
+            93
+        ],
+        "ʁ": [
+            94
+        ],
+        "ʂ": [
+            95
+        ],
+        "ʃ": [
+            96
+        ],
+        "ʄ": [
+            97
+        ],
+        "ʈ": [
+            98
+        ],
+        "ʉ": [
+            99
+        ],
+        "ʊ": [
+            100
+        ],
+        "ʋ": [
+            101
+        ],
+        "ʌ": [
+            102
+        ],
+        "ʍ": [
+            103
+        ],
+        "ʎ": [
+            104
+        ],
+        "ʏ": [
+            105
+        ],
+        "ʐ": [
+            106
+        ],
+        "ʑ": [
+            107
+        ],
+        "ʒ": [
+            108
+        ],
+        "ʔ": [
+            109
+        ],
+        "ʕ": [
+            110
+        ],
+        "ʘ": [
+            111
+        ],
+        "ʙ": [
+            112
+        ],
+        "ʛ": [
+            113
+        ],
+        "ʜ": [
+            114
+        ],
+        "ʝ": [
+            115
+        ],
+        "ʟ": [
+            116
+        ],
+        "ʡ": [
+            117
+        ],
+        "ʢ": [
+            118
+        ],
+        "ʲ": [
+            119
+        ],
+        "ˈ": [
+            120
+        ],
+        "ˌ": [
+            121
+        ],
+        "ː": [
+            122
+        ],
+        "ˑ": [
+            123
+        ],
+        "˞": [
+            124
+        ],
+        "β": [
+            125
+        ],
+        "θ": [
+            126
+        ],
+        "χ": [
+            127
+        ],
+        "ᵻ": [
+            128
+        ],
+        "ⱱ": [
+            129
+        ]
+    },
+    "num_symbols": 130,
+    "num_speakers": 1,
+    "speaker_id_map": {}
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+git+https://github.com/coqui-ai/TTS@dev#egg=TTS
+gradio
+espeak-phonemizer>=1.1.0,<2
+onnxruntime~=1.11.0