ccoreilly commited on
Commit
d2a1ee2
1 Parent(s): 5a22996

Import inicial

Browse files
Dockerfile ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for the Catalan TTS comparison app: builds the projecte-aina fork of
# espeak-ng from source, then runs app.py as an unprivileged user.
FROM python:3.9

# Build dependencies for espeak-ng. apt-get is the script-stable front end;
# the package lists are removed in the same layer so they do not bloat it.
RUN apt-get update && \
    apt-get install -y git make autoconf automake libtool pkg-config gcc libsonic-dev ronn kramdown libpcaudio-dev && \
    rm -rf /var/lib/apt/lists/*

# Clone, build and install espeak-ng in one layer so the source tree and the
# build artefacts are not kept in the final image.
RUN git clone --depth 1 https://github.com/projecte-aina/espeak-ng && \
    cd espeak-ng && \
    ./autogen.sh && \
    ./configure --prefix=/usr && \
    make && \
    make install && \
    cd .. && \
    rm -rf espeak-ng

RUN useradd -m -u 1000 user

USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Work from the "app" directory inside the user's home directory
WORKDIR $HOME/app

# Install the Python dependencies before copying the (large) model files so
# that changing a model does not invalidate the pip layer.
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=user models models
COPY --chown=user engine.py .
COPY --chown=user app.py .

# Create the cache directory at the exact path the variables below point to.
# (A relative "cache" here would be created under the working directory,
# /home/user/app/cache, which neither variable references.)
RUN mkdir -p /home/user/cache && chmod 777 /home/user/cache

ENV NUMBA_CACHE_DIR=/home/user/cache
ENV MPLCONFIGDIR=/home/user/cache

EXPOSE 7860

CMD ["python", "app.py"]
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Comparativa Tts Catala Espeak Upstream
3
- emoji: 🌍
4
  colorFrom: green
5
- colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  ---
 
1
  ---
2
+ title: Síntesi en català
3
+ emoji: 👁
4
  colorFrom: green
5
+ colorTo: red
6
  sdk: docker
7
  pinned: false
8
  ---
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from engine import Piper
2
+ import tempfile
3
+ from typing import Optional
4
+ from TTS.config import load_config
5
+ import gradio as gr
6
+ import numpy as np
7
+ import os
8
+ import json
9
+ from TTS.utils.manage import ModelManager
10
+ from TTS.utils.synthesizer import Synthesizer
11
+ from espeak_phonemizer import Phonemizer
12
+
13
# Maximum number of input characters that tts() synthesizes; longer text is truncated.
MAX_TXT_LEN = 325

# Initial speaker list.
# NOTE(review): SPEAKERS is reassigned further down, right after the BSC model
# is loaded, so this literal looks like a placeholder only — confirm before use.
SPEAKERS = ['f_cen_05', 'f_cen_81', 'f_occ_31', 'f_occ_de', 'f_sep_31', 'm_cen_08', 'm_occ_44', 'm_val_89']

# Phonemizer built with the "ca" voice; tts() uses it to produce the phoneme output.
fonemitzador = Phonemizer("ca")
18
+
19
def carrega_bsc():
    """Build the Synthesizer for the model stored under ``models/bsc``.

    All paths are resolved against the current working directory. No vocoder
    is supplied: both vocoder arguments are passed as ``None``.

    Returns:
        The constructed ``Synthesizer`` instance.
    """
    cwd = os.getcwd()
    checkpoint = cwd + "/models/bsc/best_model.pth"
    config = cwd + "/models/bsc/config.json"
    speakers_file = cwd + "/models/bsc/speakers.pth"

    # Positional order is kept exactly as before; the fourth argument and the
    # two vocoder arguments are None.
    return Synthesizer(checkpoint, config, speakers_file, None, None, None)
31
+
32
def carrega_collectivat():
    """Build the Synthesizer for the model stored under ``models/collectivat``.

    All paths are resolved against the current working directory. This model
    is loaded together with a separate vocoder checkpoint and vocoder config.

    Returns:
        The constructed ``Synthesizer`` instance.
    """
    cwd = os.getcwd()
    checkpoint = cwd + "/models/collectivat/fast-speech_best_model.pth"
    config = cwd + "/models/collectivat/fast-speech_config.json"
    vocoder_checkpoint = cwd + "/models/collectivat/ljspeech--hifigan_v2_model_file.pth"
    vocoder_config = cwd + "/models/collectivat/ljspeech--hifigan_v2_config.json"

    # The third and fourth positional arguments are deliberately None.
    return Synthesizer(
        checkpoint, config, None, None, vocoder_checkpoint, vocoder_config
    )
42
+
43
def carrega_piper():
    """Create the Piper engine for the ONNX voice under ``models/piper``.

    The model path is resolved against the current working directory.
    """
    onnx_path = os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx"
    return Piper(onnx_path)
45
+
46
+
47
# Load the three models once, at import time; tts() reads these module globals.
model_bsc = carrega_bsc()
# Overrides the SPEAKERS value assigned earlier in this file with the speaker
# names exposed by the loaded BSC model's speaker manager.
SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names

model_collectivat = carrega_collectivat()

model_piper = carrega_piper()
53
+
54
def tts(text, speaker_idx):
    """Synthesize ``text`` with the three loaded models and phonemize it.

    Text longer than ``MAX_TXT_LEN`` characters is truncated first. The audio
    of each model is written to its own temporary ``.wav`` file, created with
    ``delete=False`` so the file still exists after this function returns.

    Args:
        text: Input text to synthesize.
        speaker_idx: Speaker passed on to the BSC model's ``tts`` call.

    Returns:
        A tuple ``(phonemes, bsc_wav_path, collectivat_wav_path,
        piper_wav_path)``.
    """
    exceeds_limit = len(text) > MAX_TXT_LEN
    if exceeds_limit:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
    print(text)

    # Run every model before any file is created, so a synthesis failure
    # leaves no temporary files behind.
    bsc_audio = model_bsc.tts(text, speaker_idx)
    collectivat_audio = model_collectivat.tts(text)
    piper_wav_bytes = model_piper.synthesize(text)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as bsc_file:
        model_bsc.save_wav(bsc_audio, bsc_file)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as collectivat_file:
        model_collectivat.save_wav(collectivat_audio, collectivat_file)

    # The Piper engine already returns WAV bytes, so they are written as-is.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as piper_file:
        piper_file.write(piper_wav_bytes)

    phonemes = fonemitzador.phonemize(text, keep_clause_breakers=True)

    return phonemes, bsc_file.name, collectivat_file.name, piper_file.name
86
+
87
+
88
+ description="""
89
+ Amb aquesta aplicació podeu sintetitzar text a veu amb els últims models neuronals lliures pel català.
90
+
91
+ 1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina)
92
+ https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker
93
+
94
+ 2. Model Fastspeech entrenat per Col·lectivat
95
+ https://github.com/CollectivaT-dev/TTS-API
96
+
97
+ 3. Model VITS entrenat per Piper/Home Assistant
98
+ https://github.com/rhasspy/piper
99
+
100
+ Els dós últims models han estat entrenats amb la veu d'Ona de FestCAT. El primer model ha estat entrenat amb totes les veus de FestCAT, els talls de Common Voice 8 i un altre corpus pel que conté moltes veus de qualitat variable. La veu d'Ona està seleccionada per defecte per la comparativa però podeu provar les altres.
101
+
102
+ Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
103
+ https://github.com/projecte-aina/espeak-ng
104
+
105
+ NOTA: El model de col·lectivat treballa amb grafemes pel que no fa servir espeak com a fonemitzador.
106
+ """
107
+ article= ""
108
+
109
# Gradio interface around tts(): a text box and a speaker dropdown as inputs;
# the phoneme string and three audio file paths as outputs.
iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            label="Text",
            value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
        ),
        # NOTE(review): the default value assumes "ona" is one of SPEAKERS — confirm.
        gr.Dropdown(label="Selecciona un parlant pel model VITS multi-parlant del BSC", choices=SPEAKERS, value="ona")
    ],
    # Output order must match the tuple returned by tts().
    outputs=[
        gr.Markdown(label="Fonemes"),
        gr.Audio(label="BSC VITS",type="filepath"),
        gr.Audio(label="Collectivat Fastspeech",type="filepath"),
        gr.Audio(label="Piper VITS",type="filepath")
    ],
    title="Comparativa de síntesi lliure en català️",
    description=description,
    article=article,
    allow_flagging="never",
    layout="vertical",
    live=False
)
# Listen on all interfaces on port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)
engine.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import os
4
+ import wave
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import List, Mapping, Optional, Sequence, Union
8
+
9
+ import numpy as np
10
+ import onnxruntime
11
+ from espeak_phonemizer import Phonemizer
12
+
13
# Special symbols looked up in the config's phoneme_id_map by Piper.synthesize.
_BOS = "^"  # prepended to the phoneme list
_EOS = "$"  # its ids are appended after the last phoneme
_PAD = "_"  # its ids are inserted after every phoneme
16
+
17
+
18
+ @dataclass
19
+ class PiperConfig:
20
+ num_symbols: int
21
+ num_speakers: int
22
+ sample_rate: int
23
+ espeak_voice: str
24
+ length_scale: float
25
+ noise_scale: float
26
+ noise_w: float
27
+ phoneme_id_map: Mapping[str, Sequence[int]]
28
+
29
+
30
+ class Piper:
31
+ def __init__(
32
+ self,
33
+ model_path: Union[str, Path],
34
+ config_path: Optional[Union[str, Path]] = None,
35
+ use_cuda: bool = False,
36
+ ):
37
+ if config_path is None:
38
+ config_path = f"{model_path}.json"
39
+
40
+ self.config = load_config(config_path)
41
+ self.phonemizer = Phonemizer(self.config.espeak_voice)
42
+ self.onnx_options = onnxruntime.SessionOptions()
43
+ self.onnx_options.intra_op_num_threads = os.cpu_count() - 1
44
+ self.model = onnxruntime.InferenceSession(
45
+ str(model_path),
46
+ sess_options=self.onnx_options,
47
+ providers=["CPUExecutionProvider"]
48
+ if not use_cuda
49
+ else ["CUDAExecutionProvider"],
50
+ )
51
+
52
+ def synthesize(
53
+ self,
54
+ text: str,
55
+ speaker_id: Optional[int] = None,
56
+ length_scale: Optional[float] = None,
57
+ noise_scale: Optional[float] = None,
58
+ noise_w: Optional[float] = None,
59
+ ) -> bytes:
60
+ """Synthesize WAV audio from text."""
61
+ if length_scale is None:
62
+ length_scale = self.config.length_scale
63
+
64
+ if noise_scale is None:
65
+ noise_scale = self.config.noise_scale
66
+
67
+ if noise_w is None:
68
+ noise_w = self.config.noise_w
69
+
70
+ phonemes_str = self.phonemizer.phonemize(text, keep_clause_breakers=True)
71
+ phonemes = [_BOS] + list(phonemes_str)
72
+ phoneme_ids: List[int] = []
73
+
74
+ for phoneme in phonemes:
75
+ phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
76
+ phoneme_ids.extend(self.config.phoneme_id_map[_PAD])
77
+
78
+ phoneme_ids.extend(self.config.phoneme_id_map[_EOS])
79
+
80
+ phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
81
+ phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
82
+ scales = np.array(
83
+ [noise_scale, length_scale, noise_w],
84
+ dtype=np.float32,
85
+ )
86
+
87
+ if (self.config.num_speakers > 1) and (speaker_id is not None):
88
+ # Default speaker
89
+ speaker_id = 0
90
+
91
+ sid = None
92
+
93
+ if speaker_id is not None:
94
+ sid = np.array([speaker_id], dtype=np.int64)
95
+
96
+ # Synthesize through Onnx
97
+ audio = self.model.run(
98
+ None,
99
+ {
100
+ "input": phoneme_ids_array,
101
+ "input_lengths": phoneme_ids_lengths,
102
+ "scales": scales,
103
+ "sid": sid,
104
+ },
105
+ )[0].squeeze((0, 1))
106
+ audio = audio_float_to_int16(audio.squeeze())
107
+
108
+ # Convert to WAV
109
+ with io.BytesIO() as wav_io:
110
+ wav_file: wave.Wave_write = wave.open(wav_io, "wb")
111
+ with wav_file:
112
+ wav_file.setframerate(self.config.sample_rate)
113
+ wav_file.setsampwidth(2)
114
+ wav_file.setnchannels(1)
115
+ wav_file.writeframes(audio.tobytes())
116
+
117
+ return wav_io.getvalue()
118
+
119
+
120
+ def load_config(config_path: Union[str, Path]) -> PiperConfig:
121
+ with open(config_path, "r", encoding="utf-8") as config_file:
122
+ config_dict = json.load(config_file)
123
+ inference = config_dict.get("inference", {})
124
+
125
+ return PiperConfig(
126
+ num_symbols=config_dict["num_symbols"],
127
+ num_speakers=config_dict["num_speakers"],
128
+ sample_rate=config_dict["audio"]["sample_rate"],
129
+ espeak_voice=config_dict["espeak"]["voice"],
130
+ noise_scale=inference.get("noise_scale", 0.667),
131
+ length_scale=inference.get("length_scale", 1.0),
132
+ noise_w=inference.get("noise_w", 0.8),
133
+ phoneme_id_map=config_dict["phoneme_id_map"],
134
+ )
135
+
136
+
137
+ def audio_float_to_int16(
138
+ audio: np.ndarray, max_wav_value: float = 32767.0
139
+ ) -> np.ndarray:
140
+ """Normalize audio and convert to int16 range"""
141
+ audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
142
+ audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
143
+ audio_norm = audio_norm.astype("int16")
144
+ return audio_norm
models/bsc/best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b15fa7d2052bada1cf421e49d2d03b00e95b49fcd0e42b7af1d92da2880cdecc
3
+ size 1038659133
models/bsc/config.json ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/experiments_from_previous",
3
+ "logger_uri": null,
4
+ "run_name": "multispeaker_vits_ca_1e4_1e4_32",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "log_model_step": 1000,
13
+ "save_step": 1000,
14
+ "save_n_checkpoints": 5,
15
+ "save_checkpoints": true,
16
+ "save_all_best": true,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": true,
20
+ "test_delay_epochs": -1,
21
+ "run_eval": true,
22
+ "run_eval_steps": null,
23
+ "distributed_backend": "nccl",
24
+ "distributed_url": "tcp://localhost:54321",
25
+ "mixed_precision": false,
26
+ "epochs": 1000,
27
+ "batch_size": 16,
28
+ "eval_batch_size": 8,
29
+ "grad_clip": [
30
+ 1000.0,
31
+ 1000.0
32
+ ],
33
+ "scheduler_after_epoch": true,
34
+ "lr": 0.001,
35
+ "optimizer": "AdamW",
36
+ "optimizer_params": {
37
+ "betas": [
38
+ 0.8,
39
+ 0.99
40
+ ],
41
+ "eps": 1e-09,
42
+ "weight_decay": 0.01
43
+ },
44
+ "lr_scheduler": "",
45
+ "lr_scheduler_params": null,
46
+ "use_grad_scaler": false,
47
+ "cudnn_enable": true,
48
+ "cudnn_deterministic": false,
49
+ "cudnn_benchmark": false,
50
+ "training_seed": 54321,
51
+ "model": "vits",
52
+ "num_loader_workers": 4,
53
+ "num_eval_loader_workers": 4,
54
+ "use_noise_augment": false,
55
+ "audio": {
56
+ "fft_size": 1024,
57
+ "sample_rate": 22050,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "num_mels": 80,
61
+ "mel_fmin": 0,
62
+ "mel_fmax": null
63
+ },
64
+ "use_phonemes": true,
65
+ "phonemizer": "espeak",
66
+ "phoneme_language": "ca",
67
+ "compute_input_seq_cache": true,
68
+ "text_cleaner": "multilingual_cleaners",
69
+ "enable_eos_bos_chars": false,
70
+ "test_sentences_file": "",
71
+ "phoneme_cache_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/phoneme_cache",
72
+ "characters": {
73
+ "characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
74
+ "vocab_dict": null,
75
+ "pad": "<PAD>",
76
+ "eos": "<EOS>",
77
+ "bos": "<BOS>",
78
+ "blank": "<BLNK>",
79
+ "characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
80
+ "punctuations": "!'(),-.:;? ",
81
+ "phonemes": null,
82
+ "is_unique": false,
83
+ "is_sorted": true
84
+ },
85
+ "add_blank": true,
86
+ "batch_group_size": 5,
87
+ "loss_masking": null,
88
+ "min_audio_len": 1,
89
+ "max_audio_len": Infinity,
90
+ "min_text_len": 1,
91
+ "max_text_len": 325,
92
+ "compute_f0": false,
93
+ "compute_linear_spec": true,
94
+ "precompute_num_workers": 0,
95
+ "start_by_longest": false,
96
+ "datasets": [
97
+ {
98
+ "formatter": "vctk_old",
99
+ "dataset_name": "vctk_old",
100
+ "path": "/gpfs/scratch/bsc88/bsc88474/data/multispeaker_ca",
101
+ "meta_file_train": "",
102
+ "ignored_speakers": [
103
+ "uri",
104
+ "09796",
105
+ "05450"
106
+ ],
107
+ "language": "ca",
108
+ "meta_file_val": "",
109
+ "meta_file_attn_mask": ""
110
+ }
111
+ ],
112
+ "test_sentences": [
113
+ [
114
+ "Per exemple, dels nostres bancs que inverteixen en armament de les nostres empreses."
115
+ ],
116
+ [
117
+ "Preguntin-se si aix\u00f2 era necessari."
118
+ ],
119
+ [
120
+ "La suposada ocultaci\u00f3 dels informes que advertien de risc s\u00edsmic."
121
+ ],
122
+ [
123
+ "\u00c9s de 633 milions d'euros quan es far\u00e0 la publicaci\u00f3 detallada."
124
+ ]
125
+ ],
126
+ "eval_split_max_size": null,
127
+ "eval_split_size": 0.01,
128
+ "use_speaker_weighted_sampler": false,
129
+ "speaker_weighted_sampler_alpha": 1.0,
130
+ "use_language_weighted_sampler": false,
131
+ "language_weighted_sampler_alpha": 1.0,
132
+ "use_length_weighted_sampler": false,
133
+ "length_weighted_sampler_alpha": 1.0,
134
+ "model_args": {
135
+ "num_chars": 131,
136
+ "out_channels": 513,
137
+ "spec_segment_size": 32,
138
+ "hidden_channels": 192,
139
+ "hidden_channels_ffn_text_encoder": 768,
140
+ "num_heads_text_encoder": 2,
141
+ "num_layers_text_encoder": 6,
142
+ "kernel_size_text_encoder": 3,
143
+ "dropout_p_text_encoder": 0.1,
144
+ "dropout_p_duration_predictor": 0.5,
145
+ "kernel_size_posterior_encoder": 5,
146
+ "dilation_rate_posterior_encoder": 1,
147
+ "num_layers_posterior_encoder": 16,
148
+ "kernel_size_flow": 5,
149
+ "dilation_rate_flow": 1,
150
+ "num_layers_flow": 4,
151
+ "resblock_type_decoder": "1",
152
+ "resblock_kernel_sizes_decoder": [
153
+ 3,
154
+ 7,
155
+ 11
156
+ ],
157
+ "resblock_dilation_sizes_decoder": [
158
+ [
159
+ 1,
160
+ 3,
161
+ 5
162
+ ],
163
+ [
164
+ 1,
165
+ 3,
166
+ 5
167
+ ],
168
+ [
169
+ 1,
170
+ 3,
171
+ 5
172
+ ]
173
+ ],
174
+ "upsample_rates_decoder": [
175
+ 8,
176
+ 8,
177
+ 2,
178
+ 2
179
+ ],
180
+ "upsample_initial_channel_decoder": 512,
181
+ "upsample_kernel_sizes_decoder": [
182
+ 16,
183
+ 16,
184
+ 4,
185
+ 4
186
+ ],
187
+ "periods_multi_period_discriminator": [
188
+ 2,
189
+ 3,
190
+ 5,
191
+ 7,
192
+ 11
193
+ ],
194
+ "use_sdp": true,
195
+ "noise_scale": 1.0,
196
+ "inference_noise_scale": 0.667,
197
+ "length_scale": 1.0,
198
+ "noise_scale_dp": 1.0,
199
+ "inference_noise_scale_dp": 1.0,
200
+ "max_inference_len": null,
201
+ "init_discriminator": true,
202
+ "use_spectral_norm_disriminator": false,
203
+ "use_speaker_embedding": true,
204
+ "num_speakers": 257,
205
+ "speakers_file": "/home/user/app/models/bsc/speakers.pth",
206
+ "d_vector_file": null,
207
+ "speaker_embedding_channels": 256,
208
+ "use_d_vector_file": false,
209
+ "d_vector_dim": 0,
210
+ "detach_dp_input": true,
211
+ "use_language_embedding": false,
212
+ "embedded_language_dim": 4,
213
+ "num_languages": 0,
214
+ "language_ids_file": null,
215
+ "use_speaker_encoder_as_loss": false,
216
+ "speaker_encoder_config_path": "",
217
+ "speaker_encoder_model_path": "",
218
+ "condition_dp_on_speaker": true,
219
+ "freeze_encoder": false,
220
+ "freeze_DP": false,
221
+ "freeze_PE": false,
222
+ "freeze_flow_decoder": false,
223
+ "freeze_waveform_decoder": false,
224
+ "encoder_sample_rate": null,
225
+ "interpolate_z": true,
226
+ "reinit_DP": false,
227
+ "reinit_text_encoder": false
228
+ },
229
+ "lr_gen": 0.0001,
230
+ "lr_disc": 0.0001,
231
+ "lr_scheduler_gen": "ExponentialLR",
232
+ "lr_scheduler_gen_params": {
233
+ "gamma": 0.999875,
234
+ "last_epoch": -1
235
+ },
236
+ "lr_scheduler_disc": "ExponentialLR",
237
+ "lr_scheduler_disc_params": {
238
+ "gamma": 0.999875,
239
+ "last_epoch": -1
240
+ },
241
+ "kl_loss_alpha": 1.0,
242
+ "disc_loss_alpha": 1.0,
243
+ "gen_loss_alpha": 1.0,
244
+ "feat_loss_alpha": 1.0,
245
+ "mel_loss_alpha": 45.0,
246
+ "dur_loss_alpha": 1.0,
247
+ "speaker_encoder_loss_alpha": 1.0,
248
+ "return_wav": true,
249
+ "use_weighted_sampler": false,
250
+ "weighted_sampler_attrs": null,
251
+ "weighted_sampler_multipliers": null,
252
+ "r": 1,
253
+ "num_speakers": 257,
254
+ "use_speaker_embedding": true,
255
+ "speakers_file": "/home/user/app/models/bsc/speakers.pth",
256
+ "speaker_embedding_channels": 256,
257
+ "language_ids_file": null,
258
+ "use_language_embedding": false,
259
+ "use_d_vector_file": false,
260
+ "d_vector_file": null,
261
+ "d_vector_dim": 0
262
+ }
models/bsc/speaker_map.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "f_cen_05": "05739",
3
+ "f_cen_81": "8162d651b6211f06f655a69cd7fdd383d6b4287e9ba132b9898ef9ac8687349e777626333d23bed93f9264aae965efb14ed650cb64fd0ad90494aff903eaef11",
4
+ "f_occ_31": "31535cb2ece4710d08fdbeefb6f8f75ed093fee4cf8573bd601d960f8c6156f0fd0a85712761691e86e31160b993ee0eacb10c4c8aed000cc394cf7c7d207a7e",
5
+ "f_occ_de": "dee065b956b99b10db4763759d64c41791af1a7e77f1864f90a2b0847a12633dcf9bc108db7eaf73cc8d0e750f5c37383a56cd77cc2276d3960104c6bebe6346",
6
+ "f_sep_31": "31e6f3a011661320b2e59b6f8be43f6db2243e9feabc2b9787c1413788e13eb0e5810bed983bf7ff66e46417d183a91ed50b3b9be9d89e4f51aada72293b9881",
7
+ "m_cen_08": "08935",
8
+ "m_occ_44": "30b1f81c579755895581259d79a8a5a3ca45b908b0bd14ad1c6418f39aa1e2f47cb4749c69b5440cdb92e3bafb772e19e7bc2b16d196b061addd173a1309e491",
9
+ "m_val_89": "896256329fbeb5b8116349c31d8a39a7d36d5f970d48558e1db5417d611e240e4dbf473f6e49137f7aa6116394b7deabb0bbec4a014896cdc9484ee91458117d"
10
+ }
models/bsc/speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dacda0b8dd3e111c5072f8f33c08b4a29b92ac79aaf22ceca912d01e7deb905
3
+ size 30191
models/collectivat/catotron-ona-TTS-API-entry.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "voice": "ona-fast-hifigan",
3
+ "lang": "ca",
4
+ "model_type": "coqui",
5
+ "tts_config_path": "fast-speech_config.json",
6
+ "tts_model_path": "fast-speech_best_model.pth",
7
+ "vocoder_config_path": "ljspeech--hifigan_v2_config.json",
8
+ "vocoder_model_path": "ljspeech--hifigan_v2_model_file.pth",
9
+ "load": true
10
+ }
models/collectivat/fast-speech_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a5aefb9f49f6172e34b816e1de8f5234012f0a9a05747973f6610e40869983f
3
+ size 457921637
models/collectivat/fast-speech_config.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron",
3
+ "logger_uri": null,
4
+ "run_name": "fast_pitch_ljspeech",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 50,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 5,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 1000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": -1,
21
+ "run_eval": true,
22
+ "run_eval_steps": null,
23
+ "distributed_backend": "nccl",
24
+ "distributed_url": "tcp://localhost:54321",
25
+ "mixed_precision": false,
26
+ "epochs": 1000,
27
+ "batch_size": 16,
28
+ "eval_batch_size": 16,
29
+ "grad_clip": 5.0,
30
+ "scheduler_after_epoch": true,
31
+ "lr": 0.0001,
32
+ "optimizer": "Adam",
33
+ "optimizer_params": {
34
+ "betas": [
35
+ 0.9,
36
+ 0.998
37
+ ],
38
+ "weight_decay": 1e-06
39
+ },
40
+ "lr_scheduler": "NoamLR",
41
+ "lr_scheduler_params": {
42
+ "warmup_steps": 4000
43
+ },
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "fast_pitch",
50
+ "num_loader_workers": 8,
51
+ "num_eval_loader_workers": 4,
52
+ "use_noise_augment": false,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "use_phonemes": false,
88
+ "phonemizer": null,
89
+ "phoneme_language": "ca-es",
90
+ "compute_input_seq_cache": true,
91
+ "text_cleaner": "multilingual_cleaners",
92
+ "enable_eos_bos_chars": false,
93
+ "test_sentences_file": "",
94
+ "phoneme_cache_path": null,
95
+ "characters": {
96
+ "characters_class": "TTS.tts.utils.text.characters.Graphemes",
97
+ "vocab_dict": null,
98
+ "pad": "_",
99
+ "eos": "*",
100
+ "bos": "^",
101
+ "blank": null,
102
+ "characters": "A\u00c0\u00c1BC\u00c7DE\u00c9\u00c8FGHI\u00cd\u00cfJKLMNO\u00d3\u00d2PQRSTU\u00dc\u00daVWXYZa\u00e0\u00e1bc\u00e7de\u00e9\u00e8fghi\u00ed\u00efjklmno\u00f3\u00f2pqrstu\u00fc\u00favwxyz",
103
+ "punctuations": "!'(),-.:;?\u00b7 ",
104
+ "phonemes": "",
105
+ "is_unique": true,
106
+ "is_sorted": true
107
+ },
108
+ "add_blank": false,
109
+ "batch_group_size": 0,
110
+ "loss_masking": null,
111
+ "min_audio_len": 1,
112
+ "max_audio_len": Infinity,
113
+ "min_text_len": 1,
114
+ "max_text_len": Infinity,
115
+ "compute_f0": true,
116
+ "compute_linear_spec": false,
117
+ "precompute_num_workers": 4,
118
+ "start_by_longest": false,
119
+ "datasets": [
120
+ {
121
+ "name": "custom_turkish",
122
+ "path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
123
+ "meta_file_train": "upc_ona_train.txt",
124
+ "ignored_speakers": null,
125
+ "language": "",
126
+ "meta_file_val": "",
127
+ "meta_file_attn_mask": ""
128
+ },
129
+ {
130
+ "name": "custom_turkish",
131
+ "path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
132
+ "meta_file_train": "upc_ona_val.txt",
133
+ "ignored_speakers": null,
134
+ "language": "",
135
+ "meta_file_val": "",
136
+ "meta_file_attn_mask": ""
137
+ }
138
+ ],
139
+ "test_sentences": [
140
+ "Hola Barcelona!",
141
+ "Escriviu al text."
142
+ ],
143
+ "eval_split_max_size": null,
144
+ "eval_split_size": 0.01,
145
+ "use_speaker_weighted_sampler": false,
146
+ "speaker_weighted_sampler_alpha": 1.0,
147
+ "use_language_weighted_sampler": false,
148
+ "language_weighted_sampler_alpha": 1.0,
149
+ "use_length_weighted_sampler": false,
150
+ "length_weighted_sampler_alpha": 1.0,
151
+ "base_model": "forward_tts",
152
+ "model_args": {
153
+ "num_chars": 89,
154
+ "out_channels": 80,
155
+ "hidden_channels": 384,
156
+ "use_aligner": true,
157
+ "use_pitch": true,
158
+ "pitch_predictor_hidden_channels": 256,
159
+ "pitch_predictor_kernel_size": 3,
160
+ "pitch_predictor_dropout_p": 0.1,
161
+ "pitch_embedding_kernel_size": 3,
162
+ "duration_predictor_hidden_channels": 256,
163
+ "duration_predictor_kernel_size": 3,
164
+ "duration_predictor_dropout_p": 0.1,
165
+ "positional_encoding": true,
166
+ "poisitonal_encoding_use_scale": true,
167
+ "length_scale": 1,
168
+ "encoder_type": "fftransformer",
169
+ "encoder_params": {
170
+ "hidden_channels_ffn": 1024,
171
+ "num_heads": 1,
172
+ "num_layers": 6,
173
+ "dropout_p": 0.1
174
+ },
175
+ "decoder_type": "fftransformer",
176
+ "decoder_params": {
177
+ "hidden_channels_ffn": 1024,
178
+ "num_heads": 1,
179
+ "num_layers": 6,
180
+ "dropout_p": 0.1
181
+ },
182
+ "detach_duration_predictor": false,
183
+ "max_duration": 75,
184
+ "num_speakers": 1,
185
+ "use_speaker_embedding": false,
186
+ "speakers_file": null,
187
+ "use_d_vector_file": false,
188
+ "d_vector_dim": null,
189
+ "d_vector_file": null
190
+ },
191
+ "num_speakers": 0,
192
+ "speakers_file": null,
193
+ "use_speaker_embedding": false,
194
+ "use_d_vector_file": false,
195
+ "d_vector_file": false,
196
+ "d_vector_dim": 0,
197
+ "spec_loss_type": "mse",
198
+ "duration_loss_type": "mse",
199
+ "use_ssim_loss": true,
200
+ "ssim_loss_alpha": 1.0,
201
+ "spec_loss_alpha": 1.0,
202
+ "aligner_loss_alpha": 1.0,
203
+ "pitch_loss_alpha": 0.1,
204
+ "dur_loss_alpha": 0.1,
205
+ "binary_align_loss_alpha": 0.1,
206
+ "binary_loss_warmup_epochs": 150,
207
+ "min_seq_len": 13,
208
+ "max_seq_len": 500000,
209
+ "r": 1,
210
+ "f0_cache_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/f0_cache",
211
+ "restore_path": "/home/twbgmy/.local/share/tts/tts_models--en--ljspeech--fast_pitch/model_file.pth",
212
+ "github_branch": "* dev"
213
+ }
models/collectivat/ljspeech--hifigan_v2_config.json ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_name": "hifigan",
3
+ "run_description": "universal hifigan trained on LibriTTS with no spectrogram normalization and using log() for scaling instead of log10()",
4
+
5
+
6
+ // AUDIO PARAMETERS
7
+ "audio":{
8
+ "fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
9
+ "win_length": 1024, // stft window length in ms.
10
+ "hop_length": 256, // stft window hop-length in ms.
11
+ "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
12
+ "frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
13
+
14
+ // Audio processing parameters
15
+ "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
16
+ "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
17
+ "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
18
+ "log_func": "np.log",
19
+
20
+ // Silence trimming
21
+ "do_trim_silence": false,// enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
22
+ "trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
23
+
24
+ // MelSpectrogram parameters
25
+ "num_mels": 80, // size of the mel spec frame.
26
+ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
27
+ "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
28
+ "spec_gain": 1.0, // scaler value applied after log transform of spectrogram.
29
+
30
+ // Normalization parameters
31
+ "signal_norm": false, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
32
+ "min_level_db": -100, // lower bound for normalization
33
+ "symmetric_norm": true, // move normalization to range [-1, 1]
34
+ "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
35
+ "clip_norm": true, // clip normalized values into the range.
36
+ "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
37
+ },
38
+
39
+ // DISTRIBUTED TRAINING
40
+ "distributed":{
41
+ "backend": "nccl",
42
+ "url": "tcp:\/\/localhost:54324"
43
+ },
44
+
45
+ // MODEL PARAMETERS
46
+ "use_pqmf": false,
47
+
48
+ // LOSS PARAMETERS
49
+ "use_stft_loss": false,
50
+ "use_subband_stft_loss": false,
51
+ "use_mse_gan_loss": true,
52
+ "use_hinge_gan_loss": false,
53
+ "use_feat_match_loss": true, // use only with melgan discriminators
54
+ "use_l1_spec_loss": true,
55
+
56
+ // loss weights
57
+ "stft_loss_weight": 0,
58
+ "subband_stft_loss_weight": 0,
59
+ "mse_G_loss_weight": 1,
60
+ "hinge_G_loss_weight": 0,
61
+ "feat_match_loss_weight": 10,
62
+ "l1_spec_loss_weight": 45,
63
+
64
+ // multiscale stft loss parameters
65
+ // "stft_loss_params": {
66
+ // "n_ffts": [1024, 2048, 512],
67
+ // "hop_lengths": [120, 240, 50],
68
+ // "win_lengths": [600, 1200, 240]
69
+ // },
70
+
71
+ "l1_spec_loss_params": {
72
+ "use_mel": true,
73
+ "sample_rate": 16000,
74
+ "n_fft": 1024,
75
+ "hop_length": 256,
76
+ "win_length": 1024,
77
+ "n_mels": 80,
78
+ "mel_fmin": 0.0,
79
+ "mel_fmax": null
80
+ },
81
+
82
+ "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
83
+
84
+ // DISCRIMINATOR
85
+ "discriminator_model": "hifigan_discriminator",
86
+ //"discriminator_model_params":{
87
+ // "peroids": [2, 3, 5, 7, 11],
88
+ // "base_channels": 16,
89
+ // "max_channels":512,
90
+ // "downsample_factors":[4, 4, 4]
91
+ //},
92
+ "steps_to_start_discriminator": 0, // steps required to start GAN training.
93
+
94
+ // GENERATOR
95
+ "generator_model": "hifigan_generator",
96
+ "generator_model_params": {
97
+ "resblock_type": "1",
98
+ "upsample_factors": [8,8,2,2],
99
+ "upsample_kernel_sizes": [16,16,4,4],
100
+ "upsample_initial_channel": 128,
101
+ "resblock_kernel_sizes": [3,7,11],
102
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]]
103
+ },
104
+
105
+ // DATASET
106
+ "data_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/vo_voice_quality_transformation/",
107
+ "feature_path": null,
108
+ // "feature_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA/",
109
+ "seq_len": 8192,
110
+ "pad_short": 2000,
111
+ "conv_pad": 0,
112
+ "use_noise_augment": false,
113
+ "use_cache": true,
114
+ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
115
+
116
+ // TRAINING
117
+ "batch_size": 16, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
118
+
119
+ // VALIDATION
120
+ "run_eval": true,
121
+ "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
122
+ "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
123
+
124
+ // OPTIMIZER
125
+ "epochs": 10000, // total number of epochs to train.
126
+ "wd": 0.0, // Weight decay weight.
127
+ "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
128
+ "disc_clip_grad": -1, // Discriminator gradient clipping threshold.
129
+ // "lr_scheduler_gen": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
130
+ // "lr_scheduler_gen_params": {
131
+ // "gamma": 0.999,
132
+ // "last_epoch": -1
133
+ // },
134
+ // "lr_scheduler_disc": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
135
+ // "lr_scheduler_disc_params": {
136
+ // "gamma": 0.999,
137
+ // "last_epoch": -1
138
+ // },
139
+ "lr_gen": 0.00001, // Initial learning rate. If Noam decay is active, maximum learning rate.
140
+ "lr_disc": 0.00001,
141
+
142
+ // TENSORBOARD and LOGGING
143
+ "print_step": 25, // Number of steps to log training on console.
144
+ "print_eval": false, // If True, it prints loss values for each step in eval run.
145
+ "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
146
+ "checkpoint": true, // If true, it saves checkpoints per "save_step"
147
+ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
148
+
149
+ // DATA LOADING
150
+ "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
151
+ "num_val_loader_workers": 4, // number of evaluation data loader processes.
152
+ "eval_split_size": 10,
153
+
154
+ // PATHS
155
+ "output_path": "/home/erogol/gdrive/Trainings/sam/"
156
+ }
157
+
158
+
models/collectivat/ljspeech--hifigan_v2_model_file.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4047e93886faa1aba11948efa71f59dcb0ec9117e286660e59b91892ef98d129
3
+ size 3794153
models/piper/MODEL_CARD ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model card for upc_ona (x-low)
2
+
3
+ * Language: ca (Catalan)
4
+ * Speakers: 1
5
+ * Quality: x-low
6
+ * Samplerate: 16,000Hz
7
+
8
+ ## Dataset
9
+
10
+ * URL: https://collectivat.cat/asr#upc-festcat-tts-corpora
11
+ * License: CC BY-SA 3.0 ES
12
+
13
+ ## Training
14
+
15
+ Trained from scratch.
models/piper/ca-upc_ona-x-low.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13661d26423e0c791823823a5971f4e1aaf644a62e65e0e94d299c0e70560e14
3
+ size 20628813
models/piper/ca-upc_ona-x-low.onnx.json ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio": {
3
+ "sample_rate": 16000
4
+ },
5
+ "espeak": {
6
+ "voice": "ca"
7
+ },
8
+ "inference": {
9
+ "noise_scale": 0.667,
10
+ "length_scale": 1,
11
+ "noise_w": 0.8
12
+ },
13
+ "phoneme_map": {},
14
+ "phoneme_id_map": {
15
+ "_": [
16
+ 0
17
+ ],
18
+ "^": [
19
+ 1
20
+ ],
21
+ "$": [
22
+ 2
23
+ ],
24
+ " ": [
25
+ 3
26
+ ],
27
+ "!": [
28
+ 4
29
+ ],
30
+ "'": [
31
+ 5
32
+ ],
33
+ "(": [
34
+ 6
35
+ ],
36
+ ")": [
37
+ 7
38
+ ],
39
+ ",": [
40
+ 8
41
+ ],
42
+ "-": [
43
+ 9
44
+ ],
45
+ ".": [
46
+ 10
47
+ ],
48
+ ":": [
49
+ 11
50
+ ],
51
+ ";": [
52
+ 12
53
+ ],
54
+ "?": [
55
+ 13
56
+ ],
57
+ "a": [
58
+ 14
59
+ ],
60
+ "b": [
61
+ 15
62
+ ],
63
+ "c": [
64
+ 16
65
+ ],
66
+ "d": [
67
+ 17
68
+ ],
69
+ "e": [
70
+ 18
71
+ ],
72
+ "f": [
73
+ 19
74
+ ],
75
+ "h": [
76
+ 20
77
+ ],
78
+ "i": [
79
+ 21
80
+ ],
81
+ "j": [
82
+ 22
83
+ ],
84
+ "k": [
85
+ 23
86
+ ],
87
+ "l": [
88
+ 24
89
+ ],
90
+ "m": [
91
+ 25
92
+ ],
93
+ "n": [
94
+ 26
95
+ ],
96
+ "o": [
97
+ 27
98
+ ],
99
+ "p": [
100
+ 28
101
+ ],
102
+ "q": [
103
+ 29
104
+ ],
105
+ "r": [
106
+ 30
107
+ ],
108
+ "s": [
109
+ 31
110
+ ],
111
+ "t": [
112
+ 32
113
+ ],
114
+ "u": [
115
+ 33
116
+ ],
117
+ "v": [
118
+ 34
119
+ ],
120
+ "w": [
121
+ 35
122
+ ],
123
+ "x": [
124
+ 36
125
+ ],
126
+ "y": [
127
+ 37
128
+ ],
129
+ "z": [
130
+ 38
131
+ ],
132
+ "æ": [
133
+ 39
134
+ ],
135
+ "ç": [
136
+ 40
137
+ ],
138
+ "ð": [
139
+ 41
140
+ ],
141
+ "ø": [
142
+ 42
143
+ ],
144
+ "ħ": [
145
+ 43
146
+ ],
147
+ "ŋ": [
148
+ 44
149
+ ],
150
+ "œ": [
151
+ 45
152
+ ],
153
+ "ǀ": [
154
+ 46
155
+ ],
156
+ "ǁ": [
157
+ 47
158
+ ],
159
+ "ǂ": [
160
+ 48
161
+ ],
162
+ "ǃ": [
163
+ 49
164
+ ],
165
+ "ɐ": [
166
+ 50
167
+ ],
168
+ "ɑ": [
169
+ 51
170
+ ],
171
+ "ɒ": [
172
+ 52
173
+ ],
174
+ "ɓ": [
175
+ 53
176
+ ],
177
+ "ɔ": [
178
+ 54
179
+ ],
180
+ "ɕ": [
181
+ 55
182
+ ],
183
+ "ɖ": [
184
+ 56
185
+ ],
186
+ "ɗ": [
187
+ 57
188
+ ],
189
+ "ɘ": [
190
+ 58
191
+ ],
192
+ "ə": [
193
+ 59
194
+ ],
195
+ "ɚ": [
196
+ 60
197
+ ],
198
+ "ɛ": [
199
+ 61
200
+ ],
201
+ "ɜ": [
202
+ 62
203
+ ],
204
+ "ɞ": [
205
+ 63
206
+ ],
207
+ "ɟ": [
208
+ 64
209
+ ],
210
+ "ɠ": [
211
+ 65
212
+ ],
213
+ "ɡ": [
214
+ 66
215
+ ],
216
+ "ɢ": [
217
+ 67
218
+ ],
219
+ "ɣ": [
220
+ 68
221
+ ],
222
+ "ɤ": [
223
+ 69
224
+ ],
225
+ "ɥ": [
226
+ 70
227
+ ],
228
+ "ɦ": [
229
+ 71
230
+ ],
231
+ "ɧ": [
232
+ 72
233
+ ],
234
+ "ɨ": [
235
+ 73
236
+ ],
237
+ "ɪ": [
238
+ 74
239
+ ],
240
+ "ɫ": [
241
+ 75
242
+ ],
243
+ "ɬ": [
244
+ 76
245
+ ],
246
+ "ɭ": [
247
+ 77
248
+ ],
249
+ "ɮ": [
250
+ 78
251
+ ],
252
+ "ɯ": [
253
+ 79
254
+ ],
255
+ "ɰ": [
256
+ 80
257
+ ],
258
+ "ɱ": [
259
+ 81
260
+ ],
261
+ "ɲ": [
262
+ 82
263
+ ],
264
+ "ɳ": [
265
+ 83
266
+ ],
267
+ "ɴ": [
268
+ 84
269
+ ],
270
+ "ɵ": [
271
+ 85
272
+ ],
273
+ "ɶ": [
274
+ 86
275
+ ],
276
+ "ɸ": [
277
+ 87
278
+ ],
279
+ "ɹ": [
280
+ 88
281
+ ],
282
+ "ɺ": [
283
+ 89
284
+ ],
285
+ "ɻ": [
286
+ 90
287
+ ],
288
+ "ɽ": [
289
+ 91
290
+ ],
291
+ "ɾ": [
292
+ 92
293
+ ],
294
+ "ʀ": [
295
+ 93
296
+ ],
297
+ "ʁ": [
298
+ 94
299
+ ],
300
+ "ʂ": [
301
+ 95
302
+ ],
303
+ "ʃ": [
304
+ 96
305
+ ],
306
+ "ʄ": [
307
+ 97
308
+ ],
309
+ "ʈ": [
310
+ 98
311
+ ],
312
+ "ʉ": [
313
+ 99
314
+ ],
315
+ "ʊ": [
316
+ 100
317
+ ],
318
+ "ʋ": [
319
+ 101
320
+ ],
321
+ "ʌ": [
322
+ 102
323
+ ],
324
+ "ʍ": [
325
+ 103
326
+ ],
327
+ "ʎ": [
328
+ 104
329
+ ],
330
+ "ʏ": [
331
+ 105
332
+ ],
333
+ "ʐ": [
334
+ 106
335
+ ],
336
+ "ʑ": [
337
+ 107
338
+ ],
339
+ "ʒ": [
340
+ 108
341
+ ],
342
+ "ʔ": [
343
+ 109
344
+ ],
345
+ "ʕ": [
346
+ 110
347
+ ],
348
+ "ʘ": [
349
+ 111
350
+ ],
351
+ "ʙ": [
352
+ 112
353
+ ],
354
+ "ʛ": [
355
+ 113
356
+ ],
357
+ "ʜ": [
358
+ 114
359
+ ],
360
+ "ʝ": [
361
+ 115
362
+ ],
363
+ "ʟ": [
364
+ 116
365
+ ],
366
+ "ʡ": [
367
+ 117
368
+ ],
369
+ "ʢ": [
370
+ 118
371
+ ],
372
+ "ʲ": [
373
+ 119
374
+ ],
375
+ "ˈ": [
376
+ 120
377
+ ],
378
+ "ˌ": [
379
+ 121
380
+ ],
381
+ "ː": [
382
+ 122
383
+ ],
384
+ "ˑ": [
385
+ 123
386
+ ],
387
+ "˞": [
388
+ 124
389
+ ],
390
+ "β": [
391
+ 125
392
+ ],
393
+ "θ": [
394
+ 126
395
+ ],
396
+ "χ": [
397
+ 127
398
+ ],
399
+ "ᵻ": [
400
+ 128
401
+ ],
402
+ "ⱱ": [
403
+ 129
404
+ ]
405
+ },
406
+ "num_symbols": 130,
407
+ "num_speakers": 1,
408
+ "speaker_id_map": {}
409
+ }
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ git+https://github.com/coqui-ai/TTS@dev#egg=TTS
2
+ gradio
3
+ espeak-phonemizer>=1.1.0,<2
4
+ onnxruntime~=1.11.0