Spaces:

robinhad
/

ukrainian-tts

Running

App Files Community

Yurii Paniv committed on Jan 14, 2022

Commit

cb6b82c

1 Parent(s): e35756c

Add VITS model

Browse files

Files changed (4) hide show

.gitignore +3 -0
README.md +2 -2
app.py +9 -14
config.json +220 -157

.gitignore CHANGED Viewed

@@ -127,3 +127,6 @@ dmypy.json
 # Pyre type checker
 .pyre/

 # Pyre type checker
 .pyre/
+# model files
+*.pth.tar

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: "Ukrainian TTS"
-emoji: 🐸
 colorFrom: green
 colorTo: green
 sdk: gradio
@@ -15,7 +15,7 @@ Trained on [M-AILABS Ukrainian dataset](https://www.caito.de/2019/01/the-m-ailab
 Link to online demo -> [https://huggingface.co/spaces/robinhad/ukrainian-tts](https://huggingface.co/spaces/robinhad/ukrainian-tts)
 # Support
-If you like my work, please support -> [SUPPORT LINK](https://send.monobank.ua/jar/48iHq4xAXm)
 # Example
 https://user-images.githubusercontent.com/5759207/140622395-9e734c95-159c-4d72-9f56-e8d1f1ac66c2.mp4

 ---
 title: "Ukrainian TTS"
+emoji: 🇺🇦
 colorFrom: green
 colorTo: green
 sdk: gradio
 Link to online demo -> [https://huggingface.co/spaces/robinhad/ukrainian-tts](https://huggingface.co/spaces/robinhad/ukrainian-tts)
 # Support
+If you like my work, please support -> ![mono](https://www.monobank.ua/favicon.ico) [SUPPORT LINK](https://send.monobank.ua/jar/48iHq4xAXm)
 # Example
 https://user-images.githubusercontent.com/5759207/140622395-9e734c95-159c-4d72-9f56-e8d1f1ac66c2.mp4

app.py CHANGED Viewed

@@ -1,8 +1,6 @@
 import tempfile
-from typing import Optional
 import gradio as gr
-import numpy as np
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
@@ -10,7 +8,7 @@ import requests
 from os.path import exists
 MODEL_NAMES = [
-    "uk/mai/glow-tts"
 ]
 MODELS = {}
@@ -29,21 +27,18 @@ def download(url, file_name):
 for MODEL_NAME in MODEL_NAMES:
     print(f"downloading {MODEL_NAME}")
-    model_path, config_path, model_item = manager.download_model(
-        f"tts_models/{MODEL_NAME}")
-    vocoder_name: Optional[str] = model_item["default_vocoder"]
-    release_number = "0.0.1"
-    vocoder_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/v{release_number}/vocoder.pth.tar"
-    vocoder_config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/v{release_number}/vocoder_config.json"
-    vocoder_path = "vocoder.pth.tar"
-    vocoder_config_path = "vocoder_config.json"
-    download(vocoder_link, vocoder_path)
-    download(vocoder_config_link, vocoder_config_path)
     synthesizer = Synthesizer(
-        model_path, config_path, None, vocoder_path, vocoder_config_path,
     )
     MODELS[MODEL_NAME] = synthesizer

 import tempfile
 import gradio as gr
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
 from os.path import exists
 MODEL_NAMES = [
+    "uk/mai/vits-tts"
 ]
 MODELS = {}
 for MODEL_NAME in MODEL_NAMES:
     print(f"downloading {MODEL_NAME}")
+    release_number = "1.0.0"
+    model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/v{release_number}/model.pth.tar"
+    config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/v{release_number}/config.json"
+    model_path = "model.pth.tar"
+    config_path = "config.json"
+    download(model_link, model_path)
+    download(config_link, config_path)
     synthesizer = Synthesizer(
+        model_path, config_path, None, None, None,
     )
     MODELS[MODEL_NAME] = synthesizer

config.json CHANGED Viewed

@@ -1,158 +1,221 @@
-{
-    "model": "glow_tts",
-    "run_name": "coqui_tts",
-    "run_description": "",
-    "epochs": 1000,
-    "batch_size": 32,
-    "eval_batch_size": 16,
-    "mixed_precision": true,
-    "scheduler_after_epoch": false,
-    "run_eval": true,
-    "test_delay_epochs": -1,
-    "print_eval": true,
-    "dashboard_logger": "tensorboard",
-    "print_step": 25,
-    "plot_step": 100,
-    "model_param_stats": false,
-    "project_name": null,
-    "log_model_step": null,
-    "wandb_entity": null,
-    "save_step": 10000,
-    "checkpoint": true,
-    "keep_all_best": false,
-    "keep_after": 10000,
-    "num_loader_workers": 1,
-    "num_eval_loader_workers": 1,
-    "use_noise_augment": false,
-    "output_path": "./ukrainian",
-    "distributed_backend": "nccl",
-    "distributed_url": "tcp://localhost:54321",
-    "audio": {
-        "fft_size": 1024,
-        "win_length": 1024,
-        "hop_length": 256,
-        "frame_shift_ms": null,
-        "frame_length_ms": null,
-        "stft_pad_mode": "reflect",
-        "sample_rate": 16000,
-        "resample": false,
-        "preemphasis": 0.0,
-        "ref_level_db": 20,
-        "do_sound_norm": false,
-        "log_func": "np.log10",
-        "do_trim_silence": true,
-        "trim_db": 45,
-        "power": 1.5,
-        "griffin_lim_iters": 60,
-        "num_mels": 80,
-        "mel_fmin": 0.0,
-        "mel_fmax": null,
-        "spec_gain": 20,
-        "do_amp_to_db_linear": true,
-        "do_amp_to_db_mel": true,
-        "signal_norm": true,
-        "min_level_db": -100,
-        "symmetric_norm": true,
-        "max_norm": 4.0,
-        "clip_norm": true,
-        "stats_path": null
-    },
-    "use_phonemes": false,
-    "use_espeak_phonemes": false,
-    "phoneme_language": null,
-    "compute_input_seq_cache": false,
-    "text_cleaner": "basic_cleaners",
-    "enable_eos_bos_chars": false,
-    "test_sentences_file": "",
-    "phoneme_cache_path": "./phoneme_cache",
-    "characters": {
-        "pad": "_",
-        "eos": "~",
-        "bos": "^",
-        "characters": "!',-.:;?ABIMXaceinoprxy\u0404\u0406\u0407\u0410\u0411\u0412\u0413\u0414\u0415\u0416\u0417\u0418\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427\u0428\u0429\u042c\u042f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0454\u0456\u0457\u0490\u0491 ",
-        "punctuations": "!',-.:;? ",
-        "phonemes": null,
-        "unique": true
-    },
-    "batch_group_size": 0,
-    "loss_masking": null,
-    "sort_by_audio_len": false,
-    "min_seq_len": 3,
-    "max_seq_len": 500,
-    "compute_f0": false,
-    "compute_linear_spec": false,
-    "add_blank": false,
-    "datasets": [
-        {
-            "name": "ljspeech",
-            "path": "./data/uk_UK/by_book/female/sumska/kaydasheva",
-            "meta_file_train": "metadata.csv",
-            "ununsed_speakers": null,
-            "meta_file_val": "",
-            "meta_file_attn_mask": ""
-        },
-        {
-            "name": "ljspeech",
-            "path": "./data/uk_UK/by_book/female/sumska/mykola_djerya",
-            "meta_file_train": "metadata.csv",
-            "ununsed_speakers": null,
-            "meta_file_val": "",
-            "meta_file_attn_mask": ""
-        }
-    ],
-    "optimizer": "RAdam",
-    "optimizer_params": {
-        "betas": [
-            0.9,
-            0.998
-        ],
-        "weight_decay": 1e-06
-    },
-    "lr_scheduler": "NoamLR",
-    "lr_scheduler_params": {
-        "warmup_steps": 4000
-    },
-    "test_sentences": [
-        "\u0413\u043e\u0432\u043e\u0440\u0438 \u043d\u0456\u0431\u0438 \u0442\u0438 \u0436\u0438\u0432\u0438\u0439!",
-        "\u041f\u043e\u043b \u043f\u0435\u0440\u0435\u0442\u043d\u0443\u0432 \u043f\u0443\u0441\u0442\u0435\u043b\u044e",
-        "\u041f\u0440\u0438\u0432\u0456\u0442, \u0441\u0432\u0456\u0442\u0435!"
-    ],
-    "use_speaker_embedding": false,
-    "use_d_vector_file": false,
-    "d_vector_dim": 0,
-    "num_chars": null,
-    "encoder_type": "rel_pos_transformer",
-    "encoder_params": {
-        "kernel_size": 3,
-        "dropout_p": 0.1,
-        "num_layers": 6,
-        "num_heads": 2,
-        "hidden_channels_ffn": 768,
-        "input_length": null
-    },
-    "use_encoder_prenet": true,
-    "hidden_channels_enc": 192,
-    "hidden_channels_dec": 192,
-    "hidden_channels_dp": 256,
-    "dropout_p_dp": 0.1,
-    "dropout_p_dec": 0.05,
-    "mean_only": true,
-    "out_channels": 80,
-    "num_flow_blocks_dec": 12,
-    "inference_noise_scale": 0.0,
-    "kernel_size_dec": 5,
-    "dilation_rate": 1,
-    "num_block_layers": 4,
-    "num_speakers": 0,
-    "c_in_channels": 0,
-    "num_splits": 4,
-    "num_squeeze": 2,
-    "sigmoid_scale": false,
-    "data_dep_init_steps": 10,
-    "style_wav_for_test": null,
-    "length_scale": 1.0,
-    "d_vector_file": false,
-    "grad_clip": 5.0,
-    "lr": 0.001,
-    "r": 1
 }

+{
+    "model": "vits",
+    "run_name": "vits_ljspeech",
+    "run_description": "",
+    "epochs": 1000,
+    "batch_size": 18,
+    "eval_batch_size": 16,
+    "mixed_precision": true,
+    "scheduler_after_epoch": true,
+    "run_eval": true,
+    "test_delay_epochs": -1,
+    "print_eval": true,
+    "dashboard_logger": "tensorboard",
+    "print_step": 25,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "project_name": null,
+    "log_model_step": null,
+    "wandb_entity": null,
+    "save_step": 10000,
+    "checkpoint": true,
+    "keep_all_best": false,
+    "keep_after": 10000,
+    "num_loader_workers": 12,
+    "num_eval_loader_workers": 12,
+    "use_noise_augment": false,
+    "output_path": "./ukrainian-vits",
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "audio": {
+        "fft_size": 1024,
+        "win_length": 1024,
+        "hop_length": 256,
+        "frame_shift_ms": null,
+        "frame_length_ms": null,
+        "stft_pad_mode": "reflect",
+        "sample_rate": 16000,
+        "resample": false,
+        "preemphasis": 0.0,
+        "ref_level_db": 20,
+        "do_sound_norm": false,
+        "log_func": "np.log",
+        "do_trim_silence": true,
+        "trim_db": 45,
+        "power": 1.3,
+        "griffin_lim_iters": 60,
+        "num_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null,
+        "spec_gain": 1,
+        "do_amp_to_db_linear": false,
+        "do_amp_to_db_mel": true,
+        "signal_norm": false,
+        "min_level_db": -100,
+        "symmetric_norm": true,
+        "max_norm": 4.0,
+        "clip_norm": true,
+        "stats_path": null
+    },
+    "use_phonemes": false,
+    "use_espeak_phonemes": true,
+    "phoneme_language": null,
+    "compute_input_seq_cache": true,
+    "text_cleaner": "basic_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": "./ukrainian/phoneme_cache",
+    "characters": {
+        "pad": "_",
+        "eos": "~",
+        "bos": "^",
+        "characters": "!',-.:;?\u0410\u0411\u0412\u0413\u0490\u0414\u0415\u0404\u0416\u0417\u0418\u0406\u0407\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427\u0428\u0429\u042c\u042e\u042f\u0430\u0431\u0432\u0433\u0491\u0434\u0435\u0454\u0436\u0437\u0438\u0456\u0457\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f ",
+        "punctuations": "!',-.:;? ",
+        "phonemes": null,
+        "unique": true
+    },
+    "batch_group_size": 5,
+    "loss_masking": null,
+    "sort_by_audio_len": true,
+    "min_seq_len": 0,
+    "max_seq_len": 500000,
+    "compute_f0": false,
+    "compute_linear_spec": true,
+    "add_blank": true,
+    "datasets": [
+        {
+            "name": "ljspeech",
+            "path": "./Data/uk_UK/by_book/female/sumska/kaydasheva",
+            "meta_file_train": "metadata.csv",
+            "ununsed_speakers": null,
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "name": "ljspeech",
+            "path": "./Data/uk_UK/by_book/female/sumska/mykola_djerya",
+            "meta_file_train": "metadata.csv",
+            "ununsed_speakers": null,
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": "",
+    "lr_scheduler_params": {},
+    "test_sentences": [
+        "\u0413\u043e\u0432\u043e\u0440\u0438 \u043d\u0456\u0431\u0438 \u0442\u0438 \u0436\u0438\u0432\u0438\u0439!",
+        "\u041f\u043e\u043b \u043f\u0435\u0440\u0435\u0442\u043d\u0443\u0432 \u043f\u0443\u0441\u0442\u0435\u043b\u044e",
+        "\u041f\u0440\u0438\u0432\u0456\u0442, \u0441\u0432\u0456\u0442\u0435!"
+    ],
+    "model_args": {
+        "num_chars": 86,
+        "out_channels": 513,
+        "spec_segment_size": 24,
+        "hidden_channels": 192,
+        "hidden_channels_ffn_text_encoder": 768,
+        "num_heads_text_encoder": 2,
+        "num_layers_text_encoder": 6,
+        "kernel_size_text_encoder": 3,
+        "dropout_p_text_encoder": 0.1,
+        "dropout_p_duration_predictor": 0.5,
+        "kernel_size_posterior_encoder": 5,
+        "dilation_rate_posterior_encoder": 1,
+        "num_layers_posterior_encoder": 16,
+        "kernel_size_flow": 5,
+        "dilation_rate_flow": 1,
+        "num_layers_flow": 4,
+        "resblock_type_decoder": "1",
+        "resblock_kernel_sizes_decoder": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes_decoder": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates_decoder": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel_decoder": 512,
+        "upsample_kernel_sizes_decoder": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "use_sdp": true,
+        "noise_scale": 1.0,
+        "inference_noise_scale": 0.667,
+        "length_scale": 1.0,
+        "noise_scale_dp": 1.0,
+        "inference_noise_scale_dp": 1.0,
+        "max_inference_len": null,
+        "init_discriminator": true,
+        "use_spectral_norm_disriminator": false,
+        "use_speaker_embedding": false,
+        "num_speakers": 0,
+        "speakers_file": null,
+        "speaker_embedding_channels": 256,
+        "use_d_vector_file": false,
+        "d_vector_file": null,
+        "d_vector_dim": 0,
+        "detach_dp_input": true
+    },
+    "grad_clip": [
+        1000.0,
+        1000.0
+    ],
+    "lr_gen": 0.0002,
+    "lr_disc": 0.0002,
+    "lr_scheduler_gen": "ExponentialLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "lr_scheduler_disc": "ExponentialLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "kl_loss_alpha": 1.0,
+    "disc_loss_alpha": 1.0,
+    "gen_loss_alpha": 1.0,
+    "feat_loss_alpha": 1.0,
+    "mel_loss_alpha": 45.0,
+    "dur_loss_alpha": 1.0,
+    "return_wav": true,
+    "r": 1,
+    "num_speakers": 0,
+    "use_speaker_embedding": false,
+    "speakers_file": null,
+    "speaker_embedding_channels": 256,
+    "use_d_vector_file": false,
+    "d_vector_file": null,
+    "d_vector_dim": 0
 }