te-ch committed
Commit d9c1056
Parents: 23ce701 0ff4340

removed junk 2

app.py CHANGED
@@ -39,7 +39,6 @@ iface = gr.Interface(
  value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
  ),
  gr.Dropdown(label="dialect", choices="")
-
  ],
  outputs=[
  gr.Markdown(label="Fonemes")
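
For context, a minimal sketch of the Gradio interface this hunk touches, assuming a hypothetical get_phonemes callback (only the Textbox default, the dialect Dropdown and the "Fonemes" Markdown output are taken from the diff; the rest is illustrative):

import gradio as gr

def get_phonemes(text, dialect):
    # Hypothetical stand-in for the app's real callback, which phonemizes `text`
    # for the selected dialect.
    return text

iface = gr.Interface(
    fn=get_phonemes,
    inputs=[
        gr.Textbox(value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila."),
        gr.Dropdown(label="dialect", choices=""),
    ],
    outputs=[gr.Markdown(label="Fonemes")],
)
iface.launch()
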
models/bsc/best_model.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b15fa7d2052bada1cf421e49d2d03b00e95b49fcd0e42b7af1d92da2880cdecc
- size 1038659133
models/bsc/config.json DELETED
@@ -1,262 +0,0 @@
- {
- "output_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/experiments_from_previous",
- "logger_uri": null,
- "run_name": "multispeaker_vits_ca_1e4_1e4_32",
- "project_name": null,
- "run_description": "\ud83d\udc38Coqui trainer run.",
- "print_step": 25,
- "plot_step": 100,
- "model_param_stats": false,
- "wandb_entity": null,
- "dashboard_logger": "tensorboard",
- "log_model_step": 1000,
- "save_step": 1000,
- "save_n_checkpoints": 5,
- "save_checkpoints": true,
- "save_all_best": true,
- "save_best_after": 10000,
- "target_loss": null,
- "print_eval": true,
- "test_delay_epochs": -1,
- "run_eval": true,
- "run_eval_steps": null,
- "distributed_backend": "nccl",
- "distributed_url": "tcp://localhost:54321",
- "mixed_precision": false,
- "epochs": 1000,
- "batch_size": 16,
- "eval_batch_size": 8,
- "grad_clip": [
- 1000.0,
- 1000.0
- ],
- "scheduler_after_epoch": true,
- "lr": 0.001,
- "optimizer": "AdamW",
- "optimizer_params": {
- "betas": [
- 0.8,
- 0.99
- ],
- "eps": 1e-09,
- "weight_decay": 0.01
- },
- "lr_scheduler": "",
- "lr_scheduler_params": null,
- "use_grad_scaler": false,
- "cudnn_enable": true,
- "cudnn_deterministic": false,
- "cudnn_benchmark": false,
- "training_seed": 54321,
- "model": "vits",
- "num_loader_workers": 4,
- "num_eval_loader_workers": 4,
- "use_noise_augment": false,
- "audio": {
- "fft_size": 1024,
- "sample_rate": 22050,
- "win_length": 1024,
- "hop_length": 256,
- "num_mels": 80,
- "mel_fmin": 0,
- "mel_fmax": null
- },
- "use_phonemes": true,
- "phonemizer": "espeak",
- "phoneme_language": "ca",
- "compute_input_seq_cache": true,
- "text_cleaner": "multilingual_cleaners",
- "enable_eos_bos_chars": false,
- "test_sentences_file": "",
- "phoneme_cache_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/phoneme_cache",
- "characters": {
- "characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
- "vocab_dict": null,
- "pad": "<PAD>",
- "eos": "<EOS>",
- "bos": "<BOS>",
- "blank": "<BLNK>",
- "characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
- "punctuations": "!'(),-.:;? ",
- "phonemes": null,
- "is_unique": false,
- "is_sorted": true
- },
- "add_blank": true,
- "batch_group_size": 5,
- "loss_masking": null,
- "min_audio_len": 1,
- "max_audio_len": Infinity,
- "min_text_len": 1,
- "max_text_len": 325,
- "compute_f0": false,
- "compute_linear_spec": true,
- "precompute_num_workers": 0,
- "start_by_longest": false,
- "datasets": [
- {
- "formatter": "vctk_old",
- "dataset_name": "vctk_old",
- "path": "/gpfs/scratch/bsc88/bsc88474/data/multispeaker_ca",
- "meta_file_train": "",
- "ignored_speakers": [
- "uri",
- "09796",
- "05450"
- ],
- "language": "ca",
- "meta_file_val": "",
- "meta_file_attn_mask": ""
- }
- ],
- "test_sentences": [
- [
- "Per exemple, dels nostres bancs que inverteixen en armament de les nostres empreses."
- ],
- [
- "Preguntin-se si aix\u00f2 era necessari."
- ],
- [
- "La suposada ocultaci\u00f3 dels informes que advertien de risc s\u00edsmic."
- ],
- [
- "\u00c9s de 633 milions d'euros quan es far\u00e0 la publicaci\u00f3 detallada."
- ]
- ],
- "eval_split_max_size": null,
- "eval_split_size": 0.01,
- "use_speaker_weighted_sampler": false,
- "speaker_weighted_sampler_alpha": 1.0,
- "use_language_weighted_sampler": false,
- "language_weighted_sampler_alpha": 1.0,
- "use_length_weighted_sampler": false,
- "length_weighted_sampler_alpha": 1.0,
- "model_args": {
- "num_chars": 131,
- "out_channels": 513,
- "spec_segment_size": 32,
- "hidden_channels": 192,
- "hidden_channels_ffn_text_encoder": 768,
- "num_heads_text_encoder": 2,
- "num_layers_text_encoder": 6,
- "kernel_size_text_encoder": 3,
- "dropout_p_text_encoder": 0.1,
- "dropout_p_duration_predictor": 0.5,
- "kernel_size_posterior_encoder": 5,
- "dilation_rate_posterior_encoder": 1,
- "num_layers_posterior_encoder": 16,
- "kernel_size_flow": 5,
- "dilation_rate_flow": 1,
- "num_layers_flow": 4,
- "resblock_type_decoder": "1",
- "resblock_kernel_sizes_decoder": [
- 3,
- 7,
- 11
- ],
- "resblock_dilation_sizes_decoder": [
- [
- 1,
- 3,
- 5
- ],
- [
- 1,
- 3,
- 5
- ],
- [
- 1,
- 3,
- 5
- ]
- ],
- "upsample_rates_decoder": [
- 8,
- 8,
- 2,
- 2
- ],
- "upsample_initial_channel_decoder": 512,
- "upsample_kernel_sizes_decoder": [
- 16,
- 16,
- 4,
- 4
- ],
- "periods_multi_period_discriminator": [
- 2,
- 3,
- 5,
- 7,
- 11
- ],
- "use_sdp": true,
- "noise_scale": 1.0,
- "inference_noise_scale": 0.667,
- "length_scale": 1.0,
- "noise_scale_dp": 1.0,
- "inference_noise_scale_dp": 1.0,
- "max_inference_len": null,
- "init_discriminator": true,
- "use_spectral_norm_disriminator": false,
- "use_speaker_embedding": true,
- "num_speakers": 257,
- "speakers_file": "/home/user/app/models/bsc/speakers.pth",
- "d_vector_file": null,
- "speaker_embedding_channels": 256,
- "use_d_vector_file": false,
- "d_vector_dim": 0,
- "detach_dp_input": true,
- "use_language_embedding": false,
- "embedded_language_dim": 4,
- "num_languages": 0,
- "language_ids_file": null,
- "use_speaker_encoder_as_loss": false,
- "speaker_encoder_config_path": "",
- "speaker_encoder_model_path": "",
- "condition_dp_on_speaker": true,
- "freeze_encoder": false,
- "freeze_DP": false,
- "freeze_PE": false,
- "freeze_flow_decoder": false,
- "freeze_waveform_decoder": false,
- "encoder_sample_rate": null,
- "interpolate_z": true,
- "reinit_DP": false,
- "reinit_text_encoder": false
- },
- "lr_gen": 0.0001,
- "lr_disc": 0.0001,
- "lr_scheduler_gen": "ExponentialLR",
- "lr_scheduler_gen_params": {
- "gamma": 0.999875,
- "last_epoch": -1
- },
- "lr_scheduler_disc": "ExponentialLR",
- "lr_scheduler_disc_params": {
- "gamma": 0.999875,
- "last_epoch": -1
- },
- "kl_loss_alpha": 1.0,
- "disc_loss_alpha": 1.0,
- "gen_loss_alpha": 1.0,
- "feat_loss_alpha": 1.0,
- "mel_loss_alpha": 45.0,
- "dur_loss_alpha": 1.0,
- "speaker_encoder_loss_alpha": 1.0,
- "return_wav": true,
- "use_weighted_sampler": false,
- "weighted_sampler_attrs": null,
- "weighted_sampler_multipliers": null,
- "r": 1,
- "num_speakers": 257,
- "use_speaker_embedding": true,
- "speakers_file": "/home/user/app/models/bsc/speakers.pth",
- "speaker_embedding_channels": 256,
- "language_ids_file": null,
- "use_language_embedding": false,
- "use_d_vector_file": false,
- "d_vector_file": null,
- "d_vector_dim": 0
- }
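
A rough sketch of how a checkpoint/config/speakers trio like the deleted models/bsc files can be loaded with Coqui TTS's Synthesizer (an assumption about usage, not the Space's actual loading code; the test sentence is one of the config's own test_sentences and "05739" is an id taken from speaker_map.json below):

from TTS.utils.synthesizer import Synthesizer

synth = Synthesizer(
    tts_checkpoint="models/bsc/best_model.pth",
    tts_config_path="models/bsc/config.json",
    tts_speakers_file="models/bsc/speakers.pth",
    use_cuda=False,
)
wav = synth.tts("Preguntin-se si això era necessari.", speaker_name="05739")
synth.save_wav(wav, "bsc_vits.wav")
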
models/bsc/speaker_map.json DELETED
@@ -1,10 +0,0 @@
- {
- "f_cen_05": "05739",
- "f_cen_81": "8162d651b6211f06f655a69cd7fdd383d6b4287e9ba132b9898ef9ac8687349e777626333d23bed93f9264aae965efb14ed650cb64fd0ad90494aff903eaef11",
- "f_occ_31": "31535cb2ece4710d08fdbeefb6f8f75ed093fee4cf8573bd601d960f8c6156f0fd0a85712761691e86e31160b993ee0eacb10c4c8aed000cc394cf7c7d207a7e",
- "f_occ_de": "dee065b956b99b10db4763759d64c41791af1a7e77f1864f90a2b0847a12633dcf9bc108db7eaf73cc8d0e750f5c37383a56cd77cc2276d3960104c6bebe6346",
- "f_sep_31": "31e6f3a011661320b2e59b6f8be43f6db2243e9feabc2b9787c1413788e13eb0e5810bed983bf7ff66e46417d183a91ed50b3b9be9d89e4f51aada72293b9881",
- "m_cen_08": "08935",
- "m_occ_44": "30b1f81c579755895581259d79a8a5a3ca45b908b0bd14ad1c6418f39aa1e2f47cb4749c69b5440cdb92e3bafb772e19e7bc2b16d196b061addd173a1309e491",
- "m_val_89": "896256329fbeb5b8116349c31d8a39a7d36d5f970d48558e1db5417d611e240e4dbf473f6e49137f7aa6116394b7deabb0bbec4a014896cdc9484ee91458117d"
- }
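
The map presumably ties human-readable voice names to the speaker ids stored in speakers.pth; a minimal lookup sketch (assumed usage, the app's own code may differ):

import json

with open("models/bsc/speaker_map.json", encoding="utf-8") as f:
    speaker_map = json.load(f)

speaker_id = speaker_map["f_cen_05"]  # -> "05739", usable as a speaker_name
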
models/bsc/speakers.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6dacda0b8dd3e111c5072f8f33c08b4a29b92ac79aaf22ceca912d01e7deb905
- size 30191
models/collectivat/catotron-ona-TTS-API-entry.json DELETED
@@ -1,10 +0,0 @@
- {
- "voice": "ona-fast-hifigan",
- "lang": "ca",
- "model_type": "coqui",
- "tts_config_path": "fast-speech_config.json",
- "tts_model_path": "fast-speech_best_model.pth",
- "vocoder_config_path": "ljspeech--hifigan_v2_config.json",
- "vocoder_model_path": "ljspeech--hifigan_v2_model_file.pth",
- "load": true
- }
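
A sketch of the wiring this entry describes for a "coqui" model type: the fast-speech acoustic model plus the LJSpeech HiFiGAN v2 vocoder, loaded here through Coqui's Synthesizer as an assumption rather than the original API server code (the test sentence comes from fast-speech_config.json below):

from TTS.utils.synthesizer import Synthesizer

synth = Synthesizer(
    tts_checkpoint="models/collectivat/fast-speech_best_model.pth",
    tts_config_path="models/collectivat/fast-speech_config.json",
    vocoder_checkpoint="models/collectivat/ljspeech--hifigan_v2_model_file.pth",
    vocoder_config="models/collectivat/ljspeech--hifigan_v2_config.json",
    use_cuda=False,
)
wav = synth.tts("Hola Barcelona!")
synth.save_wav(wav, "catotron_ona.wav")
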
models/collectivat/fast-speech_best_model.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3a5aefb9f49f6172e34b816e1de8f5234012f0a9a05747973f6610e40869983f
- size 457921637
models/collectivat/fast-speech_config.json DELETED
@@ -1,213 +0,0 @@
- {
- "output_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron",
- "logger_uri": null,
- "run_name": "fast_pitch_ljspeech",
- "project_name": null,
- "run_description": "\ud83d\udc38Coqui trainer run.",
- "print_step": 50,
- "plot_step": 100,
- "model_param_stats": false,
- "wandb_entity": null,
- "dashboard_logger": "tensorboard",
- "log_model_step": null,
- "save_step": 10000,
- "save_n_checkpoints": 5,
- "save_checkpoints": true,
- "save_all_best": false,
- "save_best_after": 1000,
- "target_loss": null,
- "print_eval": false,
- "test_delay_epochs": -1,
- "run_eval": true,
- "run_eval_steps": null,
- "distributed_backend": "nccl",
- "distributed_url": "tcp://localhost:54321",
- "mixed_precision": false,
- "epochs": 1000,
- "batch_size": 16,
- "eval_batch_size": 16,
- "grad_clip": 5.0,
- "scheduler_after_epoch": true,
- "lr": 0.0001,
- "optimizer": "Adam",
- "optimizer_params": {
- "betas": [
- 0.9,
- 0.998
- ],
- "weight_decay": 1e-06
- },
- "lr_scheduler": "NoamLR",
- "lr_scheduler_params": {
- "warmup_steps": 4000
- },
- "use_grad_scaler": false,
- "cudnn_enable": true,
- "cudnn_deterministic": false,
- "cudnn_benchmark": false,
- "training_seed": 54321,
- "model": "fast_pitch",
- "num_loader_workers": 8,
- "num_eval_loader_workers": 4,
- "use_noise_augment": false,
- "audio": {
- "fft_size": 1024,
- "win_length": 1024,
- "hop_length": 256,
- "frame_shift_ms": null,
- "frame_length_ms": null,
- "stft_pad_mode": "reflect",
- "sample_rate": 22050,
- "resample": false,
- "preemphasis": 0.0,
- "ref_level_db": 20,
- "do_sound_norm": false,
- "log_func": "np.log",
- "do_trim_silence": true,
- "trim_db": 60.0,
- "do_rms_norm": false,
- "db_level": null,
- "power": 1.5,
- "griffin_lim_iters": 60,
- "num_mels": 80,
- "mel_fmin": 0.0,
- "mel_fmax": 8000,
- "spec_gain": 1.0,
- "do_amp_to_db_linear": true,
- "do_amp_to_db_mel": true,
- "pitch_fmax": 640.0,
- "pitch_fmin": 0.0,
- "signal_norm": false,
- "min_level_db": -100,
- "symmetric_norm": true,
- "max_norm": 4.0,
- "clip_norm": true,
- "stats_path": null
- },
- "use_phonemes": false,
- "phonemizer": null,
- "phoneme_language": "ca-es",
- "compute_input_seq_cache": true,
- "text_cleaner": "multilingual_cleaners",
- "enable_eos_bos_chars": false,
- "test_sentences_file": "",
- "phoneme_cache_path": null,
- "characters": {
- "characters_class": "TTS.tts.utils.text.characters.Graphemes",
- "vocab_dict": null,
- "pad": "_",
- "eos": "*",
- "bos": "^",
- "blank": null,
- "characters": "A\u00c0\u00c1BC\u00c7DE\u00c9\u00c8FGHI\u00cd\u00cfJKLMNO\u00d3\u00d2PQRSTU\u00dc\u00daVWXYZa\u00e0\u00e1bc\u00e7de\u00e9\u00e8fghi\u00ed\u00efjklmno\u00f3\u00f2pqrstu\u00fc\u00favwxyz",
- "punctuations": "!'(),-.:;?\u00b7 ",
- "phonemes": "",
- "is_unique": true,
- "is_sorted": true
- },
- "add_blank": false,
- "batch_group_size": 0,
- "loss_masking": null,
- "min_audio_len": 1,
- "max_audio_len": Infinity,
- "min_text_len": 1,
- "max_text_len": Infinity,
- "compute_f0": true,
- "compute_linear_spec": false,
- "precompute_num_workers": 4,
- "start_by_longest": false,
- "datasets": [
- {
- "name": "custom_turkish",
- "path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
- "meta_file_train": "upc_ona_train.txt",
- "ignored_speakers": null,
- "language": "",
- "meta_file_val": "",
- "meta_file_attn_mask": ""
- },
- {
- "name": "custom_turkish",
- "path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
- "meta_file_train": "upc_ona_val.txt",
- "ignored_speakers": null,
- "language": "",
- "meta_file_val": "",
- "meta_file_attn_mask": ""
- }
- ],
- "test_sentences": [
- "Hola Barcelona!",
- "Escriviu al text."
- ],
- "eval_split_max_size": null,
- "eval_split_size": 0.01,
- "use_speaker_weighted_sampler": false,
- "speaker_weighted_sampler_alpha": 1.0,
- "use_language_weighted_sampler": false,
- "language_weighted_sampler_alpha": 1.0,
- "use_length_weighted_sampler": false,
- "length_weighted_sampler_alpha": 1.0,
- "base_model": "forward_tts",
- "model_args": {
- "num_chars": 89,
- "out_channels": 80,
- "hidden_channels": 384,
- "use_aligner": true,
- "use_pitch": true,
- "pitch_predictor_hidden_channels": 256,
- "pitch_predictor_kernel_size": 3,
- "pitch_predictor_dropout_p": 0.1,
- "pitch_embedding_kernel_size": 3,
- "duration_predictor_hidden_channels": 256,
- "duration_predictor_kernel_size": 3,
- "duration_predictor_dropout_p": 0.1,
- "positional_encoding": true,
- "poisitonal_encoding_use_scale": true,
- "length_scale": 1,
- "encoder_type": "fftransformer",
- "encoder_params": {
- "hidden_channels_ffn": 1024,
- "num_heads": 1,
- "num_layers": 6,
- "dropout_p": 0.1
- },
- "decoder_type": "fftransformer",
- "decoder_params": {
- "hidden_channels_ffn": 1024,
- "num_heads": 1,
- "num_layers": 6,
- "dropout_p": 0.1
- },
- "detach_duration_predictor": false,
- "max_duration": 75,
- "num_speakers": 1,
- "use_speaker_embedding": false,
- "speakers_file": null,
- "use_d_vector_file": false,
- "d_vector_dim": null,
- "d_vector_file": null
- },
- "num_speakers": 0,
- "speakers_file": null,
- "use_speaker_embedding": false,
- "use_d_vector_file": false,
- "d_vector_file": false,
- "d_vector_dim": 0,
- "spec_loss_type": "mse",
- "duration_loss_type": "mse",
- "use_ssim_loss": true,
- "ssim_loss_alpha": 1.0,
- "spec_loss_alpha": 1.0,
- "aligner_loss_alpha": 1.0,
- "pitch_loss_alpha": 0.1,
- "dur_loss_alpha": 0.1,
- "binary_align_loss_alpha": 0.1,
- "binary_loss_warmup_epochs": 150,
- "min_seq_len": 13,
- "max_seq_len": 500000,
- "r": 1,
- "f0_cache_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/f0_cache",
- "restore_path": "/home/twbgmy/.local/share/tts/tts_models--en--ljspeech--fast_pitch/model_file.pth",
- "github_branch": "* dev"
- }
models/collectivat/ljspeech--hifigan_v2_config.json DELETED
@@ -1,158 +0,0 @@
- {
- "run_name": "hifigan",
- "run_description": "universal hifigan trained on LibriTTS with no spectrogram normalization and using log() for scaling instead of log10()",
-
-
- // AUDIO PARAMETERS
- "audio":{
- "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
- "win_length": 1024, // stft window length in ms.
- "hop_length": 256, // stft window hop-lengh in ms.
- "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
- "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
-
- // Audio processing parameters
- "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
- "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
- "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
- "log_func": "np.log",
-
- // Silence trimming
- "do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
- "trim_db": 60, // threshold for timming silence. Set this according to your dataset.
-
- // MelSpectrogram parameters
- "num_mels": 80, // size of the mel spec frame.
- "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
- "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
- "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram.
-
- // Normalization parameters
- "signal_norm": false, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
- "min_level_db": -100, // lower bound for normalization
- "symmetric_norm": true, // move normalization to range [-1, 1]
- "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
- "clip_norm": true, // clip normalized values into the range.
- "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
- },
-
- // DISTRIBUTED TRAINING
- "distributed":{
- "backend": "nccl",
- "url": "tcp:\/\/localhost:54324"
- },
-
- // MODEL PARAMETERS
- "use_pqmf": false,
-
- // LOSS PARAMETERS
- "use_stft_loss": false,
- "use_subband_stft_loss": false,
- "use_mse_gan_loss": true,
- "use_hinge_gan_loss": false,
- "use_feat_match_loss": true, // use only with melgan discriminators
- "use_l1_spec_loss": true,
-
- // loss weights
- "stft_loss_weight": 0,
- "subband_stft_loss_weight": 0,
- "mse_G_loss_weight": 1,
- "hinge_G_loss_weight": 0,
- "feat_match_loss_weight": 10,
- "l1_spec_loss_weight": 45,
-
- // multiscale stft loss parameters
- // "stft_loss_params": {
- // "n_ffts": [1024, 2048, 512],
- // "hop_lengths": [120, 240, 50],
- // "win_lengths": [600, 1200, 240]
- // },
-
- "l1_spec_loss_params": {
- "use_mel": true,
- "sample_rate": 16000,
- "n_fft": 1024,
- "hop_length": 256,
- "win_length": 1024,
- "n_mels": 80,
- "mel_fmin": 0.0,
- "mel_fmax": null
- },
-
- "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
-
- // DISCRIMINATOR
- "discriminator_model": "hifigan_discriminator",
- //"discriminator_model_params":{
- // "peroids": [2, 3, 5, 7, 11],
- // "base_channels": 16,
- // "max_channels":512,
- // "downsample_factors":[4, 4, 4]
- //},
- "steps_to_start_discriminator": 0, // steps required to start GAN trainining.1
-
- // GENERATOR
- "generator_model": "hifigan_generator",
- "generator_model_params": {
- "resblock_type": "1",
- "upsample_factors": [8,8,2,2],
- "upsample_kernel_sizes": [16,16,4,4],
- "upsample_initial_channel": 128,
- "resblock_kernel_sizes": [3,7,11],
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]]
- },
-
- // DATASET
- "data_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/vo_voice_quality_transformation/",
- "feature_path": null,
- // "feature_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA/",
- "seq_len": 8192,
- "pad_short": 2000,
- "conv_pad": 0,
- "use_noise_augment": false,
- "use_cache": true,
- "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
-
- // TRAINING
- "batch_size": 16, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-
- // VALIDATION
- "run_eval": true,
- "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
- "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
-
- // OPTIMIZER
- "epochs": 10000, // total number of epochs to train.
- "wd": 0.0, // Weight decay weight.
- "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
- "disc_clip_grad": -1, // Discriminator gradient clipping threshold.
- // "lr_scheduler_gen": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
- // "lr_scheduler_gen_params": {
- // "gamma": 0.999,
- // "last_epoch": -1
- // },
- // "lr_scheduler_disc": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
- // "lr_scheduler_disc_params": {
- // "gamma": 0.999,
- // "last_epoch": -1
- // },
- "lr_gen": 0.00001, // Initial learning rate. If Noam decay is active, maximum learning rate.
- "lr_disc": 0.00001,
-
- // TENSORBOARD and LOGGING
- "print_step": 25, // Number of steps to log traning on console.
- "print_eval": false, // If True, it prints loss values for each step in eval run.
- "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
- "checkpoint": true, // If true, it saves checkpoints per "save_step"
- "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-
- // DATA LOADING
- "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
- "num_val_loader_workers": 4, // number of evaluation data loader processes.
- "eval_split_size": 10,
-
- // PATHS
- "output_path": "/home/erogol/gdrive/Trainings/sam/"
- }
-
-
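
Unlike the other configs in this commit, this vocoder config is JSON with //-style comments, so a strict json.loads() rejects it. A rough comment-stripping sketch (Coqui's own config reader does something similar but more carefully; the regex assumes no unescaped "//" inside string values, which holds for this file):

import json
import re

with open("models/collectivat/ljspeech--hifigan_v2_config.json", encoding="utf-8") as f:
    raw = f.read()

config = json.loads(re.sub(r"//.*", "", raw))  # drop line comments, then parse
print(config["generator_model_params"]["upsample_factors"])  # [8, 8, 2, 2]
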
models/collectivat/ljspeech--hifigan_v2_model_file.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4047e93886faa1aba11948efa71f59dcb0ec9117e286660e59b91892ef98d129
- size 3794153
models/mms/G_100000.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0382edd70333f8ddc663177e672c8a66312e1b30f7929a8f9d458ef66f6b5349
- size 436622793
models/mms/config.json DELETED
@@ -1,87 +0,0 @@
- {
- "train": {
- "log_interval": 200,
- "eval_interval": 1000,
- "seed": 1234,
- "epochs": 20000,
- "learning_rate": 0.0002,
- "betas": [
- 0.8,
- 0.99
- ],
- "eps": 1e-09,
- "batch_size": 64,
- "fp16_run": true,
- "lr_decay": 0.999875,
- "segment_size": 8192,
- "init_lr_ratio": 1,
- "warmup_epochs": 0,
- "c_mel": 45,
- "c_kl": 1.0
- },
- "data": {
- "training_files": "train.ltr",
- "validation_files": "dev.ltr",
- "text_cleaners": [
- "transliteration_cleaners"
- ],
- "max_wav_value": 32768.0,
- "sampling_rate": 16000,
- "filter_length": 1024,
- "hop_length": 256,
- "win_length": 1024,
- "n_mel_channels": 80,
- "mel_fmin": 0.0,
- "mel_fmax": null,
- "add_blank": true,
- "n_speakers": 0,
- "cleaned_text": true
- },
- "model": {
- "inter_channels": 192,
- "hidden_channels": 192,
- "filter_channels": 768,
- "n_heads": 2,
- "n_layers": 6,
- "kernel_size": 3,
- "p_dropout": 0.1,
- "resblock": "1",
- "resblock_kernel_sizes": [
- 3,
- 7,
- 11
- ],
- "resblock_dilation_sizes": [
- [
- 1,
- 3,
- 5
- ],
- [
- 1,
- 3,
- 5
- ],
- [
- 1,
- 3,
- 5
- ]
- ],
- "upsample_rates": [
- 8,
- 8,
- 2,
- 2
- ],
- "upsample_initial_channel": 512,
- "upsample_kernel_sizes": [
- 16,
- 16,
- 4,
- 4
- ],
- "n_layers_q": 3,
- "use_spectral_norm": false
- }
- }
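
G_100000.pth above is, by its naming and this config, a VITS generator checkpoint in the Meta MMS style; a minimal inspection sketch (for illustration only, not how the app consumes it):

import json

with open("models/mms/config.json", encoding="utf-8") as f:
    hps = json.load(f)

print(hps["data"]["sampling_rate"])    # 16000
print(hps["model"]["inter_channels"])  # 192
print(hps["model"]["upsample_rates"])  # [8, 8, 2, 2]
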
models/mms/vocab.txt DELETED
@@ -1,39 +0,0 @@
- z
- f
- i
- g
- m
- o
- r
- è
- h
- l
- v
- à
- u
- d
- ú
- ç
- p
- s
- '
- é
- _
- -
- e
- a
-
- x
- ü
- q
- t
- b
- í
- ó
- ï
- ò
-
- c
- j
- n
- y
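
vocab.txt lists one symbol per line (two lines render blank above, presumably whitespace symbols); a sketch of the assumed character-to-id mapping used by MMS/VITS-style text front-ends:

with open("models/mms/vocab.txt", encoding="utf-8") as f:
    # Strip only the newline: blank-looking lines are still real symbols here.
    symbols = [line.rstrip("\n") for line in f]

symbol_to_id = {s: i for i, s in enumerate(symbols)}
print(symbol_to_id["a"], symbol_to_id["ç"])
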
models/piper/MODEL_CARD DELETED
@@ -1,15 +0,0 @@
- # Model card for upc_ona (x-low)
-
- * Language: ca (Catalan)
- * Speakers: 1
- * Quality: x-low
- * Samplerate: 16,000Hz
-
- ## Dataset
-
- * URL: https://collectivat.cat/asr#upc-festcat-tts-corpora
- * License: CC BY-SA 3.0 ES
-
- ## Training
-
- Trained from scratch.
models/piper/ca-upc_ona-x-low.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:13661d26423e0c791823823a5971f4e1aaf644a62e65e0e94d299c0e70560e14
- size 20628813
models/piper/ca-upc_ona-x-low.onnx.json DELETED
@@ -1,409 +0,0 @@
- {
- "audio": {
- "sample_rate": 16000
- },
- "espeak": {
- "voice": "ca"
- },
- "inference": {
- "noise_scale": 0.667,
- "length_scale": 1,
- "noise_w": 0.8
- },
- "phoneme_map": {},
- "phoneme_id_map": {
- "_": [
- 0
- ],
- "^": [
- 1
- ],
- "$": [
- 2
- ],
- " ": [
- 3
- ],
- "!": [
- 4
- ],
- "'": [
- 5
- ],
- "(": [
- 6
- ],
- ")": [
- 7
- ],
- ",": [
- 8
- ],
- "-": [
- 9
- ],
- ".": [
- 10
- ],
- ":": [
- 11
- ],
- ";": [
- 12
- ],
- "?": [
- 13
- ],
- "a": [
- 14
- ],
- "b": [
- 15
- ],
- "c": [
- 16
- ],
- "d": [
- 17
- ],
- "e": [
- 18
- ],
- "f": [
- 19
- ],
- "h": [
- 20
- ],
- "i": [
- 21
- ],
- "j": [
- 22
- ],
- "k": [
- 23
- ],
- "l": [
- 24
- ],
- "m": [
- 25
- ],
- "n": [
- 26
- ],
- "o": [
- 27
- ],
- "p": [
- 28
- ],
- "q": [
- 29
- ],
- "r": [
- 30
- ],
- "s": [
- 31
- ],
- "t": [
- 32
- ],
- "u": [
- 33
- ],
- "v": [
- 34
- ],
- "w": [
- 35
- ],
- "x": [
- 36
- ],
- "y": [
- 37
- ],
- "z": [
- 38
- ],
- "æ": [
- 39
- ],
- "ç": [
- 40
- ],
- "ð": [
- 41
- ],
- "ø": [
- 42
- ],
- "ħ": [
- 43
- ],
- "ŋ": [
- 44
- ],
- "œ": [
- 45
- ],
- "ǀ": [
- 46
- ],
- "ǁ": [
- 47
- ],
- "ǂ": [
- 48
- ],
- "ǃ": [
- 49
- ],
- "ɐ": [
- 50
- ],
- "ɑ": [
- 51
- ],
- "ɒ": [
- 52
- ],
- "ɓ": [
- 53
- ],
- "ɔ": [
- 54
- ],
- "ɕ": [
- 55
- ],
- "ɖ": [
- 56
- ],
- "ɗ": [
- 57
- ],
- "ɘ": [
- 58
- ],
- "ə": [
- 59
- ],
- "ɚ": [
- 60
- ],
- "ɛ": [
- 61
- ],
- "ɜ": [
- 62
- ],
- "ɞ": [
- 63
- ],
- "ɟ": [
- 64
- ],
- "ɠ": [
- 65
- ],
- "ɡ": [
- 66
- ],
- "ɢ": [
- 67
- ],
- "ɣ": [
- 68
- ],
- "ɤ": [
- 69
- ],
- "ɥ": [
- 70
- ],
- "ɦ": [
- 71
- ],
- "ɧ": [
- 72
- ],
- "ɨ": [
- 73
- ],
- "ɪ": [
- 74
- ],
- "ɫ": [
- 75
- ],
- "ɬ": [
- 76
- ],
- "ɭ": [
- 77
- ],
- "ɮ": [
- 78
- ],
- "ɯ": [
- 79
- ],
- "ɰ": [
- 80
- ],
- "ɱ": [
- 81
- ],
- "ɲ": [
- 82
- ],
- "ɳ": [
- 83
- ],
- "ɴ": [
- 84
- ],
- "ɵ": [
- 85
- ],
- "ɶ": [
- 86
- ],
- "ɸ": [
- 87
- ],
- "ɹ": [
- 88
- ],
- "ɺ": [
- 89
- ],
- "ɻ": [
- 90
- ],
- "ɽ": [
- 91
- ],
- "ɾ": [
- 92
- ],
- "ʀ": [
- 93
- ],
- "ʁ": [
- 94
- ],
- "ʂ": [
- 95
- ],
- "ʃ": [
- 96
- ],
- "ʄ": [
- 97
- ],
- "ʈ": [
- 98
- ],
- "ʉ": [
- 99
- ],
- "ʊ": [
- 100
- ],
- "ʋ": [
- 101
- ],
- "ʌ": [
- 102
- ],
- "ʍ": [
- 103
- ],
- "ʎ": [
- 104
- ],
- "ʏ": [
- 105
- ],
- "ʐ": [
- 106
- ],
- "ʑ": [
- 107
- ],
- "ʒ": [
- 108
- ],
- "ʔ": [
- 109
- ],
- "ʕ": [
- 110
- ],
- "ʘ": [
- 111
- ],
- "ʙ": [
- 112
- ],
- "ʛ": [
- 113
- ],
- "ʜ": [
- 114
- ],
- "ʝ": [
- 115
- ],
- "ʟ": [
- 116
- ],
- "ʡ": [
- 117
- ],
- "ʢ": [
- 118
- ],
- "ʲ": [
- 119
- ],
- "ˈ": [
- 120
- ],
- "ˌ": [
- 121
- ],
- "ː": [
- 122
- ],
- "ˑ": [
- 123
- ],
- "˞": [
- 124
- ],
- "β": [
- 125
- ],
- "θ": [
- 126
- ],
- "χ": [
- 127
- ],
- "ᵻ": [
- 128
- ],
- "ⱱ": [
- 129
- ]
- },
- "num_symbols": 130,
- "num_speakers": 1,
- "speaker_id_map": {}
- }
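
A rough sketch of how this sidecar config is consumed, simplified relative to Piper's real front-end (an assumption): phonemize with the espeak voice "ca", then map each IPA phoneme to its id before running the ONNX model. The phoneme string below is illustrative only.

import json

with open("models/piper/ca-upc_ona-x-low.onnx.json", encoding="utf-8") as f:
    cfg = json.load(f)

id_map = cfg["phoneme_id_map"]
phonemes = "ˈɔlə"  # illustrative espeak-style IPA for a short Catalan word
ids = [id_map[p][0] for p in phonemes]
print(ids)  # ids to feed (with BOS/EOS/pad handling) into ca-upc_ona-x-low.onnx
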
requirements.txt CHANGED
@@ -1,2 +1,2 @@
  gradio
- espeak-phonemizer>=1.1.0,<2
+ espeak-phonemizer>=1.1.0,<2
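
The espeak-phonemizer pin kept here is presumably what backs the "Fonemes" output in app.py; a minimal usage sketch (assumes the espeak-ng data for Catalan is installed on the system):

from espeak_phonemizer import Phonemizer

phonemizer = Phonemizer(default_voice="ca")
print(phonemizer.phonemize("L'Èlia i l'Alí a l'aula. L'oli i l'ou."))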