Yurii Paniv committed
Commit 316ae6b
1 Parent(s): dc248e1

Release 6.0.0 model

Files changed (7)
  1. .gitignore +1 -0
  2. README.md +10 -7
  3. app.py +6 -17
  4. config.yaml +139 -125
  5. requirements.txt +1 -1
  6. setup.py +2 -2
  7. ukrainian_tts/tts.py +7 -10
.gitignore CHANGED
@@ -135,6 +135,7 @@ dmypy.json
 *.pth.tar
 *.pth
 *.ark
+*.npz

 # gradio
 gradio_queue.db
README.md CHANGED
@@ -38,27 +38,30 @@ If you like my work, please support ❤️ -> [https://send.monobank.ua/jar/48iH
 You're welcome to join UA Speech Recognition and Synthesis community: [Telegram https://t.me/speech_recognition_uk](https://t.me/speech_recognition_uk)
 # Examples 🤖

-`Tetiana (female)`:
+`Oleksa (male)`:

-https://user-images.githubusercontent.com/5759207/224504324-d8236cad-7302-4dfd-9696-7a42b9f05fce.mp4
+https://github.com/robinhad/ukrainian-tts/assets/5759207/ace842ef-06d0-4b1f-ad49-5fda92999dbb


 <details>
 <summary>More voices 📢🤖</summary>

-`Dmytro (male)`:
+`Tetiana (female)`:

-https://user-images.githubusercontent.com/5759207/224504354-f84f74d3-fa46-497c-9604-4b63ba45989f.mp4
+https://github.com/robinhad/ukrainian-tts/assets/5759207/a6ecacf6-62ae-4fc5-b6d5-41e6cdd3d992

+`Dmytro (male)`:

-`Lada (female)`:
+https://github.com/robinhad/ukrainian-tts/assets/5759207/67d3dac9-6626-40ef-98e5-ec194096bbe0

-https://user-images.githubusercontent.com/5759207/224504360-ec198ac2-647c-4238-99ef-b6f074d633fd.mp4
+`Lada (female)`:

+https://github.com/robinhad/ukrainian-tts/assets/5759207/fcf558b2-3ff9-4539-ad9e-8455b52223a4

 `Mykyta (male)`:

-https://user-images.githubusercontent.com/5759207/224504363-0227e8bf-8c1c-49ad-8602-8cbf8feaa82b.mp4
+https://github.com/robinhad/ukrainian-tts/assets/5759207/033f5215-3f09-4021-ba19-1f55158445ca
+

 </details>

app.py CHANGED
@@ -43,6 +43,7 @@ class VoiceOption(Enum):
     Mykyta = "Микита (чоловічий) 👨"
     Lada = "Лада (жіночий) 👩"
     Dmytro = "Дмитро (чоловічий) 👨"
+    Oleksa = "Олекса (чоловічий) 👨"


 print(f"CUDA available? {is_available()}")
@@ -51,7 +52,7 @@ print(f"CUDA available? {is_available()}")
 ukr_tts = TTS(device="cuda" if is_available() else "cpu")


-def tts(text: str, voice: str, speed: float):
+def tts(text: str, voice: str):
     print("============================")
     print("Original text:", text)
     print("Voice", voice)
@@ -62,6 +63,7 @@ def tts(text: str, voice: str, speed: float):
         VoiceOption.Mykyta.value: Voices.Mykyta.value,
         VoiceOption.Lada.value: Voices.Lada.value,
         VoiceOption.Dmytro.value: Voices.Dmytro.value,
+        VoiceOption.Oleksa.value: Voices.Oleksa.value,
     }

     speaker_name = voice_mapping[voice]
@@ -72,11 +74,11 @@ def tts(text: str, voice: str, speed: float):

     if getenv("HF_API_TOKEN") is not None:
         log_queue.put(
-            [text, speaker_name, Stress.Dictionary.value, speed, str(datetime.utcnow())]
+            [text, speaker_name, Stress.Dictionary.value, 1, str(datetime.utcnow())]
         )

     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
-        _, text = ukr_tts.tts(text, speaker_name, Stress.Dictionary.value, fp, speed)
+        _, text = ukr_tts.tts(text, speaker_name, Stress.Dictionary.value, fp)
         return fp.name, text


@@ -97,9 +99,6 @@ iface = gr.Interface(
             choices=[option.value for option in VoiceOption],
             value=VoiceOption.Tetiana.value,
         ),
-        gr.components.Slider(
-            label="Швидкість", minimum=0.5, maximum=2, value=1, step=0.05
-        ),
     ],
     outputs=[
         gr.components.Audio(label="Output"),
@@ -112,32 +111,22 @@ iface = gr.Interface(
         [
             "Привіт, як тебе звати?",
             VoiceOption.Tetiana.value,
-            1,
         ],
         [
             "Введіть, будь ласка, св+оє реч+ення.",
             VoiceOption.Dmytro.value,
-            1,
-        ],
-        [
-            "Введіть, будь ласка, своє речення.",
-            VoiceOption.Dmytro.value,
-            1.3,
         ],
         [
             "Введіть, будь ласка, своє речення.",
-            VoiceOption.Mykyta.value,
-            1,
+            VoiceOption.Oleksa.value,
         ],
         [
             "Введіть, будь ласка, своє речення.",
             VoiceOption.Mykyta.value,
-            0.7,
         ],
         [
             "Договір підписано 4 квітня 1949 року.",
             VoiceOption.Lada.value,
-            0.9,
         ],
     ],
 )
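
For reference, a minimal sketch of how the simplified demo entry point behaves after this change: the speed slider is gone, so `tts()` takes only the text and a voice label. This is not part of the commit itself; importing `app` also constructs the Gradio interface and downloads the model, and the sample sentence is illustrative only.

# Minimal sketch of the simplified demo helper (assumes app.py's model download
# succeeds; importing app also builds the Gradio interface as a side effect).
from app import tts, VoiceOption

# 6.0.0 signature: tts(text, voice) - the speed argument was removed.
wav_path, accented_text = tts("Привіт, як тебе звати?", VoiceOption.Oleksa.value)
print(wav_path)       # path to a temporary .wav file
print(accented_text)  # input text with stress marks applied by the engine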
config.yaml CHANGED
@@ -1,11 +1,11 @@
-config: ./conf/tuning/train_vits.yaml
+config: ./conf/tuning/finetune_joint_tacotron2_hifigan.yaml
 print_config: false
 log_level: INFO
 dry_run: false
 iterator_type: sequence
-output_dir: exp/22k/tts_train_vits_raw_char
+output_dir: exp/22k/tts_finetune_joint_tacotron2_hifigan_raw_char
 ngpu: 1
-seed: 3407
+seed: 777
 num_workers: 4
 num_att_plot: 3
 dist_backend: nccl
@@ -24,7 +24,7 @@ cudnn_benchmark: false
 cudnn_deterministic: false
 collect_stats: false
 write_collected_feats: false
-max_epoch: 1000
+max_epoch: 140
 patience: null
 val_scheduler_criterion:
 - valid
@@ -34,10 +34,16 @@ early_stopping_criterion:
 - loss
 - min
 best_model_criterion:
+- - valid
+  - text2mel_loss
+  - min
+- - train
+  - text2mel_loss
+  - min
 - - train
   - total_count
   - max
-keep_nbest_models: 10
+keep_nbest_models: 5
 nbest_averaging_interval: 0
 grad_clip: -1
 grad_clip_type: 2.0
@@ -59,20 +65,23 @@ wandb_name: null
 wandb_model_log_interval: -1
 detect_anomaly: false
 pretrain_path: null
-init_param: []
+init_param:
+- exp/22k/tts_train_tacotron2_raw_char/train.loss.ave_5best.pth:tts:tts.generator.text2mel
+- exp/22k/ljspeech_hifigan.v1/generator.pth::tts.generator.vocoder
+- exp/22k/ljspeech_hifigan.v1/discriminator.pth::tts.discriminator
 ignore_init_mismatch: false
 freeze_param: []
 num_iters_per_epoch: null
 batch_size: 20
 valid_batch_size: null
-batch_bins: 1500000
+batch_bins: 1600000
 valid_batch_bins: null
 train_shape_file:
-- exp/22k/tts_stats_raw_linear_spectrogram_char/train/text_shape.char
-- exp/22k/tts_stats_raw_linear_spectrogram_char/train/speech_shape
+- exp/22k/tts_stats_raw_char/train/text_shape.char
+- exp/22k/tts_stats_raw_char/train/speech_shape
 valid_shape_file:
-- exp/22k/tts_stats_raw_linear_spectrogram_char/valid/text_shape.char
-- exp/22k/tts_stats_raw_linear_spectrogram_char/valid/speech_shape
+- exp/22k/tts_stats_raw_char/valid/text_shape.char
+- exp/22k/tts_stats_raw_char/valid/speech_shape
 batch_type: numel
 valid_batch_type: null
 fold_length:
@@ -110,29 +119,27 @@ max_cache_fd: 32
 valid_max_cache_size: null
 exclude_weight_decay: false
 exclude_weight_decay_conf: {}
-optim: adamw
+optim: adam
 optim_conf:
-    lr: 0.0002
+    lr: 1.25e-05
     betas:
-    - 0.8
-    - 0.99
-    eps: 1.0e-09
+    - 0.5
+    - 0.9
     weight_decay: 0.0
 scheduler: exponentiallr
 scheduler_conf:
     gamma: 0.999875
-optim2: adamw
+optim2: adam
 optim2_conf:
-    lr: 0.0002
+    lr: 1.25e-05
     betas:
-    - 0.8
-    - 0.99
-    eps: 1.0e-09
+    - 0.5
+    - 0.9
     weight_decay: 0.0
 scheduler2: exponentiallr
 scheduler2_conf:
     gamma: 0.999875
-generator_first: false
+generator_first: true
 token_list:
 - <blank>
 - <unk>
@@ -154,14 +161,13 @@ token_list:
 - к
 - м
 - п
-- .
 - я
 - з
 - ','
 - б
 - ь
-- ч
 - г
+- ч
 - й
 - ж
 - х
@@ -176,13 +182,12 @@ token_list:
 - '!'
 - ''''
 - ф
+- .
 - '"'
-- ':'
 - ґ
-- (
-- )
-- „
+- ':'
 - /
+- „
 - <sos/eos>
 odim: null
 model_conf: {}
@@ -192,54 +197,67 @@ bpemodel: null
 non_linguistic_symbols: null
 cleaner: null
 g2p: g2p_en
-feats_extract: linear_spectrogram
+feats_extract: fbank
 feats_extract_conf:
     n_fft: 1024
     hop_length: 256
     win_length: null
-normalize: null
-normalize_conf: {}
-tts: vits
+    fs: 22050
+    fmin: 80
+    fmax: 7600
+    n_mels: 80
+normalize: global_mvn
+normalize_conf:
+    stats_file: feats_stats.npz
+tts: joint_text2wav
 tts_conf:
-    generator_type: vits_generator
-    generator_params:
-        hidden_channels: 192
-        spks: -1
+    text2mel_type: tacotron2
+    text2mel_params:
+        embed_dim: 512
+        elayers: 1
+        eunits: 512
+        econv_layers: 3
+        econv_chans: 512
+        econv_filts: 5
+        atype: location
+        adim: 512
+        aconv_chans: 32
+        aconv_filts: 15
+        cumulate_att_w: true
+        dlayers: 2
+        dunits: 1024
+        prenet_layers: 2
+        prenet_units: 256
+        postnet_layers: 5
+        postnet_chans: 512
+        postnet_filts: 5
+        output_activation: null
+        use_batch_norm: true
+        use_concate: true
+        use_residual: false
         spk_embed_dim: 192
-        global_channels: 256
-        segment_size: 32
-        text_encoder_attention_heads: 2
-        text_encoder_ffn_expand: 4
-        text_encoder_blocks: 6
-        text_encoder_positionwise_layer_type: conv1d
-        text_encoder_positionwise_conv_kernel_size: 3
-        text_encoder_positional_encoding_layer_type: rel_pos
-        text_encoder_self_attention_layer_type: rel_selfattn
-        text_encoder_activation_type: swish
-        text_encoder_normalize_before: true
-        text_encoder_dropout_rate: 0.1
-        text_encoder_positional_dropout_rate: 0.0
-        text_encoder_attention_dropout_rate: 0.1
-        use_macaron_style_in_text_encoder: true
-        use_conformer_conv_in_text_encoder: false
-        text_encoder_conformer_kernel_size: -1
-        decoder_kernel_size: 7
-        decoder_channels: 512
-        decoder_upsample_scales:
-        - 8
-        - 8
-        - 2
-        - 2
-        decoder_upsample_kernel_sizes:
-        - 16
-        - 16
-        - 4
-        - 4
-        decoder_resblock_kernel_sizes:
-        - 3
-        - 7
-        - 11
-        decoder_resblock_dilations:
+        spk_embed_integration_type: add
+        dropout_rate: 0.5
+        zoneout_rate: 0.1
+        reduction_factor: 1
+        use_masking: true
+        bce_pos_weight: 10.0
+        use_guided_attn_loss: true
+        guided_attn_loss_sigma: 0.4
+        guided_attn_loss_lambda: 1.0
+        idim: 48
+        odim: 80
+    vocoder_type: hifigan_generator
+    vocoder_params:
+        bias: true
+        channels: 512
+        in_channels: 80
+        kernel_size: 7
+        nonlinear_activation: LeakyReLU
+        nonlinear_activation_params:
+            negative_slope: 0.1
+        out_channels: 1
+        resblock_dilations:
         - - 1
           - 3
           - 5
@@ -249,94 +267,90 @@ tts_conf:
         - - 1
           - 3
           - 5
-        use_weight_norm_in_decoder: true
-        posterior_encoder_kernel_size: 5
-        posterior_encoder_layers: 16
-        posterior_encoder_stacks: 1
-        posterior_encoder_base_dilation: 1
-        posterior_encoder_dropout_rate: 0.0
-        use_weight_norm_in_posterior_encoder: true
-        flow_flows: 4
-        flow_kernel_size: 5
-        flow_base_dilation: 1
-        flow_layers: 4
-        flow_dropout_rate: 0.0
-        use_weight_norm_in_flow: true
-        use_only_mean_in_flow: true
-        stochastic_duration_predictor_kernel_size: 3
-        stochastic_duration_predictor_dropout_rate: 0.5
-        stochastic_duration_predictor_flows: 4
-        stochastic_duration_predictor_dds_conv_layers: 3
-        vocabs: 50
-        aux_channels: 513
+        resblock_kernel_sizes:
+        - 3
+        - 7
+        - 11
+        upsample_kernel_sizes:
+        - 16
+        - 16
+        - 4
+        - 4
+        upsample_scales:
+        - 8
+        - 8
+        - 2
+        - 2
+        use_additional_convs: true
+        use_weight_norm: true
     discriminator_type: hifigan_multi_scale_multi_period_discriminator
     discriminator_params:
-        scales: 1
-        scale_downsample_pooling: AvgPool1d
-        scale_downsample_pooling_params:
-            kernel_size: 4
-            stride: 2
-            padding: 2
-        scale_discriminator_params:
+        follow_official_norm: true
+        period_discriminator_params:
+            bias: true
+            channels: 32
+            downsample_scales:
+            - 3
+            - 3
+            - 3
+            - 3
+            - 1
             in_channels: 1
-            out_channels: 1
             kernel_sizes:
-            - 15
-            - 41
             - 5
             - 3
-            channels: 128
             max_downsample_channels: 1024
-            max_groups: 16
-            bias: true
-            downsample_scales:
-            - 2
-            - 2
-            - 4
-            - 4
-            - 1
             nonlinear_activation: LeakyReLU
             nonlinear_activation_params:
                 negative_slope: 0.1
-            use_weight_norm: true
+            out_channels: 1
             use_spectral_norm: false
-        follow_official_norm: false
+            use_weight_norm: true
         periods:
         - 2
         - 3
        - 5
        - 7
        - 11
-        period_discriminator_params:
+        scale_discriminator_params:
+            bias: true
+            channels: 128
+            downsample_scales:
+            - 4
+            - 4
+            - 4
+            - 4
+            - 1
             in_channels: 1
-            out_channels: 1
             kernel_sizes:
+            - 15
+            - 41
             - 5
             - 3
-            channels: 32
-            downsample_scales:
-            - 3
-            - 3
-            - 3
-            - 3
-            - 1
             max_downsample_channels: 1024
-            bias: true
+            max_groups: 16
             nonlinear_activation: LeakyReLU
             nonlinear_activation_params:
                 negative_slope: 0.1
-            use_weight_norm: true
-            use_spectral_norm: false
+            out_channels: 1
+        scale_downsample_pooling: AvgPool1d
+        scale_downsample_pooling_params:
+            kernel_size: 4
+            padding: 2
+            stride: 2
+        scales: 3
     generator_adv_loss_params:
         average_by_discriminators: false
         loss_type: mse
     discriminator_adv_loss_params:
         average_by_discriminators: false
         loss_type: mse
+    use_feat_match_loss: true
     feat_match_loss_params:
         average_by_discriminators: false
         average_by_layers: false
         include_final_outputs: true
+    use_mel_loss: true
     mel_loss_params:
         fs: 22050
         n_fft: 1024
@@ -347,12 +361,12 @@ tts_conf:
         fmin: 0
         fmax: null
         log_base: null
+    lambda_text2mel: 1.0
     lambda_adv: 1.0
     lambda_mel: 45.0
     lambda_feat_match: 2.0
-    lambda_dur: 1.0
-    lambda_kl: 1.0
     sampling_rate: 22050
+    segment_size: 32
     cache_generator_outputs: true
 pitch_extract: null
 pitch_extract_conf: {}
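
A quick way to sanity-check the released training config is to load it and look at the keys this commit changes: the model switches from VITS to a joint Tacotron 2 + HiFi-GAN setup, features move to fbank with global_mvn normalization (which is why feats_stats.npz is now shipped), and the optimizer becomes Adam with a small fine-tuning learning rate. A minimal sketch, assuming PyYAML is installed and the downloaded config.yaml sits in the working directory:

# Minimal sketch: inspect the 6.0.0 config (assumes PyYAML and a local config.yaml).
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["tts"])            # "joint_text2wav" (was "vits")
print(cfg["feats_extract"])  # "fbank" (was "linear_spectrogram")
print(cfg["normalize"])      # "global_mvn" - needs the feats_stats.npz statistics file
print(cfg["optim"], cfg["optim_conf"]["lr"])  # "adam" 1.25e-05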
requirements.txt CHANGED
@@ -1,6 +1,6 @@
 # requirements for HuggingFace demo. Installs local package.
 torch
-espnet>=202301
+espnet==202301
 typeguard<3 # typeguard 3.0.0 is incompatible with espnet
 git+https://github.com/savoirfairelinux/num2words.git@3e39091d052829fc9e65c18176ce7b7ff6169772
 ukrainian-word-stress==1.0.2
setup.py CHANGED
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages

 setup(
     name="ukrainian-tts",
-    version="5.0",
+    version="6.0",
     description="Ukrainian TTS using ESPNET",
     author="Yurii Paniv",
     author_email="mr.robinhad@gmail.com",
@@ -12,7 +12,7 @@ setup(
     packages=find_packages(),
     python_requires=">3.6.0",
     install_requires=[
-        "espnet>=202301",
+        "espnet==202301",
         "typeguard<3",
         "num2words @ git+https://github.com/savoirfairelinux/num2words.git@3e39091d052829fc9e65c18176ce7b7ff6169772",
         "ukrainian-word-stress==1.0.2",
ukrainian_tts/tts.py CHANGED
@@ -19,6 +19,7 @@ class Voices(Enum):
     Mykyta = "mykyta"
     Lada = "lada"
     Dmytro = "dmytro"
+    Oleksa = "oleksa"


 class Stress(Enum):
@@ -41,7 +42,7 @@ class TTS:
         self.device = device
         self.__setup_cache(cache_folder)

-    def tts(self, text: str, voice: str, stress: str, output_fp=BytesIO(), speed=1.0):
+    def tts(self, text: str, voice: str, stress: str, output_fp=BytesIO()):
         """
         Run a Text-to-Speech engine and output to `output_fp` BytesIO-like object.
         - `text` - your model input text.
@@ -71,9 +72,7 @@
         # synthesis
         with no_grad():
             start = time.time()
-            wav = self.synthesizer(
-                text, spembs=self.xvectors[voice][0], decode_conf={"alpha": 1 / speed}
-            )["wav"]
+            wav = self.synthesizer(text, spembs=self.xvectors[voice][0])["wav"]

         rtf = (time.time() - start) / (len(wav) / self.synthesizer.fs)
         print(f"RTF = {rtf:5f}")
@@ -99,6 +98,7 @@
         model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model.pth"
         config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.yaml"
         speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/spk_xvector.ark"
+        feat_stats_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/feat_stats.npz"

         if cache_folder is None:
             cache_folder = "."
@@ -106,19 +106,16 @@
         model_path = join(cache_folder, "model.pth")
         config_path = join(cache_folder, "config.yaml")
         speakers_path = join(cache_folder, "spk_xvector.ark")
+        feat_stats_path = join(cache_folder, "feats_stats.npz")

         self.__download(model_link, model_path)
         self.__download(config_link, config_path)
         self.__download(speakers_link, speakers_path)
+        self.__download(feat_stats_link, feat_stats_path)
         print("downloaded.")

         self.synthesizer = Text2Speech(
-            train_config=config_path,
-            model_file=model_path,
-            device=self.device,
-            # Only for VITS
-            noise_scale=0.333,
-            noise_scale_dur=0.333,
+            train_config=config_path, model_file=model_path, device=self.device
         )
         self.xvectors = {k: v for k, v in load_ark(speakers_path)}
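
Taken together, the 6.0.0 library API drops the `speed` keyword, adds the Oleksa voice, and downloads feats_stats.npz alongside the model. A minimal end-to-end sketch against the updated package; the output filename and sample sentence are illustrative:

# Minimal sketch of the 6.0.0 API: no speed argument, new Oleksa voice.
from ukrainian_tts.tts import TTS, Voices, Stress

tts = TTS(device="cpu")  # fetches model.pth, config.yaml, spk_xvector.ark, feats_stats.npz
with open("oleksa.wav", mode="wb") as out:
    _, accented = tts.tts(
        "Привіт, як тебе звати?", Voices.Oleksa.value, Stress.Dictionary.value, out
    )
print("Accented text:", accented)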