Yurii Paniv committed on
Commit
1647edb
•
1 Parent(s): b547cb7

Initial model

Browse files
Files changed (3) hide show
  1. README.md +24 -2
  2. config.json +158 -0
  3. requirements.txt +1 -0
README.md CHANGED
@@ -1,2 +1,24 @@
1
- # ukrainian-tts
2
- Ukrainian TTS (text-to-speech) using Coqui TTS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ukrainian TTS 📢🤖
2
+ Ukrainian TTS (text-to-speech) using Coqui TTS.
3
+
4
+ Trained on [M-AILABS Ukrainian dataset](https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/).
5
+
6
+ # How to use:
7
+ 1. `pip install -r requirements.txt`.
8
+ 2. Download the model from the "Releases" tab.
9
+ 3. Launch as one-time command:
10
+ ```
11
+ tts --text "Text for TTS" \
12
+ --model_path path/to/model.pth.tar \
13
+ --config_path path/to/config.json \
14
+ --out_path folder/to/save/output.wav
15
+ ```
16
+ or alternatively launch web server using:
17
+ ```
18
+ tts-server --model_path path/to/model.pth.tar \
19
+ --config_path path/to/config.json
20
+ ```
21
+
22
+ # How to train:
23
+ 1. Refer to ["Nervous beginner guide"](https://tts.readthedocs.io/en/latest/tutorial_for_nervous_beginners.html) in Coqui TTS docs.
24
+ 2. Instead of the provided `config.json`, use the one from this repo.
config.json ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "glow_tts",
3
+ "run_name": "coqui_tts",
4
+ "run_description": "",
5
+ "epochs": 1000,
6
+ "batch_size": 32,
7
+ "eval_batch_size": 16,
8
+ "mixed_precision": true,
9
+ "scheduler_after_epoch": false,
10
+ "run_eval": true,
11
+ "test_delay_epochs": -1,
12
+ "print_eval": true,
13
+ "dashboard_logger": "tensorboard",
14
+ "print_step": 25,
15
+ "plot_step": 100,
16
+ "model_param_stats": false,
17
+ "project_name": null,
18
+ "log_model_step": null,
19
+ "wandb_entity": null,
20
+ "save_step": 10000,
21
+ "checkpoint": true,
22
+ "keep_all_best": false,
23
+ "keep_after": 10000,
24
+ "num_loader_workers": 1,
25
+ "num_eval_loader_workers": 1,
26
+ "use_noise_augment": false,
27
+ "output_path": "./ukrainian",
28
+ "distributed_backend": "nccl",
29
+ "distributed_url": "tcp://localhost:54321",
30
+ "audio": {
31
+ "fft_size": 1024,
32
+ "win_length": 1024,
33
+ "hop_length": 256,
34
+ "frame_shift_ms": null,
35
+ "frame_length_ms": null,
36
+ "stft_pad_mode": "reflect",
37
+ "sample_rate": 16000,
38
+ "resample": false,
39
+ "preemphasis": 0.0,
40
+ "ref_level_db": 20,
41
+ "do_sound_norm": false,
42
+ "log_func": "np.log10",
43
+ "do_trim_silence": true,
44
+ "trim_db": 45,
45
+ "power": 1.5,
46
+ "griffin_lim_iters": 60,
47
+ "num_mels": 80,
48
+ "mel_fmin": 0.0,
49
+ "mel_fmax": null,
50
+ "spec_gain": 20,
51
+ "do_amp_to_db_linear": true,
52
+ "do_amp_to_db_mel": true,
53
+ "signal_norm": true,
54
+ "min_level_db": -100,
55
+ "symmetric_norm": true,
56
+ "max_norm": 4.0,
57
+ "clip_norm": true,
58
+ "stats_path": null
59
+ },
60
+ "use_phonemes": false,
61
+ "use_espeak_phonemes": false,
62
+ "phoneme_language": null,
63
+ "compute_input_seq_cache": false,
64
+ "text_cleaner": "basic_cleaners",
65
+ "enable_eos_bos_chars": false,
66
+ "test_sentences_file": "",
67
+ "phoneme_cache_path": "./phoneme_cache",
68
+ "characters": {
69
+ "pad": "_",
70
+ "eos": "~",
71
+ "bos": "^",
72
+ "characters": "!',-.:;?ABIMXaceinoprxy\u0404\u0406\u0407\u0410\u0411\u0412\u0413\u0414\u0415\u0416\u0417\u0418\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427\u0428\u0429\u042c\u042f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0454\u0456\u0457\u0490\u0491 ",
73
+ "punctuations": "!',-.:;? ",
74
+ "phonemes": null,
75
+ "unique": true
76
+ },
77
+ "batch_group_size": 0,
78
+ "loss_masking": null,
79
+ "sort_by_audio_len": false,
80
+ "min_seq_len": 3,
81
+ "max_seq_len": 500,
82
+ "compute_f0": false,
83
+ "compute_linear_spec": false,
84
+ "add_blank": false,
85
+ "datasets": [
86
+ {
87
+ "name": "ljspeech",
88
+ "path": "./data/uk_UK/by_book/female/sumska/kaydasheva",
89
+ "meta_file_train": "metadata.csv",
90
+ "ununsed_speakers": null,
91
+ "meta_file_val": "",
92
+ "meta_file_attn_mask": ""
93
+ },
94
+ {
95
+ "name": "ljspeech",
96
+ "path": "./data/uk_UK/by_book/female/sumska/mykola_djerya",
97
+ "meta_file_train": "metadata.csv",
98
+ "ununsed_speakers": null,
99
+ "meta_file_val": "",
100
+ "meta_file_attn_mask": ""
101
+ }
102
+ ],
103
+ "optimizer": "RAdam",
104
+ "optimizer_params": {
105
+ "betas": [
106
+ 0.9,
107
+ 0.998
108
+ ],
109
+ "weight_decay": 1e-06
110
+ },
111
+ "lr_scheduler": "NoamLR",
112
+ "lr_scheduler_params": {
113
+ "warmup_steps": 4000
114
+ },
115
+ "test_sentences": [
116
+ "\u0413\u043e\u0432\u043e\u0440\u0438 \u043d\u0456\u0431\u0438 \u0442\u0438 \u0436\u0438\u0432\u0438\u0439!",
117
+ "\u041f\u043e\u043b \u043f\u0435\u0440\u0435\u0442\u043d\u0443\u0432 \u043f\u0443\u0441\u0442\u0435\u043b\u044e",
118
+ "\u041f\u0440\u0438\u0432\u0456\u0442, \u0441\u0432\u0456\u0442\u0435!"
119
+ ],
120
+ "use_speaker_embedding": false,
121
+ "use_d_vector_file": false,
122
+ "d_vector_dim": 0,
123
+ "num_chars": null,
124
+ "encoder_type": "rel_pos_transformer",
125
+ "encoder_params": {
126
+ "kernel_size": 3,
127
+ "dropout_p": 0.1,
128
+ "num_layers": 6,
129
+ "num_heads": 2,
130
+ "hidden_channels_ffn": 768,
131
+ "input_length": null
132
+ },
133
+ "use_encoder_prenet": true,
134
+ "hidden_channels_enc": 192,
135
+ "hidden_channels_dec": 192,
136
+ "hidden_channels_dp": 256,
137
+ "dropout_p_dp": 0.1,
138
+ "dropout_p_dec": 0.05,
139
+ "mean_only": true,
140
+ "out_channels": 80,
141
+ "num_flow_blocks_dec": 12,
142
+ "inference_noise_scale": 0.0,
143
+ "kernel_size_dec": 5,
144
+ "dilation_rate": 1,
145
+ "num_block_layers": 4,
146
+ "num_speakers": 0,
147
+ "c_in_channels": 0,
148
+ "num_splits": 4,
149
+ "num_squeeze": 2,
150
+ "sigmoid_scale": false,
151
+ "data_dep_init_steps": 10,
152
+ "style_wav_for_test": null,
153
+ "length_scale": 1.0,
154
+ "d_vector_file": false,
155
+ "grad_clip": 5.0,
156
+ "lr": 0.001,
157
+ "r": 1
158
+ }
requirements.txt ADDED
@@ -0,0 +1 @@
 
1
+ TTS==0.3.1