kevinwang676 committed on
Commit
2035195
1 Parent(s): b3b60ab

Upload folder using huggingface_hub

Browse files
finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04b6186a8b35aa2c513c6f24c5c9d3a1b9146a1bc9d4bd86fdd316af2aa21873
3
+ size 5607926997
finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/best_model_120.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04b6186a8b35aa2c513c6f24c5c9d3a1b9146a1bc9d4bd86fdd316af2aa21873
3
+ size 5607926997
finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/config.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output",
3
+ "logger_uri": null,
4
+ "run_name": "run",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": null,
14
+ "save_step": 10000,
15
+ "save_n_checkpoints": 5,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 10000,
19
+ "target_loss": null,
20
+ "print_eval": false,
21
+ "test_delay_epochs": 0,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 1000,
29
+ "batch_size": 32,
30
+ "eval_batch_size": 16,
31
+ "grad_clip": 0.0,
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.001,
34
+ "optimizer": "radam",
35
+ "optimizer_params": null,
36
+ "lr_scheduler": null,
37
+ "lr_scheduler_params": {},
38
+ "use_grad_scaler": false,
39
+ "allow_tf32": false,
40
+ "cudnn_enable": true,
41
+ "cudnn_deterministic": false,
42
+ "cudnn_benchmark": false,
43
+ "training_seed": 54321,
44
+ "model": "xtts",
45
+ "num_loader_workers": 0,
46
+ "num_eval_loader_workers": 0,
47
+ "use_noise_augment": false,
48
+ "audio": {
49
+ "sample_rate": 22050,
50
+ "output_sample_rate": 24000
51
+ },
52
+ "use_phonemes": false,
53
+ "phonemizer": null,
54
+ "phoneme_language": null,
55
+ "compute_input_seq_cache": false,
56
+ "text_cleaner": null,
57
+ "enable_eos_bos_chars": false,
58
+ "test_sentences_file": "",
59
+ "phoneme_cache_path": null,
60
+ "characters": null,
61
+ "add_blank": false,
62
+ "batch_group_size": 0,
63
+ "loss_masking": null,
64
+ "min_audio_len": 1,
65
+ "max_audio_len": Infinity,
66
+ "min_text_len": 1,
67
+ "max_text_len": Infinity,
68
+ "compute_f0": false,
69
+ "compute_energy": false,
70
+ "compute_linear_spec": false,
71
+ "precompute_num_workers": 0,
72
+ "start_by_longest": false,
73
+ "shuffle": false,
74
+ "drop_last": false,
75
+ "datasets": [
76
+ {
77
+ "formatter": "",
78
+ "dataset_name": "",
79
+ "path": "",
80
+ "meta_file_train": "",
81
+ "ignored_speakers": null,
82
+ "language": "",
83
+ "phonemizer": "",
84
+ "meta_file_val": "",
85
+ "meta_file_attn_mask": ""
86
+ }
87
+ ],
88
+ "test_sentences": [],
89
+ "eval_split_max_size": null,
90
+ "eval_split_size": 0.01,
91
+ "use_speaker_weighted_sampler": false,
92
+ "speaker_weighted_sampler_alpha": 1.0,
93
+ "use_language_weighted_sampler": false,
94
+ "language_weighted_sampler_alpha": 1.0,
95
+ "use_length_weighted_sampler": false,
96
+ "length_weighted_sampler_alpha": 1.0,
97
+ "model_args": {
98
+ "gpt_batch_size": 1,
99
+ "enable_redaction": false,
100
+ "kv_cache": true,
101
+ "gpt_checkpoint": null,
102
+ "clvp_checkpoint": null,
103
+ "decoder_checkpoint": null,
104
+ "num_chars": 255,
105
+ "tokenizer_file": "",
106
+ "gpt_max_audio_tokens": 605,
107
+ "gpt_max_text_tokens": 402,
108
+ "gpt_max_prompt_tokens": 70,
109
+ "gpt_layers": 30,
110
+ "gpt_n_model_channels": 1024,
111
+ "gpt_n_heads": 16,
112
+ "gpt_number_text_tokens": 6681,
113
+ "gpt_start_text_token": null,
114
+ "gpt_stop_text_token": null,
115
+ "gpt_num_audio_tokens": 1026,
116
+ "gpt_start_audio_token": 1024,
117
+ "gpt_stop_audio_token": 1025,
118
+ "gpt_code_stride_len": 1024,
119
+ "gpt_use_masking_gt_prompt_approach": true,
120
+ "gpt_use_perceiver_resampler": true,
121
+ "input_sample_rate": 22050,
122
+ "output_sample_rate": 24000,
123
+ "output_hop_length": 256,
124
+ "decoder_input_dim": 1024,
125
+ "d_vector_dim": 512,
126
+ "cond_d_vector_in_each_upsampling_layer": true,
127
+ "duration_const": 102400
128
+ },
129
+ "model_dir": null,
130
+ "languages": [
131
+ "en",
132
+ "es",
133
+ "fr",
134
+ "de",
135
+ "it",
136
+ "pt",
137
+ "pl",
138
+ "tr",
139
+ "ru",
140
+ "nl",
141
+ "cs",
142
+ "ar",
143
+ "zh-cn",
144
+ "hu",
145
+ "ko",
146
+ "ja",
147
+ "hi"
148
+ ],
149
+ "temperature": 0.75,
150
+ "length_penalty": 1.0,
151
+ "repetition_penalty": 5.0,
152
+ "top_k": 50,
153
+ "top_p": 0.85,
154
+ "num_gpt_outputs": 1,
155
+ "gpt_cond_len": 30,
156
+ "gpt_cond_chunk_len": 4,
157
+ "max_ref_len": 30,
158
+ "sound_norm_refs": false
159
+ }
finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/events.out.tfevents.1720099779.0426d3820a65.4247.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1051188d1501f7b3cca279b6cd19d7b84e8c8565b48132b0e33e0102bf8f8cf
3
+ size 62903
finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/trainer_0_log.txt ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ > Training Environment:
2
+ | > Backend: Torch
3
+ | > Mixed precision: False
4
+ | > Precision: float32
5
+ | > Current device: 0
6
+ | > Num. of GPUs: 1
7
+ | > Num. of CPUs: 12
8
+ | > Num. of Torch Threads: 1
9
+ | > Torch seed: 1
10
+ | > Torch CUDNN: True
11
+ | > Torch CUDNN deterministic: False
12
+ | > Torch CUDNN benchmark: False
13
+ | > Torch TF32 MatMul: False
14
+ > Start Tensorboard: tensorboard --logdir=/content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9
15
+
16
+ > Model has 518442047 parameters
17
+
18
+  > EPOCH: 0/8
19
+ --> /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9
20
+
21
+  > TRAINING (2024-07-04 13:29:40) 
22
+
23
+  --> TIME: 2024-07-04 13:29:45 -- STEP: 0/15 -- GLOBAL_STEP: 0
24
+ | > loss_text_ce: 0.03748854622244835 (0.03748854622244835)
25
+ | > loss_mel_ce: 3.7625863552093506 (3.7625863552093506)
26
+ | > loss: 3.800074815750122 (3.800074815750122)
27
+ | > grad_norm: 0 (0)
28
+ | > current_lr: 5e-06
29
+ | > step_time: 1.2487 (1.248706579208374)
30
+ | > loader_time: 3.617 (3.6170296669006348)
31
+
32
+
33
+  > EVALUATION 
34
+
35
+
36
+ --> EVAL PERFORMANCE
37
+ | > avg_loader_time: 0.1386585235595703 (+0)
38
+ | > avg_loss_text_ce: 0.03846975229680538 (+0)
39
+ | > avg_loss_mel_ce: 3.401963472366333 (+0)
40
+ | > avg_loss: 3.440433144569397 (+0)
41
+
42
+ > BEST MODEL : /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/best_model_15.pth
43
+
44
+  > EPOCH: 1/8
45
+ --> /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9
46
+
47
+  > TRAINING (2024-07-04 13:30:23) 
48
+
49
+  > EVALUATION 
50
+
51
+
52
+ --> EVAL PERFORMANCE
53
+ | > avg_loader_time: 0.12281334400177002 (-0.015845179557800293)
54
+ | > avg_loss_text_ce: 0.037664541974663734 (-0.0008052103221416473)
55
+ | > avg_loss_mel_ce: 3.2830638885498047 (-0.11889958381652832)
56
+ | > avg_loss: 3.3207284212112427 (-0.1197047233581543)
57
+
58
+ > BEST MODEL : /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/best_model_30.pth
59
+
60
+  > EPOCH: 2/8
61
+ --> /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9
62
+
63
+  > TRAINING (2024-07-04 13:31:03) 
64
+
65
+  > EVALUATION 
66
+
67
+
68
+ --> EVAL PERFORMANCE
69
+ | > avg_loader_time: 0.1253896951675415 (+0.0025763511657714844)
70
+ | > avg_loss_text_ce: 0.03692051209509373 (-0.0007440298795700073)
71
+ | > avg_loss_mel_ce: 3.2056061029434204 (-0.07745778560638428)
72
+ | > avg_loss: 3.242526650428772 (-0.0782017707824707)
73
+
74
+ > BEST MODEL : /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/best_model_45.pth
75
+
76
+  > EPOCH: 3/8
77
+ --> /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9
78
+
79
+  > TRAINING (2024-07-04 13:32:49) 
80
+
81
+  --> TIME: 2024-07-04 13:32:54 -- STEP: 5/15 -- GLOBAL_STEP: 50
82
+ | > loss_text_ce: 0.03478017821907997 (0.0374395526945591)
83
+ | > loss_mel_ce: 3.1205806732177734 (3.1422857284545898)
84
+ | > loss: 3.1553609371185303 (3.1797253131866454)
85
+ | > grad_norm: 0 (0.0)
86
+ | > current_lr: 5e-06
87
+ | > step_time: 0.3306 (0.32296051979064944)
88
+ | > loader_time: 0.0082 (0.008426570892333984)
89
+
90
+
91
+  > EVALUATION 
92
+
93
+
94
+ --> EVAL PERFORMANCE
95
+ | > avg_loader_time: 0.1264183521270752 (+0.0010286569595336914)
96
+ | > avg_loss_text_ce: 0.03631771728396416 (-0.00060279481112957)
97
+ | > avg_loss_mel_ce: 3.1698708534240723 (-0.035735249519348145)
98
+ | > avg_loss: 3.2061885595321655 (-0.036338090896606445)
99
+
100
+ > BEST MODEL : /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/best_model_60.pth
101
+
102
+  > EPOCH: 4/8
103
+ --> /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9
104
+
105
+  > TRAINING (2024-07-04 13:37:54) 
106
+
107
+  > EVALUATION 
108
+
109
+
110
+ --> EVAL PERFORMANCE
111
+ | > avg_loader_time: 0.1271202564239502 (+0.000701904296875)
112
+ | > avg_loss_text_ce: 0.035877423360943794 (-0.00044029392302036285)
113
+ | > avg_loss_mel_ce: 3.1362961530685425 (-0.033574700355529785)
114
+ | > avg_loss: 3.1721736192703247 (-0.03401494026184082)
115
+
116
+ > BEST MODEL : /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/best_model_75.pth
117
+
118
+  > EPOCH: 5/8
119
+ --> /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9
120
+
121
+  > TRAINING (2024-07-04 13:48:50) 
122
+
123
+  > EVALUATION 
124
+
125
+
126
+ --> EVAL PERFORMANCE
127
+ | > avg_loader_time: 0.12177145481109619 (-0.005348801612854004)
128
+ | > avg_loss_text_ce: 0.03552030771970749 (-0.00035711564123630524)
129
+ | > avg_loss_mel_ce: 3.130112409591675 (-0.006183743476867676)
130
+ | > avg_loss: 3.165632724761963 (-0.006540894508361816)
131
+
132
+ > BEST MODEL : /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/best_model_90.pth
133
+
134
+  > EPOCH: 6/8
135
+ --> /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9
136
+
137
+  > TRAINING (2024-07-04 13:54:05) 
138
+
139
+  --> TIME: 2024-07-04 13:54:13 -- STEP: 10/15 -- GLOBAL_STEP: 100
140
+ | > loss_text_ce: 0.03132708743214607 (0.03531287014484406)
141
+ | > loss_mel_ce: 2.6794981956481934 (2.865787482261658)
142
+ | > loss: 2.710825204849243 (2.9011003494262697)
143
+ | > grad_norm: 0 (0.0)
144
+ | > current_lr: 5e-06
145
+ | > step_time: 0.3346 (0.3178673505783081)
146
+ | > loader_time: 0.0077 (0.008596587181091308)
147
+
148
+
149
+  > EVALUATION 
150
+
151
+
152
+ --> EVAL PERFORMANCE
153
+ | > avg_loader_time: 0.12420761585235596 (+0.0024361610412597656)
154
+ | > avg_loss_text_ce: 0.03527705930173397 (-0.00024324841797351837)
155
+ | > avg_loss_mel_ce: 3.1189099550247192 (-0.011202454566955566)
156
+ | > avg_loss: 3.154186964035034 (-0.011445760726928711)
157
+
158
+ > BEST MODEL : /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/best_model_105.pth
159
+
160
+  > EPOCH: 7/8
161
+ --> /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9
162
+
163
+  > TRAINING (2024-07-04 14:01:23) 
164
+
165
+  > EVALUATION 
166
+
167
+
168
+ --> EVAL PERFORMANCE
169
+ | > avg_loader_time: 0.12220907211303711 (-0.0019985437393188477)
170
+ | > avg_loss_text_ce: 0.035058384761214256 (-0.00021867454051971436)
171
+ | > avg_loss_mel_ce: 3.100669741630554 (-0.01824021339416504)
172
+ | > avg_loss: 3.1357282400131226 (-0.01845872402191162)
173
+
174
+ > BEST MODEL : /content/xtts-v2/finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/best_model_120.pth
finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
finetune_models/run/training/GPT_XTTS_FT-July-04-2024_01+29PM-44c61c9/webui_xtts.py ADDED
@@ -0,0 +1,1115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os,shutil,sys,pdb,re
2
+ now_dir = os.getcwd()
3
+ sys.path.append(now_dir)
4
+ import json,yaml,warnings,torch
5
+ import platform
6
+ import psutil
7
+ import signal
8
+ from pathlib import Path
9
+
10
+ warnings.filterwarnings("ignore")
11
+ torch.manual_seed(233333)
12
# Use a TEMP directory under the working directory and point the TEMP
# environment variable at it.
tmp = os.path.join(now_dir, "TEMP")
os.makedirs(tmp, exist_ok=True)
os.environ["TEMP"] = tmp
if os.path.exists(tmp):
    # Empty the directory on startup, keeping only the jieba cache file.
    for name in os.listdir(tmp):
        if name != "jieba.cache":
            path = "%s/%s" % (tmp, name)
            # Files are removed directly; anything else is removed as a tree.
            if os.path.isfile(path):
                delete = os.remove
            else:
                delete = shutil.rmtree
            try:
                delete(path)
            except Exception as e:
                # Best-effort cleanup: report the problem and keep going.
                print(str(e))
25
+ import site
26
# Collect candidate site-packages directories reported by the interpreter.
site_packages_roots = []
for path in site.getsitepackages():
    if "packages" not in path:
        continue
    site_packages_roots.append(path)
if not site_packages_roots:
    # Fall back to a runtime directory located under the working directory.
    site_packages_roots = ["%s/runtime/Lib/site-packages" % now_dir]
#os.environ["OPENBLAS_NUM_THREADS"] = "4"
os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
os.environ["all_proxy"] = ""
# Write a users.pth file listing the project directories into the first
# existing site-packages directory that is writable.
for site_packages_root in site_packages_roots:
    if not os.path.exists(site_packages_root):
        continue
    try:
        with open("%s/users.pth" % (site_packages_root), "w") as f:
            f.write(
                "%s\n%s/tools\n%s/tools/damo_asr\n%s/GPT_SoVITS\n%s/tools/uvr5"
                % (now_dir, now_dir, now_dir, now_dir, now_dir)
            )
        break
    except PermissionError:
        # Not writable: try the next candidate directory.
        pass
45
+ from tools import my_utils
46
+ import traceback
47
+ import shutil
48
+ import pdb
49
+ import gradio as gr
50
+ from subprocess import Popen
51
+ import signal
52
+ from config import python_exec,infer_device,is_half,exp_root,webui_port_main,webui_port_infer_tts,webui_port_uvr5,webui_port_subfix,is_share
53
+ from tools.i18n.i18n import I18nAuto
54
+ i18n = I18nAuto()
55
+ from scipy.io import wavfile
56
+ from tools.my_utils import load_audio
57
+ from multiprocessing import cpu_count
58
+
59
+ import argparse
60
+ import os
61
+ import sys
62
+ import tempfile
63
+
64
+ import gradio as gr
65
+ import librosa.display
66
+ import numpy as np
67
+
68
+ import torch
69
+ import torchaudio
70
+ import traceback
71
+ from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
72
+ from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
73
+
74
+ from TTS.tts.configs.xtts_config import XttsConfig
75
+ from TTS.tts.models.xtts import Xtts
76
+
77
+ # from .list to .csv
78
+ import pandas as pd
79
+ from sklearn.model_selection import train_test_split
80
+
81
def split_csv(input_csv, train_csv, eval_csv, eval_size=0.15):
    """Split a pipe-delimited CSV file into a training and an evaluation file.

    Args:
        input_csv: Path of the pipe-delimited CSV to read (first row is the header).
        train_csv: Destination path for the training rows.
        eval_csv: Destination path for the evaluation rows.
        eval_size: Passed to ``train_test_split`` as ``test_size``.
    """
    frame = pd.read_csv(input_csv, delimiter='|', header=0)

    # A fixed random_state keeps the split reproducible between runs.
    train_part, eval_part = train_test_split(frame, test_size=eval_size, random_state=42)

    # Both subsets are written with the same pipe-delimited layout, without the index.
    for subset, destination in ((train_part, train_csv), (eval_part, eval_csv)):
        subset.to_csv(destination, index=False, sep='|')

    print("CSV files have been successfully split.")
95
+
96
+
97
def convert_list_to_csv(input_file, output_file):
    """Convert a pipe-delimited ``.list`` file to a CSV and split it.

    Every input line with exactly four ``|``-separated fields becomes one
    ``audio_file|text|speaker_name`` row, using the first field as the audio
    path, the fourth as the text and the constant speaker name ``coqui``.
    Lines with any other field count are dropped. The resulting CSV is then
    split into ``train.csv`` and ``eval.csv`` in the working directory.

    Returns:
        The tuple ``("train.csv", "eval.csv")`` on success. On any exception
        the error is printed and ``None`` is returned implicitly.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as src, \
                open(output_file, 'w', encoding='utf-8') as dst:
            dst.write("audio_file|text|speaker_name\n")
            for raw_line in src:
                fields = raw_line.strip().split('|')
                if len(fields) != 4:
                    continue
                dst.write(f"{fields[0]}|{fields[3]}|coqui\n")
        print("Conversion to CSV completed successfully.")
        split_csv(output_file, "train.csv", "eval.csv")
        print("Split completed successfully")
        return "train.csv", "eval.csv"
    except Exception as e:
        print(f"An error occurred: {e}")
120
+
121
+
122
def clear_gpu_cache():
    """Empty PyTorch's CUDA cache when CUDA is available; otherwise do nothing."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
126
+
127
+ XTTS_MODEL = None
128
def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
    """Load an XTTS checkpoint into the module-level ``XTTS_MODEL``.

    Args:
        xtts_checkpoint: Path of the model checkpoint.
        xtts_config: Path of the JSON config loaded into ``XttsConfig``.
        xtts_vocab: Path of the vocabulary file.

    Returns:
        A status message: a prompt to fill in the paths when any of them is
        empty, otherwise the success message.
    """
    global XTTS_MODEL
    clear_gpu_cache()
    if not (xtts_checkpoint and xtts_config and xtts_vocab):
        return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"

    model_config = XttsConfig()
    model_config.load_json(xtts_config)
    XTTS_MODEL = Xtts.init_from_config(model_config)
    print("Loading XTTS model! ")
    XTTS_MODEL.load_checkpoint(
        model_config,
        checkpoint_path=xtts_checkpoint,
        vocab_path=xtts_vocab,
        use_deepspeed=False,
    )
    # Move the model to the GPU only when one is available.
    if torch.cuda.is_available():
        XTTS_MODEL.cuda()

    done_message = "模型已成功加载!"
    print(done_message)
    return done_message
143
+
144
def run_tts(lang, tts_text, speaker_audio_file):
    """Synthesize ``tts_text`` with the loaded XTTS model and save it as a WAV file.

    Args:
        lang: Language value passed to the model's ``inference`` call.
        tts_text: Text to synthesize (stripped before use).
        speaker_audio_file: Reference audio path; surrounding whitespace,
            newlines and blank lines are removed before use.

    Returns:
        ``(message, output_wav_path, speaker_audio_file)``. When no model is
        loaded or no reference audio is given, the message asks the user to
        load the model and the other two items are ``None``.
    """
    if XTTS_MODEL is None or not speaker_audio_file:
        return "您需要先执行第5步 - 加载模型", None, None

    # Join the non-empty lines of the path field back into a single string.
    speaker_audio_file = "".join(filter(None, speaker_audio_file.strip().split("\n")))
    cfg = XTTS_MODEL.config
    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
        audio_path=speaker_audio_file,
        gpt_cond_len=cfg.gpt_cond_len,
        max_ref_length=cfg.max_ref_len,
        sound_norm_refs=cfg.sound_norm_refs,
    )
    # Sampling parameters are taken from the loaded model's config.
    out = XTTS_MODEL.inference(
        text=tts_text.strip(),
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=cfg.temperature,
        length_penalty=cfg.length_penalty,
        repetition_penalty=cfg.repetition_penalty,
        top_k=cfg.top_k,
        top_p=cfg.top_p,
    )

    # delete=False keeps the file on disk after the handle closes so the
    # returned path stays usable.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        out["wav"] = torch.tensor(out["wav"]).unsqueeze(0)
        out_path = fp.name
        torchaudio.save(out_path, out["wav"], 24000)

    return "推理成功,快来听听吧!", out_path, speaker_audio_file
168
+
169
+
170
+
171
+
172
+ # define a logger to redirect
173
class Logger:
    """File-like object that writes every message to stdout and to a log file.

    The stream that ``sys.stdout`` refers to at construction time is kept as
    ``terminal``; the log file is truncated on creation.
    """

    def __init__(self, filename="log.out"):
        self.log_file = filename
        self.terminal = sys.stdout
        # errors="replace": the file uses the platform's default encoding, and
        # a character it cannot represent must not make a print() call raise
        # UnicodeEncodeError once stdout has been redirected to this object.
        self.log = open(self.log_file, "w", errors="replace")

    def write(self, message):
        """Write ``message`` to the terminal stream and to the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both the terminal stream and the log file."""
        self.terminal.flush()
        self.log.flush()

    def isatty(self):
        """Always report that this stream is not a terminal."""
        return False
189
+
190
+ # redirect stdout and stderr to a file
191
+ sys.stdout = Logger()
192
+ sys.stderr = sys.stdout
193
+
194
+
195
+ # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
196
+ import logging
197
+ logging.basicConfig(
198
+ level=logging.WARNING,
199
+ format="%(asctime)s [%(levelname)s] %(message)s",
200
+ handlers=[
201
+ logging.StreamHandler(sys.stdout)
202
+ ]
203
+ )
204
+
205
def read_logs():
    """Flush ``sys.stdout`` and return the full text of its ``log_file``.

    Relies on ``sys.stdout`` exposing a ``log_file`` attribute.
    """
    stream = sys.stdout
    stream.flush()
    with open(stream.log_file, "r") as handle:
        contents = handle.read()
    return contents
209
+
210
+
211
+ os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu
212
+
213
n_cpu=cpu_count()

ngpu = torch.cuda.device_count()
gpu_infos = []
mem = []
if_gpu_ok = False

# Check whether there is an NVIDIA card usable for training and accelerated inference.
if torch.cuda.is_available() or ngpu != 0:
    for i in range(ngpu):
        gpu_name = torch.cuda.get_device_name(i)
        # A device counts as usable when its upper-cased name contains one of these markers.
        if any(value in gpu_name.upper()for value in ["10","16","20","30","40","A2","A3","A4","P4","A50","500","A60","70","80","90","M4","T4","TITAN","L4","4060"]):
            # A10#A100#V100#A40#P40#M40#K80#A4500
            if_gpu_ok = True  # at least one usable NVIDIA card
            gpu_infos.append("%s\t%s" % (i, gpu_name))
            # Total device memory in GiB; the +0.4 rounds values just below a whole GiB up.
            mem.append(int(torch.cuda.get_device_properties(i).total_memory/ 1024/ 1024/ 1024+ 0.4))
# Check whether MPS acceleration is supported.
if torch.backends.mps.is_available():
    if_gpu_ok = True
    gpu_infos.append("%s\t%s" % ("0", "Apple GPU"))
    # Original author's note: in practice, using system memory as video memory does not run out of memory.
    mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024)

if if_gpu_ok and len(gpu_infos) > 0:
    gpu_info = "\n".join(gpu_infos)
    # The MPS entry is a float and a very small device would give 0, so
    # coerce to an int of at least 1.
    default_batch_size = max(1, int(min(mem) // 2))
else:
    gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
    default_batch_size = 1
# Each entry is "<index>\t<name>"; take the whole index field, not just its
# first character, so indices of 10 and above are not truncated.
gpus = "-".join([info.split("\t")[0] for info in gpu_infos])
242
+
243
+ pretrained_sovits_name="GPT_SoVITS/pretrained_models/s2G488k.pth"
244
+ pretrained_gpt_name="GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
245
def get_weights_names():
    """List the selectable SoVITS and GPT weight names.

    Returns:
        ``(SoVITS_names, GPT_names)``. Each list starts with the pretrained
        model path, followed by the ``.pth`` files found in
        ``SoVITS_weight_root`` and the ``.ckpt`` files found in
        ``GPT_weight_root`` respectively.
    """
    sovits_choices = [pretrained_sovits_name]
    sovits_choices.extend(
        entry for entry in os.listdir(SoVITS_weight_root) if entry.endswith(".pth")
    )
    gpt_choices = [pretrained_gpt_name]
    gpt_choices.extend(
        entry for entry in os.listdir(GPT_weight_root) if entry.endswith(".ckpt")
    )
    return sovits_choices, gpt_choices
253
+ SoVITS_weight_root="SoVITS_weights"
254
+ GPT_weight_root="GPT_weights"
255
+ os.makedirs(SoVITS_weight_root,exist_ok=True)
256
+ os.makedirs(GPT_weight_root,exist_ok=True)
257
+ SoVITS_names,GPT_names = get_weights_names()
258
+
259
def custom_sort_key(s):
    """Build a natural-sort key: digit runs compare as integers, other text as strings.

    Args:
        s: The string to build a key for.

    Returns:
        A list alternating text pieces (``str``) and digit runs (``int``),
        always starting with a (possibly empty) text piece.
    """
    # Splitting on a capturing group alternates text, digits, text, ...: the
    # captured digit runs are exactly the odd-indexed pieces. Converting by
    # position avoids str.isdigit(), which is true for characters such as a
    # superscript two that int() cannot parse.
    pieces = re.split(r'(\d+)', s)
    return [int(piece) if index % 2 else piece for index, piece in enumerate(pieces)]
265
+
266
def change_choices():
    """Re-read the weight lists and return two update dicts with sorted choices.

    Returns:
        ``(sovits_update, gpt_update)``, each of the form
        ``{"choices": [...], "__type__": "update"}`` with the names ordered
        by ``custom_sort_key``.
    """
    sovits_list, gpt_list = get_weights_names()

    def _as_update(names):
        return {"choices": sorted(names, key=custom_sort_key), "__type__": "update"}

    return _as_update(sovits_list), _as_update(gpt_list)
269
+
270
+ p_label=None
271
+ p_uvr5=None
272
+ p_asr=None
273
+ p_denoise=None
274
+ p_tts_inference=None
275
+
276
def kill_proc_tree(pid, including_parent=True):
    """Send SIGTERM to all descendants of ``pid`` and optionally to ``pid`` itself.

    Args:
        pid: Process id of the root of the tree.
        including_parent: When true, the root process is signalled after its
            descendants.

    Does nothing when the root process no longer exists; ``OSError`` raised
    while signalling an individual process is ignored.
    """
    try:
        root = psutil.Process(pid)
    except psutil.NoSuchProcess:
        # The process has already terminated.
        return

    # Descendants first, the root last.
    target_pids = [child.pid for child in root.children(recursive=True)]
    if including_parent:
        target_pids.append(root.pid)

    for target_pid in target_pids:
        try:
            os.kill(target_pid, signal.SIGTERM)  # or signal.SIGKILL
        except OSError:
            pass
294
+
295
system=platform.system()
def kill_process(pid):
    """Terminate the process ``pid`` together with its child processes.

    On Windows this runs ``taskkill`` with the tree and force flags; on every
    other platform it delegates to ``kill_proc_tree``.
    """
    if system != "Windows":
        kill_proc_tree(pid)
        return
    os.system("taskkill /t /f /pid %s" % pid)
302
+
303
+
304
def change_label(if_label,path_list):
    """Start or stop the labelling WebUI subprocess.

    Generator: yields one status message when a transition happens and
    nothing otherwise.

    Args:
        if_label: True to start the tool (when it is not running), False to
            stop it (when it is running).
        path_list: Path of the list file handed to the tool via ``--load_list``.
    """
    import shlex

    global p_label
    if(if_label==True and p_label is None):
        path_list=my_utils.clean_path(path_list)
        # Pass the arguments as a list instead of a shell command string, so
        # that quotes or shell metacharacters in the user-supplied path cannot
        # alter the command.
        # NOTE(review): assumes python_exec is a bare interpreter path with no
        # embedded arguments; confirm against config.
        args = [
            python_exec,
            "tools/subfix_webui.py",
            "--load_list", path_list,
            "--webui_port", str(webui_port_subfix),
            "--is_share", str(is_share),
        ]
        yield i18n("打标工具WebUI已开启")
        print(" ".join(shlex.quote(arg) for arg in args))
        p_label = Popen(args)
    elif(if_label==False and p_label is not None):
        kill_process(p_label.pid)
        p_label=None
        yield i18n("打标工具WebUI已关闭")
316
+
317
def change_uvr5(if_uvr5):
    """Start or stop the UVR5 WebUI subprocess.

    Generator: yields one status message when a transition happens and
    nothing otherwise.

    Args:
        if_uvr5: True to start the tool (when it is not running), False to
            stop it (when it is running).
    """
    global p_uvr5
    should_start = if_uvr5 == True and p_uvr5 == None
    should_stop = if_uvr5 == False and p_uvr5 != None
    if should_start:
        cmd = f'"{python_exec}" tools/uvr5/webui.py "{infer_device}" {is_half} {webui_port_uvr5} {is_share}'
        yield i18n("UVR5已开启")
        print(cmd)
        p_uvr5 = Popen(cmd, shell=True)
    elif should_stop:
        kill_process(p_uvr5.pid)
        p_uvr5 = None
        yield i18n("UVR5已关闭")
328
+
329
def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits_path):
    """Start or stop the TTS inference WebUI subprocess.

    Settings are handed to the child process through environment variables.
    Generator: yields one status message when a transition happens and
    nothing otherwise.

    Args:
        if_tts: True to start the process (when it is not running), False to
            stop it (when it is running).
        bert_path: Exported as ``bert_path``.
        cnhubert_base_path: Exported as ``cnhubert_base_path``.
        gpu_number: Exported as ``_CUDA_VISIBLE_DEVICES``.
        gpt_path: GPT weight; a value without ``/`` is resolved under ``GPT_weight_root``.
        sovits_path: SoVITS weight; a value without ``/`` is resolved under ``SoVITS_weight_root``.
    """
    global p_tts_inference

    def _resolve(weight, root):
        # Values that already contain a slash are used as given.
        return weight if "/" in weight else "%s/%s" % (root, weight)

    if if_tts == True and p_tts_inference == None:
        env = os.environ
        env["gpt_path"] = _resolve(gpt_path, GPT_weight_root)
        env["sovits_path"] = _resolve(sovits_path, SoVITS_weight_root)
        env["cnhubert_base_path"] = cnhubert_base_path
        env["bert_path"] = bert_path
        env["_CUDA_VISIBLE_DEVICES"] = gpu_number
        env["is_half"] = str(is_half)
        env["infer_ttswebui"] = str(webui_port_infer_tts)
        env["is_share"] = str(is_share)
        cmd = f'"{python_exec}" GPT_SoVITS/inference_webui.py'
        yield i18n("TTS推理进程已开启")
        print(cmd)
        p_tts_inference = Popen(cmd, shell=True)
    elif if_tts == False and p_tts_inference != None:
        kill_process(p_tts_inference.pid)
        p_tts_inference = None
        yield i18n("TTS推理进程已关闭")
348
+
349
+ from tools.asr.config import asr_dict
350
def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang):
    """Run the selected ASR script as a subprocess and wait for it to finish.

    Generator yielding ``(message, update_a, update_b)`` tuples, where the two
    update dicts toggle the ``visible`` flag of two UI components: one tuple
    when the task starts and one when it completes, or a single tuple when an
    ASR task is already running.

    Args:
        asr_inp_dir: Input directory, passed as ``-i``.
        asr_opt_dir: Output directory, passed as ``-o``.
        asr_model: Key into ``asr_dict`` selecting the script under ``tools/asr``.
        asr_model_size: Passed as ``-s``.
        asr_lang: Passed as ``-l``.
    """
    import shlex

    global p_asr
    if p_asr is not None:
        yield "已有正在进行的ASR任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
        return

    asr_inp_dir=my_utils.clean_path(asr_inp_dir)
    # Clean the output directory as well, as open_denoise does for both of
    # its directories.
    asr_opt_dir=my_utils.clean_path(asr_opt_dir)
    # Pass the arguments as a list instead of a shell command string, so that
    # quotes or shell metacharacters in the user-supplied values cannot alter
    # the command.
    # NOTE(review): assumes python_exec and the asr_dict "path" entry are single
    # tokens with no embedded arguments; confirm against config.
    args = [
        python_exec,
        "tools/asr/%s" % asr_dict[asr_model]["path"],
        "-i", asr_inp_dir,
        "-o", asr_opt_dir,
        "-s", str(asr_model_size),
        "-l", str(asr_lang),
        "-p", "float16" if is_half==True else "float32",
    ]
    # Quoted form used only for the status message and the console log.
    cmd = " ".join(shlex.quote(arg) for arg in args)

    yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
    print(cmd)
    p_asr = Popen(args)
    p_asr.wait()
    p_asr=None
    yield f"ASR任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
370
+
371
def close_asr():
    """Kill the running ASR subprocess, if any, and clear its handle.

    Returns:
        ``(message, update_a, update_b)`` where the update dicts set the
        ``visible`` flag of two UI components to True and False respectively.
    """
    global p_asr
    running = p_asr
    if running is not None:
        kill_process(running.pid)
        p_asr = None
    return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
377
def open_denoise(denoise_inp_dir, denoise_opt_dir):
    """Run the denoise script as a subprocess and wait for it to finish.

    Generator yielding ``(message, update_a, update_b)`` tuples, where the two
    update dicts toggle the ``visible`` flag of two UI components: one tuple
    when the task starts and one when it completes, or a single tuple when a
    denoise task is already running.

    Args:
        denoise_inp_dir: Input directory, passed as ``-i``.
        denoise_opt_dir: Output directory, passed as ``-o``.
    """
    import shlex

    global p_denoise
    if p_denoise is not None:
        yield "已有正在进行的语音降噪任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
        return

    denoise_inp_dir=my_utils.clean_path(denoise_inp_dir)
    denoise_opt_dir=my_utils.clean_path(denoise_opt_dir)
    # Pass the arguments as a list instead of a shell command string, so that
    # quotes or shell metacharacters in the user-supplied directories cannot
    # alter the command.
    # NOTE(review): assumes python_exec is a bare interpreter path with no
    # embedded arguments; confirm against config.
    args = [
        python_exec,
        "tools/cmd-denoise.py",
        "-i", denoise_inp_dir,
        "-o", denoise_opt_dir,
        "-p", "float16" if is_half==True else "float32",
    ]
    # Quoted form used only for the status message and the console log.
    cmd = " ".join(shlex.quote(arg) for arg in args)

    yield "语音降噪任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
    print(cmd)
    p_denoise = Popen(args)
    p_denoise.wait()
    p_denoise=None
    yield f"语音降噪任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
393
+
394
def close_denoise():
    """Kill the running denoise subprocess, if any, and clear its handle.

    Returns:
        ``(message, update_a, update_b)`` where the update dicts set the
        ``visible`` flag of two UI components to True and False respectively.
    """
    global p_denoise
    running = p_denoise
    if running is not None:
        kill_process(running.pid)
        p_denoise = None
    return "已终止语音降噪进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
400
+
401
+ p_train_SoVITS=None
402
def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D):
    """Write a temporary SoVITS training config and run the training script.

    The base config is read from ``GPT_SoVITS/configs/s2.json``, overridden
    with the given arguments, written to ``tmp_s2.json`` in the TEMP
    directory and passed to ``GPT_SoVITS/s2_train.py`` via ``--config``.

    Generator yielding ``(message, update_a, update_b)`` tuples, where the two
    update dicts toggle the ``visible`` flag of two UI components: one tuple
    when training starts and one when the subprocess exits, or a single tuple
    when a SoVITS training task is already running.
    """
    global p_train_SoVITS
    if p_train_SoVITS != None:
        yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
        return

    with open("GPT_SoVITS/configs/s2.json") as f:
        data = json.load(f)

    s2_dir = "%s/%s" % (exp_root, exp_name)
    os.makedirs("%s/logs_s2" % (s2_dir), exist_ok=True)

    train_cfg = data["train"]
    if is_half == False:
        # Without half precision, fp16 is switched off and the batch size is
        # halved (never below 1).
        train_cfg["fp16_run"] = False
        batch_size = max(1, batch_size // 2)
    train_cfg["batch_size"] = batch_size
    train_cfg["epochs"] = total_epoch
    train_cfg["text_low_lr_rate"] = text_low_lr_rate
    train_cfg["pretrained_s2G"] = pretrained_s2G
    train_cfg["pretrained_s2D"] = pretrained_s2D
    train_cfg["if_save_latest"] = if_save_latest
    train_cfg["if_save_every_weights"] = if_save_every_weights
    train_cfg["save_every_epoch"] = save_every_epoch
    train_cfg["gpu_numbers"] = gpu_numbers1Ba

    # The experiment directory is both the data directory and the checkpoint directory.
    data["data"]["exp_dir"] = s2_dir
    data["s2_ckpt_dir"] = s2_dir
    data["save_weight_dir"] = SoVITS_weight_root
    data["name"] = exp_name

    tmp_config_path = "%s/tmp_s2.json" % tmp
    with open(tmp_config_path, "w") as f:
        f.write(json.dumps(data))

    cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"' % (python_exec, tmp_config_path)
    yield "SoVITS训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
    print(cmd)
    p_train_SoVITS = Popen(cmd, shell=True)
    p_train_SoVITS.wait()
    p_train_SoVITS = None
    yield "SoVITS训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
437
+
438
def close1Ba():
    """Kill the running SoVITS training subprocess, if any, and clear its handle.

    Returns:
        ``(message, update_a, update_b)`` where the update dicts set the
        ``visible`` flag of two UI components to True and False respectively.
    """
    global p_train_SoVITS
    running = p_train_SoVITS
    if running is not None:
        kill_process(running.pid)
        p_train_SoVITS = None
    return "已终止SoVITS训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
444
+
445
# Handle of the GPT (s1) training subprocess; None while no training runs.
p_train_GPT=None
def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1):
    """Run GPT (s1) training in a subprocess, yielding status updates.

    Loads ``GPT_SoVITS/configs/s1longer.yaml``, overrides its fields with the
    arguments, writes the result to ``<tmp>/tmp_s1.yaml`` and runs
    ``GPT_SoVITS/s1_train.py`` on it, blocking until the subprocess exits.
    Also sets the ``_CUDA_VISIBLE_DEVICES`` and ``hz`` environment variables
    of this process before starting the child.

    Yields:
        ``(message, update, update)`` tuples; each update is a
        ``{"__type__": "update", "visible": bool}`` dict.
    """
    global p_train_GPT
    if p_train_GPT is not None:
        yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True}
        return

    with open("GPT_SoVITS/configs/s1longer.yaml") as f:
        # NOTE(review): FullLoader is kept as in the original; the file is a
        # repository-local config, not external input.
        data=yaml.load(f.read(), Loader=yaml.FullLoader)
    s1_dir="%s/%s"%(exp_root,exp_name)
    os.makedirs("%s/logs_s1"%(s1_dir),exist_ok=True)
    if(is_half==False):
        # Full precision: switch to 32-bit and halve the batch (never below 1).
        data["train"]["precision"]="32"
        batch_size = max(1, batch_size // 2)
    data["train"]["batch_size"]=batch_size
    data["train"]["epochs"]=total_epoch
    data["pretrained_s1"]=pretrained_s1
    data["train"]["save_every_n_epoch"]=save_every_epoch
    data["train"]["if_save_every_weights"]=if_save_every_weights
    data["train"]["if_save_latest"]=if_save_latest
    data["train"]["if_dpo"]=if_dpo
    data["train"]["half_weights_save_dir"]=GPT_weight_root
    data["train"]["exp_name"]=exp_name
    data["train_semantic_path"]="%s/6-name2semantic.tsv"%s1_dir
    data["train_phoneme_path"]="%s/2-name2text.txt"%s1_dir
    data["output_dir"]="%s/logs_s1"%s1_dir

    # "0-1" style GPU lists become "0,1" for the child process.
    os.environ["_CUDA_VISIBLE_DEVICES"]=gpu_numbers.replace("-",",")
    os.environ["hz"]="25hz"
    tmp_config_path="%s/tmp_s1.yaml"%tmp
    with open(tmp_config_path, "w") as f:
        f.write(yaml.dump(data, default_flow_style=False))
    cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path)
    yield "GPT训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
    print(cmd)
    p_train_GPT = Popen(cmd, shell=True)
    try:
        p_train_GPT.wait()
    finally:
        # Clear the handle even if wait() is interrupted, so a later call is
        # not rejected as "already running".
        p_train_GPT=None
    yield "GPT训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
484
+
485
def close1Bb():
    """Stop the running GPT training subprocess, if any.

    Returns:
        ``(message, update, update)`` where the updates set ``visible`` to
        True and False respectively.
    """
    global p_train_GPT
    if p_train_GPT is not None:
        try:
            kill_process(p_train_GPT.pid)
        except Exception:
            # Same best-effort handling as the list-based closers: a failed
            # kill must not leave a stale handle behind.
            traceback.print_exc()
        p_train_GPT=None
    return "已终止GPT训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
491
+
492
# Running audio-slicing worker subprocesses; empty while idle.
ps_slice=[]
def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts):
    """Slice audio with ``tools/slice_audio.py`` workers, yielding status updates.

    ``inp`` may be a file (one worker) or a directory (``n_parts`` workers).
    The slicer options are forwarded to the script as positional arguments,
    followed by the worker index and the worker count.

    The workers are started from an argument list rather than a shell command
    line: several of these values come from free-text UI fields, and
    interpolating them into a ``shell=True`` string lets a crafted value run
    arbitrary commands.

    Yields:
        ``(message, open_button_update, close_button_update)`` tuples.
    """
    global ps_slice
    inp = my_utils.clean_path(inp)
    opt_root = my_utils.clean_path(opt_root)
    if not os.path.exists(inp):
        yield "输入路径不存在",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
        return
    if os.path.isfile(inp):
        n_parts=1
    elif not os.path.isdir(inp):
        yield "输入路径存在但既不是文件也不是文件夹",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
        return
    if ps_slice:
        yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
        return

    n_parts = int(n_parts)  # slider values may arrive as floats
    for i_part in range(n_parts):
        argv = [python_exec, "tools/slice_audio.py", inp, opt_root]
        argv += [str(value) for value in (threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, n_parts)]
        print(argv)
        ps_slice.append(Popen(argv))
    yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
    for p in ps_slice:
        p.wait()
    ps_slice=[]
    yield "切割结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
518
+
519
def close_slice():
    """Kill every tracked slicing worker and clear the registry.

    Returns:
        ``(message, open_button_update, close_button_update)``.
    """
    global ps_slice
    for p_slice in ps_slice:
        try:
            kill_process(p_slice.pid)
        except Exception:
            # Best effort: log and keep killing the remaining workers.
            # (Not a bare except, so Ctrl-C / SystemExit are not swallowed.)
            traceback.print_exc()
    ps_slice=[]
    return "已终止所有切割进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
529
+
530
# Running step-1a (text processing) worker subprocesses; empty while idle.
ps1a=[]
def open1a(inp_text,inp_wav_dir,exp_name,gpu_numbers,bert_pretrained_dir):
    """Run ``prepare_datasets/1-get-text.py`` once per GPU entry and merge the results.

    ``gpu_numbers`` is a '-'-separated list; one worker is started per entry.
    Settings are exported through ``os.environ`` before each worker starts.
    After all workers exit, the per-part files ``2-name2text-<i>.txt`` are
    concatenated into ``<exp_root>/<exp_name>/2-name2text.txt`` and deleted.

    Yields:
        ``(message, update, update)`` tuples; each update is a
        ``{"__type__": "update", "visible": bool}`` dict.
    """
    global ps1a
    inp_text = my_utils.clean_path(inp_text)
    inp_wav_dir = my_utils.clean_path(inp_wav_dir)
    if ps1a:
        yield "已有正在进行的文本任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
        return

    opt_dir="%s/%s"%(exp_root,exp_name)
    config={
        "inp_text":inp_text,
        "inp_wav_dir":inp_wav_dir,
        "exp_name":exp_name,
        "opt_dir":opt_dir,
        "bert_pretrained_dir":bert_pretrained_dir,
    }
    gpu_names=gpu_numbers.split("-")
    all_parts=len(gpu_names)
    for i_part in range(all_parts):
        config.update(
            {
                "i_part": str(i_part),
                "all_parts": str(all_parts),
                "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
                "is_half": str(is_half)
            }
        )
        os.environ.update(config)
        cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py'%python_exec
        print(cmd)
        ps1a.append(Popen(cmd, shell=True))
    yield "文本进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
    for p in ps1a:
        p.wait()
    try:
        opt = []
        for i_part in range(all_parts):
            txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
            with open(txt_path, "r", encoding="utf8") as f:
                part = f.read().strip("\n")
            if part:
                # An empty part would otherwise add a blank record.
                opt += part.split("\n")
            os.remove(txt_path)
        path_text = "%s/2-name2text.txt" % opt_dir
        with open(path_text, "w", encoding="utf8") as f:
            f.write("\n".join(opt) + "\n")
    finally:
        # All workers have exited by now; clear the registry even when a part
        # file is missing, so the step can be started again.
        ps1a=[]
    yield "文本进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
576
+
577
def close1a():
    """Kill every tracked step-1a worker and clear the registry.

    Returns:
        ``(message, update, update)`` where the updates set ``visible`` to
        True and False respectively.
    """
    global ps1a
    for p1a in ps1a:
        try:
            kill_process(p1a.pid)
        except Exception:
            # Best effort: log and keep killing the remaining workers.
            # (Not a bare except, so Ctrl-C / SystemExit are not swallowed.)
            traceback.print_exc()
    ps1a=[]
    return "已终止所有1a进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
587
+
588
# Running step-1b (SSL feature extraction) worker subprocesses; empty while idle.
ps1b=[]
def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir):
    """Run ``prepare_datasets/2-get-hubert-wav32k.py`` once per GPU entry.

    ``gpu_numbers`` is a '-'-separated list; one worker is started per entry,
    with its settings exported through ``os.environ`` just before launch.
    Blocks until every worker has exited.

    Yields:
        ``(message, update, update)`` tuples; each update is a
        ``{"__type__": "update", "visible": bool}`` dict.
    """
    global ps1b
    inp_text = my_utils.clean_path(inp_text)
    inp_wav_dir = my_utils.clean_path(inp_wav_dir)
    if ps1b != []:
        yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
        return

    env_vars = {
        "inp_text": inp_text,
        "inp_wav_dir": inp_wav_dir,
        "exp_name": exp_name,
        "opt_dir": "%s/%s" % (exp_root, exp_name),
        "cnhubert_base_dir": ssl_pretrained_dir,
        "is_half": str(is_half),
    }
    devices = gpu_numbers.split("-")
    n_workers = len(devices)
    for idx, device in enumerate(devices):
        env_vars["i_part"] = str(idx)
        env_vars["all_parts"] = str(n_workers)
        env_vars["_CUDA_VISIBLE_DEVICES"] = device
        os.environ.update(env_vars)
        cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py'%python_exec
        print(cmd)
        worker = Popen(cmd, shell=True)
        ps1b.append(worker)
    yield "SSL提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
    for worker in ps1b:
        worker.wait()
    ps1b = []
    yield "SSL提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
624
+
625
def close1b():
    """Kill every tracked step-1b worker and clear the registry.

    Returns:
        ``(message, update, update)`` where the updates set ``visible`` to
        True and False respectively.
    """
    global ps1b
    for p1b in ps1b:
        try:
            kill_process(p1b.pid)
        except Exception:
            # Best effort: log and keep killing the remaining workers.
            # (Not a bare except, so Ctrl-C / SystemExit are not swallowed.)
            traceback.print_exc()
    ps1b=[]
    return "已终止所有1b进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
635
+
636
# Running step-1c (semantic token extraction) worker subprocesses; empty while idle.
ps1c=[]
def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path):
    """Run ``prepare_datasets/3-get-semantic.py`` once per GPU entry and merge the results.

    ``gpu_numbers`` is a '-'-separated list; one worker is started per entry.
    After all workers exit, the per-part files ``6-name2semantic-<i>.tsv`` are
    concatenated under an ``item_name<TAB>semantic_audio`` header line into
    ``<exp_root>/<exp_name>/6-name2semantic.tsv`` and deleted.

    Yields:
        ``(message, update, update)`` tuples; each update is a
        ``{"__type__": "update", "visible": bool}`` dict.
    """
    global ps1c
    inp_text = my_utils.clean_path(inp_text)
    if ps1c:
        yield "已有正在进行的语义token提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
        return

    opt_dir="%s/%s"%(exp_root,exp_name)
    config={
        "inp_text":inp_text,
        "exp_name":exp_name,
        "opt_dir":opt_dir,
        "pretrained_s2G":pretrained_s2G_path,
        "s2config_path":"GPT_SoVITS/configs/s2.json",
        "is_half": str(is_half)
    }
    gpu_names=gpu_numbers.split("-")
    all_parts=len(gpu_names)
    for i_part in range(all_parts):
        config.update(
            {
                "i_part": str(i_part),
                "all_parts": str(all_parts),
                "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
            }
        )
        os.environ.update(config)
        cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py'%python_exec
        print(cmd)
        ps1c.append(Popen(cmd, shell=True))
    yield "语义token提取进程执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
    for p in ps1c:
        p.wait()
    try:
        opt = ["item_name\tsemantic_audio"]
        path_semantic = "%s/6-name2semantic.tsv" % opt_dir
        for i_part in range(all_parts):
            semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
            with open(semantic_path, "r", encoding="utf8") as f:
                part = f.read().strip("\n")
            if part:
                # An empty part would otherwise add a blank record.
                opt += part.split("\n")
            os.remove(semantic_path)
        with open(path_semantic, "w", encoding="utf8") as f:
            f.write("\n".join(opt) + "\n")
    finally:
        # All workers have exited by now; clear the registry even when a part
        # file is missing, so the step can be started again.
        ps1c=[]
    yield "语义token提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False}
681
+
682
def close1c():
    """Kill every tracked step-1c worker and clear the registry.

    Returns:
        ``(message, update, update)`` where the updates set ``visible`` to
        True and False respectively.
    """
    global ps1c
    for p1c in ps1c:
        try:
            kill_process(p1c.pid)
        except Exception:
            # Best effort: log and keep killing the remaining workers.
            # (Not a bare except, so Ctrl-C / SystemExit are not swallowed.)
            traceback.print_exc()
    ps1c=[]
    return "已终止所有语义token进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
692
+ #####inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,cnhubert_base_dir,pretrained_s2G
693
# Worker subprocesses of the pipeline stage (1a, 1b or 1c) currently running.
ps1abc=[]

def _spawn_1abc_workers(config, gpu_numbers, script_name):
    """Start one prepare_datasets worker per '-'-separated GPU entry; return the worker count."""
    gpu_names=gpu_numbers.split("-")
    all_parts=len(gpu_names)
    for i_part in range(all_parts):
        config.update(
            {
                "i_part": str(i_part),
                "all_parts": str(all_parts),
                "_CUDA_VISIBLE_DEVICES": gpu_names[i_part],
            }
        )
        # Exported before launch so the child inherits these settings.
        os.environ.update(config)
        cmd = '"%s" GPT_SoVITS/prepare_datasets/%s'%(python_exec,script_name)
        print(cmd)
        ps1abc.append(Popen(cmd, shell=True))
    return all_parts

def _merge_1abc_parts(part_paths, merged_path, header_lines):
    """Concatenate the per-worker files into merged_path, deleting each part after reading it."""
    merged = list(header_lines)
    for part_path in part_paths:
        with open(part_path, "r", encoding="utf8") as f:
            part = f.read().strip("\n")
        if part:
            # An empty part would otherwise add a blank record.
            merged += part.split("\n")
        os.remove(part_path)
    with open(merged_path, "w", encoding="utf8") as f:
        f.write("\n".join(merged) + "\n")

def open1abc(inp_text,inp_wav_dir,exp_name,gpu_numbers1a,gpu_numbers1Ba,gpu_numbers1c,bert_pretrained_dir,ssl_pretrained_dir,pretrained_s2G_path):
    """Run dataset preparation steps 1a, 1b and 1c back to back, yielding progress.

    * 1a (text): skipped when ``2-name2text.txt`` already holds at least two lines.
    * 1b (SSL features): always run.
    * 1c (semantic tokens): skipped when ``6-name2semantic.tsv`` exists and is
      at least 31 bytes.

    Every stage passes ``is_half`` to its workers, matching the standalone
    ``open1b`` / ``open1c`` steps. On any error the remaining workers are
    killed via ``close1abc`` and an error status is yielded.

    Yields:
        ``(message, update, update)`` tuples; each update is a
        ``{"__type__": "update", "visible": bool}`` dict.
    """
    global ps1abc
    inp_text = my_utils.clean_path(inp_text)
    inp_wav_dir = my_utils.clean_path(inp_wav_dir)
    if ps1abc:
        yield "已有正在进行的一键三连任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
        return

    opt_dir="%s/%s"%(exp_root,exp_name)
    try:
        ############################# 1a
        path_text="%s/2-name2text.txt" % opt_dir
        need_1a = True
        if os.path.exists(path_text):
            # Read through a context manager so the handle is closed.
            with open(path_text,"r",encoding="utf8") as f:
                need_1a = len(f.read().strip("\n").split("\n"))<2
        if need_1a:
            config={
                "inp_text":inp_text,
                "inp_wav_dir":inp_wav_dir,
                "exp_name":exp_name,
                "opt_dir":opt_dir,
                "bert_pretrained_dir":bert_pretrained_dir,
                "is_half": str(is_half)
            }
            all_parts=_spawn_1abc_workers(config,gpu_numbers1a,"1-get-text.py")
            yield "进度:1a-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
            for p in ps1abc:
                p.wait()
            _merge_1abc_parts(
                ["%s/2-name2text-%s.txt" % (opt_dir, i_part) for i_part in range(all_parts)],
                path_text,
                [],
            )

        yield "进度:1a-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
        ps1abc=[]
        ############################# 1b
        config={
            "inp_text":inp_text,
            "inp_wav_dir":inp_wav_dir,
            "exp_name":exp_name,
            "opt_dir":opt_dir,
            "cnhubert_base_dir":ssl_pretrained_dir,
            # Needed when 1a was skipped and never exported it.
            "is_half": str(is_half)
        }
        _spawn_1abc_workers(config,gpu_numbers1Ba,"2-get-hubert-wav32k.py")
        yield "进度:1a-done, 1b-ing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
        for p in ps1abc:
            p.wait()
        yield "进度:1a1b-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
        ps1abc=[]
        ############################# 1c
        path_semantic = "%s/6-name2semantic.tsv" % opt_dir
        if(os.path.exists(path_semantic)==False or os.path.getsize(path_semantic)<31):
            config={
                "inp_text":inp_text,
                "exp_name":exp_name,
                "opt_dir":opt_dir,
                "pretrained_s2G":pretrained_s2G_path,
                "s2config_path":"GPT_SoVITS/configs/s2.json",
                "is_half": str(is_half)
            }
            all_parts=_spawn_1abc_workers(config,gpu_numbers1c,"3-get-semantic.py")
            yield "进度:1a1b-done, 1cing", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
            for p in ps1abc:
                p.wait()
            _merge_1abc_parts(
                ["%s/6-name2semantic-%s.tsv" % (opt_dir, i_part) for i_part in range(all_parts)],
                path_semantic,
                ["item_name\tsemantic_audio"],
            )
            yield "进度:all-done", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}
        ps1abc = []
        yield "一键三连进程结束", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
    except Exception:
        # Not a bare except: GeneratorExit (raised when the caller closes this
        # generator) must propagate; yielding after it is a RuntimeError.
        traceback.print_exc()
        close1abc()
        yield "一键三连中途报错", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
813
+
814
def close1abc():
    """Kill every tracked worker of the 1a/1b/1c pipeline and clear the registry.

    Returns:
        ``(message, update, update)`` where the updates set ``visible`` to
        True and False respectively.
    """
    global ps1abc
    for p1abc in ps1abc:
        try:
            kill_process(p1abc.pid)
        except Exception:
            # Best effort: log and keep killing the remaining workers.
            # (Not a bare except, so Ctrl-C / SystemExit are not swallowed.)
            traceback.print_exc()
    ps1abc=[]
    return "已终止所有一键三连进程", {"__type__": "update", "visible": True}, {"__type__": "update", "visible": False}
824
+
825
# Gradio UI: tab 1 builds the dataset (slice -> ASR -> label), tab 2 fine-tunes
# XTTS, tab 3 loads the fine-tuned model and synthesizes speech.
# NOTE(review): the layout nesting below is reconstructed from the statement
# order; confirm it against the rendered page.
with gr.Blocks(title="GPT-SoVITS WebUI") as app:
    gr.Markdown("# <center>🌊💕🎶 XTTS 微调:2分钟语音,开启中日英16种语言真实拟声</center>")
    gr.Markdown("## <center>🌟 只需2分钟的语音,一键在线微调 最强多语种模型</center>")
    gr.Markdown("### <center>🤗 更多精彩,尽在[滔滔AI](https://www.talktalkai.com/);滔滔AI,为爱滔滔!💕</center>")

    with gr.Tabs():
        # Pipeline: random pre-slicing (keeps UVR5 from running out of memory)
        # -> uvr5 -> slicer -> asr -> labeling.
        with gr.TabItem(i18n("1 - 制作数据集")):
            # The UVR5 section heading is disabled; its controls stay hidden.
            with gr.Row():
                if_uvr5 = gr.Checkbox(label=i18n("是否开启UVR5-WebUI"),show_label=True, visible=False)
                uvr5_info = gr.Textbox(label=i18n("UVR5进程输出信息"), visible=False)
            gr.Markdown(value=i18n("1a-语音切分工具"))
            with gr.Row():
                with gr.Row():
                    slice_inp_path=gr.Textbox(label=i18n("音频自动切分输入路径,可文件可文件夹"),info="您需要先在xtts-v2文件夹中上传训练音频,如jay.wav;音频时长建议大于2分钟",value="",placeholder="jay.wav")
                    slice_opt_root=gr.Textbox(label=i18n("切分后的子音频的输出根目录"),value="output/slicer_opt")
                    threshold=gr.Textbox(label=i18n("threshold:音量小于这个值视作静音的备选切割点"),value="-34")
                    min_length=gr.Textbox(label=i18n("min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值"),value="4000")
                    min_interval=gr.Textbox(label=i18n("min_interval:最短切割间隔"),value="300")
                    hop_size=gr.Textbox(label=i18n("hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)"),value="10")
                    max_sil_kept=gr.Textbox(label=i18n("max_sil_kept:切完后静音最多留多长"),value="500")
                with gr.Row():
                    open_slicer_button=gr.Button(i18n("1. 开启语音切割"), variant="primary",visible=True)
                    close_slicer_button=gr.Button(i18n("终止语音切割"), variant="primary",visible=False)
                    _max=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("max:归一化后最大值多少"),value=0.9,interactive=True)
                    alpha=gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("alpha_mix:混多少比例归一化后音频进来"),value=0.25,interactive=True)
                    n_process=gr.Slider(minimum=1,maximum=n_cpu,step=1,label=i18n("切割使用的进程数"),value=4,interactive=True)
                    slicer_info = gr.Textbox(label=i18n("语音切割进程输出信息"))
            # The denoise section heading is disabled; its controls stay hidden.
            with gr.Row():
                open_denoise_button = gr.Button(i18n("开启语音降噪"), visible=False)
                close_denoise_button = gr.Button(i18n("终止语音降噪进程"), variant="primary",visible=False)
                denoise_input_dir=gr.Textbox(label=i18n("降噪音频文件输入文件夹"),value="", visible=False)
                denoise_output_dir=gr.Textbox(label=i18n("降噪结果输出文件夹"),value="output/denoise_opt", visible=False)
                denoise_info = gr.Textbox(label=i18n("语音降噪进程输出信息"), visible=False)
            gr.Markdown(value=i18n("1b-批量语音识别"))
            with gr.Row():
                open_asr_button = gr.Button(i18n("2. 开启离线批量ASR"), variant="primary",visible=True)
                close_asr_button = gr.Button(i18n("终止ASR进程"), variant="primary",visible=False)
                with gr.Column():
                    with gr.Row():
                        asr_inp_dir = gr.Textbox(
                            label=i18n("输入文件夹路径"),
                            value="output/slicer_opt",
                            interactive=True,
                        )
                        asr_opt_dir = gr.Textbox(
                            label = i18n("输出文件夹路径"),
                            value = "output/asr_opt",
                            interactive = True,
                        )
                    with gr.Row():
                        asr_model = gr.Dropdown(
                            label = i18n("ASR 模型"),
                            choices = list(asr_dict.keys()),
                            interactive = True,
                            value="达摩 ASR (中文)"
                        )
                        asr_size = gr.Dropdown(
                            label = i18n("ASR 模型尺寸"),
                            choices = ["large"],
                            interactive = True,
                            value="large"
                        )
                        asr_lang = gr.Dropdown(
                            label = i18n("ASR 语言设置"),
                            choices = ["zh"],
                            interactive = True,
                            value="zh"
                        )
                        # The ASR language doubles as the training language input.
                        lang = asr_lang
                    with gr.Row():
                        asr_info = gr.Textbox(label=i18n("ASR进程输出信息"))

                def change_lang_choices(key):
                    """Offer the languages of the selected ASR model and select the first one."""
                    return {"__type__": "update", "choices": asr_dict[key]['lang'],"value":asr_dict[key]['lang'][0]}

                def change_size_choices(key):
                    """Offer the model sizes of the selected ASR model."""
                    return {"__type__": "update", "choices": asr_dict[key]['size']}

                asr_model.change(change_lang_choices, [asr_model], [asr_lang])
                asr_model.change(change_size_choices, [asr_model], [asr_size])

            gr.Markdown(value=i18n("1c-语音文本校对标注工具"))
            with gr.Row():
                if_label = gr.Checkbox(label=i18n("是否开启打标WebUI"),show_label=True)
                path_list = gr.Textbox(
                    label=i18n(".list标注文件的路径"),
                    value="output/asr_opt/slicer_opt.list",
                    interactive=True,
                )
                label_info = gr.Textbox(label=i18n("打标工具进程输出信息"))
            if_label.change(change_label, [if_label,path_list], [label_info])
            if_uvr5.change(change_uvr5, [if_uvr5], [uvr5_info])
            open_asr_button.click(open_asr, [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang], [asr_info,open_asr_button,close_asr_button])
            close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button])
            open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button])
            close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button])
            open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button])
            close_denoise_button.click(close_denoise, [], [denoise_info,open_denoise_button,close_denoise_button])

        with gr.Tab("2 - XTTS模型微调"):
            inp_list_path_value = str(Path.cwd() / "output/asr_opt/slicer_opt.list")
            out_csv_path_value = str(Path.cwd() / "output.csv")
            inp_list_path = gr.Textbox(value=inp_list_path_value, label=".list文件地址")
            out_csv_path = gr.Textbox(value=out_csv_path_value, label=".csv文件地址")
            list_to_csv = gr.Button("3. 准备训练csv文件", variant="primary")
            train_csv = gr.Textbox(
                label="训练数据集csv文件",
            )
            eval_csv = gr.Textbox(
                label="评价数据集csv文件",
            )
            list_to_csv.click(convert_list_to_csv, [inp_list_path, out_csv_path], [train_csv, eval_csv])
            out_path_value = str(Path.cwd() / "finetune_models")
            out_path = gr.Textbox(value=out_path_value, label="XTTS微调模型的文件夹")
            num_epochs = gr.Slider(
                label="训练步数 Number of epochs:",
                minimum=1,
                maximum=100,
                step=1,
                value=6,
            )
            batch_size = gr.Slider(
                label="Batch size:",
                minimum=2,
                maximum=512,
                step=1,
                value=2,
            )
            grad_acumm = gr.Slider(
                label="Grad accumulation steps:",
                minimum=1,
                maximum=128,
                step=1,
                value=1,
            )
            max_audio_length = gr.Slider(
                label="Max permitted audio size in seconds:",
                minimum=2,
                maximum=20,
                step=1,
                value=11,
                visible=False,
            )
            progress_train = gr.Label(
                label="训练进程"
            )
            logs_tts_train = gr.Textbox(
                label="训练详细信息",
                interactive=False,
            )
            # Refresh the training log box every second.
            app.load(read_logs, None, logs_tts_train, every=1)
            train_btn = gr.Button(value="4. 开始模型训练", variant="primary")

            def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length):
                """Fine-tune XTTS and return the values for the six bound outputs.

                Returns a 6-tuple in the order of ``train_btn.click(outputs=...)``:
                status message, config path, vocab path, fine-tuned checkpoint
                path, newline-joined candidate reference audios, and the
                reference audio returned by ``train_gpt``. On failure every
                slot after the message is an empty string.
                """
                import shutil  # local import: only used for the copies below

                def failure(message):
                    # Must match the number of bound outputs (six), or Gradio
                    # rejects the return value.
                    return message, "", "", "", "", ""

                print(f"开始训练,训练素材的语种为:{language}")
                clear_gpu_cache()
                if not train_csv or not eval_csv:
                    return failure("You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !")
                try:
                    # convert seconds to waveform frames
                    max_audio_length = int(max_audio_length * 22050)
                    config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length)

                    # Copy the original files next to the checkpoint to avoid
                    # parameter-change issues. shutil.copy handles paths with
                    # spaces and does not depend on a POSIX `cp`.
                    for original_file in (config_path, vocab_file):
                        try:
                            shutil.copy(original_file, exp_path)
                        except shutil.SameFileError:
                            pass  # already in place
                except Exception:
                    traceback.print_exc()
                    error = traceback.format_exc()
                    return failure(f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}")

                ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
                print("模型已成功微调!")
                clear_gpu_cache()
                # NOTE(review): the reference-audio directory is hard-coded.
                ref_audio_names = os.listdir("dingzhen/dingzhen")
                ref_audio_list = [os.path.join("dingzhen/dingzhen", ref_audio_name) for ref_audio_name in ref_audio_names]
                # Despite its name, this keeps up to eight entries.
                first_five_ref_audio = "\n".join(ref_audio_list[0:8])
                return "模型已成功微调!", config_path, vocab_file, ft_xtts_checkpoint, first_five_ref_audio, speaker_wav

        with gr.Tab("3 - XTTS语音合成"):
            with gr.Row():
                with gr.Column() as col1:
                    xtts_checkpoint = gr.Textbox(
                        label="XTTS checkpoint 路径",
                        value="",
                    )
                    xtts_config = gr.Textbox(
                        label="XTTS config 路径",
                        value="",
                    )

                    xtts_vocab = gr.Textbox(
                        label="XTTS vocab 路径",
                        value="",
                    )
                    progress_load = gr.Label(
                        label="模型加载进程"
                    )
                    load_btn = gr.Button(value="5. 加载已训练好的模型", variant="primary")

                with gr.Column() as col2:
                    first_five_speaker_reference_audio = gr.Textbox(label="您可以选用的参考音频", visible=True, interactive=True)
                    speaker_reference_audio = gr.Textbox(
                        label="您正在使用的参考音频",
                        info="不同参考音频对应的合成效果不同。您可以尝试多次,每次填写一条音频路径",
                        value="",
                    )
                    tts_text = gr.Textbox(
                        label="请填写语音合成的文本🍻",
                        placeholder="想说却还没说的,还很多",
                    )
                    tts_language = gr.Dropdown(
                        label="请选择文本对应的语言",
                        value="zh",
                        choices=[
                            "en",
                            "es",
                            "fr",
                            "de",
                            "it",
                            "pt",
                            "pl",
                            "tr",
                            "ru",
                            "nl",
                            "cs",
                            "ar",
                            "zh",
                            "hu",
                            "ko",
                            "ja",
                        ]
                    )

                    tts_btn = gr.Button(value="6. 开启AI语音之旅吧💕", variant="primary")

                with gr.Column() as col3:
                    progress_gen = gr.Label(
                        label="语音合成进程"
                    )
                    tts_output_audio = gr.Audio(label="为您合成的专属音频🎶")
                    reference_audio = gr.Audio(label="您使用的参考音频")

            train_btn.click(
                fn=train_model,
                inputs=[
                    lang,
                    train_csv,
                    eval_csv,
                    num_epochs,
                    batch_size,
                    grad_acumm,
                    out_path,
                    max_audio_length,
                ],
                outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, first_five_speaker_reference_audio, speaker_reference_audio],
            )

            load_btn.click(
                fn=load_model,
                inputs=[
                    xtts_checkpoint,
                    xtts_config,
                    xtts_vocab
                ],
                outputs=[progress_load],
            )

            tts_btn.click(
                fn=run_tts,
                inputs=[
                    tts_language,
                    tts_text,
                    speaker_reference_audio,
                ],
                outputs=[progress_gen, tts_output_audio, reference_audio],
            )

    gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。请自觉合规使用此程序,程序开发者不负有任何责任。</center>")
    gr.HTML('''
    <div class="footer">
    <p>🌊🏞️🎶 - 江水东流急,滔滔无尽声。 明·顾璘
    </p>
    </div>
    ''')
    app.queue().launch(
        share=True,
        show_error=True,
    )
finetune_models/run/training/XTTS_v2.0_original_model_files/config.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "output",
3
+ "logger_uri": null,
4
+ "run_name": "run",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": null,
14
+ "save_step": 10000,
15
+ "save_n_checkpoints": 5,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 10000,
19
+ "target_loss": null,
20
+ "print_eval": false,
21
+ "test_delay_epochs": 0,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 1000,
29
+ "batch_size": 32,
30
+ "eval_batch_size": 16,
31
+ "grad_clip": 0.0,
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.001,
34
+ "optimizer": "radam",
35
+ "optimizer_params": null,
36
+ "lr_scheduler": null,
37
+ "lr_scheduler_params": {},
38
+ "use_grad_scaler": false,
39
+ "allow_tf32": false,
40
+ "cudnn_enable": true,
41
+ "cudnn_deterministic": false,
42
+ "cudnn_benchmark": false,
43
+ "training_seed": 54321,
44
+ "model": "xtts",
45
+ "num_loader_workers": 0,
46
+ "num_eval_loader_workers": 0,
47
+ "use_noise_augment": false,
48
+ "audio": {
49
+ "sample_rate": 22050,
50
+ "output_sample_rate": 24000
51
+ },
52
+ "use_phonemes": false,
53
+ "phonemizer": null,
54
+ "phoneme_language": null,
55
+ "compute_input_seq_cache": false,
56
+ "text_cleaner": null,
57
+ "enable_eos_bos_chars": false,
58
+ "test_sentences_file": "",
59
+ "phoneme_cache_path": null,
60
+ "characters": null,
61
+ "add_blank": false,
62
+ "batch_group_size": 0,
63
+ "loss_masking": null,
64
+ "min_audio_len": 1,
65
+ "max_audio_len": Infinity,
66
+ "min_text_len": 1,
67
+ "max_text_len": Infinity,
68
+ "compute_f0": false,
69
+ "compute_energy": false,
70
+ "compute_linear_spec": false,
71
+ "precompute_num_workers": 0,
72
+ "start_by_longest": false,
73
+ "shuffle": false,
74
+ "drop_last": false,
75
+ "datasets": [
76
+ {
77
+ "formatter": "",
78
+ "dataset_name": "",
79
+ "path": "",
80
+ "meta_file_train": "",
81
+ "ignored_speakers": null,
82
+ "language": "",
83
+ "phonemizer": "",
84
+ "meta_file_val": "",
85
+ "meta_file_attn_mask": ""
86
+ }
87
+ ],
88
+ "test_sentences": [],
89
+ "eval_split_max_size": null,
90
+ "eval_split_size": 0.01,
91
+ "use_speaker_weighted_sampler": false,
92
+ "speaker_weighted_sampler_alpha": 1.0,
93
+ "use_language_weighted_sampler": false,
94
+ "language_weighted_sampler_alpha": 1.0,
95
+ "use_length_weighted_sampler": false,
96
+ "length_weighted_sampler_alpha": 1.0,
97
+ "model_args": {
98
+ "gpt_batch_size": 1,
99
+ "enable_redaction": false,
100
+ "kv_cache": true,
101
+ "gpt_checkpoint": null,
102
+ "clvp_checkpoint": null,
103
+ "decoder_checkpoint": null,
104
+ "num_chars": 255,
105
+ "tokenizer_file": "",
106
+ "gpt_max_audio_tokens": 605,
107
+ "gpt_max_text_tokens": 402,
108
+ "gpt_max_prompt_tokens": 70,
109
+ "gpt_layers": 30,
110
+ "gpt_n_model_channels": 1024,
111
+ "gpt_n_heads": 16,
112
+ "gpt_number_text_tokens": 6681,
113
+ "gpt_start_text_token": null,
114
+ "gpt_stop_text_token": null,
115
+ "gpt_num_audio_tokens": 1026,
116
+ "gpt_start_audio_token": 1024,
117
+ "gpt_stop_audio_token": 1025,
118
+ "gpt_code_stride_len": 1024,
119
+ "gpt_use_masking_gt_prompt_approach": true,
120
+ "gpt_use_perceiver_resampler": true,
121
+ "input_sample_rate": 22050,
122
+ "output_sample_rate": 24000,
123
+ "output_hop_length": 256,
124
+ "decoder_input_dim": 1024,
125
+ "d_vector_dim": 512,
126
+ "cond_d_vector_in_each_upsampling_layer": true,
127
+ "duration_const": 102400
128
+ },
129
+ "model_dir": null,
130
+ "languages": [
131
+ "en",
132
+ "es",
133
+ "fr",
134
+ "de",
135
+ "it",
136
+ "pt",
137
+ "pl",
138
+ "tr",
139
+ "ru",
140
+ "nl",
141
+ "cs",
142
+ "ar",
143
+ "zh-cn",
144
+ "hu",
145
+ "ko",
146
+ "ja",
147
+ "hi"
148
+ ],
149
+ "temperature": 0.75,
150
+ "length_penalty": 1.0,
151
+ "repetition_penalty": 5.0,
152
+ "top_k": 50,
153
+ "top_p": 0.85,
154
+ "num_gpt_outputs": 1,
155
+ "gpt_cond_len": 30,
156
+ "gpt_cond_chunk_len": 4,
157
+ "max_ref_len": 30,
158
+ "sound_norm_refs": false
159
+ }
finetune_models/run/training/XTTS_v2.0_original_model_files/dvae.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b29bc227d410d4991e0a8c09b858f77415013eeb9fba9650258e96095557d97a
3
+ size 210514388
finetune_models/run/training/XTTS_v2.0_original_model_files/mel_stats.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f69422a8a8f344c4fca2f0c6b8d41d2151d6615b7321e48e6bb15ae949b119c
3
+ size 1067
finetune_models/run/training/XTTS_v2.0_original_model_files/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7ea20001c6a0a841c77e252d8409f6a74fb423e79b3206a0771ba5989776187
3
+ size 1867929118
finetune_models/run/training/XTTS_v2.0_original_model_files/vocab.json ADDED
The diff for this file is too large to render. See raw diff