Respair committed
Commit eba1aea
1 Parent(s): 3db5663

Upload folder using huggingface_hub

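The commit message above points to huggingface_hub's folder-upload helper. As a rough sketch of how such a push is typically issued (the folder path and repo id below are hypothetical placeholders, not values taken from this commit):

```python
# Minimal sketch of pushing a local folder with huggingface_hub.
# folder_path and repo_id are hypothetical placeholders; repo_type is an assumption.
from huggingface_hub import HfApi

api = HfApi()  # assumes prior authentication, e.g. via `huggingface-cli login`
api.upload_folder(
    folder_path="./Tsukasa_Speech",      # local folder to push (placeholder)
    repo_id="Respair/Tsukasa_Speech",    # destination repo (placeholder)
    repo_type="space",                   # assumption: could also be "model"
    commit_message="Upload folder using huggingface_hub",
)
```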
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ reference_sample_wavs/01008270.wav filter=lfs diff=lfs merge=lfs -text
+ reference_sample_wavs/kaede_san.wav filter=lfs diff=lfs merge=lfs -text
+ reference_sample_wavs/riamu_zeroshot_02.wav filter=lfs diff=lfs merge=lfs -text
+ reference_sample_wavs/sample_ref01.wav filter=lfs diff=lfs merge=lfs -text
+ reference_sample_wavs/sample_ref02.wav filter=lfs diff=lfs merge=lfs -text
+ reference_sample_wavs/shiki_fine05.wav filter=lfs diff=lfs merge=lfs -text
+ reference_sample_wavs/syuukovoice_200918_3_01.wav filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
Configs/config.yml ADDED
@@ -0,0 +1,116 @@
+ log_dir: "Models/LJSpeech"
+ first_stage_path: "first_stage.pth"
+ save_freq: 2
+ log_interval: 10
+ device: "cuda"
+ epochs_1st: 200 # number of epochs for first stage training (pre-training)
+ epochs_2nd: 100 # number of epochs for second stage training (joint training)
+ batch_size: 16
+ max_len: 400 # maximum number of frames
+ pretrained_model: ""
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for the 2nd stage
+ load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
+
+ F0_path: "Utils/JDC/bst.t7"
+ ASR_config: "Utils/ASR/config.yml"
+ ASR_path: "Utils/ASR/epoch_00080.pth"
+ PLBERT_dir: 'Utils/PLBERT/'
+
+ data_params:
+   train_data: "Data/train_list.txt"
+   val_data: "Data/val_list.txt"
+   root_path: "/local/LJSpeech-1.1/wavs"
+   OOD_data: "Data/OOD_texts.txt"
+   min_length: 50 # sample OOD texts until one of at least this length is obtained
+
+ preprocess_params:
+   sr: 24000
+   spect_params:
+     n_fft: 2048
+     win_length: 1200
+     hop_length: 300
+
+ model_params:
+   multispeaker: false
+
+   dim_in: 64
+   hidden_dim: 512
+   max_conv_dim: 512
+   n_layer: 3
+   n_mels: 80
+
+   n_token: 178 # number of phoneme tokens
+   max_dur: 50 # maximum duration of a single phoneme
+   style_dim: 128 # style vector size
+
+   dropout: 0.2
+
+   # config for decoder
+   decoder:
+     type: 'istftnet' # either hifigan or istftnet
+     resblock_kernel_sizes: [3,7,11]
+     upsample_rates: [10, 6]
+     upsample_initial_channel: 512
+     resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
+     upsample_kernel_sizes: [20, 12]
+     gen_istft_n_fft: 20
+     gen_istft_hop_size: 5
+
+   # speech language model config
+   slm:
+     model: 'microsoft/wavlm-base-plus'
+     sr: 16000 # sampling rate of SLM
+     hidden: 768 # hidden size of SLM
+     nlayers: 13 # number of layers of SLM
+     initial_channel: 64 # initial channels of SLM discriminator head
+
+   # style diffusion model config
+   diffusion:
+     embedding_mask_proba: 0.1
+     # transformer config
+     transformer:
+       num_layers: 3
+       num_heads: 8
+       head_features: 64
+       multiplier: 2
+
+     # diffusion distribution config
+     dist:
+       sigma_data: 0.2 # placeholder value, used when estimate_sigma_data is set to false
+       estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
+       mean: -3.0
+       std: 1.0
+
+ loss_params:
+   lambda_mel: 5. # mel reconstruction loss
+   lambda_gen: 1. # generator loss
+   lambda_slm: 1. # slm feature matching loss
+
+   lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
+   lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
+   TMA_epoch: 50 # TMA starting epoch (1st stage)
+
+   lambda_F0: 1. # F0 reconstruction loss (2nd stage)
+   lambda_norm: 1. # norm reconstruction loss (2nd stage)
+   lambda_dur: 1. # duration loss (2nd stage)
+   lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
+   lambda_sty: 1. # style reconstruction loss (2nd stage)
+   lambda_diff: 1. # score matching loss (2nd stage)
+
+   diff_epoch: 20 # style diffusion starting epoch (2nd stage)
+   joint_epoch: 50 # joint training starting epoch (2nd stage)
+
+ optimizer_params:
+   lr: 0.0001 # general learning rate
+   bert_lr: 0.00001 # learning rate for PLBERT
+   ft_lr: 0.00001 # learning rate for acoustic modules
+
+ slmadv_params:
+   min_len: 400 # minimum length of samples
+   max_len: 500 # maximum length of samples
+   batch_percentage: 0.5 # to prevent out-of-memory, only use half of the original batch size
+   iter: 10 # update the discriminator once every this many generator updates
+   thresh: 5 # gradient norm above which the gradient is scaled
+   scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
+   sig: 1.5 # sigma for differentiable duration modeling
+
Configs/config_ft.yml ADDED
@@ -0,0 +1,116 @@
+ log_dir: "Models/IMAS_FineTuned"
+ save_freq: 1
+ log_interval: 10
+ device: "cuda"
+ epochs: 50 # number of finetuning epochs (1 hour of data)
+ batch_size: 3
+ max_len: 2500 # maximum number of frames
+ pretrained_model: "/home/austin/disk2/llmvcs/tt/stylekan/Models/Style_Kanade/NO_SLM_3_epoch_2nd_00002.pth"
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for the 2nd stage
+ load_only_params: true # set to true if you do not want to load epoch numbers and optimizer parameters
+
+ F0_path: "/home/austin/disk2/llmvcs/tt/stylekan/Utils/JDC/bst.t7"
+ ASR_config: "/home/austin/disk2/llmvcs/tt/stylekan/Utils/ASR/config.yml"
+ ASR_path: "/home/austin/disk2/llmvcs/tt/stylekan/Utils/ASR/bst_00080.pth"
+
+ PLBERT_dir: 'Utils/PLBERT/'
+
+ data_params:
+   train_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/metadata_cleanest/FT_imas.csv"
+   val_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/metadata_cleanest/FT_imas_valid.csv"
+   root_path: ""
+   OOD_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/OOD_LargeScale_.csv"
+   min_length: 50 # sample OOD texts until one of at least this length is obtained
+
+
+ preprocess_params:
+   sr: 24000
+   spect_params:
+     n_fft: 2048
+     win_length: 1200
+     hop_length: 300
+
+ model_params:
+   multispeaker: true
+
+   dim_in: 64
+   hidden_dim: 512
+   max_conv_dim: 512
+   n_layer: 3
+   n_mels: 80
+
+   n_token: 178 # number of phoneme tokens
+   max_dur: 50 # maximum duration of a single phoneme
+   style_dim: 128 # style vector size
+
+   dropout: 0.2
+
+   decoder:
+     type: 'istftnet' # either hifigan or istftnet
+     resblock_kernel_sizes: [3,7,11]
+     upsample_rates: [10, 6]
+     upsample_initial_channel: 512
+     resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
+     upsample_kernel_sizes: [20, 12]
+     gen_istft_n_fft: 20
+     gen_istft_hop_size: 5
+
+
+
+   # speech language model config
+   slm:
+     model: 'Respair/Whisper_Large_v2_Encoder_Block' # the model itself is hardcoded; change it through losses.py
+     sr: 16000 # sampling rate of SLM
+     hidden: 1280 # hidden size of SLM
+     nlayers: 33 # number of layers of SLM
+     initial_channel: 64 # initial channels of SLM discriminator head
+
+   # style diffusion model config
+   diffusion:
+     embedding_mask_proba: 0.1
+     # transformer config
+     transformer:
+       num_layers: 3
+       num_heads: 8
+       head_features: 64
+       multiplier: 2
+
+     # diffusion distribution config
+     dist:
+       sigma_data: 0.2 # placeholder value, used when estimate_sigma_data is set to false
+       estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
+       mean: -3.0
+       std: 1.0
+
+ loss_params:
+   lambda_mel: 10. # mel reconstruction loss
+   lambda_gen: 1. # generator loss
+   lambda_slm: 1. # slm feature matching loss
+
+   lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
+   lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
+   TMA_epoch: 9 # TMA starting epoch (1st stage)
+
+   lambda_F0: 1. # F0 reconstruction loss (2nd stage)
+   lambda_norm: 1. # norm reconstruction loss (2nd stage)
+   lambda_dur: 1. # duration loss (2nd stage)
+   lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
+   lambda_sty: 1. # style reconstruction loss (2nd stage)
+   lambda_diff: 1. # score matching loss (2nd stage)
+
+   diff_epoch: 0 # style diffusion starting epoch (2nd stage)
+   joint_epoch: 30 # joint training starting epoch (2nd stage)
+
+ optimizer_params:
+   lr: 0.0001 # general learning rate
+   bert_lr: 0.00001 # learning rate for PLBERT
+   ft_lr: 0.00001 # learning rate for acoustic modules
+
+ slmadv_params:
+   min_len: 400 # minimum length of samples
+   max_len: 500 # maximum length of samples
+   batch_percentage: 0.5 # to prevent out-of-memory, only use half of the original batch size
+   iter: 20 # update the discriminator once every this many generator updates
+   thresh: 5 # gradient norm above which the gradient is scaled
+   scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
+   sig: 1.5 # sigma for differentiable duration modeling
Configs/config_kanade.yml ADDED
@@ -0,0 +1,118 @@
+ log_dir: "Models/Style_Kanade_v02"
+ first_stage_path: ""
+ save_freq: 1
+ log_interval: 10
+ device: "cuda"
+ epochs_1st: 30 # number of epochs for first stage training (pre-training)
+ epochs_2nd: 20 # number of epochs for second stage training (joint training)
+ batch_size: 64
+ max_len: 560 # maximum number of frames
+ pretrained_model: "Models/Style_Kanade_v02/epoch_2nd_00007.pth"
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for the 2nd stage
+ load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
+
+ F0_path: "Utils/JDC/bst.t7"
+ ASR_config: "Utils/ASR/config.yml"
+ ASR_path: "Utils/ASR/bst_00080.pth"
+
+ PLBERT_dir: 'Utils/PLBERT/'
+
+ data_params:
+   train_data: "Data/metadata_cleanest/DATA.csv"
+   val_data: "Data/VALID.txt"
+   root_path: ""
+   OOD_data: "Data/OOD_LargeScale_.csv"
+   min_length: 50 # sample OOD texts until one of at least this length is obtained
+
+
+ preprocess_params:
+   sr: 24000
+   spect_params:
+     n_fft: 2048
+     win_length: 1200
+     hop_length: 300
+
+ model_params:
+   multispeaker: true
+
+   dim_in: 64
+   hidden_dim: 512
+   max_conv_dim: 512
+   n_layer: 3
+   n_mels: 80
+
+   n_token: 178 # number of phoneme tokens
+   max_dur: 50 # maximum duration of a single phoneme
+   style_dim: 128 # style vector size
+
+   dropout: 0.2
+
+   decoder:
+     type: 'istftnet' # either hifigan or istftnet
+     resblock_kernel_sizes: [3,7,11]
+     upsample_rates: [10, 6]
+     upsample_initial_channel: 512
+     resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
+     upsample_kernel_sizes: [20, 12]
+     gen_istft_n_fft: 20
+     gen_istft_hop_size: 5
+
+
+
+   # speech language model config
+   slm:
+     model: 'Respair/Whisper_Large_v2_Encoder_Block' # the model itself is hardcoded; change it through losses.py
+     sr: 16000 # sampling rate of SLM
+     hidden: 1280 # hidden size of SLM
+     nlayers: 33 # number of layers of SLM
+     initial_channel: 64 # initial channels of SLM discriminator head
+
+   # style diffusion model config
+   diffusion:
+     embedding_mask_proba: 0.1
+     # transformer config
+     transformer:
+       num_layers: 3
+       num_heads: 8
+       head_features: 64
+       multiplier: 2
+
+     # diffusion distribution config
+     dist:
+       sigma_data: 0.2 # placeholder value, used when estimate_sigma_data is set to false
+       estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
+       mean: -3.0
+       std: 1.0
+
+ loss_params:
+   lambda_mel: 10. # mel reconstruction loss
+   lambda_gen: 1. # generator loss
+   lambda_slm: 1. # slm feature matching loss
+
+   lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
+   lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
+   TMA_epoch: 5 # TMA starting epoch (1st stage)
+
+   lambda_F0: 1. # F0 reconstruction loss (2nd stage)
+   lambda_norm: 1. # norm reconstruction loss (2nd stage)
+   lambda_dur: 1. # duration loss (2nd stage)
+   lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
+   lambda_sty: 1. # style reconstruction loss (2nd stage)
+   lambda_diff: 1. # score matching loss (2nd stage)
+
+   diff_epoch: 4 # style diffusion starting epoch (2nd stage)
+   joint_epoch: 999 # joint training starting epoch (2nd stage)
+
+ optimizer_params:
+   lr: 0.0001 # general learning rate
+   bert_lr: 0.00001 # learning rate for PLBERT
+   ft_lr: 0.00001 # learning rate for acoustic modules
+
+ slmadv_params:
+   min_len: 400 # minimum length of samples
+   max_len: 500 # maximum length of samples
+   batch_percentage: 0.5 # to prevent out-of-memory, only use half of the original batch size
+   iter: 20 # update the discriminator once every this many generator updates
+   thresh: 5 # gradient norm above which the gradient is scaled
+   scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
+   sig: 1.5 # sigma for differentiable duration modeling
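The three Configs/*.yml files above are plain YAML; a minimal sketch of reading one of them with PyYAML (how the values are consumed by the training and inference code is not shown in this diff):

```python
# Minimal sketch: load Configs/config_kanade.yml into nested dicts with PyYAML.
import yaml

with open("Configs/config_kanade.yml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Nested sections come back as plain dicts and lists.
print(config["model_params"]["decoder"]["type"])    # 'istftnet'
print(config["model_params"]["slm"]["model"])       # 'Respair/Whisper_Large_v2_Encoder_Block'
print(config["preprocess_params"]["spect_params"])  # {'n_fft': 2048, 'win_length': 1200, 'hop_length': 300}
```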
Inference/infer_24khz_mod.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Inference/input_for_prompt.txt ADDED
@@ -0,0 +1,4 @@
+ この俺に何度も同じことを説明させるな!! お前たちは俺の忠告を完全に無視して、とんでもない結果を招いてしまった。これが最後の警告だ。次は絶対に許さないぞ!
+ 時には、静けさの中にこそ、本当の答えが見つかるものですね。慌てる必要はないのです。
+ 人生には、表現しきれないほどの驚きがあると思うよ。それは、目には見えない力で、人々を繋ぐ不思議な絆だ。私は、その驚きを胸に秘め、日々を楽しく過ごしているんだ。言葉を伝えるたびに、未来への期待を込めて、元気に話す。それは、夢を叶えるための魔法のようなものだ。
+ かなたの次元より迫り来る混沌の使者たちよ、貴様らの野望を我が焔の業火で焼き尽くす! 運命の歯車は、我が意思と共にすでに動き出したのだ。我が宿命の敵に立ち向かうため、禁断の呪文を紡ぐ時は今ここに訪れる。さあ、見るがよい。我が力を!!
Inference/prompt.txt ADDED
@@ -0,0 +1,4 @@
+ A male voice that resonates with deep, thunderous intensity. His rapid-fire words slam like aggressive drumbeats, each syllable charged with intense rage. The expressive tone fluctuates between restrained fury and explosive outbursts.
+ A female voice with a distinctively deep, low pitch that commands attention. Her slightly monotone delivery creates an air of composure and gravitas, while maintaining a calm, measured pace. Her voice carries a soothing weight, like gentle thunder in the distance, making her words feel grounded and reassuring.
+ a female voice that is gentle and soft, with a slightly high pitch that adds a comforting warmth to her words. Her tone is moderate, neither too expressive nor too flat, creating a balanced and soothing atmosphere. The slow speed of her speech gives her words a deliberate and thoughtful cadence, allowing each phrase to resonate fully. There's a sense of wonder and optimism in her voice, as if she is constantly marveling at the mysteries of life. Her gentle demeanor and soft delivery make her sound approachable and kind, inviting listeners to share in her sense of wonder.
+ A female voice that resonates with deep intensity and a distinctly low pitch. Her words flow with the force and rhythm of a relentless tide, each syllable weighted with profound determination. The expressive tone navigates between measured intensity and powerful surges.
Inference/random_texts.txt ADDED
@@ -0,0 +1,14 @@
+ Akashi: 不思議な人ですね、レザさんは。たまには子供扱いしてくれてちょっとむきになりますけど、とても頼りがいのある人だと思いますよ?
+ Kimiji: 人生は、果てしない探求の旅のようなもの。私たちは、自分自身や周囲の世界について、常に新しい発見をしていく。それは、時として喜びをもたらすこともあれば、困難に直面することもある。しかしそれら全てが、自分を形作る貴重な経験である。
+ Reira: 私に何度も同じことを説明させないでよ! お前たちは私の忠告を完全に無視して、とんでもない結果を招いてしまった!! これが最後の警告だ。次は絶対に許さないぞ! ------------------------------------------- (NOTE: enable the diffusion, then set the Intensity to 3, also remove this line!)
+ Yoriko: この世には、言葉にできない悲しみがある。 それは、胸の奥に沈んでいくような重さで、時間が経つにつれて、じわじわと広がっていく。私は、その悲しみを抱えながら、日々を過ごしていいた。 言葉を発するたびに、心の中で何度も繰り返し、慎重に選び抜いている。それは、痛みを和らげるための儀式のようなものだ.
+ Kimiji: 人生には、表現しきれないほどの驚きがあると思います。それは、目には見えない力で、人々を繋ぐ不思議な絆です。私は、その驚きを胸に秘め、日々を楽しく過ごしています。言葉を伝えるたびに、未来への期待を込めて、元気に話す。それは、夢を叶えるための魔法のようなものです。
+ Teppei: そうだな、この新しいシステムの仕組みについて説明しておこう。基本的には三層構造になっていて、各層が独立して機能している。一番下のレイヤーでデータの処理と保存を行い、真ん中の層でビジネスロジックを実装している。ユーザーが直接触れるのは最上位層だけで、そこでインターフェースの制御をしているんだ。パフォーマンスを考えると、データベースへのアクセスを最小限に抑える必要があるから、キャッシュの実装も検討している。ただ、システムの規模を考えると、当面は現状の構成で十分だと思われる。将来的にスケールする必要が出てきた場合は、その時点で見直しを検討すればいいだろう。
+ Kirara: べ、別にそんなことじゃないってば!あんたのことなんて全然気にしてないんだからね!
+ Maiko: ねえ、ちょっと今日の空を見上げて。朝から少しずつ変わっていく雲の形が、まるで漫画の中の風景みたい。東の方からゆっくりと暖かい風が吹いてきて、桜の花びらが舞い散るように、優しく大地を撫でていくの。春の陽気が徐々に夏の暑さに変わっていくこの季節は、なんだかわくわくするよね。でも、週間予報によると、明後日あたりから天気が崩れるみたいで、しばらくは傘の出番かもしれないんだ。梅雨の時期が近づいているから、空気も少しずつ湿っぽくなってきているのを感じない?でも、雨上がりの空気って、なんだか特別な匂いがして、私、結構好きなんだよね。
+ Amane: そ、そうかな?あなたと過ごした今までの時間は、私に、とても、とーっても大事だよ?だって、あなたの言葉には、どこか特別な温もりがあるような気がする。まるで、心の中に直接響いてくるようで、ほんの少しの間だけ、世界が優しくなった気がするよ。
+ Shioji: つい昨日のことみたいなのに、もう随分と昔のお話なんだね ! 今でも古い本の匂いがすると、あの静かで穏やかな時間がまるで透明で綺麗な海みたいに心の中でふわりと溶けていくの。
+ Riemi: かなたの次元より迫り来る混沌の使者たちよ、貴様らの野望を我が焔の業火で焼き尽くす! 運命の歯車は、我が意思と共にすでに動き出したのだ。我が宿命の敵に立ち向かうため、禁断の呪文を紡ぐ時は今ここに訪れる。さあ、見るがよい。我が力を!!
+ だめですよ?そんなことは。もっと慎重に行動してくださいね。
+ Kazunari: ojousama wa, katachi dake no tsukibito ga ireba sore de ii, to omotte orareru you desuga, katachi bakari de mo, san nen ka o tomo ni seikatsu o suru koto ni narimasɯ. --------------------------------------- (NOTE!: your spacing will impact the intonations. also remove this line!)
+ Homa: 君は muzukashiku 考えると kekka to shite 空回り suru taipu dakara na. 大体, sonna kanji de oboete ikeba iin da. -------------------------------------------------- (NOTE!: your spacing will impact the intonations. also remove this line!)
LICENSE ADDED
@@ -0,0 +1,25 @@
+ LICENSE FOR STYLETTS2 DEMO PAGE: GPL (I KNOW I DON'T LIKE GPL BUT I HAVE TO BECAUSE OF PHONEMIZER REQUIREMENT)
+
+
+ styletts 2 license:
+
+ Copyright (c) 2023 Aaron (Yinghao) Li
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
__pycache__/cotlet_phon.cpython-311.pyc ADDED
Binary file (5.58 kB).
 
__pycache__/cotlet_utils.cpython-311.pyc ADDED
Binary file (25.1 kB).
 
__pycache__/importable.cpython-311.pyc ADDED
Binary file (21.5 kB).
 
__pycache__/models.cpython-311.pyc ADDED
Binary file (52.6 kB).
 
__pycache__/text_utils.cpython-311.pyc ADDED
Binary file (1.85 kB).
 
__pycache__/utils.cpython-311.pyc ADDED
Binary file (5.8 kB).
 
app_tsumugi_remote.py ADDED
@@ -0,0 +1,498 @@
+ INTROTXT = """#
+ Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/Project_Kanade_SpeechModel)
+ This space uses Tsukasa (24khz).
+ **Check the Read me tabs down below.** <br>
+ Enjoy!
+ """
+ import gradio as gr
+ import random
+ import torch
+ import os
+ import pickle
+ from gradio_client import Client
+ client = Client('https://9155368f00f8a50bc7.gradio.live')
+
+ voices = {}
+ example_texts = {}
+ prompts = []
+ inputs = []
+
+
+ theme = gr.themes.Base(
+     font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
+ )
+
+ voicelist = [v for v in os.listdir("/home/ubuntu/Kanade_Project/gradio/Tsukasa_Speech/reference_sample_wavs")]
+
+ for v in voicelist:
+     voices[v] = f'reference_sample_wavs/{v}'
+
+
+
+ with open('Inference/random_texts.txt', 'r') as r:
+     random_texts = [line.strip() for line in r]
+
+ example_texts = {f"{text[:30]}...": text for text in random_texts}
+
+ def update_text_input(preview):
+
+     return example_texts[preview]
+
+ def get_random_text():
+     return random.choice(random_texts)
+
+
+ with open('Inference/prompt.txt', 'r') as p:
+     prompts = [line.strip() for line in p]
+
+ with open('Inference/input_for_prompt.txt', 'r') as i:
+     inputs = [line.strip() for line in i]
+
+
+ last_idx = None
+
+ def get_random_prompt_pair():
+     global last_idx
+     max_idx = min(len(prompts), len(inputs)) - 1
+
+
+     random_idx = random.randint(0, max_idx)
+     while random_idx == last_idx:
+         random_idx = random.randint(0, max_idx)
+
+     last_idx = random_idx
+     return inputs[random_idx], prompts[random_idx]
+
+
+ def Synthesize_Audio(text, voice=None, voice2=None, vcsteps=2, embscale=1, alpha=.4, beta=.4, ros=.1):
+     # Wrap the uploaded audio path as Gradio FileData so the remote endpoint accepts it
+     if voice2 is not None:
+         voice2 = {"path": voice2, "meta": {"_type": "gradio.FileData"}}
+
+
+     # Call the Gradio endpoint through the client with the appropriate api_name
+     result = client.predict(
+         text,
+         voice,
+         voice2,
+         vcsteps,
+         embscale,
+         alpha,
+         beta,
+         ros,
+         api_name="/Synthesize_Audio"
+     )
+     return result
+
+ # Example usage
+
+ def LongformSynth_Text(text, s_prev, Kotodama, alpha, beta, t, diffusion_steps, embedding_scale, rate_of_speech):
+
+     result = client.predict(
+         text,
+         alpha,
+         beta,
+         diffusion_steps,
+         embedding_scale,
+         rate_of_speech,
+         api_name="/LongformSynth_Text"
+     )
+     return result
+
+
+
+ def Inference_Synth_Prompt(text, description, Kotodama, alpha, beta, diffusion_steps, embedding_scale, rate_of_speech):
+
+     result = client.predict(
+         text,
+         description,
+         alpha,
+         beta,
+         diffusion_steps,
+         embedding_scale,
+         rate_of_speech,
+         api_name="/Inference_Synth_Prompt"
+     )
+     return result
+
+
+ with gr.Blocks() as audio_inf:
+     with gr.Row():
+         with gr.Column(scale=1):
+             inp = gr.Textbox(label="Text", info="Enter the text", value="きみの存在は、私の心の中で燃える小さな光のよう。きみがいない時、世界は白黒の写真みたいに寂しくて、何も輝いてない。きみの笑顔だけが、私の灰色の日々に色を塗ってくれる。離れてる時間は、めちゃくちゃ長く感じられて、きみへの想いは風船みたいにどんどん膨らんでいく。きみなしの世界なんて、想像できないよ。", interactive=True, scale=5)
+             voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value=voicelist[5], interactive=True)
+             voice_2 = gr.Audio(label="Upload your own Audio", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
+
+             with gr.Accordion("Advanced Parameters", open=False):
+
+                 alpha = gr.Slider(minimum=0, maximum=1, value=0.0, step=0.1, label="Alpha", info="a Diffusion sampler parameter handling the timbre, higher means less affected by the reference | 0 = diffusion is disabled", interactive=True)
+                 beta = gr.Slider(minimum=0, maximum=1, value=0.0, step=0.1, label="Beta", info="a Diffusion sampler parameter, higher means less affected by the reference | 0 = diffusion is disabled", interactive=True)
+                 multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1, label="Diffusion Steps", interactive=True)
+                 embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1, label="Intensity", info="will impact the expressiveness, if you raise it too much it'll break.", interactive=True)
+                 rate_of_speech = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Rate of Speech", info="Higher -> Faster", interactive=True)
+
+         with gr.Column(scale=1):
+             btn = gr.Button("Synthesize", variant="primary")
+             audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
+             btn.click(Synthesize_Audio, inputs=[inp, voice, voice_2, multispeakersteps, embscale, alpha, beta, rate_of_speech], outputs=[audio], concurrency_limit=4)
+
+ # Kotodama Text sampler Synthesis Block
+ with gr.Blocks() as longform:
+     with gr.Row():
+         with gr.Column(scale=1):
+             inp_longform = gr.Textbox(
+                 label="Text",
+                 info="Enter the text [Speaker: Text -> japanese or romaji both work, check the last example!] \n Also works without any names. ",
+                 value=list(example_texts.values())[0],
+                 interactive=True,
+                 scale=5
+             )
+
+             with gr.Row():
+                 example_dropdown = gr.Dropdown(
+                     choices=list(example_texts.keys()),
+                     label="Example Texts [pick one!]",
+                     value=list(example_texts.keys())[0],
+                     interactive=True
+                 )
+
+             example_dropdown.change(
+                 fn=update_text_input,
+                 inputs=[example_dropdown],
+                 outputs=[inp_longform]
+             )
+
+             with gr.Accordion("Advanced Parameters", open=False):
+
+                 alpha_longform = gr.Slider(minimum=0, maximum=1, value=0.0, step=0.1,
+                                            label="Alpha",
+                                            info="a Diffusion parameter handling the timbre, higher means less affected by the reference | 0 = diffusion is disabled",
+                                            interactive=True)
+                 beta_longform = gr.Slider(minimum=0, maximum=1, value=0.0, step=0.1,
+                                           label="Beta",
+                                           info="a Diffusion parameter, higher means less affected by the reference | 0 = diffusion is disabled",
+                                           interactive=True)
+                 diffusion_steps_longform = gr.Slider(minimum=3, maximum=15, value=10, step=1,
+                                                      label="Diffusion Steps",
+                                                      interactive=True)
+                 embedding_scale_longform = gr.Slider(minimum=1, maximum=5, value=1.25, step=0.1,
+                                                      label="Intensity",
+                                                      info="a Diffusion parameter, it will impact the expressiveness, if you raise it too much it'll break.",
+                                                      interactive=True)
+
+                 rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
+                                                     label="Rate of Speech",
+                                                     info="Higher = Faster",
+                                                     interactive=True)
+
+         with gr.Column(scale=1):
+             btn_longform = gr.Button("Synthesize", variant="primary")
+             audio_longform = gr.Audio(interactive=False,
+                                       label="Synthesized Audio",
+                                       waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
+
+             btn_longform.click(LongformSynth_Text,
+                                inputs=[inp_longform,
+                                        gr.State(None),  # s_prev
+                                        gr.State(None),  # Kotodama
+                                        alpha_longform,
+                                        beta_longform,
+                                        gr.State(.8),  # t parameter
+                                        diffusion_steps_longform,
+                                        embedding_scale_longform,
+                                        rate_of_speech_longform],
+                                outputs=[audio_longform],
+                                concurrency_limit=4)
+
+ # Kotodama prompt sampler Inference Block
+ with gr.Blocks() as prompt_inference:
+     with gr.Row():
+         with gr.Column(scale=1):
+             text_prompt = gr.Textbox(
+                 label="Text",
+                 info="Enter the text to synthesize. This text will also be fed to the encoder. Make sure to see the Read Me for more details!",
+                 value=inputs[0],
+                 interactive=True,
+                 scale=5
+             )
+             description_prompt = gr.Textbox(
+                 label="Description",
+                 info="Enter a highly detailed, descriptive prompt that matches the vibe of your text to guide the synthesis.",
+                 value=prompts[0],
+                 interactive=True,
+                 scale=7
+             )
+
+             with gr.Row():
+                 random_btn = gr.Button('Random Example', variant='secondary')
+
+             with gr.Accordion("Advanced Parameters", open=True):
+                 embedding_scale_prompt = gr.Slider(minimum=1, maximum=5, value=1, step=0.25,
+                                                    label="Intensity",
+                                                    info="it will impact the expressiveness, if you raise it too much it'll break.",
+                                                    interactive=True)
+                 alpha_prompt = gr.Slider(minimum=0, maximum=1, value=0.0, step=0.1,
+                                          label="Alpha",
+                                          info="a Diffusion sampler parameter handling the timbre, higher means less affected by the reference | 0 = diffusion is disabled",
+                                          interactive=True)
+                 beta_prompt = gr.Slider(minimum=0, maximum=1, value=0.0, step=0.1,
+                                         label="Beta",
+                                         info="a Diffusion sampler parameter, higher means less affected by the reference | 0 = diffusion is disabled",
+                                         interactive=True)
+                 diffusion_steps_prompt = gr.Slider(minimum=3, maximum=15, value=10, step=1,
+                                                    label="Diffusion Steps",
+                                                    interactive=True)
+                 rate_of_speech_prompt = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
+                                                   label="Rate of Speech",
+                                                   info="Higher = Faster",
+                                                   interactive=True)
+         with gr.Column(scale=1):
+             btn_prompt = gr.Button("Synthesize with Prompt", variant="primary")
+             audio_prompt = gr.Audio(interactive=False,
+                                     label="Prompt-based Synthesized Audio",
+                                     waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
+
+
+             random_btn.click(
+                 fn=get_random_prompt_pair,
+                 inputs=[],
+                 outputs=[text_prompt, description_prompt]
+             )
+
+             btn_prompt.click(Inference_Synth_Prompt,
+                              inputs=[text_prompt,
+                                      description_prompt,
+                                      gr.State(None),
+                                      alpha_prompt,
+                                      beta_prompt,
+                                      diffusion_steps_prompt,
+                                      embedding_scale_prompt,
+                                      rate_of_speech_prompt],
+                              outputs=[audio_prompt],
+                              concurrency_limit=4)
+ notes = """
+ <h1>Notes</h1>
+
+ <p>
+ This work is somewhat different from your typical speech model. It offers a high degree of control<br>
+ over the generation process, which means it's easy to inadvertently produce unimpressive outputs.
+ </p>
+
+ <p>
+ <b>Kotodama</b> and the <b>Diffusion sampler</b> can significantly help guide the generation towards<br>
+ something that aligns with your input, but they aren't foolproof. Turn off the diffusion sampler or <br>
+ set it to very low values if it doesn't sound good to you. <br>
+ </p>
+
+ <p>
+ The prompt encoder is also highly experimental and should be treated as a proof of concept. Due to the<br>
+ overwhelming ratio of female to male speakers and the wide variation in both speakers and their expressions,<br>
+ the prompt encoder may occasionally produce subpar or contradicting outputs. For example, high expressiveness alongside <br>
+ high pitch has been associated with female speakers simply because I had orders of magnitude more of them in the dataset.<br>
+ </p>
+
+ <p>
+ ________________________________________________________ <br>
+ <strong>A useful note about the voice design and prompting:</strong><br>\n
+ The vibe of the dialogue impacts the generated voice since the Japanese dialogue <br>
+ and the prompts were jointly trained. This is a peculiar feature of the Japanese language.<br>
+ For example, if you use 俺 (ore)、僕(boku) or your input is overall masculine <br>
+ you may get a guy's voice, even if you describe it as female in the prompt. <br> \n
+ The Japanese text that is fed to the prompt doesn't necessarily have to be <br>
+ the same as your input, but we can't do that in this demo <br>
+ to not make the page too convoluted. In a real-world scenario, you can just use a <br>
+ prompt with a suitable Japanese text to guide the model, get the style,<br>
+ then move on to apply it to whatever dialogue you wish your model to speak.<br>
+
+
+ </p>
+ ________________________________________________________ <br>
+ <p>
+ The pitch information in my data was accurately calculated, but it only works in comparison to the other speakers <br>
+ so you may find that a deep pitch does not sound especially deep, although it actually is <br>
+ when you compare it to others within the same data. Also, some of the gender labels <br>
+ are inaccurate since we used a model to annotate them. <br> \n
+ The main goal of this inference method is to demonstrate that style can be mapped to the description's embeddings <br>
+ yielding reasonably good results.
+ </p>
+
+ <p>
+ Overall, I'm confident that with a bit of experimentation, you can achieve reasonably good results. <br>
+ The model should work well out of the box 90% of the time without the need for extensive tweaking.<br>
+ However, here are some tips in case you encounter issues:
+ </p>
+
+ <h2>Tips:</h2>
+
+ <ul>
+ <li>
+ Ensure that your input closely matches your reference (audio or text prompt) in terms of tone,<br>
+ non-verbal cues, duration, etc.
+ </li>
+
+ <li>
+ If your audio is too long but the input is too short, the speech rate will be slow, and vice versa.
+ </li>
+
+ <li>
+ Experiment with the <b>alpha</b>, <b>beta</b>, and <b>Intensity</b> parameters. The Diffusion<br>
+ sampler is non-deterministic, so regenerate a few times if you're not satisfied with the output.
+ </li>
+
+ <li>
+ The speaker's share and expressive distribution in the dataset significantly impact the quality;<br>
+ you won't necessarily get perfect results with all speakers.
+ </li>
+
+ <li>
+ Punctuation is very important; for example, adding a «!» mark will raise the voice or make it more intense.
+ </li>
+
+ <li>
+ Not all speakers are equal. Less represented speakers or out-of-distribution inputs may result<br>
+ in artifacts.
+ </li>
+
+ <li>
+ If the Diffusion sampler works but the speaker didn't have a certain expression (e.g., extreme anger)<br>
+ in the dataset, try raising the diffusion sampler's parameters and let it handle everything. Though<br>
+ it may result in less speaker similarity, the ideal way to handle this is to cook new vectors by<br>
+ transferring an emotion from one speaker to another. But you can't do that in this space.
+ </li>
+
+ <li>
+ For voice-based inference, you can use litagin's awesome <a href="https://huggingface.co/datasets/litagin/Moe-speech" target="_blank">Moe-speech dataset</a>,<br>
+ as the training data includes a portion of it.
+ </li>
+
+ <li>
+ You may also want to tweak the phonemes if you're going for something wild. <br>
+ I have used cutlet in the backend, but it doesn't seem to like some of my mappings.
+ </li>
+
+
+ </ul>
+ """
+
+
+ notes_jp = """
+ <h1>メモ</h1>
+
+ <p>
+ この作業は、典型的なスピーチモデルとは少し異なります。生成プロセスに対して高い制御を提供するため、意図せずに<br>
+ 比較的にクオリティーの低い出力を生成してしまうことが容易です。
+ </p>
+
+ <p>
+ <b>Kotodama</b>と<b>Diffusionサンプラー</b>は、入力に沿ったものを生成するための大きな助けとなりますが、<br>
+ 万全というわけではありません。良いアウトプットが出ない場合は、ディフュージョンサンプラーをオフにするか、非常に低い値に設定してください。
+ </p>
+
+
+ _____________________________________________<br>\n
+ <strong>音声デザインとプロンプトに関する有用なメモ:</strong><br>
+ ダイアログの雰囲気は、日本語のダイアログとプロンプトが共同でTrainされたため、生成される音声に影響を与えます。<br>
+ これは日本語の特徴的な機能です。例えば、「俺」や「僕」を使用したり、全体的に男性らしい入力をすると、<br>
+ プロンプトで女性と記述していても、男性の声が得られる可能性があります。<br>
+ プロンプトに入力される日本語のテキストは、必ずしも入力内容と同じである必要はありませんが、<br>
+ このデモではページが複雑になりすぎないようにそれを行うことはできません。<br>
+ 実際のシナリオでは、適切な日本語のテキストを含むプロンプトを使用してモデルを導き、<br>
+ スタイルを取得した後、それを希望するダイアログに適用することができます。<br>
+
+ _____________________________________________<br>\n
+
+ <p>
+ プロンプトエンコーダも非常に実験的であり、概念実証として扱うべきです。女性話者対男性話者の比率が圧倒的で、<br>
+ また話者とその表現に大きなバリエーションがあるため、エンコーダは質の低い出力を生成する可能性があります。<br>
+ 例えば、高い表現力は、データセットに多く含まれていた女性話者と関連付けられています。<br>
+ それに、データのピッチ情報は正確に計算されましたが、それは他のスピーカーとの比較でしか機能しません...<br>
+ だから、深いピッチが必ずしも深すぎるわけではないことに気づくかもしれません。<br>
+ ただし、実際には、同じデータ内の他の人と比較すると、深すぎます。このインフレンスの主な目的は、<br>
+ スタイルベクトルを記述にマッピングし、合理的に良い結果を得ることにあります。
+ </p>
+
+ <p>
+ 全体として、少しの実験でほぼ望む結果を達成できると自信を持っています。90%のケースで、大幅な調整を必要とせず、<br>
+ そのままでうまく動作するはずです。しかし、問題が発生した場合のためにいくつかのヒントがあります:
+ </p>
+
+ <h2>ヒント:</h2>
+
+ <ul>
+ <li>
+ 入力がリファレンス(音声またはテキストプロンプト)とトーン、非言語的な手がかり、<br>
+ 長さなどで密接に一致していることを確認してください。
+ </li>
+
+ <li>
+ 音声が長すぎるが入力が短すぎる場合、話速が遅くなります。その逆もまた同様です。
+ </li>
+
+ <li>
+ アルファ、ベータ、および埋め込みスケールのパラメータを試行錯誤してください。Diffusionサンプラーは<br>
+ 非決定的なので、満足のいく出力が得られない場合は何度か再生成してください。
+ </li>
+
+ <li>
+ データセット内の話者の分布と表現力の分布は品質に大きく影響します。<br>
+ すべての話者で必ずしも完璧な結果が得られるわけではありません。
+ </li>
+
+ <li>
+ 句読点は重要です。たとえば、「!」を使えば、スタイルのインテンシティが上がります。
+ </li>
+
+ <li>
+ すべての話者が平等に表現されているわけではありません。少ない表現の話者や<br>
+ 分布外の入力はアーティファクトを生じさせる可能性があります。
+ </li>
+
+ <li>
+ Diffusionサンプラーが機能しているが、データセット内で特定の表現(例:極度の怒り)がない場合、<br>
+ Diffusionサンプラーのパラメータを引き上げ、サンプラーにすべてを任せてください。ただし、それにより<br>
+ 話者の類似性が低下する可能性があります。この問題を理想的に解決する方法は、ある話者から別の話者に<br>
+ 感情を転送し新しいベクトルを作成することですが、ここではできません。
+ </li>
+
+ <li>
+ 音声ベースのインフレンスには、トレーニングデータの一部としてMoe-speechデータセットの一部を含む<br>
+ <a href="https://huggingface.co/datasets/litagin/Moe-speech" target="_blank">litaginの素晴らしいデータセット</a>を使用できます。
+ </li>
+
+ <li>
+ たまには音素の調整が必要になる場合もあります。バックエンドではcutletを使っているのですが、<br>
+ いくつかのOODマッピングがcutletと相性が良くないみたいです。
+ </li>
+ </ul>
+
+ """
+ with gr.Blocks() as read_me:
+     with gr.Row():
+         with gr.Column(scale=1):
+             gr.Markdown(notes)
+
+ with gr.Blocks() as read_me_jp:
+     with gr.Row():
+         with gr.Column(scale=1):
+             gr.Markdown(notes_jp)
+
+
+ custom_css = """
+ .tab-label {
+     color: #FFD700 !important;
+ }
+ """
+
+
+
+
+ with gr.Blocks(title="Tsukasa 司", css=custom_css + "footer{display:none !important}", theme="Respair/Shiki@1.2.1") as demo:
+     # gr.DuplicateButton("Duplicate Space")
+     gr.Markdown(INTROTXT)
+
+
+     gr.TabbedInterface([longform, audio_inf, prompt_inference, read_me, read_me_jp],
+                        ['Kotodama Text Inference', 'Voice-guided Inference', 'Prompt-guided Inference [Highly Experimental - not optimized]', 'Read Me! [English]', 'Read Me! [日本語]'])
+
+ if __name__ == "__main__":
+     demo.queue(api_open=False, max_size=15).launch(show_api=False, share=True)
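The app above is a thin front end: every synthesis request is forwarded through gradio_client to a remote backend. A minimal standalone sketch of the same call pattern used by the LongformSynth_Text wrapper (the share URL is the temporary one hard-coded in the app and is assumed to have been replaced with a live endpoint):

```python
# Minimal sketch of calling the remote backend directly with gradio_client,
# mirroring the LongformSynth_Text wrapper in app_tsumugi_remote.py.
from gradio_client import Client

# Temporary share link hard-coded in the app; assumed to be swapped for a live endpoint.
client = Client("https://9155368f00f8a50bc7.gradio.live")

result = client.predict(
    "時には、静けさの中にこそ、本当の答えが見つかるものですね。",  # text
    0.0,   # alpha (0 = diffusion disabled)
    0.0,   # beta  (0 = diffusion disabled)
    10,    # diffusion steps
    1.25,  # intensity / embedding scale
    1.0,   # rate of speech
    api_name="/LongformSynth_Text",
)
```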
reference_sample_wavs/01001240.ogg ADDED
Binary file (168 kB).
 
reference_sample_wavs/01008270.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a306920452276a8b801454ba5d540c7f3c28a3fc0d5ce01bf4a3f679e0f42c3
+ size 1082540
reference_sample_wavs/kaede_san.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:376737a52bf7f67ba6035597bae5ad87b5220d005bad78318d3f8062eb9ff692
+ size 1812558
reference_sample_wavs/riamu_zeroshot_01.wav ADDED
Binary file (535 kB).
 
reference_sample_wavs/riamu_zeroshot_02.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa74683a6ac7dca963e3ae4b10f5984902683bd55d2806542b8821a9d07beaa2
+ size 1427500
reference_sample_wavs/sample_ref01.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4241b264d96819f6d4290c23861401f7b116bbb9fa9aace8b65add01b0d812b
+ size 1644002
reference_sample_wavs/sample_ref02.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a38f2f13d1035d0148d965410cde080c6371c9460e17f147ef130aacb4551b1c
+ size 1803998
reference_sample_wavs/shiki_fine05.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd063aba7ad59b2bbfb5ed57d164dbbf75c70b91b163c851ca334661911c16c5
+ size 2123200
reference_sample_wavs/syuukovoice_200918_3_01.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7929dc92fdfa61ba580a20d95a677d1f6fe8de10edeae6778d664075e43aeb02
+ size 1979500
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ gradio
+ gradio_client