Upload folder using huggingface_hub
- .gitattributes +7 -0
- .gradio/certificate.pem +31 -0
- Configs/config.yml +116 -0
- Configs/config_ft.yml +116 -0
- Configs/config_kanade.yml +118 -0
- Inference/infer_24khz_mod.ipynb +0 -0
- Inference/input_for_prompt.txt +4 -0
- Inference/prompt.txt +4 -0
- Inference/random_texts.txt +14 -0
- LICENSE +25 -0
- __pycache__/cotlet_phon.cpython-311.pyc +0 -0
- __pycache__/cotlet_utils.cpython-311.pyc +0 -0
- __pycache__/importable.cpython-311.pyc +0 -0
- __pycache__/models.cpython-311.pyc +0 -0
- __pycache__/text_utils.cpython-311.pyc +0 -0
- __pycache__/utils.cpython-311.pyc +0 -0
- app_tsumugi_remote.py +498 -0
- reference_sample_wavs/01001240.ogg +0 -0
- reference_sample_wavs/01008270.wav +3 -0
- reference_sample_wavs/kaede_san.wav +3 -0
- reference_sample_wavs/riamu_zeroshot_01.wav +0 -0
- reference_sample_wavs/riamu_zeroshot_02.wav +3 -0
- reference_sample_wavs/sample_ref01.wav +3 -0
- reference_sample_wavs/sample_ref02.wav +3 -0
- reference_sample_wavs/shiki_fine05.wav +3 -0
- reference_sample_wavs/syuukovoice_200918_3_01.wav +3 -0
- requirements.txt +2 -0
.gitattributes
CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+reference_sample_wavs/01008270.wav filter=lfs diff=lfs merge=lfs -text
+reference_sample_wavs/kaede_san.wav filter=lfs diff=lfs merge=lfs -text
+reference_sample_wavs/riamu_zeroshot_02.wav filter=lfs diff=lfs merge=lfs -text
+reference_sample_wavs/sample_ref01.wav filter=lfs diff=lfs merge=lfs -text
+reference_sample_wavs/sample_ref02.wav filter=lfs diff=lfs merge=lfs -text
+reference_sample_wavs/shiki_fine05.wav filter=lfs diff=lfs merge=lfs -text
+reference_sample_wavs/syuukovoice_200918_3_01.wav filter=lfs diff=lfs merge=lfs -text
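These rules keep the larger reference recordings under Git LFS; the matching LFS pointer stubs (oid/size) for the same wav files appear further down in this commit, while the smaller 01001240.ogg and riamu_zeroshot_01.wav are committed as plain binaries.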
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
Configs/config.yml
ADDED
@@ -0,0 +1,116 @@
log_dir: "Models/LJSpeech"
first_stage_path: "first_stage.pth"
save_freq: 2
log_interval: 10
device: "cuda"
epochs_1st: 200 # number of epochs for first stage training (pre-training)
epochs_2nd: 100 # number of epochs for second stage training (joint training)
batch_size: 16
max_len: 400 # maximum number of frames
pretrained_model: ""
second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters

F0_path: "Utils/JDC/bst.t7"
ASR_config: "Utils/ASR/config.yml"
ASR_path: "Utils/ASR/epoch_00080.pth"
PLBERT_dir: 'Utils/PLBERT/'

data_params:
  train_data: "Data/train_list.txt"
  val_data: "Data/val_list.txt"
  root_path: "/local/LJSpeech-1.1/wavs"
  OOD_data: "Data/OOD_texts.txt"
  min_length: 50 # sample until texts with this size are obtained for OOD texts

preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

model_params:
  multispeaker: false

  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 178 # number of phoneme tokens
  max_dur: 50 # maximum duration of a single phoneme
  style_dim: 128 # style vector size

  dropout: 0.2

  # config for decoder
  decoder:
    type: 'istftnet' # either hifigan or istftnet
    resblock_kernel_sizes: [3,7,11]
    upsample_rates: [10, 6]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
    upsample_kernel_sizes: [20, 12]
    gen_istft_n_fft: 20
    gen_istft_hop_size: 5

  # speech language model config
  slm:
    model: 'microsoft/wavlm-base-plus'
    sr: 16000 # sampling rate of SLM
    hidden: 768 # hidden size of SLM
    nlayers: 13 # number of layers of SLM
    initial_channel: 64 # initial channels of SLM discriminator head

  # style diffusion model config
  diffusion:
    embedding_mask_proba: 0.1
    # transformer config
    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2

    # diffusion distribution config
    dist:
      sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
      estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
      mean: -3.0
      std: 1.0

loss_params:
  lambda_mel: 5. # mel reconstruction loss
  lambda_gen: 1. # generator loss
  lambda_slm: 1. # slm feature matching loss

  lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
  lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
  TMA_epoch: 50 # TMA starting epoch (1st stage)

  lambda_F0: 1. # F0 reconstruction loss (2nd stage)
  lambda_norm: 1. # norm reconstruction loss (2nd stage)
  lambda_dur: 1. # duration loss (2nd stage)
  lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
  lambda_sty: 1. # style reconstruction loss (2nd stage)
  lambda_diff: 1. # score matching loss (2nd stage)

  diff_epoch: 20 # style diffusion starting epoch (2nd stage)
  joint_epoch: 50 # joint training starting epoch (2nd stage)

optimizer_params:
  lr: 0.0001 # general learning rate
  bert_lr: 0.00001 # learning rate for PLBERT
  ft_lr: 0.00001 # learning rate for acoustic modules

slmadv_params:
  min_len: 400 # minimum length of samples
  max_len: 500 # maximum length of samples
  batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
  iter: 10 # update the discriminator every this many generator iterations
  thresh: 5 # gradient norm above which the gradient is scaled
  scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
  sig: 1.5 # sigma for differentiable duration modeling
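For reference, the YAML above nests the decoder, slm, and diffusion blocks under model_params. The snippet below is a minimal sketch (not part of this commit) of reading the file with PyYAML; it only assumes the key names shown above and that PyYAML is installed.

```python
# Minimal sketch: load Configs/config.yml and read a few nested fields.
import yaml

with open("Configs/config.yml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Top-level training settings
print(config["log_dir"], config["epochs_1st"], config["epochs_2nd"])

# Nested blocks mirror the indentation above
decoder_cfg = config["model_params"]["decoder"]   # istftnet settings
slm_cfg = config["model_params"]["slm"]           # speech language model
print(decoder_cfg["type"], slm_cfg["model"])
```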
Configs/config_ft.yml
ADDED
@@ -0,0 +1,116 @@
log_dir: "Models/IMAS_FineTuned"
save_freq: 1
log_interval: 10
device: "cuda"
epochs: 50 # number of finetuning epochs (1 hour of data)
batch_size: 3
max_len: 2500 # maximum number of frames
pretrained_model: "/home/austin/disk2/llmvcs/tt/stylekan/Models/Style_Kanade/NO_SLM_3_epoch_2nd_00002.pth"
second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
load_only_params: true # set to true if you do not want to load epoch numbers and optimizer parameters

F0_path: "/home/austin/disk2/llmvcs/tt/stylekan/Utils/JDC/bst.t7"
ASR_config: "/home/austin/disk2/llmvcs/tt/stylekan/Utils/ASR/config.yml"
ASR_path: "/home/austin/disk2/llmvcs/tt/stylekan/Utils/ASR/bst_00080.pth"

PLBERT_dir: 'Utils/PLBERT/'

data_params:
  train_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/metadata_cleanest/FT_imas.csv"
  val_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/metadata_cleanest/FT_imas_valid.csv"
  root_path: ""
  OOD_data: "/home/austin/disk2/llmvcs/tt/stylekan/Data/OOD_LargeScale_.csv"
  min_length: 50 # sample until texts with this size are obtained for OOD texts

preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

model_params:
  multispeaker: true

  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 178 # number of phoneme tokens
  max_dur: 50 # maximum duration of a single phoneme
  style_dim: 128 # style vector size

  dropout: 0.2

  decoder:
    type: 'istftnet' # either hifigan or istftnet
    resblock_kernel_sizes: [3,7,11]
    upsample_rates: [10, 6]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
    upsample_kernel_sizes: [20, 12]
    gen_istft_n_fft: 20
    gen_istft_hop_size: 5

  # speech language model config
  slm:
    model: 'Respair/Whisper_Large_v2_Encoder_Block' # The model itself is hardcoded, change it through -> losses.py
    sr: 16000 # sampling rate of SLM
    hidden: 1280 # hidden size of SLM
    nlayers: 33 # number of layers of SLM
    initial_channel: 64 # initial channels of SLM discriminator head

  # style diffusion model config
  diffusion:
    embedding_mask_proba: 0.1
    # transformer config
    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2

    # diffusion distribution config
    dist:
      sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
      estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
      mean: -3.0
      std: 1.0

loss_params:
  lambda_mel: 10. # mel reconstruction loss
  lambda_gen: 1. # generator loss
  lambda_slm: 1. # slm feature matching loss

  lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
  lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
  TMA_epoch: 9 # TMA starting epoch (1st stage)

  lambda_F0: 1. # F0 reconstruction loss (2nd stage)
  lambda_norm: 1. # norm reconstruction loss (2nd stage)
  lambda_dur: 1. # duration loss (2nd stage)
  lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
  lambda_sty: 1. # style reconstruction loss (2nd stage)
  lambda_diff: 1. # score matching loss (2nd stage)

  diff_epoch: 0 # style diffusion starting epoch (2nd stage)
  joint_epoch: 30 # joint training starting epoch (2nd stage)

optimizer_params:
  lr: 0.0001 # general learning rate
  bert_lr: 0.00001 # learning rate for PLBERT
  ft_lr: 0.00001 # learning rate for acoustic modules

slmadv_params:
  min_len: 400 # minimum length of samples
  max_len: 500 # maximum length of samples
  batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
  iter: 20 # update the discriminator every this many generator iterations
  thresh: 5 # gradient norm above which the gradient is scaled
  scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
  sig: 1.5 # sigma for differentiable duration modeling
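Note that config_ft.yml points at absolute /home/austin/... paths from the original training machine. The sketch below is a hypothetical helper (not part of this commit) for rewriting those paths before reusing the config elsewhere; the local root and output filename are assumptions.

```python
# Hypothetical: rewrite the hardcoded absolute paths in config_ft.yml.
import yaml

REMOTE_ROOT = "/home/austin/disk2/llmvcs/tt/stylekan"
LOCAL_ROOT = "/path/to/your/stylekan"  # assumption: your own checkout

with open("Configs/config_ft.yml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

for key in ("pretrained_model", "F0_path", "ASR_config", "ASR_path"):
    cfg[key] = cfg[key].replace(REMOTE_ROOT, LOCAL_ROOT)
for key in ("train_data", "val_data", "OOD_data"):
    cfg["data_params"][key] = cfg["data_params"][key].replace(REMOTE_ROOT, LOCAL_ROOT)

with open("Configs/config_ft_local.yml", "w", encoding="utf-8") as f:
    yaml.safe_dump(cfg, f, sort_keys=False)
```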
Configs/config_kanade.yml
ADDED
@@ -0,0 +1,118 @@
log_dir: "Models/Style_Kanade_v02"
first_stage_path: ""
save_freq: 1
log_interval: 10
device: "cuda"
epochs_1st: 30 # number of epochs for first stage training (pre-training)
epochs_2nd: 20 # number of epochs for second stage training (joint training)
batch_size: 64
max_len: 560 # maximum number of frames
pretrained_model: "Models/Style_Kanade_v02/epoch_2nd_00007.pth"
second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters

F0_path: "Utils/JDC/bst.t7"
ASR_config: "Utils/ASR/config.yml"
ASR_path: "Utils/ASR/bst_00080.pth"

PLBERT_dir: 'Utils/PLBERT/'

data_params:
  train_data: "Data/metadata_cleanest/DATA.csv"
  val_data: "Data/VALID.txt"
  root_path: ""
  OOD_data: "Data/OOD_LargeScale_.csv"
  min_length: 50 # sample until texts with this size are obtained for OOD texts

preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

model_params:
  multispeaker: true

  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 178 # number of phoneme tokens
  max_dur: 50 # maximum duration of a single phoneme
  style_dim: 128 # style vector size

  dropout: 0.2

  decoder:
    type: 'istftnet' # either hifigan or istftnet
    resblock_kernel_sizes: [3,7,11]
    upsample_rates: [10, 6]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
    upsample_kernel_sizes: [20, 12]
    gen_istft_n_fft: 20
    gen_istft_hop_size: 5

  # speech language model config
  slm:
    model: 'Respair/Whisper_Large_v2_Encoder_Block' # The model itself is hardcoded, change it through -> losses.py
    sr: 16000 # sampling rate of SLM
    hidden: 1280 # hidden size of SLM
    nlayers: 33 # number of layers of SLM
    initial_channel: 64 # initial channels of SLM discriminator head

  # style diffusion model config
  diffusion:
    embedding_mask_proba: 0.1
    # transformer config
    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2

    # diffusion distribution config
    dist:
      sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
      estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
      mean: -3.0
      std: 1.0

loss_params:
  lambda_mel: 10. # mel reconstruction loss
  lambda_gen: 1. # generator loss
  lambda_slm: 1. # slm feature matching loss

  lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
  lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
  TMA_epoch: 5 # TMA starting epoch (1st stage)

  lambda_F0: 1. # F0 reconstruction loss (2nd stage)
  lambda_norm: 1. # norm reconstruction loss (2nd stage)
  lambda_dur: 1. # duration loss (2nd stage)
  lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
  lambda_sty: 1. # style reconstruction loss (2nd stage)
  lambda_diff: 1. # score matching loss (2nd stage)

  diff_epoch: 4 # style diffusion starting epoch (2nd stage)
  joint_epoch: 999 # joint training starting epoch (2nd stage)

optimizer_params:
  lr: 0.0001 # general learning rate
  bert_lr: 0.00001 # learning rate for PLBERT
  ft_lr: 0.00001 # learning rate for acoustic modules

slmadv_params:
  min_len: 400 # minimum length of samples
  max_len: 500 # maximum length of samples
  batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
  iter: 20 # update the discriminator every this many generator iterations
  thresh: 5 # gradient norm above which the gradient is scaled
  scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
  sig: 1.5 # sigma for differentiable duration modeling
Inference/infer_24khz_mod.ipynb
ADDED
The diff for this file is too large to render.
Inference/input_for_prompt.txt
ADDED
@@ -0,0 +1,4 @@
この俺に何度も同じことを説明させるな!! お前たちは俺の忠告を完全に無視して、とんでもない結果を招いてしまった。これが最後の警告だ。次は絶対に許さないぞ!
時には、静けさの中にこそ、本当の答えが見つかるものですね。慌てる必要はないのです。
人生には、表現しきれないほどの驚きがあると思うよ。それは、目には見えない力で、人々を繋ぐ不思議な絆だ。私は、その驚きを胸に秘め、日々を楽しく過ごしているんだ。言葉を伝えるたびに、未来への期待を込めて、元気に話す。それは、夢を叶えるための魔法のようなものだ。
かなたの次元より迫り来る混沌の使者たちよ、貴様らの野望を我が焔の業火で焼き尽くす! 運命の歯車は、我が意思と共にすでに動き出したのだ。我が宿命の敵に立ち向かうため、禁断の呪文を紡ぐ時は今ここに訪れる。さあ、見るがよい。我が力を!!
Inference/prompt.txt
ADDED
@@ -0,0 +1,4 @@
A male voice that resonates with deep, thunderous intensity. His rapid-fire words slam like aggressive drumbeats, each syllable charged with intense rage. The expressive tone fluctuates between restrained fury and explosive outbursts.
A female voice with a distinctively deep, low pitch that commands attention. Her slightly monotone delivery creates an air of composure and gravitas, while maintaining a calm, measured pace. Her voice carries a soothing weight, like gentle thunder in the distance, making her words feel grounded and reassuring.
A female voice that is gentle and soft, with a slightly high pitch that adds a comforting warmth to her words. Her tone is moderate, neither too expressive nor too flat, creating a balanced and soothing atmosphere. The slow speed of her speech gives her words a deliberate and thoughtful cadence, allowing each phrase to resonate fully. There's a sense of wonder and optimism in her voice, as if she is constantly marveling at the mysteries of life. Her gentle demeanor and soft delivery make her sound approachable and kind, inviting listeners to share in her sense of wonder.
A female voice that resonates with deep intensity and a distinctly low pitch. Her words flow with the force and rhythm of a relentless tide, each syllable weighted with profound determination. The expressive tone navigates between measured intensity and powerful surges.
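prompt.txt and input_for_prompt.txt are parallel, line-aligned files: line i of the description file characterizes the voice for line i of the Japanese text file, which is how get_random_prompt_pair in app_tsumugi_remote.py samples them. A minimal sketch of reading them as pairs (illustration only, not part of the app):

```python
# Sketch: pair the line-aligned prompt/description files by index.
with open("Inference/input_for_prompt.txt", encoding="utf-8") as f:
    texts = [line.strip() for line in f]
with open("Inference/prompt.txt", encoding="utf-8") as f:
    descriptions = [line.strip() for line in f]

for text, description in zip(texts, descriptions):
    print(description[:60], "->", text[:30])
```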
Inference/random_texts.txt
ADDED
@@ -0,0 +1,14 @@
Akashi: 不思議な人ですね、レザさんは。たまには子供扱いしてくれてちょっとむきになりますけど、とても頼りがいのある人だと思いますよ?
Kimiji: 人生は、果てしない探求の旅のようなもの。私たちは、自分自身や周囲の世界について、常に新しい発見をしていく。それは、時として喜びをもたらすこともあれば、困難に直面することもある。しかしそれら全てが、自分を形作る貴重な経験である。
Reira: 私に何度も同じことを説明させないでよ! お前たちは私の忠告を完全に無視して、とんでもない結果を招いてしまった!! これが最後の警告だ。次は絶対に許さないぞ! ------------------------------------------- (NOTE: enable the diffusion, then set the Intensity to 3, also remove this line!)
Yoriko: この世には、言葉にできない悲しみがある。 それは、胸の奥に沈んでいくような重さで、時間が経つにつれて、じわじわと広がっていく。私は、その悲しみを抱えながら、日々を過ごしていいた。 言葉を発するたびに、心の中で何度も繰り返し、慎重に選び抜いている。それは、痛みを和らげるための儀式のようなものだ.
Kimiji: 人生には、表現しきれないほどの驚きがあると思います。それは、目には見えない力で、人々を繋ぐ不思議な絆です。私は、その驚きを胸に秘め、日々を楽しく過ごしています。言葉を伝えるたびに、未来への期待を込めて、元気に話す。それは、夢を叶えるための魔法のようなものです。
Teppei: そうだな、この新しいシステムの仕組みについて説明しておこう。基本的には三層構造になっていて、各層が独立して機能している。一番下のレイヤーでデータの処理と保存を行い、真ん中の層でビジネスロジックを実装している。ユーザーが直接触れるのは最上位層だけで、そこでインターフェースの制御をしているんだ。パフォーマンスを考えると、データベースへのアクセスを最小限に抑える必要があるから、キャッシュの実装も検討している。ただ、システムの規模を考えると、当面は現状の構成で十分だと思われる。将来的にスケールする必要が出てきた場合は、その時点で見直しを検討すればいいだろう。
Kirara: べ、別にそんなことじゃないってば!あんたのことなんて全然気にしてないんだからね!
Maiko: ねえ、ちょっと今日の空を見上げて。朝から少しずつ変わっていく雲の形が、まるで漫画の中の風景みたい。東の方からゆっくりと暖かい風が吹いてきて、桜の花びらが舞い散るように、優しく大地を撫でていくの。春の陽気が徐々に夏の暑さに変わっていくこの季節は、なんだかわくわくするよね。でも、週間予報によると、明後日あたりから天気が崩れるみたいで、しばらくは傘の出番かもしれないんだ。梅雨の時期が近づいているから、空気も少しずつ湿っぽくなってきているのを感じない?でも、雨上がりの空気って、なんだか特別な匂いがして、私、結構好きなんだよね。
Amane: そ、そうかな?あなたと過ごした今までの時間は、私に、とても、とーっても大事だよ?だって、あなたの言葉には、どこか特別な温もりがあるような気がする。まるで、心の中に直接響いてくるようで、ほんの少しの間だけ、世界が優しくなった気がするよ。
Shioji: つい昨日のことみたいなのに、もう随分と昔のお話なんだね ! 今でも古い本の匂いがすると、あの静かで穏やかな時間がまるで透明で綺麗な海みたいに心の中でふわりと溶けていくの。
Riemi: かなたの次元より迫り来る混沌の使者たちよ、貴様らの野望を我が焔の業火で焼き尽くす! 運命の歯車は、我が意思と共にすでに動き出したのだ。我が宿命の敵に立ち向かうため、禁断の呪文を紡ぐ時は今ここに訪れる。さあ、見るがよい。我が力を!!
だめですよ?そんなことは。もっと慎重に行動してくださいね。
Kazunari: ojousama wa, katachi dake no tsukibito ga ireba sore de ii, to omotte orareru you desuga, katachi bakari de mo, san nen ka o tomo ni seikatsu o suru koto ni narimasɯ. --------------------------------------- (NOTE!: your spacing will impact the intonations. also remove this line!)
Homa: 君は muzukashiku 考えると kekka to shite 空回り suru taipu dakara na. 大体, sonna kanji de oboete ikeba iin da. -------------------------------------------------- (NOTE!: your spacing will impact the intonations. also remove this line!)
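Note that lines 3, 13, and 14 above carry inline "(NOTE: ...)" instructions (enable diffusion and raise Intensity, mind the spacing) that are meant to be deleted before the line is actually synthesized; lines 13 and 14 also illustrate that mixed romaji/kanji input is accepted and that spacing affects the resulting intonation.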
LICENSE
ADDED
@@ -0,0 +1,25 @@
LICENSE FOR STYLETTS2 DEMO PAGE: GPL (I KNOW I DON'T LIKE GPL BUT I HAVE TO BECAUSE OF PHONEMIZER REQUIREMENT)


styletts 2 license:

Copyright (c) 2023 Aaron (Yinghao) Li

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
__pycache__/cotlet_phon.cpython-311.pyc
ADDED
Binary file (5.58 kB).
__pycache__/cotlet_utils.cpython-311.pyc
ADDED
Binary file (25.1 kB).
__pycache__/importable.cpython-311.pyc
ADDED
Binary file (21.5 kB).
__pycache__/models.cpython-311.pyc
ADDED
Binary file (52.6 kB).
__pycache__/text_utils.cpython-311.pyc
ADDED
Binary file (1.85 kB).
__pycache__/utils.cpython-311.pyc
ADDED
Binary file (5.8 kB).
app_tsumugi_remote.py
ADDED
@@ -0,0 +1,498 @@
INTROTXT = """#
Repo -> [Hugging Face - 🤗](https://huggingface.co/Respair/Project_Kanade_SpeechModel)
This space uses Tsukasa (24khz).
**Check the Read me tabs down below.** <br>
Enjoy!
"""
import gradio as gr
import random
import torch
import os
import pickle
from gradio_client import Client
client = Client('https://9155368f00f8a50bc7.gradio.live')

voices = {}
example_texts = {}
prompts = []
inputs = []

theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)

voicelist = [v for v in os.listdir("/home/ubuntu/Kanade_Project/gradio/Tsukasa_Speech/reference_sample_wavs")]

for v in voicelist:
    voices[v] = f'reference_sample_wavs/{v}'

with open(f'Inference/random_texts.txt', 'r') as r:
    random_texts = [line.strip() for line in r]

example_texts = {f"{text[:30]}...": text for text in random_texts}

def update_text_input(preview):
    return example_texts[preview]

def get_random_text():
    return random.choice(random_texts)

with open('Inference/prompt.txt', 'r') as p:
    prompts = [line.strip() for line in p]

with open('Inference/input_for_prompt.txt', 'r') as i:
    inputs = [line.strip() for line in i]

last_idx = None

def get_random_prompt_pair():
    global last_idx
    max_idx = min(len(prompts), len(inputs)) - 1

    random_idx = random.randint(0, max_idx)
    while random_idx == last_idx:
        random_idx = random.randint(0, max_idx)

    last_idx = random_idx
    return inputs[random_idx], prompts[random_idx]

def Synthesize_Audio(text, voice=None, voice2=None, vcsteps=2, embscale=1, alpha=.4, beta=.4, ros=.1):
    # Wrap the file path using the gradio.File class
    if voice2 is not None:
        voice2 = {"path": voice2, "meta": {"_type": "gradio.FileData"}}

    # Call the Gradio endpoint through the client with the appropriate API Name
    result = client.predict(
        text,
        voice,
        voice2,
        vcsteps,
        embscale,
        alpha,
        beta,
        ros,
        api_name="/Synthesize_Audio"
    )
    return result

# Example usage

def LongformSynth_Text(text, s_prev, Kotodama, alpha, beta, t, diffusion_steps, embedding_scale, rate_of_speech):
    result = client.predict(
        text,
        alpha,
        beta,
        diffusion_steps,
        embedding_scale,
        rate_of_speech,
        api_name="/LongformSynth_Text"
    )
    return result

def Inference_Synth_Prompt(text, description, Kotodama, alpha, beta, diffusion_steps, embedding_scale, rate_of_speech):
    result = client.predict(
        text,
        description,
        alpha,
        beta,
        diffusion_steps,
        embedding_scale,
        rate_of_speech,
        api_name="/Inference_Synth_Prompt"
    )
    return result

with gr.Blocks() as audio_inf:
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox(label="Text", info="Enter the text", value="きみの存在は、私の心の中で燃える小さな光のよう。きみがいない時、世界は白黒の写真みたいに寂しくて、何も輝いてない。きみの笑顔だけが、私の灰色の日々に色を塗ってくれる。離れてる時間は、めちゃくちゃ長く感じられて、きみへの想いは風船みたいにどんどん膨らんでいく。きみなしの世界なんて、想像できないよ。", interactive=True, scale=5)
            voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value=voicelist[5], interactive=True)
            voice_2 = gr.Audio(label="Upload your own Audio", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

            with gr.Accordion("Advanced Parameters", open=False):
                alpha = gr.Slider(minimum=0, maximum=1, value=0.0, step=0.1, label="Alpha", info="a Diffusion sampler parameter handling the timbre, higher means less affected by the reference | 0 = diffusion is disabled", interactive=True)
                beta = gr.Slider(minimum=0, maximum=1, value=0.0, step=0.1, label="Beta", info="a Diffusion sampler parameter, higher means less affected by the reference | 0 = diffusion is disabled", interactive=True)
                multispeakersteps = gr.Slider(minimum=3, maximum=15, value=5, step=1, label="Diffusion Steps", interactive=True)
                embscale = gr.Slider(minimum=1, maximum=5, value=1, step=0.1, label="Intensity", info="will impact the expressiveness, if you raise it too much it'll break.", interactive=True)
                rate_of_speech = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Rate of Speech", info="Higher -> Faster", interactive=True)

        with gr.Column(scale=1):
            btn = gr.Button("Synthesize", variant="primary")
            audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})
            btn.click(Synthesize_Audio, inputs=[inp, voice, voice_2, multispeakersteps, embscale, alpha, beta, rate_of_speech], outputs=[audio], concurrency_limit=4)

# Kotodama Text sampler Synthesis Block
with gr.Blocks() as longform:
    with gr.Row():
        with gr.Column(scale=1):
            inp_longform = gr.Textbox(
                label="Text",
                info="Enter the text [Speaker: Text -> japanese or romaji both work, check the last example!] \n Also works without any names. ",
                value=list(example_texts.values())[0],
                interactive=True,
                scale=5
            )

            with gr.Row():
                example_dropdown = gr.Dropdown(
                    choices=list(example_texts.keys()),
                    label="Example Texts [pick one!]",
                    value=list(example_texts.keys())[0],
                    interactive=True
                )

            example_dropdown.change(
                fn=update_text_input,
                inputs=[example_dropdown],
                outputs=[inp_longform]
            )

            with gr.Accordion("Advanced Parameters", open=False):
                alpha_longform = gr.Slider(minimum=0, maximum=1, value=0.0, step=0.1,
                                           label="Alpha",
                                           info="a Diffusion parameter handling the timbre, higher means less affected by the reference | 0 = diffusion is disabled",
                                           interactive=True)
                beta_longform = gr.Slider(minimum=0, maximum=1, value=0.0, step=0.1,
                                          label="Beta",
                                          info="a Diffusion parameter, higher means less affected by the reference | 0 = diffusion is disabled",
                                          interactive=True)
                diffusion_steps_longform = gr.Slider(minimum=3, maximum=15, value=10, step=1,
                                                     label="Diffusion Steps",
                                                     interactive=True)
                embedding_scale_longform = gr.Slider(minimum=1, maximum=5, value=1.25, step=0.1,
                                                     label="Intensity",
                                                     info="a Diffusion parameter, it will impact the expressiveness, if you raise it too much it'll break.",
                                                     interactive=True)
                rate_of_speech_longform = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                                    label="Rate of Speech",
                                                    info="Higher = Faster",
                                                    interactive=True)

        with gr.Column(scale=1):
            btn_longform = gr.Button("Synthesize", variant="primary")
            audio_longform = gr.Audio(interactive=False,
                                      label="Synthesized Audio",
                                      waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

            btn_longform.click(LongformSynth_Text,
                               inputs=[inp_longform,
                                       gr.State(None),  # s_prev
                                       gr.State(None),  # Kotodama
                                       alpha_longform,
                                       beta_longform,
                                       gr.State(.8),  # t parameter
                                       diffusion_steps_longform,
                                       embedding_scale_longform,
                                       rate_of_speech_longform],
                               outputs=[audio_longform],
                               concurrency_limit=4)

# Kotodama prompt sampler Inference Block
with gr.Blocks() as prompt_inference:
    with gr.Row():
        with gr.Column(scale=1):
            text_prompt = gr.Textbox(
                label="Text",
                info="Enter the text to synthesize. This text will also be fed to the encoder. Make sure to see the Read Me for more details!",
                value=inputs[0],
                interactive=True,
                scale=5
            )
            description_prompt = gr.Textbox(
                label="Description",
                info="Enter a highly detailed, descriptive prompt that matches the vibe of your text to guide the synthesis.",
                value=prompts[0],
                interactive=True,
                scale=7
            )

            with gr.Row():
                random_btn = gr.Button('Random Example', variant='secondary')

            with gr.Accordion("Advanced Parameters", open=True):
                embedding_scale_prompt = gr.Slider(minimum=1, maximum=5, value=1, step=0.25,
                                                   label="Intensity",
                                                   info="it will impact the expressiveness, if you raise it too much it'll break.",
                                                   interactive=True)
                alpha_prompt = gr.Slider(minimum=0, maximum=1, value=0.0, step=0.1,
                                         label="Alpha",
                                         info="a Diffusion sampler parameter handling the timbre, higher means less affected by the reference | 0 = diffusion is disabled",
                                         interactive=True)
                beta_prompt = gr.Slider(minimum=0, maximum=1, value=0.0, step=0.1,
                                        label="Beta",
                                        info="a Diffusion sampler parameter, higher means less affected by the reference | 0 = diffusion is disabled",
                                        interactive=True)
                diffusion_steps_prompt = gr.Slider(minimum=3, maximum=15, value=10, step=1,
                                                   label="Diffusion Steps",
                                                   interactive=True)
                rate_of_speech_prompt = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1,
                                                  label="Rate of Speech",
                                                  info="Higher = Faster",
                                                  interactive=True)

        with gr.Column(scale=1):
            btn_prompt = gr.Button("Synthesize with Prompt", variant="primary")
            audio_prompt = gr.Audio(interactive=False,
                                    label="Prompt-based Synthesized Audio",
                                    waveform_options={'waveform_color': '#a3ffc3', 'waveform_progress_color': '#e972ab'})

    random_btn.click(
        fn=get_random_prompt_pair,
        inputs=[],
        outputs=[text_prompt, description_prompt]
    )

    btn_prompt.click(Inference_Synth_Prompt,
                     inputs=[text_prompt,
                             description_prompt,
                             gr.State(None),
                             alpha_prompt,
                             beta_prompt,
                             diffusion_steps_prompt,
                             embedding_scale_prompt,
                             rate_of_speech_prompt],
                     outputs=[audio_prompt],
                     concurrency_limit=4)

notes = """
<h1>Notes</h1>

<p>
This work is somewhat different from your typical speech model. It offers a high degree of control<br>
over the generation process, which means it's easy to inadvertently produce unimpressive outputs.
</p>

<p>
<b>Kotodama</b> and the <b>Diffusion sampler</b> can significantly help guide the generation towards<br>
something that aligns with your input, but they aren't foolproof. Turn off the diffusion sampler or <br>
set it to very low values if it doesn't sound good to you. <br>
</p>

<p>
The prompt encoder is also highly experimental and should be treated as a proof of concept. Due to the<br>
overwhelming ratio of female to male speakers and the wide variation in both speakers and their expressions,<br>
the prompt encoder may occasionally produce subpar or contradictory outputs. For example, high expressiveness alongside <br>
high pitch has been associated with female speakers simply because I had orders of magnitude more of them in the dataset.<br>
</p>

<p>
________________________________________________________ <br>
<strong>A useful note about the voice design and prompting:</strong><br>\n
The vibe of the dialogue impacts the generated voice since the Japanese dialogue <br>
and the prompts were jointly trained. This is a peculiar feature of the Japanese language.<br>
For example if you use 俺 (ore)、僕 (boku) or your input is overall masculine <br>
you may get a guy's voice, even if you describe it as female in the prompt. <br> \n
The Japanese text that is fed to the prompt doesn't necessarily have to be <br>
the same as your input, but we can't do it in this demo <br>
to not make the page too convoluted. In a real world scenario, you can just use a <br>
prompt with a suitable Japanese text to guide the model, get the style<br>
then move on to apply it to whatever dialogue you wish your model to speak.<br>
</p>
________________________________________________________ <br>
<p>
The pitch information in my data was accurately calculated, but it only works in comparison to the other speakers <br>
so you may find a deep pitch may not be exactly too deep; although it actually is <br>
when you compare it to others within the same data, also some of the gender labels <br>
are inaccurate since we used a model to annotate them. <br> \n
The main goal of this inference method is to demonstrate that style can be mapped to description's embeddings <br>
yielding reasonably good results.
</p>

<p>
Overall, I'm confident that with a bit of experimentation, you can achieve reasonably good results. <br>
The model should work well out of the box 90% of the time without the need for extensive tweaking.<br>
However, here are some tips in case you encounter issues:
</p>

<h2>Tips:</h2>

<ul>
<li>
Ensure that your input closely matches your reference (audio or text prompt) in terms of tone,<br>
non-verbal cues, duration, etc.
</li>

<li>
If your audio is too long but the input is too short, the speech rate will be slow, and vice versa.
</li>

<li>
Experiment with the <b>alpha</b>, <b>beta</b>, and <b>Intensity</b> parameters. The Diffusion<br>
sampler is non-deterministic, so regenerate a few times if you're not satisfied with the output.
</li>

<li>
The speaker's share and expressive distribution in the dataset significantly impact the quality;<br>
you won't necessarily get perfect results with all speakers.
</li>

<li>
Punctuation is very important, for example if you add a «!» mark it will raise the voice or make it more intense.
</li>

<li>
Not all speakers are equal. Less represented speakers or out-of-distribution inputs may result<br>
in artifacts.
</li>

<li>
If the Diffusion sampler works but the speaker didn't have a certain expression (e.g., extreme anger)<br>
in the dataset, try raising the diffusion sampler's parameters and let it handle everything. Though<br>
it may result in less speaker similarity, the ideal way to handle this is to cook new vectors by<br>
transferring an emotion from one speaker to another. But you can't do that in this space.
</li>

<li>
For voice-based inference, you can use litagin's awesome <a href="https://huggingface.co/datasets/litagin/Moe-speech" target="_blank">Moe-speech dataset</a>,<br>
as part of the training data includes a portion of that.
</li>

<li>
You may also want to tweak the phonemes if you're going for something wild. <br>
I have used cutlet in the backend, but that doesn't seem to like some of my mappings.
</li>
</ul>
"""

notes_jp = """
<h1>メモ</h1>

<p>
この作業は、典型的なスピーチモデルとは少し異なります。生成プロセスに対して高い制御を提供するため、意図せずに<br>
比較的にクオリティーの低い出力を生成してしまうことが容易です。
</p>

<p>
<b>Kotodama</b>と<b>Diffusionサンプラー</b>は、入力に沿ったものを生成するための大きな助けとなりますが、<br>
万全というわけではありません。良いアウトプットが出ない場合は、ディフュージョンサンプラーをオフにするか、非常に低い値に設定してください。
</p>

_____________________________________________<br>\n
<strong>音声デザインとプロンプトに関する有用なメモ:</strong><br>
ダイアログの雰囲気は、日本語のダイアログとプロンプトが共同でTrainされたため、生成される音声に影響を与えます。<br>
これは日本語の特徴的な機能です。例えば、「俺」や「僕」を使用したり、全体的に男性らしい入力をすると、<br>
プロンプトで女性と記述していても、男性の声が得られる可能性があります。<br>
プロンプトに入力される日本語のテキストは、必ずしも入力内容と同じである必要はありませんが、<br>
このデモではページが複雑になりすぎないようにそれを行うことはできません。<br>
実際のシナリオでは、適切な日本語のテキストを含むプロンプトを使用してモデルを導き、<br>
スタイルを取得した後、それを希望するダイアログに適用することができます。<br>

_____________________________________________<br>\n

<p>
プロンプトエンコーダも非常に実験的であり、概念実証として扱うべきです。女性話者対男性話者の比率が圧倒的で、<br>
また話者とその表現に大きなバリエーションがあるため、エンコーダは質の低い出力を生成する可能性があります。<br>
例えば、高い表現力は、データセットに多く含まれていた女性話者と関連付けられています。<br>
それに、データのピッチ情報は正確に計算されましたが、それは他のスピーカーとの比較でしか機能しません...<br>
だから、深いピッチが必ずしも深すぎるわけではないことに気づくかもしれません。<br>
ただし、実際には、同じデータ内の他の人と比較すると、深すぎます。このインフレンスの主な目的は、<br>
スタイルベクトルを記述にマッピングし、合理的に良い結果を得ることにあります。
</p>

<p>
全体として、少しの実験でほぼ望む結果を達成できると自信を持っています。90%のケースで、大幅な調整を必要とせず、<br>
そのままでうまく動作するはずです。しかし、問題が発生した場合のためにいくつかのヒントがあります:
</p>

<h2>ヒント:</h2>

<ul>
<li>
入力がリファレンス(音声またはテキストプロンプト)とトーン、非言語的な手がかり、<br>
長さなどで密接に一致していることを確認してください。
</li>

<li>
音声が長すぎるが入力が短すぎる場合、話速が遅くなります。その逆もまた同様です。
</li>

<li>
アルファ、ベータ、および埋め込みスケールのパラメータを試行錯誤してください。Diffusionサンプラーは<br>
非決定的なので、満足のいく出力が得られない場合は何度か再生成してください。
</li>

<li>
データセット内の話者の分布と表現力の分布は品質に大きく影響します。<br>
すべての話者で必ずしも完璧な結果が得られるわけではありません。
</li>

<li>
句読点は重要です。たとえば、「!」を使えば、スタイルのインテンシティが上がります。
</li>

<li>
すべての話者が平等に表現されているわけではありません。少ない表現の話者や<br>
分布外の入力はアーティファクトを生じさせる可能性があります。
</li>

<li>
Diffusionサンプラーが機能しているが、データセット内で特定の表現(例:極度の怒り)がない場合、<br>
Diffusionサンプラーのパラメータを引き上げ、サンプラーにすべてを任せてください。ただし、それにより<br>
話者の類似性が低下する可能性があります。この問題を理想的に解決する方法は、ある話者から別の話者に<br>
感情を転送し新しいベクトルを作成することですが、ここではできません。
</li>

<li>
音声ベースのインフレンスには、トレーニングデータの一部としてMoe-speechデータセットの一部を含む<br>
<a href="https://huggingface.co/datasets/litagin/Moe-speech" target="_blank">litaginの素晴らしいデータセット</a>を使用できます。
</li>

<li>
たまには音素の調整が必要になる場合もあります。バックエンドではcutletを使っているのですが、<br>
いくつかのOODマッピングがcutletと相性が良くないみたいです。
</li>
</ul>
"""
with gr.Blocks() as read_me:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(notes)

with gr.Blocks() as read_me_jp:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(notes_jp)

custom_css = """
.tab-label {
    color: #FFD700 !important;
}
"""

with gr.Blocks(title="Tsukasa 司", css=custom_css + "footer{display:none !important}", theme="Respair/Shiki@1.2.1") as demo:
    # gr.DuplicateButton("Duplicate Space")
    gr.Markdown(INTROTXT)

    gr.TabbedInterface([longform, audio_inf, prompt_inference, read_me, read_me_jp],
                       ['Kotodama Text Inference', 'Voice-guided Inference', 'Prompt-guided Inference [Highly Experimental - not optimized]', 'Read Me! [English]', 'Read Me! [日本語]'])

if __name__ == "__main__":
    demo.queue(api_open=False, max_size=15).launch(show_api=False, share=True)
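As a usage note, the Space above is a thin client: every button forwards to a remote backend through gradio_client. The sketch below (illustration only, not part of the commit) calls the /LongformSynth_Text endpoint directly, mirroring the argument order of the LongformSynth_Text wrapper above; the URL is the expiring share link hardcoded in the file, so treat it as a placeholder for your own backend Space.

```python
# Minimal sketch: call the remote backend directly with gradio_client.
from gradio_client import Client

client = Client('https://9155368f00f8a50bc7.gradio.live')  # placeholder/expired share link

result = client.predict(
    "時には、静けさの中にこそ、本当の答えが見つかるものですね。",  # text
    0.0,    # alpha (0 = diffusion disabled)
    0.0,    # beta
    10,     # diffusion steps
    1.25,   # intensity / embedding scale
    1.0,    # rate of speech
    api_name="/LongformSynth_Text",
)
print(result)  # shape of the return value depends on the backend Space
```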
reference_sample_wavs/01001240.ogg
ADDED
Binary file (168 kB).
reference_sample_wavs/01008270.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8a306920452276a8b801454ba5d540c7f3c28a3fc0d5ce01bf4a3f679e0f42c3
size 1082540
reference_sample_wavs/kaede_san.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:376737a52bf7f67ba6035597bae5ad87b5220d005bad78318d3f8062eb9ff692
size 1812558
reference_sample_wavs/riamu_zeroshot_01.wav
ADDED
Binary file (535 kB).
reference_sample_wavs/riamu_zeroshot_02.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fa74683a6ac7dca963e3ae4b10f5984902683bd55d2806542b8821a9d07beaa2
size 1427500
reference_sample_wavs/sample_ref01.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4241b264d96819f6d4290c23861401f7b116bbb9fa9aace8b65add01b0d812b
size 1644002
reference_sample_wavs/sample_ref02.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a38f2f13d1035d0148d965410cde080c6371c9460e17f147ef130aacb4551b1c
size 1803998
reference_sample_wavs/shiki_fine05.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dd063aba7ad59b2bbfb5ed57d164dbbf75c70b91b163c851ca334661911c16c5
size 2123200
reference_sample_wavs/syuukovoice_200918_3_01.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7929dc92fdfa61ba580a20d95a677d1f6fe8de10edeae6778d664075e43aeb02
size 1979500
requirements.txt
ADDED
@@ -0,0 +1,2 @@
gradio
gradio_client
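Since app_tsumugi_remote.py only proxies requests to a remote backend through gradio_client, these two packages are all the Space itself needs; a typical local run would be `pip install -r requirements.txt` followed by `python app_tsumugi_remote.py`, assuming the hardcoded backend share link is replaced with a live one.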