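"""Gradio demo that wraps this repository's synthesize.py to generate
emotional speech on EmoV_DB, conditioned either on BERT text embeddings
or on a discrete emotion ID."""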
import gradio as gr
import subprocess

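# Example sentences offered in the "Choose from examples" dropdown.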
predefined_texts = [
    "A combination of Canadian capital quickly organized and petitioned for the same privileges.",
    "The date was nearly eighteen years old.",
    "Hardly were our plans made public before we were met by powerful opposition.",
]

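# Emotion labels shown in the UI, mapped to the integer IDs synthesize.py expects.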
emotion_mapping = {"amused": 0, "anger": 1, "disgust": 2, "neutral": 3, "sleepiness": 4}

def synthesize_speech(input_type, text, own_text, speaker_id, embed_type, emotion_id):
    # Use the dropdown sentence or the free-form textbox, depending on the radio choice.
    if input_type == "Choose from examples":
        selected_text = text
    else:
        selected_text = own_text

    # Build the synthesize.py call as an argument list (no shell) so quotes and
    # apostrophes in the text cannot break, or inject into, the command line.
    command = [
        "python3", "synthesize.py",
        "--text", selected_text,
        "--speaker_id", str(speaker_id),
        "--restore_step", "900000",
        "--mode", "single",
        "-p", "config/EmoV_DB/preprocess.yaml",
        "-m", "config/EmoV_DB/model.yaml",
        "-t", "config/EmoV_DB/train.yaml",
    ]
    if embed_type == "bert_embed":
        command += ["--bert_embed", "1"]
    else:
        command += ["--emotion_id", str(emotion_mapping[emotion_id])]
    subprocess.run(command, check=True)

    # synthesize.py saves the generated wav named after the input text.
    return f"output/result/EmoV_DB/{selected_text}.wav"

# UI widgets; the relevant subset is read depending on the selected input and embedding type.
input_type = gr.Radio(
    choices=["Choose from examples", "Enter your own text"], label="Input Type")
text = gr.Dropdown(choices=predefined_texts, label="Select a text")
own_text = gr.Textbox(lines=2, label="Enter your own text")
speaker_id = gr.Slider(minimum=0, maximum=3, step=1, value=0, label="Speaker ID")
embed_type = gr.Radio(choices=["bert_embed", "emotion_id"], label="Embedding Type")
emotion_id = gr.Dropdown(choices=list(emotion_mapping.keys()), label="Select Emotion")

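# Wire the widgets to the synthesis function; the output is an audio player
# that reads the generated wav from disk (type="filepath").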
iface = gr.Interface(
    fn=synthesize_speech,
    inputs=[input_type, text, own_text, speaker_id, embed_type, emotion_id],
    outputs=gr.Audio(type="filepath"),
    title="Text-to-Speech Demo",
    description="Select or enter text and configure options to synthesize speech."
)

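# Note: launch() also accepts standard Gradio options such as share=True
# (temporary public URL) or server_name="0.0.0.0" (bind all interfaces);
# neither is required for a local demo.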
if __name__ == "__main__":
    iface.launch()