File size: 3,344 Bytes
d164d7c
6fa22dc
 
 
d164d7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6fa22dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d164d7c
00784e9
 
6fa22dc
 
 
 
d164d7c
6fa22dc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# frontend.py
import gradio as gr
import httpx

# Showcase rows offered in the demo UI. Each row is
# [text instruction, path to an audio file under ./show_case/].
# The comment above each row is the author's short task tag for that example.
examples = [
    # En-De
    [
        "Can you turn my English into German?",
        "./show_case/common_voice_en_19664034.mp3",
    ],
    # ER
    [
        "Can you identify the initial word that connects to 'currency_name' in this audio clip?",
        "./show_case/audio-1434542201-headset.wav",
    ],
    # IC
    [
        "What do you think the speaker's message is intended to be in this audio?",
        "./show_case/audio-1434542201-headset.wav",
    ],
    # DFake
    [
        "What does the person say?",
        "./show_case/p225_002.wav",
    ],
    # DFake (disabled):
    # ["Assess whether this speech's pronunciation is Real or Fake.", "./show_case/Real.wav"],
    # DFake
    [
        "Assess whether this speech's pronunciation is Real or Fake.",
        "./show_case/Fake.wav",
    ],
    # SER(emotion)
    [
        "What emotional weight does the speaker's tone carry?\nPick one answer from A, B, C, and D.\nA: fear\nB: sadness\nC: joy\nD: neutral",
        "./show_case/SER(emotion)_example.wav",
    ],
    # SVD (disabled):
    # ["Assess whether this speech's pronunciation is Real or Fake.", "./show_case/SVD_14154_file31512.mp3.wav_16k.wav_norm.wav_mono.wav_silence.wav"],
    # SNV
    [
        "Choose the most suitable answer from options A, B, C, and D to respond the question in next line, you may only choose A or B or C or D.\nThe number of speakers delivering this speech is what?\nA. 4\nB. 2\nC.1\nD. 3",
        "./show_case/SNV_example.wav",
    ],
    # SLR
    [
        "Identify the language of the conversation you just heard.",
        "./show_case/SLR_example.wav",
    ],
    # SGR
    [
        "tell the gender of the speaker in this audio.",
        "./show_case/SGR_018.wav",
    ],
    # Sound_vocal
    [
        "What's the sound we're hearing in this audio from?",
        "./show_case/Sound_Vocal_example.wav",
    ],
    # Sound_cochl
    [
        "What is your best guess at the setting of this sound clip?",
        "./show_case/Scene_example.wav",
    ],
    # SG
    [
        "Choose the most suitable answer from options A, B, C, and D to respond the question in next line, Please think step by step and you may only choose A or B or C or D.\nRecognize the segment where 'project' is spoken by the speaker.\nA. [5.28, 5.39]\nB. [0.92, 1.39]\nC. [4.75, 5.28]\nD. [3.86, 4.23]",
        "./show_case/SG_audio_1.wav",
    ],
    # SFT_Fisher
    [
        "What type of business does the first person's son have?",
        "./show_case/SFT_Fisher_example.wav",
    ],
]


# Backend endpoint that receives the text instruction and the audio upload.
_BACKEND_URL = "http://36.151.70.8:30113/process/"

# httpx gives up after 5 seconds by default. The request uploads a whole audio
# file and waits for the backend's answer, which presumably can take longer
# than that -- tune this value to the backend's real latency.
_REQUEST_TIMEOUT_S = 60.0


async def call_api(text: str, audio_path: str):
    """Send a text instruction and an audio file to the backend API.

    Args:
        text: Instruction forwarded as the ``text`` form field.
        audio_path: Path of the audio file to upload as ``audio_file``.
            May arrive empty/``None`` when no clip was supplied in the UI.

    Returns:
        The value stored under the ``"result"`` key of the JSON response.

    Raises:
        ValueError: If ``audio_path`` is empty or ``None``.
        OSError: If the audio file cannot be opened or read.
        httpx.HTTPStatusError: If the backend answers with a 4xx/5xx status.
        KeyError: If the JSON response has no ``"result"`` key.
    """
    # Fail with a readable message instead of the TypeError that
    # ``open(None)`` would raise when the audio input was left empty.
    if not audio_path:
        raise ValueError("No audio provided: please upload or record an audio clip.")

    # Read the audio file into memory.
    with open(audio_path, "rb") as audio_file:
        audio_bytes = audio_file.read()

    # Send the multipart request to the backend API.
    async with httpx.AsyncClient(timeout=_REQUEST_TIMEOUT_S) as client:
        response = await client.post(
            _BACKEND_URL,
            files={"audio_file": (audio_path, audio_bytes)},
            data={"text": text},
        )

    # Surface backend failures as HTTP errors rather than letting an error
    # page turn into a confusing JSON-decode or KeyError below.
    response.raise_for_status()
    return response.json()["result"]


# Input widgets, pre-filled with the instruction and audio clip of one of the
# showcase rows so the form can be submitted as soon as the page loads.
_instruction_box = gr.Textbox(
    label="Enter text instruction",
    value="What does the person say?",
)
_audio_input = gr.Audio(
    type="filepath",  # hand call_api a file path rather than decoded audio
    label="Upload Audio",
    value="./show_case/p225_002.wav",
)

# Single text field that displays whatever call_api returns.
_result_box = gr.Textbox(label="Model output")

# Assemble the demo UI: call_api handles each submission, `examples` supplies
# the clickable sample rows, flagging is turned off and example outputs are
# not cached.
iface = gr.Interface(
    fn=call_api,
    inputs=[_instruction_box, _audio_input],
    outputs=_result_box,
    examples=examples,
    allow_flagging="never",
    cache_examples=False,
)

if __name__ == '__main__':
    # Start the Gradio server only when this file is executed as a script, so
    # that importing the module (for example to reuse call_api) does not
    # launch the UI as a side effect.
    #
    # The backend can also be exercised directly, without the UI:
    # curl -X POST -F "text=What does the person say?" -F "audio_file=@./test_audio.wav" http://36.151.70.8:30113/process/
    iface.launch()