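# AI Language Buddy: a Gradio voice-chat app that transcribes the user's
# speech with faster-whisper and replies with edge-tts audio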
import gradio as gr
from faster_whisper import WhisperModel
import edge_tts
import tempfile
import asyncio

# Speech-to-text model: "tiny" is fast on CPU; larger checkpoints trade speed for accuracy
model = WhisperModel("tiny", compute_type="float32")

# Text-to-speech: synthesize `text` with an edge-tts neural voice and return
# the path to a temporary MP3 file
async def text_to_speech(text, voice):
    communicate = edge_tts.Communicate(text, voice)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    # Save after the `with` block so the temp file handle is closed first;
    # writing to a still-open temp file fails on Windows
    await communicate.save(tmp_path)
    return tmp_path

# Main handler: transcribe the user's speech, build a reply, and synthesize it
def generate_response(
    language_level, buddy_personality,
    language_choice, user_query_audio,
    chatbot_history
):
    # Convert the recorded audio to text with faster-whisper
    language_codes = {'English': 'en',
                      'Spanish': 'es',
                      'Japanese': 'ja'}

    user_query_transcribed_segments, info = model.transcribe(
        audio=user_query_audio,
        language=language_codes[language_choice]
    )
    # Join all segments; keeping only the first would drop text on longer recordings
    user_query_transcribed = ' '.join(
        segment.text.strip() for segment in user_query_transcribed_segments
    )

    # Placeholder "LLM" reply: echo the transcription back. Swap in a real
    # LLM call here, using language_level and buddy_personality in the prompt.
    bot_message = 'bot: ' + user_query_transcribed

    chatbot_history.append(gr.ChatMessage(role="user", content=user_query_transcribed))
    chatbot_history.append(gr.ChatMessage(role="assistant", content=bot_message))

    # Convert the reply to audio, matching the edge-tts voice to the chosen
    # language rather than always speaking English
    voice_short_names = {'English': 'en-US-BrianNeural',
                         'Spanish': 'es-ES-AlvaroNeural',
                         'Japanese': 'ja-JP-KeitaNeural'}
    bot_message_audio = asyncio.run(
        text_to_speech(text=bot_message, voice=voice_short_names[language_choice])
    )

    # Return None to clear the microphone input, the updated history for the
    # chatbot display, and the reply audio for autoplay
    return None, chatbot_history, bot_message_audio

with gr.Blocks() as demo:

    header_section = gr.Markdown(
    """
    # AI Language Buddy!
    Click the **converse** button to practice your language skills!
    """)
    
    language = gr.Dropdown(
        choices=['English', 'Spanish', 'Japanese'],
        label='Language Choice',
        value='English'
    )
    
    language_level = gr.Dropdown(
        choices=['Beginner', 'Intermediate', 'Advanced'],
        label='Language Level',
        value='Beginner'
    )
    
    personality = gr.Dropdown(
        choices=['Formal Teacher', 'Flirty Friend', 'Sarcastic Bro'],
        label='Language Buddy Personality',
        value='Flirty Friend'
    )

    chatbot = gr.Chatbot(type='messages')
    
    user_input = gr.Audio(
        sources='microphone',
        show_download_button=True,
        type='filepath'
    )

    ai_response = gr.Audio(
        autoplay=True
    )

    converse_button = gr.Button("Send Message")

    clear_button = gr.Button("Clear Convo History")

    converse_button.click(
        fn=generate_response,
        inputs=[
            language_level,
            personality,
            language,
            user_input,
            chatbot
        ],
        outputs=[user_input,
                 chatbot,
                 ai_response]
    )
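
    # Wire up the clear button, which was declared above: empty the chat
    # history and reset both audio widgets
    clear_button.click(
        fn=lambda: ([], None, None),
        outputs=[chatbot, user_input, ai_response]
    )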

# launch() starts a local server (http://127.0.0.1:7860 by default);
# pass share=True for a temporary public link
demo.launch()