File size: 5,731 Bytes
f0ad67c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8afdb95
 
0bb7775
8afdb95
0bb7775
c2924bd
0bb7775
 
 
 
 
 
8afdb95
02d76aa
b92c64c
8afdb95
02d76aa
586d983
02d76aa
 
586d983
02d76aa
 
 
586d983
94786a8
 
bbbe41e
 
 
586d983
94786a8
586d983
02d76aa
 
c2924bd
 
 
586d983
02d76aa
586d983
 
 
8afdb95
02d76aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8afdb95
02d76aa
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# import os
# import gradio as gr
# import whisper
# from gtts import gTTS
# import io
# from groq import Groq

# # Initialize the Groq client
# groq_api_key = os.getenv('GROQ_API_KEY')
# client = Groq(api_key=groq_api_key) 

# # Load the Whisper model
# model = whisper.load_model("base")  # You can choose other models like "small", "medium", "large"

# def process_audio(file_path):
#     try:
#         # Load the audio file
#         audio = whisper.load_audio(file_path)

#         # Transcribe the audio using Whisper
#         result = model.transcribe(audio)
#         text = result["text"]

#         # Generate a response using Groq
#         chat_completion = client.chat.completions.create(
#             messages=[{"role": "user", "content": text}],
#             model="llama3-8b-8192",  # Replace with the correct model if necessary
#         )

#         # Access the response using dot notation
#         response_message = chat_completion.choices[0].message.content.strip()

#         # Convert the response text to speech
#         tts = gTTS(response_message)
#         response_audio_io = io.BytesIO()
#         tts.write_to_fp(response_audio_io)  # Save the audio to the BytesIO object
#         response_audio_io.seek(0)

#         # Save audio to a file to ensure it's generated correctly
#         with open("response.mp3", "wb") as audio_file:
#             audio_file.write(response_audio_io.getvalue())

#         # Return the response text and the path to the saved audio file
#         return response_message, "response.mp3"

#     except Exception as e:
#         return f"An error occurred: {e}", None

# iface = gr.Interface(
#     fn=process_audio,
#     inputs=gr.Audio(type="filepath"),  # Use type="filepath"
#     outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
#     live=True
# )

# iface.launch()
import os
import gradio as gr
import whisper
from gtts import gTTS
from anthropic import Anthropic  # Import the Anthropic client
import io  # Import io for BytesIO

# Get the Anthropic API key from environment variables.
# Fail fast at import time so the app never starts with a missing credential.
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
if not ANTHROPIC_API_KEY:
    raise ValueError("ANTHROPIC_API_KEY environment variable is not set.")
client = Anthropic(api_key=ANTHROPIC_API_KEY)  # Initialize the Anthropic client (shared by chatbot())

# Load Whisper model once at startup — model loading is slow, so it must not
# happen per request. "base" trades accuracy for speed.
model = whisper.load_model("base")  # You can also use "small", "medium", "large"

def chatbot(audio=None):
    """Transcribe a voice recording, get a Claude reply, and synthesize speech.

    Args:
        audio: Filesystem path to the recorded audio clip (Gradio supplies
            this with ``type="filepath"``), or None when nothing was recorded.

    Returns:
        tuple[str, str | None]: (response text, path to the response MP3),
        or an error message and None on failure.
    """
    try:
        if audio is None:
            return "No input detected. Please provide an audio input.", None

        # Transcribe the audio input using Whisper.
        transcription = model.transcribe(audio)
        user_input = transcription.get("text", "").strip()
        if not user_input:
            # Nothing intelligible was transcribed — don't send an empty prompt.
            return "Could not understand the audio. Please try again.", None

        # The legacy Anthropic completions endpoint requires the
        # "\n\nHuman: ... \n\nAssistant:" framing; a bare prompt is rejected.
        # NOTE(review): "claude-v1" is a deprecated model name — confirm it is
        # still served, or migrate to the Messages API.
        chat_completion = client.completions.create(
            model="claude-v1",
            prompt=f"\n\nHuman: {user_input}\n\nAssistant:",
            max_tokens_to_sample=100,
        )
        # The SDK returns a Completion object, not a dict — use attribute
        # access (subscripting raised TypeError in the original).
        response_text = chat_completion.completion.strip()

        # Convert the response text to speech using gTTS.
        # gTTS.save() expects a file *path* (it calls open() on its argument),
        # so write into the in-memory buffer via write_to_fp() instead.
        tts = gTTS(text=response_text, lang='en')
        response_audio_io = io.BytesIO()
        tts.write_to_fp(response_audio_io)
        response_audio_io.seek(0)

        # Persist to disk and return the path — gr.Audio handles a filepath
        # reliably, whereas a raw BytesIO is not a supported output value.
        with open("response.mp3", "wb") as audio_file:
            audio_file.write(response_audio_io.getvalue())

        return response_text, "response.mp3"

    except Exception as e:
        # Top-level UI boundary: surface the error in the textbox instead of
        # crashing the Gradio event handler.
        return f"An error occurred: {e}", None

def clear_inputs():
    """Return a None for each of the three widgets (input audio, reply text,
    reply audio) so the Clear button resets the whole interface."""
    return (None,) * 3

# Create a custom interface
def build_interface():
    """Assemble and return the Gradio Blocks UI for the voice chatbot.

    Layout: a styled title banner, a row with the microphone input on the
    left and the text/audio responses on the right, and a Clear button.
    Recording new audio triggers the chatbot; Clear resets all widgets.
    """
    with gr.Blocks(css="""
        .block-title {
            text-align: center; 
            color: white;
            background-color: #4CAF50; 
            padding: 10px;
            border-radius: 8px;
        }
        .gradio-row {
            background-color: #f9f9f9;
            border-radius: 8px;
            padding: 20px;
            margin: 10px;
            box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.1);
        }
        .gradio-column {
            padding: 10px;
        }
        .gradio-button {
            background-color: #ff6347 !important;
            color: white !important;
            border-radius: 8px !important;
            padding: 10px 20px !important;
            font-size: 16px !important;
            border: none !important;
            cursor: pointer !important;
            box-shadow: 0px 4px 12px rgba(0, 0, 0, 0.2) !important;
            transition: background-color 0.3s ease !important;
        }
        .gradio-button:hover {
            background-color: #e5533d !important;
        }
    """) as ui:
        # Title banner styled via the .block-title CSS class above.
        gr.Markdown(
            """
            <h1 class="block-title">Voice-to-Voice AI Chatbot</h1>
            """
        )

        # Input column (narrow) on the left, output column (wide) on the right.
        with gr.Row(elem_classes="gradio-row"):
            with gr.Column(elem_classes="gradio-column", scale=1):
                mic_input = gr.Audio(type="filepath", label="Record Your Voice")
            with gr.Column(elem_classes="gradio-column", scale=2):
                reply_text = gr.Textbox(label="Chatbot Response")
                reply_audio = gr.Audio(label="Audio Response")

        reset_button = gr.Button("Clear", elem_classes="gradio-button")

        # Wire up events: Clear empties every widget; a new recording runs
        # the chatbot and fills both outputs.
        reset_button.click(
            fn=clear_inputs,
            outputs=[mic_input, reply_text, reply_audio]
        )
        mic_input.change(
            fn=chatbot,
            inputs=[mic_input],
            outputs=[reply_text, reply_audio]
        )

    return ui

# Entry point: build the UI and start the Gradio server when run as a script.
if __name__ == "__main__":
    build_interface().launch()