DSatishchandra committed
Commit 77c2b9f
1 Parent(s): 997c236

Update app.py

Files changed (1)
  1. app.py +79 -203
app.py CHANGED
@@ -1,204 +1,80 @@
- import gradio as gr
- from gradio_webrtc import WebRTC, StreamHandler, get_twilio_turn_credentials
- import websockets.sync.client
- import numpy as np
- import json
- import base64
- import os
- from dotenv import load_dotenv
-
- class GeminiConfig:
-     def __init__(self):
-         load_dotenv()
-         self.api_key = self._get_api_key()
-         self.host = 'generativelanguage.googleapis.com'
-         self.model = 'models/gemini-2.0-flash-exp'
-         self.ws_url = f'wss://{self.host}/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}'
-
-     def _get_api_key(self):
-         api_key = os.getenv('GOOGLE_API_KEY')
-         if not api_key:
-             raise ValueError("GOOGLE_API_KEY not found in environment variables. Please set it in your .env file.")
-         return api_key
-
- class AudioProcessor:
-     @staticmethod
-     def encode_audio(data, sample_rate):
-         encoded = base64.b64encode(data.tobytes()).decode('UTF-8')
-         return {
-             'realtimeInput': {
-                 'mediaChunks': [{
-                     'mimeType': f'audio/pcm;rate={sample_rate}',
-                     'data': encoded,
-                 }],
-             },
-         }
-
-     @staticmethod
-     def process_audio_response(data):
-         audio_data = base64.b64decode(data)
-         return np.frombuffer(audio_data, dtype=np.int16)
-
- class GeminiHandler(StreamHandler):
-     def __init__(self,
-                  expected_layout="mono",
-                  output_sample_rate=24000,
-                  output_frame_size=480) -> None:
-         super().__init__(expected_layout, output_sample_rate, output_frame_size,
-                          input_sample_rate=24000)
-         self.config = GeminiConfig()
-         self.ws = None
-         self.all_output_data = None
-         self.audio_processor = AudioProcessor()
-
-     def copy(self):
-         return GeminiHandler(
-             expected_layout=self.expected_layout,
-             output_sample_rate=self.output_sample_rate,
-             output_frame_size=self.output_frame_size
-         )
-
-     def _initialize_websocket(self):
-         try:
-             self.ws = websockets.sync.client.connect(
-                 self.config.ws_url,
-                 timeout=30
-             )
-             initial_request = {
-                 'setup': {
-                     'model': self.config.model,
-                 }
-             }
-             self.ws.send(json.dumps(initial_request))
-             setup_response = json.loads(self.ws.recv())
-             print(f"Setup response: {setup_response}")
-         except websockets.exceptions.WebSocketException as e:
-             print(f"WebSocket connection failed: {str(e)}")
-             self.ws = None
-         except Exception as e:
-             print(f"Setup failed: {str(e)}")
-             self.ws = None
-
-     def receive(self, frame: tuple[int, np.ndarray]) -> None:
-         try:
-             if not self.ws:
-                 self._initialize_websocket()
-
-             _, array = frame
-             array = array.squeeze()
-             audio_message = self.audio_processor.encode_audio(array, self.output_sample_rate)
-             self.ws.send(json.dumps(audio_message))
-         except Exception as e:
-             print(f"Error in receive: {str(e)}")
-             if self.ws:
-                 self.ws.close()
-             self.ws = None
-
-     def _process_server_content(self, content):
-         for part in content.get('parts', []):
-             data = part.get('inlineData', {}).get('data', '')
-             if data:
-                 audio_array = self.audio_processor.process_audio_response(data)
-                 if self.all_output_data is None:
-                     self.all_output_data = audio_array
-                 else:
-                     self.all_output_data = np.concatenate((self.all_output_data, audio_array))
-
-                 while self.all_output_data.shape[-1] >= self.output_frame_size:
-                     yield (self.output_sample_rate,
-                            self.all_output_data[:self.output_frame_size].reshape(1, -1))
-                     self.all_output_data = self.all_output_data[self.output_frame_size:]
-
-     def generator(self):
-         while True:
-             if not self.ws:
-                 print("WebSocket not connected")
-                 yield None
-                 continue
-
-             try:
-                 message = self.ws.recv(timeout=5)
-                 msg = json.loads(message)
-
-                 if 'serverContent' in msg:
-                     content = msg['serverContent'].get('modelTurn', {})
-                     yield from self._process_server_content(content)
-             except TimeoutError:
-                 print("Timeout waiting for server response")
-                 yield None
-             except Exception as e:
-                 print(f"Error in generator: {str(e)}")
-                 yield None
-
-     def emit(self) -> tuple[int, np.ndarray] | None:
-         if not self.ws:
-             return None
-         if not hasattr(self, '_generator'):
-             self._generator = self.generator()
-         try:
-             return next(self._generator)
-         except StopIteration:
-             self.reset()
-             return None
-
-     def reset(self) -> None:
-         if hasattr(self, '_generator'):
-             delattr(self, '_generator')
-         self.all_output_data = None
-
-     def shutdown(self) -> None:
-         if self.ws:
-             self.ws.close()
-
-     def check_connection(self):
-         try:
-             if not self.ws or self.ws.closed:
-                 self._initialize_websocket()
-             return True
-         except Exception as e:
-             print(f"Connection check failed: {str(e)}")
-             return False
-
- class GeminiVoiceChat:
-     def __init__(self):
-         load_dotenv()
-         self.demo = self._create_interface()
-
-     def _create_interface(self):
-         with gr.Blocks() as demo:
-             gr.HTML("""
-                 <div style='text-align: center'>
-                     <h1>Gemini 2.0 Voice Chat</h1>
-                     <p>Speak with Gemini using real-time audio streaming</p>
-                 </div>
-             """)
-
-             webrtc = WebRTC(
-                 label="Conversation",
-                 modality="audio",
-                 mode="send-receive",
-                 rtc_configuration=get_twilio_turn_credentials()
-             )
-
-             webrtc.stream(
-                 GeminiHandler(),
-                 inputs=[webrtc],
-                 outputs=[webrtc],
-                 time_limit=90,
-                 concurrency_limit=10
-             )
-         return demo
-
-     def launch(self):
-         self.demo.launch(
-             server_name="0.0.0.0",
-             server_port=int(os.environ.get("PORT", 7860)),
-             share=True,
-             ssl_verify=False,
-             ssl_keyfile=None,
-             ssl_certfile=None
-         )
-
+ import speech_recognition as sr
+ import pyttsx3
+ from transformers import pipeline
+ import random
+
+ # Initialize the speech engine
+ engine = pyttsx3.init()
+
+ # Menu data from the second image (hardcoded for simplicity)
+ menu = {
+     "Appetizer": ["Veg Samosas", "Cut Mirchi", "Onion", "Spinach", "Mixed Vegetable"],
+     "Pakodas": ["Veg Pakoda", "Chicken Pakoda", "Fish Pakoda"],
+     "Manchurian": ["Vegetable", "Paneer", "Chicken", "Fish", "Jhinga"],
+     "Chilly": ["Gobi", "Paneer", "Chicken", "Fish", "Shrimp"],
+     "Chef's Special": ["Murgh (Chicken)", "Gosht (Goat)", "Jhinga (Shrimp)", "Fish Fry"],
+     "Vegetarian Entree": ["Dal Fry", "Dal Makhani", "Channa Masala", "Aloo Gobi Masala", "Saag Paneer"],
+     "Chettinad": ["Egg", "Murgh (Chicken)", "Gosht (Goat)", "Jhinga (Shrimp)", "Crab"],
+     "Butter Masala": ["Chicken", "Shrimp", "Gosht (Goat)"]
+ }
+
+ # Initialize the speech recognizer
+ recognizer = sr.Recognizer()
+
+ # Function to speak text using text-to-speech
+ def speak(text):
+     engine.say(text)
+     engine.runAndWait()
+
+ # Function to listen to the user's voice
+ def listen():
+     with sr.Microphone() as source:
+         print("Listening for your order...")
+         audio = recognizer.listen(source)
+         try:
+             # Using Google's speech recognition
+             return recognizer.recognize_google(audio)
+         except sr.UnknownValueError:
+             speak("Sorry, I could not understand that. Could you please repeat?")
+             return None
+         except sr.RequestError:
+             speak("Sorry, there was an issue with the service.")
+             return None
+
+ # Function to process the order
+ def process_order(order):
+     response = "You have ordered the following: "
+     order = order.lower()
+
+     # Check for matching menu items
+     ordered_items = []
+     for category, items in menu.items():
+         for item in items:
+             if item.lower() in order:
+                 ordered_items.append(item)
+
+     if ordered_items:
+         response += ', '.join(ordered_items) + ". Is that correct?"
+         speak(response)
+         confirmation = listen()
+         if confirmation and "yes" in confirmation.lower():
+             speak("Thank you for your order. It will be ready shortly!")
+         else:
+             speak("Please tell me again what you'd like to order.")
+     else:
+         speak("Sorry, I couldn't find any items matching your order. Can you try again?")
+
+ # Main function to start the assistant
+ def start_assistant():
+     speak("Welcome to the Voice Food Ordering Assistant!")
+     speak("What would you like to order today?")
+     while True:
+         order = listen()
+         if order:
+             process_order(order)
+         else:
+             speak("Sorry, I didn't catch that.")
+
+ # Run the assistant
  if __name__ == "__main__":
-     app = GeminiVoiceChat()
-     app.launch()
+     start_assistant()
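
As a quick, microphone-free check of the substring matching that the new process_order() relies on, here is a minimal sketch. It is not part of the commit: it copies a small excerpt of the hardcoded menu dict rather than importing app.py, so it runs without pyttsx3, a TTS driver, or an audio device, and the sample order text is illustrative.

# Standalone sketch of the menu-matching step used by process_order() in app.py.
# The menu excerpt below mirrors two categories from the hardcoded dict.
menu = {
    "Appetizer": ["Veg Samosas", "Cut Mirchi", "Onion", "Spinach", "Mixed Vegetable"],
    "Pakodas": ["Veg Pakoda", "Chicken Pakoda", "Fish Pakoda"],
}

def match_items(order: str) -> list[str]:
    # Case-insensitive substring match of each menu item against the spoken order.
    order = order.lower()
    return [item for items in menu.values() for item in items if item.lower() in order]

if __name__ == "__main__":
    print(match_items("I'd like veg samosas and a chicken pakoda, please"))
    # -> ['Veg Samosas', 'Chicken Pakoda']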