Update app.py
app.py CHANGED
@@ -78,7 +78,7 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         self.quit = asyncio.Event()
         self.session = None
         self.last_frame_time = 0
-        self.conversation_history = []
+        self.conversation_history = []  # Added conversation history
         self.latest_text = ""


@@ -126,21 +126,18 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
         ):
             self.audio_queue.put_nowait(audio_response)

-    async def receive(self, frame: tuple[int, np.ndarray], text_input: str) -> None:
+    async def receive(self, frame: tuple[int, np.ndarray], text_input: str) -> None:  # Added text_input here
         _, array = frame
         array = array.squeeze()
-        audio_message = encode_audio(array)
         if self.session:
             if text_input:  # Checks if text was inputted
-
-
-
-
-
-
-
-                self.conversation_history.append({"role": "user", "content": str(base64.b64encode(array.tobytes()).decode("UTF-8"))})  # Stores conversation in history
-
+                full_prompt = PROMPT_BASE + "\n\n" + "User: " + text_input
+                await self.session.send({"mime_type": "text", "data": full_prompt})
+                self.conversation_history.append({"role": "user", "content": text_input})  # Add text conversation
+            elif array.size:  # Checks if audio was received
+                full_prompt = PROMPT_BASE + "\n\n" + "User: " + str(base64.b64encode(array.tobytes()).decode("UTF-8"))
+                await self.session.send({"mime_type": "text", "data": full_prompt})
+                self.conversation_history.append({"role": "user", "content": str(base64.b64encode(array.tobytes()).decode("UTF-8"))})

     async def emit(self) -> AudioEmitType:
         if not self.args_set.is_set():
@@ -149,16 +146,14 @@
             asyncio.create_task(self.connect(self.latest_args[1]))
         array = await self.audio_queue.get()
         return (self.output_sample_rate, array)
-
+
     def set_text(self, text):
         self.latest_text = text
-
-    def get_text(self):
-        return self.latest_text
-
+
     def clear_text(self):
-
-
+        self.latest_text = ""
+        return ""
+

     def shutdown(self) -> None:
         self.quit.set()
@@ -205,18 +200,18 @@ with gr.Blocks(css=css) as demo:
             image_input = gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
             text_input = gr.Textbox(label="Text Message", placeholder="Type your message here")
             send_button = gr.Button("Send")
+
             handler = GeminiHandler()
-            send_button.click(handler.set_text,inputs=[text_input], outputs=[])
+            send_button.click(handler.set_text, inputs=[text_input], outputs=[])
             send_button.click(handler.clear_text, inputs=[], outputs=[text_input])
             webrtc.stream(
                 handler,
-                inputs=[webrtc, api_key, image_input,
+                inputs=[webrtc, api_key, image_input, text_input],
                 outputs=[webrtc],
                 time_limit=90,
                 concurrency_limit=2,
             )


-
 if __name__ == "__main__":
     demo.launch()
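
For reference, the change above wires a text path through the streaming handler: the Send button stores the textbox value via set_text, webrtc.stream passes it into receive as text_input, and receive wraps it in the base prompt, sends it to the Gemini session, and records it in conversation_history. Below is a minimal, self-contained sketch of that flow; FakeSession, TextOnlyHandler, and the PROMPT_BASE value are hypothetical stand-ins for illustration and are not part of this commit.

import asyncio

PROMPT_BASE = "You are a helpful assistant."  # hypothetical stand-in; app.py defines its own prompt

class FakeSession:
    """Hypothetical stand-in for the Gemini live session used in app.py."""
    async def send(self, message: dict) -> None:
        print("sent to model:", message["data"])

class TextOnlyHandler:
    """Trimmed-down sketch of the text branch added to GeminiHandler.receive."""
    def __init__(self):
        self.session = FakeSession()
        self.conversation_history = []
        self.latest_text = ""

    def set_text(self, text):
        # Bound to the Send button: stash the message for the next receive() call.
        self.latest_text = text

    def clear_text(self):
        # Also bound to the Send button: reset the stored message and clear the textbox.
        self.latest_text = ""
        return ""

    async def receive(self, frame, text_input):
        # Mirrors the new text branch: wrap the user text in the base prompt,
        # forward it to the session, and record it in the history.
        if self.session and text_input:
            full_prompt = PROMPT_BASE + "\n\n" + "User: " + text_input
            await self.session.send({"mime_type": "text", "data": full_prompt})
            self.conversation_history.append({"role": "user", "content": text_input})

async def main():
    handler = TextOnlyHandler()
    handler.set_text("What is in this image?")        # user clicks Send
    await handler.receive(None, handler.latest_text)  # stream loop delivers the text
    handler.clear_text()
    print(handler.conversation_history)

asyncio.run(main())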