Spaces:

humanvprojectceo
/

HumanV

Sleeping

App Files Files Community

humanvprojectceo committed on Feb 9

Commit

c28aa67

verified ·

1 Parent(s): c44a1c5

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -60

app.py CHANGED Viewed

@@ -1,95 +1,103 @@
 import os
 import io
 import asyncio
-import numpy as np
-import librosa
 import soundfile as sf
 import gradio as gr
 from google import genai
 client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
-MODEL = "gemini-2.5-flash-native-audio-preview-12-2025"
 config = {
-    "response_modalities": ["AUDIO"],
-    "system_instruction": "You are a helpful assistant and answer in a friendly tone.",
 }
-async def generate_audio_response(audio_bytes: bytes):
-    async with client.aio.live.connect(model=MODEL, config=config) as session:
-        await session.send_realtime_input(
-            audio={"data": audio_bytes, "mime_type": "audio/pcm"}
-        )
-        audio_chunks = []
-        last_receive_time = asyncio.get_event_loop().time()
-        while True:
-            turn = session.receive()
-            has_new = False
-            async for response in turn:
-                if response.server_content and response.server_content.model_turn:
-                    for part in response.server_content.model_turn.parts:
-                        if hasattr(part, "inline_data") and part.inline_data.data:
-                            audio_chunks.append(part.inline_data.data)
-                            has_new = True
-                            last_receive_time = asyncio.get_event_loop().time()
-            if audio_chunks and not has_new and (asyncio.get_event_loop().time() - last_receive_time > 3):
-                break
-            await asyncio.sleep(0.2)
-            if asyncio.get_event_loop().time() - last_receive_time > 30:
-                break
         full_audio = b''.join(audio_chunks)
         if not full_audio:
             raise ValueError("No audio response received from the model.")
         buf = io.BytesIO(full_audio)
-        y, sr = sf.read(buf, channels=1, samplerate=24000, format="RAW", subtype="PCM_16", dtype="float32")
         return sr, y
-def process_audio(input_path: str | None):
-    if input_path is None:
-        return None, "Please upload a WAV file."
     try:
-        y, orig_sr = librosa.load(input_path, sr=None, mono=True)
-        y = librosa.resample(y, orig_sr=orig_sr, target_sr=16000)
-        y_int = np.int16(y * 32767)
-        audio_bytes = y_int.tobytes()
-        sr, response_audio = asyncio.run(generate_audio_response(audio_bytes))
-        return (sr, response_audio), "Response generated successfully!"
     except Exception as e:
         return None, f"Error: {str(e)}"
 with gr.Blocks() as demo:
-    gr.Markdown("# Gemini Live Audio-to-Audio Demo")
-    gr.Markdown("Upload a WAV file (spoken query). Gemini will respond with spoken audio.")
-    with gr.Row():
-        input_audio = gr.Audio(
-            label="Upload your query (WAV file)",
-            type="filepath",
-            sources=["upload"],
-            format="wav"
-        )
-    with gr.Row():
-        output_audio = gr.Audio(
-            label="Gemini spoken response",
-            type="numpy",
-            autoplay=True
-        )
-    status = gr.Textbox(label="Status")
-    btn = gr.Button("Generate Response")
     btn.click(
         fn=process_audio,
@@ -97,6 +105,4 @@ with gr.Blocks() as demo:
         outputs=[output_audio, status]
     )
-    gr.Markdown("Example test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav")
-demo.launch()

 import os
 import io
 import asyncio
 import soundfile as sf
 import gradio as gr
 from google import genai
+# Gemini API client; the key is read from the GEMINI_API_KEY environment variable.
 client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
+# Model identifier passed to the live session in generate_audio_response_from_file.
+MODEL = "gemini-2.5-flash-native-audio-preview-09-2025"
+# Live-session configuration: request audio as the response modality.
 config = {
+    "response_modalities": ["AUDIO"]
 }
+def load_and_convert_audio(file_path):
+    """Read an audio file and return it as 16 kHz mono 16-bit PCM bytes.
+
+    Args:
+        file_path: Path of the audio file, passed straight to ``sf.read``.
+
+    Returns:
+        The converted samples as raw ``int16`` bytes (``ndarray.tobytes()``).
+    """
+    # Load the samples together with the file's native sample rate.
+    samples, sample_rate = sf.read(file_path)
+    # Down-mix multi-channel audio to mono by averaging the channels.
+    if samples.ndim > 1:
+        samples = samples.mean(axis=1)
+    # Resample to 16 kHz only when needed; resampy is imported lazily so it
+    # is required only for inputs that are not already at 16 kHz.
+    if sample_rate != 16000:
+        import resampy
+        samples = resampy.resample(samples, sample_rate, 16000)
+    # Clip before scaling: resampling can overshoot [-1.0, 1.0], and a scaled
+    # value outside the int16 range would produce corrupted samples when cast.
+    pcm16 = (samples.clip(-1.0, 1.0) * 32767).astype("int16")
+    return pcm16.tobytes()
+async def generate_audio_response_from_file(file_path: str):
+    """Send an audio file to the live model and return its spoken reply.
+
+    The file is converted with ``load_and_convert_audio``, sent as a single
+    completed user turn, and the audio data received from the session is
+    collected and decoded.
+
+    Args:
+        file_path: Path of the audio file to send.
+
+    Returns:
+        A ``(sr, y)`` tuple as returned by ``sf.read`` on the collected reply.
+
+    Raises:
+        ValueError: If the session yields no audio data.
+    """
+    audio_bytes = load_and_convert_audio(file_path)
+    async with client.aio.live.connect(model=MODEL, config=config) as session:
+        # Send the whole recording as one user turn and mark the turn complete.
+        await session.send_client_content(
+            turns={
+                "role": "user",
+                "parts": [
+                    {
+                        "inline_data": {
+                            "data": audio_bytes,
+                            "mime_type": "audio/pcm"
+                        }
+                    }
+                ]
+            },
+            turn_complete=True
+        )
+        # Collect every response that carries data; responses without data are skipped.
+        audio_chunks = []
+        async for response in session.receive():
+            if response.data is not None:
+                audio_chunks.append(response.data)
         full_audio = b''.join(audio_chunks)
         if not full_audio:
             raise ValueError("No audio response received from the model.")
+        # Decode the collected bytes as headerless mono 16-bit PCM at 24 kHz into float32.
         buf = io.BytesIO(full_audio)
+        y, sr = sf.read(
+            buf,
+            channels=1,
+            samplerate=24000,
+            format="RAW",
+            subtype="PCM_16",
+            dtype="float32"
+        )
         return sr, y
+def process_audio(file):
+    """Click handler: turn an uploaded audio file into the model's audio reply.
+
+    Args:
+        file: Value of the audio input, or ``None`` when nothing was provided.
+
+    Returns:
+        A ``(audio, status)`` pair: ``audio`` is ``(sr, audio_data)`` on
+        success and ``None`` otherwise; ``status`` is a message string.
+    """
+    if file is None:
+        return None, "Please upload an audio file."
     try:
+        # Run the async request to completion from this synchronous handler.
+        sr, audio_data = asyncio.run(
+            generate_audio_response_from_file(file)
+        )
+        return (sr, audio_data), "Response generated successfully!"
+    # Any failure is reported through the status message instead of being raised.
     except Exception as e:
         return None, f"Error: {str(e)}"
+# Build the UI: one audio input, one audio output, a status box and a button.
 with gr.Blocks() as demo:
+    gr.Markdown("# Gemini Audio → Audio")
+    # type="filepath": process_audio forwards this value as a file path.
+    input_audio = gr.Audio(
+        label="Upload audio",
+        type="filepath"
+    )
+    # type="numpy" matches the (sr, audio_data) tuple process_audio returns.
+    output_audio = gr.Audio(
+        label="Gemini spoken response",
+        type="numpy",
+        autoplay=True
+    )
+    status = gr.Textbox(label="Status")
+    btn = gr.Button("Send Audio")
+    # NOTE(review): no inputs= argument is visible in this call. The diff's
+    # hunk headers leave one new-file line unrendered at this point, so the
+    # argument may simply be elided - confirm input_audio is wired in.
     btn.click(
         fn=process_audio,
         outputs=[output_audio, status]
     )
+demo.launch()