humanvprojectceo committed on
Commit
c28aa67
·
verified ·
1 Parent(s): c44a1c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -60
app.py CHANGED
@@ -1,95 +1,103 @@
1
  import os
2
  import io
3
  import asyncio
4
- import numpy as np
5
- import librosa
6
  import soundfile as sf
7
  import gradio as gr
8
  from google import genai
9
 
10
# Gemini API client; the API key is read from the GEMINI_API_KEY environment
# variable (os.getenv returns None when it is unset).
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

# Model identifier passed to client.aio.live.connect().
MODEL = "gemini-2.5-flash-native-audio-preview-12-2025"

# Live session configuration: audio is the only requested response modality,
# and the system instruction sets the assistant's tone.
config = {
    "response_modalities": ["AUDIO"],
    "system_instruction": "You are a helpful assistant and answer in a friendly tone.",
}
18
 
19
async def generate_audio_response(audio_bytes: bytes):
    """Send raw PCM audio to a Live session and return the spoken reply.

    Args:
        audio_bytes: Raw 16-bit PCM audio. The caller in this file resamples
            to 16 kHz mono before calling, which is what the declared MIME
            type assumes.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 24000 and
        ``samples`` is the float32 array decoded from the collected reply.

    Raises:
        ValueError: If no audio data was received from the model.
    """
    async with client.aio.live.connect(model=MODEL, config=config) as session:
        await session.send_realtime_input(
            # The rate suffix tells the service how to interpret the raw PCM.
            audio={"data": audio_bytes, "mime_type": "audio/pcm;rate=16000"}
        )

        # Inside a coroutine the running loop is the correct clock source.
        loop = asyncio.get_running_loop()
        audio_chunks = []
        last_receive_time = loop.time()

        while True:
            has_new = False
            async for response in session.receive():
                content = response.server_content
                if not (content and content.model_turn):
                    continue
                for part in content.model_turn.parts or []:
                    # A part may exist without inline data (attribute present
                    # but None), so test the value rather than the attribute.
                    blob = getattr(part, "inline_data", None)
                    if blob is not None and blob.data:
                        audio_chunks.append(blob.data)
                        has_new = True
                        last_receive_time = loop.time()

            # Stop once audio has arrived and a pass produced nothing new
            # for more than 3 seconds since the last chunk.
            if audio_chunks and not has_new and (loop.time() - last_receive_time > 3):
                break

            await asyncio.sleep(0.2)

            # Hard stop when nothing has arrived for more than 30 seconds.
            # NOTE(review): both timers are only evaluated between receive
            # passes, so they cannot interrupt a pass that is still waiting
            # on the server -- confirm this is acceptable.
            if loop.time() - last_receive_time > 30:
                break

        full_audio = b''.join(audio_chunks)
        if not full_audio:
            raise ValueError("No audio response received from the model.")

        # The reply is headerless PCM, so the layout must be given explicitly.
        buf = io.BytesIO(full_audio)
        y, sr = sf.read(
            buf,
            channels=1,
            samplerate=24000,
            format="RAW",
            subtype="PCM_16",
            dtype="float32",
        )
        return sr, y
54
 
55
- def process_audio(input_path: str | None):
56
- if input_path is None:
57
- return None, "Please upload a WAV file."
58
 
59
  try:
60
- y, orig_sr = librosa.load(input_path, sr=None, mono=True)
61
- y = librosa.resample(y, orig_sr=orig_sr, target_sr=16000)
62
- y_int = np.int16(y * 32767)
63
- audio_bytes = y_int.tobytes()
64
-
65
- sr, response_audio = asyncio.run(generate_audio_response(audio_bytes))
66
-
67
- return (sr, response_audio), "Response generated successfully!"
68
  except Exception as e:
69
  return None, f"Error: {str(e)}"
70
 
71
  with gr.Blocks() as demo:
72
- gr.Markdown("# Gemini Live Audio-to-Audio Demo")
73
- gr.Markdown("Upload a WAV file (spoken query). Gemini will respond with spoken audio.")
74
-
75
- with gr.Row():
76
- input_audio = gr.Audio(
77
- label="Upload your query (WAV file)",
78
- type="filepath",
79
- sources=["upload"],
80
- format="wav"
81
- )
82
 
83
- with gr.Row():
84
- output_audio = gr.Audio(
85
- label="Gemini spoken response",
86
- type="numpy",
87
- autoplay=True
88
- )
89
 
90
- status = gr.Textbox(label="Status")
 
 
 
 
91
 
92
- btn = gr.Button("Generate Response")
 
93
 
94
  btn.click(
95
  fn=process_audio,
@@ -97,6 +105,4 @@ with gr.Blocks() as demo:
97
  outputs=[output_audio, status]
98
  )
99
 
100
- gr.Markdown("Example test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav")
101
-
102
- demo.launch()
 
1
  import os
2
  import io
3
  import asyncio
 
 
4
  import soundfile as sf
5
  import gradio as gr
6
  from google import genai
7
 
8
# Gemini API client; the API key is read from the GEMINI_API_KEY environment
# variable (os.getenv returns None when it is unset).
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

# Model identifier passed to client.aio.live.connect().
MODEL = "gemini-2.5-flash-native-audio-preview-09-2025"

# Live session configuration: audio is the only requested response modality.
config = {
    "response_modalities": ["AUDIO"]
}
15
 
16
def load_and_convert_audio(file_path):
    """Read an audio file and return it as 16 kHz mono 16-bit PCM bytes.

    The file is read with ``soundfile``. Multi-channel audio is averaged
    across channels, and audio at any other rate is resampled to 16 kHz with
    ``resampy``. Samples are then scaled by 32767 and packed as little-endian
    int16.

    Args:
        file_path: Path (or file-like object) accepted by ``soundfile.read``.

    Returns:
        The raw PCM bytes, with no header.

    Raises:
        ValueError: If the file contains no samples.
    """
    # Assumes float samples in [-1, 1], soundfile's default output -- TODO confirm.
    samples, sample_rate = sf.read(file_path)

    if samples.size == 0:
        raise ValueError("Audio file contains no samples.")

    # Downmix to mono by averaging the channel axis.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Resample to 16 kHz only when needed.
    if sample_rate != 16000:
        import resampy  # imported lazily: only required for this branch
        samples = resampy.resample(samples, sample_rate, 16000)

    # Convert to PCM16. Clip first: resampling can overshoot [-1, 1], and an
    # unclipped cast to int16 would wrap around. "<i2" pins little-endian
    # byte order regardless of platform.
    pcm16 = (samples.clip(-1.0, 1.0) * 32767).astype("<i2")
    return pcm16.tobytes()
33
 
34
async def generate_audio_response_from_file(file_path: str):
    """Send an audio file to a Live session and return the spoken reply.

    The file is converted with ``load_and_convert_audio`` and sent as one
    completed user turn. Every message from ``session.receive()`` that carries
    data is collected, and the joined bytes are decoded as headerless mono
    16-bit PCM at 24 kHz.

    Args:
        file_path: Path of the audio file to send.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 24000 and
        ``samples`` is a float32 array.

    Raises:
        ValueError: If no audio data was received from the model.
    """
    audio_bytes = load_and_convert_audio(file_path)

    async with client.aio.live.connect(model=MODEL, config=config) as session:
        await session.send_client_content(
            turns={
                "role": "user",
                "parts": [
                    {
                        "inline_data": {
                            "data": audio_bytes,
                            # Raw PCM has no header, so the rate produced by
                            # load_and_convert_audio is declared here.
                            "mime_type": "audio/pcm;rate=16000",
                        }
                    }
                ],
            },
            turn_complete=True,
        )

        audio_chunks = []
        async for response in session.receive():
            chunk = response.data
            if chunk is not None:
                audio_chunks.append(chunk)

        full_audio = b''.join(audio_chunks)
        if not full_audio:
            raise ValueError("No audio response received from the model.")

        # The reply is headerless PCM, so the layout must be given explicitly.
        buf = io.BytesIO(full_audio)
        y, sr = sf.read(
            buf,
            channels=1,
            samplerate=24000,
            format="RAW",
            subtype="PCM_16",
            dtype="float32",
        )
        return sr, y
72
 
73
def process_audio(file):
    """Run the uploaded audio through the model and report the outcome.

    Args:
        file: Path of the uploaded audio file, or ``None`` when nothing was
            uploaded.

    Returns:
        A ``(audio, status)`` pair. ``audio`` is ``(sample_rate, samples)`` on
        success and ``None`` otherwise; ``status`` is a user-facing message.
        Any exception is reported through ``status`` instead of propagating.
    """
    if file is None:
        return None, "Please upload an audio file."

    try:
        sample_rate, samples = asyncio.run(generate_audio_response_from_file(file))
    except Exception as exc:
        # UI boundary: surface the failure as a status message.
        return None, f"Error: {exc!s}"

    return (sample_rate, samples), "Response generated successfully!"
84
 
85
  with gr.Blocks() as demo:
86
+ gr.Markdown("# Gemini Audio → Audio")
 
 
 
 
 
 
 
 
 
87
 
88
+ input_audio = gr.Audio(
89
+ label="Upload audio",
90
+ type="filepath"
91
+ )
 
 
92
 
93
+ output_audio = gr.Audio(
94
+ label="Gemini spoken response",
95
+ type="numpy",
96
+ autoplay=True
97
+ )
98
 
99
+ status = gr.Textbox(label="Status")
100
+ btn = gr.Button("Send Audio")
101
 
102
  btn.click(
103
  fn=process_audio,
 
105
  outputs=[output_audio, status]
106
  )
107
 
108
+ demo.launch()