# Gemini-Audio / app.py
# SolarumAsteridion's picture
# Update app.py
# 36aa57e verified
import base64
import mimetypes
import os
import re
import struct
import tempfile
from datetime import datetime
from flask import Flask, render_template, request, jsonify, send_file
from google import genai
from google.genai import types
import io
# Flask application object; the @app.route handlers in this file register on it.
app = Flask(__name__)
# Most recently generated audio as complete WAV bytes, held in process memory.
# Written by generate_audio() and read by the /generate and /download routes;
# stays None until a generation succeeds.
latest_audio = None
def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
    """Wrap raw PCM samples in a RIFF/WAVE container.

    The sample width and sample rate are taken from
    ``parse_audio_mime_type(mime_type)``; the channel count is fixed at 1
    and the format tag is 1 (PCM).

    Args:
        audio_data: Raw sample bytes to place in the ``data`` chunk.
        mime_type: Audio MIME type string describing ``audio_data``.

    Returns:
        A 44-byte WAV header followed by ``audio_data`` unchanged.
    """
    audio_format = parse_audio_mime_type(mime_type)
    sample_bits = audio_format["bits_per_sample"]
    frames_per_second = audio_format["rate"]
    channels = 1
    payload_len = len(audio_data)

    # One frame holds one sample for every channel.
    frame_bytes = channels * (sample_bits // 8)

    # "<" means little-endian with no padding, so packing the three
    # sections separately yields the same bytes as one combined pack.
    riff_section = struct.pack(
        "<4sI4s",
        b"RIFF",
        36 + payload_len,  # bytes following this field: rest of header + data
        b"WAVE",
    )
    fmt_section = struct.pack(
        "<4sIHHIIHH",
        b"fmt ",
        16,  # size of the fmt chunk body
        1,  # format tag: PCM
        channels,
        frames_per_second,
        frames_per_second * frame_bytes,  # byte rate
        frame_bytes,  # block align
        sample_bits,
    )
    data_section = struct.pack("<4sI", b"data", payload_len)

    return b"".join((riff_section, fmt_section, data_section, audio_data))
def parse_audio_mime_type(mime_type: str) -> dict:
    """Extract the sample width and sample rate from an audio MIME type.

    Understands strings such as ``"audio/L16;codec=pcm;rate=24000"``. The
    ``audio/L<bits>`` media type and the ``rate`` parameter are matched
    case-insensitively, whitespace around ``=`` and double quotes around
    the value are tolerated, and anything missing, unparsable or
    non-positive falls back to the defaults (16 bits, 24000 Hz).

    Args:
        mime_type: The MIME type string; ``None`` or ``""`` yields the
            defaults.

    Returns:
        A dict with integer ``"bits_per_sample"`` and ``"rate"`` entries.
    """
    bits_per_sample = 16
    rate = 24000

    # A missing MIME type is treated like an uninformative one.
    for param in (mime_type or "").split(";"):
        param = param.strip()
        if param.lower().startswith("audio/l"):
            # Everything after "audio/L" should be the bit depth.
            bits_per_sample = _positive_int_or(
                param[len("audio/l"):], bits_per_sample
            )
            continue
        key, separator, value = param.partition("=")
        if separator and key.strip().lower() == "rate":
            rate = _positive_int_or(value, rate)

    return {"bits_per_sample": bits_per_sample, "rate": rate}


def _positive_int_or(raw: str, fallback: int) -> int:
    """Return *raw* parsed as a positive int, or *fallback* if it is not one."""
    try:
        value = int(raw.strip().strip('"'))
    except ValueError:
        return fallback
    # Zero or negative widths/rates cannot describe real audio.
    return value if value > 0 else fallback
def generate_audio(text, voice="Zephyr", accent_type="hindi"):
    """Synthesize *text* with Gemini TTS and store the result as WAV.

    Builds a prompt from the accent instruction plus *text*, streams the
    model response, concatenates every audio payload found in the stream,
    converts the combined audio with ``convert_to_wav`` and stores it in
    the module-level ``latest_audio``.

    Args:
        text: The text to speak.
        voice: Name of the prebuilt Gemini voice to use.
        accent_type: Key into the accent prompt table; unknown keys fall
            back to the ``"hindi"`` prompt.

    Returns:
        True if audio was received and stored in ``latest_audio``; False
        if the stream contained no audio (``latest_audio`` is then left
        unchanged).
    """
    global latest_audio

    client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )
    model = "gemini-2.5-flash-preview-tts"

    # Different accent prompts
    accent_prompts = {
        "hindi": "Speak with a clear Indian Hindi accent, with low intonation and expressiveness. Do not say it aloud like a story. Be conversational like a customer care agent.",
        "neutral": "Speak in a clear, neutral accent:",
        "british": "Speak with a British English accent:",
        "american": "Speak with an American English accent:"
    }
    prompt_text = f"{accent_prompts.get(accent_type, accent_prompts['hindi'])}\n\n{text}"

    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=prompt_text),
            ],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        temperature=0.5,
        seed=42,
        response_modalities=["audio"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name=voice
                )
            )
        ),
    )

    # The response is streamed, so the audio may arrive split over several
    # chunks. Collect all of them; stopping at the first one would drop
    # everything that follows it.
    audio_chunks = []
    mime_type = None
    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        # `not` covers both None and an empty list, so indexing is safe.
        if not chunk.candidates:
            continue
        content = chunk.candidates[0].content
        if content is None or not content.parts:
            continue
        for part in content.parts:
            inline_data = part.inline_data
            if inline_data and inline_data.data:
                audio_chunks.append(inline_data.data)
                # Assumes every chunk of one response shares a format, so
                # the first reported MIME type describes the whole stream.
                if mime_type is None:
                    mime_type = inline_data.mime_type

    if not audio_chunks:
        return False

    # Pass "" when no MIME type was reported so the converter falls back
    # to its default format instead of failing on None.
    latest_audio = convert_to_wav(b"".join(audio_chunks), mime_type or "")
    return True
@app.route('/')
def index():
    """Serve the application's landing page from the index.html template."""
    page = render_template('index.html')
    return page
@app.route('/generate', methods=['POST'])
def generate():
    """Handle POST /generate: synthesize speech for the submitted text.

    Expects a JSON object with a required ``text`` entry and optional
    ``voice`` (default ``'Zephyr'``) and ``accent`` (default ``'hindi'``)
    entries.

    Returns:
        200 with ``{'success': True, 'audio': <base64 WAV>}`` on success;
        400 with ``{'error': ...}`` when the body is not a JSON object or
        ``text`` is missing or blank;
        500 with ``{'error': ...}`` when generation fails or raises.
    """
    # silent=True returns None for a missing, malformed or non-JSON body
    # instead of raising, so bad input is reported as 400 rather than
    # being caught by the generic handler below and reported as 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    text = data.get('text', '')
    # `or` also replaces an explicit JSON null with the default.
    voice = data.get('voice') or 'Zephyr'
    accent = data.get('accent') or 'hindi'

    # Reject whitespace-only text too; there is nothing to speak.
    if not text or (isinstance(text, str) and not text.strip()):
        return jsonify({'error': 'Text is required'}), 400

    try:
        # Generate audio
        success = generate_audio(text, voice, accent)

        if success and latest_audio:
            # Convert to base64 for sending to frontend
            audio_base64 = base64.b64encode(latest_audio).decode('utf-8')
            return jsonify({
                'success': True,
                'audio': audio_base64
            })
        return jsonify({'error': 'Failed to generate audio'}), 500
    except Exception as e:
        # Top-level boundary: keep the JSON error contract for the client
        # but record the traceback so the failure is not lost.
        app.logger.exception('Audio generation failed')
        return jsonify({'error': str(e)}), 500
@app.route('/download')
def download():
    """Handle GET /download: send the latest generated audio as a WAV file.

    Returns:
        The contents of ``latest_audio`` as an ``audio/wav`` attachment
        named ``generated_audio_<YYYYmmdd_HHMMSS>.wav``, or 404 with
        ``{'error': 'No audio available'}`` when nothing is stored.
    """
    if not latest_audio:
        return jsonify({'error': 'No audio available'}), 404

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    wav_stream = io.BytesIO(latest_audio)
    return send_file(
        wav_stream,
        mimetype='audio/wav',
        as_attachment=True,
        download_name=f'generated_audio_{timestamp}.wav',
    )