Spaces:
Running
Running
import base64 | |
import mimetypes | |
import os | |
import re | |
import struct | |
import tempfile | |
from datetime import datetime | |
from flask import Flask, render_template, request, jsonify, send_file | |
from google import genai | |
from google.genai import types | |
import io | |
# Flask application object for this module.
app = Flask(__name__)

# Most recent WAV payload produced by generate_audio(); held in memory only
# and left as None until a generation succeeds.
latest_audio = None
def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
    """Wrap raw audio bytes in a RIFF/WAVE container.

    The bit depth and sample rate come from ``parse_audio_mime_type``; the
    stream is always described as single-channel.

    Args:
        audio_data: Raw sample bytes to place in the ``data`` chunk.
        mime_type: Audio MIME type string describing ``audio_data``.

    Returns:
        A 44-byte WAV header followed by ``audio_data`` unchanged.
    """
    fmt = parse_audio_mime_type(mime_type)
    sample_bits = fmt["bits_per_sample"]
    hz = fmt["rate"]
    channels = 1
    payload_len = len(audio_data)
    # Size in bytes of one sample frame across all channels.
    frame_bytes = channels * (sample_bits // 8)

    # "<" selects little-endian with no padding, so packing the three pieces
    # separately yields the same bytes as a single combined pack call.
    riff_chunk = struct.pack(
        "<4sI4s",
        b"RIFF",
        36 + payload_len,  # bytes remaining in the file after this field
        b"WAVE",
    )
    fmt_chunk = struct.pack(
        "<4sIHHIIHH",
        b"fmt ",
        16,  # size of the fmt chunk body
        1,  # audio format tag (1 = PCM)
        channels,
        hz,
        hz * frame_bytes,  # byte rate
        frame_bytes,  # block align
        sample_bits,
    )
    data_header = struct.pack("<4sI", b"data", payload_len)
    return riff_chunk + fmt_chunk + data_header + audio_data
def parse_audio_mime_type(mime_type: str) -> dict:
    """Extract the sample width and sample rate from an audio MIME type.

    The string is split on ``;``. A segment of the form ``rate=<int>`` sets
    the sample rate and a segment of the form ``audio/L<int>`` sets the bit
    depth. Both prefixes are matched case-insensitively. A segment whose
    value is missing, not an integer, or not positive leaves the
    corresponding default in place.

    Args:
        mime_type: MIME type string such as ``"audio/L16;rate=24000"``.
            ``None`` or an empty string yields the defaults.

    Returns:
        A dict with integer ``"bits_per_sample"`` (default 16) and
        ``"rate"`` (default 24000) entries.
    """
    parsed = {"bits_per_sample": 16, "rate": 24000}
    if not mime_type:
        return parsed

    # Lower-cased prefix -> key in the result dict.
    prefixes = (("rate=", "rate"), ("audio/l", "bits_per_sample"))
    for segment in mime_type.split(";"):
        # Lower-casing does not affect the digits that follow the prefix.
        segment = segment.strip().lower()
        for prefix, key in prefixes:
            if not segment.startswith(prefix):
                continue
            try:
                value = int(segment[len(prefix):])
            except ValueError:
                # Malformed number: keep the default for this field.
                break
            # Zero or negative values cannot describe real audio and would
            # break the WAV header fields computed from them.
            if value > 0:
                parsed[key] = value
            break
    return parsed
def generate_audio(text, voice="Zephyr", accent_type="hindi"):
    """Generate speech for ``text`` with Gemini TTS and cache it as WAV.

    The text is prefixed with an accent instruction and sent to the
    ``gemini-2.5-flash-preview-tts`` model through the streaming API. Audio
    payloads from every streamed chunk are concatenated, wrapped in a WAV
    container by ``convert_to_wav`` and stored in the module-level
    ``latest_audio``.

    Args:
        text: The text to be spoken.
        voice: Name of the prebuilt voice passed to the speech config.
        accent_type: Key into the accent prompt table (``"hindi"``,
            ``"neutral"``, ``"british"`` or ``"american"``); unknown keys
            fall back to the ``"hindi"`` prompt.

    Returns:
        ``True`` when audio was received and ``latest_audio`` was updated,
        ``False`` when the stream contained no audio data.
    """
    global latest_audio

    # API key is read from the GEMINI_API_KEY environment variable.
    client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )
    model = "gemini-2.5-flash-preview-tts"

    # Different accent prompts
    accent_prompts = {
        "hindi": "Speak with a clear Indian Hindi accent, with low intonation and expressiveness. Do not say it aloud like a story. Be conversational like a customer care agent.",
        "neutral": "Speak in a clear, neutral accent:",
        "british": "Speak with a British English accent:",
        "american": "Speak with an American English accent:"
    }
    prompt_text = f"{accent_prompts.get(accent_type, accent_prompts['hindi'])}\n\n{text}"

    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=prompt_text),
            ],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        temperature=0.5,
        seed=42,
        response_modalities=["audio"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name=voice
                )
            )
        ),
    )

    audio_chunks = []
    mime_type = None
    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        # Truthiness checks cover both None and empty lists, so indexing
        # below cannot raise IndexError.
        if not chunk.candidates:
            continue
        content = chunk.candidates[0].content
        if content is None or not content.parts:
            continue
        for part in content.parts:
            inline_data = part.inline_data
            if not (inline_data and inline_data.data):
                continue
            # Keep every audio payload instead of stopping at the first one,
            # so audio delivered across several chunks is not truncated.
            # NOTE(review): assumes all chunks hold raw samples in the same
            # format as the first one -- confirm against the API docs.
            audio_chunks.append(inline_data.data)
            if mime_type is None:
                mime_type = inline_data.mime_type

    if not audio_chunks:
        return False

    # Convert to WAV format; an absent MIME type is passed as "" so the
    # converter falls back to its default sample format.
    latest_audio = convert_to_wav(b"".join(audio_chunks), mime_type or "")
    return True
def index():
    """Render and return the ``index.html`` template."""
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file -- confirm it is registered with the Flask app.
    page = render_template('index.html')
    return page
def generate():
    """Handle a text-to-speech request and return the audio as base64 JSON.

    Expects a JSON object body with a required ``text`` string and optional
    ``voice`` (default ``'Zephyr'``) and ``accent`` (default ``'hindi'``)
    entries.

    Returns:
        * ``{'success': True, 'audio': <base64 WAV>}`` on success.
        * ``({'error': ...}, 400)`` when the body is not a JSON object or
          ``text`` is missing, blank or not a string.
        * ``({'error': ...}, 500)`` when audio generation fails or raises.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file -- confirm it is registered with the Flask app.

    # silent=True yields None for a wrong content type or malformed JSON
    # instead of raising, so client mistakes are reported as 400, not 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    text = data.get('text', '')
    # "or" also replaces explicit null / empty values with the defaults.
    voice = data.get('voice') or 'Zephyr'
    accent = data.get('accent') or 'hindi'

    if not isinstance(text, str) or not text.strip():
        return jsonify({'error': 'Text is required'}), 400

    try:
        # Generate audio
        success = generate_audio(text, voice, accent)
    except Exception as e:
        # Request boundary: log the traceback and report the failure as JSON.
        app.logger.exception("Audio generation failed")
        return jsonify({'error': str(e)}), 500

    # NOTE(review): latest_audio is a module-level global written by
    # generate_audio(); concurrent requests could overwrite each other's
    # result between the call above and the read below.
    if success and latest_audio:
        # Convert to base64 for sending to frontend
        audio_base64 = base64.b64encode(latest_audio).decode('utf-8')
        return jsonify({
            'success': True,
            'audio': audio_base64
        })
    return jsonify({'error': 'Failed to generate audio'}), 500
def download():
    """Send the most recently generated audio as a WAV attachment.

    Returns:
        A file response built from ``latest_audio`` with a timestamped
        ``generated_audio_<YYYYmmdd_HHMMSS>.wav`` download name, or a JSON
        error with status 404 when no audio has been stored.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file -- confirm it is registered with the Flask app.
    if not latest_audio:
        return jsonify({'error': 'No audio available'}), 404

    # Wrap the in-memory bytes in a file-like object for send_file.
    wav_stream = io.BytesIO(latest_audio)
    filename = f'generated_audio_{datetime.now().strftime("%Y%m%d_%H%M%S")}.wav'
    return send_file(
        wav_stream,
        mimetype='audio/wav',
        as_attachment=True,
        download_name=filename,
    )