Spaces:
Sleeping
Sleeping
File size: 5,393 Bytes
40b0211 f32a0e7 40b0211 36aa57e 40b0211 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
import base64
import mimetypes
import os
import re
import struct
import tempfile
from datetime import datetime
from flask import Flask, render_template, request, jsonify, send_file
from google import genai
from google.genai import types
import io
# Flask application instance; the @app.route decorators below register
# their handlers on it.
app = Flask(__name__)
# Store the latest generated audio in memory
# (one module-level slot: written by generate_audio(), read by the /generate
# and /download handlers; stays None until audio has been generated).
latest_audio = None
def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
    """Prefix raw audio bytes with a 44-byte RIFF/WAVE header.

    The sample width and sample rate are taken from ``mime_type`` via
    ``parse_audio_mime_type``; the channel count is fixed at one and the
    format tag written to the header is 1.

    Args:
        audio_data: Raw sample bytes to wrap.
        mime_type: MIME type string describing ``audio_data``.

    Returns:
        The WAV header immediately followed by ``audio_data``.
    """
    fmt = parse_audio_mime_type(mime_type)
    sample_bits = fmt["bits_per_sample"]
    frames_per_second = fmt["rate"]
    channel_count = 1
    payload_len = len(audio_data)

    # Bytes occupied by one sample frame across all channels.
    frame_bytes = channel_count * (sample_bits // 8)

    # "<" means little-endian with no padding, so packing the three header
    # sections separately yields the same bytes as one combined pack.
    riff_section = struct.pack("<4sI4s", b"RIFF", 36 + payload_len, b"WAVE")
    fmt_section = struct.pack(
        "<4sIHHIIHH",
        b"fmt ",
        16,
        1,
        channel_count,
        frames_per_second,
        frames_per_second * frame_bytes,
        frame_bytes,
        sample_bits,
    )
    data_section = struct.pack("<4sI", b"data", payload_len)

    return riff_section + fmt_section + data_section + audio_data
def parse_audio_mime_type(mime_type: str):
    """Parse bits per sample and sample rate from an audio MIME type string.

    The string is split on ``;``. A segment of the form ``audio/L<bits>``
    sets the sample width and a ``rate=<hz>`` segment sets the sample rate.
    Both are matched case-insensitively. Segments that are missing or whose
    numeric part cannot be parsed leave the corresponding default in place.

    Args:
        mime_type: MIME type string such as ``"audio/L16;rate=24000"``.
            ``None`` or an empty string yields the defaults.

    Returns:
        A dict with integer ``"bits_per_sample"`` (default 16) and
        ``"rate"`` (default 24000) entries.
    """
    bits_per_sample = 16
    rate = 24000
    # A missing MIME type falls back to the defaults instead of raising.
    if not mime_type:
        return {"bits_per_sample": bits_per_sample, "rate": rate}

    subtype_prefix = "audio/l"
    for param in mime_type.split(";"):
        param = param.strip()
        lowered = param.lower()
        if lowered.startswith("rate="):
            try:
                rate = int(param.split("=", 1)[1])
            except ValueError:
                # Unparseable rate: keep the default.
                pass
        elif lowered.startswith(subtype_prefix):
            try:
                bits_per_sample = int(param[len(subtype_prefix):])
            except ValueError:
                # e.g. "audio/L" with no or non-numeric width: keep default.
                pass
    return {"bits_per_sample": bits_per_sample, "rate": rate}
def generate_audio(text, voice="Zephyr", accent_type="hindi"):
    """Generate audio from text using Gemini TTS.

    Builds a prompt from an accent instruction plus ``text``, streams the
    response from the TTS model, concatenates every inline audio payload
    found in the stream, wraps the result with ``convert_to_wav`` and stores
    it in the module-level ``latest_audio``.

    Args:
        text: The text to be spoken.
        voice: Name passed as the prebuilt voice for the speech config.
        accent_type: Key selecting the accent instruction; unknown keys fall
            back to the ``"hindi"`` instruction.

    Returns:
        True if audio was received and stored in ``latest_audio``,
        False if the stream contained no audio data.
    """
    global latest_audio
    client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )
    model = "gemini-2.5-flash-preview-tts"
    # Different accent prompts
    accent_prompts = {
        "hindi": "Speak with a clear Indian Hindi accent, with low intonation and expressiveness. Do not say it aloud like a story. Be conversational like a customer care agent.",
        "neutral": "Speak in a clear, neutral accent:",
        "british": "Speak with a British English accent:",
        "american": "Speak with an American English accent:"
    }
    prompt_text = f"{accent_prompts.get(accent_type, accent_prompts['hindi'])}\n\n{text}"
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=prompt_text),
            ],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        temperature=0.5,
        seed=42,
        response_modalities=["audio"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name=voice
                )
            )
        ),
    )
    audio_chunks = []
    mime_type = None
    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        # Skip chunks that carry no candidate content (also guards against
        # empty lists, which would make the [0] lookups raise IndexError).
        if not chunk.candidates:
            continue
        content = chunk.candidates[0].content
        if content is None or not content.parts:
            continue
        # Collect audio from every chunk rather than stopping at the first
        # one, so later pieces of the stream are not discarded.
        # NOTE(review): assumes successive payloads are consecutive raw
        # sample data in the same format — confirm against the API docs.
        for part in content.parts:
            inline_data = part.inline_data
            if inline_data and inline_data.data:
                audio_chunks.append(inline_data.data)
                if mime_type is None:
                    mime_type = inline_data.mime_type
    if not audio_chunks:
        return False
    # Convert to WAV format; an absent MIME type is passed as "" so the
    # parser falls back to its defaults.
    latest_audio = convert_to_wav(b"".join(audio_chunks), mime_type or "")
    return True
@app.route('/')
def index():
    """Serve the application's landing page from the 'index.html' template."""
    page = render_template('index.html')
    return page
@app.route('/generate', methods=['POST'])
def generate():
    """Handle POST /generate: synthesize speech for the submitted text.

    Expects a JSON object with a required ``text`` field and optional
    ``voice`` (default ``'Zephyr'``) and ``accent`` (default ``'hindi'``)
    fields.

    Returns:
        200 with ``{'success': True, 'audio': <base64 WAV>}`` on success;
        400 with an ``error`` message when the body is not a JSON object or
        ``text`` is empty; 500 with an ``error`` message when generation
        fails or raises.
    """
    # silent=True returns None for a missing or malformed JSON body instead
    # of raising, so bad requests are reported as 400 rather than 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    text = data.get('text', '')
    voice = data.get('voice', 'Zephyr')
    accent = data.get('accent', 'hindi')
    if not text:
        return jsonify({'error': 'Text is required'}), 400

    try:
        # Generate audio
        success = generate_audio(text, voice, accent)
    except Exception as e:
        # Top-level boundary: record the traceback, then report the failure.
        app.logger.exception('Audio generation failed')
        return jsonify({'error': str(e)}), 500

    if success and latest_audio:
        # Convert to base64 for sending to frontend
        audio_base64 = base64.b64encode(latest_audio).decode('utf-8')
        return jsonify({
            'success': True,
            'audio': audio_base64
        })
    return jsonify({'error': 'Failed to generate audio'}), 500
@app.route('/download')
def download():
    """Handle GET /download: send the most recently generated WAV as a file.

    Returns:
        The contents of ``latest_audio`` as an ``audio/wav`` attachment whose
        name carries the current local timestamp, or a 404 JSON error when
        no audio has been stored.
    """
    # Guard clause: nothing has been generated yet.
    if not latest_audio:
        return jsonify({'error': 'No audio available'}), 404

    wav_stream = io.BytesIO(latest_audio)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return send_file(
        wav_stream,
        mimetype='audio/wav',
        as_attachment=True,
        download_name=f'generated_audio_{stamp}.wav',
    )
|