# Hugging Face Space: SolarumAsteridion/Gemini-Audio (status: Sleeping; file size: 5,393 bytes)

import base64
import mimetypes
import os
import re
import struct
import tempfile
from datetime import datetime
from flask import Flask, render_template, request, jsonify, send_file
from google import genai
from google.genai import types
import io

# Flask application object; the @app.route decorators in this module register
# their handlers on it.
app = Flask(__name__)

# Store the latest generated audio in memory.
# Holds the WAV bytes most recently produced by generate_audio(), or None until
# a generation has succeeded. It is a single module-level value, so every
# request served by this process reads and overwrites the same clip.
latest_audio = None

def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
    """Wrap raw PCM samples in a mono WAV (RIFF) container.

    The sample width and sample rate are taken from ``mime_type`` via
    ``parse_audio_mime_type``; the channel count is fixed at one.

    Args:
        audio_data: Raw PCM sample bytes.
        mime_type: Audio MIME type describing the samples.

    Returns:
        The 44-byte WAV header followed by ``audio_data``.
    """
    fmt = parse_audio_mime_type(mime_type)
    sample_bits = fmt["bits_per_sample"]
    hz = fmt["rate"]
    channels = 1
    payload_len = len(audio_data)
    frame_bytes = channels * (sample_bits // 8)

    # "<" means little-endian with no padding, so packing the three pieces
    # separately yields exactly the same bytes as one combined format string.
    riff_chunk = struct.pack("<4sI4s", b"RIFF", 36 + payload_len, b"WAVE")
    fmt_chunk = struct.pack(
        "<4sIHHIIHH",
        b"fmt ",
        16,                 # size of the fmt chunk body
        1,                  # audio format 1 = uncompressed PCM
        channels,
        hz,
        hz * frame_bytes,   # byte rate
        frame_bytes,        # block align
        sample_bits,
    )
    data_header = struct.pack("<4sI", b"data", payload_len)

    return riff_chunk + fmt_chunk + data_header + audio_data

def parse_audio_mime_type(mime_type: str) -> dict:
    """Extract the sample width and sample rate from an audio MIME type.

    Recognises the ``audio/L<bits>`` media type and a ``rate=<hz>`` parameter,
    e.g. ``"audio/L16;rate=24000"``. Matching is case-insensitive, and a
    parameter value may be wrapped in double quotes.

    Args:
        mime_type: MIME type string to inspect. ``None`` or an empty string is
            accepted and yields the defaults.

    Returns:
        A dict with integer ``"bits_per_sample"`` and ``"rate"`` entries.
        Anything missing, unparsable or non-positive falls back to the
        defaults of 16 bits and 24000 Hz.
    """
    bits_per_sample = 16
    rate = 24000

    if not mime_type:
        return {"bits_per_sample": bits_per_sample, "rate": rate}

    bits_prefix = "audio/l"
    for param in mime_type.split(";"):
        param = param.strip()
        lowered = param.lower()

        if lowered.startswith("rate="):
            raw_value = param.split("=", 1)[1].strip().strip('"')
            try:
                parsed = int(raw_value)
            except ValueError:
                continue
            # A zero or negative rate cannot be written into a WAV header.
            if parsed > 0:
                rate = parsed
        elif lowered.startswith(bits_prefix):
            try:
                parsed = int(param[len(bits_prefix):])
            except ValueError:
                continue
            if parsed > 0:
                bits_per_sample = parsed

    return {"bits_per_sample": bits_per_sample, "rate": rate}

def generate_audio(text, voice="Zephyr", accent_type="hindi"):
    """Generate speech for ``text`` with Gemini TTS and cache it as WAV.

    The accent instruction selected by ``accent_type`` is prepended to the
    text, the request is streamed, and the audio payloads of every streamed
    chunk are concatenated before being wrapped in a WAV container. The result
    is stored in the module-level ``latest_audio``.

    Args:
        text: The text to speak.
        voice: Name of the prebuilt voice to request.
        accent_type: Key into the accent prompts ("hindi", "neutral",
            "british", "american"); unknown keys fall back to "hindi".

    Returns:
        True if audio was received and ``latest_audio`` was updated, False if
        the stream contained no audio. Errors raised by the client propagate
        to the caller.
    """
    global latest_audio

    # The API key is read from the environment on every call.
    client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )

    model = "gemini-2.5-flash-preview-tts"

    # Different accent prompts
    accent_prompts = {
        "hindi": "Speak with a clear Indian Hindi accent, with low intonation and expressiveness. Do not say it aloud like a story. Be conversational like a customer care agent.",
        "neutral": "Speak in a clear, neutral accent:",
        "british": "Speak with a British English accent:",
        "american": "Speak with an American English accent:"
    }

    prompt_text = f"{accent_prompts.get(accent_type, accent_prompts['hindi'])}\n\n{text}"

    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=prompt_text),
            ],
        ),
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=0.5,
        seed=42,
        response_modalities=["audio"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name=voice
                )
            )
        ),
    )

    audio_chunks = []
    mime_type = None

    # Consume the whole stream: stopping at the first audio chunk would drop
    # the rest of the clip whenever the audio arrives in several chunks.
    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        candidates = chunk.candidates
        if not candidates:  # None or an empty list
            continue
        content = candidates[0].content
        if content is None or not content.parts:
            continue
        for part in content.parts:
            inline_data = part.inline_data
            if inline_data and inline_data.data:
                audio_chunks.append(inline_data.data)
                # The first reported MIME type describes the sample format.
                if mime_type is None:
                    mime_type = inline_data.mime_type

    if not audio_chunks:
        return False

    # Convert to WAV format. An absent MIME type is passed as "" so that the
    # parser's default sample format is used.
    latest_audio = convert_to_wav(b"".join(audio_chunks), mime_type or "")
    return True

@app.route('/')
def index():
    """Serve the application's main page from the ``index.html`` template."""
    page = render_template('index.html')
    return page

@app.route('/generate', methods=['POST'])
def generate():
    """Generate speech for the posted text and return it base64-encoded.

    Expects a JSON object with a required ``text`` string and optional
    ``voice`` and ``accent`` values (defaulting to 'Zephyr' and 'hindi').

    Returns:
        200 with ``{'success': True, 'audio': <base64 WAV>}`` on success;
        400 with an ``error`` message when the body is not a JSON object or
        ``text`` is missing or blank; 500 with an ``error`` message when
        generation fails.
    """
    # silent=True yields None for a missing or malformed JSON body, so client
    # mistakes are reported as 400 rather than surfacing as a 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    text = data.get('text', '')
    if not isinstance(text, str) or not text.strip():
        return jsonify({'error': 'Text is required'}), 400

    # Null or empty values fall back to the defaults as well.
    voice = data.get('voice') or 'Zephyr'
    accent = data.get('accent') or 'hindi'

    try:
        # Generate audio
        success = generate_audio(text, voice, accent)
    except Exception as e:
        # Boundary handler: record the traceback, then report the failure.
        app.logger.exception("Audio generation failed")
        return jsonify({'error': str(e)}), 500

    if success and latest_audio:
        # Convert to base64 for sending to frontend
        audio_base64 = base64.b64encode(latest_audio).decode('utf-8')
        return jsonify({
            'success': True,
            'audio': audio_base64
        })

    return jsonify({'error': 'Failed to generate audio'}), 500

@app.route('/download')
def download():
    """Send the most recently generated audio as a WAV file attachment.

    Responds with 404 and an ``error`` message when no audio has been
    generated yet. The download name embeds the current local timestamp.
    """
    if not latest_audio:
        return jsonify({'error': 'No audio available'}), 404

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    wav_stream = io.BytesIO(latest_audio)
    return send_file(
        wav_stream,
        mimetype='audio/wav',
        as_attachment=True,
        download_name=f'generated_audio_{stamp}.wav',
    )