# Gsgsgsg / app.py
# Athspi's picture
# Update app.py
# e4ca1d6 verified
import os
import time
import uuid
import wave

import google.generativeai as genai
import gradio as gr
from google.generativeai.types import GenerationConfig
# --- API key configuration ---
# The key is read from the process environment. On Hugging Face Spaces, add a
# secret named "GOOGLE_API_KEY" in the Space settings; its value is exposed to
# the app as an environment variable. The result is None when it is not set.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
# --- Helper Functions ---
def create_unique_wav_file(pcm_data, channels=1, rate=24000, sample_width=2):
    """Save raw PCM audio to a uniquely named WAV file and return its path.

    Args:
        pcm_data: Bytes-like PCM frames, written verbatim after the WAV header.
        channels: Channel count recorded in the WAV header.
        rate: Frame rate in Hz recorded in the WAV header.
        sample_width: Bytes per sample recorded in the WAV header.

    Returns:
        Path of the newly written file inside the ``audio_outputs`` directory.

    Raises:
        gr.Error: If the WAV file cannot be written.
        OSError: If the output directory cannot be created.
    """
    output_dir = "audio_outputs"
    os.makedirs(output_dir, exist_ok=True)

    # A second-resolution timestamp alone is not unique: two requests within
    # the same second would share a path and overwrite each other. Keep the
    # timestamp for readability and add a random UUID to guarantee uniqueness.
    timestamp = int(time.time())
    unique_suffix = uuid.uuid4().hex
    file_name = os.path.join(
        output_dir, f"speech_output_{timestamp}_{unique_suffix}.wav"
    )

    try:
        with wave.open(file_name, "wb") as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(pcm_data)
    except Exception as e:
        print(f"Error saving wave file: {e}")
        # Chain the cause so the original traceback is kept in the logs.
        raise gr.Error(f"Could not save audio file. Error: {e}") from e
    return file_name
# --- Core API Logic (Corrected API Call Structure) ---
def _first_audio_payload(response):
    """Return the first non-empty inline audio payload in *response*, or None."""
    if not response.candidates:
        return None
    for part in response.candidates[0].content.parts:
        inline_data = getattr(part, "inline_data", None)
        payload = getattr(inline_data, "data", None)
        if payload:
            return payload
    return None


def synthesize_speech(text, voice):
    """Synthesize speech for *text* with the given prebuilt *voice*.

    Sends the text to the ``gemini-2.5-flash-preview-tts`` model, pulls the
    inline audio bytes out of the response and stores them as a WAV file.

    Args:
        text: Text to be spoken; must contain non-whitespace characters.
        voice: Name of the prebuilt voice to request.

    Returns:
        Path of the generated WAV file.

    Raises:
        gr.Error: If the API key is missing, an input is empty, the response
            carries no audio data, or the request / file write fails.
    """
    # 1. Validate inputs (API key, text and voice) before any network call.
    if not GOOGLE_API_KEY:
        raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not voice:
        raise gr.Error("Please select a voice.")

    try:
        # 2. Configure the client with the API key.
        genai.configure(api_key=GOOGLE_API_KEY)

        # 3. Instantiate the TTS model.
        model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-tts")

        # 4. Build the generation config carrying the voice selection.
        # NOTE(review): confirm that the installed google-generativeai version
        # accepts `speech_config` here and `response_modalities` below; if it
        # does not, these calls raise TypeError and surface through the
        # generic handler at the bottom of this function.
        tts_generation_config = GenerationConfig(
            speech_config={
                "voice_config": {
                    "prebuilt_voice_config": {
                        "voice_name": voice
                    }
                }
            }
        )

        # 5. Request audio output for the prompt.
        prompt = f"Say cheerfully: {text}"
        response = model.generate_content(
            contents=prompt,
            generation_config=tts_generation_config,
            response_modalities=["AUDIO"]
        )

        # 6. Extract the audio bytes; an empty payload counts as "no audio"
        #    so that an empty WAV file is never written.
        audio_data = _first_audio_payload(response)
        if not audio_data:
            raise gr.Error("The API did not return audio data. Please check your text or try again.")
        return create_unique_wav_file(audio_data)
    except gr.Error:
        # Already a user-facing message (missing audio, file write failure);
        # re-raise untouched instead of burying it in the generic
        # network / API-key hint below.
        raise
    except Exception as e:
        # Provide a more informative error message in the UI.
        print(f"An error occurred: {e}")
        raise gr.Error(f"Failed to synthesize speech. Please check your network connection and that your API key is valid. Error: {e}") from e
# --- Gradio User Interface ---
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    # Prebuilt voice names offered in the dropdown.
    voice_options = [
        "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda", "Orus", "Aoede",
        "Callirrhoe", "Autonoe", "Enceladus", "Iapetus", "Umbriel", "Algieba",
        "Despina", "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
        "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird", "Zubenelgenubi",
        "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
    ]

    # Page header and short usage instructions.
    gr.Markdown(
        "\n"
        "# ✨ Gemini Text-to-Speech Synthesizer\n"
        "This app uses a Google AI API key stored securely in Hugging Face secrets.\n"
        "Just enter the text, choose a voice, and generate speech!\n"
    )

    # Input widgets: the text to speak and the voice to speak it with.
    text_input = gr.Textbox(
        lines=4,
        label="Text to Synthesize",
        placeholder="Hello! Welcome to the text-to-speech demonstration.",
    )
    voice_dropdown = gr.Dropdown(
        choices=voice_options,
        value="Kore",
        label="Choose a Voice",
    )

    # Trigger button and the player that receives the generated file path.
    submit_btn = gr.Button("Generate Speech", variant="primary")
    audio_output = gr.Audio(label="Generated Audio", type="filepath")

    # Clicking the button runs the synthesis and feeds the result to the player.
    submit_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_dropdown],
        outputs=audio_output,
    )

    # Ready-made text / voice pairs that fill in the inputs when selected.
    gr.Examples(
        label="Example Prompts & Voices",
        inputs=[text_input, voice_dropdown],
        examples=[
            ["The weather is wonderful today, perfect for a walk in the park.", "Puck"],
            ["This is a demonstration of high-quality speech synthesis.", "Charon"],
            ["By the pricking of my thumbs, something wicked this way comes.", "Enceladus"],
        ],
    )
# --- Main execution block ---
if __name__ == "__main__":
    # Start the Gradio app only when this file is executed as a script,
    # not when it is imported as a module.
    iface.launch()