|
import gradio as gr |
|
import google.generativeai as genai |
|
from google.generativeai.types import GenerationConfig |
|
import time |
|
import os |
|
import wave |
|
|
|
|
|
|
|
|
|
# Gemini API key, looked up once at import time; None when the variable is unset.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
|
|
|
|
def create_unique_wav_file(pcm_data, channels=1, rate=24000, sample_width=2):
    """Save raw PCM audio to a uniquely named WAV file and return its path.

    The file is written to the ``audio_outputs`` directory, which is created
    on demand relative to the current working directory. The file name
    combines the current Unix timestamp with a random suffix, so two calls
    made within the same second do not overwrite each other's output.

    Args:
        pcm_data: Raw PCM frames as a bytes-like object.
        channels: Number of channels written to the WAV header.
        rate: Frame rate in Hz written to the WAV header.
        sample_width: Bytes per sample written to the WAV header.

    Returns:
        Path of the WAV file that was written.

    Raises:
        OSError: If the output directory cannot be created.
        gr.Error: If the WAV file cannot be written.
    """
    import uuid  # local import so the block does not rely on a new file-level import

    output_dir = "audio_outputs"
    os.makedirs(output_dir, exist_ok=True)

    # A second-resolution timestamp alone collides for concurrent requests;
    # the random suffix keeps every file name distinct.
    timestamp = int(time.time())
    unique_suffix = uuid.uuid4().hex
    file_name = os.path.join(
        output_dir, f"speech_output_{timestamp}_{unique_suffix}.wav"
    )

    try:
        with wave.open(file_name, "wb") as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(rate)
            wf.writeframes(pcm_data)
    except Exception as e:
        # Convert any failure into a message the Gradio UI can display.
        print(f"Error saving wave file: {e}")
        raise gr.Error(f"Could not save audio file. Error: {e}") from e

    return file_name
|
|
|
|
|
def synthesize_speech(text, voice):
    """Synthesize speech from text using the Gemini API's TTS model.

    The text is sent with the fixed prefix ``"Say cheerfully: "`` and the
    audio bytes of the first returned part are saved via
    ``create_unique_wav_file``.

    Args:
        text: Text to speak; must contain at least one non-whitespace
            character.
        voice: Name of the prebuilt voice to request.

    Returns:
        Path of the WAV file holding the generated audio.

    Raises:
        gr.Error: If the API key, text or voice is missing, if the response
            carries no audio, or if the request or the file write fails.
    """
    if not GOOGLE_API_KEY:
        raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not voice:
        raise gr.Error("Please select a voice.")

    no_audio_message = "The API did not return audio data. Please check your text or try again."

    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-tts")

        tts_generation_config = GenerationConfig(
            speech_config={
                "voice_config": {
                    "prebuilt_voice_config": {
                        "voice_name": voice
                    }
                }
            }
        )

        # NOTE(review): confirm that the installed google.generativeai version
        # accepts `speech_config` in GenerationConfig and `response_modalities`
        # in generate_content; a TypeError raised here would surface through
        # the generic handler below.
        prompt = f"Say cheerfully: {text}"
        response = model.generate_content(
            contents=prompt,
            generation_config=tts_generation_config,
            response_modalities=["AUDIO"]
        )

        if not (response.candidates and response.candidates[0].content.parts):
            raise gr.Error(no_audio_message)

        audio_data = response.candidates[0].content.parts[0].inline_data.data
        if not audio_data:
            # An empty payload would otherwise be saved as a zero-frame WAV.
            raise gr.Error(no_audio_message)

        return create_unique_wav_file(audio_data)

    except gr.Error:
        # Already a user-facing message (missing audio, failed file write);
        # re-raise untouched so it is not re-wrapped as a network/API-key error.
        raise
    except Exception as e:
        print(f"An error occurred: {e}")
        raise gr.Error(f"Failed to synthesize speech. Please check your network connection and that your API key is valid. Error: {e}") from e
|
|
|
|
|
# Build the single-page UI. Components appear in the order they are created.
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    # Page header and short usage note.
    gr.Markdown(
        """
        # ✨ Gemini Text-to-Speech Synthesizer
        This app uses a Google AI API key stored securely in Hugging Face secrets.
        Just enter the text, choose a voice, and generate speech!
        """
    )

    # Voice names offered in the dropdown.
    voice_options = [
        "Zephyr", "Puck", "Charon", "Kore", "Fenrir",
        "Leda", "Orus", "Aoede", "Callirrhoe", "Autonoe",
        "Enceladus", "Iapetus", "Umbriel", "Algieba", "Despina",
        "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
        "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird",
        "Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
    ]

    # Inputs: the text to speak and the voice to speak it with.
    text_input = gr.Textbox(
        lines=4,
        label="Text to Synthesize",
        placeholder="Hello! Welcome to the text-to-speech demonstration.",
    )
    voice_dropdown = gr.Dropdown(
        choices=voice_options,
        value="Kore",
        label="Choose a Voice",
    )

    # Trigger button and the player that receives the generated file path.
    submit_btn = gr.Button("Generate Speech", variant="primary")
    audio_output = gr.Audio(type="filepath", label="Generated Audio")

    # Clicking the button runs the synthesis and feeds the result to the player.
    submit_btn.click(
        synthesize_speech,
        [text_input, voice_dropdown],
        audio_output,
    )

    # Ready-made text/voice pairs that fill the two inputs when selected.
    gr.Examples(
        label="Example Prompts & Voices",
        inputs=[text_input, voice_dropdown],
        examples=[
            ["The weather is wonderful today, perfect for a walk in the park.", "Puck"],
            ["This is a demonstration of high-quality speech synthesis.", "Charon"],
            ["By the pricking of my thumbs, something wicked this way comes.", "Enceladus"],
        ],
    )


# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()
|
|