Spaces:

Athspi
/

Gsgsgsg

Running

App Files Files Community

Gsgsgsg / app.py

Athspi

Update app.py

e4ca1d6 verified 3 days ago

raw

history blame contribute delete

5.26 kB

	import gradio as gr
	import google.generativeai as genai
	from google.generativeai.types import GenerationConfig
	import time
	import os
	import wave

	# --- API key, read from the environment ---
	# On Hugging Face Spaces, open the Space's settings and add a secret named
	# "GOOGLE_API_KEY" whose value is your Google AI API key. The lookup below
	# yields None when no such environment variable is set.
	GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

	# --- Helper Functions ---
	def create_unique_wav_file(pcm_data, channels=1, rate=24000, sample_width=2):
	    """Save raw PCM audio to a uniquely named WAV file and return its path.

	    Args:
	        pcm_data: Raw PCM frames (bytes-like) to write.
	        channels: Number of audio channels written to the WAV header.
	        rate: Sample rate in Hz written to the WAV header.
	        sample_width: Bytes per sample written to the WAV header.

	    Returns:
	        Path of the new file inside the ``audio_outputs`` directory.

	    Raises:
	        gr.Error: If the WAV file cannot be written.
	    """
	    output_dir = "audio_outputs"
	    os.makedirs(output_dir, exist_ok=True)

	    # A whole-second timestamp alone is not unique: two requests handled in
	    # the same second would share a path and overwrite each other's audio.
	    # A random suffix keeps the readable timestamp while avoiding collisions.
	    timestamp = int(time.time())
	    unique_suffix = os.urandom(8).hex()
	    file_name = os.path.join(
	        output_dir, f"speech_output_{timestamp}_{unique_suffix}.wav"
	    )

	    try:
	        with wave.open(file_name, "wb") as wf:
	            wf.setnchannels(channels)
	            wf.setsampwidth(sample_width)
	            wf.setframerate(rate)
	            wf.writeframes(pcm_data)
	    except Exception as e:
	        print(f"Error saving wave file: {e}")
	        # Chain the cause so the original traceback is kept in the logs.
	        raise gr.Error(f"Could not save audio file. Error: {e}") from e
	    return file_name

	# --- Core API Logic (Corrected API Call Structure) ---
	def synthesize_speech(text, voice):
	    """Synthesize speech for *text* with the Gemini TTS model.

	    Args:
	        text: Text to speak; must contain at least one non-whitespace character.
	        voice: Name of the prebuilt voice to use (for example "Kore").

	    Returns:
	        Path of the WAV file produced by ``create_unique_wav_file``.

	    Raises:
	        gr.Error: If the API key, text or voice is missing, if the response
	            contains no audio, if the audio file cannot be saved, or if the
	            API call fails for any other reason.
	    """
	    # 1. Validate inputs (API key, text and voice) before touching the API.
	    if not GOOGLE_API_KEY:
	        raise gr.Error("Google API Key not found. Please ensure you have set the GOOGLE_API_KEY secret in your Hugging Face Space settings.")
	    if not text or not text.strip():
	        raise gr.Error("Please enter some text to synthesize.")
	    if not voice:
	        raise gr.Error("Please select a voice.")

	    try:
	        # 2. Configure the client with the API key (done on every request).
	        genai.configure(api_key=GOOGLE_API_KEY)

	        # 3. Instantiate the TTS model.
	        model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-tts")

	        # 4. Build the generation config carrying only the speech settings.
	        tts_generation_config = GenerationConfig(
	            speech_config={
	                "voice_config": {
	                    "prebuilt_voice_config": {
	                        "voice_name": voice
	                    }
	                }
	            }
	        )

	        # 5. Request audio output for the prompt.
	        # NOTE(review): confirm that the installed google-generativeai
	        # version accepts `speech_config` on GenerationConfig and
	        # `response_modalities` on generate_content; if it does not, the
	        # resulting exception is reported to the UI by the handler below.
	        prompt = f"Say cheerfully: {text}"
	        response = model.generate_content(
	            contents=prompt,
	            generation_config=tts_generation_config,
	            response_modalities=["AUDIO"]
	        )

	        # 6. Take the first part that actually carries inline audio bytes,
	        #    rather than assuming it is always parts[0] and non-empty.
	        candidates = response.candidates
	        parts = candidates[0].content.parts if candidates else []
	        audio_data = next(
	            (part.inline_data.data for part in parts if part.inline_data.data),
	            None,
	        )
	        if not audio_data:
	            raise gr.Error("The API did not return audio data. Please check your text or try again.")

	        return create_unique_wav_file(audio_data)

	    except gr.Error:
	        # Already a user-facing message (missing audio, file write failure);
	        # re-raise as-is instead of wrapping it in the generic text below.
	        raise
	    except Exception as e:
	        # Provide a more informative error message in the UI.
	        print(f"An error occurred: {e}")
	        raise gr.Error(f"Failed to synthesize speech. Please check your network connection and that your API key is valid. Error: {e}") from e

	# --- Gradio User Interface ---
	with gr.Blocks(theme=gr.themes.Soft()) as iface:
	    # Page header; the Markdown text is reproduced verbatim.
	    gr.Markdown(
	        value="""
	# ✨ Gemini Text-to-Speech Synthesizer
	This app uses a Google AI API key stored securely in Hugging Face secrets.
	Just enter the text, choose a voice, and generate speech!
	"""
	    )

	    # Voice names offered in the dropdown below.
	    voice_options = [
	        "Zephyr", "Puck", "Charon", "Kore", "Fenrir",
	        "Leda", "Orus", "Aoede", "Callirrhoe", "Autonoe",
	        "Enceladus", "Iapetus", "Umbriel", "Algieba", "Despina",
	        "Erinome", "Algenib", "Rasalgethi", "Laomedeia", "Achernar",
	        "Alnilam", "Schedar", "Gacrux", "Pulcherrima", "Achird",
	        "Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
	    ]

	    # Inputs: free text plus the voice selection (defaults to "Kore").
	    text_input = gr.Textbox(
	        lines=4,
	        label="Text to Synthesize",
	        placeholder="Hello! Welcome to the text-to-speech demonstration.",
	    )

	    voice_dropdown = gr.Dropdown(
	        choices=voice_options,
	        value="Kore",
	        label="Choose a Voice",
	    )

	    submit_btn = gr.Button(value="Generate Speech", variant="primary")

	    # Output: the audio component receives the file path returned by
	    # synthesize_speech.
	    audio_output = gr.Audio(type="filepath", label="Generated Audio")

	    # Wire the button to the synthesis function.
	    submit_btn.click(
	        synthesize_speech,
	        inputs=[text_input, voice_dropdown],
	        outputs=audio_output,
	    )

	    # Sample text/voice pairs that fill in both input components.
	    gr.Examples(
	        label="Example Prompts & Voices",
	        inputs=[text_input, voice_dropdown],
	        examples=[
	            ["The weather is wonderful today, perfect for a walk in the park.", "Puck"],
	            ["This is a demonstration of high-quality speech synthesis.", "Charon"],
	            ["By the pricking of my thumbs, something wicked this way comes.", "Enceladus"],
	        ],
	    )

	# --- Main execution block ---
	if __name__ == "__main__":
	    # Start the Gradio app only when this file is executed directly,
	    # not when it is imported as a module.
	    iface.launch()