| """PersonaFlow - Interactive Audio Character Demo for Hugging Face Spaces."""
|
| import logging
|
| import os
|
| from pathlib import Path
|
|
|
| import gradio as gr
|
| import numpy as np
|
|
|
|
|
# Root logging configuration shared by every module logger in this process.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# SPACE_ID is injected by the Hugging Face Spaces runtime; it is absent
# when the app runs locally, so this doubles as an environment detector.
IS_SPACES = "SPACE_ID" in os.environ
|
|
|
|
|
# The `spaces` package (ZeroGPU decorator support) is only installed on
# Hugging Face Spaces, so import it solely in that environment.
if IS_SPACES:
    import spaces
|
|
|
|
|
| from config.characters import get_character, get_all_characters, DEFAULT_CHARACTER_ID
|
|
|
|
|
# Lazily-created singleton AudioPipeline; populated by get_pipeline().
_pipeline = None
|
|
|
|
|
def get_pipeline():
    """Return the process-wide AudioPipeline, constructing it on first use."""
    global _pipeline
    if _pipeline is None:
        # Deferred import: the pipeline pulls in heavy model dependencies,
        # so only pay that cost when audio is actually processed.
        from src.pipeline import AudioPipeline

        # On Spaces a GPU is available; local runs fall back to CPU.
        _pipeline = AudioPipeline(device="cuda" if IS_SPACES else "cpu")
    return _pipeline
|
|
|
|
|
def _process_audio_impl(audio_tuple, character_id, conversation_history):
    """Run one utterance through the STT -> LLM -> TTS pipeline.

    Args:
        audio_tuple: (sample_rate, samples) tuple from the Gradio mic,
            or None when nothing was recorded.
        character_id: id of the selected persona; unknown ids fall back
            to the default character.
        conversation_history: list of {"role", "content"} message dicts.

    Returns:
        Tuple of (audio_out, user_text, response_text, status_or_timing).
        On failure the error text is placed in the response slot.
    """
    if audio_tuple is None:
        return None, "", "", "No audio recorded"

    _sr, samples = audio_tuple
    if len(samples) == 0:
        return None, "", "", "No audio detected"

    # Unknown character ids fall back to the default persona.
    character = get_character(character_id)
    if character is None:
        character = get_character(DEFAULT_CHARACTER_ID)

    logger.info(f"Processing audio for character: {character.name}")

    try:
        pipeline = get_pipeline()
        audio_out, user_text, response_text, timings = pipeline.process(
            audio_tuple=audio_tuple,
            system_prompt=character.system_prompt,
            voice=character.voice,
            conversation_history=conversation_history,
        )
        # Per-stage latencies are reported in seconds; show milliseconds.
        timing_str = f"STT: {timings['stt']*1000:.0f}ms | LLM: {timings['llm']*1000:.0f}ms | TTS: {timings['tts']*1000:.0f}ms | Total: {timings['total']*1000:.0f}ms"
        return audio_out, user_text, response_text, timing_str
    except Exception as exc:
        # UI boundary: surface the error to the user instead of crashing.
        logger.error(f"Error processing audio: {exc}", exc_info=True)
        return None, "", f"Error: {str(exc)}", ""
|
|
|
|
|
|
|
def _process_audio_dispatch(audio_tuple, character_id, conversation_history):
    """Forward to the shared audio-processing implementation."""
    return _process_audio_impl(audio_tuple, character_id, conversation_history)


if IS_SPACES:
    # ZeroGPU: each call gets a GPU attached for at most 30 seconds.
    process_audio_gpu = spaces.GPU(duration=30)(_process_audio_dispatch)
else:
    # Local run: no decorator, plain CPU execution.
    process_audio_gpu = _process_audio_dispatch
|
|
|
|
|
def create_portrait_html(character):
    """Create HTML for the animated portrait.

    Renders a colored circle (using character.portrait_color) holding an
    emoji placeholder and a mouth overlay, followed by a status pill.
    The CSS classes (portrait-idle, mouth-closed, status-idle) are
    presumably animated by static/styles.css — verify against that file.
    """
    # NOTE(review): these literals appear to be mojibake of the original
    # emoji characters (encoding damage) — confirm the intended glyphs.
    emoji = 'π' if character.id == 'visionary' else 'π€' if character.id == 'skeptic' else 'π'
    return f"""
    <div class="portrait-container portrait-idle" style="
        width: 200px;
        height: 200px;
        border-radius: 50%;
        background: {character.portrait_color};
        margin: 0 auto;
        display: flex;
        align-items: center;
        justify-content: center;
        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2);
        position: relative;
    ">
        <div class="portrait-placeholder" style="font-size: 80px;">
            {emoji}
        </div>
        <div class="mouth-overlay mouth-closed" style="
            position: absolute;
            bottom: 25%;
            left: 50%;
            transform: translateX(-50%);
            width: 40px;
            height: 8px;
            background: rgba(0, 0, 0, 0.2);
            border-radius: 4px;
        "></div>
    </div>
    <div class="status-indicator status-idle" style="
        display: flex;
        align-items: center;
        justify-content: center;
        gap: 8px;
        padding: 8px 16px;
        border-radius: 20px;
        margin: 15px auto;
        width: fit-content;
        background: #f3f4f6;
    ">
        <div class="status-dot" style="width: 8px; height: 8px; border-radius: 50%; background: #9ca3af;"></div>
        <span class="status-text">Ready to listen</span>
    </div>
    """
|
|
|
|
|
def on_audio_record(audio, character_id, history):
    """Handle a finished microphone recording.

    Args:
        audio: (sample_rate, samples) tuple from the mic, or None.
        character_id: currently selected character id.
        history: list of (user_text, assistant_text) pairs, or None.

    Returns:
        (audio_out, timing string, chatbot pairs, updated state pairs).
    """
    if history is None:
        history = []

    if audio is None:
        return None, "", history, history

    # Flatten the (user, assistant) pairs into role-tagged messages
    # in the order the LLM expects.
    messages = [
        {"role": role, "content": text}
        for user_msg, assistant_msg in history
        for role, text in (("user", user_msg), ("assistant", assistant_msg))
    ]

    audio_out, user_text, response_text, timing = process_audio_gpu(
        audio, character_id, messages
    )

    # Only record the exchange when both sides produced text
    # (errors leave user_text empty).
    updated = list(history)
    if user_text and response_text:
        updated.append((user_text, response_text))

    return audio_out, timing, updated, updated
|
|
|
|
|
def update_character_info(character_id):
    """Refresh the info panel and portrait for a newly selected character.

    Also clears the chatbot display and conversation state, since the
    previous transcript belongs to a different persona.
    """
    char = get_character(character_id)
    if not char:
        return "", "", [], []
    info_md = f"**{char.tagline}**\n\n{char.description}"
    return info_md, create_portrait_html(char), [], []
|
|
|
|
|
def clear_conversation():
    """Reset both the chatbot display and the stored conversation state."""
    empty_chat, empty_state = [], []
    return empty_chat, empty_state
|
|
|
|
|
|
|
# Optional custom stylesheet shipped alongside the app; missing file
# simply means no extra CSS is injected into the Blocks UI.
css_path = Path(__file__).parent / "static" / "styles.css"
custom_css = css_path.read_text() if css_path.exists() else ""
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: three-column layout (character picker | portrait + audio I/O |
# transcript), plus event wiring at the bottom.
# NOTE(review): several UI strings below contain what looks like emoji
# mojibake (encoding damage) — confirm the intended glyphs; left untouched
# here because they are runtime strings.
# ---------------------------------------------------------------------------
with gr.Blocks(
    title="PersonaFlow",
    theme=gr.themes.Soft(),
    css=custom_css,
) as demo:

    # Lets Spaces visitors sign in (e.g. to use their own Pro quota).
    gr.LoginButton(value="Sign in to use your Pro Quota")

    # Per-session list of (user_text, assistant_text) pairs.
    conversation_state = gr.State([])

    gr.Markdown("""
    # π PersonaFlow
    ### Speak with AI characters that have distinct personalities and voices

    Select a character, then click the microphone to start talking!
    """)

    with gr.Row():

        # --- Left column: character selection ---
        with gr.Column(scale=1):
            gr.Markdown("### Choose Your Character")

            character_dropdown = gr.Dropdown(
                choices=[(c.name, c.id) for c in get_all_characters()],
                value=DEFAULT_CHARACTER_ID,
                label="Character",
                interactive=True,
            )

            # Seed the info panel with the default character's blurb.
            default_char = get_character(DEFAULT_CHARACTER_ID)
            character_info = gr.Markdown(
                f"**{default_char.tagline}**\n\n{default_char.description}"
            )

        # --- Middle column: portrait, mic input, synthesized reply ---
        with gr.Column(scale=2):

            portrait_html = gr.HTML(
                value=create_portrait_html(get_character(DEFAULT_CHARACTER_ID)),
            )

            # Microphone input; recordings are capped at 10 seconds.
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="π€ Click to speak",
                max_length=10,
            )

            # Character's synthesized reply; plays automatically.
            audio_output = gr.Audio(
                label="Character Response",
                type="numpy",
                autoplay=True,
            )

            # Read-only display of per-stage pipeline latencies.
            timing_display = gr.Textbox(
                label="Processing Time",
                interactive=False,
            )

        # --- Right column: conversation transcript ---
        with gr.Column(scale=1):
            gr.Markdown("### Conversation")

            chatbot = gr.Chatbot(
                label="Chat History",
                height=400,
            )

            clear_btn = gr.Button("ποΈ Clear Conversation", variant="secondary")

    # Switching character refreshes info/portrait and wipes the transcript.
    character_dropdown.change(
        fn=update_character_info,
        inputs=[character_dropdown],
        outputs=[character_info, portrait_html, chatbot, conversation_state],
    )

    # Fires when the user stops recording: runs the full STT->LLM->TTS pass.
    audio_input.stop_recording(
        fn=on_audio_record,
        inputs=[audio_input, character_dropdown, conversation_state],
        outputs=[audio_output, timing_display, chatbot, conversation_state],
    )

    clear_btn.click(
        fn=clear_conversation,
        outputs=[chatbot, conversation_state],
    )

if __name__ == "__main__":
    # show_api=False hides the auto-generated API docs page.
    demo.launch(show_api=False)
|
|
|