| | """ |
| | LAVCO Gradio App for HuggingFace Spaces |
| | |
| | A beautiful web interface for voice conversion using LAVCO (Llasa-VC). |
| | """ |
| |
|
| | import os |
| | import re |
| | import tempfile |
| | import gradio as gr |
| | import torch |
| | import torch.nn as nn |
| | import numpy as np |
| | import soundfile as sf |
| | import librosa |
| | from typing import List, Optional, Dict, Tuple |
| | from transformers import ( |
| | AutoModelForCausalLM, |
| | AutoTokenizer, |
| | WhisperModel, |
| | WhisperFeatureExtractor, |
| | ) |
| |
|
| | |
# Token/frame rates: both XCodec2 codes and Whisper encoder frames advance at
# 50 frames per second of 16 kHz audio, so counts convert directly to seconds.
XCODEC2_FRAME_RATE = 50
WHISPER_FRAME_RATE = 50


# Checkpoint repo (overridable via the MODEL_ID env var) and compute device.
MODEL_ID = os.getenv("MODEL_ID", "AdoCleanCode/LAVCO-v3")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# Bundled example clips shown in the UI's gr.Examples widget.
EXAMPLES_DIR = "examples"
DEFAULT_SOURCE_PATH = os.path.join(EXAMPLES_DIR, "sample1_source.wav")
DEFAULT_REFERENCE_PATH = os.path.join(EXAMPLES_DIR, "sample1_reference.wav")
| |
|
| | |
def _find_default_audio(path: str, label: str) -> Optional[str]:
    """Resolve a bundled example audio file to an absolute path.

    Args:
        path: Relative path of the example file.
        label: Human-readable name ("source"/"reference") used in log output.

    Returns:
        Absolute path when the file exists, otherwise ``None`` (a warning is
        logged and the UI simply omits the example).
    """
    if os.path.exists(path):
        resolved = os.path.abspath(path)
        print(f"✅ Found default {label} audio: {resolved}", flush=True)
        return resolved
    print(f"⚠️ Default {label} audio not found: {path}", flush=True)
    return None


# Absolute paths of the bundled example clips (None when the files are missing).
DEFAULT_SOURCE_AUDIO = _find_default_audio(DEFAULT_SOURCE_PATH, "source")
DEFAULT_REFERENCE_AUDIO = _find_default_audio(DEFAULT_REFERENCE_PATH, "reference")
| |
|
| | |
# Lazily-initialized globals: populated exactly once by load_model() on first
# use and reused by every subsequent conversion request.
model = None
tokenizer = None
| |
|
| |
|
class SpeechOnlyLogitsProcessor:
    """Mask logits so only XCodec2 speech tokens (``<|s_N|>``) and EOS survive.

    Built once from the tokenizer vocabulary; calling the instance fills every
    disallowed position with ``-inf`` so sampling/argmax can never pick it.
    """

    # Matches exactly one XCodec2 speech token, e.g. "<|s_123|>".
    _SPEECH_TOKEN = re.compile(r"^<\|s_\d+\|>$")

    def __init__(self, tokenizer, eos_id: int):
        allowed = torch.zeros(len(tokenizer), dtype=torch.bool)
        speech_ids = [
            token_id
            for token, token_id in tokenizer.get_vocab().items()
            if self._SPEECH_TOKEN.match(token)
        ]
        for token_id in speech_ids:
            allowed[token_id] = True
        allowed[eos_id] = True  # the custom EOS must remain reachable
        self.allowed = allowed

    def __call__(self, input_ids, scores):
        blocked = ~self.allowed.to(scores.device)
        return scores.masked_fill(blocked, float("-inf"))
| |
|
| |
|
def apply_repetition_penalty(logits: torch.Tensor, generated_ids: List[int], penalty: float = 1.2, window: int = 5):
    """Dampen logits of tokens repeated within the last ``window`` generated ids.

    A token appearing ``k > 1`` times in the recent window is scaled by
    ``penalty ** (k - 1)``: positive logits are divided, non-positive logits
    multiplied — both push its probability down. Mutates ``logits`` in place
    and returns it. No-op when ``penalty == 1.0`` or fewer than two tokens
    have been generated.
    """
    if penalty == 1.0 or len(generated_ids) < 2:
        return logits

    recent = generated_ids[-window:]
    for token_id in set(recent):
        occurrences = recent.count(token_id)
        if occurrences < 2:
            continue
        factor = penalty ** (occurrences - 1)
        value = logits[0, token_id]
        if value > 0:
            logits[0, token_id] = value / factor
        else:
            logits[0, token_id] = value * factor

    return logits
| |
|
| |
|
def sample_with_temperature_and_top_p(logits: torch.Tensor, temperature: float = 1.0, top_p: float = 0.9):
    """Draw one token id using temperature scaling plus nucleus (top-p) sampling.

    Temperature rescales the logits before softmax; top-p then keeps the
    smallest set of highest-probability tokens whose cumulative mass exceeds
    ``top_p`` (always at least one), renormalizes, and samples from it.
    Returns a plain Python int.
    """
    scaled = logits if temperature == 1.0 else logits / temperature
    probs = torch.softmax(scaled, dim=-1)

    if top_p < 1.0:
        ranked_probs, ranked_ids = probs.sort(dim=-1, descending=True)
        mass = ranked_probs.cumsum(dim=-1)
        drop = mass > top_p
        # Shift right so the token that first crosses the threshold is kept.
        drop[..., 1:] = drop[..., :-1].clone()
        drop[..., 0] = False
        drop_unsorted = drop.scatter(1, ranked_ids, drop)
        probs = probs.masked_fill(drop_unsorted, 0.0)
        probs = probs / probs.sum(dim=-1, keepdim=True)

    return torch.multinomial(probs, num_samples=1).item()
| |
|
| |
|
def greedy_generate_with_embeds(
    model,
    inputs_embeds: torch.Tensor,
    embed_layer,
    logits_processor,
    max_new_tokens: int,
    eos_token_id: int,
    pad_token_id: int = 0,
    verbose: bool = False,
    tokenizer=None,
    temperature: float = 1.0,
    repetition_penalty: float = 1.2,
    top_p: float = 0.9,
    repetition_window: int = 5,
) -> List[int]:
    """Autoregressive generation from precomputed ``inputs_embeds`` with a KV cache.

    The first model call prefills the cache from the full prompt embeddings;
    every later call feeds only the newest token's embedding. Each step applies
    the logits processor (with the running token-id history), then the
    repetition penalty, then either argmax (when temperature == 1.0 and
    top_p == 1.0) or temperature/top-p sampling. Stops at ``eos_token_id`` or
    after ``max_new_tokens`` tokens; returns the generated ids (EOS included).
    ``pad_token_id``, ``verbose`` and ``tokenizer`` are accepted for interface
    compatibility but unused here.
    """
    device = inputs_embeds.device
    use_greedy = temperature == 1.0 and top_p == 1.0

    generated: List[int] = []
    # Running token-id view handed to the processor (prompt positions are zeros,
    # since only embeddings — not ids — exist for the prompt).
    seen_ids = torch.zeros(1, inputs_embeds.shape[1], dtype=torch.long, device=device)
    step_embeds = inputs_embeds
    cache = None

    for _ in range(max_new_tokens):
        with torch.no_grad():
            out = model(
                inputs_embeds=step_embeds,
                past_key_values=cache,
                use_cache=True,
                return_dict=True,
            )
        cache = out.past_key_values

        step_logits = logits_processor(seen_ids, out.logits[:, -1, :])
        step_logits = apply_repetition_penalty(
            step_logits, generated, repetition_penalty, repetition_window
        )

        if use_greedy:
            token = torch.argmax(step_logits, dim=-1).item()
        else:
            token = sample_with_temperature_and_top_p(step_logits, temperature, top_p)

        generated.append(token)
        if token == eos_token_id:
            break

        token_tensor = torch.tensor([[token]], device=device)
        seen_ids = torch.cat([seen_ids, token_tensor], dim=1)
        step_embeds = embed_layer(token_tensor)

    return generated
| |
|
| |
|
class LAVCOModel(nn.Module):
    """LAVCO voice-conversion model.

    Bundles four components loaded from a local directory or a HuggingFace repo:
      - ``llasa``: a causal LM (bfloat16) that generates XCodec2 speech tokens,
      - ``whisper``: the encoder half of a Whisper model (content features),
      - ``xcodec``: the XCodec2 neural codec (waveform <-> discrete codes),
      - ``projection``: linear map from Whisper features to LLASA embedding space.
    Whisper and XCodec2 are frozen; this class is inference-only as used here.
    """

    def __init__(self, load_dir_or_repo: str, device: str = "cuda", cache_dir: str = None):
        # NOTE(review): the `device` argument is currently unused — callers move
        # the module to the target device afterwards via `.to(...)`.
        super().__init__()
        import json
        from huggingface_hub import hf_hub_download, snapshot_download
        from xcodec2.modeling_xcodec2 import XCodec2Model

        # A local checkpoint directory takes precedence over a HF repo id.
        is_local = os.path.isdir(load_dir_or_repo)

        if is_local:
            config_path = os.path.join(load_dir_or_repo, "llasa_vc_config.json")
            proj_path = os.path.join(load_dir_or_repo, "projection.pt")
            llasa_path = os.path.join(load_dir_or_repo, "llasa")
        else:
            print(f"π₯ Downloading from HuggingFace: {load_dir_or_repo}")
            config_path = hf_hub_download(
                repo_id=load_dir_or_repo,
                filename="llasa_vc_config.json",
                cache_dir=cache_dir,
            )
            proj_path = hf_hub_download(
                repo_id=load_dir_or_repo,
                filename="projection.pt",
                cache_dir=cache_dir,
            )
            # Only the llasa/ subtree is needed from the repo snapshot.
            llasa_path = snapshot_download(
                repo_id=load_dir_or_repo,
                allow_patterns=["llasa/*"],
                cache_dir=cache_dir,
            )
            llasa_path = os.path.join(llasa_path, "llasa")

        with open(config_path, "r") as f:
            config = json.load(f)

        import sys
        print(f"π₯ Loading LLASA from {llasa_path}...", flush=True)
        sys.stdout.flush()
        self.llasa = AutoModelForCausalLM.from_pretrained(
            llasa_path,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        )
        self.hidden_size = self.llasa.config.hidden_size
        print(f" ✅ LLASA loaded (hidden_size={self.hidden_size})", flush=True)
        sys.stdout.flush()

        print(f"π₯ Loading Whisper encoder from {config['whisper_model']}...", flush=True)
        sys.stdout.flush()
        # Keep only the encoder; the decoder half is never used.
        whisper_full = WhisperModel.from_pretrained(config["whisper_model"])
        self.whisper = whisper_full.encoder
        self.whisper_dim = self.whisper.config.d_model
        del whisper_full
        print(f" ✅ Whisper loaded (dim={self.whisper_dim})", flush=True)
        sys.stdout.flush()

        print(f"π₯ Loading XCodec2 from {config['xcodec_model']}...", flush=True)
        sys.stdout.flush()
        self.xcodec = XCodec2Model.from_pretrained(config["xcodec_model"])
        self.xcodec.eval()
        print(f" ✅ XCodec2 loaded", flush=True)
        sys.stdout.flush()

        print(f"π₯ Loading Whisper processor...", flush=True)
        sys.stdout.flush()
        self.whisper_processor = WhisperFeatureExtractor.from_pretrained(config["whisper_model"])
        print(f" ✅ Whisper processor loaded", flush=True)
        sys.stdout.flush()

        print(f"π₯ Loading projection layer...", flush=True)
        sys.stdout.flush()
        # weights_only=False: the checkpoint stores a plain state dict saved by
        # this project; only load checkpoints you trust.
        proj_state = torch.load(proj_path, map_location="cpu", weights_only=False)
        self.projection = nn.Linear(self.whisper_dim, self.hidden_size)
        self.projection.load_state_dict(proj_state)
        print(f" ✅ Projection layer loaded", flush=True)
        sys.stdout.flush()

        # Special-token ids from the config; set_special_token_ids() may
        # overwrite these from the tokenizer afterwards.
        self.u_start_id = config.get("u_start_id")
        self.u_end_id = config.get("u_end_id")
        self.g_start_id = config["g_start_id"]
        self.g_end_id = config["g_end_id"]
        self.pad_id = config["pad_id"]

        # Freeze the feature extractor and codec — only LLASA generates.
        for param in self.whisper.parameters():
            param.requires_grad = False
        self.whisper.eval()

        for param in self.xcodec.parameters():
            param.requires_grad = False
        self.xcodec.eval()

    def set_special_token_ids(self, tokenizer):
        """Set special token IDs and instruction text embeddings.

        Resolves the speech-understanding/generation marker ids from the
        tokenizer and pre-tokenizes the fixed instruction prompt fragments
        used by generate().
        """
        self.tokenizer = tokenizer
        self.u_start_id = tokenizer.convert_tokens_to_ids("<|SPEECH_UNDERSTANDING_START|>")
        self.u_end_id = tokenizer.convert_tokens_to_ids("<|SPEECH_UNDERSTANDING_END|>")
        self.g_start_id = tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_START|>")
        self.g_end_id = tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")
        self.pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

        # Fixed instruction template: "Convert <soft tokens> into speech using
        # this speaker: <reference tokens>".
        prefix_text = "Convert "
        middle_text = " into speech using this speaker: "

        self.prefix_ids = tokenizer(prefix_text, add_special_tokens=False, return_tensors="pt")["input_ids"]
        self.middle_ids = tokenizer(middle_text, add_special_tokens=False, return_tensors="pt")["input_ids"]

    def _tokenizer_ids_to_xcodec_codes(self, tokenizer_ids: torch.Tensor) -> torch.Tensor:
        """Convert LLASA tokenizer IDs back to raw XCodec2 codes (0-65535).

        Each "<|s_N|>" token maps to integer N; any other token (or an
        unparsable suffix) maps to code 0.
        """
        batch_size, seq_len = tokenizer_ids.shape
        xcodec_codes = torch.zeros_like(tokenizer_ids)

        for i in range(batch_size):
            tokens = self.tokenizer.convert_ids_to_tokens(tokenizer_ids[i].tolist())
            for j, tok in enumerate(tokens):
                if tok and tok.startswith("<|s_") and tok.endswith("|>"):
                    try:
                        code = int(tok[4:-2])  # strip "<|s_" and "|>"
                        xcodec_codes[i, j] = code
                    except ValueError:
                        xcodec_codes[i, j] = 0
                else:
                    xcodec_codes[i, j] = 0

        return xcodec_codes

    def generate(
        self,
        wav_or_mel: np.ndarray,
        ref_ids: torch.Tensor,
        ref_length: int,
        max_new_tokens: int = 2000,
        tokenizer=None,
        temperature: float = 1.0,
        repetition_penalty: float = 1.2,
        top_p: float = 0.9,
        repetition_window: int = 5,
        verbose: bool = False,
    ) -> List[int]:
        """Generate voice conversion tokens.

        Args:
            wav_or_mel: Source audio; the code treats it as a 16 kHz mono
                waveform (it is passed to the Whisper feature extractor with
                sampling_rate=16000 and its length is divided by 16000) —
                despite the name, a precomputed mel path is not handled here.
            ref_ids: Reference-speaker token ids, shape (1, >=ref_length).
            ref_length: Number of reference tokens to use from ref_ids.
            tokenizer: When given, constrains decoding to speech tokens via
                SpeechOnlyLogitsProcessor and the custom sampling loop;
                otherwise falls back to plain HF greedy generate().

        Returns:
            Generated token ids (custom loop: new tokens only, EOS included).
        """
        device = ref_ids.device
        model_dtype = next(self.llasa.parameters()).dtype

        # Whisper content features for the source utterance.
        mel = self.whisper_processor(wav_or_mel, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        whisper_out = self.whisper(mel).last_hidden_state

        # Trim padded encoder frames to the real audio duration; 1500 frames
        # corresponds to 30 s at 50 fps (presumably the Whisper encoder cap).
        audio_dur = len(wav_or_mel) / 16000
        num_frames = min(int(audio_dur * WHISPER_FRAME_RATE), 1500)
        soft_tokens = self.projection(whisper_out[:, :num_frames]).to(model_dtype)

        embed_layer = self.llasa.get_input_embeddings()

        # Assemble the prompt purely in embedding space:
        # "Convert " + soft tokens + " into speech using this speaker: "
        # + <U_START> + reference tokens + <U_END> + <G_START>
        prefix_emb = embed_layer(self.prefix_ids.to(device))
        middle_emb = embed_layer(self.middle_ids.to(device))
        u_start_emb = embed_layer(torch.tensor([[self.u_start_id]], device=device))
        u_end_emb = embed_layer(torch.tensor([[self.u_end_id]], device=device))
        g_start_emb = embed_layer(torch.tensor([[self.g_start_id]], device=device))

        ref_embeds = embed_layer(ref_ids[:, :ref_length])

        inputs_embeds = torch.cat([
            prefix_emb,
            soft_tokens,
            middle_emb,
            u_start_emb,
            ref_embeds,
            u_end_emb,
            g_start_emb,
        ], dim=1).to(model_dtype)

        if tokenizer is not None:
            # Constrained decoding: only speech tokens and the G_END EOS.
            logits_processor = SpeechOnlyLogitsProcessor(tokenizer, self.g_end_id)

            generated = greedy_generate_with_embeds(
                model=self.llasa,
                inputs_embeds=inputs_embeds,
                embed_layer=embed_layer,
                logits_processor=logits_processor,
                max_new_tokens=max_new_tokens,
                eos_token_id=self.g_end_id,
                pad_token_id=self.pad_id,
                verbose=verbose,
                tokenizer=tokenizer,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                top_p=top_p,
                repetition_window=repetition_window,
            )
            return generated
        else:
            # Unconstrained fallback: plain HF greedy decoding.
            outputs = self.llasa.generate(
                inputs_embeds=inputs_embeds,
                max_new_tokens=max_new_tokens,
                pad_token_id=self.pad_id,
                eos_token_id=self.g_end_id,
                do_sample=False,
            )
            return outputs[0].tolist()
| |
|
| |
|
def load_model():
    """Load the LAVCO model and tokenizer once; return the cached pair after.

    Uses the module-level ``model``/``tokenizer`` globals as a cache so the
    expensive checkpoint download/load happens only on the first call.
    """
    global model, tokenizer

    if model is None:
        import sys
        import time

        print(f"π₯ Loading model: {MODEL_ID}", flush=True)
        sys.stdout.flush()

        start_time = time.time()
        print(" β Loading LAVCO model components...", flush=True)
        model = LAVCOModel(MODEL_ID, device=DEVICE)
        print(f" β Moving model to {DEVICE}...", flush=True)
        model = model.to(DEVICE)
        model.eval()
        print(f" β Loading tokenizer...", flush=True)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        print(f" β Setting special tokens...", flush=True)
        model.set_special_token_ids(tokenizer)

        elapsed = time.time() - start_time
        print(f"✅ Model loaded in {elapsed:.1f}s", flush=True)
        sys.stdout.flush()

    return model, tokenizer
| |
|
| |
|
def extract_xcodec2_from_generated(tokenizer, token_ids: list) -> list:
    """Extract raw XCodec2 codes from generated tokenizer ids.

    Keeps only ids whose token renders as "<|s_N|>", returning the integer N
    for each in order; every other token (including EOS/specials) is skipped.
    """
    codes = []
    for token_id in token_ids:
        name = tokenizer.convert_ids_to_tokens(token_id)
        if not (name and name.startswith("<|s_") and name.endswith("|>")):
            continue
        try:
            codes.append(int(name[4:-2]))
        except ValueError:
            continue  # malformed numeric part — drop it, as upstream does
    return codes
| |
|
| |
|
def convert_voice(source_audio, reference_audio, temperature, repetition_penalty, top_p, repetition_window, max_tokens, progress=gr.Progress()):
    """Convert source voice to reference voice using LAVCO.

    Pipeline: load model -> resample both clips to 16 kHz -> encode the
    reference to XCodec2 tokens -> round-trip the source through the codec ->
    generate converted speech tokens -> decode to a wav file.

    Returns:
        (output_wav_path, status_message); output path is None on any error.
    """
    if source_audio is None:
        return None, "β Please provide source audio"

    if reference_audio is None:
        return None, "β Please provide reference audio"

    try:
        progress(0.1, desc="Loading model...")
        model, tokenizer = load_model()

        progress(0.2, desc="Loading audio files...")
        # NOTE(review): with type="filepath" Audio components these are str
        # paths; gradio's tuple form is (sample_rate, data), so [1] would be
        # raw samples rather than a path — confirm this branch is reachable.
        if isinstance(source_audio, tuple):
            source_path = source_audio[1]
        else:
            source_path = source_audio

        if isinstance(reference_audio, tuple):
            reference_path = reference_audio[1]
        else:
            reference_path = reference_audio

        # librosa resamples to 16 kHz mono float32 for both clips.
        source_wav = librosa.load(source_path, sr=16000)[0].astype(np.float32)
        reference_wav = librosa.load(reference_path, sr=16000)[0].astype(np.float32)

        progress(0.4, desc="Encoding audio...")
        with torch.no_grad():
            # Encode the reference speaker clip to XCodec2 codes...
            xcodec_device = next(model.xcodec.parameters()).device
            ref_tensor_audio = torch.from_numpy(reference_wav).float().unsqueeze(0).to(xcodec_device)
            ref_codes = model.xcodec.encode_code(input_waveform=ref_tensor_audio)

            if isinstance(ref_codes, torch.Tensor):
                ref_codes_np = ref_codes.cpu().numpy()
            else:
                ref_codes_np = np.array(ref_codes)
            ref_xcodec_ids = ref_codes_np.flatten().astype(int).tolist()

            # ...then render them as "<|s_N|>" text and tokenize for LLASA.
            ref_token_str = "".join([f"<|s_{rid}|>" for rid in ref_xcodec_ids])
            ref_tokenizer_ids = tokenizer(ref_token_str, add_special_tokens=False)["input_ids"]
            ref_ids = torch.tensor(ref_tokenizer_ids, dtype=torch.long, device=DEVICE).unsqueeze(0)
            ref_length = len(ref_tokenizer_ids)

            # Encode the source the same way.
            source_tensor_audio = torch.from_numpy(source_wav).float().unsqueeze(0).to(xcodec_device)
            source_codes = model.xcodec.encode_code(input_waveform=source_tensor_audio)

            if isinstance(source_codes, torch.Tensor):
                source_codes_np = source_codes.cpu().numpy()
            else:
                source_codes_np = np.array(source_codes)
            source_xcodec_ids = source_codes_np.flatten().astype(int).tolist()

            source_token_str = "".join([f"<|s_{rid}|>" for rid in source_xcodec_ids])
            source_tokenizer_ids = tokenizer(source_token_str, add_special_tokens=False)["input_ids"]
            seedvc_ids = torch.tensor(source_tokenizer_ids, dtype=torch.long, device=DEVICE).unsqueeze(0)
            seedvc_length = len(source_tokenizer_ids)

            # Round-trip the source through the codec (tokens -> waveform) so
            # the Whisper input is codec-reconstructed audio — presumably to
            # match the model's training conditions; confirm against training code.
            xcodec_codes = model._tokenizer_ids_to_xcodec_codes(seedvc_ids)
            codes = xcodec_codes.unsqueeze(1).to(xcodec_device)
            wav = model.xcodec.decode_code(codes)
            if len(wav.shape) == 3:
                wav = wav.squeeze(1)
            # Trim to the duration implied by the token count (50 tokens/sec).
            num_samples_audio = int(seedvc_length / XCODEC2_FRAME_RATE * 16000)
            num_samples_audio = min(num_samples_audio, wav.shape[-1])
            source_wav_processed = wav[0, :num_samples_audio].cpu().numpy()

        progress(0.7, desc="Generating voice conversion...")
        # Pass sampling kwargs only if this model.generate signature accepts
        # them (defensive against older model builds).
        import inspect
        gen_sig = inspect.signature(model.generate)
        gen_params = gen_sig.parameters

        gen_kwargs = {
            'max_new_tokens': max_tokens,
            'tokenizer': tokenizer,
            'verbose': False,
        }

        if 'temperature' in gen_params:
            gen_kwargs['temperature'] = temperature
        if 'repetition_penalty' in gen_params:
            gen_kwargs['repetition_penalty'] = repetition_penalty
        if 'top_p' in gen_params:
            gen_kwargs['top_p'] = top_p
        if 'repetition_window' in gen_params:
            gen_kwargs['repetition_window'] = repetition_window

        generated_token_ids = model.generate(
            source_wav_processed,
            ref_ids,
            ref_length,
            **gen_kwargs
        )

        progress(0.9, desc="Decoding audio...")
        gen_xcodec_ids = extract_xcodec2_from_generated(tokenizer, generated_token_ids)

        if not gen_xcodec_ids:
            return None, "β No audio tokens generated!"

        # Decode generated codes back to a waveform; shape (1, 1, T) expected.
        codes = torch.tensor(gen_xcodec_ids, device=xcodec_device).unsqueeze(0).unsqueeze(0)
        output_wav = model.xcodec.decode_code(codes)

        if len(output_wav.shape) == 3:
            output_wav = output_wav[0, 0, :].cpu().numpy()
        elif len(output_wav.shape) == 2:
            output_wav = output_wav[0, :].cpu().numpy()
        else:
            output_wav = output_wav.cpu().numpy()

        # Persist to a temp wav; gradio serves it via the filepath output.
        # delete=False so the file survives past this function for the UI.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            sf.write(tmp_file.name, output_wav, 16000)
            output_path = tmp_file.name

        progress(1.0, desc="Complete!")
        return output_path, f"✅ Generated {len(gen_xcodec_ids)} tokens ({len(gen_xcodec_ids)/XCODEC2_FRAME_RATE:.2f}s)"

    except Exception as e:
        # Surface the full traceback in the status box rather than crashing the UI.
        import traceback
        error_msg = f"β Error: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg
| |
|
| |
|
| | |
# Custom CSS: app-wide font plus the gradient hero header styled below and
# used by the first Markdown block in the UI.
css = """
.gradio-container {
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
}
.main-header {
    text-align: center;
    padding: 2rem 0;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border-radius: 10px;
    margin-bottom: 2rem;
}
"""
| |
|
| | |
# Build the Gradio UI: two input-audio columns + sampling sliders, a convert
# button, and the output audio/status row. Constructed at import time so
# `demo` exists for both local launch and HF Spaces.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    # Hero header (styled by .main-header in the custom CSS above).
    gr.Markdown("""
    <div class="main-header">
    <h1>π€ LAVCO: Voice Conversion</h1>
    <p>Convert speech to match any reference voice using semantic/acoustic interleaving</p>
    </div>
    """)

    with gr.Row():
        # Left column: source (content) and reference (target voice) inputs.
        with gr.Column():
            gr.Markdown("### π₯ Input Audio")
            source_audio = gr.Audio(
                label="Source Audio (content to convert)",
                type="filepath",
                sources=["upload", "microphone"]
            )
            reference_audio = gr.Audio(
                label="Reference Audio (target voice)",
                type="filepath",
                sources=["upload", "microphone"]
            )

            # Only offer the example row when both bundled clips were found.
            if DEFAULT_SOURCE_AUDIO and DEFAULT_REFERENCE_AUDIO:
                gr.Examples(
                    examples=[[DEFAULT_SOURCE_AUDIO, DEFAULT_REFERENCE_AUDIO]],
                    inputs=[source_audio, reference_audio],
                    label="π Example Audio Files (Click to load)",
                )

        # Right column: sampling hyperparameters passed through to generate().
        with gr.Column():
            gr.Markdown("### βοΈ Generation Parameters")
            temperature = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Temperature",
                info="Higher = more diverse, lower = more deterministic"
            )
            repetition_penalty = gr.Slider(
                minimum=1.0,
                maximum=2.0,
                value=1.3,
                step=0.1,
                label="Repetition Penalty",
                info="Penalize repeated tokens (1.0 = off)"
            )
            top_p = gr.Slider(
                minimum=0.5,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-P (Nucleus Sampling)",
                info="Sample from top P probability mass"
            )
            repetition_window = gr.Slider(
                minimum=3,
                maximum=10,
                value=5,
                step=1,
                label="Repetition Window",
                info="Look at last N tokens for repetition"
            )
            max_tokens = gr.Slider(
                minimum=100,
                maximum=2000,
                value=2000,
                step=100,
                label="Max Tokens",
                info="Maximum tokens to generate"
            )

    convert_btn = gr.Button("π― Convert Voice", variant="primary", size="lg")

    with gr.Row():
        output_audio = gr.Audio(
            label="Converted Audio",
            type="filepath",
            autoplay=True
        )
        status_text = gr.Textbox(
            label="Status",
            interactive=False
        )

    # Static usage instructions shown below the controls.
    gr.Markdown("""
    ### π How to Use

    1. **Upload or record** your source audio (the speech you want to convert)
    - Click the microphone icon to record directly from your microphone
    - Or upload an audio file (WAV, MP3, etc.)
    2. **Upload or record** your reference audio (the voice you want to mimic)
    - Click the microphone icon to record the target voice
    - Or upload a reference audio file
    3. Adjust generation parameters if needed (defaults work well)
    4. Click **Convert Voice** and wait for the result

    ### π‘ Tips

    - Keep audio clips under 30 seconds for best results
    - Reference audio should be clear speech (1+ seconds recommended)
    - When recording, speak clearly and minimize background noise
    - Higher repetition penalty helps avoid repetitive outputs
    - Lower temperature = more stable, higher = more creative
    """)

    # Wire the button to the conversion pipeline; slider order must match
    # convert_voice's positional parameters.
    convert_btn.click(
        fn=convert_voice,
        inputs=[
            source_audio,
            reference_audio,
            temperature,
            repetition_penalty,
            top_p,
            repetition_window,
            max_tokens,
        ],
        outputs=[output_audio, status_text]
    )
| |
|
# Script entry point: log environment info, try to pre-load the model so the
# first request is fast, then serve the UI on 0.0.0.0:7860 (HF Spaces default).
if __name__ == "__main__":
    import sys
    print("=" * 60, flush=True)
    print("π Starting LAVCO Gradio App", flush=True)
    print("=" * 60, flush=True)
    print(f"Device: {DEVICE}", flush=True)
    print(f"Model: {MODEL_ID}", flush=True)
    print(f"\nπ Checking for default audio files...", flush=True)
    print(f" Examples directory: {os.path.abspath(EXAMPLES_DIR)}", flush=True)
    print(f" Source audio: {DEFAULT_SOURCE_AUDIO or 'Not found'}", flush=True)
    print(f" Reference audio: {DEFAULT_REFERENCE_AUDIO or 'Not found'}", flush=True)
    sys.stdout.flush()

    # Best-effort pre-load: a failure here is non-fatal because load_model()
    # runs again lazily on the first conversion request.
    print("\nβ³ Pre-loading model (this may take a few minutes)...", flush=True)
    sys.stdout.flush()
    try:
        load_model()
        print("✅ Model ready! Starting Gradio interface...", flush=True)
        sys.stdout.flush()
    except Exception as e:
        print(f"β οΈ Model pre-loading failed: {e}", flush=True)
        print(" Model will load on first use instead.", flush=True)
        import traceback
        traceback.print_exc()
        sys.stdout.flush()

    print("\nπ Launching web interface...", flush=True)
    sys.stdout.flush()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
| |
|