| import io |
| import spaces |
| import torch |
| import requests |
| import tempfile |
| import numpy as np |
| import gradio as gr |
| import soundfile as sf |
| from transformers import AutoModel |
| from typing import Tuple |
| import uuid |
| import os |
|
|
| |
def detect_language_from_text(text: str) -> str:
    """Guess a language code for *text* from the script it is written in.

    Returns one of the 11 Indic codes (as, bn, gu, hi, kn, ml, mr, or, pa,
    ta, te) or "en"; falls back to "hi" when no known script is detected.
    """
    # Treat mostly-Latin input (> 30% of *distinct* characters) as English.
    distinct_chars = set(text)
    ascii_letters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
    if distinct_chars and len(distinct_chars & ascii_letters) / len(distinct_chars) > 0.3:
        return "en"

    # Representative character inventories per script. NOTE: Assamese/Bengali
    # share one script and Hindi/Marathi share Devanagari, so dict insertion
    # order decides those ties ('as' and 'hi' win).
    script_inventories = {
        'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
        'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
        'gu': set('અઆઇઈઉઊઋએઐઓઔકખગઘઙચછજઝઞટઠડઢણતથદધનપફબભમયરલળવશષસહક્ષજ્ઞ'),
        'hi': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
        'kn': set('ಅಆಇಈಉಊಋಏಐಓಔಕಖಗಘಙಚಛಜಝಞಟಠಡಢಣತಥದಧನಪಫಬಭಮಯರಲಳವಶಷಸಹಕ್ಷಜ್ಞ'),
        'ml': set('അആഇഈഉഊഋഏഐഓഔകഖഗഘങചഛജഝഞടഠഡഢണതഥദധനപഫബഭമയരലളവശഷസഹക്ഷജ്ഞ'),
        'mr': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
        'or': set('ଅଆଇଈଉଊଋୠଌଏଐଓଔକଖଗଘଙଚଛଜଝଞଟଠଡଢଣତଥଦଧନପଫବଭମୟରଳୱଶଷସହକ୍ଷୟଲଵଡ଼ଢ଼'),
        'pa': set('ਅਆਇਈਉਊਏਐਓਔਕਖਗਘਙਚਛਜਝਞਟਠਡਢਣਤਥਦਧਨਪਫਬਭਮਯਰਲਲ਼ਵਸ਼ਸਹਕਸ਼ਜ਼'),
        'ta': set('அஆஇஈஉஊஎஐஒஔகஙசஜஞடணதநபமயரலவழளஶஷஸஹக்ஷஜ்ஞ'),
        'te': set('అఆఇఈఉఊఋఎఐఒఔకఖగఘఙచఛజఝఞటఠడఢణతథదధనపఫబభమయరలళవశషసహక్షజ్ఞ'),
    }

    sample = set(text.replace(' ', ''))
    detected = next(
        (code for code, inventory in script_inventories.items() if sample & inventory),
        None,
    )
    # Default to Hindi when nothing matched (e.g. digits-only or empty input).
    return detected if detected is not None else 'hi'
|
|
| |
def slow_down_text(text):
    """Insert pacing pauses so the TTS model articulates complex scripts.

    Every word gets a trailing space, a ", " pause is injected after each
    third word, and the whole phrase is wrapped in ". . ." markers. Falsy
    input (empty string / None) maps to the empty string.
    """
    if not text:
        return ""

    pieces = []
    for count, token in enumerate(text.split(), start=1):
        pieces.append(token + " ")
        if count % 3 == 0:
            pieces.append(", ")

    return ". . . " + "".join(pieces) + " . . ."
|
|
| |
def load_audio_from_url(url, timeout=15):
    """Fetch an audio file over HTTP and decode it with soundfile.

    Args:
        url: HTTP(S) location of the audio file.
        timeout: seconds before the request is aborted (new parameter with a
            default, so existing callers are unaffected). Without it, a stalled
            download would hang the app at import time.

    Returns:
        (sample_rate, audio_data) on success, (None, None) on any failure
        (non-200 status, network error, or undecodable payload) — matching
        the original best-effort contract expected by the prefetch loop.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None, None
        audio_data, sample_rate = sf.read(io.BytesIO(response.content))
    except Exception as exc:  # best-effort boundary: network/decode failure modes vary
        print(f"Failed to load audio from {url}: {exc}")
        return None, None
    return sample_rate, audio_data
|
|
@spaces.GPU
def synthesize_speech(text, ref_audio, ref_text):
    """Clone the reference voice and speak *text* with it.

    Args:
        text: target sentence to synthesize.
        ref_audio: (sample_rate, ndarray) tuple from the gr.Audio input
            (type="numpy").
        ref_text: exact transcript of the reference clip.

    Returns:
        (output_path, output_path) — the same WAV path, fed to both the
        audio player and the file-download component.

    Raises:
        gr.Error: on missing/invalid inputs (shown to the user by Gradio).
    """
    if ref_audio is None:
        raise gr.Error("Please upload a Reference Audio file.")
    if ref_text.strip() == "":
        raise gr.Error("Please enter the text transcript for the Reference Audio.")
    if text.strip() == "":
        raise gr.Error("Please enter the text you want to generate.")

    if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
        sample_rate, audio_data = ref_audio
    else:
        raise gr.Error("Invalid reference audio input.")

    # The model wants a file path for the reference clip, so write it to a
    # temp WAV. delete=False is required (the model reopens the path), but the
    # original never removed the file afterwards — one leaked WAV per call.
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_path = temp_audio.name
            sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
            temp_audio.flush()

        # Pacing hack: pauses keep the model from skipping words.
        safe_text = slow_down_text(text)

        audio = model(safe_text, ref_audio_path=temp_path, ref_text=ref_text)
    finally:
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass  # best-effort cleanup; never mask a synthesis error

    # Normalize int16 PCM to float32 in [-1, 1) for writing.
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0

    # Unique filename so concurrent requests don't clobber each other.
    output_filename = f"generated_{uuid.uuid4().hex}.wav"
    output_path = os.path.join(tempfile.gettempdir(), output_filename)

    # 24 kHz assumed to be IndicF5's native output rate — TODO confirm.
    sf.write(output_path, audio, 24000)

    return output_path, output_path
|
|
|
|
| |
# --- Model initialization (runs at import time; blocks until weights load) ---
# IndicF5 ships custom modelling code on the Hub, hence trust_remote_code=True.
# NOTE(review): trust_remote_code executes repo-provided code at load time —
# acceptable only because ai4bharat is a known publisher; verify before
# pointing repo_id anywhere else.
repo_id = "ai4bharat/IndicF5"
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
# Prefer CUDA when visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device", device)
model = model.to(device)
|
|
| |
# Demo presets surfaced via gr.Examples. Per entry:
#   audio_name  - human-readable label (not referenced by the visible UI code)
#   audio_url   - reference clip, downloaded at startup
#   ref_text    - transcript of the reference clip
#   synth_text  - sentence to synthesize (Odia in both presets, so the
#                 examples exercise cross-lingual voice cloning)
EXAMPLES = [
    {
        "audio_name": "PAN_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
        # NOTE(review): this Punjabi transcript appears to contain stray
        # Telugu codepoints inside the word rendered as "ਬੇਮిసਾਲ" — verify
        # against the actual clip before relying on it.
        "ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮిసਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰਦ ਮਹਿਸੂਸ ਹੋਇਆ।",
        "synth_text": "ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି, କେମିତି ଅଛନ୍ତି?"
    },
    {
        "audio_name": "TAM_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav",
        "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
        "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
    },
]
|
|
| |
# Prefetch the reference clips at startup so gr.Examples can offer them
# inline. Entries whose download/decode fails are dropped instead of being
# stored as (None, None), which would otherwise flow into gr.Examples and
# break the UI build below.
_loaded_examples = []
for example in EXAMPLES:
    sample_rate, audio_data = load_audio_from_url(example["audio_url"])
    if sample_rate is None or audio_data is None:
        print(f"Warning: could not load example audio from {example['audio_url']}; skipping.")
        continue
    example["sample_rate"] = sample_rate
    example["audio_data"] = audio_data
    _loaded_examples.append(example)
# Mutate in place so every later reference to EXAMPLES sees the filtered list.
EXAMPLES[:] = _loaded_examples
|
|
|
|
| |
# --- Gradio UI ---
# Layout is defined by creation order and context-manager nesting; widgets
# render in the order they are instantiated inside each Row/Column.
with gr.Blocks() as iface:
    gr.Markdown(
        """
        # **IndicF5 Dubbing Studio**
        **Instructions for Best Results:**
        1. **Reference Audio:** Use a clear, 10-15 second clip. Slower speech works better.
        2. **Reference Text:** Must match the audio exactly.
        3. **Target Text:** Odia works best with punctuation. If it skips words, add commas.
        """
    )

    with gr.Row():
        with gr.Column():
            # Left column: the three inputs, in the same order as the
            # synthesize_speech(text, ref_audio, ref_text) signature.
            text_input = gr.Textbox(label="Text to Synthesize (Odia/English)", placeholder="Enter text here...", lines=3)
            # type="numpy" delivers the upload as a (sample_rate, ndarray) tuple.
            ref_audio_input = gr.Audio(type="numpy", label="Reference Voice (10-15s ideal)")
            ref_text_input = gr.Textbox(label="Transcript of Reference Audio", placeholder="What did the voice say?", lines=2)
            submit_btn = gr.Button("🎤 Generate Speech", variant="primary")

        with gr.Column():
            # Right column: player + download link; synthesize_speech returns
            # the same file path for both.
            output_audio = gr.Audio(label="Play Generated Speech", type="filepath")

            output_file = gr.File(label="Download Audio File", file_count="single")

    # Example rows mirror the inputs= order below.
    examples = [
        [ex["synth_text"], (ex["sample_rate"], ex["audio_data"]), ex["ref_text"]] for ex in EXAMPLES
    ]

    gr.Examples(
        examples=examples,
        inputs=[text_input, ref_audio_input, ref_text_input],
        label="Quick Examples"
    )

    submit_btn.click(
        synthesize_speech,
        inputs=[text_input, ref_audio_input, ref_text_input],
        outputs=[output_audio, output_file]
    )


# share=True additionally exposes a public *.gradio.live URL.
iface.launch(share=True)