import gradio as gr
from huggingface_hub import AsyncInferenceClient
import asyncio
import tempfile
import os
import uuid
import requests
from gtts import gTTS
import re
import torch
import torchaudio
from transformers import pipeline
from transformers.utils import is_flash_attn_2_available

MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

VOICE_MODELS = {
    "English": "ai4bharat/indic-parler-tts",
    "Hindi": "ai4bharat/indic-parler-tts",
    "Bengali": "ai4bharat/indic-parler-tts",
    "Tamil": "ai4bharat/indic-parler-tts",
    "Telugu": "ai4bharat/indic-parler-tts",
    "Malayalam": "ai4bharat/indic-parler-tts",
    "Punjabi": "ai4bharat/indic-parler-tts"
}

PARLER_LANG_CODES = {
    "English": "en",
    "Hindi": "hi",
    "Bengali": "bn",
    "Tamil": "ta",
    "Telugu": "te",
    "Malayalam": "ml",
    "Punjabi": "pa"
}

GTTS_CONFIG = {
    "English": {"lang": "en", "tld": "co.in"},
    "Hindi": {"lang": "hi", "tld": "co.in"},
    "Bengali": {"lang": "bn", "tld": "co.in"},
    "Tamil": {"lang": "ta", "tld": "co.in"},
    "Telugu": {"lang": "te", "tld": "co.in"},
    "Malayalam": {"lang": "ml", "tld": "co.in"},
    "Punjabi": {"lang": "pa", "tld": "co.in"}
}

TRANSLATE_LANG_CODES = {
    "English": "en", "Hindi": "hi", "Bengali": "bn",
    "Tamil": "ta", "Telugu": "te", "Malayalam": "ml", "Punjabi": "pa"
}

DEFAULT_HF_TOKEN = os.getenv("HF_TOKEN")
_CLIENT_CACHE = {}
_PARLER_CACHE = {}

def robust_google_translate(text: str, target_lang: str) -> str:
    """Handles LONG English → FULL target-language translation by splitting into sentences."""
    if target_lang == "en":
        return text
    sentences = re.split(r'(?<=[.!?])\s+', text)
    translated_parts = []
    url = "https://translate.googleapis.com/translate_a/single"
    for sentence in sentences:
        if len(sentence.strip()) < 3:
            continue
        params = {
            "client": "gtx",
            "sl": "en",
            "tl": target_lang,
            "dt": "t",
            "q": sentence.strip()
        }
        try:
            response = requests.get(url, params=params, timeout=8)
            if response.status_code == 200:
                data = response.json()
                # The endpoint returns a list of segments; join them all so long sentences are not truncated.
                translated_parts.append("".join(seg[0] for seg in data[0] if seg and seg[0]))
            else:
                translated_parts.append(sentence.strip())
        except Exception:
            # Network or parsing failures fall back to the untranslated sentence.
            translated_parts.append(sentence.strip())
    return ' '.join(translated_parts).strip()
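
# Example (illustrative call, not part of the app):
#   robust_google_translate("Shiva opened his third eye. The worlds trembled.", "te")
# would split the text into two sentences, translate each one through the unofficial
# translate.googleapis.com endpoint, and join the translated pieces back together.
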
def get_async_client(api_token: str | None):
    token_to_use = api_token if api_token and api_token.strip() else DEFAULT_HF_TOKEN
    key = token_to_use or "NO_TOKEN"
    if key not in _CLIENT_CACHE:
        _CLIENT_CACHE[key] = AsyncInferenceClient(MODEL_ID, token=token_to_use)
    return _CLIENT_CACHE[key]

def clean_text_for_tts(text: str) -> str:
    cleaned = re.sub(r"\*{1,3}", "", text)           # strip markdown emphasis markers
    cleaned = re.sub(r"[“”]", '"', cleaned)          # normalise curly double quotes
    cleaned = re.sub(r"’", "'", cleaned)             # normalise curly apostrophes
    cleaned = re.sub(r"\s+", " ", cleaned).strip()   # collapse runs of whitespace
    return cleaned
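
# Example (illustrative only):
#   clean_text_for_tts('**Hark!**  The “great” lord’s tale')
# returns: Hark! The "great" lord's tale
# (asterisks stripped, curly quotes normalised, whitespace collapsed).
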
def get_parler_tts_pipeline(device: str):
    """Cached Indic Parler-TTS pipeline for local inference."""
    device_id = 0 if device == "cuda" else -1
    key = f"{device}_{device_id}"
    if key not in _PARLER_CACHE:
        _PARLER_CACHE[key] = pipeline(
            "text-to-audio",
            model="ai4bharat/indic-parler-tts",
            device=device_id,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32
        )
    return _PARLER_CACHE[key]
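
# NOTE: the ai4bharat/indic-parler-tts model card drives the model through the separate
# `parler_tts` package rather than the generic "text-to-audio" pipeline used above.
# If the pipeline route rejects the voice_description/lang_code kwargs passed below, a
# rough sketch of the documented route (assuming `pip install parler-tts`) looks like:
#
#   from parler_tts import ParlerTTSForConditionalGeneration
#   from transformers import AutoTokenizer
#   model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to(device)
#   prompt_tok = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
#   desc_tok = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
#   desc = desc_tok(voice_desc, return_tensors="pt").to(device)
#   prompt = prompt_tok(tts_text, return_tensors="pt").to(device)
#   wav = model.generate(input_ids=desc.input_ids, attention_mask=desc.attention_mask,
#                        prompt_input_ids=prompt.input_ids, prompt_attention_mask=prompt.attention_mask)
#   torchaudio.save(tmp_path, wav.cpu().reshape(1, -1).float(), model.config.sampling_rate)
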
async def generate_audio_file(text, language, api_token=None):
    output_dir = tempfile.gettempdir()
    filename = f"story_{uuid.uuid4()}.wav"
    tmp_path = os.path.join(output_dir, filename)

    # STEP 1: Try Indic Parler-TTS (LOCAL INDIAN VOICES)
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tts = get_parler_tts_pipeline(device)
        lang_code = PARLER_LANG_CODES.get(language, "hi")
        tts_text = clean_text_for_tts(text[:1000])  # Parler-TTS input limit

        # Authentic regional voice description
        voice_desc = {
            "Hindi": "devotional male storyteller with Indian accent",
            "Telugu": "ancient Telugu storyteller voice",
            "Tamil": "devotional Tamil narrator",
            "Bengali": "epic Bengali storyteller",
            "Malayalam": "Malayalam devotional voice",
            "Punjabi": "Punjabi epic narrator",
            "English": "Indian English storyteller"
        }.get(language, "devotional storyteller")
        audio = tts(tts_text, voice_description=voice_desc, lang_code=lang_code)
        # The pipeline returns a numpy waveform; torchaudio.save expects a (channels, frames) tensor.
        waveform = torch.as_tensor(audio["audio"]).reshape(1, -1).to(torch.float32)
        torchaudio.save(tmp_path, waveform, audio["sampling_rate"])
        print(f"✓ Indic Parler-TTS success: {language} local voice")
        return tmp_path
    except Exception as e:
        print(f"Parler-TTS failed: {e}. Falling back to gTTS...")
    # STEP 2: gTTS fallback
    try:
        filename = f"story_{uuid.uuid4()}.mp3"
        tmp_path = os.path.join(output_dir, filename)
        config = GTTS_CONFIG.get(language, {"lang": "en", "tld": "com"})

        def _gtts():
            tts = gTTS(text=text, lang=config["lang"], tld=config["tld"], slow=False)
            tts.save(tmp_path)

        await asyncio.to_thread(_gtts)
        print(f"✓ gTTS fallback: {language}")
        return tmp_path
    except Exception as e:
        raise RuntimeError(f"All TTS engines failed: {e}") from e

async def generate_story_text(prompt, system_prompt, api_token=None):
    client = get_async_client(api_token)
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}]
    try:
        response = await client.chat_completion(messages, max_tokens=500, stream=False)
        return response.choices[0].message.content
    except Exception:
        # Fall back to raw text generation with Qwen's ChatML prompt format.
        full_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
        return await client.text_generation(full_prompt, max_new_tokens=500, repetition_penalty=1.1)

async def generate_story_and_audio(item_name, description, language, api_token):
    """
    Pipeline:
    1. Qwen → FULL English story
    2. Google Translate → FULL target-language text
    3. LOCAL Indic Parler-TTS → authentic Indian voices
    """
    system_prompt = "You are an ancient Pauranik storyteller. Epic, devotional tone."
    english_prompt = f"""Create a detailed mythological story about "{item_name}".
Context: {description}
Structure:
- Powerful introduction
- Complete legend
- Moral/lesson at end
Requirements:
- English only
- 200-250 words (full length)
- Ancient, grand style
- No modern words"""
    # STEP 1: Generate FULL English story
    try:
        core_story = await generate_story_text(english_prompt, system_prompt, api_token)
    except Exception as e:
        return f"Story generation failed: {e}", None

    # STEP 2: Translate FULL story
    final_story_text = core_story
    if language != "English":
        target_lang_code = TRANSLATE_LANG_CODES.get(language, "en")
        final_story_text = robust_google_translate(core_story, target_lang_code)
        print(f"Translated {len(core_story)} chars → {len(final_story_text)} chars")
    # STEP 3: Generate LOCAL INDIAN VOICE audio
    try:
        audio_path = await generate_audio_file(final_story_text, language, api_token)
        return final_story_text, audio_path
    except Exception as e:
        print(f"Audio generation failed: {e}")
        return final_story_text, None

# ==============================
# GRADIO UI
# ==============================
with gr.Blocks(title="Mythology Storyteller - LOCAL INDIAN VOICES") as demo:
    gr.Markdown("# **LOCAL INDIAN VOICES** - Authentic Regional Accents")
    gr.Markdown("Qwen → Full Translation → **Indic Parler-TTS Local Voices**")
    gr.Markdown("**Languages:** Hindi, Telugu, Tamil, Bengali, Malayalam, Punjabi")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input")
            api_token_input = gr.Textbox(label="HF Token", placeholder="hf_...", type="password")
            item_input = gr.Textbox(label="Item Name", placeholder="Lord Shiva", value="Lord Shiva")
            desc_input = gr.Textbox(label="Description", placeholder="Cosmic dance of destruction...",
                                    value="The destroyer of evil, meditator in Himalayas", lines=3)
            lang_input = gr.Dropdown(choices=list(VOICE_MODELS.keys()), value="Telugu", label="Local Voice")
            submit_btn = gr.Button("Generate Full Story & Local Audio", variant="primary", size="lg")
        with gr.Column():
            gr.Markdown("### Output")
            story_output = gr.Textbox(label="Complete Story", lines=15)
            audio_output = gr.Audio(label="Local Indian Voice", type="filepath")
    submit_btn.click(
        fn=generate_story_and_audio,
        inputs=[item_input, desc_input, lang_input, api_token_input],
        outputs=[story_output, audio_output],
        api_name="predict"
    )
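
    # The click handler is also exposed over the HTTP API via api_name="predict", so the
    # Space can be driven programmatically. A minimal sketch with gradio_client, assuming
    # a hypothetical Space id "your-username/mythology-storyteller":
    #
    #   from gradio_client import Client
    #   client = Client("your-username/mythology-storyteller")
    #   story, audio_path = client.predict(
    #       "Lord Shiva", "The destroyer of evil", "Telugu", "", api_name="/predict"
    #   )
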
| gr.Markdown("**Test:** 'Lord Shiva' + Telugu = **Authentic Telugu storyteller voice**") | |
| gr.Markdown("*First run downloads ~1GB model (one-time). Uses GPU/CPU local inference.*") | |
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)