| import io |
| import spaces |
| import torch |
| import requests |
| import tempfile |
| import numpy as np |
| import gradio as gr |
| import soundfile as sf |
| from transformers import AutoModel |
| from typing import Tuple |
| import uuid |
| import os |
|
|
| |
def detect_language_from_text(text: str) -> str:
    """Guess a language code for *text* from the script it is written in.

    Returns one of the 11 Indic codes (as, bn, gu, hi, kn, ml, mr, or, pa,
    ta, te) or "en"; falls back to "hi" when no known script is detected.
    """
    # Treat mostly-Latin input (> 30% of *distinct* characters) as English.
    distinct_chars = set(text)
    ascii_letters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
    if distinct_chars and len(distinct_chars & ascii_letters) / len(distinct_chars) > 0.3:
        return "en"

    # Representative character inventories per script. NOTE: Assamese/Bengali
    # share one script and Hindi/Marathi share Devanagari, so dict insertion
    # order decides those ties ('as' and 'hi' win).
    script_inventories = {
        'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
        'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
        'gu': set('અઆઇઈઉઊઋએઐઓઔકખગઘઙચછજઝઞટઠડઢણતથદધનપફબભમયરલળવશષસહક્ષજ્ઞ'),
        'hi': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
        'kn': set('ಅಆಇಈಉಊಋಏಐಓಔಕಖಗಘಙಚಛಜಝಞಟಠಡಢಣತಥದಧನಪಫಬಭಮಯರಲಳವಶಷಸಹಕ್ಷಜ್ಞ'),
        'ml': set('അആഇഈഉഊഋഏഐഓഔകഖഗഘങചഛജഝഞടഠഡഢണതഥദധനപഫബഭമയരലളവശഷസഹക്ഷജ്ഞ'),
        'mr': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
        'or': set('ଅଆଇଈଉଊଋୠଌଏଐଓଔକଖଗଘଙଚଛଜଝଞଟଠଡଢଣତଥଦଧନପଫବଭମୟରଳୱଶଷସହକ୍ଷୟଲଵଡ଼ଢ଼'),
        'pa': set('ਅਆਇਈਉਊਏਐਓਔਕਖਗਘਙਚਛਜਝਞਟਠਡਢਣਤਥਦਧਨਪਫਬਭਮਯਰਲਲ਼ਵਸ਼ਸਹਕਸ਼ਜ਼'),
        'ta': set('அஆஇஈஉஊஎஐஒஔகஙசஜஞடணதநபமயரலவழளஶஷஸஹக்ஷஜ்ஞ'),
        'te': set('అఆఇఈఉఊఋఎఐఒఔకఖగఘఙచఛజఝఞటఠడఢణతథదధనపఫబభమయరలళవశషసహక్షజ్ఞ'),
    }

    sample = set(text.replace(' ', ''))
    detected = next(
        (code for code, inventory in script_inventories.items() if sample & inventory),
        None,
    )
    # Default to Hindi when nothing matched (e.g. digits-only or empty input).
    return detected if detected is not None else 'hi'
|
|
| |
def slow_down_text(text):
    """Insert pacing pauses so the TTS model articulates complex scripts.

    Every word gets a trailing space, a ", " pause is injected after each
    third word, and the whole phrase is wrapped in ". . ." markers. Falsy
    input (empty string / None) maps to the empty string.
    """
    if not text:
        return ""

    pieces = []
    for count, token in enumerate(text.split(), start=1):
        pieces.append(token + " ")
        if count % 3 == 0:
            pieces.append(", ")

    return ". . . " + "".join(pieces) + " . . ."
|
|
| |
def load_audio_from_url(url, timeout=15):
    """Fetch an audio file over HTTP and decode it with soundfile.

    Args:
        url: HTTP(S) location of the audio file.
        timeout: seconds before the request is aborted (new parameter with a
            default, so existing callers are unaffected). Without it, a stalled
            download would hang the app at import time.

    Returns:
        (sample_rate, audio_data) on success, (None, None) on any failure
        (non-200 status, network error, or undecodable payload) — matching
        the original best-effort contract expected by the prefetch loop.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None, None
        audio_data, sample_rate = sf.read(io.BytesIO(response.content))
    except Exception as exc:  # best-effort boundary: network/decode failure modes vary
        print(f"Failed to load audio from {url}: {exc}")
        return None, None
    return sample_rate, audio_data
|
|
@spaces.GPU
def synthesize_speech(text, ref_audio, ref_text):
    """Clone the reference voice and speak *text* with it.

    Args:
        text: target sentence to synthesize.
        ref_audio: (sample_rate, ndarray) tuple from the gr.Audio input
            (type="numpy").
        ref_text: exact transcript of the reference clip.

    Returns:
        (output_path, output_path) — the same WAV path, fed to both the
        audio player and the file-download component.

    Raises:
        gr.Error: on missing/invalid inputs (shown to the user by Gradio).
    """
    if ref_audio is None:
        raise gr.Error("Please upload a Reference Audio file.")
    if ref_text.strip() == "":
        raise gr.Error("Please enter the text transcript for the Reference Audio.")
    if text.strip() == "":
        raise gr.Error("Please enter the text you want to generate.")

    if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
        sample_rate, audio_data = ref_audio
    else:
        raise gr.Error("Invalid reference audio input.")

    # The model wants a file path for the reference clip, so write it to a
    # temp WAV. delete=False is required (the model reopens the path), but the
    # original never removed the file afterwards — one leaked WAV per call.
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_path = temp_audio.name
            sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
            temp_audio.flush()

        # Pacing hack: pauses keep the model from skipping words.
        safe_text = slow_down_text(text)

        audio = model(safe_text, ref_audio_path=temp_path, ref_text=ref_text)
    finally:
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass  # best-effort cleanup; never mask a synthesis error

    # Normalize int16 PCM to float32 in [-1, 1) for writing.
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0

    # Unique filename so concurrent requests don't clobber each other.
    output_filename = f"generated_{uuid.uuid4().hex}.wav"
    output_path = os.path.join(tempfile.gettempdir(), output_filename)

    # 24 kHz assumed to be IndicF5's native output rate — TODO confirm.
    sf.write(output_path, audio, 24000)

    return output_path, output_path
|
|
|
|
| |
# --- Model initialization (runs at import time; blocks until weights load) ---
# IndicF5 ships custom modelling code on the Hub, hence trust_remote_code=True.
# NOTE(review): trust_remote_code executes repo-provided code at load time —
# acceptable only because ai4bharat is a known publisher; verify before
# pointing repo_id anywhere else.
repo_id = "ai4bharat/IndicF5"
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
# Prefer CUDA when visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device", device)
model = model.to(device)
|
|
| |
# Demo presets surfaced via gr.Examples. Per entry:
#   audio_name  - human-readable label (not referenced by the visible UI code)
#   audio_url   - reference clip, downloaded at startup
#   ref_text    - transcript of the reference clip
#   synth_text  - sentence to synthesize (Odia in both presets, so the
#                 examples exercise cross-lingual voice cloning)
EXAMPLES = [
    {
        "audio_name": "PAN_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
        # NOTE(review): this Punjabi transcript appears to contain stray
        # Telugu codepoints inside the word rendered as "ਬੇਮిసਾਲ" — verify
        # against the actual clip before relying on it.
        "ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮిసਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰਦ ਮਹਿਸੂਸ ਹੋਇਆ।",
        "synth_text": "ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି, କେମିତି ଅଛନ୍ତି?"
    },
    {
        "audio_name": "TAM_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav",
        "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
        "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
    },
]
|
|
| |
# Prefetch the reference clips at startup so gr.Examples can offer them
# inline. Entries whose download/decode fails are dropped instead of being
# stored as (None, None), which would otherwise flow into gr.Examples and
# break the UI build below.
_loaded_examples = []
for example in EXAMPLES:
    sample_rate, audio_data = load_audio_from_url(example["audio_url"])
    if sample_rate is None or audio_data is None:
        print(f"Warning: could not load example audio from {example['audio_url']}; skipping.")
        continue
    example["sample_rate"] = sample_rate
    example["audio_data"] = audio_data
    _loaded_examples.append(example)
# Mutate in place so every later reference to EXAMPLES sees the filtered list.
EXAMPLES[:] = _loaded_examples
|
|
|
|
| |
# --- Gradio UI ---
# Layout is defined by creation order and context-manager nesting; widgets
# render in the order they are instantiated inside each Row/Column.
with gr.Blocks() as iface:
    gr.Markdown(
        """
        # **IndicF5 Dubbing Studio**
        **Instructions for Best Results:**
        1. **Reference Audio:** Use a clear, 10-15 second clip. Slower speech works better.
        2. **Reference Text:** Must match the audio exactly.
        3. **Target Text:** Odia works best with punctuation. If it skips words, add commas.
        """
    )

    with gr.Row():
        with gr.Column():
            # Left column: the three inputs, in the same order as the
            # synthesize_speech(text, ref_audio, ref_text) signature.
            text_input = gr.Textbox(label="Text to Synthesize (Odia/English)", placeholder="Enter text here...", lines=3)
            # type="numpy" delivers the upload as a (sample_rate, ndarray) tuple.
            ref_audio_input = gr.Audio(type="numpy", label="Reference Voice (10-15s ideal)")
            ref_text_input = gr.Textbox(label="Transcript of Reference Audio", placeholder="What did the voice say?", lines=2)
            submit_btn = gr.Button("🎤 Generate Speech", variant="primary")

        with gr.Column():
            # Right column: player + download link; synthesize_speech returns
            # the same file path for both.
            output_audio = gr.Audio(label="Play Generated Speech", type="filepath")

            output_file = gr.File(label="Download Audio File", file_count="single")

    # Example rows mirror the inputs= order below.
    examples = [
        [ex["synth_text"], (ex["sample_rate"], ex["audio_data"]), ex["ref_text"]] for ex in EXAMPLES
    ]

    gr.Examples(
        examples=examples,
        inputs=[text_input, ref_audio_input, ref_text_input],
        label="Quick Examples"
    )

    submit_btn.click(
        synthesize_speech,
        inputs=[text_input, ref_audio_input, ref_text_input],
        outputs=[output_audio, output_file]
    )


# share=True additionally exposes a public *.gradio.live URL.
iface.launch(share=True)