import os
import sys
import gradio as gr

device = "cuda"
os.system('git clone https://github.com/Rudrabha/Wav2Lip.git')
os.system('pip3 install --upgrade pip')
os.system('curl -o ./Wav2Lip/face_detection/detection/sfd/s3fd.pth https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth')
os.system('pip3 install moviepy')
os.system('pip3 uninstall -y numpy')  # -y avoids blocking on the confirmation prompt
os.system('pip3 install --upgrade numpy')
os.system('pip3 install speechRecognition')
os.system('pip3 install gtts')
os.system('pip3 install googletrans==3.1.0a0')
os.system('pip3 install numba==0.48')
os.system('pip3 install transformers')

title = "Automatic translation and dubbing for Indic Languages"
description = "A demo application to dub and translate videos spoken in Tamil, Hindi, Bengali and Telugu"
article = "Official Repo: https://github.com/Rudrabha/Wav2Lip"

def inference(language, speed, voice, video):
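    # NOTE: the Male/Female `voice` selection is currently unused; gTTS offers a single voice per language
    # Extract the audio track from the uploaded video with moviepy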
    import moviepy.editor as mp
    clip = mp.VideoFileClip(video)
    clip.audio.write_audiofile(r"audio.wav")
    os.system('pip3 install pydub')
    os.system('pip3 install transformers==4.11.3 soundfile sentencepiece torchaudio librosa')
    from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
    import torch
    import librosa
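    # English ASR: transcribe the extracted audio with a pretrained Wav2Vec2 model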
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
    def get_transcription(audio_path):
      # librosa.load(..., sr=16000) already returns 16 kHz mono audio as a numpy
      # array, so no extra resampling step is needed
      speech, sr = librosa.load(audio_path, sr=16000)
      input_values = processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"]
      logits = model(input_values)["logits"]
      predicted_ids = torch.argmax(logits, dim=-1)
      transcription = processor.decode(predicted_ids[0])
      return transcription.lower()
    text = get_transcription("audio.wav")
    from googletrans import Translator
    from gtts import gTTS
    translator = Translator()
    # Map UI language names to the ISO 639-1 codes used by googletrans and gTTS
    lang_codes = {"Hindi": "hi", "Tamil": "ta", "Bengali": "bn", "Telugu": "te"}
    lang_code = lang_codes[language]
    translation = translator.translate(text, src='en', dest=lang_code)
    # `slow` is a gTTS option, not a googletrans one
    tts = gTTS(translation.text, lang=lang_code, slow=(speed == "Slow"))
    # gTTS always writes MP3 data; Wav2Lip's inference.py converts non-.wav audio via ffmpeg
    tts.save('input_audio.mp3')
    audio = "input_audio.mp3"
    os.system('mv ./Wav2Lip/* .')
    os.system("python inference.py --checkpoint_path ./wav2lip_gan.pth --face {} --audio {}".format(video, audio))
    return "./results/result_voice.mp4"

iface = gr.Interface(
    inference,
    inputs=[
        gr.Radio(["Tamil", "Hindi", "Bengali", "Telugu"], label="Language to translate to"),
        gr.Radio(["Slow", "Fast"], label="Speaking speed"),
        gr.Radio(["Male", "Female"], label="Preferred voice"),
        gr.Video(format="mp4", sources="upload", label="Video to be translated"),
    ],
    outputs=["video"],
    title=title,
    description=description,
    article=article,
)
iface.launch(allowed_paths=["."])