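"""Gradio demo: redub a talking-head video into another language.

Pipeline implemented below: ffmpeg extracts a 16 kHz audio track from the
uploaded video, wav2vec2 transcribes it, NLLB-200 translates the transcript,
gTTS / Coqui TTS synthesize the translated speech, and Wav2Lip re-renders
the face so the lip movement matches the new audio.
"""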
import gradio as gr
import subprocess
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
import librosa
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from gtts import gTTS
# variables: default language selections (mirrored by the gr.State defaults below)
language_input_audio = 'en'
language_output_audio = 'ch'
# FLORES-200 language codes, as expected by the NLLB translation model
dict_lang = {
    'en': 'eng_Latn',
    'es': 'spa_Latn',
    'fr': 'fra_Latn',
    'de': 'deu_Latn',
    'pl': 'pol_Latn',
    'uk': 'ukr_Cyrl',
    'ro': 'ron_Latn',
    'hu': 'hun_Latn',
    'bg': 'bul_Cyrl',
    'nl': 'nld_Latn',
    'fi': 'fin_Latn',
    'sl': 'slv_Latn',
    'lv': 'lvs_Latn',
    'ga': 'gle_Latn',
    'ch': 'zho_Hant',
    'ru': 'rus_Cyrl',
}
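# Note: 'ch' translates into Traditional Chinese (zho_Hant), while the gTTS
# call below uses 'zh-CN' (Simplified-script Mandarin); switch this entry to
# 'zho_Hans' if the translated text should match the TTS variant.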
# functions
# store the language of the input audio (written to the gr.State 'var')
def radio_lang_input(lang):
    return lang

# store the language of the output audio (written to the gr.State 'var_lang')
def radio_input(lang):
    return lang
##
# convert the input video file to text, audio and video
def video_load(video, language_input_audio, language_output_audio):
    # downscale the video to 720 px wide (height auto-derived, kept even);
    # list-form arguments avoid shell quoting issues with paths containing spaces
    subprocess.run(['ffmpeg', '-y', '-i', video, '-vf', 'scale=720:-2', 'video720p.mp4'])
    # extract a 16 kHz WAV track for speech recognition
    subprocess.run(['ffmpeg', '-y', '-i', 'video720p.mp4', '-vn', '-ar', '16000',
                    '-ac', '2', '-ab', '192K', '-f', 'wav', 'sound_from_input_video.wav'])
    # convert audio to text: load the wav2vec2 checkpoint for the input language
    asr_checkpoints = {
        'en': "facebook/wav2vec2-base-960h",
        'ru': "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
    }
    processor = Wav2Vec2Processor.from_pretrained(asr_checkpoints[language_input_audio])
    model = Wav2Vec2ForCTC.from_pretrained(asr_checkpoints[language_input_audio])
    audio, rate = librosa.load('sound_from_input_video.wav', sr=16000)
    input_values = processor(audio, sampling_rate=rate, return_tensors="pt", padding="longest").input_values
    # retrieve logits, then take the argmax and decode
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    # convert text to translated text with NLLB-200
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
    # run on GPU when available
    device = 0 if torch.cuda.is_available() else -1
    translation_pipeline = pipeline("translation", model=model, tokenizer=tokenizer,
                                    src_lang=dict_lang[language_input_audio],
                                    tgt_lang=dict_lang[language_output_audio],
                                    max_length=2000000, device=device)
    result = translation_pipeline(transcription)
    text_translations = result[0]['translation_text']
    # convert text to audio
    # Russian: gTTS
    if language_output_audio == 'ru':
        tts = gTTS(text_translations, lang='ru')
        tts.save('ru.mp3')
        audio = 'ru.mp3'
    # European languages covered by the Coqui TTS plugin
    if language_output_audio in ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']:
        coquiTTS = CoquiTTS()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(text_translations, fp, speaker={"language": language_output_audio})
        audio = fp.name
    # Chinese: gTTS
    if language_output_audio == 'ch':
        tts = gTTS(text_translations, lang='zh-CN')
        tts.save('china.mp3')
        audio = 'china.mp3'
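    # The lip-sync step assumes the Wav2Lip repository is available in the
    # working directory: inference.py and the wav2lip_gan.pth checkpoint come
    # from https://github.com/Rudrabha/Wav2Lip and are not defined in this file.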
    # audio to video: re-render the face so the lips match the generated audio
    subprocess.run(f'python inference.py --checkpoint_path wav2lip_gan.pth --face video720p.mp4 --audio {audio} --nosmooth --pads 0 20 0 0', shell=True)
    video = 'results/result_voice.mp4'
    return text_translations, audio, video
##
# create a video from custom audio (reuses the most recent video720p.mp4)
def audio_to_video_custom(audio):
    subprocess.run(f'python inference.py --checkpoint_path wav2lip_gan.pth --face video720p.mp4 --audio {audio} --nosmooth --pads 0 20 0 0', shell=True)
    video = 'results/result_voice.mp4'
    return video
##
# create audio from a custom (hand-edited) translation
def text_to_audio_custom(text_translations, language_output_audio):
    # Russian: gTTS
    if language_output_audio == 'ru':
        tts = gTTS(text_translations, lang='ru')
        tts.save('ru.mp3')
        audio = 'ru.mp3'
    # European languages covered by the Coqui TTS plugin
    if language_output_audio in ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']:
        coquiTTS = CoquiTTS()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(text_translations, fp, speaker={"language": language_output_audio})
        audio = fp.name
    # Chinese: gTTS
    if language_output_audio == 'ch':
        tts = gTTS(text_translations, lang='zh-CN')
        tts.save('china.mp3')
        audio = 'china.mp3'
    return audio
##### blocks
with gr.Blocks(title="Speak video in any language") as demo:
    # state variables for the selected languages
    var = gr.State('en')
    var_lang = gr.State('ch')
    # description
    gr.Markdown("A service that translates videos into other languages while preserving the speaker's facial expressions.")
    gr.Markdown("The uploaded video should contain a single face, preferably without sudden head movements.")
    with gr.Row():
        with gr.Column():
            # radio button to change the input language
            radio_input_lang_video = gr.Radio(['en', 'ru'], value="en", label='Select input video language')
            # video input
            seed = gr.Video(label="Input Video")
            # radio button to change the output language
            radio = gr.Radio(['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga', 'ch', 'ru'], value="ch", label='Choose the language you want to speak')
            # main button
            btn_1 = gr.Button("1. Generate video with translated audio")
        with gr.Column():
            # text output
            translations_text = gr.Text(label="Generated Translations Text", interactive=True)
            # button to turn the (possibly edited) text into speech
            btn_3 = gr.Button("Generate custom translations to speech")
            # audio output
            translations_audio = gr.Audio(label="Generated Translations Audio", interactive=True, type="filepath")
            # button to turn the (possibly edited) audio into video
            btn_2 = gr.Button("Generate video with custom audio")
            # video output
            video_output = gr.Video(interactive=False, label="Generated Translations Video")
    # change input video language
    radio_input_lang_video.change(fn=radio_lang_input, inputs=radio_input_lang_video, outputs=var)
    # change output language
    radio.change(fn=radio_input, inputs=radio, outputs=var_lang)
    # main button click: full video -> text -> audio -> video pipeline
    btn_1.click(video_load, inputs=[seed, var, var_lang], outputs=[translations_text, translations_audio, video_output])
    # button click: custom audio -> video
    btn_2.click(audio_to_video_custom, inputs=[translations_audio], outputs=[video_output])
    # button click: custom text -> audio
    btn_3.click(text_to_audio_custom, inputs=[translations_text, var_lang], outputs=[translations_audio])
demo.launch(show_api=False)
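# Note: intermediate artifacts (video720p.mp4, sound_from_input_video.wav, the
# generated mp3/wav clips and results/result_voice.mp4) use fixed filenames in
# the working directory, so each run overwrites the previous one and concurrent
# requests would clobber each other's files.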