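"""Gradio app that translates a talking-head video into another language.

Pipeline: ffmpeg (downscale and audio extraction) -> Wav2Vec2 speech
recognition -> NLLB-200 text translation -> text-to-speech (Coqui TTS or
gTTS) -> Wav2Lip lip-syncing of the new audio onto the original video.
"""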
import gradio as gr
import subprocess
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
import librosa
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from gtts import gTTS

# Variables: default input and output languages
language_input_audio = 'en'
language_output_audio = 'ch'
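# Map the UI language codes to the FLORES-200 codes expected by NLLB
# ('ch' is this app's internal shorthand for Chinese).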
dict_lang = {
    'en': 'eng_Latn',
    'es': 'spa_Latn',
    'fr': 'fra_Latn',
    'de': 'deu_Latn',
    'pl': 'pol_Latn',
    'uk': 'ukr_Cyrl',
    'ro': 'ron_Latn',
    'hu': 'hun_Latn',
    'bg': 'bul_Cyrl',
    'nl': 'nld_Latn',
    'fi': 'fin_Latn',
    'sl': 'slv_Latn',
    'lv': 'lvs_Latn',
    'ga': 'gle_Latn',
    'ch': 'zho_Hant',
    'ru': 'rus_Cyrl'
    }

# Functions
# Sets the language of the input audio
def radio_lang_input(lang):
    return lang

# Sets the language of the output audio
def radio_input(lang):
    return lang

##
# Convert the input video file into translated text, audio, and a lip-synced video
def video_load(video, language_input_audio, language_output_audio):
    # Downscale the input video to 720p width (even height preserved by -2)
    subprocess.run(f'ffmpeg -y -i "{video}" -vf scale=720:-2 video720p.mp4', shell=True)
    # Extract the audio track as a 16 kHz WAV for speech recognition
    subprocess.run('ffmpeg -y -i video720p.mp4 -vn -ar 16000 -ac 2 -ab 192K -f wav sound_from_input_video.wav', shell=True)
    # Transcribe the audio with a Wav2Vec2 CTC model for the input language
    asr_models = {
        'en': "facebook/wav2vec2-base-960h",
        'ru': "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
    }
    processor = Wav2Vec2Processor.from_pretrained(asr_models[language_input_audio])
    model = Wav2Vec2ForCTC.from_pretrained(asr_models[language_input_audio])
    audio, rate = librosa.load('sound_from_input_video.wav', sr=16000)
    input_values = processor(audio, sampling_rate=rate, return_tensors="pt", padding="longest").input_values
    # Retrieve logits, take the argmax, and decode into text
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    # Translate the transcription with NLLB-200
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
    device = 0 if torch.cuda.is_available() else -1
    translation_pipeline = pipeline("translation", model=model, tokenizer=tokenizer,
                                    src_lang=dict_lang[language_input_audio],
                                    tgt_lang=dict_lang[language_output_audio],
                                    max_length=2000000, device=device)
    result = translation_pipeline(transcription)
    text_translations = result[0]['translation_text']
    # Synthesize speech from the translated text
    # Russian
    if language_output_audio == 'ru':
        tts = gTTS(text_translations, lang='ru')
        tts.save('ru.mp3')
        audio = 'ru.mp3'
    # European languages supported by Coqui TTS
    if language_output_audio in ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']:
        coquiTTS = CoquiTTS()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(text_translations, fp, speaker={"language": language_output_audio})
        audio = fp.name
    # Chinese
    if language_output_audio == 'ch':
        tts = gTTS(text_translations, lang='zh-CN')
        tts.save('china.mp3')
        audio = 'china.mp3'
    # Lip-sync the generated audio onto the video with Wav2Lip
    subprocess.run(f'python inference.py --checkpoint_path wav2lip_gan.pth --face video720p.mp4 --audio "{audio}" --nosmooth --pads 0 20 0 0', shell=True)
    video = 'results/result_voice.mp4'
    return text_translations, audio, video

##
# Create a lip-synced video from custom audio
def audio_to_video_custom(audio):
    subprocess.run(f'python inference.py --checkpoint_path wav2lip_gan.pth --face video720p.mp4 --audio "{audio}" --nosmooth --pads 0 20 0 0', shell=True)
    video = 'results/result_voice.mp4'
    return video

##
# Create audio from a custom translation text
def text_to_audio_custom(text_translations, language_output_audio):
    # Russian
    if language_output_audio == 'ru':
        tts = gTTS(text_translations, lang='ru')
        tts.save('ru.mp3')
        audio = 'ru.mp3'
    # European languages supported by Coqui TTS
    if language_output_audio in ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']:
        coquiTTS = CoquiTTS()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(text_translations, fp, speaker={"language": language_output_audio})
        audio = fp.name
    # Chinese
    if language_output_audio == 'ch':
        tts = gTTS(text_translations, lang='zh-CN')
        tts.save('china.mp3')
        audio = 'china.mp3'
    return audio

##### Blocks
with gr.Blocks(title="Speak video in any language") as demo:
    # State variables for the selected input and output languages
    var = gr.State('en')
    var_lang = gr.State('ch')
    # Markdown description
    gr.Markdown("Service for translating videos into other languages with support for the speaker's facial expressions")
    gr.Markdown("The uploaded video should show only a face, preferably without sudden head movements.")
    with gr.Row():
        with gr.Column():
            # Radio button to select the input video language
            radio_input_lang_video = gr.Radio(['en', 'ru'], value="en", label='Select input video language')
            # Video input
            video_input = gr.Video(label="Input Video")
            # Radio button to select the output language
            radio = gr.Radio(['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga', 'ch', 'ru'], value="ch", label='Choose the language you want to speak')
            # Main button
            btn_1 = gr.Button("1. Generate video with translated audio")

        with gr.Column():
            # Text output (editable, so the translation can be customized)
            translations_text = gr.Text(label="Generated Translations Text", interactive=True)
            # Button to generate speech from the (optionally edited) text
            btn_3 = gr.Button("Generate custom translations to speech")
            # Audio output
            translations_audio = gr.Audio(label="Generated Translations Audio", interactive=True, type="filepath")
            # Button to generate a video from the custom audio
            btn_2 = gr.Button("Generate video with custom audio")
            # Video output
            video_output = gr.Video(interactive=False, label="Generated Translations Video")
    # Change the input video language
    radio_input_lang_video.change(fn=radio_lang_input, inputs=radio_input_lang_video, outputs=var)
    # Change the output language
    radio.change(fn=radio_input, inputs=radio, outputs=var_lang)
    # Main button: run the full video translation pipeline
    btn_1.click(video_load, inputs=[video_input, var, var_lang], outputs=[translations_text, translations_audio, video_output])
    # Generate a video from custom audio
    btn_2.click(audio_to_video_custom, inputs=[translations_audio], outputs=[video_output])
    # Generate audio from custom text
    btn_3.click(text_to_audio_custom, inputs=[translations_text, var_lang], outputs=[translations_audio])

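# Launch the Gradio app (API docs disabled)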
demo.launch(show_api=False)