# Gradio demo for multilingual talking-face generation
# https://huggingface.co/deepkyu/ml-talking-face

import os
import subprocess

import pkg_resources

# Pin httpx to a known-good version before gradio (which depends on it) is imported.
library = 'httpx'
installed_version = pkg_resources.get_distribution(library).version
desired_version = '0.25.0'

if installed_version != desired_version:
    subprocess.run(f'pip install --force-reinstall {library}=={desired_version}', shell=True)
    print(f"Reinstalled package: {library}=={desired_version}")

REST_IP = os.environ['REST_IP']
SERVICE_PORT = int(os.environ['SERVICE_PORT'])
TRANSLATION_APIKEY_URL = os.environ['TRANSLATION_APIKEY_URL']
GOOGLE_APPLICATION_CREDENTIALS = os.environ['GOOGLE_APPLICATION_CREDENTIALS']

# Download the translation API key to the credentials path expected by the translator.
subprocess.call(f"wget --no-check-certificate -O {GOOGLE_APPLICATION_CREDENTIALS} {TRANSLATION_APIKEY_URL}",
                shell=True)

TOXICITY_THRESHOLD = float(os.getenv('TOXICITY_THRESHOLD', 0.7))

import argparse
import threading
from pathlib import Path

import gradio as gr

from client_rest import RestAPIApplication
from toxicity_estimator import PerspectiveAPI
from translator import Translator
from utils import get_snippet_from_url


class GradioApplication:
    def __init__(self, rest_ip, rest_port, max_seed, server_port=7860, share=False):
        self.lang_list = {
            'ko': 'ko_KR',
            'en': 'en_US',
            'ja': 'ja_JP',
            'zh': 'zh_CN',
            'zh-CN': 'zh_CN'
        }
        self.background_list = [None,
                                "background_image/cvpr.png",
                                "background_image/black.png",
                                "background_image/river.mp4",
                                "background_image/sky.mp4"]
        self.perspective_api = PerspectiveAPI()
        self.translator = Translator()
        self.rest_application = RestAPIApplication(rest_ip, rest_port)
        self.output_dir = Path("output_file")

        self.max_seed = max_seed
        self._file_seed = 0
        self.lock = threading.Lock()

        with gr.Blocks(
            theme="deepkyu/compact-theme",
            css=get_snippet_from_url("https://huggingface.co/spaces/deepkyu/compact-theme/raw/main/main.css")
        ) as demo:
            with gr.Row(equal_height=True):
                with gr.Column(scale=8):
                    gr.Markdown(Path("docs/title.md").read_text(), sanitize_html=False)
                with gr.Column(scale=1):
                    toggle_dark = gr.Button(value="Dark", variant='stop')

            # Toggle dark mode purely on the client side.
            toggle_dark.click(
                None,
                js="""
                () => {
                    document.body.classList.toggle('dark');
                }
                """,
            )

            gr.Markdown(Path("docs/description.md").read_text(), sanitize_html=False)

            with gr.Row(equal_height=True):
                with gr.Column(scale=1):
                    text_input, lang_input, duration_rate_input, action_input, background_input = prepare_input()
                    submit_button = gr.Button(value="Run", variant="primary")
                with gr.Column(scale=1):
                    toxicity_output, translation_result_output, video_output = prepare_output()

            submit_button.click(
                fn=self.infer,
                inputs=[text_input, lang_input, duration_rate_input, action_input, background_input],
                outputs=[toxicity_output, translation_result_output, video_output],
            )

            gr.Markdown(Path("docs/article.md").read_text(), sanitize_html=False)

        demo.queue().launch(share=share, server_port=server_port)

    def _get_file_seed(self):
        # Wrap the running counter into [0, max_seed) so old output files are reused.
        return f"{self._file_seed % self.max_seed:02d}"

    def _reset_file_seed(self):
        self._file_seed = 0

    def _counter_file_seed(self):
        with self.lock:
            self._file_seed += 1

    def get_lang_code(self, lang):
        return self.lang_list[lang]

    def get_background_data(self, background_index):
        # Read the selected background file and check whether it is a video (.mp4).
        data_path = self.background_list[background_index]

        if data_path is not None:
            with open(data_path, 'rb') as rf:
                background_data = rf.read()
            is_video_background = str(data_path).endswith(".mp4")
        else:
            background_data = None
            is_video_background = False

        return background_data, is_video_background

    @staticmethod
    def return_format(toxicity_prob, target_text, lang_dest, video_filename, detail=""):
        # Pack the three Gradio outputs: toxicity label, translation textbox, and video path.
        return ({'Toxicity': toxicity_prob},
                f"Language: {lang_dest}\nText: {target_text}\n-\nDetails: {detail}",
                str(video_filename))

    def infer(self, text, lang, duration_rate, action, background_index):
        self._counter_file_seed()
        print(f"File Seed: {self._file_seed}")
        toxicity_prob = 0.0
        target_text = ""
        lang_dest = ""
        video_filename = "vacant.mp4"

        # Toxicity estimation
        try:
            toxicity_prob = self.perspective_api.get_score(text)
        except Exception:
            # When the Perspective API is unavailable, fall back to a score of 0.0.
            pass

        if toxicity_prob > TOXICITY_THRESHOLD:
            detail = "Sorry, it seems that the input text is too toxic."
            return self.return_format(toxicity_prob, target_text, lang_dest, video_filename,
                                      detail=f"Error: {detail}")

        # Google Translate API
        try:
            target_text, lang_dest = self.translator.get_translation(text, lang)
        except Exception as e:
            target_text = ""
            lang_dest = ""
            detail = f"Error from language translation: ({e})"
            return self.return_format(toxicity_prob, target_text, lang_dest, video_filename,
                                      detail=f"Error: {detail}")

        try:
            self.translator.length_check(lang_dest, target_text)  # assertion check
        except AssertionError as e:
            return self.return_format(toxicity_prob, target_text, lang_dest, video_filename,
                                      detail=f"Error: {str(e)}")

        lang_rpc_code = self.get_lang_code(lang_dest)

        # Video inference through the REST API
        background_data, is_video_background = self.get_background_data(background_index)

        video_data = self.rest_application.get_video(target_text, lang_rpc_code, duration_rate, action.lower(),
                                                     background_data, is_video_background)
        print(f"Video data size: {len(video_data)}")

        video_filename = self.output_dir / f"{self._get_file_seed()}.mkv"
        with open(video_filename, "wb") as video_file:
            video_file.write(video_data)

        return self.return_format(toxicity_prob, target_text, lang_dest, video_filename)


def prepare_input():
    text_input = gr.Textbox(lines=2,
                            placeholder="Type your text in English, Chinese, Korean, or Japanese.",
                            value="Hello, this is a demonstration of talking face generation "
                                  "with multilingual text-to-speech.",
                            label="Text")
    lang_input = gr.Radio(['Korean', 'English', 'Japanese', 'Chinese'],
                          type='value',
                          value='Korean',
                          label="Language")
    duration_rate_input = gr.Slider(minimum=0.8,
                                    maximum=1.2,
                                    step=0.01,
                                    value=1.0,
                                    label="Duration (the bigger the value, the slower the speech)")
    action_input = gr.Radio(['Default', 'Hand', 'BothHand', 'HandDown', 'Sorry'],
                            type='value',
                            value='Default',
                            label="Select an action ...")
    background_input = gr.Radio(['None', 'CVPR', 'Black', 'River', 'Sky'],
                                type='index',
                                value='None',
                                label="Select a background image/video ...")

    return text_input, lang_input, duration_rate_input, action_input, background_input


def prepare_output():
    toxicity_output = gr.Label(num_top_classes=1, label="Toxicity (from Perspective API)")
    translation_result_output = gr.Textbox(type="text", label="Translation Result")
    video_output = gr.Video(format='mp4')
    return toxicity_output, translation_result_output, video_output


def parse_args():
    parser = argparse.ArgumentParser(
        description='GRADIO DEMO for talking face generation submitted to CVPR2022')
    parser.add_argument('-p', '--port', dest='gradio_port',
                        type=int, default=7860, help="Port for gradio")
    parser.add_argument('--rest_ip', type=str, default=REST_IP, help="IP for REST API")
    parser.add_argument('--rest_port', type=int, default=SERVICE_PORT, help="Port for REST API")
    parser.add_argument('--max_seed', type=int, default=20, help="Max seed for saving video")
    parser.add_argument('--share', action='store_true', help='get publicly sharable link')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()

    gradio_application = GradioApplication(args.rest_ip, args.rest_port, args.max_seed,
                                           server_port=args.gradio_port, share=args.share)
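# Example invocation (a sketch only; the filename `app.py` and the endpoint/URL values
# below are assumptions, not part of this repo). The script reads REST_IP, SERVICE_PORT,
# TRANSLATION_APIKEY_URL, and GOOGLE_APPLICATION_CREDENTIALS from the environment before
# launching, and the CLI flags come from parse_args() above:
#
#   REST_IP=127.0.0.1 SERVICE_PORT=8080 \
#   TRANSLATION_APIKEY_URL=https://example.com/translation_key.json \
#   GOOGLE_APPLICATION_CREDENTIALS=./translation_key.json \
#   python app.py --port 7860 --max_seed 20 --share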