deepkyu commited on
Commit
afbc1dd
1 Parent(s): 5d56d4a

Update theme, fix error

Browse files
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 👄
4
  colorFrom: blue
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 3.0.6
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-sa-4.0
4
  colorFrom: blue
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.13.0
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-sa-4.0
app.py CHANGED
@@ -17,14 +17,12 @@ from client_rest import RestAPIApplication
17
  from pathlib import Path
18
  import argparse
19
  import threading
20
- import yaml
21
 
22
- TITLE = Path("docs/title.txt").read_text()
23
- DESCRIPTION = Path("docs/description.md").read_text()
24
 
25
 
26
  class GradioApplication:
27
- def __init__(self, rest_ip, rest_port, max_seed):
28
  self.lang_list = {
29
  'ko': 'ko_KR',
30
  'en': 'en_US',
@@ -42,21 +40,46 @@ class GradioApplication:
42
  self.translator = Translator()
43
  self.rest_application = RestAPIApplication(rest_ip, rest_port)
44
  self.output_dir = Path("output_file")
45
-
46
- inputs = prepare_input()
47
- outputs = prepare_output()
48
-
49
- self.iface = gr.Interface(fn=self.infer,
50
- title=TITLE,
51
- description=DESCRIPTION,
52
- inputs=inputs,
53
- outputs=outputs,
54
- allow_flagging='never',
55
- article=Path("docs/article.md").read_text())
56
-
57
  self.max_seed = max_seed
58
  self._file_seed = 0
59
  self.lock = threading.Lock()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
 
62
  def _get_file_seed(self):
@@ -112,6 +135,7 @@ class GradioApplication:
112
  try:
113
  target_text, lang_dest = self.translator.get_translation(text, lang)
114
  except Exception as e:
 
115
  target_text = ""
116
  lang_dest = ""
117
  detail = f"Error from language translation: ({e})"
@@ -137,15 +161,6 @@ class GradioApplication:
137
 
138
  return self.return_format(toxicity_prob, target_text, lang_dest, video_filename)
139
 
140
- def run(self, server_port=7860, share=False):
141
- try:
142
- self.iface.launch(height=900,
143
- share=share, server_port=server_port,
144
- enable_queue=True)
145
-
146
- except KeyboardInterrupt:
147
- gr.close_all()
148
-
149
 
150
  def prepare_input():
151
  text_input = gr.Textbox(lines=2,
@@ -155,7 +170,7 @@ def prepare_input():
155
  label="Text")
156
  lang_input = gr.Radio(['Korean', 'English', 'Japanese', 'Chinese'],
157
  type='value',
158
- value=None,
159
  label="Language")
160
  duration_rate_input = gr.Slider(minimum=0.8,
161
  maximum=1.2,
@@ -171,15 +186,14 @@ def prepare_input():
171
  value='None',
172
  label="Select a background image/video ...")
173
 
174
- return [text_input, lang_input, duration_rate_input,
175
- action_input, background_input]
176
 
177
 
178
  def prepare_output():
179
  toxicity_output = gr.Label(num_top_classes=1, label="Toxicity (from Perspective API)")
180
- translation_result_otuput = gr.Textbox(type="str", label="Translation Result")
181
  video_output = gr.Video(format='mp4')
182
- return [toxicity_output, translation_result_otuput, video_output]
183
 
184
 
185
  def parse_args():
@@ -197,6 +211,5 @@ def parse_args():
197
  if __name__ == '__main__':
198
  args = parse_args()
199
 
200
- gradio_application = GradioApplication(args.rest_ip, args.rest_port, args.max_seed)
201
- gradio_application.run(server_port=args.gradio_port, share=args.share)
202
-
17
  from pathlib import Path
18
  import argparse
19
  import threading
 
20
 
21
+ from utils import get_snippet_from_url
 
22
 
23
 
24
  class GradioApplication:
25
+ def __init__(self, rest_ip, rest_port, max_seed, server_port=7860, share=False):
26
  self.lang_list = {
27
  'ko': 'ko_KR',
28
  'en': 'en_US',
40
  self.translator = Translator()
41
  self.rest_application = RestAPIApplication(rest_ip, rest_port)
42
  self.output_dir = Path("output_file")
43
+
 
 
 
 
 
 
 
 
 
 
 
44
  self.max_seed = max_seed
45
  self._file_seed = 0
46
  self.lock = threading.Lock()
47
+
48
+ with gr.Blocks(
49
+ theme="deepkyu/compact-theme",
50
+ css=get_snippet_from_url("https://huggingface.co/spaces/deepkyu/compact-theme/raw/main/main.css")
51
+ ) as demo:
52
+ with gr.Row(equal_height=True):
53
+ with gr.Column(scale=8):
54
+ gr.Markdown(Path("docs/title.md").read_text(), sanitize_html=False)
55
+ with gr.Column(scale=1):
56
+ toggle_dark = gr.Button(value="Dark", variant='stop')
57
+
58
+ toggle_dark.click(
59
+ None,
60
+ js="""
61
+ () => {
62
+ document.body.classList.toggle('dark');
63
+ }
64
+ """,
65
+ )
66
+ gr.Markdown( Path("docs/description.md").read_text(), sanitize_html=False)
67
+ with gr.Row(equal_height=True):
68
+ with gr.Column(scale=1):
69
+ text_input, lang_input, duration_rate_input, action_input, background_input = prepare_input()
70
+ submit_button = gr.Button(value="Run", variant="primary")
71
+ with gr.Column(scale=1):
72
+ toxicity_output, translation_result_otuput, video_output = prepare_output()
73
+
74
+ submit_button.click(
75
+ fn=self.infer,
76
+ inputs=[text_input, lang_input, duration_rate_input, action_input, background_input],
77
+ outputs=[toxicity_output, translation_result_otuput, video_output],
78
+ )
79
+
80
+ gr.Markdown(Path("docs/article.md").read_text(), sanitize_html=False)
81
+
82
+ demo.queue().launch(share=share, server_port=server_port)
83
 
84
 
85
  def _get_file_seed(self):
135
  try:
136
  target_text, lang_dest = self.translator.get_translation(text, lang)
137
  except Exception as e:
138
+ raise e
139
  target_text = ""
140
  lang_dest = ""
141
  detail = f"Error from language translation: ({e})"
161
 
162
  return self.return_format(toxicity_prob, target_text, lang_dest, video_filename)
163
 
 
 
 
 
 
 
 
 
 
164
 
165
  def prepare_input():
166
  text_input = gr.Textbox(lines=2,
170
  label="Text")
171
  lang_input = gr.Radio(['Korean', 'English', 'Japanese', 'Chinese'],
172
  type='value',
173
+ value='Korean',
174
  label="Language")
175
  duration_rate_input = gr.Slider(minimum=0.8,
176
  maximum=1.2,
186
  value='None',
187
  label="Select a background image/video ...")
188
 
189
+ return text_input, lang_input, duration_rate_input, action_input, background_input
 
190
 
191
 
192
  def prepare_output():
193
  toxicity_output = gr.Label(num_top_classes=1, label="Toxicity (from Perspective API)")
194
+ translation_result_otuput = gr.Textbox(type="text", label="Translation Result")
195
  video_output = gr.Video(format='mp4')
196
+ return toxicity_output, translation_result_otuput, video_output
197
 
198
 
199
  def parse_args():
211
  if __name__ == '__main__':
212
  args = parse_args()
213
 
214
+ gradio_application = GradioApplication(args.rest_ip, args.rest_port, args.max_seed,
215
+ server_port=args.gradio_port, share=args.share)
 
app.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
1
+ export REST_IP=146.235.222.52
2
+ export SERVICE_PORT=8080
3
+ export TRANSLATION_APIKEY_URL="https://www.dropbox.com/s/4v75y419u2m7pvt/cvpr-2022-demonstration-aae517bc44e5.json"
4
+ export GOOGLE_APPLICATION_CREDENTIALS=./cvpr-2022-demonstration-aae517bc44e5.json
5
+ export PERSPECTIVE_API_KEY=AIzaSyD9bQ6lEZPqPGdWjrWXoG1aEQ_bt85MH18
6
+ python app.py
cvpr-2022-demonstration-aae517bc44e5.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "service_account",
3
+ "project_id": "cvpr-2022-demonstration",
4
+ "private_key_id": "aae517bc44e5e5aa00d26f27328d1d4bd37a47ae",
5
+ "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQC3h0I+H5wNu6S2\n8kRF0QtYwOhI+miiy4SB/dmnNZGqi/vh01grMlc5VzeieLy06usWxmdIe95x+XRj\nP4NzUDEtr1XD8DJLR57nDDQDVP/28g/HBKZT6vV1qNkkF1NfWtiNpuwWHnt/4lyR\nuirDsQHwSecogYWxxUqBbC7b+7UHMuyRU8AF+lneHmbvLjkKrdJ4RrZCO7Invcju\nI78Jkbv0KkoQ7vRRLjzrrcfFR1LrC6LEpXQr+7Px13Qu16ZoXdFmkRhsRqSDBKBA\n26+RSdm6mwWgegCT3jkbDwu/CZZSKrY6jHpDh+NEapu/kB4n/yxeQ/qRJg6uQ66+\nXAItiSqhAgMBAAECggEABdKbgiuQEKp5oxlEG7/yX3Y9WGVjcPjKgzJJ18F+Fd5b\nlzJUm4g9FrNi7ztr50SFwWL5j2XorI6IyAZRlhS+/q9PpggGnHdQ/g0X0Fw/w7ax\n0/NckJCFiZ5aB0hbK40PDk1zs47zhZ7pp3Dz3uIMiPJbTGBxnK6A+uKzCOUvPONt\npSlRpr0+zPaAAVAf1dhGhJFv0oMWuY5C42SPP7zdugWU7NAb/+nyscSVJtaMl1c9\nSC8ZhtLLbpuqWuV9KY7NHrmEDUeSaP6XYTEogCjGJN3tHhE/Pp7wc1Q+Q8Qq8cEf\nxKDOLGKEQ5mmGBoU6T2X8+W0pqsCHWVk4cus6Jo2fQKBgQDwBKgOy2P2QOOr7lG/\nf7X2L++8+cqnTm8uhkF8p5x6yA/qhkAauCFeSaM58G5vrGHVURmzC5pIWulFdIl4\no7MtM5tb9SJOFoYOMCG5i2RFje8a3UMbjbJ7qWwquG0g22wxIzp5Zp85ATxDyJ2d\n1GnBK4iYzDBNLBa5qj1wm+qOpQKBgQDDv62XzBFpxn/l14HeuLmT99LwjDM00MB3\na51bAEcid6F5gKT2OH+ynw1dbHonj1QgMsSUaYF+IZ1Tv7YOHsu9OGQsOSKXLWzj\nKtqwBS+Rcs+vir/MQx/B92I6kIvPwIwJkcFkGydwSebg5/9GBQgc/IoBGpoealHc\nB+R95ZLHTQKBgQCtK27lXbF9lkutXID2nXn3aZaazc5874YgFxVgjr3DiFyTZNET\ndg39LTvrLaFASSs88QqNt61UzuLHDdGxjMpVropEypQ5qt6Flgx/BicOV91PkJw+\nYPmJZy93kyCJOEbTHQuvU29FfbKxZSjzGrJCgNpJA+lFhK3QvxcdDCErUQKBgD8W\njdo4mPwgT6RCGscvfhAkdUW1yeMhzIYwltx3cHW5XL+OKJx6hR2KiYIsrgoF9bUS\n56x1fJisOMp/JSvT3RI1FIP0PiO/LjLg4u6MHVKhUDJhY4NvttKK6ou5fnYMtpV2\n9n9PCRz1lIWz/+APSxgchFXqvvVCivOBT7ELxoyZAoGAaaS+GwZJWC3VX8Uv3GLy\nY3ss1xIVsMCldar/jH0D2ut2kdVRiqurqshF+I/4CwCe+HfU0Q1OE1fTF5nzJPJ6\nnRqXOd4Y4MMh7ZsljOMAz2BzJBCm6fXDtEFUDt2N5uWMfhWJq6UE5WmuCY9WHaxl\nL/Ywk/jgBUKkANvR/zHF4UQ=\n-----END PRIVATE KEY-----\n",
6
+ "client_email": "cvpr-2022-demo-translation@cvpr-2022-demonstration.iam.gserviceaccount.com",
7
+ "client_id": "104977058857183512162",
8
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
9
+ "token_uri": "https://oauth2.googleapis.com/token",
10
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
11
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/cvpr-2022-demo-translation%40cvpr-2022-demonstration.iam.gserviceaccount.com"
12
+ }
docs/article.md CHANGED
@@ -1,5 +1,5 @@
1
 
2
- ## Why learn a new language, when your model can learn it for you?
3
 
4
  <div style="max-width: 720px;max-height: 405px;margin: auto;">
5
  <div style="float: none;clear: both;position: relative;padding-bottom: 56.25%;height: 0;width: 100%">
@@ -10,14 +10,24 @@
10
 
11
  ### Abstract
12
 
13
- Recent studies in talking face generation have focused on building a train-once-use-everywhere model i.e. a model that will generalize from any source speech to any target identity. A number of works have already claimed this functionality and have added that their models will also generalize to any language. However, we show, using languages from different language families, that these models do not translate well when the training language and the testing language are sufficiently different. We reduce the scope of the problem to building a language-robust talking face generation system on seen identities i.e. the target identity is the same as the training identity. In this work, we introduce a talking face generation system that will generalize to different languages. We evaluate the efficacy of our system using a multilingual text-to-speech system. We also discuss the usage of joint text-to-speech system and the talking face generation system as a neural dubber system.
14
 
15
- [CVPR Open Access](https://openaccess.thecvf.com/content/CVPR2022/html/Song_Talking_Face_Generation_With_Multilingual_TTS_CVPR_2022_paper.html) [arXiv](https://arxiv.org/abs/2205.06421)
16
 
17
- ### News
18
 
19
  (2022.08.18.) We got the CVPR Hugging Face prize! Thank you all and special thanks to AK([@akhaliq](https://huggingface.co/akhaliq)).
20
 
21
  <center>
22
  <img alt="we-got-huggingface-prize" src="https://github.com/deepkyu/ml-talking-face/blob/main/docs/we-got-huggingface-prize.jpeg?raw=true" width="50%" />
23
- </center>
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+ <!-- ## Why learn a new language, when your model can learn it for you?
3
 
4
  <div style="max-width: 720px;max-height: 405px;margin: auto;">
5
  <div style="float: none;clear: both;position: relative;padding-bottom: 56.25%;height: 0;width: 100%">
10
 
11
  ### Abstract
12
 
13
+ Recent studies in talking face generation have focused on building a train-once-use-everywhere model i.e. a model that will generalize from any source speech to any target identity. A number of works have already claimed this functionality and have added that their models will also generalize to any language. However, we show, using languages from different language families, that these models do not translate well when the training language and the testing language are sufficiently different. We reduce the scope of the problem to building a language-robust talking face generation system on seen identities i.e. the target identity is the same as the training identity. In this work, we introduce a talking face generation system that will generalize to different languages. We evaluate the efficacy of our system using a multilingual text-to-speech system. We also discuss the usage of joint text-to-speech system and the talking face generation system as a neural dubber system. -->
14
 
 
15
 
16
+ ## News
17
 
18
  (2022.08.18.) We got the CVPR Hugging Face prize! Thank you all and special thanks to AK([@akhaliq](https://huggingface.co/akhaliq)).
19
 
20
  <center>
21
  <img alt="we-got-huggingface-prize" src="https://github.com/deepkyu/ml-talking-face/blob/main/docs/we-got-huggingface-prize.jpeg?raw=true" width="50%" />
22
+ </center>
23
+
24
+ <br/>
25
+
26
+ (2023.10.20.) It has been a year since the demonstration was suddenly shut down by MINDsLab (now MAUM.AI).
27
+ And today, I'm happy to share that I have restored the demonstration in my own Lambda Labs instance!
28
+ Over the past year, there have been numerous advancements in Gen AI, including multilingual TTS and talking face generation.
29
+ This demo may seem "old-fashioned" by now... but I hope it helps other researchers taking a journey in the same field.
30
+
31
+ Now I'm running this on an A10G instance from Lambda Labs at my own expense... I'm sorry, but I don't know when it will shut down again. 😵‍💫 I'll keep you posted on the status.
32
+
33
+ <center><a href="https://www.buymeacoffee.com/deepkyu" target="_blank"><img src="https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png" alt="Buy Me A Coffee" style="height: 35px !important;width: 160px !important;" ></a></center>
docs/description.md CHANGED
@@ -1,26 +1,18 @@
 
 
1
  This system generates a talking face video based on the input text.
2
  You can provide the input text in one of the four languages: Chinese (Mandarin), English, Japanese, and Korean.
3
  You may also select the target language, the language of the output speech.
4
  If the input text language and the target language are different, the input text will be translated to the target language using Google Translate API.
5
 
6
- ### Updates
7
-
8
- (2023.10.20.) It has been a year since the demonstration has suddenly shut down by MINDsLab (MAUM.AI for now).
9
- And today, I'm happy to share that ⭐I have restored the demonstration⭐ in my own lambdalabs instance!
10
- Over the past year, there have been numerous advancements in Gen AI, including multilingual TTS and talking face generation.
11
- This demo may become "old-fashioned" at this time 😅... but I hope that it would help other researchers taking a journey in the same field.
12
-
13
- ⚠️By the way, I'm using A10G instance from lambdalabs with my own expense... I'm sorry, but I don't know when it will shut down again. 😵‍💫 I'll keep you posted on the status.
14
-
15
- <center><a href="https://www.buymeacoffee.com/deepkyu" target="_blank"><img src="https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png" alt="Buy Me A Coffee" style="height: 35px !important;width: 160px !important;" ></a></center>
16
-
17
 
18
- (2022.06.17.) Thank you for visiting our demo!😊 This demo attracted a lot more attention than we anticipated. This, unfortunately, means that the computational burden is heavier than this demo was designed for. So, to maximize everyone's experience, we capped the length of the translated texts at:
19
 
20
  - 200 characters for English
21
  - 100 characters for Chinese, Japanese, and Korean.
22
 
23
- (2022.06.17.) We were originally planning to support any input text. However, when checking the logs recently, we found that there were a lot of inappropriate input texts. So, we decided to filter the inputs based on toxicity using [Perspective API @Google](https://developers.perspectiveapi.com/s/). Now, if you enter a possibily toxic text, the video generation will fail. We hope you understand.
24
 
25
  (2022.06.05.) Due to the latency from HuggingFace Spaces and video rendering, it takes 15 ~ 30 seconds to get a video result.
26
 
1
+ <br/>
2
+
3
  This system generates a talking face video based on the input text.
4
  You can provide the input text in one of the four languages: Chinese (Mandarin), English, Japanese, and Korean.
5
  You may also select the target language, the language of the output speech.
6
  If the input text language and the target language are different, the input text will be translated to the target language using Google Translate API.
7
 
8
+ ## Updates
 
 
 
 
 
 
 
 
 
 
9
 
10
+ (2022.06.17.) To maximize everyone's experience, we capped the length of the translated texts at:
11
 
12
  - 200 characters for English
13
  - 100 characters for Chinese, Japanese, and Korean.
14
 
15
+ Also, we found that there were a lot of inappropriate input texts. We decided to filter the inputs based on toxicity using [Perspective API @Google](https://developers.perspectiveapi.com/s/). Now, if you enter a possibly toxic text, the video generation will fail. We hope you understand.
16
 
17
  (2022.06.05.) Due to the latency from HuggingFace Spaces and video rendering, it takes 15 ~ 30 seconds to get a video result.
18
 
docs/title.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
1
+ # Talking Face Generation with Multilingual TTS
2
+
3
+ <center>
4
+ <a href="https://openaccess.thecvf.com/content/CVPR2022/html/Song_Talking_Face_Generation_With_Multilingual_TTS_CVPR_2022_paper.html" target="_blank">CVPR 2022 Demo Track</a> |
5
+ <a href="https://www.youtube.com/watch?v=toqdD1F_ZsU" target="_blank">Video</a>
6
+ </center>
docs/title.txt DELETED
@@ -1 +0,0 @@
1
- Talking Face Generation with Multilingual TTS (CVPR 2022 Demo Track)
 
requirements.txt CHANGED
@@ -4,4 +4,5 @@ googletrans==4.0.0-rc1
4
  PyYAML
5
  opencv-python
6
  google-cloud-translate
7
- google-api-python-client
 
4
  PyYAML
5
  opencv-python
6
  google-cloud-translate
7
+ google-api-python-client
8
+ httpx==0.25.0
utils.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from urllib import request
3
+
4
+ def get_snippet_from_url(url: str) -> str:
5
+ response = request.urlopen(url)
6
+ data = response.read().decode()
7
+ return data
8
+
9
+ def get_snippet_from_file(filepath: str) -> str:
10
+ return Path(filepath).read_text()