deepkyu commited on
Commit
afbc1dd
1 Parent(s): 5d56d4a

Update theme, fix error

Browse files
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 👄
4
  colorFrom: blue
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 3.0.6
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-sa-4.0
4
  colorFrom: blue
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 4.13.0
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-sa-4.0
app.py CHANGED
@@ -17,14 +17,12 @@ from client_rest import RestAPIApplication
17
  from pathlib import Path
18
  import argparse
19
  import threading
20
- import yaml
21
 
22
- TITLE = Path("docs/title.txt").read_text()
23
- DESCRIPTION = Path("docs/description.md").read_text()
24
 
25
 
26
  class GradioApplication:
27
- def __init__(self, rest_ip, rest_port, max_seed):
28
  self.lang_list = {
29
  'ko': 'ko_KR',
30
  'en': 'en_US',
@@ -42,21 +40,46 @@ class GradioApplication:
42
  self.translator = Translator()
43
  self.rest_application = RestAPIApplication(rest_ip, rest_port)
44
  self.output_dir = Path("output_file")
45
-
46
- inputs = prepare_input()
47
- outputs = prepare_output()
48
-
49
- self.iface = gr.Interface(fn=self.infer,
50
- title=TITLE,
51
- description=DESCRIPTION,
52
- inputs=inputs,
53
- outputs=outputs,
54
- allow_flagging='never',
55
- article=Path("docs/article.md").read_text())
56
-
57
  self.max_seed = max_seed
58
  self._file_seed = 0
59
  self.lock = threading.Lock()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
 
62
  def _get_file_seed(self):
@@ -112,6 +135,7 @@ class GradioApplication:
112
  try:
113
  target_text, lang_dest = self.translator.get_translation(text, lang)
114
  except Exception as e:
 
115
  target_text = ""
116
  lang_dest = ""
117
  detail = f"Error from language translation: ({e})"
@@ -137,15 +161,6 @@ class GradioApplication:
137
 
138
  return self.return_format(toxicity_prob, target_text, lang_dest, video_filename)
139
 
140
- def run(self, server_port=7860, share=False):
141
- try:
142
- self.iface.launch(height=900,
143
- share=share, server_port=server_port,
144
- enable_queue=True)
145
-
146
- except KeyboardInterrupt:
147
- gr.close_all()
148
-
149
 
150
  def prepare_input():
151
  text_input = gr.Textbox(lines=2,
@@ -155,7 +170,7 @@ def prepare_input():
155
  label="Text")
156
  lang_input = gr.Radio(['Korean', 'English', 'Japanese', 'Chinese'],
157
  type='value',
158
- value=None,
159
  label="Language")
160
  duration_rate_input = gr.Slider(minimum=0.8,
161
  maximum=1.2,
@@ -171,15 +186,14 @@ def prepare_input():
171
  value='None',
172
  label="Select a background image/video ...")
173
 
174
- return [text_input, lang_input, duration_rate_input,
175
- action_input, background_input]
176
 
177
 
178
  def prepare_output():
179
  toxicity_output = gr.Label(num_top_classes=1, label="Toxicity (from Perspective API)")
180
- translation_result_otuput = gr.Textbox(type="str", label="Translation Result")
181
  video_output = gr.Video(format='mp4')
182
- return [toxicity_output, translation_result_otuput, video_output]
183
 
184
 
185
  def parse_args():
@@ -197,6 +211,5 @@ def parse_args():
197
  if __name__ == '__main__':
198
  args = parse_args()
199
 
200
- gradio_application = GradioApplication(args.rest_ip, args.rest_port, args.max_seed)
201
- gradio_application.run(server_port=args.gradio_port, share=args.share)
202
-
17
  from pathlib import Path
18
  import argparse
19
  import threading
 
20
 
21
+ from utils import get_snippet_from_url
 
22
 
23
 
24
  class GradioApplication:
25
+ def __init__(self, rest_ip, rest_port, max_seed, server_port=7860, share=False):
26
  self.lang_list = {
27
  'ko': 'ko_KR',
28
  'en': 'en_US',
40
  self.translator = Translator()
41
  self.rest_application = RestAPIApplication(rest_ip, rest_port)
42
  self.output_dir = Path("output_file")
43
+
 
 
 
 
 
 
 
 
 
 
 
44
  self.max_seed = max_seed
45
  self._file_seed = 0
46
  self.lock = threading.Lock()
47
+
48
+ with gr.Blocks(
49
+ theme="deepkyu/compact-theme",
50
+ css=get_snippet_from_url("https://huggingface.co/spaces/deepkyu/compact-theme/raw/main/main.css")
51
+ ) as demo:
52
+ with gr.Row(equal_height=True):
53
+ with gr.Column(scale=8):
54
+ gr.Markdown(Path("docs/title.md").read_text(), sanitize_html=False)
55
+ with gr.Column(scale=1):
56
+ toggle_dark = gr.Button(value="Dark", variant='stop')
57
+
58
+ toggle_dark.click(
59
+ None,
60
+ js="""
61
+ () => {
62
+ document.body.classList.toggle('dark');
63
+ }
64
+ """,
65
+ )
66
+ gr.Markdown( Path("docs/description.md").read_text(), sanitize_html=False)
67
+ with gr.Row(equal_height=True):
68
+ with gr.Column(scale=1):
69
+ text_input, lang_input, duration_rate_input, action_input, background_input = prepare_input()
70
+ submit_button = gr.Button(value="Run", variant="primary")
71
+ with gr.Column(scale=1):
72
+ toxicity_output, translation_result_otuput, video_output = prepare_output()
73
+
74
+ submit_button.click(
75
+ fn=self.infer,
76
+ inputs=[text_input, lang_input, duration_rate_input, action_input, background_input],
77
+ outputs=[toxicity_output, translation_result_otuput, video_output],
78
+ )
79
+
80
+ gr.Markdown(Path("docs/article.md").read_text(), sanitize_html=False)
81
+
82
+ demo.queue().launch(share=share, server_port=server_port)
83
 
84
 
85
  def _get_file_seed(self):
135
  try:
136
  target_text, lang_dest = self.translator.get_translation(text, lang)
137
  except Exception as e:
138
+ raise e
139
  target_text = ""
140
  lang_dest = ""
141
  detail = f"Error from language translation: ({e})"
161
 
162
  return self.return_format(toxicity_prob, target_text, lang_dest, video_filename)
163
 
 
 
 
 
 
 
 
 
 
164
 
165
  def prepare_input():
166
  text_input = gr.Textbox(lines=2,
170
  label="Text")
171
  lang_input = gr.Radio(['Korean', 'English', 'Japanese', 'Chinese'],
172
  type='value',
173
+ value='Korean',
174
  label="Language")
175
  duration_rate_input = gr.Slider(minimum=0.8,
176
  maximum=1.2,
186
  value='None',
187
  label="Select a background image/video ...")
188
 
189
+ return text_input, lang_input, duration_rate_input, action_input, background_input
 
190
 
191
 
192
  def prepare_output():
193
  toxicity_output = gr.Label(num_top_classes=1, label="Toxicity (from Perspective API)")
194
+ translation_result_otuput = gr.Textbox(type="text", label="Translation Result")
195
  video_output = gr.Video(format='mp4')
196
+ return toxicity_output, translation_result_otuput, video_output
197
 
198
 
199
  def parse_args():
211
  if __name__ == '__main__':
212
  args = parse_args()
213
 
214
+ gradio_application = GradioApplication(args.rest_ip, args.rest_port, args.max_seed,
215
+ server_port=args.gradio_port, share=args.share)
 
app.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
1
+ export REST_IP=146.235.222.52
2
+ export SERVICE_PORT=8080
3
+ export TRANSLATION_APIKEY_URL="https://www.dropbox.com/s/4v75y419u2m7pvt/cvpr-2022-demonstration-aae517bc44e5.json"
4
+ export GOOGLE_APPLICATION_CREDENTIALS=./cvpr-2022-demonstration-aae517bc44e5.json
5
+ export PERSPECTIVE_API_KEY=AIzaSyD9bQ6lEZPqPGdWjrWXoG1aEQ_bt85MH18
6
+ python app.py
cvpr-2022-demonstration-aae517bc44e5.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "service_account",
3
+ "project_id": "cvpr-2022-demonstration",
4
+ "private_key_id": "aae517bc44e5e5aa00d26f27328d1d4bd37a47ae",
5
+ "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQC3h0I+H5wNu6S2\n8kRF0QtYwOhI+miiy4SB/dmnNZGqi/vh01grMlc5VzeieLy06usWxmdIe95x+XRj\nP4NzUDEtr1XD8DJLR57nDDQDVP/28g/HBKZT6vV1qNkkF1NfWtiNpuwWHnt/4lyR\nuirDsQHwSecogYWxxUqBbC7b+7UHMuyRU8AF+lneHmbvLjkKrdJ4RrZCO7Invcju\nI78Jkbv0KkoQ7vRRLjzrrcfFR1LrC6LEpXQr+7Px13Qu16ZoXdFmkRhsRqSDBKBA\n26+RSdm6mwWgegCT3jkbDwu/CZZSKrY6jHpDh+NEapu/kB4n/yxeQ/qRJg6uQ66+\nXAItiSqhAgMBAAECggEABdKbgiuQEKp5oxlEG7/yX3Y9WGVjcPjKgzJJ18F+Fd5b\nlzJUm4g9FrNi7ztr50SFwWL5j2XorI6IyAZRlhS+/q9PpggGnHdQ/g0X0Fw/w7ax\n0/NckJCFiZ5aB0hbK40PDk1zs47zhZ7pp3Dz3uIMiPJbTGBxnK6A+uKzCOUvPONt\npSlRpr0+zPaAAVAf1dhGhJFv0oMWuY5C42SPP7zdugWU7NAb/+nyscSVJtaMl1c9\nSC8ZhtLLbpuqWuV9KY7NHrmEDUeSaP6XYTEogCjGJN3tHhE/Pp7wc1Q+Q8Qq8cEf\nxKDOLGKEQ5mmGBoU6T2X8+W0pqsCHWVk4cus6Jo2fQKBgQDwBKgOy2P2QOOr7lG/\nf7X2L++8+cqnTm8uhkF8p5x6yA/qhkAauCFeSaM58G5vrGHVURmzC5pIWulFdIl4\no7MtM5tb9SJOFoYOMCG5i2RFje8a3UMbjbJ7qWwquG0g22wxIzp5Zp85ATxDyJ2d\n1GnBK4iYzDBNLBa5qj1wm+qOpQKBgQDDv62XzBFpxn/l14HeuLmT99LwjDM00MB3\na51bAEcid6F5gKT2OH+ynw1dbHonj1QgMsSUaYF+IZ1Tv7YOHsu9OGQsOSKXLWzj\nKtqwBS+Rcs+vir/MQx/B92I6kIvPwIwJkcFkGydwSebg5/9GBQgc/IoBGpoealHc\nB+R95ZLHTQKBgQCtK27lXbF9lkutXID2nXn3aZaazc5874YgFxVgjr3DiFyTZNET\ndg39LTvrLaFASSs88QqNt61UzuLHDdGxjMpVropEypQ5qt6Flgx/BicOV91PkJw+\nYPmJZy93kyCJOEbTHQuvU29FfbKxZSjzGrJCgNpJA+lFhK3QvxcdDCErUQKBgD8W\njdo4mPwgT6RCGscvfhAkdUW1yeMhzIYwltx3cHW5XL+OKJx6hR2KiYIsrgoF9bUS\n56x1fJisOMp/JSvT3RI1FIP0PiO/LjLg4u6MHVKhUDJhY4NvttKK6ou5fnYMtpV2\n9n9PCRz1lIWz/+APSxgchFXqvvVCivOBT7ELxoyZAoGAaaS+GwZJWC3VX8Uv3GLy\nY3ss1xIVsMCldar/jH0D2ut2kdVRiqurqshF+I/4CwCe+HfU0Q1OE1fTF5nzJPJ6\nnRqXOd4Y4MMh7ZsljOMAz2BzJBCm6fXDtEFUDt2N5uWMfhWJq6UE5WmuCY9WHaxl\nL/Ywk/jgBUKkANvR/zHF4UQ=\n-----END PRIVATE KEY-----\n",
6
+ "client_email": "cvpr-2022-demo-translation@cvpr-2022-demonstration.iam.gserviceaccount.com",
7
+ "client_id": "104977058857183512162",
8
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
9
+ "token_uri": "https://oauth2.googleapis.com/token",
10
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
11
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/cvpr-2022-demo-translation%40cvpr-2022-demonstration.iam.gserviceaccount.com"
12
+ }
docs/article.md CHANGED
@@ -1,5 +1,5 @@
1
 
2
- ## Why learn a new language, when your model can learn it for you?
3
 
4
  <div style="max-width: 720px;max-height: 405px;margin: auto;">
5
  <div style="float: none;clear: both;position: relative;padding-bottom: 56.25%;height: 0;width: 100%">
@@ -10,14 +10,24 @@
10
 
11
  ### Abstract
12
 
13
- Recent studies in talking face generation have focused on building a train-once-use-everywhere model i.e. a model that will generalize from any source speech to any target identity. A number of works have already claimed this functionality and have added that their models will also generalize to any language. However, we show, using languages from different language families, that these models do not translate well when the training language and the testing language are sufficiently different. We reduce the scope of the problem to building a language-robust talking face generation system on seen identities i.e. the target identity is the same as the training identity. In this work, we introduce a talking face generation system that will generalize to different languages. We evaluate the efficacy of our system using a multilingual text-to-speech system. We also discuss the usage of joint text-to-speech system and the talking face generation system as a neural dubber system.
14
 
15
- [CVPR Open Access](https://openaccess.thecvf.com/content/CVPR2022/html/Song_Talking_Face_Generation_With_Multilingual_TTS_CVPR_2022_paper.html) [arXiv](https://arxiv.org/abs/2205.06421)
16
 
17
- ### News
18
 
19
  (2022.08.18.) We got the CVPR Hugging Face prize! Thank you all and special thanks to AK([@akhaliq](https://huggingface.co/akhaliq)).
20
 
21
  <center>
22
  <img alt="we-got-huggingface-prize" src="https://github.com/deepkyu/ml-talking-face/blob/main/docs/we-got-huggingface-prize.jpeg?raw=true" width="50%" />
23
- </center>
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+ <!-- ## Why learn a new language, when your model can learn it for you?
3
 
4
  <div style="max-width: 720px;max-height: 405px;margin: auto;">
5
  <div style="float: none;clear: both;position: relative;padding-bottom: 56.25%;height: 0;width: 100%">
10
 
11
  ### Abstract
12
 
13
+ Recent studies in talking face generation have focused on building a train-once-use-everywhere model i.e. a model that will generalize from any source speech to any target identity. A number of works have already claimed this functionality and have added that their models will also generalize to any language. However, we show, using languages from different language families, that these models do not translate well when the training language and the testing language are sufficiently different. We reduce the scope of the problem to building a language-robust talking face generation system on seen identities i.e. the target identity is the same as the training identity. In this work, we introduce a talking face generation system that will generalize to different languages. We evaluate the efficacy of our system using a multilingual text-to-speech system. We also discuss the usage of joint text-to-speech system and the talking face generation system as a neural dubber system. -->
14
 
 
15
 
16
+ ## News
17
 
18
  (2022.08.18.) We got the CVPR Hugging Face prize! Thank you all and special thanks to AK([@akhaliq](https://huggingface.co/akhaliq)).
19
 
20
  <center>
21
  <img alt="we-got-huggingface-prize" src="https://github.com/deepkyu/ml-talking-face/blob/main/docs/we-got-huggingface-prize.jpeg?raw=true" width="50%" />
22
+ </center>
23
+
24
+ <br/>
25
+
26
+ (2023.10.20.) It has been a year since the demonstration was suddenly shut down by MINDsLab (now MAUM.AI).
27
+ And today, I'm happy to share that I have restored the demonstration in my own Lambda Labs instance!
28
+ Over the past year, there have been numerous advancements in Gen AI, including multilingual TTS and talking face generation.
29
+ This demo may seem "old-fashioned" by now... but I hope it helps other researchers taking a journey in the same field.
30
+
31
+ Now I'm running this on an A10G instance from Lambda Labs at my own expense... I'm sorry, but I don't know when it will shut down again. 😵‍💫 I'll keep you posted on the status.
32
+
33
+ <center><a href="https://www.buymeacoffee.com/deepkyu" target="_blank"><img src="https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png" alt="Buy Me A Coffee" style="height: 35px !important;width: 160px !important;" ></a></center>
docs/description.md CHANGED
@@ -1,26 +1,18 @@
 
 
1
  This system generates a talking face video based on the input text.
2
  You can provide the input text in one of the four languages: Chinese (Mandarin), English, Japanese, and Korean.
3
  You may also select the target language, the language of the output speech.
4
  If the input text language and the target language are different, the input text will be translated to the target language using Google Translate API.
5
 
6
- ### Updates
7
-
8
- (2023.10.20.) It has been a year since the demonstration has suddenly shut down by MINDsLab (MAUM.AI for now).
9
- And today, I'm happy to share that ⭐I have restored the demonstration⭐ in my own lambdalabs instance!
10
- Over the past year, there have been numerous advancements in Gen AI, including multilingual TTS and talking face generation.
11
- This demo may become "old-fashioned" at this time 😅... but I hope that it would help other researchers taking a journey in the same field.
12
-
13
- ⚠️By the way, I'm using A10G instance from lambdalabs with my own expense... I'm sorry, but I don't know when it will shut down again. 😵‍💫 I'll keep you posted on the status.
14
-
15
- <center><a href="https://www.buymeacoffee.com/deepkyu" target="_blank"><img src="https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png" alt="Buy Me A Coffee" style="height: 35px !important;width: 160px !important;" ></a></center>
16
-
17
 
18
- (2022.06.17.) Thank you for visiting our demo!😊 This demo attracted a lot more attention than we anticipated. This, unfortunately, means that the computational burden is heavier than this demo was designed for. So, to maximize everyone's experience, we capped the length of the translated texts at:
19
 
20
  - 200 characters for English
21
  - 100 characters for Chinese, Japanese, and Korean.
22
 
23
- (2022.06.17.) We were originally planning to support any input text. However, when checking the logs recently, we found that there were a lot of inappropriate input texts. So, we decided to filter the inputs based on toxicity using [Perspective API @Google](https://developers.perspectiveapi.com/s/). Now, if you enter a possibily toxic text, the video generation will fail. We hope you understand.
24
 
25
  (2022.06.05.) Due to the latency from HuggingFace Spaces and video rendering, it takes 15 ~ 30 seconds to get a video result.
26
 
1
+ <br/>
2
+
3
  This system generates a talking face video based on the input text.
4
  You can provide the input text in one of the four languages: Chinese (Mandarin), English, Japanese, and Korean.
5
  You may also select the target language, the language of the output speech.
6
  If the input text language and the target language are different, the input text will be translated to the target language using Google Translate API.
7
 
8
+ ## Updates
 
 
 
 
 
 
 
 
 
 
9
 
10
+ (2022.06.17.) To maximize everyone's experience, we capped the length of the translated texts at:
11
 
12
  - 200 characters for English
13
  - 100 characters for Chinese, Japanese, and Korean.
14
 
15
+ Also, we found that there were a lot of inappropriate input texts. We decided to filter the inputs based on toxicity using [Perspective API @Google](https://developers.perspectiveapi.com/s/). Now, if you enter a possibly toxic text, the video generation will fail. We hope you understand.
16
 
17
  (2022.06.05.) Due to the latency from HuggingFace Spaces and video rendering, it takes 15 ~ 30 seconds to get a video result.
18
 
docs/title.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
1
+ # Talking Face Generation with Multilingual TTS
2
+
3
+ <center>
4
+ <a href="https://openaccess.thecvf.com/content/CVPR2022/html/Song_Talking_Face_Generation_With_Multilingual_TTS_CVPR_2022_paper.html" target="_blank">CVPR 2022 Demo Track</a> |
5
+ <a href="https://www.youtube.com/watch?v=toqdD1F_ZsU" target="_blank">Video</a>
6
+ </center>
docs/title.txt DELETED
@@ -1 +0,0 @@
1
- Talking Face Generation with Multilingual TTS (CVPR 2022 Demo Track)
 
requirements.txt CHANGED
@@ -4,4 +4,5 @@ googletrans==4.0.0-rc1
4
  PyYAML
5
  opencv-python
6
  google-cloud-translate
7
- google-api-python-client
 
4
  PyYAML
5
  opencv-python
6
  google-cloud-translate
7
+ google-api-python-client
8
+ httpx==0.25.0
utils.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from urllib import request
3
+
4
+ def get_snippet_from_url(url: str) -> str:
5
+ response = request.urlopen(url)
6
+ data = response.read().decode()
7
+ return data
8
+
9
+ def get_snippet_from_file(filepath: str) -> str:
10
+ return Path(filepath).read_text()