XuminYu committed
Commit 90440c8
Parent: 692eee7
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
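This rule routes MP4 files through Git LFS: committing `openvoicev2.mp4` stores only the three-line `version`/`oid`/`size` pointer stub (visible near the bottom of this commit), while the ~12 MB video itself lives in LFS storage.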
app.py CHANGED
@@ -6,7 +6,8 @@ import base64
 import json
 import time
 import re
-
+import hashlib
+import hash_code_for_cached_output
 
 
 API_URL = os.environ.get("API_URL")
@@ -60,6 +61,24 @@ def predict(prompt, style, audio_file_pth, speed, agree):
             None,
         )
 
+    # Before running inference, check whether the inputs come straight from the
+    # example table or the default values; if so, serve a cached audio file.
+    # This is only for demo efficiency; hash codes were generated by `hash_code_for_cached_output.py`.
+    cached_outputs = {
+        "d0f5806f6e_60565a5c20_en_us": "cached_outputs/0.wav",
+        "d0f5806f6e_420ab8211d_en_us": "cached_outputs/1.wav",
+        "6e8a024342_0f96bf44f5_es_default": "cached_outputs/2.wav",
+        "54ad3237d7_3fef5adc6f_zh_default": "cached_outputs/3.wav",
+        "8190e911f8_9897b60a4e_jp_default": "cached_outputs/4.wav",
+    }
+    unique_code = hash_code_for_cached_output.get_unique_code(audio_file_pth, prompt, style)
+    if unique_code in cached_outputs:
+        return (
+            "We returned a cached output for you, since you are generating one of the example clones.",
+            cached_outputs[unique_code],
+            audio_file_pth,
+        )
+
     # first detect the input language
     language_predicted = langid.classify(prompt)[0].strip()
     print(f"Detected language:{language_predicted}")
@@ -224,22 +243,25 @@ examples = [
         "examples/speaker3.mp3",
         True,
     ],
-
 ]
 
 with gr.Blocks(analytics_enabled=False) as demo:
 
     with gr.Row():
-        gr.Markdown(
-            """
-            ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="10"/>
-            """
-        )
-    with gr.Row():
-        gr.Markdown(markdown_table)
-    with gr.Row():
-        gr.Markdown(description)
-
+        with gr.Column():
+            with gr.Row():
+                gr.Markdown(
+                    """
+                    ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
+                    """
+                )
+            with gr.Row():
+                gr.Markdown(markdown_table_v2)
+            with gr.Row():
+                gr.Markdown(description)
+        with gr.Column():
+            gr.Video('./openvoicev2.mp4', autoplay=True)
+
     with gr.Row():
         gr.HTML(wrapped_markdown_content)
 
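The cache short-circuit added to `predict()` above is easy to exercise outside Gradio. Below is a minimal sketch, assuming the repo layout from this commit (`hash_code_for_cached_output.py`, `cached_outputs/`, and the `examples/` audio next to `app.py`); `lookup_cached` is a hypothetical helper name, not part of the commit:

```python
# Sketch of the cache lookup in predict(); `lookup_cached` is hypothetical.
import hash_code_for_cached_output

CACHED_OUTPUTS = {
    "d0f5806f6e_60565a5c20_en_us": "cached_outputs/0.wav",
    "d0f5806f6e_420ab8211d_en_us": "cached_outputs/1.wav",
    "6e8a024342_0f96bf44f5_es_default": "cached_outputs/2.wav",
    "54ad3237d7_3fef5adc6f_zh_default": "cached_outputs/3.wav",
    "8190e911f8_9897b60a4e_jp_default": "cached_outputs/4.wav",
}

def lookup_cached(audio_file_pth, prompt, style):
    # Key layout: <audio sha256[:10]>_<text sha256[:10]>_<style>
    key = hash_code_for_cached_output.get_unique_code(audio_file_pth, prompt, style)
    return CACHED_OUTPUTS.get(key)  # None -> fall through to real inference

if __name__ == "__main__":
    hit = lookup_cached(
        "examples/speaker0.mp3",
        "Did you ever hear a folk tale about a giant turtle?",
        "en_us",
    )
    print(hit)  # presumably cached_outputs/1.wav, matching the second example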
cached_outputs/0.wav ADDED
Binary file (36.9 kB).
 
cached_outputs/1.wav ADDED
Binary file (20.4 kB).
 
cached_outputs/2.wav ADDED
Binary file (37.5 kB).
 
cached_outputs/3.wav ADDED
Binary file (41.3 kB).
 
cached_outputs/4.wav ADDED
Binary file (40.1 kB).
 
hash_code_for_cached_output.py ADDED
@@ -0,0 +1,56 @@
+from pydub.utils import mediainfo
+import hashlib
+
+def audio_hash(audio_path):
+    with open(audio_path, "rb") as f:
+        audio_data = f.read()
+    hash_object = hashlib.sha256()
+    hash_object.update(audio_data)
+    audio_hash = hash_object.hexdigest()
+
+    return audio_hash[:10]
+
+def str_to_hash(input_str):
+    input_bytes = input_str.encode('utf-8')
+    hash_object = hashlib.sha256()
+    hash_object.update(input_bytes)
+    hash_code = hash_object.hexdigest()
+
+    return hash_code[:10]
+
+def get_unique_code(reference_speaker, text, language):
+    return f"{audio_hash(reference_speaker)}_{str_to_hash(text)}_{language}"
+
+if __name__ == '__main__':
+
+    example_inputs = [
+        {
+            "text": "The bustling city square bustled with street performers, tourists, and local vendors.",
+            "language": 'en_us',
+            "reference_speaker": "examples/speaker0.mp3"
+        },
+        {
+            "text": "Did you ever hear a folk tale about a giant turtle?",
+            "language": 'en_us',
+            "reference_speaker": "examples/speaker0.mp3"
+        },
+        {
+            "text": "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
+            "language": 'es_default',
+            "reference_speaker": "examples/speaker1.mp3",
+        },
+        {
+            "text": "我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。",
+            "language": 'zh_default',
+            "reference_speaker": "examples/speaker2.mp3",
+        },
+        {
+            "text": "彼は毎朝ジョギングをして体を健康に保っています。",
+            "language": 'jp_default',
+            "reference_speaker": "examples/speaker3.mp3",
+        }
+    ]
+
+    for example_input in example_inputs:
+        print(get_unique_code(example_input['reference_speaker'], example_input['text'], example_input['language']))
+
openvoicev2.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e623abfdd5d858005d494b7c04c527927534a9a63ca0005739e40f097d6d75e
+size 12042795
requirements.txt CHANGED
@@ -1 +1,2 @@
-langid
+langid
+hashlib