update
Browse files- .gitattributes +1 -0
- app.py +34 -12
- cached_outputs/0.wav +0 -0
- cached_outputs/1.wav +0 -0
- cached_outputs/2.wav +0 -0
- cached_outputs/3.wav +0 -0
- cached_outputs/4.wav +0 -0
- hash_code_for_cached_output.py +56 -0
- openvoicev2.mp4 +3 -0
- requirements.txt +2 -1
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -6,7 +6,8 @@ import base64
|
|
6 |
import json
|
7 |
import time
|
8 |
import re
|
9 |
-
|
|
|
10 |
|
11 |
|
12 |
API_URL = os.environ.get("API_URL")
|
@@ -60,6 +61,24 @@ def predict(prompt, style, audio_file_pth, speed, agree):
|
|
60 |
None,
|
61 |
)
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
# first detect the input language
|
64 |
language_predicted = langid.classify(prompt)[0].strip()
|
65 |
print(f"Detected language:{language_predicted}")
|
@@ -224,22 +243,25 @@ examples = [
|
|
224 |
"examples/speaker3.mp3",
|
225 |
True,
|
226 |
],
|
227 |
-
|
228 |
]
|
229 |
|
230 |
with gr.Blocks(analytics_enabled=False) as demo:
|
231 |
|
232 |
with gr.Row():
|
233 |
-
gr.
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
|
|
|
|
|
|
|
|
243 |
with gr.Row():
|
244 |
gr.HTML(wrapped_markdown_content)
|
245 |
|
|
|
6 |
import json
|
7 |
import time
|
8 |
import re
|
9 |
+
import hashlib
|
10 |
+
import hash_code_for_cached_output
|
11 |
|
12 |
|
13 |
API_URL = os.environ.get("API_URL")
|
|
|
61 |
None,
|
62 |
)
|
63 |
|
64 |
+
# Before we get into inference, we will detect if it is from example table or default value
|
65 |
+
# If so, we use a cached Audio. Noted that, it is just for demo efficiency.
|
66 |
+
# hash code were generated by `hash_code_for_cached_output.py`
|
67 |
+
cached_outputs = {
|
68 |
+
"d0f5806f6e_60565a5c20_en_us" : "cached_outputs/0.wav",
|
69 |
+
"d0f5806f6e_420ab8211d_en_us" : "cached_outputs/1.wav",
|
70 |
+
"6e8a024342_0f96bf44f5_es_default" : "cached_outputs/2.wav",
|
71 |
+
"54ad3237d7_3fef5adc6f_zh_default" : "cached_outputs/3.wav",
|
72 |
+
"8190e911f8_9897b60a4e_jp_default" : "cached_outputs/4.wav"
|
73 |
+
}
|
74 |
+
unique_code = hash_code_for_cached_output.get_unique_code(audio_file_pth, prompt, style)
|
75 |
+
if unique_code in list(cached_outputs.keys()):
|
76 |
+
return (
|
77 |
+
'We get the cached output for you, since you are try to generating an example cloning.',
|
78 |
+
cached_outputs[unique_code],
|
79 |
+
audio_file_pth,
|
80 |
+
)
|
81 |
+
|
82 |
# first detect the input language
|
83 |
language_predicted = langid.classify(prompt)[0].strip()
|
84 |
print(f"Detected language:{language_predicted}")
|
|
|
243 |
"examples/speaker3.mp3",
|
244 |
True,
|
245 |
],
|
|
|
246 |
]
|
247 |
|
248 |
with gr.Blocks(analytics_enabled=False) as demo:
|
249 |
|
250 |
with gr.Row():
|
251 |
+
with gr.Column():
|
252 |
+
with gr.Row():
|
253 |
+
gr.Markdown(
|
254 |
+
"""
|
255 |
+
## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
|
256 |
+
"""
|
257 |
+
)
|
258 |
+
with gr.Row():
|
259 |
+
gr.Markdown(markdown_table_v2)
|
260 |
+
with gr.Row():
|
261 |
+
gr.Markdown(description)
|
262 |
+
with gr.Column():
|
263 |
+
gr.Video('./openvoicev2.mp4', autoplay=True)
|
264 |
+
|
265 |
with gr.Row():
|
266 |
gr.HTML(wrapped_markdown_content)
|
267 |
|
cached_outputs/0.wav
ADDED
Binary file (36.9 kB). View file
|
|
cached_outputs/1.wav
ADDED
Binary file (20.4 kB). View file
|
|
cached_outputs/2.wav
ADDED
Binary file (37.5 kB). View file
|
|
cached_outputs/3.wav
ADDED
Binary file (41.3 kB). View file
|
|
cached_outputs/4.wav
ADDED
Binary file (40.1 kB). View file
|
|
hash_code_for_cached_output.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydub.utils import mediainfo
|
2 |
+
import hashlib
|
3 |
+
|
4 |
+
def audio_hash(audio_path):
    """Return the first 10 hex characters of the SHA-256 of the file at *audio_path*.

    Used to build a short, stable fingerprint of a reference-speaker audio
    file so demo inputs can be matched against pre-computed cached outputs.

    Args:
        audio_path: Path to the audio file to fingerprint.

    Returns:
        A 10-character lowercase hex string.
    """
    hasher = hashlib.sha256()
    # Stream the file in chunks instead of reading it all at once, so large
    # audio files do not have to fit in memory. (Also avoids the original's
    # local variable shadowing the function name.)
    with open(audio_path, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            hasher.update(chunk)
    return hasher.hexdigest()[:10]
|
12 |
+
|
13 |
+
def str_to_hash(input_str):
    """Return the first 10 hex characters of the SHA-256 digest of *input_str*.

    The string is encoded as UTF-8 before hashing.
    """
    digest = hashlib.sha256(input_str.encode('utf-8')).hexdigest()
    return digest[:10]
|
20 |
+
|
21 |
+
def get_unique_code(reference_speaker, text, language):
    """Compose the cache-lookup key: ``<audio hash>_<text hash>_<language>``.

    Args:
        reference_speaker: Path to the reference-speaker audio file.
        text: The prompt text to be synthesized.
        language: Language/style tag appended verbatim (e.g. ``en_us``).
    """
    parts = (audio_hash(reference_speaker), str_to_hash(text), language)
    return "_".join(parts)
|
23 |
+
|
24 |
+
if __name__ == '__main__':
    # Regenerate the cache keys used by app.py's `cached_outputs` table.
    # Each entry: (reference speaker file, prompt text, language tag).
    demo_cases = [
        ("examples/speaker0.mp3",
         "The bustling city square bustled with street performers, tourists, and local vendors.",
         'en_us'),
        ("examples/speaker0.mp3",
         "Did you ever hear a folk tale about a giant turtle?",
         'en_us'),
        ("examples/speaker1.mp3",
         "El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.",
         'es_default'),
        ("examples/speaker2.mp3",
         "我最近在学习machine learning,希望能够在未来的artificial intelligence领域有所建树。",
         'zh_default'),
        ("examples/speaker3.mp3",
         "彼は毎朝ジョギングをして体を健康に保っています。",
         'jp_default'),
    ]
    for speaker_path, prompt_text, language_tag in demo_cases:
        print(get_unique_code(speaker_path, prompt_text, language_tag))
|
56 |
+
|
openvoicev2.mp4
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8e623abfdd5d858005d494b7c04c527927534a9a63ca0005739e40f097d6d75e
|
3 |
+
size 12042795
|
requirements.txt
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
langid
|
|
|
|
1 |
+
langid
|
2 |
+
pydub
|