Spaces:
Sleeping
Sleeping
Yixuan Li
committed on
Commit
·
0509b90
1
Parent(s):
2176cd2
page optimization
Browse files
- app.py +72 -8
- wav/human/1.wav +3 -0
- wav/human/10.wav +3 -0
- wav/human/2.wav +3 -0
- wav/human/3.wav +3 -0
- wav/human/4.wav +3 -0
- wav/human/5.wav +3 -0
- wav/human/6.wav +3 -0
- wav/human/7.wav +3 -0
- wav/human/8.wav +3 -0
- wav/human/9.wav +3 -0
- wav/vits/1.wav +3 -0
- wav/vits/10.wav +3 -0
- wav/vits/2.wav +3 -0
- wav/vits/3.wav +3 -0
- wav/vits/4.wav +3 -0
- wav/vits/5.wav +3 -0
- wav/vits/6.wav +3 -0
- wav/vits/7.wav +3 -0
- wav/vits/8.wav +3 -0
- wav/vits/9.wav +3 -0
app.py
CHANGED
|
@@ -89,17 +89,22 @@ def infer(audio_path: str) -> str:
|
|
| 89 |
|
| 90 |
return output_file
|
| 91 |
|
| 92 |
-
with gr.Blocks(title="STAR Online Inference", theme=gr.themes.
|
| 93 |
gr.Markdown("# STAR: Speech-to-Audio Generation via Representation Learning")
|
| 94 |
|
| 95 |
gr.Markdown("""
|
| 96 |
<div style="text-align: left; padding: 10px;">
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
## 🗣️ Input
|
| 99 |
|
| 100 |
A brief input speech utterance for the overall audio scene.
|
| 101 |
|
| 102 |
-
> Example
|
| 103 |
|
| 104 |
### 🎙️ Input Speech Example
|
| 105 |
""")
|
|
@@ -109,15 +114,27 @@ A brief input speech utterance for the overall audio scene.
|
|
| 109 |
gr.Markdown("""
|
| 110 |
<div style="text-align: left; padding: 10px;">
|
| 111 |
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
""")
|
| 114 |
|
| 115 |
audio = gr.Audio(value="wav/audio.wav", label="Generated Audio Example", type="filepath")
|
| 116 |
|
| 117 |
gr.Markdown("""
|
|
|
|
|
|
|
| 118 |
</div>
|
|
|
|
| 119 |
---
|
|
|
|
| 120 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
""")
|
| 122 |
|
| 123 |
with gr.Column():
|
|
@@ -125,8 +142,55 @@ A brief input speech utterance for the overall audio scene.
|
|
| 125 |
btn = gr.Button("🎵Generate Audio!", variant="primary")
|
| 126 |
output_audio = gr.Audio(label="Generated Audio", type="filepath")
|
| 127 |
btn.click(fn=infer, inputs=input_audio, outputs=output_audio)
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
return output_file
|
| 91 |
|
| 92 |
+
with gr.Blocks(title="STAR Online Inference", theme=gr.themes.Glass()) as demo:
|
| 93 |
gr.Markdown("# STAR: Speech-to-Audio Generation via Representation Learning")
|
| 94 |
|
| 95 |
gr.Markdown("""
|
| 96 |
<div style="text-align: left; padding: 10px;">
|
| 97 |
+
|
| 98 |
+
## 📚️ Introduction
|
| 99 |
+
|
| 100 |
+
STAR is the first end-to-end speech-to-audio generation framework, designed to enhance efficiency and address error propagation inherent in cascaded systems.
|
| 101 |
+
Within this space, you have the opportunity to directly control our model through voice input, thereby generating the corresponding audio output.
|
| 102 |
+
|
| 103 |
## 🗣️ Input
|
| 104 |
|
| 105 |
A brief input speech utterance for the overall audio scene.
|
| 106 |
|
| 107 |
+
> Example:A cat meowing and young female speaking
|
| 108 |
|
| 109 |
### 🎙️ Input Speech Example
|
| 110 |
""")
|
|
|
|
| 114 |
gr.Markdown("""
|
| 115 |
<div style="text-align: left; padding: 10px;">
|
| 116 |
|
| 117 |
+
## 🎧️ Output
|
| 118 |
+
|
| 119 |
+
Capture both auditory events and scene cues and generate corresponding audio
|
| 120 |
+
|
| 121 |
+
### 🔊 Output Audio Example
|
| 122 |
""")
|
| 123 |
|
| 124 |
audio = gr.Audio(value="wav/audio.wav", label="Generated Audio Example", type="filepath")
|
| 125 |
|
| 126 |
gr.Markdown("""
|
| 127 |
+
<div style="text-align: left; padding: 10px;">
|
| 128 |
+
|
| 129 |
</div>
|
| 130 |
+
|
| 131 |
---
|
| 132 |
+
|
| 133 |
</div>
|
| 134 |
+
|
| 135 |
+
## 🛠️ Online Inference
|
| 136 |
+
|
| 137 |
+
You can upload your own samples, or try the quick examples provided below.
|
| 138 |
""")
|
| 139 |
|
| 140 |
with gr.Column():
|
|
|
|
| 142 |
btn = gr.Button("🎵Generate Audio!", variant="primary")
|
| 143 |
output_audio = gr.Audio(label="Generated Audio", type="filepath")
|
| 144 |
btn.click(fn=infer, inputs=input_audio, outputs=output_audio)
|
| 145 |
+
|
| 146 |
+
gr.Markdown("""
|
| 147 |
+
<div style="text-align: left; padding: 10px;">
|
| 148 |
+
|
| 149 |
+
## 🎯 Quick Examples
|
| 150 |
+
""")
|
| 151 |
|
| 152 |
+
with gr.Tabs():
|
| 153 |
+
with gr.Tab("VITS Generated Speech"):
|
| 154 |
+
gr.Markdown("| 🎧 Audio | 📝 Caption |\n|:--:|:--|")
|
| 155 |
+
gr.Examples(
|
| 156 |
+
examples=[
|
| 157 |
+
["wav/vits/1.wav", "A cat meowing and young female speaking"],
|
| 158 |
+
["wav/vits/2.wav", "Sustained industrial engine noise"],
|
| 159 |
+
["wav/vits/3.wav", "A woman talks and a baby whispers"],
|
| 160 |
+
["wav/vits/4.wav", "A man speaks followed by a toilet flush"],
|
| 161 |
+
["wav/vits/5.wav", "It is raining and thundering, and then a man speaks"],
|
| 162 |
+
["wav/vits/6.wav", "A man speaking as birds are chirping"],
|
| 163 |
+
["wav/vits/7.wav", "A muffled man talking as a goat baas before and after two goats baaing in the distance while wind blows into a microphone"],
|
| 164 |
+
["wav/vits/8.wav", "Birds chirping and a horse neighing"],
|
| 165 |
+
["wav/vits/9.wav", "Several church bells ringing"],
|
| 166 |
+
["wav/vits/10.wav", "A telephone rings with bell sounds"]
|
| 167 |
+
],
|
| 168 |
+
inputs=[input_audio, _],
|
| 169 |
+
label="Click examples below to try!",
|
| 170 |
+
cache_examples = False,
|
| 171 |
+
examples_per_page = 5,
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
with gr.Tab("Real human Speech"):
|
| 175 |
+
gr.Markdown("| 🎧 Audio | 📝 Caption |\n|:--:|:--|")
|
| 176 |
+
gr.Examples(
|
| 177 |
+
examples=[
|
| 178 |
+
["wav/human/1.wav", "A cat meowing and young female speaking"],
|
| 179 |
+
["wav/human/2.wav", "Sustained industrial engine noise"],
|
| 180 |
+
["wav/human/3.wav", "A woman talks and a baby whispers"],
|
| 181 |
+
["wav/human/4.wav", "A man speaks followed by a toilet flush"],
|
| 182 |
+
["wav/human/5.wav", "It is raining and thundering, and then a man speaks"],
|
| 183 |
+
["wav/human/6.wav", "A man speaking as birds are chirping"],
|
| 184 |
+
["wav/human/7.wav", "A muffled man talking as a goat baas before and after two goats baaing in the distance while wind blows into a microphone"],
|
| 185 |
+
["wav/human/8.wav", "Birds chirping and a horse neighing"],
|
| 186 |
+
["wav/human/9.wav", "Several church bells ringing"],
|
| 187 |
+
["wav/human/10.wav", "A telephone rings with bell sounds"]
|
| 188 |
+
],
|
| 189 |
+
inputs=[input_audio, _],
|
| 190 |
+
label="Click examples below to try!",
|
| 191 |
+
cache_examples = False,
|
| 192 |
+
examples_per_page = 5,
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
demo.launch()
|
wav/human/1.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3df72406f58c50f323b0b532fb55df443628e715f03af66bc11ccd15eb649f11
|
| 3 |
+
size 603726
|
wav/human/10.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fdd320764912961861756d32e93ffd173cd53d924a42e03c28ce7005154579e8
|
| 3 |
+
size 672846
|
wav/human/2.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:44b06c20cc32deb1c5088e13164e858f81452f061f71290062f79e7e3a23fe9b
|
| 3 |
+
size 649806
|
wav/human/3.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cf4cb6eebb04c6454ecc65cea8df46445d4996cdc36f15ff1f2ce5d177c3f90b
|
| 3 |
+
size 617550
|
wav/human/4.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64b8eb089de3be0d18bd767cba861b907042f34c9ac03907b258d36794ea3e0b
|
| 3 |
+
size 732750
|
wav/human/5.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:115cc7b13c4d79179de4901b2443a520de644374bfcdd6e7a6d97f8b254cd814
|
| 3 |
+
size 857166
|
wav/human/6.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c4fb1e9c2479fa9197969b8f23e3052d6225d437eb1300e1ae32511528303523
|
| 3 |
+
size 640590
|
wav/human/7.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d616cfa29f31f92c5f63faf8e225058b3e895ae9a4b32163f7f6889b1a52c679
|
| 3 |
+
size 1764942
|
wav/human/8.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4aa8af69c2dcf3dd9cdcfd514832de776256b4b34fec5f49d4e9f2329c1aa7f
|
| 3 |
+
size 649806
|
wav/human/9.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18c32b3183f51307b660a48d97285202fc849866dead7166f387c1f985770e41
|
| 3 |
+
size 488526
|
wav/vits/1.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d23cb7106c66ad61d2b9717daea77385883cf71772836a8c5d18b9496dbb8d5
|
| 3 |
+
size 130604
|
wav/vits/10.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09eb032c3b97cff6e64f65c28c3042b62c13c26027ed5cea0a879acae08de422
|
| 3 |
+
size 111660
|
wav/vits/2.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bd3847a5d10b320f0332986c7722c6379ae54fc619f856be08ccc9b9c5653f4f
|
| 3 |
+
size 106028
|
wav/vits/3.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8fe02704f94e440c78aaf17feee1a1810cd12c82f70e162433b70a374eac0853
|
| 3 |
+
size 116268
|
wav/vits/4.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9331107c335e4845cc3886e588cb09b0cde8f7e48280699f97bfa8b9cf8193cd
|
| 3 |
+
size 132652
|
wav/vits/5.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6ec5223af568eddbced78c5402bd02abe1339cef96de7b6bc3f11362963e170
|
| 3 |
+
size 163884
|
wav/vits/6.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:562bc9e9f0b8c3033b34d3d9f965393f071a97f6a03d673fe9605bc39694168f
|
| 3 |
+
size 101932
|
wav/vits/7.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1e4152c3394432b827841b1fafc994b1af3aad279025e74016c74073e9a6dc83
|
| 3 |
+
size 352300
|
wav/vits/8.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5d208a28653f8c57cbfec0f9db7d4caeaa53af18b377991b9e92e080ff9505e8
|
| 3 |
+
size 100396
|
wav/vits/9.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0531c9efae71fcbc1a1739ba76335ac8b413c01b5e73ceb40ff65289c42e75b
|
| 3 |
+
size 76332
|