Yixuan Li committed on
Commit
0509b90
·
1 Parent(s): 2176cd2

page optimization

Browse files
app.py CHANGED
@@ -89,17 +89,22 @@ def infer(audio_path: str) -> str:
89
 
90
  return output_file
91
 
92
- with gr.Blocks(title="STAR Online Inference", theme=gr.themes.Soft()) as demo:
93
  gr.Markdown("# STAR: Speech-to-Audio Generation via Representation Learning")
94
 
95
  gr.Markdown("""
96
  <div style="text-align: left; padding: 10px;">
97
-
 
 
 
 
 
98
  ## 🗣️ Input
99
 
100
  A brief input speech utterance for the overall audio scene.
101
 
102
- > Example:A cat meowing and young female speaking
103
 
104
  ### 🎙️ Input Speech Example
105
  """)
@@ -109,15 +114,27 @@ A brief input speech utterance for the overall audio scene.
109
  gr.Markdown("""
110
  <div style="text-align: left; padding: 10px;">
111
 
112
- ### 🎧️ Output Audio Example
 
 
 
 
113
  """)
114
 
115
  audio = gr.Audio(value="wav/audio.wav", label="Generated Audio Example", type="filepath")
116
 
117
  gr.Markdown("""
 
 
118
  </div>
 
119
  ---
 
120
  </div>
 
 
 
 
121
  """)
122
 
123
  with gr.Column():
@@ -125,8 +142,55 @@ A brief input speech utterance for the overall audio scene.
125
  btn = gr.Button("🎵Generate Audio!", variant="primary")
126
  output_audio = gr.Audio(label="Generated Audio", type="filepath")
127
  btn.click(fn=infer, inputs=input_audio, outputs=output_audio)
128
-
129
- demo.launch()
130
-
 
 
 
131
 
132
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  return output_file
91
 
92
+ with gr.Blocks(title="STAR Online Inference", theme=gr.themes.Glass()) as demo:
93
  gr.Markdown("# STAR: Speech-to-Audio Generation via Representation Learning")
94
 
95
  gr.Markdown("""
96
  <div style="text-align: left; padding: 10px;">
97
+
98
+ ## 📚️ Introduction
99
+
100
+ STAR is the first end-to-end speech-to-audio generation framework, designed to enhance efficiency and address error propagation inherent in cascaded systems.
101
+ Within this space, you have the opportunity to directly control our model through voice input, thereby generating the corresponding audio output.
102
+
103
  ## 🗣️ Input
104
 
105
  A brief input speech utterance for the overall audio scene.
106
 
107
+ > Example: A cat meowing and young female speaking
108
 
109
  ### 🎙️ Input Speech Example
110
  """)
 
114
  gr.Markdown("""
115
  <div style="text-align: left; padding: 10px;">
116
 
117
+ ## 🎧️ Output
118
+
119
+ Capture both auditory events and scene cues and generate corresponding audio
120
+
121
+ ### 🔊 Output Audio Example
122
  """)
123
 
124
  audio = gr.Audio(value="wav/audio.wav", label="Generated Audio Example", type="filepath")
125
 
126
  gr.Markdown("""
127
+ <div style="text-align: left; padding: 10px;">
128
+
129
  </div>
130
+
131
  ---
132
+
133
  </div>
134
+
135
+ ## 🛠️ Online Inference
136
+
137
+ You can upload your own samples, or try the quick examples provided below.
138
  """)
139
 
140
  with gr.Column():
 
142
  btn = gr.Button("🎵Generate Audio!", variant="primary")
143
  output_audio = gr.Audio(label="Generated Audio", type="filepath")
144
  btn.click(fn=infer, inputs=input_audio, outputs=output_audio)
145
+
146
+ gr.Markdown("""
147
+ <div style="text-align: left; padding: 10px;">
148
+
149
+ ## 🎯 Quick Examples
150
+ """)
151
 
152
+ with gr.Tabs():
153
+ with gr.Tab("VITS Generated Speech"):
154
+ gr.Markdown("| 🎧 Audio | 📝 Caption |\n|:--:|:--|")
155
+ gr.Examples(
156
+ examples=[
157
+ ["wav/vits/1.wav", "A cat meowing and young female speaking"],
158
+ ["wav/vits/2.wav", "Sustained industrial engine noise"],
159
+ ["wav/vits/3.wav", "A woman talks and a baby whispers"],
160
+ ["wav/vits/4.wav", "A man speaks followed by a toilet flush"],
161
+ ["wav/vits/5.wav", "It is raining and thundering, and then a man speaks"],
162
+ ["wav/vits/6.wav", "A man speaking as birds are chirping"],
163
+ ["wav/vits/7.wav", "A muffled man talking as a goat baas before and after two goats baaing in the distance while wind blows into a microphone"],
164
+ ["wav/vits/8.wav", "Birds chirping and a horse neighing"],
165
+ ["wav/vits/9.wav", "Several church bells ringing"],
166
+ ["wav/vits/10.wav", "A telephone rings with bell sounds"]
167
+ ],
168
+ inputs=[input_audio, _],
169
+ label="Click examples below to try!",
170
+ cache_examples = False,
171
+ examples_per_page = 5,
172
+ )
173
+
174
+ with gr.Tab("Real human Speech"):
175
+ gr.Markdown("| 🎧 Audio | 📝 Caption |\n|:--:|:--|")
176
+ gr.Examples(
177
+ examples=[
178
+ ["wav/human/1.wav", "A cat meowing and young female speaking"],
179
+ ["wav/human/2.wav", "Sustained industrial engine noise"],
180
+ ["wav/human/3.wav", "A woman talks and a baby whispers"],
181
+ ["wav/human/4.wav", "A man speaks followed by a toilet flush"],
182
+ ["wav/human/5.wav", "It is raining and thundering, and then a man speaks"],
183
+ ["wav/human/6.wav", "A man speaking as birds are chirping"],
184
+ ["wav/human/7.wav", "A muffled man talking as a goat baas before and after two goats baaing in the distance while wind blows into a microphone"],
185
+ ["wav/human/8.wav", "Birds chirping and a horse neighing"],
186
+ ["wav/human/9.wav", "Several church bells ringing"],
187
+ ["wav/human/10.wav", "A telephone rings with bell sounds"]
188
+ ],
189
+ inputs=[input_audio, _],
190
+ label="Click examples below to try!",
191
+ cache_examples = False,
192
+ examples_per_page = 5,
193
+ )
194
+
195
+
196
+ demo.launch()
wav/human/1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3df72406f58c50f323b0b532fb55df443628e715f03af66bc11ccd15eb649f11
3
+ size 603726
wav/human/10.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdd320764912961861756d32e93ffd173cd53d924a42e03c28ce7005154579e8
3
+ size 672846
wav/human/2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44b06c20cc32deb1c5088e13164e858f81452f061f71290062f79e7e3a23fe9b
3
+ size 649806
wav/human/3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf4cb6eebb04c6454ecc65cea8df46445d4996cdc36f15ff1f2ce5d177c3f90b
3
+ size 617550
wav/human/4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64b8eb089de3be0d18bd767cba861b907042f34c9ac03907b258d36794ea3e0b
3
+ size 732750
wav/human/5.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:115cc7b13c4d79179de4901b2443a520de644374bfcdd6e7a6d97f8b254cd814
3
+ size 857166
wav/human/6.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4fb1e9c2479fa9197969b8f23e3052d6225d437eb1300e1ae32511528303523
3
+ size 640590
wav/human/7.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d616cfa29f31f92c5f63faf8e225058b3e895ae9a4b32163f7f6889b1a52c679
3
+ size 1764942
wav/human/8.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4aa8af69c2dcf3dd9cdcfd514832de776256b4b34fec5f49d4e9f2329c1aa7f
3
+ size 649806
wav/human/9.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18c32b3183f51307b660a48d97285202fc849866dead7166f387c1f985770e41
3
+ size 488526
wav/vits/1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d23cb7106c66ad61d2b9717daea77385883cf71772836a8c5d18b9496dbb8d5
3
+ size 130604
wav/vits/10.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09eb032c3b97cff6e64f65c28c3042b62c13c26027ed5cea0a879acae08de422
3
+ size 111660
wav/vits/2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd3847a5d10b320f0332986c7722c6379ae54fc619f856be08ccc9b9c5653f4f
3
+ size 106028
wav/vits/3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fe02704f94e440c78aaf17feee1a1810cd12c82f70e162433b70a374eac0853
3
+ size 116268
wav/vits/4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9331107c335e4845cc3886e588cb09b0cde8f7e48280699f97bfa8b9cf8193cd
3
+ size 132652
wav/vits/5.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ec5223af568eddbced78c5402bd02abe1339cef96de7b6bc3f11362963e170
3
+ size 163884
wav/vits/6.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:562bc9e9f0b8c3033b34d3d9f965393f071a97f6a03d673fe9605bc39694168f
3
+ size 101932
wav/vits/7.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e4152c3394432b827841b1fafc994b1af3aad279025e74016c74073e9a6dc83
3
+ size 352300
wav/vits/8.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d208a28653f8c57cbfec0f9db7d4caeaa53af18b377991b9e92e080ff9505e8
3
+ size 100396
wav/vits/9.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0531c9efae71fcbc1a1739ba76335ac8b413c01b5e73ceb40ff65289c42e75b
3
+ size 76332