ylacombe committed on
Commit 2d82fd0 · 1 Parent(s): cedfb7c

Update app.py

Files changed (1)
  1. app.py +113 -62
app.py CHANGED
@@ -1,3 +1,6 @@
 import torch
 
 import gradio as gr
@@ -14,6 +17,9 @@ import os
 import time
 import demucs.api
 
 
 
 MODEL_NAME = "openai/whisper-large-v3"
@@ -39,30 +45,32 @@ def separate_vocal(path):
     return path
 
 
-
-# def separate_vocal(path, track_name, output_folder, demucs_model_name = "htdemucs_ft"):
-#
-#     os.system(f"python3 -m demucs.separate --two-stems=vocals -n {demucs_model_name} {path} -o {output_folder}")
-#
-#     return os.path.join(output_folder, demucs_model_name, track_name, "vocals.wav")
-
-
-def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None):
     if inputs_path is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
 
     sampling_rate, inputs = wavfile.read(inputs_path)
 
     out = pipe(inputs_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
     text = out["text"]
 
     chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, sampling_rate)
 
     transcripts = []
     audios = []
     with tempfile.TemporaryDirectory() as tmpdirname:
-        for i,chunk in enumerate(chunks):
 
             # TODO: make sure 1D or 2D?
             arr = chunk["audio"]
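
The hunk above deletes the old commented-out vocal separation that shelled out to `python3 -m demucs.separate`; the Space keeps the `separate_vocal` helper built on the `demucs.api` Python interface instead (only its closing `return path` is visible as context here). As a rough, hypothetical sketch of that interface, not the author's actual implementation, vocal separation could look like:

```python
import demucs.api

def separate_vocal_sketch(path, model_name="htdemucs_ft"):
    # Hypothetical sketch: load a pretrained Demucs model, split the file into
    # stems, and overwrite the original file with the vocal stem only.
    separator = demucs.api.Separator(model=model_name)
    _original, stems = separator.separate_audio_file(path)
    demucs.api.save_audio(stems["vocals"], path, samplerate=separator.samplerate)
    return path
```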
@@ -79,10 +87,12 @@ def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAut
 
     dataset = Dataset.from_dict({"audio": audios, "transcript": transcripts}).cast_column("audio", Audio())
 
-
-    dataset.push_to_hub(dataset_name, token=oauth_token.token)
 
-    return text
 
 
 def _return_yt_html_embed(yt_url):
@@ -125,11 +135,18 @@ def download_yt_audio(yt_url, filename):
         raise gr.Error(str(err))
 
 
-def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, max_filesize=75.0, dataset_sampling_rate = 24000):
     html_embed_str = _return_yt_html_embed(yt_url)
 
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
         download_yt_audio(yt_url, filepath)
         with open(filepath, "rb") as f:
             inputs_path = f.read()
@@ -137,18 +154,25 @@ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthT
     inputs = ffmpeg_read(inputs_path, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
 
     out = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
     text = out["text"]
 
     inputs = ffmpeg_read(inputs_path, dataset_sampling_rate)
 
     chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, dataset_sampling_rate)
 
     transcripts = []
     audios = []
     with tempfile.TemporaryDirectory() as tmpdirname:
-        for i,chunk in enumerate(chunks):
 
             # TODO: make sure 1D or 2D?
             arr = chunk["audio"]
@@ -165,23 +189,28 @@ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthT
 
     dataset = Dataset.from_dict({"audio": audios, "transcript": transcripts}).cast_column("audio", Audio())
 
-
-    dataset.push_to_hub(dataset_name, token=oauth_token.token)
 
 
-    return html_embed_str, text
 
 
 def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_chars = ".!:;?", min_duration = 5):
     # merge chunks as long as merged audio duration is lower than min_duration and that a stop character is not met
     # return list of dictionnaries (text, audio)
     # min duration is in seconds
-
     min_duration = int(min_duration * sampling_rate)
 
     new_chunks = []
     while chunks:
         current_chunk = chunks.pop(0)
         begin, end = current_chunk["timestamp"]
         begin, end = int(begin*sampling_rate), int(end*sampling_rate)
 
@@ -193,7 +222,7 @@ def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_c
         chunk_to_concat = [audio_array[begin:end]]
         while chunks and (text[-1] not in stop_chars or (current_dur<min_duration)):
             ch = chunks.pop(0)
-
             begin, end = ch["timestamp"]
             begin, end = int(begin*sampling_rate), int(end*sampling_rate)
             current_dur += end-begin
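
The merge loop shown above works in sample indices rather than seconds: each Whisper timestamp pair is scaled by the sampling rate before slicing the audio array, and chunks keep being concatenated until the running duration passes min_duration and the accumulated text ends in a stop character. A small worked example with illustrative numbers, using the 24 kHz rate applied to YouTube audio:

```python
# Illustrative numbers only: mapping one Whisper chunk to sample indices.
sampling_rate = 24000
begin, end = 1.2, 3.5                                              # seconds, from chunk["timestamp"]
begin, end = int(begin * sampling_rate), int(end * sampling_rate)  # 28800, 84000
current_dur = end - begin                                          # 55200 samples, i.e. 2.3 s
min_duration = int(5 * sampling_rate)                              # 120000 samples, so merging continues
```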
@@ -209,53 +238,75 @@ def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_c
             "audio": np.concatenate(chunk_to_concat),
         })
         print(f"LENGTH CHUNK #{len(new_chunks)}: {current_dur/sampling_rate}s")
 
     return new_chunks
 
 
-
-mf_transcribe = gr.Interface(
-    fn=transcribe,
-    inputs=[
-        gr.Audio(type="filepath"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
-        gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio"),
-        gr.Textbox(lines=1, placeholder="Place your new dataset name here", label="Dataset name"),
-    ],
-    outputs="text",
-    theme="huggingface",
-    title="Create your own TTS dataset using your own recordings",
-    description=(
-        "This demo allows use to create a text-to-speech dataset from an input audio snippet and push it to hub to keep track of it."
-        f"Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
-        " of arbitrary length. It then merge chunks of audio and push it to the hub."
-    ),
-    allow_flagging="never",
-)
-
-yt_transcribe = gr.Interface(
-    fn=yt_transcribe,
-    inputs=[
-        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
-        gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio"),
-        gr.Textbox(lines=1, placeholder="Place your new dataset name here", label="Dataset name"),
-    ],
-    outputs=["html", "text"],
-    theme="huggingface",
-    title="Create your own TTS dataset using Youtube",
-    description=(
-        "This demo allows use to create a text-to-speech dataset from an input audio snippet and push it to hub to keep track of it."
-        f"Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
-        " of arbitrary length. It then merge chunks of audio and push it to the hub."
-    ),
-    allow_flagging="never",
-)
 
-with gr.Blocks() as demo:
     with gr.Row():
         gr.LoginButton().activate()
         gr.LogoutButton()
-    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Microphone or Audio file", "YouTube"])
-
-demo.launch(debug=True)

+
+
+
 import torch
 
 import gradio as gr
 
 import time
 import demucs.api
 
+import tqdm
+
+os.environ["GRADIO_TEMP_DIR"] = "/home/yoach/spaces/tmp"
 
 
 MODEL_NAME = "openai/whisper-large-v3"
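
Setting GRADIO_TEMP_DIR before the Blocks app is created redirects Gradio's cache of uploaded and generated files to a specific writable directory rather than the default system temp location; the hard-coded /home/yoach/... path above is specific to the author's Space. A minimal sketch with a placeholder path:

```python
import os

# Placeholder path: point Gradio's temporary-file cache at a writable directory.
os.environ["GRADIO_TEMP_DIR"] = "/tmp/gradio-cache"
```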
 
     return path
 
 
+def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, progress=gr.Progress(track_tqdm=True)):
     if inputs_path is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
 
+    pbar = tqdm.tqdm(total=4, desc="Overall progression")
+
     sampling_rate, inputs = wavfile.read(inputs_path)
 
+    pbar.update(1)
+    pbar.set_description("Transcribe using Whisper.")
     out = pipe(inputs_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
     text = out["text"]
 
+    pbar.update(1)
+    pbar.set_description("Merge chunks.")
     chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, sampling_rate)
 
+    pbar.update(1)
+    pbar.set_description("Create dataset.")
+
+
     transcripts = []
     audios = []
     with tempfile.TemporaryDirectory() as tmpdirname:
+        for i,chunk in tqdm.tqdm(enumerate(chunks), desc="Creating dataset (and clean audio if asked for)"):
 
             # TODO: make sure 1D or 2D?
             arr = chunk["audio"]
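
The rewritten transcribe now accepts progress=gr.Progress(track_tqdm=True), so the tqdm bars it opens (the overall pbar and the per-chunk loop) are mirrored as a progress bar in the Gradio UI while the request runs. A self-contained sketch of that mechanism, with a made-up slow_task function:

```python
import time

import gradio as gr
import tqdm

def slow_task(n_steps, progress=gr.Progress(track_tqdm=True)):
    # Because the gr.Progress argument uses track_tqdm=True, any tqdm bar
    # opened while this function runs is reflected in the web UI.
    for _ in tqdm.tqdm(range(int(n_steps)), desc="Working"):
        time.sleep(0.2)
    return f"finished {int(n_steps)} steps"

demo = gr.Interface(fn=slow_task, inputs=gr.Number(value=10), outputs="text")

if __name__ == "__main__":
    demo.launch()
```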
 
 
     dataset = Dataset.from_dict({"audio": audios, "transcript": transcripts}).cast_column("audio", Audio())
 
+    pbar.update(1)
+    pbar.set_description("Push dataset.")
+    dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token)
 
+    pbar.close()
+    return text, [[transcript] for transcript in transcripts]
 
 
 def _return_yt_html_embed(yt_url):
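
Once the chunks are written to disk, the function assembles a 🤗 Datasets object in memory, casts the audio column so the WAV paths are decoded as audio, and pushes it to the Hub with the token from the OAuth login (or None when the user is not logged in, in which case whatever credentials are already configured locally are used). A minimal sketch with placeholder file names and repository id:

```python
from datasets import Audio, Dataset

# Placeholder file names and repo id, for illustration only.
dataset = Dataset.from_dict({
    "audio": ["chunk_0.wav", "chunk_1.wav"],
    "transcript": ["first merged chunk.", "second merged chunk."],
}).cast_column("audio", Audio())

# token would be oauth_token.token when the user logged in through the Space.
dataset.push_to_hub("username/my-tts-dataset", token=None)
```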
 
         raise gr.Error(str(err))
 
 
+def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, max_filesize=75.0, dataset_sampling_rate = 24000,
+                  progress=gr.Progress(track_tqdm=True)):
+
+    pbar = tqdm.tqdm(total=5, desc="Overall progression")
+
     html_embed_str = _return_yt_html_embed(yt_url)
 
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
+        pbar.update(1)
+        pbar.set_description("Download Youtube video.")
+
         download_yt_audio(yt_url, filepath)
         with open(filepath, "rb") as f:
             inputs_path = f.read()
 
     inputs = ffmpeg_read(inputs_path, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
 
+    pbar.update(1)
+    pbar.set_description("Transcribe using Whisper.")
     out = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
     text = out["text"]
 
     inputs = ffmpeg_read(inputs_path, dataset_sampling_rate)
 
+    pbar.update(1)
+    pbar.set_description("Merge chunks.")
     chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, dataset_sampling_rate)
 
+    pbar.update(1)
+    pbar.set_description("Create dataset.")
+
     transcripts = []
     audios = []
     with tempfile.TemporaryDirectory() as tmpdirname:
+        for i,chunk in tqdm.tqdm(enumerate(chunks), desc="Creating dataset (and clean audio if asked for)."):
 
             # TODO: make sure 1D or 2D?
             arr = chunk["audio"]
 
 
     dataset = Dataset.from_dict({"audio": audios, "transcript": transcripts}).cast_column("audio", Audio())
 
+    pbar.update(1)
+    pbar.set_description("Push dataset.")
+    dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token)
 
+    pbar.close()
 
+    return html_embed_str, text, [[transcript] for transcript in transcripts]
 
 
 def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_chars = ".!:;?", min_duration = 5):
     # merge chunks as long as merged audio duration is lower than min_duration and that a stop character is not met
     # return list of dictionnaries (text, audio)
     # min duration is in seconds
+    pbar = tqdm.tqdm(total=len(chunks), desc="Post-processing transcribed chunks")
     min_duration = int(min_duration * sampling_rate)
+
 
     new_chunks = []
     while chunks:
         current_chunk = chunks.pop(0)
+        pbar.update(1)
+
         begin, end = current_chunk["timestamp"]
         begin, end = int(begin*sampling_rate), int(end*sampling_rate)
 
 
         chunk_to_concat = [audio_array[begin:end]]
         while chunks and (text[-1] not in stop_chars or (current_dur<min_duration)):
             ch = chunks.pop(0)
+            pbar.update(1)
             begin, end = ch["timestamp"]
             begin, end = int(begin*sampling_rate), int(end*sampling_rate)
             current_dur += end-begin
 
             "audio": np.concatenate(chunk_to_concat),
         })
         print(f"LENGTH CHUNK #{len(new_chunks)}: {current_dur/sampling_rate}s")
+
+    pbar.close()
 
     return new_chunks
 
 
 
+css = """
+#container{
+margin: 0 auto;
+max-width: 80rem;
+}
+#intro{
+max-width: 100%;
+text-align: center;
+margin: 0 auto;
+}
+"""
+with gr.Blocks(css=css) as demo:
     with gr.Row():
         gr.LoginButton().activate()
         gr.LogoutButton()
+
+    with gr.Tab("Microphone or Audio file"):
+        gr.Markdown("Create your own TTS dataset using your own recordings", elem_id="intro")
+        gr.Markdown(
+            "This demo allows use to create a text-to-speech dataset from an input audio snippet and push it to hub to keep track of it."
+            f"Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
+            " of arbitrary length. It then merge chunks of audio and push it to the hub."
+        )
+        with gr.Column():
+            audio_file = gr.Audio(type="filepath")
+            task_file = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
+            cleaning_file = gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio")
+            textbox_file = gr.Textbox(lines=1, placeholder="Place your new dataset name here", label="Dataset name")
+
+            with gr.Row():
+                clear_file = gr.ClearButton([audio_file, task_file, cleaning_file, textbox_file])
+                submit_file = gr.Button("Submit")
+
+        with gr.Column():
+            transcript_file = gr.Textbox(label="Transcription")
+            dataset_file = gr.Dataset(components=["text"], headers=["Transcripts"])
+
+
+    with gr.Tab("YouTube"):
+        gr.Markdown("Create your own TTS dataset using Youtube", elem_id="intro")
+        gr.Markdown(
+            "This demo allows use to create a text-to-speech dataset from an input audio snippet and push it to hub to keep track of it."
+            f"Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
+            " of arbitrary length. It then merge chunks of audio and push it to the hub."
+        )
+        with gr.Column():
+            audio_youtube = gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")
+            task_youtube = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
+            cleaning_youtube = gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio")
+            textbox_youtube = gr.Textbox(lines=1, placeholder="Place your new dataset name here", label="Dataset name")
+
+            with gr.Row():
+                clear_youtube = gr.ClearButton([audio_youtube, task_youtube, cleaning_youtube, textbox_youtube])
+                submit_youtube = gr.Button("Submit")
+
+        with gr.Column():
+            html_youtube = gr.HTML()
+            transcript_youtube = gr.Textbox(label="Transcription")
+            dataset_youtube = gr.Dataset(components=["text"], headers=["Transcripts"])
+
+
+    submit_file.click(transcribe, inputs=[audio_file, task_file, cleaning_file, textbox_file], outputs=[transcript_file, dataset_file])
+    submit_youtube.click(yt_transcribe, inputs=[audio_youtube, task_youtube, cleaning_youtube, textbox_youtube], outputs=[html_youtube, transcript_youtube, dataset_youtube])
+
+demo.launch(debug=True)