ovieyra21 committed on
Commit
410193a
1 Parent(s): 6d3b05b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -117
app.py CHANGED
@@ -1,12 +1,15 @@
1
- import gradio as gr
2
  import torch
3
- from transformers import pipeline
4
  import yt_dlp as youtube_dl
5
- import os
6
- from scipy.io import wavfile
7
  import numpy as np
8
  from datasets import Dataset, Audio
 
 
 
 
 
9
  import tempfile
 
10
  import time
11
  import demucs
12
 
@@ -25,29 +28,33 @@ pipe = pipeline(
25
  device=device,
26
  )
27
 
28
- separator = demucs.pretrained.hdemucs()
29
 
30
  def separate_vocal(path):
31
- origin, separated = separator(path)
32
- vocal_path = os.path.splitext(path)[0] + "_vocals.wav"
33
- wavfile.write(vocal_path, separator.samplerate, separated[1].numpy())
34
- return vocal_path
35
 
36
- def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token, progress=gr.Progress()):
37
  if inputs_path is None:
38
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
39
- if not dataset_name:
40
  raise gr.Error("No dataset name submitted! Please submit a dataset name. Should be in the format : <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.")
41
- if oauth_token is None:
42
- raise gr.Error("No OAuth token submitted! Please login to use this demo.")
43
 
 
 
 
 
44
  total_step = 4
45
  current_step = 0
46
 
47
  current_step += 1
48
  progress((current_step, total_step), desc="Transcribe using Whisper.")
 
49
  sampling_rate, inputs = wavfile.read(inputs_path)
 
50
  out = pipe(inputs_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
51
  text = out["text"]
52
 
53
  current_step += 1
@@ -56,6 +63,7 @@ def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token, progres
56
 
57
  current_step += 1
58
  progress((current_step, total_step), desc="Create dataset.")
 
59
  transcripts = []
60
  audios = []
61
  with tempfile.TemporaryDirectory() as tmpdirname:
@@ -75,30 +83,75 @@ def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token, progres
75
 
76
  current_step += 1
77
  progress((current_step, total_step), desc="Push dataset.")
78
- dataset.push_to_hub(dataset_name, token=oauth_token)
79
 
80
  return [[transcript] for transcript in transcripts], text
81
 
82
- def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token, progress=gr.Progress()):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  if yt_url is None:
84
- raise gr.Error("No YouTube URL submitted! Please provide a working link.")
85
- if not dataset_name:
86
  raise gr.Error("No dataset name submitted! Please submit a dataset name. Should be in the format : <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.")
87
- if oauth_token is None:
88
- raise gr.Error("No OAuth token submitted! Please login to use this demo.")
89
 
90
  total_step = 5
91
  current_step = 0
92
 
93
  html_embed_str = _return_yt_html_embed(yt_url)
94
 
 
 
 
 
95
  current_step += 1
96
  progress((current_step, total_step), desc="Load video.")
97
 
98
  with tempfile.TemporaryDirectory() as tmpdirname:
99
  filepath = os.path.join(tmpdirname, "video.mp4")
 
100
  download_yt_audio(yt_url, filepath)
101
- inputs_path = filepath
 
102
 
103
  inputs = ffmpeg_read(inputs_path, pipe.feature_extractor.sampling_rate)
104
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
@@ -106,6 +159,7 @@ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token, progress=
106
  current_step += 1
107
  progress((current_step, total_step), desc="Transcribe using Whisper.")
108
  out = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
109
  text = out["text"]
110
 
111
  inputs = ffmpeg_read(inputs_path, dataset_sampling_rate)
@@ -116,6 +170,7 @@ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token, progress=
116
 
117
  current_step += 1
118
  progress((current_step, total_step), desc="Create dataset.")
 
119
  transcripts = []
120
  audios = []
121
  with tempfile.TemporaryDirectory() as tmpdirname:
@@ -135,12 +190,13 @@ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token, progress=
135
 
136
  current_step += 1
137
  progress((current_step, total_step), desc="Push dataset.")
138
- dataset.push_to_hub(dataset_name, token=oauth_token)
139
-
140
  return html_embed_str, [[transcript] for transcript in transcripts], text
141
 
142
  def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_chars=".!:;?", min_duration=5):
143
  min_duration = int(min_duration * sampling_rate)
 
144
  new_chunks = []
145
  while chunks:
146
  current_chunk = chunks.pop(0)
@@ -148,108 +204,72 @@ def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_ch
148
  begin, end = int(begin * sampling_rate), int(end * sampling_rate)
149
  current_dur = end - begin
150
  text = current_chunk["text"]
151
- chunk_to_concat = [audio_array[begin:end]]
152
- while chunks and (text[-1] not in stop_chars or (current_dur < min_duration)):
153
- ch = chunks.pop(0)
154
- begin, end = ch["timestamp"]
155
- begin, end = int(begin * sampling_rate), int(end * sampling_rate)
156
- current_dur += end - begin
157
- text = "".join([text, ch["text"]])
158
- chunk_to_concat.append(audio_array[begin:end])
159
- new_chunks.append({
160
- "text": text,
161
- "audio": np.concatenate(chunk_to_concat)
162
- })
 
 
 
 
 
 
163
  return new_chunks
164
 
165
- def _return_yt_html_embed(yt_url):
166
- video_id = yt_url.split("?v=")[-1]
167
- HTML_str = (
168
- f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
169
- " </center>"
 
 
 
 
 
 
 
 
 
 
 
170
  )
171
- return HTML_str
172
 
173
- def download_yt_audio(yt_url, filename):
174
- info_loader = youtube_dl.YoutubeDL()
175
- try:
176
- info = info_loader.extract_info(yt_url, download=False)
177
- except youtube_dl.utils.DownloadError as err:
178
- raise gr.Error(str(err))
179
 
180
- file_length = info["duration_string"]
181
- file_h_m_s = file_length.split(":")
182
- file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
183
-
184
- if len(file_h_m_s) == 1:
185
- file_h_m_s.insert(0, 0)
186
- if len(file_h_m_s) == 2:
187
- file_h_m_s.insert(0, 0)
188
- file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
189
-
190
- if file_length_s > YT_LENGTH_LIMIT_S:
191
- yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
192
- file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
193
- raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
194
 
195
- ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
196
-
197
- with youtube_dl.YoutubeDL(ydl_opts) as ydl:
198
- try:
199
- ydl.download([yt_url])
200
- except youtube_dl.utils.ExtractorError as err:
201
- raise gr.Error(str(err))
202
 
203
- css = """
204
- #intro {
205
- padding: 20px;
206
- background-color: #f0f0f0;
207
- margin-bottom: 10px;
208
- }
209
- #intro h1 {
210
- font-size: 30px;
211
- }
212
- """
213
- gr.config.css(css)
214
 
215
- with gr.Blocks() as demo:
216
- with gr.Tab("Local file"):
217
- with gr.Row():
218
- with gr.Column():
219
- local_audio_input = gr.Audio(type="filepath", label="Upload Audio")
220
- task_input = gr.Dropdown(choices=["transcribe", "translate"], value="transcribe", label="Task")
221
- use_demucs_input = gr.Dropdown(choices=["do-nothing", "separate-audio"], value="do-nothing", label="Audio preprocessing")
222
- dataset_name_input = gr.Textbox(label="Dataset name")
223
- hf_token = gr.Textbox(label="HuggingFace Token")
224
- submit_local_button = gr.Button("Transcribe")
225
- with gr.Column():
226
- local_output_text = gr.Dataframe(label="Transcripts")
227
- local_output_full_text = gr.Textbox(label="Full Text")
228
-
229
- submit_local_button.click(
230
- transcribe,
231
- inputs=[local_audio_input, task_input, use_demucs_input, dataset_name_input, hf_token],
232
- outputs=[local_output_text, local_output_full_text],
233
- )
234
 
235
- with gr.Tab("YouTube video"):
236
- with gr.Row():
237
- with gr.Column():
238
- yt_url_input = gr.Textbox(label="YouTube URL")
239
- yt_task_input = gr.Dropdown(choices=["transcribe", "translate"], value="transcribe", label="Task")
240
- yt_use_demucs_input = gr.Dropdown(choices=["do-nothing", "separate-audio"], value="do-nothing", label="Audio preprocessing")
241
- yt_dataset_name_input = gr.Textbox(label="Dataset name")
242
- yt_hf_token = gr.Textbox(label="HuggingFace Token")
243
- submit_yt_button = gr.Button("Transcribe")
244
- with gr.Column():
245
- yt_html_embed_str = gr.HTML()
246
- yt_output_text = gr.Dataframe(label="Transcripts")
247
- yt_output_full_text = gr.Textbox(label="Full Text")
248
-
249
- submit_yt_button.click(
250
- yt_transcribe,
251
- inputs=[yt_url_input, yt_task_input, yt_use_demucs_input, yt_dataset_name_input, yt_hf_token],
252
- outputs=[yt_html_embed_str, yt_output_text, yt_output_full_text],
253
- )
254
 
255
- demo.launch(share=True)
 
 
1
  import torch
2
+ import gradio as gr
3
  import yt_dlp as youtube_dl
 
 
4
  import numpy as np
5
  from datasets import Dataset, Audio
6
+ from scipy.io import wavfile
7
+
8
+ from transformers import pipeline
9
+ from transformers.pipelines.audio_utils import ffmpeg_read
10
+
11
  import tempfile
12
+ import os
13
  import time
14
  import demucs
15
 
 
28
  device=device,
29
  )
30
 
31
# `import demucs` only binds the top-level package; the `api` submodule is not
# guaranteed to be loaded as an attribute unless it is imported explicitly, so
# import it here before touching `demucs.api`.
import demucs.api

# Shared Demucs separator, built once at import time and reused by every call.
separator = demucs.api.Separator(model=DEMUCS_MODEL_NAME)
32
 
33
def separate_vocal(path):
    """Extract the vocal stem of the audio file at ``path`` with Demucs.

    The vocal stem is saved back to ``path`` itself (the same location the
    input was read from), and that path is returned.
    """
    _, stems = separator.separate_audio_file(path)
    vocals = stems["vocals"]
    demucs.api.save_audio(vocals, path, samplerate=separator.samplerate)
    return path
 
37
 
38
+ def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, progress=gr.Progress()):
39
  if inputs_path is None:
40
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
41
+ if dataset_name is None:
42
  raise gr.Error("No dataset name submitted! Please submit a dataset name. Should be in the format : <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.")
 
 
43
 
44
+ if oauth_token is None:
45
+ gr.Warning("Make sure to click and login before using this demo.")
46
+ return [["transcripts will appear here"]], ""
47
+
48
  total_step = 4
49
  current_step = 0
50
 
51
  current_step += 1
52
  progress((current_step, total_step), desc="Transcribe using Whisper.")
53
+
54
  sampling_rate, inputs = wavfile.read(inputs_path)
55
+
56
  out = pipe(inputs_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
57
+
58
  text = out["text"]
59
 
60
  current_step += 1
 
63
 
64
  current_step += 1
65
  progress((current_step, total_step), desc="Create dataset.")
66
+
67
  transcripts = []
68
  audios = []
69
  with tempfile.TemporaryDirectory() as tmpdirname:
 
83
 
84
  current_step += 1
85
  progress((current_step, total_step), desc="Push dataset.")
86
+ dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token)
87
 
88
  return [[transcript] for transcript in transcripts], text
89
 
90
+ def _return_yt_html_embed(yt_url):
91
+ video_id = yt_url.split("?v=")[-1]
92
+ HTML_str = (
93
+ f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
94
+ " </center>"
95
+ )
96
+ return HTML_str
97
+
98
def download_yt_audio(yt_url, filename):
    """Download a YouTube video to ``filename`` after enforcing the length limit.

    The video metadata is fetched first, without downloading, so that videos
    longer than ``YT_LENGTH_LIMIT_S`` seconds are rejected before any media is
    transferred.

    Args:
        yt_url: URL of the YouTube video.
        filename: Output path handed to yt-dlp as ``outtmpl``.

    Raises:
        gr.Error: If yt-dlp cannot extract or download the video, if the
            video's duration is not reported, or if the video is longer than
            ``YT_LENGTH_LIMIT_S``.
    """
    # Catch both error types at both steps so a failed download is reported
    # through the UI instead of escaping as an unhandled exception.
    yt_errors = (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError)

    with youtube_dl.YoutubeDL() as info_loader:
        try:
            info = info_loader.extract_info(yt_url, download=False)
        except yt_errors as err:
            raise gr.Error(str(err)) from err

    file_length = info.get("duration_string")
    if not file_length:
        # Without a duration the length limit cannot be enforced.
        raise gr.Error("Could not determine the length of this YouTube video.")

    # "SS", "MM:SS" or "HH:MM:SS" -> left-pad with zeros to [hours, minutes, seconds].
    file_h_m_s = [int(sub_length) for sub_length in file_length.split(":")]
    file_h_m_s = [0] * (3 - len(file_h_m_s)) + file_h_m_s
    file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except yt_errors as err:
            raise gr.Error(str(err)) from err
128
+
129
+ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, max_filesize=75.0, dataset_sampling_rate=24000,
130
+ progress=gr.Progress()):
131
+
132
  if yt_url is None:
133
+ raise gr.Error("No youtube link submitted! Please put a working link.")
134
+ if dataset_name is None:
135
  raise gr.Error("No dataset name submitted! Please submit a dataset name. Should be in the format : <user>/<dataset_name> or <org>/<dataset_name>. Also accepts <dataset_name>, which will default to the namespace of the logged-in user.")
 
 
136
 
137
  total_step = 5
138
  current_step = 0
139
 
140
  html_embed_str = _return_yt_html_embed(yt_url)
141
 
142
+ if oauth_token is None:
143
+ gr.Warning("Make sure to click and login before using this demo.")
144
+ return html_embed_str, [["transcripts will appear here"]], ""
145
+
146
  current_step += 1
147
  progress((current_step, total_step), desc="Load video.")
148
 
149
  with tempfile.TemporaryDirectory() as tmpdirname:
150
  filepath = os.path.join(tmpdirname, "video.mp4")
151
+
152
  download_yt_audio(yt_url, filepath)
153
+ with open(filepath, "rb") as f:
154
+ inputs_path = f.read()
155
 
156
  inputs = ffmpeg_read(inputs_path, pipe.feature_extractor.sampling_rate)
157
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
 
159
  current_step += 1
160
  progress((current_step, total_step), desc="Transcribe using Whisper.")
161
  out = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
162
+
163
  text = out["text"]
164
 
165
  inputs = ffmpeg_read(inputs_path, dataset_sampling_rate)
 
170
 
171
  current_step += 1
172
  progress((current_step, total_step), desc="Create dataset.")
173
+
174
  transcripts = []
175
  audios = []
176
  with tempfile.TemporaryDirectory() as tmpdirname:
 
190
 
191
  current_step += 1
192
  progress((current_step, total_step), desc="Push dataset.")
193
+ dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token)
194
+
195
  return html_embed_str, [[transcript] for transcript in transcripts], text
196
 
197
  def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_chars=".!:;?", min_duration=5):
198
  min_duration = int(min_duration * sampling_rate)
199
+
200
  new_chunks = []
201
  while chunks:
202
  current_chunk = chunks.pop(0)
 
204
  begin, end = int(begin * sampling_rate), int(end * sampling_rate)
205
  current_dur = end - begin
206
  text = current_chunk["text"]
207
+
208
+ chunk_to_concat = []
209
+ while chunks and (current_dur < min_duration or text[-1] not in stop_chars):
210
+ next_chunk = chunks.pop(0)
211
+ next_text = next_chunk["text"].strip()
212
+ next_begin, next_end = next_chunk["timestamp"]
213
+ next_begin, next_end = int(next_begin * sampling_rate), int(next_end * sampling_rate)
214
+ current_dur += next_end - next_begin
215
+ text += f" {next_text}"
216
+ end = next_end
217
+
218
+ new_chunks.append(
219
+ {
220
+ "audio": np.array(audio_array[begin:end]).astype(np.float32),
221
+ "text": text,
222
+ }
223
+ )
224
+
225
  return new_chunks
226
 
227
# Gradio UI: one page with a YouTube entry point and a local-audio entry
# point that share the task, Demucs and dataset-name settings.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Audio or YouTube Video Transcription")
            with gr.Row():
                yt_textbox = gr.Textbox(label="YouTube link")
                yt_button = gr.Button("Transcribe YouTube video")
        with gr.Column():
            gr.Markdown("### Upload or Record Audio")
            local_audio_input = gr.Audio(type="filepath", label="Upload Audio")
            local_button = gr.Button("Transcribe Local Audio")

    task = gr.Radio(
        ["transcribe", "translate"],
        label="Task",
        value="transcribe",
    )

    # NOTE(review): a CheckboxGroup passes a list (e.g. ["separate-audio"]) to
    # the callbacks; confirm that is what `use_demucs` is compared against.
    demucs_checkbox = gr.CheckboxGroup(["separate-audio"], label="Apply Demucs (Separate Vocal from Audio)")
    dataset_name = gr.Textbox(label="Dataset name", placeholder="Dataset name to push to Hugging Face Hub")

    with gr.Row():
        # A plain button that redirects to huggingface.co/login never gives the
        # app a token. `gr.LoginButton` runs the Hugging Face OAuth flow, after
        # which Gradio fills every callback parameter annotated with
        # `gr.OAuthToken | None` on its own.
        login_button = gr.LoginButton()
        # NOTE(review): this Markdown also receives the YouTube iframe HTML
        # returned by `yt_transcribe`; confirm it renders, or use `gr.HTML`.
        login_output = gr.Markdown()

    with gr.Row():
        output_transcriptions = gr.Dataframe(headers=["Transcriptions"])
        output_text = gr.Markdown()

    # The OAuth token is injected from the type hint, so it must NOT be listed
    # in `inputs`; only the four user-facing components are passed.
    yt_button.click(
        yt_transcribe,
        inputs=[yt_textbox, task, demucs_checkbox, dataset_name],
        outputs=[login_output, output_transcriptions, output_text],
    )

    local_button.click(
        transcribe,
        inputs=[local_audio_input, task, demucs_checkbox, dataset_name],
        outputs=[output_transcriptions, output_text],
    )

demo.launch()