vumichien committed
Commit 28f8c47
1 Parent(s): 05f6fe1

Update app.py

Files changed (1)
  1. app.py +146 -68
app.py CHANGED
@@ -21,10 +21,8 @@ from gpuinfo import GPUInfo
 
 import wave
 import contextlib
-
+from transformers import pipeline
 import psutil
-num_cores = psutil.cpu_count()
-os.environ["OMP_NUM_THREADS"] = f"{num_cores}"
 
 whisper_models = ["base", "small", "medium", "large"]
 source_languages = {
@@ -128,16 +126,60 @@ source_languages = {
     "jw": "Javanese",
     "su": "Sundanese",
 }
+
+source_language_list = [key[0] for key in source_languages.items()]
+
+MODEL_NAME = "vumichien/whisper-medium-jp"
+lang = "ja"
+
+device = 0 if torch.cuda.is_available() else "cpu"
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=MODEL_NAME,
+    chunk_length_s=30,
+    device=device,
+)
+
+pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+
 embedding_model = PretrainedSpeakerEmbedding(
     "speechbrain/spkrec-ecapa-voxceleb",
-    device=torch.device("cuda"))
+    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
 
-source_language_list = [key[0] for key in source_languages.items()]
+def transcribe(microphone, file_upload):
+    warn_output = ""
+    if (microphone is not None) and (file_upload is not None):
+        warn_output = (
+            "WARNING: You've uploaded an audio file and used the microphone. "
+            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+        )
+
+    elif (microphone is None) and (file_upload is None):
+        return "ERROR: You have to either use the microphone or upload an audio file"
 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print("DEVICE IS: ")
-print(device)
+    file = microphone if microphone is not None else file_upload
 
+    text = pipe(file)["text"]
+
+    return warn_output + text
+
+def _return_yt_html_embed(yt_url):
+    video_id = yt_url.split("?v=")[-1]
+    HTML_str = (
+        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
+        " </center>"
+    )
+    return HTML_str
+
+def yt_transcribe(yt_url):
+    yt = YouTube(yt_url)
+    html_embed_str = _return_yt_html_embed(yt_url)
+    stream = yt.streams.filter(only_audio=True)[0]
+    stream.download(filename="audio.mp3")
+
+    text = pipe("audio.mp3")["text"]
+
+    return html_embed_str, text
 
 def convert_time(secs):
     return datetime.timedelta(seconds=round(secs))
@@ -149,14 +191,12 @@ def get_youtube(video_url):
     print(abs_video_path)
     return abs_video_path
 
-
 def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
     """
     # Transcribe youtube link using OpenAI Whisper
-    This space allows you to:
-    1. Download youtube video with a given url
-    2. Watch it in the first video component
-    3. Run automatic speech recognition and diarization (speaker identification)
+    1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
+    2. Generating speaker embeddings for each segments.
+    3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
 
     Speech Recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
     Speaker diarization model and pipeline from by https://github.com/pyannote/pyannote-audio
@@ -257,7 +297,6 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
 # Inspiration from https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
 video_in = gr.Video(label="Video file", mirror_webcam=False)
 youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
-
 df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
 memory = psutil.virtual_memory()
 selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en", label="Spoken language in video", interactive=True)
@@ -265,72 +304,111 @@ selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value
 number_speakers = gr.Number(precision=0, value=2, label="Selected number of speakers", interactive=True)
 system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
 transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
-
 title = "Whisper speaker diarization"
 demo = gr.Blocks(title=title)
 demo.encrypt = False
 
+
 with demo:
-    gr.Markdown('''
-        <div>
-        <h1 style='text-align: center'>Whisper speaker diarization</h1>
-        This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> to recoginze the speech and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers</h2>
-        </div>
-        ''')
-    with gr.Row():
+    with gr.Tab("Whisper speaker diarization"):
         gr.Markdown('''
-        ### What you can do with this space:
-        ##### 1. Download youtube video with a given URL
-        ##### 2. Watch it in the first video component
-        ##### 3. Run automatic speech recognition and diarization (speaker identification)
+        <div>
+        <h1 style='text-align: center'>Whisper speaker diarization</h1>
+        This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> to recoginze the speech and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers</h2>
+        </div>
         ''')
-
-    with gr.Row():
-        gr.Markdown('''
-        ### You can test with some youtube links as below:
+
+        with gr.Row():
+            gr.Markdown('''
+            ### Transcribe youtube link using OpenAI Whisper
+            ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
+            ##### 2. Generating speaker embeddings for each segments.
+            ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
         ''')
-
-    examples = gr.Examples(examples=
-        [ "https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
-          "https://www.youtube.com/watch?v=-UX0X45sYe4",
-          "https://www.youtube.com/watch?v=7minSgqi-Gw"],
-        label="Examples", inputs=[youtube_url_in])
-
-
-    with gr.Row():
-        with gr.Column():
-            youtube_url_in.render()
-            download_youtube_btn = gr.Button("Download Youtube video")
-            download_youtube_btn.click(get_youtube, [youtube_url_in], [
-                video_in])
-            print(video_in)
 
+        with gr.Row():
+            gr.Markdown('''
+            ### You can test by following examples:
+            ''')
+            examples = gr.Examples(examples=
+                [ "https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
+                  "https://www.youtube.com/watch?v=-UX0X45sYe4",
+                  "https://www.youtube.com/watch?v=7minSgqi-Gw"],
+                label="Examples", inputs=[youtube_url_in])
+
 
-    with gr.Row():
-            with gr.Column():
-                video_in.render()
+        with gr.Row():
             with gr.Column():
-                gr.Markdown('''
-                ##### Here you can start the transcription process.
-                ##### Please select the source language for transcription.
-                ##### You should select a number of speakers for getting better results.
-                ''')
-                selected_source_lang.render()
-                selected_whisper_model.render()
-                number_speakers.render()
-                transcribe_btn = gr.Button("Transcribe audio and diarization")
-                transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model, number_speakers], [transcription_df, system_info])
+                youtube_url_in.render()
+                download_youtube_btn = gr.Button("Download Youtube video")
+                download_youtube_btn.click(get_youtube, [youtube_url_in], [
+                    video_in])
+                print(video_in)
+
+
+        with gr.Row():
+            with gr.Column():
+                video_in.render()
+            with gr.Column():
+                gr.Markdown('''
+                ##### Here you can start the transcription process.
+                ##### Please select the source language for transcription.
+                ##### You should select a number of speakers for getting better results.
+                ''')
+                selected_source_lang.render()
+                selected_whisper_model.render()
+                number_speakers.render()
+                transcribe_btn = gr.Button("Transcribe audio and diarization")
+                transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model, number_speakers], [transcription_df, system_info])
 
+
+        with gr.Row():
+            gr.Markdown('''
+            ##### Here you will get transcription output
+            ##### ''')
 
-    with gr.Row():
-        gr.Markdown('''
-        ##### Here you will get transcription output
-        ##### ''')
-
-    with gr.Row():
-        with gr.Column():
-            transcription_df.render()
-            system_info.render()
-            gr.Markdown('''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'></center>''')
+
+        with gr.Row():
+            with gr.Column():
+                transcription_df.render()
+                system_info.render()
+                gr.Markdown('''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'></center>''')
+
+    with gr.Tab("Whisper Transcribe Japanese Audio"):
+        gr.Markdown(f'''
+        <div>
+        <h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
+        </div>
+        Transcribe long-form microphone or audio inputs with the click of a button! The fine-tuned
+        checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
+        ''')
+        microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
+        upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
+        transcribe_btn = gr.Button("Transcribe Audio")
+        text_output = gr.Textbox()
+        with gr.Row():
+            gr.Markdown('''
+            ### You can test by following examples:
+            ''')
+        examples = gr.Examples(examples=
+            [ "sample1.wav",
+              "sample2.wav",
+            ],
+            label="Examples", inputs=[upload])
+        transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
+
+    with gr.Tab("Whisper Transcribe Japanese YouTube"):
+        gr.Markdown(f'''
+        <div>
+        <h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
+        </div>
+        Transcribe long-form YouTube videos with the click of a button! The fine-tuned checkpoint:
+        <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
+        ''')
+        youtube_link = gr.Textbox(label="Youtube url", lines=1, interactive=True)
+        yt_transcribe_btn = gr.Button("Transcribe YouTube")
+        text_output2 = gr.Textbox()
+        html_output = gr.Markdown()
+        yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])
 
 demo.launch(debug=True)
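
The updated speech_to_text docstring describes a three-step pipeline: Whisper segmentation and transcription, one speaker embedding per segment, and agglomerative clustering to assign a speaker to each segment. The clustering code itself falls outside the hunks shown above; the snippet below is only a minimal sketch of that last step, assuming a NumPy array of shape (num_segments, embedding_dim) holding the ECAPA-TDNN segment embeddings, and the helper name assign_speakers is hypothetical.

import numpy as np
from sklearn.cluster import AgglomerativeClustering

def assign_speakers(embeddings, num_speakers):
    # embeddings: one speaker-embedding vector per Whisper segment (illustrative, not part of this commit).
    embeddings = np.nan_to_num(embeddings)  # guard against NaNs from very short segments
    # Group the segment embeddings into the requested number of clusters.
    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
    # One label per segment, e.g. "SPEAKER 1", "SPEAKER 2", ...
    return [f"SPEAKER {label + 1}" for label in clustering.labels_]

Here num_speakers would correspond to the value of the gr.Number input that the demo wires into speech_to_text, and the returned labels line up one-to-one with the transcript segments shown in the output dataframe.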