manosplitsis committed
Commit 1e612c8
1 Parent(s): 01aaa98
Files changed (3)
  1. app.py +539 -0
  2. packages.txt +1 -0
  3. requirements.txt +20 -0
app.py ADDED
@@ -0,0 +1,539 @@
+ import whisper
+ import datetime
+ import subprocess
+ import gradio as gr
+ from pathlib import Path
+ import pandas as pd
+ import re
+ import time
+ import os
+ import numpy as np
+ from sklearn.cluster import AgglomerativeClustering
+ import base64
+
+ from pytube import YouTube
+ import torch
+ import pyannote.audio
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+ from pyannote.audio import Audio
+ from pyannote.core import Segment
+
+ from gpuinfo import GPUInfo
+
+ import wave
+ import contextlib
+ from transformers import pipeline
+ import psutil
+
+ # os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
+ # os.system('make -C ./whisper.cpp')
+ # os.system('bash ./whisper.cpp/models/download-ggml-model.sh base')
+
+
+ whisper_models = ["base", "small", "medium", "large"]
+ source_languages = {
+     "en": "English",
+     "zh": "Chinese",
+     "de": "German",
+     "es": "Spanish",
+     "ru": "Russian",
+     "ko": "Korean",
+     "fr": "French",
+     "ja": "Japanese",
+     "pt": "Portuguese",
+     "tr": "Turkish",
+     "pl": "Polish",
+     "ca": "Catalan",
+     "nl": "Dutch",
+     "ar": "Arabic",
+     "sv": "Swedish",
+     "it": "Italian",
+     "id": "Indonesian",
+     "hi": "Hindi",
+     "fi": "Finnish",
+     "vi": "Vietnamese",
+     "he": "Hebrew",
+     "uk": "Ukrainian",
+     "el": "Greek",
+     "ms": "Malay",
+     "cs": "Czech",
+     "ro": "Romanian",
+     "da": "Danish",
+     "hu": "Hungarian",
+     "ta": "Tamil",
+     "no": "Norwegian",
+     "th": "Thai",
+     "ur": "Urdu",
+     "hr": "Croatian",
+     "bg": "Bulgarian",
+     "lt": "Lithuanian",
+     "la": "Latin",
+     "mi": "Maori",
+     "ml": "Malayalam",
+     "cy": "Welsh",
+     "sk": "Slovak",
+     "te": "Telugu",
+     "fa": "Persian",
+     "lv": "Latvian",
+     "bn": "Bengali",
+     "sr": "Serbian",
+     "az": "Azerbaijani",
+     "sl": "Slovenian",
+     "kn": "Kannada",
+     "et": "Estonian",
+     "mk": "Macedonian",
+     "br": "Breton",
+     "eu": "Basque",
+     "is": "Icelandic",
+     "hy": "Armenian",
+     "ne": "Nepali",
+     "mn": "Mongolian",
+     "bs": "Bosnian",
+     "kk": "Kazakh",
+     "sq": "Albanian",
+     "sw": "Swahili",
+     "gl": "Galician",
+     "mr": "Marathi",
+     "pa": "Punjabi",
+     "si": "Sinhala",
+     "km": "Khmer",
+     "sn": "Shona",
+     "yo": "Yoruba",
+     "so": "Somali",
+     "af": "Afrikaans",
+     "oc": "Occitan",
+     "ka": "Georgian",
+     "be": "Belarusian",
+     "tg": "Tajik",
+     "sd": "Sindhi",
+     "gu": "Gujarati",
+     "am": "Amharic",
+     "yi": "Yiddish",
+     "lo": "Lao",
+     "uz": "Uzbek",
+     "fo": "Faroese",
+     "ht": "Haitian creole",
+     "ps": "Pashto",
+     "tk": "Turkmen",
+     "nn": "Nynorsk",
+     "mt": "Maltese",
+     "sa": "Sanskrit",
+     "lb": "Luxembourgish",
+     "my": "Myanmar",
+     "bo": "Tibetan",
+     "tl": "Tagalog",
+     "mg": "Malagasy",
+     "as": "Assamese",
+     "tt": "Tatar",
+     "haw": "Hawaiian",
+     "ln": "Lingala",
+     "ha": "Hausa",
+     "ba": "Bashkir",
+     "jw": "Javanese",
+     "su": "Sundanese",
+ }
+
+ source_language_list = list(source_languages.keys())
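+ # The dictionary keys above are the short language codes Whisper expects; source_language_list
+ # simply collects them to populate the "Spoken language in video" dropdown defined further down.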
+
+ # MODEL_NAME = "vumichien/whisper-medium-jp"
+ # lang = "ja"
+
+ # device = 0 if torch.cuda.is_available() else "cpu"
+ # pipe = pipeline(
+ #     task="automatic-speech-recognition",
+ #     model=MODEL_NAME,
+ #     chunk_length_s=30,
+ #     device=device,
+ # )
+ os.makedirs('output', exist_ok=True)
+ # pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
+
+
+ # NOTE: transcribe() and yt_transcribe() depend on the `pipe` defined in the commented-out
+ # block above; they are only referenced by the commented-out demo tabs at the bottom of this file.
+ def transcribe(microphone, file_upload):
+     warn_output = ""
+     if (microphone is not None) and (file_upload is not None):
+         warn_output = (
+             "WARNING: You've uploaded an audio file and used the microphone. "
+             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+         )
+
+     elif (microphone is None) and (file_upload is None):
+         return "ERROR: You have to either use the microphone or upload an audio file"
+
+     file = microphone if microphone is not None else file_upload
+
+     text = pipe(file)["text"]
+
+     return warn_output + text
+
+ def _return_yt_html_embed(yt_url):
+     video_id = yt_url.split("?v=")[-1]
+     HTML_str = (
+         f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
+         " </center>"
+     )
+     return HTML_str
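+ # For example, a URL such as "https://www.youtube.com/watch?v=guEyxTpevFo" yields an iframe
+ # pointing at https://www.youtube.com/embed/guEyxTpevFo (everything after "?v=" is used as the id).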
+
+ def yt_transcribe(yt_url):
+     yt = YouTube(yt_url)
+     html_embed_str = _return_yt_html_embed(yt_url)
+     stream = yt.streams.filter(only_audio=True)[0]
+     stream.download(filename="audio.mp3")
+
+     text = pipe("audio.mp3")["text"]
+
+     return html_embed_str, text
+
+ # format a float as "00"."000"
+ format_float = lambda x: '{:.3f}'.format(x)
+
+ def convert_time(secs):
+     td = str(datetime.timedelta(seconds=secs))
+     h, m, s = re.split(':', td)
+     # format the seconds as 2 digits before the decimal point and 3 after
+     time = str(h) + ':' + str(m) + ':' + '{:06.3f}'.format(float(s))
+     # print(time)
+     return time
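+ # Rough example of the timestamp format produced here: convert_time(3.5) -> '0:00:03.500'.
+ # Note that the hour field is not zero-padded to two digits, which very strict SRT parsers may reject.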
+
+ def get_youtube(video_url):
+     yt = YouTube(video_url)
+     abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
+     print("Successfully downloaded the video")
+     print(abs_video_path)
+     return abs_video_path
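+ # e.g. get_youtube("https://www.youtube.com/watch?v=guEyxTpevFo") downloads the
+ # highest-resolution progressive mp4 stream and returns its local file path.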
+
+ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers, device):
+     """
+     # Transcribe a YouTube video using OpenAI Whisper
+     1. Use OpenAI's Whisper model to separate the audio into segments and generate transcripts.
+     2. Generate a speaker embedding for each segment.
+     3. Apply agglomerative clustering on the embeddings to identify the speaker of each segment.
+
+     Speech recognition is based on models from OpenAI Whisper https://github.com/openai/whisper
+     Speaker diarization model and pipeline from https://github.com/pyannote/pyannote-audio
+     """
+
+     if device == "gpu" and torch.cuda.is_available():
+         device = torch.device("cuda")
+     elif device == "cpu":
+         device = torch.device("cpu")
+     elif device == "gpu" and not torch.cuda.is_available():
+         raise ValueError("Error: GPU not available")
+
+     print("device is ", device)
+
+     embedding_model = PretrainedSpeakerEmbedding(
+         "speechbrain/spkrec-ecapa-voxceleb",
+         device=device)
+
+     model = whisper.load_model(whisper_model, device=device)
+     time_start = time.time()
+     if video_file_path is None:
+         raise ValueError("Error: no video input")
+     print(video_file_path)
+
+     try:
+         # Read the video and convert it to 16 kHz mono wav
+         _, file_ending = os.path.splitext(f'{video_file_path}')
+         print(f'file ending is {file_ending}')
+         audio_file = video_file_path.replace(file_ending, ".wav")
+         print("starting conversion to wav")
+         os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')
+
+         # Get duration
+         with contextlib.closing(wave.open(audio_file, 'r')) as f:
+             frames = f.getnframes()
+             rate = f.getframerate()
+             duration = frames / float(rate)
+         print(f"conversion to wav ready, duration of audio file: {duration}")
+
+         # Transcribe audio
+         options = dict(language=selected_source_lang, beam_size=5, best_of=5)
+         transcribe_options = dict(task="transcribe", **options)
+         result = model.transcribe(audio_file, **transcribe_options)
+         segments = result["segments"]
+         print("done with whisper transcription")
+         print(segments[0])
+     except Exception as e:
+         raise RuntimeError("Error converting video to audio or transcribing", e)
+
+     try:
+         # Create an embedding for every transcript segment
+         def segment_embedding(segment):
+             audio = Audio()
+             start = segment["start"]
+             # Whisper overshoots the end timestamp in the last segment
+             end = min(duration, segment["end"])
+             clip = Segment(start, end)
+             waveform, sample_rate = audio.crop(audio_file, clip)
+             return embedding_model(waveform[None])
+
+         embeddings = np.zeros(shape=(len(segments), 192))  # 192 = ECAPA-TDNN embedding dimension
+         for i, segment in enumerate(segments):
+             embeddings[i] = segment_embedding(segment)
+         embeddings = np.nan_to_num(embeddings)
+         print(f'Embedding shape: {embeddings.shape}')
+
+         # Assign speaker labels by clustering the embeddings into num_speakers groups
+         clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+         labels = clustering.labels_
+         for i in range(len(segments)):
+             segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+         # Make output
+         objects = {
+             'Start': [],
+             'End': [],
+             'Speaker': [],
+             'Text': []
+         }
+         text = ''
+         for (i, segment) in enumerate(segments):
+             objects['Start'].append(convert_time(segment["start"]))
+             objects['Speaker'].append(segment["speaker"])
+             text += segment["text"] + ' '
+             objects['End'].append(str(convert_time(segment["end"])))
+             objects['Text'].append(text)
+             text = ''
+         # for (i, segment) in enumerate(segments):
+         #     if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+         #         objects['Start'].append(str(convert_time(segment["start"])))
+         #         objects['Speaker'].append(segment["speaker"])
+         #         if i != 0:
+         #             objects['End'].append(str(convert_time(segments[i - 1]["end"])))
+         #             objects['Text'].append(text)
+         #             text = ''
+         #     text += segment["text"] + ' '
+         # objects['End'].append(str(convert_time(segments[i - 1]["end"])))
+         # objects['Text'].append(text)
+
+         time_end = time.time()
+         time_diff = time_end - time_start
+         memory = psutil.virtual_memory()
+         gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
+         gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
+         gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
+         system_info = f"""
+         *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+         *Processing time: {time_diff:.5} seconds.*
+         *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
+         """
+         save_path = "output/transcript_result.csv"
+         df_results = pd.DataFrame(objects)
+         df_results.to_csv(save_path)
+
+         # Save an .srt file from df_results (SRT expects a comma as the decimal separator)
+         srt_file = "output/subtitles.srt"
+         def to_srt(df, srt_file):
+             with open(srt_file, 'w') as f:
+                 for i, row in df.iterrows():
+                     f.write(f'{i + 1}\n')
+                     f.write(f'{row["Start"].replace(".", ",")} --> {row["End"].replace(".", ",")}\n')
+                     f.write(f'{row["Speaker"]} : {row["Text"]}\n\n')
+         to_srt(df_results, srt_file)
+
+         # Save a WebVTT file as well (VTT keeps the "." decimal separator)
+         def to_vtt(df, vtt_file):
+             with open(vtt_file, 'w') as f:
+                 f.write("WEBVTT\n\n")
+                 for i, row in df.iterrows():
+                     f.write(f'{i + 1}\n')
+                     f.write(f'{row["Start"]} --> {row["End"]}\n')
+                     f.write(f'{row["Speaker"]} : {row["Text"]}\n\n')
+         vtt_file = "output/subtitles.vtt"
+         to_vtt(df_results, vtt_file)
+
+         return df_results, system_info, save_path, srt_file, vtt_file
+
+     except Exception as e:
+         raise RuntimeError("Error running inference with local model", e)
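+ # Rough usage sketch outside the UI (hypothetical file name):
+ #   df, info, csv_path, srt_path, vtt_path = speech_to_text(
+ #       "meeting.mp4", selected_source_lang="en", whisper_model="base",
+ #       num_speakers=2, device="cpu")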
+
+ def create_video_player(subtitle_files, video_in):
+     # NOTE: the subtitle_files argument is currently unused; the player always loads the
+     # hard-coded 'output/subtitles.vtt' written by speech_to_text().
+     with open(video_in, "rb") as file:
+         video_base64 = base64.b64encode(file.read())
+     with open('output/subtitles.vtt', "rb") as file:
+         subtitle_base64 = base64.b64encode(file.read())
+
+     video_player = f'''<video id="video" controls preload="metadata">
+         <source src="data:video/mp4;base64,{str(video_base64)[2:-1]}" type="video/mp4" />
+         <track
+             label="English"
+             kind="subtitles"
+             srclang="en"
+             src="data:text/vtt;base64,{str(subtitle_base64)[2:-1]}"
+             default />
+     </video>
+     '''
+     # video_player = gr.HTML(video_player)
+     return video_player
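+ # Design note: embedding the video and VTT track as base64 data URIs keeps playback fully
+ # client-side in the returned HTML, but the payload grows quickly for long videos.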
+
+ # def create_video_player(subtitle_file, video_in):
+ #     video_player = gr.Video(
+ #         label="Video File Test",
+ #         show_label=True,
+ #         interactive=True,
+ #         value="mp4/en.mp4",
+ #         caption="tmp/en.vtt",
+ #     )
+ # def add_subtitles(video_in, subtitle_file):
+ #     video_player = gr.Video(
+ #         label="Video File Test",
+ #         show_label=True,
+ #         interactive=True,
+ #         value=video_in,
+ #         caption=subtitle_file,
+ #     )
+ #     return video_player
+
+ # ---- Gradio Layout -----
+ # Inspiration from https://huggingface.co/spaces/RASMUS/Whisper-youtube-crosslingual-subtitles
+ video_in = gr.Video(label="Video file", mirror_webcam=False)
+ youtube_url_in = gr.Textbox(label="YouTube URL", lines=1, interactive=True)
+ video_out = gr.Video(label="Video Out", mirror_webcam=False)
+
+ df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
+ memory = psutil.virtual_memory()
+
+ selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en", label="Spoken language in video", interactive=True)
+ selected_device = gr.Dropdown(choices=['cpu', 'gpu'], type="value", value="cpu", label="Device on which to perform the computations", interactive=True)
+ selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
+
+ number_speakers = gr.Number(precision=0, value=2, label="Selected number of speakers", interactive=True)
+ system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
+ download_transcript = gr.File(label="Download transcript")
+ download_srt = gr.File(label="Download .srt file")
+ download_vtt = gr.File(label="Download .vtt file")
+ transcription_df = gr.DataFrame(value=df_init, label="Transcription dataframe", row_count=(0, "dynamic"), max_rows=10, wrap=True, overflow_row_behaviour='paginate')
+ title = "Whisper speaker diarization"
+ demo = gr.Blocks(title=title)
+ demo.encrypt = False
+ video_player = gr.HTML('<p>The video will be played here after you press the button at step 4.</p>')
+
+
+ with demo:
+     with gr.Tab("Whisper speaker diarization"):
+         gr.Markdown('''
+             <div>
+             <h1 style='text-align: center'>Whisper speaker diarization</h1>
+             This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> to recognize the speech and the ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and classify speakers.
+             It is based on https://huggingface.co/spaces/vumichien/Whisper_speaker_diarization
+             </div>
+             ''')
+
+         with gr.Row():
+             gr.Markdown('''
+                 ### Transcribe a YouTube link using OpenAI Whisper
+                 ##### 1. Use OpenAI's Whisper model to separate the audio into segments and generate transcripts.
+                 ##### 2. Generate speaker embeddings for each segment.
+                 ##### 3. Apply agglomerative clustering on the embeddings to identify the speaker of each segment.
+                 ''')
+
+         with gr.Row():
+             gr.Markdown('''
+                 ### You can test with the following examples:
+                 ''')
+         examples = gr.Examples(examples=
+             ["https://www.youtube.com/watch?v=guEyxTpevFo",
+              "https://www.youtube.com/watch?v=-UX0X45sYe4",
+              "https://www.youtube.com/watch?v=7minSgqi-Gw"],
+             label="Examples", inputs=[youtube_url_in])
+
+
+         with gr.Row():
+             with gr.Column():
+                 youtube_url_in.render()
+                 download_youtube_btn = gr.Button("Download YouTube video")
+                 download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])
+                 print(video_in)
+
+
+         with gr.Row():
+             with gr.Column():
+                 video_in.render()
+             with gr.Column():
+                 gr.Markdown('''
+                     ##### Here you can start the transcription process.
+                     ##### Please select the source language for transcription.
+                     ##### Selecting the correct number of speakers gives better results.
+                     ''')
+                 selected_device.render()
+                 selected_source_lang.render()
+                 selected_whisper_model.render()
+                 number_speakers.render()
+                 transcribe_btn = gr.Button("Transcribe audio and diarization")
+                 transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model, number_speakers, selected_device], [transcription_df, system_info, download_transcript, download_srt, download_vtt])
+
+
+         with gr.Row():
+             gr.Markdown('''
+                 ##### Here you will get the transcription output
+                 ##### ''')
+
+
+         with gr.Row():
+             with gr.Column():
+                 download_transcript.render()
+                 download_srt.render()
+                 download_vtt.render()
+                 transcription_df.render()
+                 system_info.render()
+                 # gr.Markdown('''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')
+
+         with gr.Row():
+             with gr.Column():
+                 gr.Markdown('''
+                     ##### Now press the Step 4 button to create the output video with the generated subtitles
+                     ##### ''')
+                 create_video_button = gr.Button("Step 4. Create and add subtitles to video")
+                 print(video_in)
+                 create_video_button.click(create_video_player, [download_srt, video_in], [video_player])
+                 video_player.render()
+
+
+     # with gr.Tab("Whisper Transcribe Japanese Audio"):
+     #     gr.Markdown(f'''
+     #         <div>
+     #         <h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
+     #         </div>
+     #         Transcribe long-form microphone or audio inputs with the click of a button! The fine-tuned
+     #         checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> is used to transcribe audio files of arbitrary length.
+     #     ''')
+     #     microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
+     #     upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
+     #     transcribe_btn = gr.Button("Transcribe Audio")
+     #     text_output = gr.Textbox()
+     #     with gr.Row():
+     #         gr.Markdown('''
+     #             ### You can test with the following examples:
+     #             ''')
+     #     examples = gr.Examples(examples=
+     #         ["sample1.wav",
+     #          "sample2.wav",
+     #         ],
+     #         label="Examples", inputs=[upload])
+     #     transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
+
+     # with gr.Tab("Whisper Transcribe Japanese YouTube"):
+     #     gr.Markdown(f'''
+     #         <div>
+     #         <h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
+     #         </div>
+     #         Transcribe long-form YouTube videos with the click of a button! The fine-tuned checkpoint
+     #         <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> is used to transcribe audio files of arbitrary length.
+     #     ''')
+     #     youtube_link = gr.Textbox(label="YouTube URL", lines=1, interactive=True)
+     #     yt_transcribe_btn = gr.Button("Transcribe YouTube")
+     #     text_output2 = gr.Textbox()
+     #     html_output = gr.Markdown()
+     #     yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])
+
+ demo.launch(debug=True)
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,20 @@
+ git+https://github.com/huggingface/transformers
+ git+https://github.com/pyannote/pyannote-audio
+ git+https://github.com/openai/whisper.git
+ gradio==3.12
+ ffmpeg-python
+ pandas==1.5.0
+ pytube==12.1.0
+ sacremoses
+ sentencepiece
+ tokenizers
+ torch
+ torchaudio
+ tqdm==4.64.1
+ EasyNMT==2.0.2
+ nltk
+ transformers
+ pysrt
+ psutil==5.9.2
+ requests
+ gpuinfo