whisper #14
by AdinEnvironment - opened

- app.py +35 -67
- requirements.txt +7 -9
app.py
CHANGED
@@ -1,5 +1,4 @@
-
-from faster_whisper import WhisperModel
+import whisper
 import datetime
 import subprocess
 import gradio as gr
@@ -13,7 +12,6 @@ from sklearn.cluster import AgglomerativeClustering
 from sklearn.metrics import silhouette_score

 from pytube import YouTube
-import yt_dlp
 import torch
 import pyannote.audio
 from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
@@ -27,7 +25,7 @@ import contextlib
 from transformers import pipeline
 import psutil

-whisper_models = ["
+whisper_models = ["base", "small", "medium", "large"]
 source_languages = {
     "en": "English",
     "zh": "Chinese",
@@ -175,48 +173,26 @@ def _return_yt_html_embed(yt_url):
     return HTML_str

 def yt_transcribe(yt_url):
-
-
-
-
-
-    ydl_opts = {
-        'format': 'bestvideo*+bestaudio/best',
-        'postprocessors': [{
-            'key': 'FFmpegExtractAudio',
-            'preferredcodec': 'mp3',
-            'preferredquality': '192',
-        }],
-        'outtmpl':'audio.%(ext)s',
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        ydl.download([yt_url])
-
+    yt = YouTube(yt_url)
+    html_embed_str = _return_yt_html_embed(yt_url)
+    stream = yt.streams.filter(only_audio=True)[0]
+    stream.download(filename="audio.mp3")
+
     text = pipe("audio.mp3")["text"]
+
     return html_embed_str, text

 def convert_time(secs):
     return datetime.timedelta(seconds=round(secs))

 def get_youtube(video_url):
-
-
-
-    ydl_opts = {
-        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        info = ydl.extract_info(video_url, download=False)
-        abs_video_path = ydl.prepare_filename(info)
-        ydl.process_info(info)
-
+    yt = YouTube(video_url)
+    abs_video_path = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download()
     print("Success download video")
     print(abs_video_path)
     return abs_video_path

-def speech_to_text(video_file_path, selected_source_lang, whisper_model,
+def speech_to_text(video_file_path, selected_source_lang, whisper_model, min_num_speakers, max_number_speakers):
     """
     # Transcribe youtube link using OpenAI Whisper
     1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
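The hunk above swaps the yt_dlp download paths for pytube calls. As a minimal standalone sketch of those calls (the URL is a placeholder, not part of the PR):

# Sketch of the pytube usage this hunk switches to; the URL is illustrative only.
from pytube import YouTube

url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # placeholder URL
yt = YouTube(url)

# Audio-only path, as in yt_transcribe: take the first audio stream and save it as audio.mp3.
audio_stream = yt.streams.filter(only_audio=True)[0]
audio_stream.download(filename="audio.mp3")

# Progressive-mp4 path, as in get_youtube: highest-resolution stream with audio and video muxed.
video_path = (yt.streams
                .filter(progressive=True, file_extension='mp4')
                .order_by('resolution')
                .desc()
                .first()
                .download())
print(video_path)

Worth noting in review: pytube does not transcode, so "audio.mp3" keeps the stream's original codec; ffmpeg-backed consumers such as the transformers pipeline typically still decode it.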
@@ -227,9 +203,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
     Speaker diarization model and pipeline from by https://github.com/pyannote/pyannote-audio
     """

-
-    # model = WhisperModel(whisper_model, device="cuda", compute_type="int8_float16")
-    model = WhisperModel(whisper_model, compute_type="int8")
+    model = whisper.load_model(whisper_model)
     time_start = time.time()
     if(video_file_path == None):
         raise ValueError("Error no video input")
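For context, a short sketch of the two loading styles this hunk swaps between; the model size "base" is only an example:

# Removed path: faster-whisper (CTranslate2 backend) with int8 compute type.
# from faster_whisper import WhisperModel
# model = WhisperModel("base", compute_type="int8")

# Restored path: the original openai-whisper package.
import whisper
model = whisper.load_model("base")  # downloads the checkpoint on first use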
@@ -253,19 +227,9 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
         # Transcribe audio
         options = dict(language=selected_source_lang, beam_size=5, best_of=5)
         transcribe_options = dict(task="transcribe", **options)
-
-
-
-        segments = []
-        i = 0
-        for segment_chunk in segments_raw:
-            chunk = {}
-            chunk["start"] = segment_chunk.start
-            chunk["end"] = segment_chunk.end
-            chunk["text"] = segment_chunk.text
-            segments.append(chunk)
-            i += 1
-        print("transcribe audio done with fast whisper")
+        result = model.transcribe(audio_file, **transcribe_options)
+        segments = result["segments"]
+        print("starting whisper done with whisper")
     except Exception as e:
         raise RuntimeError("Error converting video to audio")

@@ -286,19 +250,22 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
     embeddings = np.nan_to_num(embeddings)
     print(f'Embedding shape: {embeddings.shape}')

-    if num_speakers == 0:
     # Find the best number of speakers
-
-
-
-            clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-            score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
-            score_num_speakers[num_speakers] = score
-        best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
-        print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
+    if min_num_speakers > max_number_speakers:
+        min_speakers = max_number_speakers
+        max_speakers = min_num_speakers
     else:
-
-
+        min_speakers = min_num_speakers
+        max_speakers = max_number_speakers
+    score_num_speakers = {}
+
+    for num_speakers in range(min_speakers, max_speakers+1):
+        clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+        score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
+        score_num_speakers[num_speakers] = score
+    best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
+    print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
+
     # Assign speaker label
     clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
     labels = clustering.labels_
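The core of this hunk is a silhouette-score search over a user-supplied speaker range instead of the old fixed/zero num_speakers logic. A self-contained sketch of that selection loop; the helper name and the random embeddings are illustrative, not part of the PR:

# Sketch of the speaker-count search added by this hunk.
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

def pick_num_speakers(embeddings: np.ndarray, min_speakers: int, max_speakers: int) -> int:
    if min_speakers > max_speakers:                      # same swap guard as in the diff
        min_speakers, max_speakers = max_speakers, min_speakers
    scores = {}
    for k in range(min_speakers, max_speakers + 1):
        labels = AgglomerativeClustering(k).fit(embeddings).labels_
        scores[k] = silhouette_score(embeddings, labels, metric='euclidean')
    return max(scores, key=scores.get)

# Example: 20 fake 192-dim segment embeddings, searching between 2 and 4 speakers.
fake_embeddings = np.random.rand(20, 192)
print(pick_num_speakers(fake_embeddings, 2, 4))

One thing worth checking in review: silhouette_score raises for a single cluster, so a minimum of 1 speaker would need special-casing.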
@@ -353,7 +320,8 @@ df_init = pd.DataFrame(columns=['Start', 'End', 'Speaker', 'Text'])
 memory = psutil.virtual_memory()
 selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="en", label="Spoken language in video", interactive=True)
 selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
-
+input_min_number_speakers = gr.Number(precision=0, value=2, label="Select minimum number of speakers", interactive=True)
+input_max_number_speakers = gr.Number(precision=0, value=2, label="Select maximum number of speakers", interactive=True)
 system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
 download_transcript = gr.File(label="Download transcript")
 transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
@@ -367,8 +335,7 @@ with demo:
             gr.Markdown('''
                 <div>
                 <h1 style='text-align: center'>Whisper speaker diarization</h1>
-                This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a>
-                and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
+                This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> to recoginze the speech and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers</h2>
                 </div>
                 ''')

@@ -411,10 +378,11 @@ with demo:
                 ''')
                 selected_source_lang.render()
                 selected_whisper_model.render()
-
+                input_min_number_speakers.render()
+                input_max_number_speakers.render()
                 transcribe_btn = gr.Button("Transcribe audio and diarization")
                 transcribe_btn.click(speech_to_text,
-                        [video_in, selected_source_lang, selected_whisper_model,
+                        [video_in, selected_source_lang, selected_whisper_model, input_min_number_speakers, input_max_number_speakers],
                         [transcription_df, system_info, download_transcript]
                 )

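Taken together, the three UI hunks add two integer inputs and pass them through to speech_to_text. A minimal sketch of that wiring in isolation; the stub callback, the layout, and the assumption that video_in is a gr.Video are placeholders, only the component arguments and the click signature mirror the diff:

# Sketch of the new gr.Number inputs feeding the click handler.
import gradio as gr

def speech_to_text(video, lang, model_size, min_speakers, max_speakers):
    # stub standing in for the real transcription + diarization function
    return f"{lang}/{model_size}: diarize {video} with {min_speakers}-{max_speakers} speakers"

with gr.Blocks() as demo:
    video_in = gr.Video(label="Video file")
    selected_source_lang = gr.Dropdown(choices=["en", "zh"], value="en", label="Spoken language in video")
    selected_whisper_model = gr.Dropdown(choices=["base", "small", "medium", "large"], value="base", label="Selected Whisper model")
    input_min_number_speakers = gr.Number(precision=0, value=2, label="Select minimum number of speakers")
    input_max_number_speakers = gr.Number(precision=0, value=2, label="Select maximum number of speakers")
    result = gr.Textbox(label="Result")
    gr.Button("Transcribe audio and diarization").click(
        speech_to_text,
        [video_in, selected_source_lang, selected_whisper_model, input_min_number_speakers, input_max_number_speakers],
        [result],
    )

# demo.launch()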
requirements.txt
CHANGED
@@ -1,22 +1,20 @@
 git+https://github.com/huggingface/transformers
 git+https://github.com/pyannote/pyannote-audio
 git+https://github.com/openai/whisper.git
-gradio
+gradio==3.12
 ffmpeg-python
-pandas
-pytube
+pandas==1.5.0
+pytube==12.1.0
 sacremoses
 sentencepiece
 tokenizers
 torch
 torchaudio
-tqdm
-EasyNMT
+tqdm==4.64.1
+EasyNMT==2.0.2
 nltk
 transformers
 pysrt
-psutil
+psutil==5.9.2
 requests
-gpuinfo
-faster-whisper
-yt-dlp
+gpuinfo