Audio-WebUI / app.py
kadirnar's picture
Update app.py
4f7fe11 verified
import gradio as gr
from whisperplus.pipelines.whisper import SpeechToTextPipeline
from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
from whisperplus.utils.download_utils import download_and_convert_to_mp3
from whisperplus.utils.text_utils import format_speech_to_dialogue
import subprocess
def install_package(package):
    """Install a package with pip into the environment of the running interpreter.

    pip is invoked as ``<current python> -m pip`` rather than through whatever
    ``pip`` executable happens to be first on PATH, so the package always lands
    in the environment this process actually imports from.

    Args:
        package (str): Requirement specifier handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    import sys

    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", package, "--no-build-isolation"]
    )
# Install flash-attn when this module is loaded; install_package passes
# --no-build-isolation to pip for it.
# NOTE(review): presumably needed by the whisperplus pipelines at inference
# time — confirm.
install_package('flash-attn')
def youtube_url_to_text(url, model_id, language_choice):
    """Transcribe the audio of an online video.

    The video is fetched and converted to MP3 first, then a
    ``SpeechToTextPipeline`` is built for ``model_id`` and run on the
    resulting file. The MP3 path is printed to stdout along the way.

    Args:
        url (str): Address of the video to download.
        model_id (str): Identifier of the speech-to-text model to load.
        language_choice (str): Language passed to the pipeline as ``language``.

    Returns:
        tuple: ``(transcript, video_path)`` where ``transcript`` is whatever the
        pipeline returns and ``video_path`` is the path of the downloaded MP3.
    """
    mp3_path = download_and_convert_to_mp3(url)
    stt_pipeline = SpeechToTextPipeline(model_id)
    print(mp3_path)
    return stt_pipeline(audio_path=mp3_path, language=language_choice), mp3_path
def speaker_diarization(url, model_id, num_speakers, min_speaker, max_speaker):
    """Produce a speaker-attributed dialogue for the audio of an online video.

    An ``ASRDiarizationPipeline`` is created first (ASR model ``model_id``,
    diarizer ``pyannote/speaker-diarization``, 30-second chunks, device
    ``"cuda"``); only then is the video downloaded and converted to MP3. The
    pipeline output is turned into dialogue text with
    ``format_speech_to_dialogue``.

    Args:
        url (str): Address of the video to download.
        model_id (str): Identifier of the ASR model used by the pipeline.
        num_speakers: Forwarded unchanged to the pipeline call as ``num_speakers``.
        min_speaker: Forwarded unchanged to the pipeline call as ``min_speaker``.
        max_speaker: Forwarded unchanged to the pipeline call as ``max_speaker``.

    Returns:
        tuple: ``(dialogue, audio_path)`` where ``dialogue`` is the formatted
        pipeline output and ``audio_path`` is the path of the downloaded MP3.
    """
    pipeline_options = {
        "asr_model": model_id,
        "diarizer_model": "pyannote/speaker-diarization",
        "chunk_length_s": 30,
        "device": "cuda",
    }
    diarizer = ASRDiarizationPipeline.from_pretrained(**pipeline_options)

    mp3_path = download_and_convert_to_mp3(url)

    speaker_hints = {
        "num_speakers": num_speakers,
        "min_speaker": min_speaker,
        "max_speaker": max_speaker,
    }
    segments = diarizer(mp3_path, **speaker_hints)
    return format_speech_to_dialogue(segments), mp3_path
def youtube_url_to_text_app():
    """Build the "Youtube URL to Text" UI and wire it to ``youtube_url_to_text``.

    Layout: a left column with the URL box, the language and model dropdowns
    and the run button; a right column with the transcript text box and the
    audio player. One bundled example is registered with cached output.

    Returns:
        None. The components are created inside a ``gr.Blocks`` context.
    """
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")

                language_choice = gr.Dropdown(
                    choices=[
                        "English",
                        "Turkish",
                        "Spanish",
                        "French",
                        "Chinese",
                        "Japanese",
                        "Korean",
                    ],
                    value="Turkish",
                    label="Language",
                )

                whisper_model_id = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "openai/whisper-large",
                        "openai/whisper-medium",
                        "openai/whisper-base",
                        "openai/whisper-small",
                        "openai/whisper-tiny",
                        # The bundled example below selects this model, so it
                        # has to be a valid choice: dropdown values are checked
                        # against `choices` unless custom values are allowed.
                        "distil-whisper/distil-large-v3",
                    ],
                    value="openai/whisper-large-v3",
                    label="Whisper Model",
                )

                whisperplus_in_predict = gr.Button(value="Generator")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_audio = gr.Audio(label="Output Audio")

        # The button and the example share the same inputs and outputs.
        whisperplus_in_predict.click(
            fn=youtube_url_to_text,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                language_choice,
            ],
            outputs=[output_text, output_audio],
        )

        gr.Examples(
            examples=[
                [
                    "https://www.youtube.com/watch?v=di3rHkEZuUw",
                    "distil-whisper/distil-large-v3",
                    "English",
                ],
            ],
            fn=youtube_url_to_text,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                language_choice,
            ],
            outputs=[output_text, output_audio],
            # The example's output is computed by Gradio ahead of time and cached.
            cache_examples=True,
        )
def speaker_diarization_app():
    """Build the "Speaker Diarization" UI and wire it to ``speaker_diarization``.

    Layout: a left column with the URL box, the model dropdown, three
    speaker-count inputs and the run button; a right column with the dialogue
    text box and the audio player. One bundled example is registered without
    output caching.

    Returns:
        None. The components are created inside a ``gr.Blocks`` context.
    """
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")

                whisper_model_id = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "distil-whisper/distil-large-v3",
                        "distil-whisper/distil-large-v2",
                    ],
                    value="distil-whisper/distil-large-v3",
                    label="Whisper Model",
                )

                # Speaker counts are whole numbers: precision=0 makes Gradio
                # round the input and hand the callback an int, not a float.
                num_speakers = gr.Number(value=2, label="Number of Speakers", precision=0)
                min_speaker = gr.Number(value=1, label="Minimum Number of Speakers", precision=0)
                max_speaker = gr.Number(value=2, label="Maximum Number of Speakers", precision=0)

                whisperplus_in_predict = gr.Button(value="Generator")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_audio = gr.Audio(label="Output Audio")

        # The button and the example share the same inputs and outputs.
        whisperplus_in_predict.click(
            fn=speaker_diarization,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                num_speakers,
                min_speaker,
                max_speaker,
            ],
            outputs=[output_text, output_audio],
        )

        gr.Examples(
            examples=[
                [
                    "https://www.youtube.com/shorts/o8PgLUgte2k",
                    "distil-whisper/distil-large-v3",
                    2,
                    1,
                    2,
                ],
            ],
            fn=speaker_diarization,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                num_speakers,
                min_speaker,
                max_speaker,
            ],
            outputs=[output_text, output_audio],
            cache_examples=False,
        )
# Assemble the top-level page: a title, a row of profile links, and one tab per
# tool. The HTML payloads are spelled as adjacent string pieces so the markup
# can sit at the indentation of the surrounding code.
with gr.Blocks() as gradio_app:
    gr.HTML(
        "\n"
        "<h1 style='text-align: center'>\n"
        "WhisperPlus: Advancing Speech-to-Text Processing 🚀\n"
        "</h1>\n"
    )
    gr.HTML(
        "\n"
        "<h3 style='text-align: center'>\n"
        "Follow me for more!\n"
        "<a href='https://twitter.com/kadirnar_ai' target='_blank'>Twitter</a> | "
        "<a href='https://github.com/kadirnar' target='_blank'>Github</a> | "
        "<a href='https://www.linkedin.com/in/kadir-nar/' target='_blank'>Linkedin</a> | "
        "<a href='https://www.huggingface.co/kadirnar/' target='_blank'>HuggingFace</a>\n"
        "</h3>\n"
    )

    with gr.Row():
        with gr.Column():
            # Each helper builds its own widgets inside the tab it is called from.
            with gr.Tab(label="Youtube URL to Text"):
                youtube_url_to_text_app()
            with gr.Tab(label="Speaker Diarization"):
                speaker_diarization_app()

# Start the server once the whole layout has been declared.
gradio_app.launch(debug=True)