import subprocess
import sys

import gradio as gr
from whisperplus.pipelines.whisper import SpeechToTextPipeline
from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
from whisperplus.utils.download_utils import download_and_convert_to_mp3
from whisperplus.utils.text_utils import format_speech_to_dialogue


def install_package(package):
    """Install ``package`` with pip, passing ``--no-build-isolation``.

    pip is invoked as ``<current interpreter> -m pip`` so the package lands in
    the environment that is actually running this app, rather than whichever
    ``pip`` executable happens to be first on ``PATH``.

    Args:
        package (str): Requirement specifier handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", package, "--no-build-isolation"]
    )


# Then install flash-attn (runs at start-up, after the imports above).
install_package("flash-attn")


def youtube_url_to_text(url, model_id, language_choice):
    """Download the audio for ``url`` and transcribe it.

    The URL is passed to ``download_and_convert_to_mp3``; the resulting path
    is printed and then fed to a ``SpeechToTextPipeline`` built for
    ``model_id``.

    Args:
        url (str): The URL of the video to download and convert.
        model_id (str): The ID of the speech-to-text model to use.
        language_choice (str): The language passed to the pipeline as
            ``language``.

    Returns:
        tuple: ``(transcript, video_path)`` where ``transcript`` is whatever
        the pipeline returns and ``video_path`` is the path returned by
        ``download_and_convert_to_mp3``.
    """
    video_path = download_and_convert_to_mp3(url)
    pipeline = SpeechToTextPipeline(model_id)
    print(video_path)
    transcript = pipeline(audio_path=video_path, language=language_choice)

    return transcript, video_path


def speaker_diarization(url, model_id, num_speakers, min_speaker, max_speaker):
    """Download the audio for ``url`` and produce a speaker-labelled dialogue.

    An ``ASRDiarizationPipeline`` is created for ``model_id`` together with
    the ``pyannote/speaker-diarization`` diarizer (30 s chunks, on the
    ``cuda`` device), the URL is passed to ``download_and_convert_to_mp3``,
    and the pipeline output is formatted with ``format_speech_to_dialogue``.

    Args:
        url (str): The URL of the video to download and convert.
        model_id (str): The ID of the ASR model to use.
        num_speakers: Forwarded to the pipeline as ``num_speakers``.
        min_speaker: Forwarded to the pipeline as ``min_speaker``.
        max_speaker: Forwarded to the pipeline as ``max_speaker``.

    Returns:
        tuple: ``(dialogue, audio_path)`` where ``dialogue`` is the result of
        ``format_speech_to_dialogue`` and ``audio_path`` is the path returned
        by ``download_and_convert_to_mp3``.
    """
    pipeline = ASRDiarizationPipeline.from_pretrained(
        asr_model=model_id,
        diarizer_model="pyannote/speaker-diarization",
        chunk_length_s=30,
        device="cuda",
    )

    audio_path = download_and_convert_to_mp3(url)
    output_text = pipeline(
        audio_path,
        num_speakers=num_speakers,
        min_speaker=min_speaker,
        max_speaker=max_speaker,
    )
    dialogue = format_speech_to_dialogue(output_text)
    return dialogue, audio_path


def youtube_url_to_text_app():
    """Build the "Youtube URL to Text" UI inside the current Gradio context.

    Lays out the URL, language and model inputs next to the text/audio
    outputs, wires the button to ``youtube_url_to_text`` and registers one
    cached example.
    """
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")

                language_choice = gr.Dropdown(
                    choices=[
                        "English",
                        "Turkish",
                        "Spanish",
                        "French",
                        "Chinese",
                        "Japanese",
                        "Korean",
                    ],
                    value="Turkish",
                    label="Language",
                )
                whisper_model_id = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "openai/whisper-large",
                        "openai/whisper-medium",
                        "openai/whisper-base",
                        "openai/whisper-small",
                        "openai/whisper-tiny",
                        # Needed by the example below: an example value that
                        # is not one of the choices cannot be selected.
                        "distil-whisper/distil-large-v3",
                    ],
                    value="openai/whisper-large-v3",
                    label="Whisper Model",
                )
                whisperplus_in_predict = gr.Button(value="Generator")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_audio = gr.Audio(label="Output Audio")

        whisperplus_in_predict.click(
            fn=youtube_url_to_text,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                language_choice,
            ],
            outputs=[output_text, output_audio],
        )
        gr.Examples(
            examples=[
                [
                    "https://www.youtube.com/watch?v=di3rHkEZuUw",
                    "distil-whisper/distil-large-v3",
                    "English",
                ],
            ],
            fn=youtube_url_to_text,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                language_choice,
            ],
            outputs=[output_text, output_audio],
            cache_examples=True,
        )


def speaker_diarization_app():
    """Build the "Speaker Diarization" UI inside the current Gradio context.

    Lays out the URL, model and speaker-count inputs next to the text/audio
    outputs, wires the button to ``speaker_diarization`` and registers one
    (uncached) example.
    """
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")

                whisper_model_id = gr.Dropdown(
                    choices=[
                        "openai/whisper-large-v3",
                        "distil-whisper/distil-large-v3",
                        "distil-whisper/distil-large-v2",
                    ],
                    value="distil-whisper/distil-large-v3",
                    label="Whisper Model",
                )
                # Speaker counts are whole numbers; precision=0 makes the
                # component hand integers (not floats such as 2.0) to the
                # callback.
                num_speakers = gr.Number(value=2, precision=0, label="Number of Speakers")
                min_speaker = gr.Number(
                    value=1, precision=0, label="Minimum Number of Speakers"
                )
                max_speaker = gr.Number(
                    value=2, precision=0, label="Maximum Number of Speakers"
                )
                whisperplus_in_predict = gr.Button(value="Generator")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_audio = gr.Audio(label="Output Audio")

        whisperplus_in_predict.click(
            fn=speaker_diarization,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                num_speakers,
                min_speaker,
                max_speaker,
            ],
            outputs=[output_text, output_audio],
        )
        gr.Examples(
            examples=[
                [
                    "https://www.youtube.com/shorts/o8PgLUgte2k",
                    "distil-whisper/distil-large-v3",
                    2,
                    1,
                    2,
                ],
            ],
            fn=speaker_diarization,
            inputs=[
                youtube_url_path,
                whisper_model_id,
                num_speakers,
                min_speaker,
                max_speaker,
            ],
            outputs=[output_text, output_audio],
            cache_examples=False,
        )


# Top-level application: a header, a links banner, and one tab per workflow.
gradio_app = gr.Blocks()
with gradio_app:
    gr.HTML(
        """

WhisperPlus: Advancing Speech-to-Text Processing 🚀

""")
    gr.HTML(
        """

Follow me for more! Twitter | Github | Linkedin | HuggingFace

""")
    with gr.Row():
        with gr.Column():
            with gr.Tab(label="Youtube URL to Text"):
                youtube_url_to_text_app()
            with gr.Tab(label="Speaker Diarization"):
                speaker_diarization_app()

gradio_app.launch(debug=True)