Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from whisperplus.utils.download_utils import download_and_convert_to_mp3 | |
| import logging | |
| import torch | |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
class SpeechToTextPipeline:
    """Convert audio to text using a pre-trained speech recognition model.

    The model is loaded once when the instance is created. The target device
    and dtype are resolved at that point: CUDA with ``float16`` when a GPU is
    visible to torch, otherwise CPU with ``float32``.

    Attributes set by this class:
        model: the loaded ``AutoModelForSpeechSeq2Seq`` instance.
        model_id: identifier of the checkpoint that ``model`` was loaded from.
        device: ``"cuda:0"`` or ``"cpu"``.
        torch_dtype: dtype used both for loading and for inference.
    """

    def __init__(self, model_id: str = "openai/whisper-large-v3"):
        """Resolve the device/dtype and load ``model_id``."""
        self.model = None
        self.model_id = None
        self.device, self.torch_dtype = self._select_device_and_dtype()
        self.load_model(model_id)

    @staticmethod
    def _select_device_and_dtype():
        """Return ``(device, dtype)``: CUDA/float16 if available, else CPU/float32."""
        if torch.cuda.is_available():
            return "cuda:0", torch.float16
        # Half precision is only selected together with CUDA; on CPU fall
        # back to full precision.
        return "cpu", torch.float32

    @staticmethod
    def _normalize_language(language):
        """Return ``language`` stripped and lower-cased; non-strings pass through."""
        if isinstance(language, str):
            return language.strip().lower()
        return language

    def load_model(self, model_id: str = "openai/whisper-large-v3"):
        """
        Loads the pre-trained speech recognition model and moves it to ``self.device``.

        Args:
            model_id (str): Identifier of the pre-trained model to be loaded.
        """
        # Allow load_model() to be used on an instance whose device/dtype
        # were never resolved (e.g. created without running __init__).
        if getattr(self, "device", None) is None or getattr(self, "torch_dtype", None) is None:
            self.device, self.torch_dtype = self._select_device_and_dtype()

        logging.info("Loading model...")
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
        model.to(self.device)
        logging.info("Model loaded successfully.")

        self.model = model
        # Remember which checkpoint is loaded so __call__ can fetch the
        # matching processor.
        self.model_id = model_id

    def __call__(self, audio_path: str, model_id: str = "openai/whisper-large-v3", language: str = "turkish"):
        """
        Converts audio to text using the model loaded on this instance.

        Args:
            audio_path (str): Path to the audio file to be transcribed.
            model_id (str): Kept for backward compatibility. Transcription always
                runs on the model already loaded on this instance, and the
                processor is loaded for that same checkpoint; a warning is
                logged when ``model_id`` names a different one.
            language (str): Language name forwarded to generation. It is
                lower-cased first, so "Turkish" and "turkish" are equivalent.

        Returns:
            str: Transcribed text from the audio.
        """
        # The tokenizer/feature extractor must belong to the checkpoint that
        # is actually loaded, not to whatever id the caller passed here.
        loaded_id = getattr(self, "model_id", None) or model_id
        if model_id != loaded_id:
            logging.warning(
                "Requested model %s differs from the loaded model %s; using the loaded model.",
                model_id, loaded_id)
        processor = AutoProcessor.from_pretrained(loaded_id)

        device = getattr(self, "device", None)
        on_cuda = str(device).startswith("cuda")

        pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            torch_dtype=getattr(self, "torch_dtype", None),
            chunk_length_s=30,
            max_new_tokens=128,
            batch_size=24,
            return_timestamps=True,
            device=device,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            # Only request flash-attention when running on CUDA.
            model_kwargs={"use_flash_attention_2": True} if on_cuda else {},
            generate_kwargs={"language": self._normalize_language(language)},
        )

        logging.info("Transcribing audio...")
        result = pipe(audio_path)["text"]
        return result
def youtube_url_to_text(url, model_id, language_choice):
    """
    Downloads a video as MP3, transcribes it with the selected speech-to-text
    model, and returns the transcript along with the downloaded file's path.

    Args:
        url (str): The URL of the video to download and convert.
        model_id (str): The ID of the speech-to-text model to use.
        language_choice (str): The language choice for the speech-to-text
            conversion. It is lower-cased before being forwarded, so the
            capitalised names used by the UI dropdown are accepted.

    Returns:
        transcript (str): The transcript of the speech-to-text conversion.
        video_path (str): The path of the downloaded file.

    Raises:
        ValueError: If ``url`` is empty, whitespace-only, or None.
    """
    # Fail fast with a clear message instead of handing an empty URL to the
    # downloader.
    if not url or not url.strip():
        raise ValueError("A YouTube URL is required.")

    language = language_choice.lower() if isinstance(language_choice, str) else language_choice

    video_path = download_and_convert_to_mp3(url.strip())
    # Named `speech_to_text` so the local does not shadow the `pipeline`
    # factory imported from transformers.
    speech_to_text = SpeechToTextPipeline(model_id)
    transcript = speech_to_text(audio_path=video_path, model_id=model_id, language=language)

    return transcript, video_path
def youtube_url_to_text_app():
    """Build the "Youtube URL to Text" panel and wire its button to the transcriber.

    Left column: URL box, language dropdown, Whisper model dropdown and the
    trigger button. Right column: transcript text box and audio player.
    Clicking the button calls ``youtube_url_to_text`` with
    (URL, model id, language) and routes its two return values to the outputs.
    """
    supported_languages = [
        "English",
        "Turkish",
        "Spanish",
        "French",
        "Chinese",
        "Japanese",
        "Korean",
    ]
    whisper_checkpoints = [
        "openai/whisper-large-v3",
        "openai/whisper-large",
        "openai/whisper-medium",
        "openai/whisper-base",
        "openai/whisper-small",
        "openai/whisper-tiny",
    ]

    with gr.Blocks():
        with gr.Row():
            # Inputs
            with gr.Column():
                url_box = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")
                language_dropdown = gr.Dropdown(
                    choices=supported_languages,
                    value="Turkish",
                    label="Language",
                )
                model_dropdown = gr.Dropdown(
                    choices=whisper_checkpoints,
                    value="openai/whisper-large-v3",
                    label="Whisper Model",
                )
                run_button = gr.Button(value="Generator")

            # Outputs
            with gr.Column():
                transcript_box = gr.Textbox(label="Output Text")
                audio_player = gr.Audio(label="Output Audio")

        # Input order must match youtube_url_to_text(url, model_id, language_choice).
        click_inputs = [url_box, model_dropdown, language_dropdown]
        click_outputs = [transcript_box, audio_player]
        run_button.click(
            fn=youtube_url_to_text,
            inputs=click_inputs,
            outputs=click_outputs,
        )
# Page header snippets rendered at the top of the app.
_TITLE_HTML = """
<h1 style='text-align: center'>
WhisperPlus: Advancing Speech-to-Text Processing 🚀
</h1>
"""

_LINKS_HTML = """
<h3 style='text-align: center'>
Follow me for more!
<a href='https://twitter.com/kadirnar_ai' target='_blank'>Twitter</a> | <a href='https://github.com/kadirnar' target='_blank'>Github</a> | <a href='https://www.linkedin.com/in/kadir-nar/' target='_blank'>Linkedin</a> | <a href='https://www.huggingface.co/kadirnar/' target='_blank'>HuggingFace</a>
</h3>
"""

# Top-level app: header, then a single tab hosting the YouTube transcription panel.
with gr.Blocks() as gradio_app:
    gr.HTML(_TITLE_HTML)
    gr.HTML(_LINKS_HTML)

    with gr.Row():
        with gr.Column():
            with gr.Tab(label="Youtube URL to Text"):
                youtube_url_to_text_app()

# Enable the request queue before starting the server.
gradio_app.queue()
gradio_app.launch(debug=True)