import spaces
import gradio as gr
import numpy as np
import torch
from peft import PeftModel, PeftConfig
from transformers import (
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
    AutomaticSpeechRecognitionPipeline,
)

peft_model_id = "mfidabel/Modelo_3_Whisper_Large_V3"
language = "guarani"
task = "transcribe"

# Load the base Whisper model referenced by the PEFT adapter config, then
# merge the LoRA weights into it so inference runs on a single plain model.
peft_config = PeftConfig.from_pretrained(peft_model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path,
    load_in_8bit=False,
    device_map="cuda:0",
)
model = PeftModel.from_pretrained(model, peft_model_id)
model = model.merge_and_unload()

tokenizer = WhisperTokenizer.from_pretrained(
    peft_config.base_model_name_or_path, language=language, task=task
)
processor = WhisperProcessor.from_pretrained(
    peft_config.base_model_name_or_path, language=language, task=task
)
feature_extractor = processor.feature_extractor

# Whisper's tokenizer has no Guaraní language token, which is presumably why
# the decoder prompt is built with language="english"; the fine-tuned weights
# are what produce the Guaraní transcription.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task=task)

pipeline = AutomaticSpeechRecognitionPipeline(
    model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
)


@spaces.GPU
def transcribe(audio):
    if audio is None:
        return ("Espere a que la grabación termine de subirse al servidor. "
                "Inténtelo de nuevo en unos segundos.")

    sr, y = audio
    # Gradio delivers integer PCM samples; convert to float32 and normalize
    # to [-1, 1], guarding against an all-silence clip.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    with torch.autocast("cuda"):
        return pipeline(
            {"sampling_rate": sr, "raw": y},
            generate_kwargs={"forced_decoder_ids": forced_decoder_ids},
            max_new_tokens=255,
        )["text"]


examples = [
    "./examples/audio_1.mp3",
    "./examples/audio_2.mp3",
    "./examples/audio_3.mp3",
    "./examples/audio_4.mp3",
]

title = "# 🇵🇾 Reconocimiento de Voz en Guaraní"

description = """Esta es una demostración del reconocimiento de voz en Guaraní \
utilizando el modelo speech-to-text [Whisper](https://arxiv.org/pdf/2212.04356.pdf)

Autores:

- Mateo Andrés Fidabel Gill
- Santiago Ruben Acevedo Zarza
"""

audio_input = gr.Audio(
    value="./examples/audio_1.mp3",
    sources=["upload", "microphone"],
    label="🎤 Audio a transcribir",
    interactive=True,
)
transcription = gr.Textbox(label="📝 Transcripción", interactive=False)

with gr.Blocks() as demo:
    with gr.Row():
        # Model title and description
        gr.Markdown(title)
        gr.Markdown(description)

    with gr.Row():
        # Audio input
        audio_input.render()

    with gr.Row():
        # Text output
        transcription.render()

    with gr.Row():
        # Submit button
        submit = gr.Button("📝 Transcribir el Audio")

    with gr.Row():
        gr.Examples(
            examples=examples,
            inputs=[audio_input],
            outputs=[transcription],
            fn=transcribe,
            label="Ejemplos",
        )

    submit.click(transcribe, inputs=[audio_input], outputs=[transcription])

demo.queue()
demo.launch(share=True)
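
# ---------------------------------------------------------------------------
# Hedged usage sketch (commented out so it never runs inside the Space).
# It illustrates the input format `transcribe` expects: Gradio hands the
# function a (sample_rate, numpy_array) tuple. The 440 Hz sine wave below is
# a dummy stand-in, not real Guaraní speech, and is only meant as a local
# smoke test on a machine with a CUDA GPU.
#
#   sr = 16000
#   t = np.linspace(0, 1, sr, endpoint=False)
#   dummy = (sr, (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32))
#   print(transcribe(dummy))
# ---------------------------------------------------------------------------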