import os

os.system("pip install git+https://github.com/openai/whisper.git")

import gradio as gr
import whisper
from transformers import pipeline
import numpy as np

# Hugging Face ASR pipeline (multilingual Whisper base checkpoint) for microphone input.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base")
# Original OpenAI Whisper model for file-based transcription.
model = whisper.load_model("base")


def transcribe(audio):
    """Transcribe a (sample_rate, waveform) tuple coming from the microphone component."""
    sr, y = audio
    # Down-mix stereo recordings to mono and normalize to [-1, 1] before feeding the pipeline.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))
    return transcriber({"sampling_rate": sr, "raw": y})["text"]


def inference(audio):
    """Transcribe an uploaded audio file with the original Whisper package."""
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)  # Whisper decodes fixed 30-second windows
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)  # language probabilities (not used further)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text


file_interface = gr.Interface(
    fn=inference,
    inputs=gr.Audio(
        type="filepath",
        label="Supported formats: WAV, MP3, OGG, FLAC, AAC, M4A, WMA; mono and multi-channel audio are both accepted.",
    ),
    outputs="text",
)
mic_interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"]),
    outputs="text",
)

with gr.Blocks() as demo:
    gr.Markdown("Transcribe speech to text with Whisper using this demo.")
    with gr.Tab("Speech to Text"):
        file_interface.render()
    with gr.Tab("Real Time Speech Recognition"):
        mic_interface.render()

demo.launch()
# Two-tab version
# ################################################################################################################################################
import os

os.system("pip install git+https://github.com/openai/whisper.git")

import gradio as gr
import whisper

model = whisper.load_model("base")


def inference(audio):
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text


iface = gr.Interface(
    fn=inference,
    inputs=gr.Audio(
        type="filepath",
        label="Supported formats: WAV, MP3, OGG, FLAC, AAC, M4A, WMA; mono and multi-channel audio are both accepted.",
    ),
    outputs="text",
)
iface.launch()
# Single-page version
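
# ################################################################################################################################################
# Optional: a quick command-line check of inference() without launching the web UI.
# This is a minimal sketch; "sample.wav" is a hypothetical local file you would supply yourself,
# and whisper.load_audio requires ffmpeg to be installed on the system.
# Run it in place of iface.launch():
#
#     print(inference("sample.wav"))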