import os

os.system("pip install git+https://github.com/openai/whisper.git")

import gradio as gr
import whisper
from transformers import pipeline
import numpy as np

# Hugging Face ASR pipeline (multilingual Whisper base checkpoint) for microphone input.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base")
# Original OpenAI Whisper model for file-based transcription.
model = whisper.load_model("base")


def transcribe(audio):
    """Transcribe a (sample_rate, waveform) tuple coming from the microphone component."""
    sr, y = audio
    # Down-mix stereo recordings to mono and normalize to [-1, 1] before feeding the pipeline.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))
    return transcriber({"sampling_rate": sr, "raw": y})["text"]


def inference(audio):
    """Transcribe an uploaded audio file with the original Whisper package."""
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)  # Whisper decodes fixed 30-second windows
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)  # language probabilities (not used further)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text


file_interface = gr.Interface(
    fn=inference,
    inputs=gr.Audio(
        type="filepath",
        label="Supported formats: WAV, MP3, OGG, FLAC, AAC, M4A, WMA; mono and multi-channel audio are both accepted.",
    ),
    outputs="text",
)
mic_interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"]),
    outputs="text",
)

with gr.Blocks() as demo:
    gr.Markdown("Transcribe speech to text with Whisper using this demo.")
    with gr.Tab("Speech to Text"):
        file_interface.render()
    with gr.Tab("Real Time Speech Recognition"):
        mic_interface.render()

demo.launch()
# Two-tab version
# ################################################################################################################################################
import os

os.system("pip install git+https://github.com/openai/whisper.git")

import gradio as gr
import whisper

model = whisper.load_model("base")


def inference(audio):
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text


iface = gr.Interface(
    fn=inference,
    inputs=gr.Audio(
        type="filepath",
        label="Supported formats: WAV, MP3, OGG, FLAC, AAC, M4A, WMA; mono and multi-channel audio are both accepted.",
    ),
    outputs="text",
)
iface.launch()
# Single-page version
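
# ################################################################################################################################################
# Optional: a quick command-line check of inference() without launching the web UI.
# This is a minimal sketch; "sample.wav" is a hypothetical local file you would supply yourself,
# and whisper.load_audio requires ffmpeg to be installed on the system.
# Run it in place of iface.launch():
#
#     print(inference("sample.wav"))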