speechtotext / app.py
neuralleap's picture
Update app.py
# NOTE: the original author marks this script as the CPU variant.
import time

import googletrans
import gradio as gr
import whisper
from googletrans import Translator

# Whisper checkpoint used by transcribe() for speech recognition.
model = whisper.load_model("small")

# Shared googletrans client used by transcribe() to translate the text.
translator = Translator()

# googletrans language table split into two index-aligned lists:
# vals[i] is the language name offered in the dropdown and keys[i] is the
# matching entry key that gets passed to translate() as ``dest``.
lan = googletrans.LANGUAGES
keys = list(lan)
vals = list(lan.values())
def transcribe(lang, audio):
    """Transcribe a recording with Whisper and translate the result.

    Args:
        lang: Destination language name. It is lower-cased and then looked
            up in the module-level ``vals`` list.
        audio: Path of the recorded audio file.

    Returns:
        The translated text, or ``""`` when no language or no recording was
        supplied, or when the transcription came back blank.

    Raises:
        ValueError: If the lower-cased ``lang`` is not present in ``vals``.
    """
    # This handler is fired by the language dropdown, which can happen before
    # anything has been recorded; return early instead of crashing on None.
    if not lang or not audio:
        return ""

    # Resolve the destination first so an unknown language name fails before
    # any audio processing is done.
    dest = keys[vals.index(lang.lower())]

    # Load the audio and pad/trim it to fit 30 seconds.
    samples = whisper.pad_or_trim(whisper.load_audio(audio))

    # Make the log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(samples).to(model.device)

    # Decode the audio with half precision disabled.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    text = result.text
    if not text.strip():
        # Nothing was recognised, so there is nothing to translate.
        return ""

    return translator.translate(text, dest=dest).text
def clear(msg):
    """Return the empty string used to reset the output textbox.

    Args:
        msg: Current textbox content; ignored.

    Returns:
        Always ``""``.
    """
    blank = ""
    return blank
# Build the UI: a microphone recorder, a destination-language dropdown, an
# output textbox and a button that clears the textbox.
with gr.Blocks() as demo:
    # Not wired to any event in this block.
    state = gr.State(value="")
    audio = gr.Audio(
        label="press start record to speak",
        source="microphone",
        type="filepath",
    )
    dropdown = gr.Dropdown(
        label="first select the destination language",
        choices=vals,
    )
    msg = gr.Textbox()
    clearBTN = gr.Button("Clear")

    # Selecting a language passes the dropdown value and the recording to
    # transcribe() and writes its return value into the textbox.
    dropdown.select(transcribe, [dropdown, audio], outputs=[msg])

    # The Clear button replaces the textbox content with clear()'s result.
    clearBTN.click(clear, [msg], outputs=[msg])