import time

import gradio as gr
import whisper

# The imports below are only needed for the commented-out transformers/datasets examples kept for reference.
#from transformers import pipeline
#from transformers import WhisperProcessor, WhisperForConditionalGeneration
#from datasets import load_dataset
#p = pipeline("automatic-speech-recognition",model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish")
# Alternative: drive the same checkpoint through transformers (kept for reference)
#processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
#model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
#model.config.forced_decoder_ids = None

# Load the multilingual Whisper "large" model once at startup (large download on first use)
model = whisper.load_model("large")
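# Note: if "large" is too heavy for the target machine, openai-whisper also ships the smaller
# "tiny", "base", "small" and "medium" checkpoints (the same names offered in the commented-out
# model-size dropdown further below); swapping the string here is enough to switch.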
# load dummy dataset and read audio files
#ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
#sample = ds[0]["audio"]
#input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
# generate token ids
#predicted_ids = model.generate(input_features)
# decode token ids to text
#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
#['<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.<|endoftext|>']
#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
#[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']
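# The reference snippet above shows the equivalent transformers workflow: encode the audio with
# WhisperProcessor, generate token ids, then decode them; skip_special_tokens=True drops the
# <|startoftranscript|>/<|en|>/<|transcribe|> markers from the decoded text.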
#def speech_to_text(tmp_filename, model_size):
# model = whisper.load_model(model_size)
# result = model.transcribe(tmp_filename)
#
# return result["text"]
#gr.Interface(
# fn=speech_to_text,
# inputs=[
# gr.Audio(source="microphone", type="filepath"),
# gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"]),
# ],
# outputs="text").launch()
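# The commented-out Interface above is an earlier variant that let the user pick the Whisper
# model size per request; the live app below keeps a single "large" model loaded at startup.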
def transcribe(language, audio, state=""):
    # Transcribe the recorded clip with Whisper and append the text to the running transcript.
    time.sleep(1)
    state = ""
    if language == "Multi":
        result = model.transcribe(audio)  # let Whisper auto-detect the spoken language
    else:
        result = model.transcribe(audio, language="es")  # force Spanish decoding
    text = result["text"]
# if language=="Catalan":
# state=""
# text = pc(audio)["text"]
# if language=="English":
# state=""
# text = pe(audio)["text"]
# if language=="French":
# state=""
# text = pf(audio)["text"]
# if language=="Japanese":
# state=""
# text = pj(audio)["text"]
state += text + " "
    #text2="This is what I understood from you"
return state, state
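
# transcribe returns (state, state): the first value fills the visible "textbox" output and the
# second is fed back into the "state" input on the next submit, which is how Gradio keeps the
# running transcript between clicks.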
demo=gr.Interface(
fn=transcribe,
title="TEDCAS Offline Speech recognition",
    description="1) Select language 2) Click on 'Record from microphone' and talk 3) Click on 'Stop recording' 4) Click on 'Submit' 5) Before starting again, click on 'Clear'",
inputs=[
#gr.Dropdown(["Spanish","Catalan","English", "French", "Japanese"],value="Spanish"),
gr.Dropdown(["Multi","Spanish"],value="Multi"),
#gr.Audio(source="microphone", type="filepath", streaming=True),
        gr.Audio(source="microphone", type="filepath"),
"state"#,"language"
],
outputs=[
"textbox",
"state"
],
#live=True).launch()
)
demo.launch()
#demo.launch(auth=("TedCas", "Kike1234"))