import time

import gradio as gr
import whisper

# Alternative ASR back-ends that were tried (kept for reference):
#from transformers import pipeline
#p = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish")
#
#from transformers import WhisperProcessor, WhisperForConditionalGeneration
#processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
#model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
#model.config.forced_decoder_ids = None

# Load the multilingual Whisper model once at start-up.
model = whisper.load_model("large")

# Earlier prototype: let the user pick the Whisper model size at run time.
#def speech_to_text(tmp_filename, model_size):
#    model = whisper.load_model(model_size)
#    result = model.transcribe(tmp_filename)
#    return result["text"]
#
#gr.Interface(
#    fn=speech_to_text,
#    inputs=[
#        gr.Audio(source="microphone", type="filepath"),
#        gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"]),
#    ],
#    outputs="text",
#).launch()


def transcribe(language, audio, state=""):
    """Transcribe the recorded audio file and append the text to the session state."""
    time.sleep(1)
    if language == "Multi":
        # In "Multi" mode Whisper auto-detects the language; start from a clean transcript.
        state = ""
    result = model.transcribe(audio)
    text = result["text"]
    # Language-specific pipelines that were tried earlier:
    #if language == "Catalan":
    #    state = ""
    #    text = pc(audio)["text"]
    #if language == "English":
    #    state = ""
    #    text = pe(audio)["text"]
    #if language == "French":
    #    state = ""
    #    text = pf(audio)["text"]
    #if language == "Japanese":
    #    state = ""
    #    text = pj(audio)["text"]
    state += text + " "
    return state, state


demo = gr.Interface(
    fn=transcribe,
    title="TEDCAS Offline Speech Recognition",
    description=(
        "1) Select a language. 2) Click on 'Record from microphone' and talk. "
        "3) Click on 'Stop recording'. 4) Click on 'Submit'. "
        "5) Before starting again, click on 'Clear'."
    ),
    inputs=[
        #gr.Dropdown(["Spanish", "Catalan", "English", "French", "Japanese"], value="Spanish"),
        gr.Dropdown(["Multi", "Spanish"], value="Multi"),
        #gr.Audio(source="microphone", type="filepath", streaming=True),
        gr.Audio(source="microphone", type="filepath"),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    #live=True,
)
demo.launch()
#demo.launch(auth=("TedCas", "Kike1234"))
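
# Optional sketch (not in the original code): the "Spanish" dropdown choice currently
# only differs from "Multi" in that it keeps accumulating the transcript. If the intent
# is to force Spanish decoding instead of Whisper's automatic language detection,
# openai-whisper's transcribe() accepts a `language` argument (ISO 639-1 code):
#
#     result = model.transcribe(audio, language="es")
#
# This call could replace model.transcribe(audio) inside transcribe() when
# language == "Spanish".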