import time

import gradio as gr
import whisper

# Alternative ASR back-ends that were tried (kept for reference):
#from transformers import pipeline
#p = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish")
#
#from transformers import WhisperProcessor, WhisperForConditionalGeneration
#processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
#model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
#model.config.forced_decoder_ids = None

# Load the multilingual Whisper model once at start-up.
model = whisper.load_model("large")

# Earlier prototype: let the user pick the Whisper model size at run time.
#def speech_to_text(tmp_filename, model_size):
#    model = whisper.load_model(model_size)
#    result = model.transcribe(tmp_filename)
#    return result["text"]
#
#gr.Interface(
#    fn=speech_to_text,
#    inputs=[
#        gr.Audio(source="microphone", type="filepath"),
#        gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"]),
#    ],
#    outputs="text",
#).launch()


def transcribe(language, audio, state=""):
    """Transcribe the recorded audio file and append the text to the session state."""
    time.sleep(1)
    if language == "Multi":
        # In "Multi" mode Whisper auto-detects the language; start from a clean transcript.
        state = ""
    result = model.transcribe(audio)
    text = result["text"]
    # Language-specific pipelines that were tried earlier:
    #if language == "Catalan":
    #    state = ""
    #    text = pc(audio)["text"]
    #if language == "English":
    #    state = ""
    #    text = pe(audio)["text"]
    #if language == "French":
    #    state = ""
    #    text = pf(audio)["text"]
    #if language == "Japanese":
    #    state = ""
    #    text = pj(audio)["text"]
    state += text + " "
    return state, state


demo = gr.Interface(
    fn=transcribe,
    title="TEDCAS Offline Speech Recognition",
    description=(
        "1) Select a language. 2) Click on 'Record from microphone' and talk. "
        "3) Click on 'Stop recording'. 4) Click on 'Submit'. "
        "5) Before starting again, click on 'Clear'."
    ),
    inputs=[
        #gr.Dropdown(["Spanish", "Catalan", "English", "French", "Japanese"], value="Spanish"),
        gr.Dropdown(["Multi", "Spanish"], value="Multi"),
        #gr.Audio(source="microphone", type="filepath", streaming=True),
        gr.Audio(source="microphone", type="filepath"),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    #live=True,
)
demo.launch()
#demo.launch(auth=("TedCas", "Kike1234"))
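
# Optional sketch (not in the original code): the "Spanish" dropdown choice currently
# only differs from "Multi" in that it keeps accumulating the transcript. If the intent
# is to force Spanish decoding instead of Whisper's automatic language detection,
# openai-whisper's transcribe() accepts a `language` argument (ISO 639-1 code):
#
#     result = model.transcribe(audio, language="es")
#
# This call could replace model.transcribe(audio) inside transcribe() when
# language == "Spanish".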