from transformers import pipeline
import gradio as gr

# Speech-recognition and emotion-classification pipelines from the Hugging Face Hub.
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
classifier = pipeline("text-classification", "michellejieli/emotion_text_classifier")


def transcribe(speech, state=""):
    """Transcribe audio and append the result to the running transcript state."""
    text = asr(speech)["text"]
    state += text + " "
    return text, state


def speech_to_text(speech):
    """Transcribe a single audio file without keeping any state."""
    return asr(speech)["text"]


def text_to_sentiment(text):
    """Return the top emotion label for the given text,
    e.g. classifier("I love this!") -> [{"label": "joy", "score": ...}]."""
    return classifier(text)[0]["label"]


demo = gr.Blocks()

with demo:
    # Gradio 3.x Audio API; Gradio 4+ renamed `source="microphone"`
    # to `sources=["microphone"]`.
    microphone = gr.Audio(source="microphone", type="filepath")
    audio_file = gr.Audio(type="filepath")
    text = gr.Textbox()
    label = gr.Label()
    # Hidden per-session state that accumulates the transcript across clicks.
    # In Blocks, state must be a gr.State component; the "state" string
    # shorthand only works with gr.Interface (and click() has no live= flag).
    transcript_state = gr.State(value="")

    b0 = gr.Button("Speech From Microphone")
    b1 = gr.Button("Recognize Speech")
    b2 = gr.Button("Classify Sentiment")

    b0.click(transcribe, inputs=[microphone, transcript_state], outputs=[text, transcript_state])
    b1.click(speech_to_text, inputs=audio_file, outputs=text)
    b2.click(text_to_sentiment, inputs=text, outputs=label)

    gr.Markdown("""References:
    1. ASR Model: https://huggingface.co/facebook/wav2vec2-base-960h
    2. Sentiment: https://huggingface.co/michellejieli/emotion_text_classifier
    3. ASR Lesson: https://gradio.app/real-time-speech-recognition/
    4. State: https://gradio.app/interface-state/
    5. Deepspeech: https://deepspeech.readthedocs.io/en/r0.9/
    """)

demo.launch()
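
# A minimal sketch of how the app could be run locally, assuming a Gradio 3.x
# environment (exact versions are not specified in the original); the ASR
# pipeline downloads facebook/wav2vec2-base-960h on first use, so the initial
# launch needs network access:
#
#   pip install "gradio<4" transformers torch
#   python app.py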