from transformers import pipeline
import gradio as gr

# Speech-recognition and emotion-classification pipelines from the Hugging Face Hub.
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
classifier = pipeline("text-classification", "michellejieli/emotion_text_classifier")


def transcribe(speech, state=""):
    """Transcribe audio and append the result to the running transcript state."""
    text = asr(speech)["text"]
    state += text + " "
    return text, state


def speech_to_text(speech):
    """Transcribe a single audio file without keeping any state."""
    return asr(speech)["text"]


def text_to_sentiment(text):
    """Return the top emotion label for the given text,
    e.g. classifier("I love this!") -> [{"label": "joy", "score": ...}]."""
    return classifier(text)[0]["label"]


demo = gr.Blocks()

with demo:
    # Gradio 3.x Audio API; Gradio 4+ renamed `source="microphone"`
    # to `sources=["microphone"]`.
    microphone = gr.Audio(source="microphone", type="filepath")
    audio_file = gr.Audio(type="filepath")
    text = gr.Textbox()
    label = gr.Label()
    # Hidden per-session state that accumulates the transcript across clicks.
    # In Blocks, state must be a gr.State component; the "state" string
    # shorthand only works with gr.Interface (and click() has no live= flag).
    transcript_state = gr.State(value="")

    b0 = gr.Button("Speech From Microphone")
    b1 = gr.Button("Recognize Speech")
    b2 = gr.Button("Classify Sentiment")

    b0.click(transcribe, inputs=[microphone, transcript_state], outputs=[text, transcript_state])
    b1.click(speech_to_text, inputs=audio_file, outputs=text)
    b2.click(text_to_sentiment, inputs=text, outputs=label)

    gr.Markdown("""References:
    1. ASR Model: https://huggingface.co/facebook/wav2vec2-base-960h
    2. Sentiment: https://huggingface.co/michellejieli/emotion_text_classifier
    3. ASR Lesson: https://gradio.app/real-time-speech-recognition/
    4. State: https://gradio.app/interface-state/
    5. Deepspeech: https://deepspeech.readthedocs.io/en/r0.9/
    """)

demo.launch()
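
# A minimal sketch of how the app could be run locally, assuming a Gradio 3.x
# environment (exact versions are not specified in the original); the ASR
# pipeline downloads facebook/wav2vec2-base-960h on first use, so the initial
# launch needs network access:
#
#   pip install "gradio<4" transformers torch
#   python app.py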