"""Gradio demo: transcribe speech to text, then classify the text's sentiment."""

from typing import Optional

import gradio as gr
from transformers import pipeline

# Speech recognition model (Wav2Vec2 with a CTC head).
asr = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
)

# Pin the checkpoint explicitly instead of relying on the library's implicit
# default for the task, so the model cannot change under us on upgrade.
classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)


def speech_to_text(mic: Optional[str] = None, file: Optional[str] = None) -> str:
    """Transcribe an audio recording to text.

    Args:
        mic: Value of the microphone audio component (a file path, since the
            component is created with ``type="filepath"``), or ``None``.
        file: Value of the upload audio component, or ``None``.

    Returns:
        The transcript produced by the ASR pipeline, or a message asking for
        input when neither source was provided. The microphone recording
        takes precedence when both are given.
    """
    audio = mic if mic is not None else file
    if audio is None:
        return "You must either provide a mic recording or a file"
    return asr(audio)["text"]


def text_to_sentiment(text: str) -> str:
    """Return the label of the classifier's first prediction for ``text``."""
    return classifier(text)[0]["label"]


title = "Speech-Text-Sentiment"
description = """
Task: Speech to Text to Sentiment\n
Model: \n
speech to text (Wav2Vec2ForCTC)\n
text to sentiment (DistilBertForSequenceClassification)\n
"""
theme = "freddyaboulton/dracula_revamped"

# ``gr.Blocks`` has no ``description`` parameter (that belongs to
# ``gr.Interface``), so the description is rendered as Markdown in the layout.
demo = gr.Blocks(title=title, theme=theme)

with demo:
    # ``title`` above only sets the browser tab title; show it on the page too.
    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    # Two alternative audio sources; both are passed to ``speech_to_text``.
    audio_file = [
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ]
    text = gr.Textbox()
    label = gr.Label()

    b1 = gr.Button("Recognize Speech")
    b2 = gr.Button("Classify Sentiment")

    # Step 1 fills the textbox with the transcript; step 2 classifies
    # whatever the textbox currently holds.
    b1.click(speech_to_text, inputs=audio_file, outputs=text)
    b2.click(text_to_sentiment, inputs=text, outputs=label)

demo.launch()