import gradio as gr
import torch
import torchaudio
from transformers import AutoProcessor, SeamlessM4Tv2Model, pipeline

# Load the models once at startup rather than on every request.
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)

input_audio = gr.Audio(
    sources=["microphone", "upload"],
    label="Speak with me...",
    show_label=True,
    interactive=True,
    format="wav",
)

def voice_to_emotion(audio):
    # Gradio's default type="numpy" delivers a (sample_rate, int16 ndarray) tuple.
    sampling_rate, waveform = audio
    waveform = torch.tensor(waveform, dtype=torch.float32)
    if waveform.ndim > 1:
        waveform = waveform.mean(dim=1)  # downmix stereo to mono
    waveform = waveform / 32768.0  # scale int16 samples to [-1, 1]
    if sampling_rate != 16_000:
        # SeamlessM4T expects 16 kHz audio.
        waveform = torchaudio.functional.resample(waveform, sampling_rate, 16_000)

    # Transcribe and translate the speech to English text.
    inputs = processor(audios=waveform, sampling_rate=16_000, return_tensors="pt")
    output_tokens = model.generate(**inputs, tgt_lang="eng", generate_speech=False)
    translated_text_from_audio = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)

    # Classify the text; with top_k=None the labels come back sorted by score,
    # highest first, so the first entry is the dominant emotion.
    model_outputs = classifier([translated_text_from_audio])
    return model_outputs[0][0]["label"]

demo = gr.Interface(fn=voice_to_emotion, inputs=input_audio, outputs="textbox")

if __name__ == "__main__":
    demo.launch()