File size: 5,427 Bytes
971b6fe
 
 
dc86169
971b6fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc86169
 
daa5ad7
971b6fe
dc86169
 
 
 
 
 
 
 
 
 
 
 
971b6fe
 
 
 
dc86169
971b6fe
 
 
dc86169
 
 
 
 
 
 
 
 
971b6fe
dc86169
971b6fe
 
dc86169
 
 
 
 
971b6fe
 
 
dc86169
971b6fe
 
dc86169
 
 
 
 
971b6fe
 
 
 
 
 
 
dc86169
971b6fe
 
 
 
 
 
 
 
 
 
dc86169
971b6fe
dc86169
971b6fe
dc86169
971b6fe
dc86169
971b6fe
 
 
 
 
dc86169
971b6fe
 
dc86169
971b6fe
dc86169
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import gradio as gr
import whisper
from transformers import pipeline


# Checkpoint identifiers, named once so the two loaders below read clearly.
WHISPER_CHECKPOINT = "base"
EMOTION_CHECKPOINT = "SamLowe/roberta-base-go_emotions"

# Speech model; `inference` uses it for language detection and decoding.
model = whisper.load_model(WHISPER_CHECKPOINT)

# Text classifier applied to the transcription (PyTorch backend).
sentiment_analysis = pipeline(
    task="sentiment-analysis",
    model=EMOTION_CHECKPOINT,
    framework="pt",
)

def analyze_sentiment(text):
    """Classify *text* and return the scores keyed by label.

    Runs the module-level ``sentiment_analysis`` pipeline on ``text`` and
    flattens its output (an iterable of mappings exposing ``'label'`` and
    ``'score'``) into one ``{label: score}`` dict. If a label appears more
    than once, the last score wins.
    """
    scores_by_label = {}
    for prediction in sentiment_analysis(text):
        label = prediction['label']
        scores_by_label[label] = prediction['score']
    return scores_by_label

# Emoji displayed next to each emotion label.
# The literals are written as escapes so the table no longer depends on how
# the file is encoded or re-saved: the previous raw emoji had been corrupted
# into mojibake (UTF-8 bytes decoded with a single-byte code page).
# Entries marked NOTE(review) had lost their final byte in the corrupted text
# and were reconstructed as the best semantic fit -- please confirm.
_SENTIMENT_EMOJI = {
    "disappointment": "\U0001F61E",  # disappointed face
    "sadness": "\U0001F622",         # crying face
    "annoyance": "\U0001F620",       # angry face
    "neutral": "\U0001F610",         # neutral face -- NOTE(review): reconstructed
    "disapproval": "\U0001F44E",     # thumbs down
    "realization": "\U0001F62E",     # face with open mouth
    "nervousness": "\U0001F62C",     # grimacing face
    "approval": "\U0001F44D",        # thumbs up -- NOTE(review): reconstructed
    "joy": "\U0001F604",             # grinning face with smiling eyes
    "anger": "\U0001F621",           # pouting face
    "embarrassment": "\U0001F633",   # flushed face
    "caring": "\U0001F917",          # hugging face
    "remorse": "\U0001F614",         # pensive face
    "disgust": "\U0001F922",         # nauseated face
    "grief": "\U0001F625",           # sad but relieved face
    "confusion": "\U0001F615",       # confused face
    "relief": "\U0001F60C",          # relieved face
    "desire": "\U0001F60D",          # heart-eyes face -- NOTE(review): reconstructed
    "admiration": "\U0001F60C",      # relieved face
    "optimism": "\U0001F60A",        # smiling face with smiling eyes
    "fear": "\U0001F628",            # fearful face
    "love": "\u2764\uFE0F",          # red heart (with emoji variation selector)
    "excitement": "\U0001F389",      # party popper
    "curiosity": "\U0001F914",       # thinking face
    "amusement": "\U0001F604",       # grinning face with smiling eyes
    "surprise": "\U0001F632",        # astonished face
    "gratitude": "\U0001F64F",       # folded hands -- NOTE(review): reconstructed
    "pride": "\U0001F981",           # lion -- NOTE(review): reconstructed
}


def get_sentiment_emoji(sentiment):
    """Return the emoji for an emotion label.

    Args:
        sentiment: Emotion label to look up (e.g. ``"joy"``).

    Returns:
        The matching emoji string, or ``""`` when the label has no entry.
    """
    return _SENTIMENT_EMOJI.get(sentiment, "")

def display_sentiment_results(sentiment_results, option):
    """Render classifier scores as display text, one label per line.

    Args:
        sentiment_results: Mapping of emotion label to score.
        option: ``"Sentiment Only"`` produces ``"<label> <emoji>"`` lines;
            ``"Sentiment + Score"`` produces ``"<label> <emoji>: <score>"``
            lines. Any other value produces no lines at all.

    Returns:
        The newline-terminated lines concatenated in the mapping's
        iteration order (an empty string when nothing was rendered).
    """
    labels_only = option == "Sentiment Only"
    with_scores = option == "Sentiment + Score"

    def render(label, score):
        # Format one entry according to the selected display mode.
        icon = get_sentiment_emoji(label)
        if labels_only:
            return f"{label} {icon}\n"
        if with_scores:
            return f"{label} {icon}: {score}\n"
        return ""

    return "".join(
        render(label, score) for label, score in sentiment_results.items()
    )

def inference(audio, sentiment_option):
    """Transcribe a recording and classify the emotion of its transcript.

    Args:
        audio: Path of the audio file to process. May be ``None`` or empty
            when the user triggers the action without recording anything.
        sentiment_option: Display mode forwarded to
            ``display_sentiment_results``.

    Returns:
        A ``(language, transcription, sentiment_text)`` tuple of strings,
        where ``language`` is the upper-cased code of the most probable
        detected language. All three are empty strings when no audio was
        supplied.
    """
    if not audio:
        # Nothing was recorded: return blank outputs instead of letting
        # whisper.load_audio fail on a missing path.
        return "", "", ""

    waveform = whisper.load_audio(audio)
    # Fit the waveform to Whisper's fixed-length input window.
    # NOTE(review): per the Whisper README this window is 30 seconds, so
    # longer recordings are only transcribed up to that point -- confirm
    # this is acceptable.
    waveform = whisper.pad_or_trim(waveform)

    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # Pick the language with the highest detection probability.
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # fp16 is disabled; presumably so decoding also works on CPU-only hosts.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    sentiment_results = analyze_sentiment(result.text)
    sentiment_output = display_sentiment_results(sentiment_results, sentiment_option)

    return lang.upper(), result.text, sentiment_output

# Static HTML fragments rendered at the top of the page.
title = """<h1 align="center">Audio Sentiment Analysis</h1>"""
subtitle = """<h6 align="center">Automatic Speech Recognition</h6>"""
# Banner image file. The name (including its spelling) has to match the asset
# on disk, so it is intentionally left as-is.
image_path = "Arquitecture_W.jpg"
# Description shown beside the banner. The model-size line reflects the
# "base" Whisper checkpoint that this app actually loads.
description = """
<p align="justify">With cross-modal interaction and AI (tools and pre-trained models in NLP), we can analyze large audio data
in real-time, such as recorded conversations, customer service calls, or voice recordings, in order to identify and categorize
emotions (from positive and neutral to sad and angry).</p><br>

Components of the tool:<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Input: Real-time multilingual<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Video Call speech recognition<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Pre-trained model: Whisper<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Model size: Base with 74M Parameters<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Encoder/Decoder Architecture <br>
&nbsp;&nbsp;&nbsp;&nbsp; - Transcribe, Translate, and Identify Audio<br>
&nbsp;&nbsp;&nbsp;&nbsp; - Output: Sentiment analysis<br>
<br>
"""

# Extra CSS handed to gr.Blocks. Components are addressed through the
# `elem_id` they are created with, so the rules use `#id` selectors; a bare
# `banner-image` would be a tag selector and never match the component.
custom_css = """
#banner-image {
    margin-left: auto;
    margin-right: auto;
}
#chat-message {
    font-size: 300px;
    min-height: 600px;
}

img {
  border-radius: 8px;
  max-width: 100%;
  height: auto;
}

"""


# Root container of the app: applies the custom CSS and sets the page title.
block = gr.Blocks(css=custom_css, theme='gradio/default', title="Analytics Projects by Ray Espinoza")

with block:
    gr.HTML(title)
    gr.HTML(subtitle)

    # Header row: banner image (given the larger scale) next to the description.
    with gr.Row():
        with gr.Column(scale=2):
            gr.Image(image_path, elem_id="banner-image", show_label=False, show_download_button=False)
        with gr.Column():
            gr.HTML(description)

    with gr.Group():
        # Inputs: microphone recording (delivered to `inference` as a file
        # path) plus the display mode for the sentiment text.
        with gr.Box():
            audio = gr.Audio(
                label="Input Audio",
                show_label=False,
                source="microphone",
                type="filepath"
            )

            sentiment_option = gr.Radio(
                choices=["Sentiment Only", "Sentiment + Score"],
                label="Select an option",
                # `value` sets the initial selection. Without one the callback
                # receives no option and the sentiment output comes back empty.
                value="Sentiment Only"
            )

            btn = gr.Button("Execute: Transcribe", variant="primary")

        # Outputs, in the order `inference` returns them.
        lang_str = gr.Textbox(label="Language:")

        text = gr.Textbox(label="Transcription:")

        sentiment_output = gr.Textbox(label="Sentiment Analysis Results:")

        btn.click(inference, inputs=[audio, sentiment_option], outputs=[lang_str, text, sentiment_output])

        gr.HTML('''
        <div class="footer">
            <p>By <a href="https://github.com/rayespinozah" style="text-decoration: underline;" target="_blank"> Ray Espinoza Github</a>
            </p>
        </div>
        ''')

block.launch(share=True)