import os
import whisper
import evaluate
from evaluate.utils import launch_gradio_widget
import gradio as gr
import torch
import pandas as pd
import random
import classify
import replace_explitives
from whisper.model import Whisper
from whisper.tokenizer import get_tokenizer
from speechbrain.pretrained.interfaces import foreign_class
from transformers import AutoModelForSequenceClassification, pipeline, WhisperTokenizer, RobertaForSequenceClassification, RobertaTokenizer, AutoTokenizer


# pull in emotion detection
# --- Add element for specification
# pull in text classification
# --- Add custom labels
# --- Associate labels with radio elements
# add logic to initiate mock notification when detected
# pull in misophonia-specific model

model_cache = {}

# Building prediction function for gradio
emo_dict = {
    'sad': 'Sad', 
    'hap': 'Happy',
    'ang': 'Anger',
    'neu': 'Neutral'
}

# static classes for now, but it would be best to have the user select from multiple, and to enter their own
class_options = {
    "Racism": ["racism", "hate speech", "bigotry", "racially targeted", "racial slur", "ethnic slur", "ethnic hate", "pro-white nationalism"],
    "LGBTQ+ Hate": ["gay slur", "trans slur", "homophobic slur", "transphobia", "anti-LBGTQ+"],
    "Sexually Explicit": ["sexually explicit", "sexually coercive", "sexual exploitation", "vulgar", "raunchy", "sexist", "sexually demeaning", "sexual violence", "victim blaming"],
    "Pregnancy Complications": ["miscarriage", "child loss", "child death", "abortion", "pregnancy", "childbirth", "baby shower", "postpartum"],
}
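
# --- Model setup: loaded once at import time ---
# Whisper ASR for transcription, the `evaluate` toxicity measurement,
# a SpeechBrain speech-emotion classifier, and a zero-shot text classifier.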

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large")

toxicity_module = evaluate.load("toxicity",  "facebook/roberta-hate-speech-dynabench-r4-target")
emotion_classifier = foreign_class(source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
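# note: `device` is selected here but is not currently passed to the Hugging Face pipelines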
text_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def classify_emotion(audio):
    #### Emotion classification ####
    # emotion_classifier is loaded once at module level above
    out_prob, score, index, text_lab = emotion_classifier.classify_file(audio)
    return  emo_dict[text_lab[0]]

def slider_logic(slider):
    # Map the 1-5 sensitivity slider to a score threshold:
    # 1 intervenes only in extreme cases (high threshold), 5 at nearly every opportunity (low threshold).
    if slider == 1:
        threshold = .90
    elif slider == 2:
        threshold = .80
    elif slider == 3:
        threshold = .60
    elif slider == 4:
        threshold = .50
    elif slider == 5:
        threshold = .40
    else:
        threshold = .50  # fallback for unexpected slider values
    return threshold

# Create a Gradio interface with audio file and text inputs
def classify_toxicity(audio_file, classify_anxiety, emo_class, explitive_selection, slider):
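    # Transcribe the audio, optionally scrub expletives, score overall toxicity, and,
    # when a subclass is selected, run a zero-shot check against the user's threshold.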
    
    # Transcribe the audio file using Whisper ASR
    transcribed_text = pipe(audio_file)["text"]
    
    ## SLIDER ##
    threshold = slider_logic(slider)
    
    # ------- expletive replacement ---------------
    # Only substitute expletives when the user picked a preference and has not
    # opted to treat negative emotionality (e.g. anger) as anxiety-inducing.
    if explitive_selection is not None and emo_class is None:
        transcribed_text = replace_explitives.sub_explitives(transcribed_text, explitive_selection)
    
    #### Toxicity Classifier ####
        
    # toxicity_module is loaded once at module level above
    # alternative: evaluate.load("toxicity", 'DaNLP/da-electra-hatespeech-detection', module_type="measurement")

    toxicity_results = toxicity_module.compute(predictions=[transcribed_text])
 
    toxicity_score = toxicity_results["toxicity"][0]
    print(toxicity_score)
    
    # emotion call (the label is computed but not currently returned to the UI)
    if emo_class is not None:
        classify_emotion(audio_file)

    #### Text classification #####
    if classify_anxiety is not None:
        # device and text_classifier are set up once at module level above
    
        sequence_to_classify = transcribed_text
        print(classify_anxiety, class_options)
        candidate_labels = class_options.get(classify_anxiety, [])
        # classification_output = classifier(sequence_to_classify, candidate_labels, multi_label=False)
        classification_output = text_classifier(sequence_to_classify, candidate_labels, multi_label=True)
        print("class output ", type(classification_output))
        # classification_df = pd.DataFrame.from_dict(classification_output)
        print("keys ", classification_output.keys())
        
        # formatted_classification_output = "\n".join([f"{key}: {value}" for key, value in classification_output.items()])
        # label_score_pairs = [(label, score) for label, score in zip(classification_output['labels'], classification_output['scores'])]
        label_score_dict = {label: score for label, score in zip(classification_output['labels'], classification_output['scores'])}
        k = max(label_score_dict, key=label_score_dict.get)
        print("k keys: ", k)
        maxval = label_score_dict[k]
        print("max value: ", maxval)
        topScore = ""
        affirm = ""
        if maxval > threshold:
            print("Toxic")
            affirm = positive_affirmations()
            topScore = maxval
        else:
            print("Not Toxic")
            affirm = ""
            topScore = maxval
    else:
        topScore = ""
        affirm = ""
        if toxicity_score > threshold:
            affirm = positive_affirmations()
            topScore = toxicity_score
        else:
            affirm = ""
            topScore = toxicity_score
        label_score_dict = {"toxicity" : toxicity_score}

    return transcribed_text, topScore, label_score_dict, affirm
    # return f"Toxicity Score ({available_models[selected_model]}): {toxicity_score:.4f}"
    
def positive_affirmations():
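    # Return a randomly selected coping affirmation to display when content crosses the threshold.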
    affirmations = [
        "I have survived my anxiety before and I will survive again now",
        "I am not in danger; I am just uncomfortable; this too will pass",
        "I forgive and release the past and look forward to the future",
        "I can't control what other people say but I can control my breathing and my response"
    ]
    selected_affirm = random.choice(affirmations)
    return selected_affirm
    
with gr.Blocks() as iface:
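    # UI layout: preference controls, audio input with a run button, then transcription and toxicity outputs.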
    show_state = gr.State([])
    with gr.Column():
        anxiety_class = gr.Radio(label="Specify Subclass", choices=["Racism", "LGBTQ+ Hate", "Sexually Explicit", "Pregnancy Complications"])
        explit_preference = gr.Radio(choices=["N-Word", "B-Word", "All Explitives"], label="Words to omit from general anxiety classes", info="Certain words may be acceptable within certain contexts for given groups of people, and some people may be unbothered by expletives broadly speaking.")
        emo_class = gr.Radio(choices=["negative emotionality"], label="Negative Emotionality", info="Select if you would like expletives to be considered anxiety-inducing in cases of anger/negative emotionality.")
        sense_slider = gr.Slider(minimum=1, maximum=5, step=1.0, label="How readily do you want the tool to intervene? 1 = in extreme cases and 5 = at every opportunity")
    with gr.Column():
        aud_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
        submit_btn = gr.Button("Run")
    with gr.Column():
        out_text = gr.Textbox(label="Transcribed Audio")
        out_val = gr.Textbox(label="Overall Toxicity")
        out_affirm = gr.Textbox(label="Intervention")
        out_class = gr.Label(label="Toxicity Class Breakdown")
    submit_btn.click(fn=classify_toxicity, inputs=[aud_input, anxiety_class, emo_class, explit_preference, sense_slider], outputs=[out_text, out_val, out_class, out_affirm])

iface.launch()