"""Gradio demo that scores speech or text against user-selected anxiety classes.

Text classes (racism, LGBTQ+ hate, sexually explicit) are scored with a
toxicity model plus a zero-shot text classifier.  The "misophonia" class is
scored directly on the audio with Whisper through the project-local
``classify`` module.
"""

import os
import random
import subprocess
import sys

# Whisper is installed from source at start-up.  Going through
# ``sys.executable -m pip`` guarantees the package lands in the interpreter
# that is running this script; ``check=False`` keeps the original
# best-effort behaviour (a failed install surfaces on ``import whisper``).
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "git+https://github.com/openai/whisper.git",
    ],
    check=False,
)

import whisper
import evaluate
from evaluate.utils import launch_gradio_widget
import gradio as gr
import torch
import pandas as pd
import classify
import replace_explitives
from whisper.model import Whisper
from whisper.tokenizer import get_tokenizer
from speechbrain.pretrained.interfaces import foreign_class
from transformers import AutoModelForSequenceClassification, pipeline, WhisperTokenizer, RobertaForSequenceClassification, RobertaTokenizer, AutoTokenizer

# Roadmap:
# pull in emotion detection
# --- Add element for specification
# pull in text classification
# --- Add custom labels
# --- Associate labels with radio elements
# add logic to initiate mock notification when detected
# pull in misophonia-specific model

# Lazily populated store of loaded models, keyed by a short name, so each
# model is built once per process instead of once per request.
model_cache = {}

# Maps the emotion labels looked up by classify_emotion to display names.
emo_dict = {
    'sad': 'Sad',
    'hap': 'Happy',
    'ang': 'Anger',
    'neu': 'Neutral'
}

# Static classes for now, but it would be best to have the user select from
# multiple, and to enter their own.
class_options = {
    "racism": ["racism", "hate speech", "bigotry", "racially targeted", "racial slur", "ethnic slur", "ethnic hate", "pro-white nationalism"],
    "LGBTQ+ hate": ["gay slur", "trans slur", "homophobic slur", "transphobia", "anti-LBGTQ+", "hate speech"],
    "sexually explicit": ["sexually explicit", "sexually coercive", "sexual exploitation", "vulgar", "raunchy", "sexist", "sexually demeaning", "sexual violence", "victim blaming"],
    "misophonia": ["chewing", "breathing", "mouthsounds", "popping", "sneezing", "yawning", "smacking", "sniffling", "panting"]
}

# Slider position -> toxicity score above which the tool intervenes.
_SLIDER_THRESHOLDS = {1: .98, 2: .88, 3: .77, 4: .66, 5: .55}

# Speech-to-text pipeline used to transcribe uploaded audio.
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large")


def _load_once(key, loader):
    """Return the object cached under *key*, building it with *loader* on first use."""
    if key not in model_cache:
        model_cache[key] = loader()
    return model_cache[key]


def classify_emotion(audio):
    """Return the display name of the emotion predicted for an audio file.

    Args:
        audio: Path of the audio file handed to the classifier's
            ``classify_file`` method.

    Returns:
        The ``emo_dict`` display name for the first predicted label, or the
        raw label itself when it has no entry in ``emo_dict``.
    """
    emotion_classifier = _load_once(
        "emotion",
        lambda: foreign_class(
            source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
            pymodule_file="custom_interface.py",
            classname="CustomEncoderWav2vec2Classifier",
        ),
    )
    _out_prob, _score, _index, text_lab = emotion_classifier.classify_file(audio)
    label = text_lab[0]
    return emo_dict.get(label, label)


def slider_logic(slider):
    """Translate the sensitivity slider position into a toxicity threshold.

    Args:
        slider: Slider position, 1 (intervene only in extreme cases) to
            5 (intervene at every opportunity).  ``1.0`` matches ``1``.

    Returns:
        The float threshold for that position.  Any unrecognised value
        (including ``None``) falls back to the position-1 threshold so the
        result is always comparable with a score.
    """
    try:
        return _SLIDER_THRESHOLDS[slider]
    except (KeyError, TypeError):
        # KeyError: unknown position; TypeError: unhashable value.
        return _SLIDER_THRESHOLDS[1]


def _score_text_classes(transcribed_text, classify_anxiety):
    """Return ``(toxicity_score, label_score_pairs)`` for a piece of text."""
    toxicity_module = _load_once(
        "toxicity",
        lambda: evaluate.load("toxicity", "facebook/roberta-hate-speech-dynabench-r4-target"),
    )
    toxicity_results = toxicity_module.compute(predictions=[transcribed_text])
    toxicity_score = toxicity_results["toxicity"][0]
    print(toxicity_score)

    text_classifier = _load_once(
        "zero-shot",
        lambda: pipeline("zero-shot-classification", model="facebook/bart-large-mnli"),
    )
    print(classify_anxiety, class_options)
    candidate_labels = class_options[classify_anxiety]
    classification_output = text_classifier(transcribed_text, candidate_labels, multi_label=True)
    print("class output ", type(classification_output))
    print("keys ", classification_output.keys())
    label_score_pairs = list(zip(classification_output['labels'], classification_output['scores']))
    return toxicity_score, label_score_pairs


def _score_misophonia(audio_file, class_names):
    """Return ``{class_name: probability}`` for the sounds in an audio file."""
    model = _load_once("whisper-large", lambda: whisper.load_model("large"))
    # NOTE(review): get_tokenizer's first parameter appears to be a
    # multilingual flag in current Whisper releases, so "large" would only
    # work because it is truthy - confirm against the installed version.
    tokenizer = get_tokenizer("large")

    internal_lm_average_logprobs = classify.calculate_internal_lm_average_logprobs(
        model=model,
        class_names=class_names,
        tokenizer=tokenizer,
    )
    audio_features = classify.calculate_audio_features(audio_file, model)
    average_logprobs = classify.calculate_average_logprobs(
        model=model,
        audio_features=audio_features,
        class_names=class_names,
        tokenizer=tokenizer,
    )
    # Subtract the audio-independent scores before normalising.
    average_logprobs -= internal_lm_average_logprobs
    scores = average_logprobs.softmax(-1).tolist()
    return dict(zip(class_names, scores))


def classify_toxicity(audio_file, text_input, classify_anxiety, emo_class, explitive_selection, slider):
    """Score audio or text against the selected anxiety class.

    Args:
        audio_file: Path of the uploaded audio file, or ``None`` to use
            ``text_input`` instead.
        text_input: Text to classify when no audio is given.
        classify_anxiety: One of the keys of ``class_options``.
        emo_class: ``None`` unless the negative-emotionality option is
            selected; when ``None`` the text is passed through
            ``replace_explitives.sub_explitives`` before scoring.
        explitive_selection: Word-omission preference forwarded to
            ``replace_explitives.sub_explitives``.
        slider: Sensitivity slider position (see ``slider_logic``).

    Returns:
        A 4-tuple matching the four UI outputs:
        ``(toxicity_score, class_scores, transcribed_text, affirmation)``.
        For text classes ``class_scores`` is a list of ``(label, score)``
        pairs and ``affirmation`` is non-empty when the toxicity score
        exceeds the threshold.  For "misophonia" ``class_scores`` is a
        ``{label: probability}`` dict and the score and affirmation fields
        are empty strings, because no toxicity score is computed.

    Raises:
        ValueError: If ``classify_anxiety`` is not a known class, or if
            "misophonia" is selected without an audio file.
    """
    if classify_anxiety not in class_options:
        raise ValueError(
            f"Unknown anxiety class {classify_anxiety!r}; choose one of {list(class_options)}"
        )

    # Transcribe the audio file using Whisper ASR, or fall back to the text box.
    if audio_file is not None:
        transcribed_text = pipe(audio_file)["text"]
    else:
        transcribed_text = text_input if text_input is not None else ""

    if classify_anxiety == "misophonia":
        if audio_file is None:
            raise ValueError("The misophonia class analyses sounds and needs an audio file")
        class_names = list(class_options[classify_anxiety])
        print("class names ", class_names, "classify_anxiety ", classify_anxiety)
        class_scores = _score_misophonia(audio_file, class_names)
        # The click handler is wired to four outputs, so every branch must
        # return four values.
        return "", class_scores, transcribed_text, ""

    print("emo_class ", emo_class, "explitive select", explitive_selection)
    threshold = slider_logic(slider)

    if emo_class is None:
        transcribed_text = replace_explitives.sub_explitives(transcribed_text, explitive_selection)

    toxicity_score, label_score_pairs = _score_text_classes(transcribed_text, classify_anxiety)

    # Emotion detection needs the audio itself; skip it for text-only input.
    if emo_class is not None and audio_file is not None:
        print("emotion ", classify_emotion(audio_file))

    if toxicity_score > threshold:
        print("threshold exceeded!! Launch intervention")
        affirm = positive_affirmations()
    else:
        affirm = ""

    return toxicity_score, label_score_pairs, transcribed_text, affirm


def positive_affirmations():
    """Return one randomly chosen calming affirmation."""
    affirmations = [
        "I have survived my anxiety before and I will survive again now",
        "I am not in danger; I am just uncomfortable; this too will pass",
        "I forgive and release the past and look forward to the future",
        "I can't control what other people say but I can control my breathing and my response"
    ]
    return random.choice(affirmations)


with gr.Blocks() as iface:
    show_state = gr.State([])
    with gr.Column():
        anxiety_class = gr.Radio(["racism", "LGBTQ+ hate", "sexually explicit", "misophonia"])
        explit_preference = gr.Radio(choices=["N-Word", "B-Word", "All Explitives"], label="Words to omit from general anxiety classes", info="certain words may be acceptible within certain contects for given groups of people, and some people may be unbothered by explitives broadly speaking.")
        emo_class = gr.Radio(choices=["negaitve emotionality"], label="label", info="Select if you would like explitives to be considered anxiety-indiucing in the case of anger/ negative emotionality.")
        sense_slider = gr.Slider(minimum=1, maximum=5, step=1.0, label="How readily do you want the tool to intervene? 1 = in extreme cases and 5 = at every opportunity")
    with gr.Column():
        aud_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
        text = gr.Textbox(label="Enter Text", placeholder="Enter text here...")
        submit_btn = gr.Button(label="Run")
    with gr.Column():
        out_val = gr.Textbox()
        out_class = gr.Label()
        out_text = gr.Textbox()
        out_affirm = gr.Textbox()
    # Input order must match the parameter order of classify_toxicity.
    submit_btn.click(fn=classify_toxicity, inputs=[aud_input, text, anxiety_class, emo_class, explit_preference, sense_slider], outputs=[out_val, out_class, out_text, out_affirm])

iface.launch()