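# Streamlit app: classify the toxicity of user-supplied text with a selectable
# Hugging Face sequence-classification model.
# Assumed invocation: `streamlit run <this file>` (the actual filename depends on the repo).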
import streamlit as st
import torch

from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification

option = st.selectbox("Select a toxicity analysis model:", ("RoBERTa", "DistilBERT", "XLM-RoBERTa"))
defaultTxt = "I hate you cancerous insects so much"
txt = st.text_area("Text to analyze", defaultTxt)
# The button is informational only: Streamlit reruns the whole script on any
# widget interaction, so the classification below updates on every change.
st.button("Submit Text")

# Load the tokenizer and model weights, defaulting to RoBERTa.
# A match statement would be tidier, but the Hugging Face environment does not
# support Python 3.10's match statement, so a plain if/elif chain is used instead.
if option == "RoBERTa":
    tokenizerPath = "s-nlp/roberta_toxicity_classifier"
    modelPath = "s-nlp/roberta_toxicity_classifier"
    neutralIndex = 0
    toxicIndex = 1
elif option == "DistilBERT":
    tokenizerPath = "citizenlab/distilbert-base-multilingual-cased-toxicity"
    modelPath = "citizenlab/distilbert-base-multilingual-cased-toxicity"
    neutralIndex = 1
    toxicIndex = 0
elif option == "XLM-RoBERTa":
    tokenizerPath = "unitary/multilingual-toxic-xlm-roberta"
    modelPath = "unitary/multilingual-toxic-xlm-roberta"
    neutralIndex = 1
    toxicIndex = 0
else:
    # Fallback: default to RoBERTa if the selection is unrecognized.
    tokenizerPath = "s-nlp/roberta_toxicity_classifier"
    modelPath = "s-nlp/roberta_toxicity_classifier"
    neutralIndex = 0
    toxicIndex = 1

tokenizer = AutoTokenizer.from_pretrained(tokenizerPath)
model = AutoModelForSequenceClassification.from_pretrained(modelPath)
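# Note: the tokenizer and model are reloaded on every Streamlit rerun (weights are
# cached on disk by transformers after the first download); wrapping the load in a
# function decorated with st.cache_resource would keep them in memory across reruns.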

# Run the encoded text through the model to get classification logits
# (shape [1, num_labels]).
# RoBERTa: [0]: neutral, [1]: toxic
encoding = tokenizer.encode(txt, return_tensors='pt')
with torch.no_grad():
    result = model(encoding)

# Transform the logits into probabilities.
# Some checkpoints return a single toxicity logit; in that case pad a zero logit
# for the second class so softmax still yields a two-class distribution
# (softmax([x, 0]) == [sigmoid(x), 1 - sigmoid(x)]).
if result.logits.size(dim=1) < 2:
    result.logits = nn.functional.pad(result.logits, (0, 1), "constant", 0)
prediction = nn.functional.softmax(result.logits, dim=-1)
neutralProb = prediction[0][neutralIndex].item()
toxicProb = prediction[0][toxicIndex].item()

# Expected output from RoBERTa on the default text:
# Neutral: 0.0052
# Toxic: 0.9948
st.write("Classification Probabilities")
st.write(f"{neutralProb:.4f} - NEUTRAL")
st.write(f"{toxicProb:.4f} - TOXIC")