nppmatt committed on
Commit 5c49b11
1 Parent(s): 3965ceb
Files changed (1)
  1. app.py +22 -10
app.py CHANGED
@@ -5,26 +5,38 @@ import torch
  from torch import nn
  from transformers import AutoTokenizer, AutoModelForSequenceClassification

+ option = st.selectbox("Select a toxicity analysis model:", ("RoBERTa", "DistilBERT", "XLM-RoBERTa"))
  defaultTxt = "I hate you cancerous insects so much"
- txt = st.text_area('Text to analyze', defaultTxt)
+ txt = st.text_area("Text to analyze", defaultTxt)

- # load tokenizer and model weights
- tokenizer = AutoTokenizer.from_pretrained("s-nlp/roberta_toxicity_classifier")
- model = AutoModelForSequenceClassification.from_pretrained("s-nlp/roberta_toxicity_classifier")
- batch = tokenizer.encode(txt, return_tensors='pt')
+ # Load tokenizer and model weights; default to RoBERTa if no match.
+ match option:
+     case "RoBERTa":
+         tokenizerPath = "s-nlp/roberta_toxicity_classifier"
+         modelPath = "s-nlp/roberta_toxicity_classifier"
+     case "DistilBERT":
+         tokenizerPath = "citizenlab/distilbert-base-multilingual-cased-toxicity"
+         modelPath = "citizenlab/distilbert-base-multilingual-cased-toxicity"
+     case "XLM-RoBERTa":
+         tokenizerPath = "unitary/multilingual-toxic-xlm-roberta"
+         modelPath = "unitary/multilingual-toxic-xlm-roberta"
+     case _:
+         tokenizerPath = "s-nlp/roberta_toxicity_classifier"
+         modelPath = "s-nlp/roberta_toxicity_classifier"
+ tokenizer = AutoTokenizer.from_pretrained(tokenizerPath)
+ model = AutoModelForSequenceClassification.from_pretrained(modelPath)

  # run encoding through model to get classification output
- # e.g. "logits": tensor([[ 4.8982, -5.1952]], grad_fn=<AddmmBackward0>)
- result = model(batch)
+ # RoBERTa: [0]: neutral, [1]: toxic
+ encoding = tokenizer.encode(txt, return_tensors='pt')
+ result = model(encoding)

  # transform logit to get probabilities
- # e.g. tensor([[9.9996e-01, 4.2627e-05]], grad_fn=<SoftmaxBackward0>)
- # first index is neutral, second is toxic
  prediction = nn.functional.softmax(result.logits, dim=-1)
  neutralProb = prediction.data[0][0]
  toxicProb = prediction.data[0][1]

- # default text input ought to return:
+ # Expected returns from RoBERTa on default text:
  # Neutral: 0.0052
  # Toxic: 0.9948
  st.write("Classification Probabilities")
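A note on the new model selector: Streamlit reruns the whole script on every widget interaction, so app.py as committed reloads the selected checkpoint each time the selectbox or text area changes. Below is a minimal sketch of one way to avoid that; it is not part of this commit, it assumes a Streamlit version that provides st.cache_resource, and the MODEL_PATHS dict and load_pipeline helper are illustrative names, with the dict standing in for the match/case purely for brevity.

import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Illustrative mapping of selectbox labels to the checkpoints used in the commit.
MODEL_PATHS = {
    "RoBERTa": "s-nlp/roberta_toxicity_classifier",
    "DistilBERT": "citizenlab/distilbert-base-multilingual-cased-toxicity",
    "XLM-RoBERTa": "unitary/multilingual-toxic-xlm-roberta",
}

@st.cache_resource  # keep loaded weights alive across Streamlit reruns
def load_pipeline(path: str):
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForSequenceClassification.from_pretrained(path)
    return tokenizer, model

option = st.selectbox("Select a toxicity analysis model:", tuple(MODEL_PATHS))
# Mirrors the `case _` fallback to RoBERTa in the committed match statement.
tokenizer, model = load_pipeline(MODEL_PATHS.get(option, MODEL_PATHS["RoBERTa"]))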
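Continuing that sketch (also hypothetical, not in the commit): the new "[0]: neutral, [1]: toxic" comment documents the s-nlp RoBERTa head only, and the citizenlab and unitary checkpoints are not guaranteed to use the same label order or even the same label set, so it is safer to read the ordering from each checkpoint's config than to hard-code indices 0 and 1. If any of the selected models turns out to be multi-label, a per-class sigmoid rather than a softmax across labels would be the more appropriate transform; check the respective model cards.

import torch
from torch import nn

txt = st.text_area("Text to analyze", "I hate you cancerous insects so much")

# Plain inference; avoids building the autograd graph that shows up as grad_fn in the old comments.
with torch.no_grad():
    logits = model(**tokenizer(txt, return_tensors="pt")).logits
prediction = nn.functional.softmax(logits, dim=-1)[0]

st.write("Classification Probabilities")
# id2label comes from the checkpoint's config, e.g. {0: "neutral", 1: "toxic"} for the RoBERTa classifier.
for idx, label in model.config.id2label.items():
    st.write(f"{label}: {prediction[int(idx)].item():.4f}")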
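Finally, a quick numeric check of the "transform logit to get probabilities" step, using the example logits quoted in the removed comment. The result lands at the same order of magnitude as the probability tensor quoted next to it; the trailing digits differ, presumably because the committed comment was taken from unrounded logits.

import torch
from torch import nn

logits = torch.tensor([[4.8982, -5.1952]])  # values from the removed comment
probs = nn.functional.softmax(logits, dim=-1)
print(probs)  # approximately tensor([[9.9996e-01, 4.1e-05]])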