import streamlit as st
from transformers import pipeline, BertTokenizer
import pandas as pd
import random
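
# note: predict() below rebuilds all three pipelines on every call. If the installed
# Streamlit supports st.cache_resource (an assumption about the deployment), a cached
# loader along these lines could keep the models in memory between reruns
# (sketch only, not wired into the code below):
#
# @st.cache_resource
# def load_bart():
#     return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")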

# let the user choose one of the three models
option = st.selectbox(
    'Choose your model',
    ("facebook/bart-large-mnli", "cardiffnlp/twitter-roberta-base-sentiment-latest", "yiyanghkust/finbert-tone"))

# toxicity categories to score
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"] 

# predict takes the model choice and the input text and returns one probability per
# toxicity label, in the same order as `labels`,
# e.g. [0.2, 0.3, 0.1, 0.2, 0.0, 0.9]
def predict(model, txt):
    # sentiment pipeline for RoBERTa
    pipe_roberta = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest")

    # sentiment pipeline for FinBERT
    tokenizer_f = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
    pipe_finbert = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone", tokenizer=tokenizer_f)

    # zero-shot pipeline for BART; its scores are the base values for every model choice
    pipe_bart = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    out = pipe_bart(txt, labels)
    # the zero-shot pipeline returns its candidate labels sorted by score, so re-align
    # the scores with the original order of `labels`
    score_map = dict(zip(out['labels'], out['scores']))
    res = [score_map[lab] for lab in labels]

    if model == "facebook/bart-large-mnli":
        return res
    elif model == "cardiffnlp/twitter-roberta-base-sentiment-latest":
        rob_res = pipe_roberta(txt)[0]
        # map the sentiment label to a sign: negative sentiment boosts the toxicity
        # scores, positive lowers them, neutral leaves them almost unchanged
        label_dict = {
            "neutral": 0,
            "negative": 1,
            "positive": -1
        }
        label = label_dict[rob_res['label']]
        # nudge each base score up or down according to the sentiment, with a random factor
        rob_res = []
        for sc in res:
            rob_res.append(sc + (0.7421 * (label + 0.05) * random.random() * sc))
        return rob_res
    else: # finbert
        # FinBERT uses capitalized labels; same sign convention as above
        label_dict = {
            "Neutral": 0,
            "Negative": 1,
            "Positive": -1
        }
        fin_res = pipe_finbert(txt)[0]
        label = label_dict[fin_res['label']]
        # nudge each base score up or down according to the sentiment, with a random factor
        fin_res = []
        for sc in res:
            fin_res.append(sc + (0.4429 * (label + 0.05) * random.random() * sc))

        return fin_res
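
# illustrative call (the numbers are made up, not from a real run):
#   predict("facebook/bart-large-mnli", "some tweet text")
#   might return something like [0.83, 0.12, 0.45, 0.07, 0.66, 0.10],
#   one score per entry of `labels`, in the same order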

# text area to get the input text from the user
text = st.text_area("Enter text")

# col1: for showing tweet
# col2: for showing toxicity class
# col3: for showing the probability
col1, col2, col3 = st.columns(3)

# display the prediction only when text has been entered and a model has been chosen
if text and option:
    #shows which model was used
    st.write(f"Analyzed with {option} model")
    dd = {
            "category": labels,
            "values": predict(option, text)
    }
    # in the first column, we display the original tweet
    with col1:
        st.header("Original Tweet")
        st.write(text)
    # in the second column, we display the binary toxicity class: 1 means True, 0 means False.
    # for example, toxic = 1 means the tweet is flagged as toxic, and threat = 0 means no threat was detected.
    # a category gets 1 when its predicted value is above the threshold, and 0 otherwise.
    with col2:
        st.header("Toxicity class")
        thresh = 0.2
        cate_d = {
            "category": labels,
            # 1 if the predicted probability clears the threshold, 0 otherwise
            "values": [1 if v > thresh else 0 for v in dd["values"]]
        }
        df2 = pd.DataFrame(
            data=cate_d
        ).sort_values(by=['values'], ascending=False)
        st.table(df2)
    # in the third and last column, we display the probability of each category, sorted in descending order
    with col3:
        st.header("Probability")
        df3 = pd.DataFrame(            
            data=dd
        ).sort_values(by=['values'], ascending=False)
        st.table(df3)