File size: 5,079 Bytes
ffc96c9
668f6af
 
ffc96c9
 
aa2cadb
ea09ee5
bcfb40b
668f6af
b6852b8
668f6af
b6852b8
 
 
228ca50
 
668f6af
228ca50
 
b6852b8
66f2f1a
b6852b8
a7fbbb7
ffc96c9
b6852b8
228ca50
bcfb40b
e294a0a
b6852b8
228ca50
b6852b8
228ca50
b6852b8
228ca50
 
 
c0f871b
b6852b8
228ca50
aa2cadb
b6852b8
228ca50
 
c789552
668f6af
b6852b8
c789552
 
 
 
 
aa2cadb
bcfb40b
aa2cadb
a7fbbb7
 
 
 
 
 
 
 
 
03582b6
 
8942604
 
 
 
 
 
 
 
 
 
 
 
aa2cadb
8942604
 
 
 
 
 
 
aa2cadb
8942604
bcfb40b
8942604
 
 
 
 
 
 
668f6af
 
 
 
 
23238de
 
b6852b8
ffc96c9
23238de
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import streamlit as st
import pandas as pd
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Define global variables

# Hugging Face Hub id of the custom fine-tuned BERT toxicity classifier used below.
FINE_TUNED_MODEL = "andyqin18/finetuned-bert-uncased"
# Number of hard-coded demo texts (matches the length of `sample_texts` defined later).
NUM_SAMPLE_TEXT = 10

# Define analyze function
@st.cache_resource
def _get_classifier(model_name: str, top_k=1):
    '''
    Load and cache a sentiment-analysis pipeline for `model_name`.

    Loading a transformer model is expensive (hundreds of MB); without caching
    the original code re-downloaded/re-built it on every call, including once
    per row of the 10-sample demo table. `st.cache_resource` keeps one pipeline
    per (model_name, top_k) for the app's lifetime.
    '''
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, top_k=top_k)


def analyze(model_name: str, text: str, top_k=1) -> dict:
    '''
    Output result of sentiment analysis of a text through a defined model.

    :param model_name: Hugging Face model id to run.
    :param text: input string to classify.
    :param top_k: number of labels to return per input (forwarded to the pipeline).
    :return: pipeline output — a list with one list of {'label', 'score'} dicts per input.
    '''
    return _get_classifier(model_name, top_k)(text)

# App title and intro copy.
st.title("Toxic Tweet Detection and Sentiment Analysis App")
st.write("This app is to analyze the sentiments behind a text.")
st.write("You can choose to use my fine-tuned model or pre-trained models.")

# Model hub: maps each selectable model id to a short human-readable description.
model_descrip = {
    FINE_TUNED_MODEL: "This is a customized BERT-base finetuned model that detects multiple toxicity for a text. \
        Labels: toxic, severe_toxic, obscene, threat, insult, identity_hate",
    "distilbert-base-uncased-finetuned-sst-2-english": "This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2. \
        Labels: POSITIVE; NEGATIVE ",
    "cardiffnlp/twitter-roberta-base-sentiment": "This is a roBERTa-base model trained on ~58M tweets and finetuned for sentiment analysis with the TweetEval benchmark. \
        Labels: 0 -> Negative; 1 -> Neutral; 2 -> Positive",
    "finiteautomata/bertweet-base-sentiment-analysis": "Model trained with SemEval 2017 corpus (around ~40k tweets). Base model is BERTweet, a RoBERTa model trained on English tweets.  \
        Labels: POS; NEU; NEG"
}

# Input widgets: pre-filled text box, plus a model picker
# (selectbox iterates the dict, so the options are its keys).
user_input = st.text_input("Enter your text:", value="I hate NLP. Always lacking GPU.")
user_model = st.selectbox("Please select a model:", model_descrip)


# Display the description of whichever model is currently selected.
st.write("### Model Description:")
st.write(model_descrip[user_model])


# Perform analysis and print result
def _toxicity_top2(text: str) -> dict:
    '''
    Run the fine-tuned toxicity model on `text` and return a one-row record
    with the two highest-scoring labels.

    Extracted because the original code duplicated this label/score unpacking
    verbatim for the user's input and again inside the 10-sample loop.
    '''
    result = analyze(FINE_TUNED_MODEL, text, top_k=2)  # Top 2 labels with highest score
    return {
        "Text": text,
        "Highest Toxicity Class": result[0][0]['label'],
        "Highest Score": result[0][0]['score'],
        "Second Highest Toxicity Class": result[0][1]['label'],
        "Second Highest Score": result[0][1]['score'],
    }


if st.button("Analyze"):
    if not user_input:
        st.write("Please enter a text.")
    else:
        with st.spinner("Hang on.... Analyzing..."):
            # If fine-tuned: show the top-2 toxicity classes as a one-row table.
            if user_model == FINE_TUNED_MODEL:
                st.dataframe(pd.DataFrame([_toxicity_top2(user_input)]))

                # 10 Sample Table
                st.write("Here are 10 more examples.")
                sample_texts = [
                    "Please stop. If you continue to vandalize Wikipedia, as you did to Homosexuality, you will be blocked from editing.",
                    "knock it off you bloody CWI trot",
                    "No, he is an arrogant, self serving, immature idiot. Get it right.",
                    "to fuck you and ur family",
                    "Search Google, it's listed as 1966 everywhere I've seen, including many PJ related sites.",
                    "That entry made a lot of sense to me. ",
                    "KSchwartz is an annoying person who often smells of rotten fish and burnt animal hair.",
                    "Cool!",
                    "u suck u suck u suck u suck u sucku suck u suck u suck u suck u u suck",
                    "go fuck yourself ...cunt"
                    ]

                # Truncate each sample to 50 chars (keeps the table compact,
                # matching the original behavior).
                sample_rows = [_toxicity_top2(text[:50]) for text in sample_texts]
                st.dataframe(pd.DataFrame(sample_rows))
                st.write("( ─ ‿ ‿ ─ )")


            else:
                # Pre-trained models: show only the single best label and score.
                result = analyze(user_model, user_input)
                st.write("Result:")
                st.write(f"Label: **{result[0][0]['label']}**")
                st.write(f"Confidence Score: **{result[0][0]['score']}**")

else:
    st.write("Go on! Try the app!")