File size: 10,408 Bytes
35f56ba
7749ef6
47ef74f
cfa2b70
cd87a42
 
 
8389a97
391374c
8389a97
9c5b410
d71bb22
e5de86c
 
26f6079
cd87a42
26dac8d
cd87a42
26f6079
e43f53b
e5de86c
 
26f6079
2b66ed3
 
 
 
 
391374c
21ef14c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e5de86c
 
21ef14c
 
 
e5de86c
 
c39351b
 
 
 
 
 
 
21ef14c
 
 
 
 
 
 
 
 
 
2b66ed3
e5de86c
84b6ab2
 
391374c
84b6ab2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae0f2db
 
4ba5dd1
fe3f137
 
 
 
 
 
 
 
 
 
 
 
 
21ef14c
df89f66
 
 
 
 
 
 
 
4ba5dd1
0a5c3d5
 
4ba5dd1
0a5c3d5
 
 
 
 
 
4ba5dd1
fe3f137
391374c
 
0a5c3d5
26f6079
e5de86c
26f6079
 
 
 
 
 
 
 
 
 
e5de86c
26dac8d
 
 
 
 
 
 
 
 
 
 
47ef74f
d5b90e7
dff0151
21d64ee
dff0151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import streamlit as st
import tensorflow as tf
from transformers import pipeline
from textblob import TextBlob
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
from transformers import BertForMaskedLM
import pandas as pd

# model = BertForMaskedLM.from_pretrained("remi/bertabs-finetuned-extractive-abstractive-summarization")

# Set up the pre-populated text input and the drop-down menu of model options

# Free-text box for the sentence to analyse, seeded with a sample sentence.
textIn = st.text_input(
    label="Input Text Here:",
    value="I really like the color of your car!",
)

# Drop-down used to pick which analyser handles the text further down the script.
option = st.selectbox(
    label='Which pre-trained model would you like for your sentiment analysis?',
    options=('MILESTONE 3', 'Pipeline', 'TextBlob'),
)

# Echo the current selection back to the page.
st.write('You selected:', option)

# Milestone 3 models: all were pre-trained using "TrainingAlgo" and uploaded to Hugging Face, but I could not successfully use them on input text.
# For my multi-headed model, I trained 6 models, each returning a value between 0 and 1: the confidence that the input text is 'Toxic', 'Obscene', etc.
if option == 'MILESTONE 3':
    # Load the six fine-tuned checkpoints (one Hub repo per index) together with
    # their tokenizers, and wrap each pair in a pipeline. The three lists stay
    # index-aligned: entry i of each list belongs to "Rathgeberj/milestone3_<i>".
    # NOTE(review): the checkpoints are loaded with a masked-LM head yet handed to a
    # "sentiment-analysis" pipeline; an earlier revision loaded them with
    # AutoModelForSequenceClassification instead -- confirm which head the
    # uploaded weights actually contain.
    models = []
    tokenizers = []
    classifiers = []
    for head_idx in range(6):
        repo_id = f"Rathgeberj/milestone3_{head_idx}"
        head_model = BertForMaskedLM.from_pretrained(repo_id)
        head_tokenizer = AutoTokenizer.from_pretrained(repo_id)
        head_classifier = pipeline(task="sentiment-analysis", model=head_model, tokenizer=head_tokenizer)
        models.append(head_model)
        tokenizers.append(head_tokenizer)
        classifiers.append(head_classifier)

    # Notice shown whenever this option is selected; reaching this line means all
    # six checkpoints loaded without raising.
    st.write('IF YOURE READING THIS: I was unable to complete a fully functioning milestone 3. \
              If this message print, that means my program successfully loaded my pretrained models. \
              They are fine tuned iterations of the Bert uncased model, trained on the given training data. \
              The issue arose when I tried to use my models to analyze the input string, and after much troubleshooting, \
              I was unable to get it to work. My pre-processing and training algorithm, along with each models .json and config \
              files will be linked in the github along with the tokenizer I used.')

    # Inference is not wired up. The disabled attempt tokenized the text with
    # padding/truncation (max_length=512, PyTorch tensors), ran each model under
    # torch.no_grad() with labels=torch.tensor([1, 0]), applied a softmax over
    # dim=1 of the logits and took the argmax as the predicted label.

    # Ten pre-populated tweets for the results table.
    sample_tweets = [
        'Yo bitch Ja Rule is more succesful then youll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.',
        'If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadnt updated. I shall update the information once again but thank you for your message.',
        'I dont anonymously edit articles at all.',
        'Thank you for understanding. I think very highly of you and would not revert without discussion.',
        'Please do not add nonsense to Wikipedia. Such edits are considered vandalism and quickly undone. If you would like to experiment, please use the sandbox instead. Thank you.   -',
        'Dear god this site is horrible.',
        'I think its crap that the link to roggenbier is to this article. Somebody that knows how to do things should change it.',
        'Please stop. If you continue to vandalize Wikipedia, as you did to Homosexuality, you will be blocked from editing.',
        'yeah, thanks for reviving the tradition of pissing all over articles because you want to live out your ethnic essentialism. Why let mere facts get into the way of enjoying that.',
        'Ive deleted the page , as we have no evidence that you are the person named on that page, and its content goes against Wikipedias policies for the use of user pages.',
    ]

    # No predictions are produced, so every label/score cell holds the
    # placeholder value 0; the column order matches the displayed table.
    row_count = len(sample_tweets)
    df = pd.DataFrame({
        'Tweet': sample_tweets,
        'Highest_Toxicity_Class_Overall': [0] * row_count,
        'Score_Overall': [0] * row_count,
        'Highest_Toxicity_Class_Except_Toxic': [0] * row_count,
        'Score_Except_Toxic': [0] * row_count,
    })

    st.table(df)

# Pre-trained DistilBERT model from Hugging Face
if option == 'Pipeline':
    # Model and tokenizer are both fetched by the same checkpoint id and wired
    # into a sentiment-analysis pipeline.
    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    classifier = pipeline(
        task="sentiment-analysis",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )

    # Only the first prediction is displayed, with its score rounded to 4 decimals.
    top_pred = classifier(textIn)[0]
    shown_label = top_pred["label"]
    shown_score = round(top_pred["score"], 4)
    st.write('According to Pipeline, input text is ', shown_label, ' with a confidence of ', shown_score)

# TextBlob sentiment analyzer (imported from the textblob package, not loaded from Hugging Face)
if option == 'TextBlob':
    # Analyse the text once and read both scores from the same result, instead
    # of building a second TextBlob just to fetch the subjectivity.
    blob_sentiment = TextBlob(textIn).sentiment
    polarity = blob_sentiment.polarity
    subjectivity = blob_sentiment.subjectivity

    # Bucket the text by the sign of its polarity score; exactly 0 is neutral.
    if polarity < 0:
        sentiment = 'Negative'
    elif polarity == 0:
        sentiment = 'Neutral'
    else:
        sentiment = 'Positive'
    st.write('According to TextBlob, input text is ', sentiment, ' and a subjectivity score (from 0 being objective to 1 being subjective) of ', subjectivity)


#------------------------------------------------------------------------

# tokens = tokenizer.tokenize(textIn)
# token_ids = tokenizer.convert_tokens_to_ids(tokens)
# input_ids = tokenizer(textIn)


# X_train = [textIn]

# batch = tokenizer(X_train, padding=True, truncation=True, max_length=512, return_tensors="pt")
# # batch = torch.tensor(batchbatch["input_ids"])

# with torch.no_grad():
#     outputs = model(**batch, labels=torch.tensor([1, 0]))
#     predictions = F.softmax(outputs.logits, dim=1)
#     labels = torch.argmax(predictions, dim=1)
#     labels = [model.config.id2label[label_id] for label_id in labels.tolist()]

# # save_directory = "saved"
# tokenizer.save_pretrained(save_directory)
# model.save_pretrained(save_directory)

# tokenizer = AutoTokenizer.from_pretrained(save_directory)
# model = AutoModelForSequenceClassification.from_pretrained(save_directory)