andyqin18 committed
Commit 668f6af
1 Parent(s): f96a9e8

Test Table

Files changed (3)
  1. app.py +45 -7
  2. milestone3/milestone3.py +16 -73
  3. requirements.txt +2 -1
app.py CHANGED

@@ -1,14 +1,20 @@
 import streamlit as st
+import pandas as pd
+import numpy as np
 from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 
+
+fine_tuned_model = "andyqin18/test-finetuned"
+sample_text_num = 10
+
 # Define analyze function
-def analyze(model_name: str, text: str) -> dict:
+def analyze(model_name: str, text: str, top_k=1) -> dict:
     '''
     Output result of sentiment analysis of a text through a defined model
     '''
     model = AutoModelForSequenceClassification.from_pretrained(model_name)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
+    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, top_k=top_k)
     return classifier(text)
 
 # App title
@@ -18,7 +24,7 @@ st.write("Currently it uses pre-trained models without fine-tuning.")
 
 # Model hub
 model_descrip = {
-    "andyqin18/test-finetuned": "This is a customized BERT-base finetuned model that detects multiple toxicity for a text. \
+    fine_tuned_model: "This is a customized BERT-base finetuned model that detects multiple toxicity for a text. \
     Labels: toxic, severe_toxic, obscene, threat, insult, identity_hate",
     "distilbert-base-uncased-finetuned-sst-2-english": "This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2. \
     Labels: POSITIVE; NEGATIVE ",
@@ -28,6 +34,27 @@ model_descrip = {
     Labels: POS; NEU; NEG"
 }
 
+df = pd.read_csv("/milestone3/comp/test_comment.csv")
+test_texts = df["comment_text"].values
+sample_texts = np.random.choice(test_texts, size=sample_text_num, replace=False)
+
+init_table_dict = {
+    "Text": [],
+    "Highest Toxicity Class": [],
+    "Highest Score": [],
+    "Second Highest Toxicity Class": [],
+    "Second Highest Score": []
+}
+
+for text in sample_texts:
+    result = analyze(fine_tuned_model, text, top_k=2)
+    init_table_dict["Text"].append(text[:50])
+    init_table_dict["Highest Toxicity Class"].append(result[0][0]['label'])
+    init_table_dict["Highest Score"].append(result[0][0]['score'])
+    init_table_dict["Second Highest Toxicity Class"].append(result[0][1]['label'])
+    init_table_dict["Second Highest Score"].append(result[0][1]['score'])
+
+
 user_input = st.text_input("Enter your text:", value="NYU is the better than Columbia.")
 user_model = st.selectbox("Please select a model:", model_descrip)
 
@@ -35,16 +62,27 @@ user_model = st.selectbox("Please select a model:", model_descrip)
 st.write("### Model Description:")
 st.write(model_descrip[user_model])
 
+
+
+
 # Perform analysis and print result
 if st.button("Analyze"):
     if not user_input:
         st.write("Please enter a text.")
     else:
         with st.spinner("Hang on.... Analyzing..."):
-            result = analyze(user_model, user_input)
-            st.write("Result:")
-            st.write(f"Label: **{result[0]['label']}**")
-            st.write(f"Confidence Score: **{result[0]['score']}**")
+            if user_model == fine_tuned_model:
+                result = analyze(user_model, user_input, top_k=2)
+
+
+                df = pd.DataFrame(init_table_dict)
+                st.dataframe(df)
+
+            else:
+                result = analyze(user_model, user_input)
+                st.write("Result:")
+                st.write(f"Label: **{result[0]['label']}**")
+                st.write(f"Confidence Score: **{result[0]['score']}**")
 
 else:
     st.write("Go on! Try the app!")
milestone3/milestone3.py CHANGED

@@ -1,82 +1,25 @@
 # from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 
-# import torch
-# import torch.nn.functional as F
-
-# model_name = "andyqin18/test-finetuned"
-
-# model = AutoModelForSequenceClassification.from_pretrained(model_name)
-# tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-# classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
-
-# res = classifier(["Fuck your mom",
-#                   "Hope you don't hate it"])
-
-# for result in res:
-#     print(result)
+# def analyze(model_name: str, text: str, top_k=1) -> dict:
+#     '''
+#     Output result of sentiment analysis of a text through a defined model
+#     '''
+#     model = AutoModelForSequenceClassification.from_pretrained(model_name)
+#     tokenizer = AutoTokenizer.from_pretrained(model_name)
+#     classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, top_k=top_k)
+#     return classifier(text)
+
+
+# user_input = "Go fuck yourself"
+# user_model = "andyqin18/test-finetuned"
+
+# result = analyze(user_model, user_input, top_k=4)
+
+# print(result[0][0]['label'])
+
 import pandas as pd
-from sklearn.model_selection import train_test_split
-import torch
-from torch.utils.data import Dataset
-from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
-device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
 import numpy as np
-
-df = pd.read_csv("comp/train.csv")
-
-train_texts = df["comment_text"].values
-train_labels = df[df.columns[2:]].values
-# print(train_labels[0])
-
-# np.random.seed(123)
-# small_train_texts = np.random.choice(train_texts, size=1000, replace=False)
-# small_train_labels_idx = np.random.choice(train_labels.shape[0], size=1000, replace=False)
-# small_train_labels = train_labels[small_train_labels_idx, :]
-
-
-# train_texts, val_texts, train_labels, val_labels = train_test_split(small_train_texts, small_train_labels, test_size=.2)
-train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
-
-class TextDataset(Dataset):
-    def __init__(self,texts,labels):
-        self.texts = texts
-        self.labels = labels
-
-    def __getitem__(self,idx):
-        encodings = tokenizer(self.texts[idx], truncation=True, padding="max_length")
-        item = {key: torch.tensor(val) for key, val in encodings.items()}
-        item['labels'] = torch.tensor(self.labels[idx],dtype=torch.float32)
-        del encodings
-        return item
-
-    def __len__(self):
-        return len(self.labels)
-
-tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-train_dataset = TextDataset(train_texts,train_labels)
-val_dataset = TextDataset(val_texts, val_labels)
-# small_train_dataset = train_dataset.shuffle(seed=42).select(range(1000))
-# small_val_dataset = val_dataset.shuffle(seed=42).select(range(1000))
-
-
-
-model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6, problem_type="multi_label_classification")
-model.to(device)
-training_args = TrainingArguments(
-    output_dir="finetuned-bert-uncased",
-    per_device_train_batch_size=16,
-    per_device_eval_batch_size=64,
-    learning_rate=5e-4,
-    weight_decay=0.01,
-    evaluation_strategy="epoch",
-    push_to_hub=True)
-
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=train_dataset,
-    eval_dataset=val_dataset,
-)
-
-trainer.train()
+df = pd.read_csv("milestone3/comp/test_comment.csv")
+test_texts = df["comment_text"].values
+sample_texts = np.random.choice(test_texts, size=10, replace=False)
+print(sample_texts)
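
Review note on the new milestone3/milestone3.py: the sampling is unseeded, so sample_texts differs on every run; the removed training code had np.random.seed(123) commented out. If a reproducible sample is wanted, a sketch using numpy's default_rng (the seed value 123 is borrowed from that commented-out line):

import pandas as pd
import numpy as np

df = pd.read_csv("milestone3/comp/test_comment.csv")
test_texts = df["comment_text"].values

rng = np.random.default_rng(123)  # fixed seed: the same 10 comments are drawn on each run
sample_texts = rng.choice(test_texts, size=10, replace=False)
print(sample_texts)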
requirements.txt CHANGED

@@ -1,2 +1,3 @@
 transformers
-torch
+torch
+pandas
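
Note on requirements.txt: the new app.py also imports numpy, which is only pulled in transitively via pandas here; listing it explicitly would make the dependency visible (streamlit itself is absent too, presumably because the Space runtime provides it). A hedged sketch of the fuller file:

transformers
torch
pandas
numpy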