"""Streamlit app: sentiment / toxicity analysis of a text with Hugging Face models.

The user types a text, picks a model from ``model_descrip`` and presses
"Analyze".  For the fine-tuned toxicity model the two highest-scoring labels
are shown in a table, and a second button runs the same model on random
comments read from a CSV file.  For every other model the single best label
and its score are shown.
"""

import functools

import numpy as np
import pandas as pd
import streamlit as st
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

fine_tuned_model = "andyqin18/test-finetuned"
sample_text_num = 10

_SAMPLE_CSV_PATH = "milestone3/comp/test_comment.csv"
_SAMPLE_MAX_CHARS = 50  # sample comments are truncated to this many characters
_ANALYZED_KEY = "analyzed_request"  # session_state key holding the last analysed (text, model)
_TOXICITY_COLUMNS = [
    "Text",
    "Highest Toxicity Class",
    "Highest Score",
    "Second Highest Toxicity Class",
    "Second Highest Score",
]

# Streamlit re-executes this script on every interaction, so a module-level
# functools cache is rebuilt on each rerun.  Prefer st.cache_resource when the
# installed Streamlit provides it (the cache then survives reruns); otherwise
# fall back to lru_cache, which still avoids reloading within a single run.
_cache_models = getattr(st, "cache_resource", None) or functools.lru_cache(maxsize=4)


@_cache_models
def _load_classifier(model_name: str, top_k: int):
    """Build the text-classification pipeline for ``model_name`` once and reuse it."""
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, top_k=top_k)


def analyze(model_name: str, text: str, top_k=1) -> list:
    """Run sentiment analysis on ``text`` with the given model.

    Args:
        model_name: Hugging Face model id to load.
        text: The text to classify.
        top_k: Number of highest-scoring labels requested from the pipeline.

    Returns:
        The raw pipeline output for ``text``.  Its nesting depth depends on the
        installed transformers version; use ``_predictions`` to flatten it.
    """
    return _load_classifier(model_name, top_k)(text)


def _predictions(result) -> list:
    """Flatten raw pipeline output into a list of ``{"label", "score"}`` dicts."""
    if isinstance(result, dict):
        return [result]
    # Depending on how top_k is passed, the pipeline wraps the per-text list
    # of predictions in an extra outer list; peel those layers off.
    while result and isinstance(result[0], list):
        result = result[0]
    return result


def _toxicity_row(text: str) -> dict:
    """Analyse ``text`` with the fine-tuned model and return one result-table row."""
    top, second = _predictions(analyze(fine_tuned_model, text, top_k=2))[:2]
    return {
        "Text": text,
        "Highest Toxicity Class": top["label"],
        "Highest Score": top["score"],
        "Second Highest Toxicity Class": second["label"],
        "Second Highest Score": second["score"],
    }


def _sample_comments(path: str, count: int) -> list:
    """Return up to ``count`` distinct random comments (truncated) from the CSV at ``path``."""
    # Missing comments are read as NaN (a float), which cannot be sliced.
    comments = pd.read_csv(path)["comment_text"].dropna().astype(str).to_numpy()
    if len(comments) == 0:
        return []
    # Sampling without replacement fails if more items are requested than exist.
    chosen = np.random.choice(comments, size=min(count, len(comments)), replace=False)
    return [comment[:_SAMPLE_MAX_CHARS] for comment in chosen]


# App title
st.title("Sentiment Analysis App - Milestone3")
st.write("This app is to analyze the sentiments behind a text.")
st.write("You can choose to use my fine-tuned model or pre-trained models.")

# Model hub: model id -> description shown to the user
model_descrip = {
    fine_tuned_model: (
        "This is a customized BERT-base finetuned model that detects multiple toxicity for a text. "
        "Labels: toxic, severe_toxic, obscene, threat, insult, identity_hate"
    ),
    "distilbert-base-uncased-finetuned-sst-2-english": (
        "This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2. "
        "Labels: POSITIVE; NEGATIVE "
    ),
    "cardiffnlp/twitter-roberta-base-sentiment": (
        "This is a roBERTa-base model trained on ~58M tweets and finetuned for sentiment analysis "
        "with the TweetEval benchmark. "
        "Labels: 0 -> Negative; 1 -> Neutral; 2 -> Positive"
    ),
    "finiteautomata/bertweet-base-sentiment-analysis": (
        "Model trained with SemEval 2017 corpus (around ~40k tweets). Base model is BERTweet, "
        "a RoBERTa model trained on English tweets. "
        "Labels: POS; NEU; NEG"
    ),
}

user_input = st.text_input("Enter your text:", value="NYU is the better than Columbia.")
user_model = st.selectbox("Please select a model:", model_descrip)

# Display model information
st.write("### Model Description:")
st.write(model_descrip[user_model])

# A Streamlit button is True only during the rerun triggered by its own click.
# Clicking the nested "sample" button reruns the script with "Analyze" back to
# False, so a plain nested `if st.button(...)` can never fire.  Remember which
# (text, model) pair was analysed so the result section stays visible across
# that rerun, while still disappearing as soon as the text or model changes.
if st.button("Analyze"):
    st.session_state[_ANALYZED_KEY] = (user_input, user_model)

# Perform analysis and print result
if st.session_state.get(_ANALYZED_KEY) == (user_input, user_model):
    if not user_input:
        st.write("Please enter a text.")
    else:
        with st.spinner("Hang on.... Analyzing..."):
            if user_model == fine_tuned_model:
                st.dataframe(
                    pd.DataFrame([_toxicity_row(user_input)], columns=_TOXICITY_COLUMNS)
                )

                if st.button("Click to generate ten sample analysis"):
                    sample_rows = [
                        _toxicity_row(text)
                        for text in _sample_comments(_SAMPLE_CSV_PATH, sample_text_num)
                    ]
                    st.dataframe(pd.DataFrame(sample_rows, columns=_TOXICITY_COLUMNS))
                else:
                    st.write("(─‿‿─)")
            else:
                best = _predictions(analyze(user_model, user_input))[0]
                st.write("Result:")
                st.write(f"Label: **{best['label']}**")
                st.write(f"Confidence Score: **{best['score']}**")
else:
    st.write("Go on! Try the app!")