Spaces:
Runtime error
Runtime error
tanquangduong
commited on
Commit
•
b93e9c1
1
Parent(s):
55c74d2
:tada: add application files
Browse files- .vscode/settings.json +19 -0
- app.py +56 -0
- figs/AI-driven-Solutions.png +0 -0
- figs/sentiment-analysis-streaming.png +0 -0
- pages/1_Review_Sentiment_Analysis.py +140 -0
- requirements.txt +16 -0
- utils.py +92 -0
.vscode/settings.json
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"editor.tabSize": 2,
|
3 |
+
"editor.codeActionsOnSave": {
|
4 |
+
"source.fixAll.eslint": true
|
5 |
+
},
|
6 |
+
"python.linting.pycodestyleEnabled": true,
|
7 |
+
"python.linting.pycodestyleArgs": [
|
8 |
+
"--max-line-length=150"
|
9 |
+
],
|
10 |
+
"python.linting.pylintEnabled": true,
|
11 |
+
"python.linting.pylintArgs": [
|
12 |
+
"--generated-members=numpy.* ,torch.*"
|
13 |
+
],
|
14 |
+
"python.formatting.provider": "none",
|
15 |
+
"[python]": {
|
16 |
+
"editor.formatOnSave": true,
|
17 |
+
"editor.defaultFormatter": "ms-python.black-formatter"
|
18 |
+
}
|
19 |
+
}
|
app.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
@author: Tan Quang Duong
|
3 |
+
"""
|
4 |
+
|
5 |
+
|
6 |
+
import streamlit as st
|
7 |
+
import pandas as pd
|
8 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
9 |
+
from datasets import load_dataset
|
10 |
+
from PIL import Image
|
11 |
+
|
12 |
+
|
13 |
+
# Sidebar branding: the logo image is read from the local figs/ folder.
app_logo = Image.open("./figs/AI-driven-Solutions.png")

# Page-level configuration followed by the sidebar logo and author heading.
st.set_page_config(page_title="Review Sentiment Analysis", page_icon="🚀", layout="wide")
st.sidebar.image(app_logo, use_column_width=True)
st.sidebar.markdown(
    "<h1 style='text-align: center; color: grey;'> Quang Duong </h1>",
    unsafe_allow_html=True,
)

# Hugging Face hub identifier used for both the tokenizer and the classifier.
model_name = "tanquangduong/distilbert-imdb"


def _load_shuffled_imdb_test():
    """Return the IMDB test split as a DataFrame with its rows shuffled."""
    return pd.DataFrame(load_dataset("imdb")["test"]).sample(frac=1)


# Each resource is built only when its key is not yet in st.session_state, so
# script reruns within a session reuse the stored objects. The dict order is
# the load order: tokenizer, then model, then dataset.
_session_loaders = {
    "tokenizer": lambda: AutoTokenizer.from_pretrained(model_name),
    "model": lambda: AutoModelForSequenceClassification.from_pretrained(model_name),
    "df_imdb_test": _load_shuffled_imdb_test,
}
for _state_key, _build in _session_loaders.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _build()
|
41 |
+
|
42 |
+
# Landing-page copy: a headline followed by the app objective and a list of
# example use cases. Spelling in the user-facing text has been corrected
# ("below", "limited", "medicine").
st.write("# Welcome to LLM-based sentiment analysis app!👋")

st.markdown(
    """
# Objective
This app leverages LLM to perform **:green[sentiment analysis]** for **:green[user reviews]**. Some potential use-cases are as below, but not limited to:
- User reviews for drug efficiency on drug/medicine forums
- User reviews for mobile applications on app stores, e.g. Google Play, App Store
- User reviews for food quality on food delivery app
- User reviews for product quality on e-commerce websites
- etc.
"""
)
|
figs/AI-driven-Solutions.png
ADDED
figs/sentiment-analysis-streaming.png
ADDED
pages/1_Review_Sentiment_Analysis.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
@author: Tan Quang Duong
|
3 |
+
"""
|
4 |
+
|
5 |
+
|
6 |
+
import streamlit as st
|
7 |
+
import hydralit_components as hc
|
8 |
+
from hydralit_components import HyLoader, Loaders
|
9 |
+
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
+
from sklearn import metrics
|
12 |
+
from utils import inference_from_pytorch, plot_confusion_matric, plot_donut_sentiment_percentage, create_classification_report, get_100_random_test_review
|
13 |
+
from PIL import Image
|
14 |
+
|
15 |
+
|
16 |
+
# Sidebar branding: the logo image is read from the local figs/ folder.
app_logo = Image.open("./figs/AI-driven-Solutions.png")

# set page config
st.set_page_config(page_title="Review Sentiment Analysis", page_icon="🚀", layout="wide")
st.sidebar.image(app_logo, use_column_width=True)
st.sidebar.markdown(
    "<h1 style='text-align: center; color: grey;'> Quang Duong </h1>",
    unsafe_allow_html=True,
)

# specify the primary menu definition
menu_data = [{"id": "tab1", "icon": "😊😒", "label": "Review Sentiment Analysis"}]

over_theme = {
    "menu_background": "#7BB657",
    "txc_active": "#000000",
    "txc_inactive": "#FFFFFF",
}
menu_id = hc.nav_bar(
    menu_definition=menu_data,
    override_theme=over_theme,
    hide_streamlit_markers=False,  # will show the st hamburger as well as the navbar now!
    sticky_nav=True,  # at the top or not
    sticky_mode="pinned",  # jumpy or not-jumpy, but sticky or pinned
)

# The rest of this page needs the tokenizer, the model and the IMDB test frame
# from st.session_state. If any of them is missing, show the hint and halt the
# script here; otherwise the code below would fail with a NameError on the
# first use of `tokenizer`, `model` or `df_test`.
_required_state_keys = ("tokenizer", "model", "df_imdb_test")
if any(key not in st.session_state for key in _required_state_keys):
    st.write(
        "Please come back to Home page for loading tokenizer, model and dataset from Hugging Face hub."
    )
    st.stop()

tokenizer = st.session_state["tokenizer"]
model = st.session_state["model"]
df_test = st.session_state["df_imdb_test"]

# Flag that records whether a 100-review sample has been drawn in this session;
# it gates the "Inference" button further down the page.
if "is_df_test_100_loaded" not in st.session_state:
    st.session_state["is_df_test_100_loaded"] = False
|
64 |
+
|
65 |
+
|
66 |
+
# Main page body, rendered inside the hydralit loader animation.
with HyLoader("", loader_name=Loaders.pulse_bars):
    if menu_id == "tab1":
        input_mode = st.radio(
            "**Select input mode** 👇",
            ("Review streaming", "Add review manually"),
            horizontal=True,
        )

        # Mode 1: draw a batch of test reviews and score them all at once.
        if input_mode == "Review streaming":
            if st.button("Simulate streaming 100 random reviews"):
                # get 100 random reviews as dataframe df_test_100 and keep them
                # in session state so the "Inference" click (a separate script
                # rerun) can find them again
                df_test_100 = get_100_random_test_review(df_test)
                st.session_state["df_test_100"] = df_test_100

                # display 100 random reviews
                st.dataframe(df_test_100, use_container_width=True)
                st.session_state["is_df_test_100_loaded"] = True

            if st.session_state["is_df_test_100_loaded"]:
                if st.button("Inference"):
                    df_test_100_loaded = st.session_state["df_test_100"]

                    # Run the model ONCE per review and split the
                    # (class_id, label) pairs into two columns afterwards;
                    # calling the model separately for each column would
                    # double the inference time.
                    predictions = [
                        inference_from_pytorch(review, tokenizer, model)
                        for review in df_test_100_loaded["text"]
                    ]
                    df_test_100_loaded["predicted_class_id"] = [
                        class_id for class_id, _ in predictions
                    ]
                    df_test_100_loaded["predicted_class"] = [
                        label for _, label in predictions
                    ]

                    st.write("Sentiment analysis completed! Here is the result: 👇")
                    # display dataframe
                    st.dataframe(df_test_100_loaded, use_container_width=True)

                    # Prediction count per label. value_counts() orders by
                    # frequency, so the counts are re-indexed by class id to
                    # stay aligned with the label list (id 1 -> "positive",
                    # id 0 -> "negative", matching the negative/positive order
                    # used for the classification report). fill_value=0 keeps
                    # both rows even if one class was never predicted.
                    class_counts = (
                        df_test_100_loaded["predicted_class_id"]
                        .value_counts()
                        .reindex([1, 0], fill_value=0)
                    )
                    pred_labels = {
                        "label": ["positive", "negative"],
                        "count": class_counts.tolist(),
                    }
                    df_pred_labels = pd.DataFrame(pred_labels)

                    # calculate confusion matrix; labels are pinned so the
                    # matrix is always 2x2, even for a single-class sample
                    confusion_matrix = metrics.confusion_matrix(
                        df_test_100_loaded.class_id,
                        df_test_100_loaded.predicted_class_id,
                        labels=[0, 1],
                    )

                    # get classification report
                    df_report = create_classification_report(
                        df_test_100_loaded.class_id,
                        df_test_100_loaded.predicted_class_id,
                    )

                    col1, col2 = st.columns(2, gap="large")
                    with col1:
                        # plot donut chart for sentiment percentage
                        st.pyplot(plot_donut_sentiment_percentage(df_pred_labels))
                    with col2:
                        # plot confusion matrix
                        st.pyplot(plot_confusion_matric(confusion_matrix))

                    # display classification report
                    st.dataframe(df_report, use_container_width=True)

        # Mode 2: score a single review typed by the user.
        elif input_mode == "Add review manually":
            text_input = st.text_area("Type your review here:", height=200)
            if text_input:
                st.write(
                    "Predicted sentiment: **{}**".format(
                        inference_from_pytorch(text_input, tokenizer, model)[1]
                    )
                )
|
requirements.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pylint
|
2 |
+
black
|
3 |
+
pycodestyle
|
4 |
+
requests
|
5 |
+
hydralit_components
|
6 |
+
pandas
|
7 |
+
numpy
|
8 |
+
matplotlib
|
9 |
+
seaborn
|
10 |
+
torch
|
11 |
+
transformers
|
12 |
+
datasets
|
13 |
+
evaluate
|
14 |
+
scikit-learn
|
15 |
+
pillow
|
16 |
+
streamlit-aggrid
|
utils.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
@author: Tan Quang Duong
|
3 |
+
"""
|
4 |
+
import torch
|
5 |
+
import matplotlib
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
import seaborn as sns
|
8 |
+
import numpy as np
|
9 |
+
import pandas as pd
|
10 |
+
from sklearn.metrics import classification_report
|
11 |
+
|
12 |
+
|
13 |
+
# Two-stop linear colour map for the confusion-matrix heatmap. The normaliser
# maps -1 and 1 onto the two ends of the ramp, anchoring "#DAF7A6" at the low
# end and "#673FEE" at the high end.
norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
colors = [
    [norm(-1.0), "#DAF7A6"],
    [norm(1.0), "#673FEE"],
]
custom_cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)
|
17 |
+
|
18 |
+
|
19 |
+
def create_classification_report(y, y_pred):
    """Build a per-class precision/recall/F1 table as a DataFrame.

    Args:
        y: True class ids (0 = negative, 1 = positive).
        y_pred: Predicted class ids, same length as ``y``.

    Returns:
        DataFrame with one row per report entry (classes and averages) and one
        column per metric, rounded to two decimals.
    """
    target_class = ["negative", "positive"]
    cls_report = classification_report(
        y,
        y_pred,
        # Pin the label set so the two target names always line up with ids
        # 0 and 1. Without it, a sample containing a single class makes
        # scikit-learn infer one label and reject the two target names.
        labels=[0, 1],
        target_names=target_class,
        # A class with no true or no predicted samples scores 0, without an
        # undefined-metric warning.
        zero_division=0,
        output_dict=True,
    )
    df_report = pd.DataFrame(cls_report).transpose()
    return df_report.round(2)
|
26 |
+
|
27 |
+
|
28 |
+
def get_100_random_test_review(df_test):
    """Return a random window of up to 100 consecutive rows from ``df_test``.

    The window start is drawn uniformly from every position that still leaves
    room for a full window, so the last rows of the frame can be selected too.
    Frames with 100 rows or fewer are returned whole instead of raising.

    Args:
        df_test: DataFrame of reviews with a ``label`` column.

    Returns:
        A new DataFrame of at most 100 rows in which ``label`` is renamed to
        ``class_id``. The input frame is not modified.
    """
    sample_size = 100

    # Highest valid start index; clamped at 0 for short (or empty) frames so
    # that randint always receives a positive upper bound.
    last_start = max(len(df_test) - sample_size, 0)
    start = np.random.randint(last_start + 1)

    # get dataframe of (up to) 100 reviews
    df_test_100 = df_test.iloc[start : start + sample_size]

    # column rename (returns a copy, leaving df_test untouched)
    return df_test_100.rename(columns={"label": "class_id"})
|
39 |
+
|
40 |
+
|
41 |
+
def inference_from_pytorch(text, tokenizer, model):
    """Classify ``text`` and return ``(predicted_class_id, predicted_label)``.

    The text is tokenized into PyTorch tensors with truncation enabled, passed
    through ``model`` with gradient tracking disabled, and the index of the
    largest logit is looked up in ``model.config.id2label``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True)

    # Forward pass only; no gradients are needed for prediction.
    with torch.no_grad():
        output = model(**encoded)

    class_id = output.logits.argmax().item()
    return class_id, model.config.id2label[class_id]
|
50 |
+
|
51 |
+
|
52 |
+
def plot_confusion_matric(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated heatmap and return the figure.

    Cells are annotated with their values (``annot=True``); ``fmt="g"`` keeps
    the annotations out of scientific notation. Both axes are ticked
    Negative / Positive.
    """
    tick_names = ["Negative", "Positive"]
    axis_font = {"size": 12, "weight": "bold"}

    fig_cm, ax = plt.subplots(figsize=(8, 8))
    sns.heatmap(confusion_matrix, annot=True, fmt="g", cmap=custom_cmap, ax=ax)

    # Axis captions and title.
    ax.set_xlabel("Predicted labels", **axis_font)
    ax.set_ylabel("True labels", **axis_font)
    ax.set_title("Confusion matrix", size=16, weight="bold")

    # Replace the default tick text on both axes with the sentiment names.
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_ticklabels(tick_names)

    return fig_cm
|
71 |
+
|
72 |
+
|
73 |
+
def plot_donut_sentiment_percentage(df):
    """Draw a donut chart of sentiment shares and return the figure.

    Args:
        df: DataFrame with a ``count`` column (wedge sizes) and a ``label``
            column (wedge captions), one row per sentiment.

    Returns:
        The matplotlib figure containing the donut chart.
    """
    custom_colors = ["#673FEE", "#DAF7A6"]
    # One explode offset per wedge: matplotlib requires the explode sequence
    # to match the number of wedges, so size it from the data rather than
    # hard-coding two entries.
    explode_val = [0.05] * len(df)

    fig_pie, ax_pie = plt.subplots()
    ax_pie.pie(
        df["count"],
        labels=df["label"],
        autopct="%1.1f%%",
        pctdistance=0.5,
        explode=explode_val,
        colors=custom_colors,
    )
    ax_pie.set_title("Sentiment analysis", size=12, weight="bold")

    # A white disc over the centre turns the pie into a donut. It is attached
    # to this chart's own axes instead of going through plt.gcf().gca(), so
    # the result does not depend on which figure pyplot considers current.
    ax_pie.add_artist(plt.Circle((0, 0), 0.7, color="white"))
    return fig_pie
|