tanquangduong committed on
Commit
b93e9c1
1 Parent(s): 55c74d2

:tada: add application files

Browse files
.vscode/settings.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "editor.tabSize": 2,
3
+ "editor.codeActionsOnSave": {
4
+ "source.fixAll.eslint": true
5
+ },
6
+ "python.linting.pycodestyleEnabled": true,
7
+ "python.linting.pycodestyleArgs": [
8
+ "--max-line-length=150"
9
+ ],
10
+ "python.linting.pylintEnabled": true,
11
+ "python.linting.pylintArgs": [
12
+ "--generated-members=numpy.* ,torch.*"
13
+ ],
14
+ "python.formatting.provider": "none",
15
+ "[python]": {
16
+ "editor.formatOnSave": true,
17
+ "editor.defaultFormatter": "ms-python.black-formatter"
18
+ }
19
+ }
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ @author: Tan Quang Duong
3
+ """
4
+
5
+
6
+ import streamlit as st
7
+ import pandas as pd
8
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
9
+ from datasets import load_dataset
10
+ from PIL import Image
11
+
12
+
13
# Home page of the app: draws the sidebar branding, loads the tokenizer,
# model and IMDB test set from the Hugging Face hub into st.session_state
# (where the other pages read them), and renders the welcome text.

# setting logos in the page
app_logo = Image.open("./figs/AI-driven-Solutions.png")

# set page config
st.set_page_config(page_title="Review Sentiment Analysis", page_icon="🚀", layout="wide")
st.sidebar.image(app_logo, use_column_width=True)
st.sidebar.markdown(
    "<h1 style='text-align: center; color: grey;'> Quang Duong </h1>",
    unsafe_allow_html=True,
)

# model name
model_name = "tanquangduong/distilbert-imdb"

# Load tokenizer, model and imdb dataset from hugging face hub and add them to
# st.session_state. The "not in" guards keep reruns of this script within the
# same session from loading them again.
if "tokenizer" not in st.session_state:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    st.session_state["tokenizer"] = tokenizer

if "model" not in st.session_state:
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    st.session_state["model"] = model

if "df_imdb_test" not in st.session_state:
    # Only the test split is used by the app, so request just that split and
    # convert it in one call instead of building a DataFrame row by row.
    df_test = load_dataset("imdb", split="test").to_pandas()
    # shuffle the rows so consecutive slices contain a mix of reviews
    df_test = df_test.sample(frac=1)
    st.session_state["df_imdb_test"] = df_test

st.write("# Welcome to LLM-based sentiment analysis app!👋")

# NOTE: the text lines below start at column 0 on purpose; Markdown would
# render indented lines as a code block.
st.markdown(
    """
# Objective
This app leverages an LLM to perform **:green[sentiment analysis]** for **:green[user reviews]**. Some potential use-cases are as below, but not limited to:
- User reviews for drug efficiency on drug/medicine forums
- User reviews for mobile applications on app stores, e.g. Google Play, App Store
- User reviews for food quality on food delivery apps
- User reviews for product quality on e-commerce websites
- etc.
"""
)
figs/AI-driven-Solutions.png ADDED
figs/sentiment-analysis-streaming.png ADDED
pages/1_Review_Sentiment_Analysis.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ @author: Tan Quang Duong
3
+ """
4
+
5
+
6
+ import streamlit as st
7
+ import hydralit_components as hc
8
+ from hydralit_components import HyLoader, Loaders
9
+ import pandas as pd
10
+ import numpy as np
11
+ from sklearn import metrics
12
+ from utils import inference_from_pytorch, plot_confusion_matric, plot_donut_sentiment_percentage, create_classification_report, get_100_random_test_review
13
+ from PIL import Image
14
+
15
+
16
# "Review Sentiment Analysis" page: lets the user either sample 100 test
# reviews and classify them in bulk (with a donut chart, confusion matrix and
# classification report), or type a single review and see its predicted label.

# setting logos in the page
app_logo = Image.open("./figs/AI-driven-Solutions.png")

# set page config
st.set_page_config(page_title="Review Sentiment Analysis", page_icon="🚀", layout="wide")
st.sidebar.image(app_logo, use_column_width=True)
st.sidebar.markdown(
    "<h1 style='text-align: center; color: grey;'> Quang Duong </h1>",
    unsafe_allow_html=True,
)

# specify the primary menu definition
menu_data = [{"id": "tab1", "icon": "😊😒", "label": "Review Sentiment Analysis"}]

over_theme = {
    "menu_background": "#7BB657",
    "txc_active": "#000000",
    "txc_inactive": "#FFFFFF",
}
menu_id = hc.nav_bar(
    menu_definition=menu_data,
    override_theme=over_theme,
    hide_streamlit_markers=False,  # will show the st hamburger as well as the navbar now!
    sticky_nav=True,  # at the top or not
    sticky_mode="pinned",  # jumpy or not-jumpy, but sticky or pinned
)

# The tokenizer, model and test set are put into st.session_state by the Home
# page. Without all three nothing below can run, so show the hint and halt the
# script here instead of failing later with a NameError.
required_state_keys = ("tokenizer", "model", "df_imdb_test")
if any(key not in st.session_state for key in required_state_keys):
    st.write(
        "Please come back to Home page for loading tokenizer, model and dataset from Hugging Face hub."
    )
    st.stop()

tokenizer = st.session_state["tokenizer"]
model = st.session_state["model"]
df_test = st.session_state["df_imdb_test"]

# create boolean variable for checking if df_test_100 is loaded
if "is_df_test_100_loaded" not in st.session_state:
    st.session_state["is_df_test_100_loaded"] = False


with HyLoader("", loader_name=Loaders.pulse_bars):
    if menu_id == "tab1":
        input_mode = st.radio(
            "**Select input mode** 👇",
            ("Review streaming", "Add review manually"),
            horizontal=True,
        )

        # bulk mode: classify a random sample of test reviews
        if input_mode == "Review streaming":
            if st.button("Simulate streaming 100 random reviews"):
                # get 100 random reviews as dataframe df_test_100
                df_test_100 = get_100_random_test_review(df_test)
                st.session_state["df_test_100"] = df_test_100

                # display 100 random reviews
                st.dataframe(df_test_100, use_container_width=True)
                st.session_state["is_df_test_100_loaded"] = True

            if st.session_state["is_df_test_100_loaded"]:
                if st.button("Inference"):
                    df_test_100_loaded = st.session_state["df_test_100"]

                    # Run the model once per review and split the
                    # (class id, label) pairs into the two result columns.
                    predictions = [
                        inference_from_pytorch(review, tokenizer, model)
                        for review in df_test_100_loaded["text"]
                    ]
                    df_test_100_loaded["predicted_class_id"] = [
                        class_id for class_id, _ in predictions
                    ]
                    df_test_100_loaded["predicted_class"] = [
                        label for _, label in predictions
                    ]

                    st.write("Sentiment analysis completed! Here is the result: 👇")
                    # display dataframe
                    st.dataframe(df_test_100_loaded, use_container_width=True)

                    # Label prediction count. value_counts() orders by
                    # frequency and drops absent classes, so look the counts
                    # up by class id (1 = positive, 0 = negative) to keep them
                    # aligned with the labels.
                    predicted_counts = (
                        df_test_100_loaded["predicted_class_id"]
                        .value_counts()
                        .reindex([1, 0], fill_value=0)
                    )
                    df_pred_labels = pd.DataFrame(
                        {
                            "label": ["positive", "negative"],
                            "count": predicted_counts.tolist(),
                        }
                    )

                    # Calculate confusion matrix; fixing the labels keeps it
                    # 2x2 even when one class is missing from the sample.
                    confusion_matrix = metrics.confusion_matrix(
                        df_test_100_loaded.class_id,
                        df_test_100_loaded.predicted_class_id,
                        labels=[0, 1],
                    )

                    # get classification report
                    df_report = create_classification_report(
                        df_test_100_loaded.class_id,
                        df_test_100_loaded.predicted_class_id,
                    )

                    col1, col2 = st.columns(2, gap="large")
                    with col1:
                        # plot donut chart for sentiment percentage
                        st.pyplot(plot_donut_sentiment_percentage(df_pred_labels))
                    with col2:
                        # plot confusion matrix
                        st.pyplot(plot_confusion_matric(confusion_matrix))

                    # display classification report
                    st.dataframe(df_report, use_container_width=True)

        # manual mode: classify a single review typed by the user
        elif input_mode == "Add review manually":
            text_input = st.text_area("Type your review here:", height=200)
            if text_input:
                predicted_label = inference_from_pytorch(text_input, tokenizer, model)[1]
                st.write("Predicted sentiment: **{}**".format(predicted_label))
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pylint
2
+ black
3
+ pycodestyle
4
+ requests
5
+ hydralit_components
6
+ pandas
7
+ numpy
8
+ matplotlib
9
+ seaborn
10
+ torch
11
+ transformers
12
+ datasets
13
+ evaluate
14
+ scikit-learn
15
+ pillow
16
+ streamlit-aggrid
utils.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ @author: Tan Quang Duong
3
+ """
4
+ import torch
5
+ import matplotlib
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ import numpy as np
9
+ import pandas as pd
10
+ from sklearn.metrics import classification_report
11
+
12
+
13
# Custom colour map for the heatmap: a linear gradient whose two anchor
# colours are positioned by normalising the values -1 and 1 with `norm`.
norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
colors = [
    [norm(anchor), hex_code]
    for anchor, hex_code in ((-1.0, "#DAF7A6"), (1.0, "#673FEE"))
]
custom_cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)
17
+
18
+
19
def create_classification_report(y, y_pred):
    """Build a per-class metrics table for the binary sentiment task.

    Args:
        y: True class ids (0 = negative, 1 = positive).
        y_pred: Predicted class ids, same length as ``y``.

    Returns:
        A DataFrame holding scikit-learn's classification report, transposed
        so each report entry is a row, with values rounded to 2 decimals.
    """
    class_ids = [0, 1]
    target_class = ["negative", "positive"]
    cls_report = classification_report(
        y,
        y_pred,
        # Pin the label set: without it a sample containing a single class
        # no longer matches the two target names and the call raises.
        labels=class_ids,
        target_names=target_class,
        output_dict=True,
        # A class with no samples/predictions scores 0 without a warning.
        zero_division=0,
    )
    df_report = pd.DataFrame(cls_report).transpose()
    return df_report.round(2)
26
+
27
+
28
def get_100_random_test_review(df_test, n_reviews=100):
    """Return a random run of consecutive reviews from ``df_test``.

    Args:
        df_test: DataFrame of reviews with a "label" column.
        n_reviews: Number of consecutive rows to return (default 100).

    Returns:
        A new DataFrame of up to ``n_reviews`` rows with the "label" column
        renamed to "class_id". When ``df_test`` has fewer rows than
        ``n_reviews`` all of its rows are returned. ``df_test`` itself is
        not modified.
    """
    # Valid window starts are 0 .. len - n_reviews inclusive. Clamp at 0 so a
    # frame shorter than the window yields the whole frame instead of making
    # randint raise on a non-positive upper bound.
    last_start = max(len(df_test) - n_reviews, 0)
    n_random = np.random.randint(last_start + 1)

    # get dataframe of the selected reviews
    df_test_100 = df_test.iloc[n_random : n_random + n_reviews]

    # column rename
    df_test_100 = df_test_100.rename(columns={"label": "class_id"})

    return df_test_100
39
+
40
+
41
def inference_from_pytorch(text, tokenizer, model):
    """Classify ``text`` with ``model`` and return ``(class_id, label)``.

    The text is tokenized (with truncation) into PyTorch tensors, passed
    through the model with gradient tracking disabled, and the index of the
    largest logit is looked up in ``model.config.id2label``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True)

    # forward pass only; no gradients are needed for prediction
    with torch.no_grad():
        output = model(**encoded)

    class_id = output.logits.argmax().item()
    return class_id, model.config.id2label[class_id]
50
+
51
+
52
def plot_confusion_matric(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated heatmap and return the figure.

    Both axes are labelled "Negative"/"Positive"; the x axis holds the
    predicted labels and the y axis the true labels.
    """
    tick_names = ["Negative", "Positive"]
    figure, axes = plt.subplots(figsize=(8, 8))

    # annot=True writes the value into each cell; fmt="g" disables
    # scientific notation for those values
    sns.heatmap(confusion_matrix, ax=axes, cmap=custom_cmap, annot=True, fmt="g")

    # title, axis labels and ticks
    axes.set_title("Confusion matrix", size=16, weight="bold")
    axes.set_xlabel("Predicted labels", size=12, weight="bold")
    axes.set_ylabel("True labels", size=12, weight="bold")
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_ticklabels(tick_names)

    return figure
71
+
72
+
73
def plot_donut_sentiment_percentage(df):
    """Draw a donut chart of sentiment shares and return the figure.

    Args:
        df: DataFrame with a "label" column (wedge names) and a "count"
            column (wedge sizes).

    Returns:
        The matplotlib figure containing the donut chart.
    """
    custom_colors = ["#673FEE", "#DAF7A6"]
    # one small offset per wedge, so the tuple always matches the row count
    explode_val = (0.05,) * len(df)

    fig_pie, ax_pie = plt.subplots()
    ax_pie.pie(
        df["count"],
        labels=df["label"],
        autopct="%1.1f%%",
        pctdistance=0.5,
        explode=explode_val,
        colors=custom_colors,
    )
    ax_pie.set_title("Sentiment analysis", size=12, weight="bold")

    # Turn the pie into a donut by covering its centre with a white circle.
    # Add it to the axes created above rather than to pyplot's "current"
    # axes, which is global state and may not be this figure.
    my_circle = plt.Circle((0, 0), 0.7, color="white")
    ax_pie.add_artist(my_circle)
    return fig_pie
92
+ return fig_pie