Spaces:

tanquangduong
/

sentiment-analysis-on-reviews

Runtime error

App Files Files Community

tanquangduong committed on Jun 27, 2023

Commit

b93e9c1

•

1 Parent(s): 55c74d2

:tada: add application files

Browse files

Files changed (7) hide show

.vscode/settings.json +19 -0
app.py +56 -0
figs/AI-driven-Solutions.png +0 -0
figs/sentiment-analysis-streaming.png +0 -0
pages/1_Review_Sentiment_Analysis.py +140 -0
requirements.txt +16 -0
utils.py +92 -0

.vscode/settings.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+    "editor.tabSize": 2,
+    "editor.codeActionsOnSave": {
+        "source.fixAll.eslint": true
+    },
+    "python.linting.pycodestyleEnabled": true,
+    "python.linting.pycodestyleArgs": [
+        "--max-line-length=150"
+    ],
+    "python.linting.pylintEnabled": true,
+    "python.linting.pylintArgs": [
+        "--generated-members=numpy.* ,torch.*"
+        ],
+    "python.formatting.provider": "none",
+    "[python]": {
+        "editor.formatOnSave": true,
+        "editor.defaultFormatter": "ms-python.black-formatter"
+    }
+}

app.py ADDED Viewed

	@@ -0,0 +1,56 @@

"""
@author: Tan Quang Duong

Home page of the review sentiment-analysis Streamlit app.

Loads the tokenizer, the classification model and a shuffled copy of the IMDB
test split once per browser session and stores them in ``st.session_state``
under the keys ``"tokenizer"``, ``"model"`` and ``"df_imdb_test"``.
"""
import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from PIL import Image

# setting logos in the page
app_logo = Image.open("./figs/AI-driven-Solutions.png")

# set page config
st.set_page_config(page_title="Review Sentiment Analysis", page_icon="🚀", layout="wide")
st.sidebar.image(app_logo, use_column_width=True)
st.sidebar.markdown(
    "<h1 style='text-align: center; color: grey;'> Quang Duong </h1>",
    unsafe_allow_html=True,
)

# model name
model_name = "tanquangduong/distilbert-imdb"

# Load tokenizer, model and imdb dataset from hugging face hub and add them to
# st.session_state. Each entry is only loaded when its key is still missing, so
# Streamlit reruns within the same session do not reload it.
if "tokenizer" not in st.session_state:
    st.session_state["tokenizer"] = AutoTokenizer.from_pretrained(model_name)

if "model" not in st.session_state:
    st.session_state["model"] = AutoModelForSequenceClassification.from_pretrained(
        model_name
    )

if "df_imdb_test" not in st.session_state:
    # Only the test split is used; to_pandas() converts the dataset directly
    # instead of iterating over it row by row.
    imdb_test = load_dataset("imdb", split="test")
    # sample(frac=1) shuffles the rows
    st.session_state["df_imdb_test"] = imdb_test.to_pandas().sample(frac=1)

st.write("# Welcome to LLM-based sentiment analysis app!👋")

st.markdown(
    """
    # Objective
    This app leverages LLM to perform **:green[sentiment analysis]** for **:green[user reviews]**. Some potential use-cases are as below, but not limited to:
    - User reviews for drug efficiency on drug/medicine forums
    - User reviews for mobile applications on app stores, e.g. Google Play, App Store
    - User reviews for food quality on food delivery app
    - User reviews for product quality on e-commerce websites
    - etc.
"""
)

figs/AI-driven-Solutions.png ADDED Viewed

figs/sentiment-analysis-streaming.png ADDED Viewed

pages/1_Review_Sentiment_Analysis.py ADDED Viewed

	@@ -0,0 +1,140 @@

"""
@author: Tan Quang Duong

Review sentiment-analysis page.

Offers two input modes: streaming 100 random reviews taken from the test
dataframe stored in ``st.session_state`` (with a donut chart, a confusion
matrix and a classification report for the predictions), or classifying a
single review typed by the user.
"""
import streamlit as st
import hydralit_components as hc
from hydralit_components import HyLoader, Loaders
import pandas as pd
import numpy as np
from sklearn import metrics
from utils import (
    inference_from_pytorch,
    plot_confusion_matric,
    plot_donut_sentiment_percentage,
    create_classification_report,
    get_100_random_test_review,
)
from PIL import Image

# setting logos in the page
app_logo = Image.open("./figs/AI-driven-Solutions.png")

# set page config
st.set_page_config(page_title="Review Sentiment Analysis", page_icon="🚀", layout="wide")
st.sidebar.image(app_logo, use_column_width=True)
st.sidebar.markdown(
    "<h1 style='text-align: center; color: grey;'> Quang Duong </h1>",
    unsafe_allow_html=True,
)

# specify the primary menu definition
menu_data = [{"id": "tab1", "icon": "😊😒", "label": "Review Sentiment Analysis"}]
over_theme = {
    "menu_background": "#7BB657",
    "txc_active": "#000000",
    "txc_inactive": "#FFFFFF",
}
menu_id = hc.nav_bar(
    menu_definition=menu_data,
    override_theme=over_theme,
    hide_streamlit_markers=False,  # will show the st hamburger as well as the navbar now!
    sticky_nav=True,  # at the top or not
    sticky_mode="pinned",  # jumpy or not-jumpy, but sticky or pinned
)

# The tokenizer, model and test dataframe are read from st.session_state.
# When any of them is missing, stop the script here: continuing would fail
# further down with a NameError on the first use of the missing object.
required_keys = ("tokenizer", "model", "df_imdb_test")
if any(key not in st.session_state for key in required_keys):
    st.write(
        "Please come back to Home page for loading tokenizer, model and dataset from Hugging Face hub."
    )
    st.stop()

tokenizer = st.session_state["tokenizer"]
model = st.session_state["model"]
df_test = st.session_state["df_imdb_test"]

# create boolean variable for checking if df_test_100 is loaded
if "is_df_test_100_loaded" not in st.session_state:
    st.session_state["is_df_test_100_loaded"] = False

with HyLoader("", loader_name=Loaders.pulse_bars):
    if menu_id == "tab1":
        input_mode = st.radio(
            "**Select input mode** 👇",
            ("Review streaming", "Add review manually"),
            horizontal=True,
        )

        # mode 1: classify a batch of 100 random reviews
        if input_mode == "Review streaming":
            if st.button("Simulate streaming 100 random reviews"):
                # get 100 random reviews as dataframe df_test_100
                df_test_100 = get_100_random_test_review(df_test)
                st.session_state["df_test_100"] = df_test_100
                # display 100 random reviews
                st.dataframe(df_test_100, use_container_width=True)
                st.session_state["is_df_test_100_loaded"] = True

            if st.session_state["is_df_test_100_loaded"]:
                if st.button("Inference"):
                    df_test_100_loaded = st.session_state["df_test_100"]

                    # Run the model once per review and split the
                    # (class id, label) pairs into the two result columns.
                    predictions = [
                        inference_from_pytorch(review, tokenizer, model)
                        for review in df_test_100_loaded["text"]
                    ]
                    df_test_100_loaded["predicted_class_id"] = [
                        class_id for class_id, _ in predictions
                    ]
                    df_test_100_loaded["predicted_class"] = [
                        label for _, label in predictions
                    ]

                    st.write("Sentiment analysis completed! Here is the result: 👇")
                    # display dataframe
                    st.dataframe(df_test_100_loaded, use_container_width=True)

                    # Label prediction count. Counts are looked up by class id
                    # (1 = positive, 0 = negative, matching the order used for
                    # the classification report) because value_counts() sorts
                    # by frequency and omits classes that never occur.
                    id_counts = df_test_100_loaded["predicted_class_id"].value_counts()
                    pred_labels = {
                        "label": ["positive", "negative"],
                        "count": [
                            int(id_counts.get(1, 0)),
                            int(id_counts.get(0, 0)),
                        ],
                    }
                    df_pred_labels = pd.DataFrame(pred_labels)

                    # calculate confusion matrix; labels=[0, 1] keeps it 2x2
                    # even when one class is absent from this sample
                    confusion_matrix = metrics.confusion_matrix(
                        df_test_100_loaded.class_id,
                        df_test_100_loaded.predicted_class_id,
                        labels=[0, 1],
                    )

                    # get classification report
                    df_report = create_classification_report(
                        df_test_100_loaded.class_id,
                        df_test_100_loaded.predicted_class_id,
                    )

                    col1, col2 = st.columns(2, gap="large")
                    with col1:
                        # plot donut chart for sentiment percentage
                        st.pyplot(plot_donut_sentiment_percentage(df_pred_labels))
                    with col2:
                        # plot confusion matrix
                        st.pyplot(plot_confusion_matric(confusion_matrix))

                    # display classification report
                    st.dataframe(df_report, use_container_width=True)

        # mode 2: classify a single review typed by the user
        elif input_mode == "Add review manually":
            text_input = st.text_area("Type your review here:", height=200)
            if text_input:
                st.write(
                    "Predicted sentiment: **{}**".format(
                        inference_from_pytorch(text_input, tokenizer, model)[1]
                    )
                )

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+pylint
+black
+pycodestyle
+requests
+hydralit_components
+pandas
+numpy
+matplotlib
+seaborn
+torch
+transformers
+datasets
+evaluate
+scikit-learn
+pillow
+streamlit-aggrid

utils.py ADDED Viewed

	@@ -0,0 +1,92 @@

+"""
+@author: Tan Quang Duong
+"""
+import torch
+import matplotlib
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+import pandas as pd
+from sklearn.metrics import classification_report
# Custom two-stop colour map running from light green ("#DAF7A6") to violet
# ("#673FEE"). `norm` rescales the interval [-1, 1] so that the two anchor
# values below become the start and end positions of the colour ramp.
norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
colors = [
    [norm(-1.0), "#DAF7A6"],
    [norm(1.0), "#673FEE"],
]
custom_cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    name="", colors=colors
)
def create_classification_report(y, y_pred):
    """Build a per-class metrics table for binary sentiment predictions.

    Args:
        y: True class ids (0 = negative, 1 = positive).
        y_pred: Predicted class ids, same length as ``y``.

    Returns:
        A ``pandas.DataFrame`` with one row per entry of scikit-learn's
        classification report and the metrics as columns, rounded to two
        decimals.
    """
    target_class = ["negative", "positive"]
    cls_report = classification_report(
        y,
        y_pred,
        # Pin both class ids so the report always has a row for each name in
        # target_names; without this a sample containing a single class
        # makes scikit-learn reject the two target names.
        labels=[0, 1],
        target_names=target_class,
        output_dict=True,
        # A class with no samples gets 0 for its undefined metrics instead of
        # emitting a warning.
        zero_division=0,
    )
    return pd.DataFrame(cls_report).transpose().round(2)
def get_100_random_test_review(df_test, n_reviews=100):
    """Return a random window of consecutive reviews from ``df_test``.

    Args:
        df_test: DataFrame of reviews; a ``label`` column, if present, is
            renamed to ``class_id`` in the result.
        n_reviews: Number of consecutive rows to return (default 100).

    Returns:
        A new DataFrame with ``n_reviews`` consecutive rows starting at a
        random position, or all rows when ``df_test`` has ``n_reviews`` rows
        or fewer. ``df_test`` itself is not modified.

    Raises:
        ValueError: If ``n_reviews`` is smaller than 1.
    """
    if n_reviews < 1:
        raise ValueError("n_reviews must be at least 1")

    n_rows = len(df_test)
    if n_rows <= n_reviews:
        # Not enough rows to pick a window from: use everything available.
        window = df_test
    else:
        # Valid start positions are 0 .. n_rows - n_reviews inclusive, and
        # randint excludes its upper bound.
        start = np.random.randint(n_rows - n_reviews + 1)
        window = df_test.iloc[start : start + n_reviews]

    # rename() returns a copy, so the caller's frame keeps its "label" column
    return window.rename(columns={"label": "class_id"})
def inference_from_pytorch(text, tokenizer, model):
    """Classify one piece of text and return its predicted class.

    Args:
        text: The review text to classify.
        tokenizer: Callable that turns ``text`` into the keyword arguments
            accepted by ``model``; called with ``return_tensors="pt"`` and
            ``truncation=True``.
        model: Callable whose result exposes ``logits``, and which provides
            ``config.id2label`` for mapping a class id to its label.

    Returns:
        A ``(predicted_class_id, predicted_label)`` tuple.
    """
    encoded = tokenizer(text, truncation=True, return_tensors="pt")

    # gradients are not needed for inference
    with torch.no_grad():
        output = model(**encoded)

    # position of the highest logit is the predicted class id
    class_id = output.logits.argmax().item()
    return class_id, model.config.id2label[class_id]
def plot_confusion_matric(confusion_matrix):
    """Draw a confusion matrix as an annotated heatmap.

    Args:
        confusion_matrix: Matrix of counts to plot. Two tick labels
            ("Negative", "Positive") are applied to each axis, so a 2x2
            matrix is presumably expected -- confirm against the caller.

    Returns:
        The matplotlib figure containing the heatmap.
    """
    tick_names = ["Negative", "Positive"]
    axis_label_style = {"size": 12, "weight": "bold"}

    figure, axes = plt.subplots(figsize=(8, 8))

    # annot=True writes the count into each cell; fmt="g" keeps those
    # numbers out of scientific notation
    sns.heatmap(confusion_matrix, annot=True, fmt="g", cmap=custom_cmap, ax=axes)

    # title, axis labels and ticks
    axes.set_title("Confusion matrix", size=16, weight="bold")
    axes.set_xlabel("Predicted labels", **axis_label_style)
    axes.set_ylabel("True labels", **axis_label_style)
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_ticklabels(tick_names)

    return figure
def plot_donut_sentiment_percentage(df):
    """Draw a donut chart of how many reviews fall into each sentiment.

    Args:
        df: DataFrame with a ``label`` column (wedge names) and a ``count``
            column (wedge sizes), one row per wedge.

    Returns:
        The matplotlib figure containing the donut chart.
    """
    custom_colors = ["#673FEE", "#DAF7A6"]

    fig_pie, ax_pie = plt.subplots()
    ax_pie.pie(
        df["count"],
        labels=df["label"],
        autopct="%1.1f%%",
        pctdistance=0.5,
        # one offset per wedge, so the chart works for any number of rows
        explode=[0.05] * len(df),
        colors=custom_colors,
    )
    ax_pie.set_title("Sentiment analysis", size=12, weight="bold")

    # A white circle over the centre turns the pie into a donut. It is added
    # to this function's own axes rather than to pyplot's "current" axes, so
    # the result does not depend on global pyplot state.
    ax_pie.add_artist(plt.Circle((0, 0), 0.7, color="white"))
    return fig_pie