{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c454c018-02b7-4c3d-a21f-411748963a3f",
   "metadata": {},
   "source": [
    "# Workshop: Sentiment Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2eda2e01-dfc4-42a6-9b6a-5cdf39fbce78",
   "metadata": {},
   "source": [
    "<div>\n",
    "<img src=\"https://lh3.googleusercontent.com/pw/ADCreHdzakFbNdHwBE1ZrwOiNCQibViWOir9DF9Dv4fbZEdWpx4mzFOT_RxkUGLTyDW7fQ0OwEyNQwqllupbvm0WiU0RNuFs-kWx1fTIvjiSkPGE5m64PilOIeApxQLwX_rl-JU7uYT-ROxdppIsJimCeos=w406-h451-s-no-gm?authuser=0\" width=\"390\"/> \n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ef9db65-1fda-4fc6-8bb9-bc52bdbb9529",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# !pip install nltk\n",
    "# !pip install transformers "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1a0b8ed9-f240-47b4-aa62-0cf48bdd7868",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## Rule-Based Approaches\n",
    "\n",
    "- **Lexicon-Based Methods**: Use sentiment lexicons or dictionaries that contain words annotated with their sentiment polarity (positive, negative, neutral).\n",
    "- **Pattern Matching**: Identify sentiment based on predefined patterns or rules in the text.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f7f14b4-60ba-4a92-a9d0-a124e62fe03b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "# nltk.download('stopwords')\n",
    "# nltk.download('punkt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a25f60f-f202-49cd-b965-e3ebb1676786",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "print(stopwords.words('english'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7652d6d2-ba4c-4d02-bfe3-313b6e0f24a5",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "text = \"I had a good experience with the product. Highly recommended!\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53fc7d50-59fa-4bec-9ae4-b93a1a3847f1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "tokens = word_tokenize(text.lower())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "faac761f-912e-44f7-b7b0-626baaea6a56",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "print(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f6543a2-76f4-4993-b535-f90e50bada72",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "stop_words = set(stopwords.words('english'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d7f529d-f006-48db-a092-2262f17cb3cd",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Keep only alphanumeric tokens (this drops punctuation) that are not stop words\n",
    "tokens = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4acfb41c-615d-4e8b-92dc-3f73a4188402",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "print(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3cfd1cc-3f30-43de-a469-dec0b3816313",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aed2ad01-27e5-45e3-a55c-63084966a482",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Sample positive and negative words\n",
    "positive_words = {'good', 'awesome', 'excellent', 'happy', 'positive'}\n",
    "negative_words = {'bad', 'terrible', 'poor', 'unhappy', 'negative'}\n",
    "\n",
    "# Built once here instead of on every call to the function below\n",
    "english_stop_words = frozenset(stopwords.words('english'))\n",
    "\n",
    "def rule_based_sentiment_analysis(text):\n",
    "    \"\"\"Classify `text` as 'Positive', 'Negative' or 'Neutral' using a word lexicon.\n",
    "\n",
    "    The text is lower-cased and tokenized, non-alphanumeric tokens and English\n",
    "    stop words are dropped, and every remaining token found in `positive_words`\n",
    "    counts +1 while every token found in `negative_words` counts -1.\n",
    "\n",
    "    Args:\n",
    "        text: The raw string to analyze.\n",
    "\n",
    "    Returns:\n",
    "        'Positive' if the summed score is above zero, 'Negative' if it is below\n",
    "        zero, otherwise 'Neutral' (this includes text with no lexicon hits).\n",
    "\n",
    "    Note:\n",
    "        Word order and negation are ignored, so \"not good\" still scores as\n",
    "        Positive.\n",
    "    \"\"\"\n",
    "    # Tokenize, then keep alphanumeric tokens that are not stop words\n",
    "    tokens = [\n",
    "        word for word in word_tokenize(text.lower())\n",
    "        if word.isalnum() and word not in english_stop_words\n",
    "    ]\n",
    "\n",
    "    # +1 per positive hit, -1 per negative hit, in a single pass over the tokens\n",
    "    sentiment_score = sum((word in positive_words) - (word in negative_words) for word in tokens)\n",
    "\n",
    "    # Classify sentiment\n",
    "    if sentiment_score > 0:\n",
    "        return 'Positive'\n",
    "    if sentiment_score < 0:\n",
    "        return 'Negative'\n",
    "    return 'Neutral'\n",
    "\n",
    "# Example usage\n",
    "text_to_analyze = \"I had a good experience with the product. Highly recommended!\"\n",
    "sentiment_result = rule_based_sentiment_analysis(text_to_analyze)\n",
    "print(f\"Sentiment: {sentiment_result}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21764069-0b07-4b3e-8103-b2ab464a9182",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Machine Learning Approaches"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dc739c8a-a453-43d1-bdc5-ad10d823d748",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "### Import packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e030b97-e111-45ea-b00f-09a360f3400e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.utils import shuffle\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "# from sklearn.svm import SVC\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.metrics import classification_report, confusion_matrix\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "54c4fe66-f52f-487f-bfd5-0ea6e05206ce",
   "metadata": {
    "tags": []
   },
   "source": [
    "### TF-IDF vectorizer"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f5b7e92-5de4-4894-b2be-47dac1cf2482",
   "metadata": {},
   "source": [
    "\n",
    "<div>\n",
    "<img src=\"https://www.kdnuggets.com/wp-content/uploads/awan_convert_text_documents_tfidf_matrix_tfidfvectorizer_3.png\" width=\"590\"/> \n",
    "</div>\n",
    "\n",
    "\n",
    "Image sources: https://www.kdnuggets.com/2022/09/convert-text-documents-tfidf-matrix-tfidfvectorizer.html\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9bd125fc-11fd-414a-b8f0-ff7ef628fb94",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "##### Example on a small dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a61fdce-6544-4774-bc29-265bf4afaa90",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "# Sample data\n",
    "documents = [\n",
    "    \"This is the first document.\",\n",
    "    \"This document is the second document.\",\n",
    "    \"And this is the third one.\",\n",
    "    \"Is this the first document?\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5794027b-2bee-46d9-9b4d-9cbaa7c4120f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Wrap the sample documents in a single-column DataFrame so they display as a table\n",
    "df = pd.DataFrame(documents, columns=['Text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b49d5272-0383-4e39-910b-87276c4ffca2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Fit the vectorizer on the documents and encode each one as a TF-IDF row\n",
    "vectorizer = TfidfVectorizer()\n",
    "tfidf_matrix = vectorizer.fit_transform(df['Text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46c0b47d-80ab-498b-91a2-7202f1c429fd",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Convert the TF-IDF matrix to a DataFrame\n",
    "tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91c2bee0-5bb6-44b9-a609-1f3d0e891ad4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Show the input documents under a heading line\n",
    "print(\"Original Data:\", df, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24c4a522-8ef4-4001-ada6-031a043b9a54",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "print(tfidf_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6feb5892-284f-43d1-ab7b-5b13dbfadd0b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Show the dense TF-IDF table under a heading line\n",
    "print(\"\\nTF-IDF Matrix:\", tfidf_df, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6802c239-edfa-462e-99ea-31386fd7aed4",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Naive Bayes classifier trained on the TF-IDF features."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3accf6f8-6cae-4265-8d5d-fb5d40a07a2d",
   "metadata": {},
   "source": [
    "<div>\n",
    "<img src=\"fig_bayes-nw.png\" width=\"800\"/> \n",
    "</div>\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9062063a-557b-4971-ad84-e3601b1a520e",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "### Read data/Preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d2eab09-03c7-441e-9c78-0c2e069f4d25",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# df = pd.read_csv(\"Womens_Clothing_E_Commerce_Reviews.csv\")\n",
    "df = pd.read_csv(\"imdb_reviews.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aca597f3-c8da-4314-990e-253d5ed719da",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d8131e4-4a69-45af-aa12-335c926e308f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43a27caf-779b-4bd1-a3cf-fa641021172e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "df['label'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c72dd5ec-59b2-4c7f-a8fb-fdade866984d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Class balance: number of rows per label (the cell above already lists the distinct labels)\n",
    "df['label'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba556f9b-da1c-4d13-8d70-563e0bd528a1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "df.isna().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "819c31c3-873d-4d31-a21a-759059bd4c6d",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "### Split the dataset into training and testing sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ca318a2-26d7-446e-8324-6660171f239d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable\n",
    "train_data, test_data, train_labels, test_labels = train_test_split(\n",
    "    df['text'],\n",
    "    df['label'],\n",
    "    test_size=0.3,\n",
    "    random_state=42,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0cfc8fc-49e5-4c88-bb33-8084dcf00100",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "print(train_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51d0a415-4982-43dd-8864-c189ba6826f4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "print(train_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42987cdb-4cdf-46df-95d8-7c2b2824c1ee",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "### Create a pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06ffd548-c333-4c1a-87ce-9699ddd116ee",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Chain TF-IDF vectorization and the Naive Bayes classifier into one estimator\n",
    "sentiment_pipeline = Pipeline(\n",
    "    steps=[('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6bafa7cd-8d0b-4725-bd40-4a3b04634fab",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "### Train the model using the pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "712dea09-52c2-4a9f-8bf9-3cbb273fe4b5",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "sentiment_pipeline.fit(train_data, train_labels)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c95c599-ae0d-433f-9ed5-856fd9fa35e0",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "### Make predictions on the test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37ae9eda-4a02-4f40-bdeb-ecb8ea67f9d3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "predictions = sentiment_pipeline.predict(test_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a33458e2-90cb-4c94-b977-8cc8ea5a273e",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "### Evaluate the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ad90567-93d2-4090-81be-5c77f41e379a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Text summary of the classification metrics for the test-set predictions\n",
    "report = classification_report(y_true=test_labels, y_pred=predictions)\n",
    "print(\"Classification Report:\\n\", report)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef002e29-d065-4825-a076-3d23fdfa7b59",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Confusion matrix of true vs. predicted labels on the test set\n",
    "cm = confusion_matrix(y_true=test_labels, y_pred=predictions)\n",
    "cm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6e7729bb-a833-4feb-bd2a-b04a2741bd70",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## Hugging Face: Pre-trained sentiment analysis model\n",
    "\n",
    "https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9afad444-c2cc-4f3d-b49d-07a723be6154",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "# English sentiment classifier loaded by model name.\n",
    "# To pin a model snapshot, additionally pass revision=\"af0f99b\".\n",
    "sentiment_analyzer = pipeline(\n",
    "    'sentiment-analysis',\n",
    "    model=\"distilbert-base-uncased-finetuned-sst-2-english\",\n",
    ")\n",
    "\n",
    "# Score a batch of sentences in one call\n",
    "data = [\"I love you\", \"I hate you\"]\n",
    "sentiment_analyzer(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4987efd9-8ca8-40b1-90cc-ff361207fb8f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "result = sentiment_analyzer(\"I love using this model!\")\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "68436dda-e3c3-499d-b390-60443f9a1796",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## Hugging Face: Thai"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72a9f8e0-12bf-403b-8b78-e381a65e9eaa",
   "metadata": {},
   "source": [
    "### model=\"poom-sci/WangchanBERTa-finetuned-sentiment\"\n",
    "\n",
    "https://huggingface.co/poom-sci/WangchanBERTa-finetuned-sentiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d698825b-3bd7-4370-871f-ac6e5fe5fe47",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from transformers import pipeline\n",
    "\n",
    "# Thai sentiment classifier; this rebinds `sentiment_analyzer` to the Thai model.\n",
    "# To pin a model snapshot, additionally pass revision=\"b78d071\".\n",
    "sentiment_analyzer = pipeline(\n",
    "    'sentiment-analysis',\n",
    "    model=\"poom-sci/WangchanBERTa-finetuned-sentiment\",\n",
    ")\n",
    "\n",
    "# Score a batch of Thai sentences in one call\n",
    "data = [\"อร่อยจัดๆ\", \"รอนานแท้\"]\n",
    "sentiment_analyzer(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87d815d4-135c-471e-93ee-cacc93653d4e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "sentiment_analyzer(\"ข้าวบูด\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60f5c43a-6cb7-47f1-85c5-751e91599ad9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "f894a4bd-1f04-4126-aa8d-e0211b41687e",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## Deploy on Streamlit Community Cloud (formerly Streamlit Sharing)\n",
    "\n",
    "Hosting options: https://share.streamlit.io/  or https://huggingface.co/spaces\n",
    "\n",
    "Streamlit API reference: https://docs.streamlit.io/library/api-reference\n",
    "\n",
    "Code hosting for the app files: https://github.com/\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfd5baee-dc74-4f6d-84be-52a2b89d0f28",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%%writefile app_senti.py\n",
    "import streamlit as st\n",
    "from transformers import pipeline\n",
    "\n",
    "model_name = \"poom-sci/WangchanBERTa-finetuned-sentiment\"\n",
    "\n",
    "\n",
    "@st.cache_resource\n",
    "def load_sentiment_analyzer(name):\n",
    "    \"\"\"Load the Hugging Face sentiment-analysis pipeline for model `name`.\n",
    "\n",
    "    Streamlit re-executes this script on every widget interaction;\n",
    "    `st.cache_resource` keeps one loaded pipeline and reuses it instead of\n",
    "    rebuilding the model on each rerun.\n",
    "    \"\"\"\n",
    "    return pipeline('sentiment-analysis', model=name)\n",
    "\n",
    "\n",
    "# Load the sentiment analysis model (cached across reruns)\n",
    "sentiment_analyzer = load_sentiment_analyzer(model_name)\n",
    "\n",
    "# Streamlit app\n",
    "st.title(\"Thai Sentiment Analysis App\")\n",
    "\n",
    "# Input text\n",
    "text_input = st.text_area(\"Enter Thai text for sentiment analysis\", \"ขอความเห็นหน่อย... \")\n",
    "\n",
    "# Button to trigger analysis\n",
    "if st.button(\"Analyze Sentiment\"):\n",
    "    if not text_input.strip():\n",
    "        # Nothing to classify: do not report a score for blank input\n",
    "        st.warning(\"Please enter some text to analyze.\")\n",
    "    else:\n",
    "        # Analyze sentiment using the model\n",
    "        results = sentiment_analyzer([text_input])\n",
    "\n",
    "        # Extract sentiment and score\n",
    "        sentiment = results[0]['label']\n",
    "        score = results[0]['score']\n",
    "\n",
    "        st.subheader(\"Sentiment Analysis Result:\")\n",
    "\n",
    "        # Any label other than 'pos' / 'neg' is reported as neutral\n",
    "        if sentiment == 'pos':\n",
    "            st.success(f\"Positive Sentiment (Score: {score:.2f})\")\n",
    "        elif sentiment == 'neg':\n",
    "            st.error(f\"Negative Sentiment (Score: {score:.2f})\")\n",
    "        else:\n",
    "            st.warning(f\"Neutral Sentiment (Score: {score:.2f})\")\n",
    "\n",
    "        # Show the model's score for the predicted label as a progress bar\n",
    "        st.progress(score)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70111967-b904-4f18-a8d0-0c8701ec35ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile requirements.txt\n",
    "# streamlit is imported by app_senti.py\n",
    "streamlit\n",
    "transformers\n",
    "torch\n",
    "# NOTE(review): assumed to be required by the WangchanBERTa tokenizer - confirm against the model card\n",
    "sentencepiece\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88001002-587d-403d-ab65-d060bde9d42d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}