{ "cells": [ { "cell_type": "markdown", "id": "c454c018-02b7-4c3d-a21f-411748963a3f", "metadata": {}, "source": [ "# Workshop: Sentiment Analysis\n", "\n", "Three ways to classify the sentiment of a text: a rule-based word lexicon, a TF-IDF + Naive Bayes classifier, and pre-trained Hugging Face models (English and Thai), followed by a small Streamlit app for deployment." ] }, { "cell_type": "markdown", "id": "2eda2e01-dfc4-42a6-9b6a-5cdf39fbce78", "metadata": {}, "source": [ "
\n", " \n", "
" ] },
{ "cell_type": "code", "execution_count": null, "id": "7ef9db65-1fda-4fc6-8bb9-bc52bdbb9529", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Run once if the packages are missing (%pip installs into the active kernel's environment)\n", "# %pip install nltk scikit-learn pandas\n", "# %pip install transformers torch" ] },
{ "cell_type": "markdown", "id": "1a0b8ed9-f240-47b4-aa62-0cf48bdd7868", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "## Rule-Based Approaches\n", "\n", "- **Lexicon-Based Methods**: Use sentiment lexicons or dictionaries that contain words annotated with their sentiment polarity (positive, negative, neutral).\n", "- **Pattern Matching**: Identify sentiment based on predefined patterns or rules in the text.\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "9f7f14b4-60ba-4a92-a9d0-a124e62fe03b", "metadata": { "tags": [] }, "outputs": [], "source": [ "import nltk\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords\n", "\n", "# Download the NLTK resources used below only when they are missing, so the notebook\n", "# also runs on a fresh environment. word_tokenize looks up 'punkt_tab' on recent NLTK\n", "# releases and 'punkt' on older ones, so both are checked.\n", "NLTK_RESOURCES = {\n", "    'stopwords': 'corpora/stopwords',\n", "    'punkt': 'tokenizers/punkt',\n", "    'punkt_tab': 'tokenizers/punkt_tab',\n", "}\n", "for resource_id, resource_path in NLTK_RESOURCES.items():\n", "    try:\n", "        nltk.data.find(resource_path)\n", "    except LookupError:\n", "        nltk.download(resource_id, quiet=True)" ] },
{ "cell_type": "code", "execution_count": null, "id": "8a25f60f-f202-49cd-b965-e3ebb1676786", "metadata": { "tags": [] }, "outputs": [], "source": [ "# The English stop-word list that is filtered out of the tokens further down\n", "print(stopwords.words('english'))" ] },
{ "cell_type": "code", "execution_count": null, "id": "7652d6d2-ba4c-4d02-bfe3-313b6e0f24a5", "metadata": { "tags": [] }, "outputs": [], "source": [ "text = \"I had a good experience with the product. 
Highly recommended!\"" ] },
{ "cell_type": "code", "execution_count": null, "id": "53fc7d50-59fa-4bec-9ae4-b93a1a3847f1", "metadata": { "tags": [] }, "outputs": [], "source": [ "tokens = word_tokenize(text.lower())" ] },
{ "cell_type": "code", "execution_count": null, "id": "faac761f-912e-44f7-b7b0-626baaea6a56", "metadata": { "tags": [] }, "outputs": [], "source": [ "print(tokens)" ] },
{ "cell_type": "code", "execution_count": null, "id": "9f6543a2-76f4-4993-b535-f90e50bada72", "metadata": { "tags": [] }, "outputs": [], "source": [ "stop_words = set(stopwords.words('english'))" ] },
{ "cell_type": "code", "execution_count": null, "id": "4d7f529d-f006-48db-a092-2262f17cb3cd", "metadata": { "tags": [] }, "outputs": [], "source": [ "tokens = [word for word in tokens if word.isalnum() and word not in stop_words] #alnum = alphanumeric" ] },
{ "cell_type": "code", "execution_count": null, "id": "4acfb41c-615d-4e8b-92dc-3f73a4188402", "metadata": { "tags": [] }, "outputs": [], "source": [ "print(tokens)" ] },
{ "cell_type": "code", "execution_count": null, "id": "c3cfd1cc-3f30-43de-a469-dec0b3816313", "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "id": "aed2ad01-27e5-45e3-a55c-63084966a482", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Small illustrative sentiment lexicon\n", "positive_words = set(['good', 'awesome', 'excellent', 'happy', 'positive'])\n", "negative_words = set(['bad', 'terrible', 'poor', 'unhappy', 'negative'])\n", "\n", "# Built once here so the stop-word corpus is not re-read on every function call\n", "ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))\n", "\n", "def rule_based_sentiment_analysis(text):\n", "    \"\"\"Label `text` as 'Positive', 'Negative' or 'Neutral' by counting lexicon words.\n", "\n", "    The text is lower-cased and tokenized with word_tokenize; tokens that are not\n", "    alphanumeric or that are English stop words are discarded. Each remaining token\n", "    found in `positive_words` adds 1 to the score and each token found in\n", "    `negative_words` subtracts 1.\n", "\n", "    Returns 'Positive' if the score is above 0, 'Negative' if it is below 0 and\n", "    'Neutral' otherwise (for example when no lexicon word occurs).\n", "\n", "    Limitation: only single words are scored, so negation ('not good') is not handled.\n", "    \"\"\"\n", "    tokens = word_tokenize(text.lower())\n", "    tokens = [word for word in tokens if word.isalnum() and word not in ENGLISH_STOP_WORDS]\n", "\n", "    # One pass over the tokens: +1 for a positive word, -1 for a negative word\n", "    sentiment_score = sum((word in positive_words) - (word in negative_words) for word in tokens)\n", "\n", "    if sentiment_score > 0:\n", "        return 'Positive'\n", "    if sentiment_score < 0:\n", "        return 'Negative'\n", "    return 'Neutral'\n", "\n", "# Example usage\n", "text_to_analyze = \"I had a good experience with the product. Highly recommended!\"\n", "sentiment_result = rule_based_sentiment_analysis(text_to_analyze)\n", "print(f\"Sentiment: {sentiment_result}\")" ] },
{ "cell_type": "markdown", "id": "21764069-0b07-4b3e-8103-b2ab464a9182", "metadata": { "tags": [] }, "source": [ "## Machine Learning Approaches" ] },
{ "cell_type": "markdown", "id": "dc739c8a-a453-43d1-bdc5-ad10d823d748", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "### Import packages" ] },
{ "cell_type": "code", "execution_count": null, "id": "7e030b97-e111-45ea-b00f-09a360f3400e", "metadata": { "tags": [] }, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.utils import shuffle\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "# from sklearn.svm import SVC\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "\n" ] },
{ "cell_type": "markdown", "id": "54c4fe66-f52f-487f-bfd5-0ea6e05206ce", "metadata": { "tags": [] }, "source": [ "### TF-IDF vectorizer" ] },
{ "cell_type": "markdown", "id": "3f5b7e92-5de4-4894-b2be-47dac1cf2482", "metadata": {}, "source": [ "\n", "
\n", " \n", "
\n", "\n", "\n", "Image source: https://www.kdnuggets.com/2022/09/convert-text-documents-tfidf-matrix-tfidfvectorizer.html\n", "\n", "\n", "\n", "\n" ] },
{ "cell_type": "markdown", "id": "9bd125fc-11fd-414a-b8f0-ff7ef628fb94", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "##### Example on small data" ] },
{ "cell_type": "code", "execution_count": null, "id": "8a61fdce-6544-4774-bc29-265bf4afaa90", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Sample data\n", "documents = [\n", "    \"This is the first document.\",\n", "    \"This document is the second document.\",\n", "    \"And this is the third one.\",\n", "    \"Is this the first document?\"\n", "]" ] },
{ "cell_type": "code", "execution_count": null, "id": "5794027b-2bee-46d9-9b4d-9cbaa7c4120f", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Keep the demo sentences in their own frame; the name `df` is used for the\n", "# review dataset loaded later, and sharing it makes re-running cells error-prone\n", "docs_df = pd.DataFrame({'Text': documents})" ] },
{ "cell_type": "code", "execution_count": null, "id": "b49d5272-0383-4e39-910b-87276c4ffca2", "metadata": { "tags": [] }, "outputs": [], "source": [ "# TF-IDF vectorization: learn the vocabulary and weight every sentence in one step\n", "vectorizer = TfidfVectorizer()\n", "tfidf_matrix = vectorizer.fit_transform(docs_df['Text'])" ] },
{ "cell_type": "code", "execution_count": null, "id": "46c0b47d-80ab-498b-91a2-7202f1c429fd", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Convert the sparse TF-IDF matrix to a DataFrame with one column per vocabulary term\n", "tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())" ] },
{ "cell_type": "code", "execution_count": null, "id": "91c2bee0-5bb6-44b9-a609-1f3d0e891ad4", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Show the original sentences\n", "print(\"Original Data:\")\n", "docs_df" ] },
{ "cell_type": "code", "execution_count": null, "id": "24c4a522-8ef4-4001-ada6-031a043b9a54", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Printing the sparse matrix lists only its stored (non-zero) entries\n", "print(tfidf_matrix)" ] },
{ "cell_type": "code", "execution_count": null, "id": "6feb5892-284f-43d1-ab7b-5b13dbfadd0b", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Dense view: one row per sentence, one column per vocabulary term\n", "print(\"\\nTF-IDF Matrix:\")\n", "tfidf_df.round(3)" ] },
{ "cell_type": "markdown", "id": "6802c239-edfa-462e-99ea-31386fd7aed4", "metadata": { "tags": [] }, "source": [ "## Naive Bayes classifier trained on TF-IDF features" ] },
{ "cell_type": "markdown", "id": "3accf6f8-6cae-4265-8d5d-fb5d40a07a2d", "metadata": {}, "source": [ "
\n", " \n", "
\n" ] },
{ "cell_type": "markdown", "id": "9062063a-557b-4971-ad84-e3601b1a520e", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "### Read data/Preparation" ] },
{ "cell_type": "code", "execution_count": null, "id": "8d2eab09-03c7-441e-9c78-0c2e069f4d25", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Alternative dataset: df = pd.read_csv(\"Womens_Clothing_E_Commerce_Reviews.csv\")\n", "df = pd.read_csv(\"imdb_reviews.csv\")" ] },
{ "cell_type": "code", "execution_count": null, "id": "aca597f3-c8da-4314-990e-253d5ed719da", "metadata": { "tags": [] }, "outputs": [], "source": [ "df.shape" ] },
{ "cell_type": "code", "execution_count": null, "id": "7d8131e4-4a69-45af-aa12-335c926e308f", "metadata": { "tags": [] }, "outputs": [], "source": [ "df.head(3)" ] },
{ "cell_type": "code", "execution_count": null, "id": "43a27caf-779b-4bd1-a3cf-fa641021172e", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Which classes does the label column contain?\n", "df['label'].unique()" ] },
{ "cell_type": "code", "execution_count": null, "id": "c72dd5ec-59b2-4c7f-a8fb-fdade866984d", "metadata": { "tags": [] }, "outputs": [], "source": [ "# How many reviews per class? (class balance matters for the split and the metrics)\n", "df['label'].value_counts()" ] },
{ "cell_type": "code", "execution_count": null, "id": "ba556f9b-da1c-4d13-8d70-563e0bd528a1", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Missing values per column\n", "df.isna().sum()" ] },
{ "cell_type": "markdown", "id": "819c31c3-873d-4d31-a21a-759059bd4c6d", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "### Split the dataset into training and testing sets" ] },
{ "cell_type": "code", "execution_count": null, "id": "6ca318a2-26d7-446e-8324-6660171f239d", "metadata": { "tags": [] }, "outputs": [], "source": [ "train_data, test_data, train_labels, test_labels = train_test_split(\n", "    df['text'], df['label'],\n", "    test_size=0.3,\n", "    random_state=42,       # fixed seed so the split is reproducible\n", "    stratify=df['label'],  # keep the label proportions the same in both splits\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "id": "f0cfc8fc-49e5-4c88-bb33-8084dcf00100", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Preview a few training texts instead of dumping the whole Series\n", "train_data.head()" ] },
{ "cell_type": 
"code", "execution_count": null, "id": "51d0a415-4982-43dd-8864-c189ba6826f4", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Preview the matching training labels\n", "train_labels.head()" ] },
{ "cell_type": "markdown", "id": "42987cdb-4cdf-46df-95d8-7c2b2824c1ee", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "### Create a pipeline" ] },
{ "cell_type": "code", "execution_count": null, "id": "06ffd548-c333-4c1a-87ce-9699ddd116ee", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Vectorizer and classifier are chained so that raw text goes in and a label comes out\n", "sentiment_pipeline = Pipeline([\n", "    ('tfidf', TfidfVectorizer()),\n", "    ('nb', MultinomialNB())\n", "])" ] },
{ "cell_type": "markdown", "id": "6bafa7cd-8d0b-4725-bd40-4a3b04634fab", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "### Train the model using the pipeline" ] },
{ "cell_type": "code", "execution_count": null, "id": "712dea09-52c2-4a9f-8bf9-3cbb273fe4b5", "metadata": { "tags": [] }, "outputs": [], "source": [ "sentiment_pipeline.fit(train_data, train_labels)\n" ] },
{ "cell_type": "markdown", "id": "4c95c599-ae0d-433f-9ed5-856fd9fa35e0", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "### Make predictions on the test set" ] },
{ "cell_type": "code", "execution_count": null, "id": "37ae9eda-4a02-4f40-bdeb-ecb8ea67f9d3", "metadata": { "tags": [] }, "outputs": [], "source": [ "predictions = sentiment_pipeline.predict(test_data)" ] },
{ "cell_type": "markdown", "id": "a33458e2-90cb-4c94-b977-8cc8ea5a273e", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "### Evaluate the model" ] },
{ "cell_type": "code", "execution_count": null, "id": "9ad90567-93d2-4090-81be-5c77f41e379a", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Precision, recall and F1 per class on the held-out test set\n", "report = classification_report(test_labels, predictions)\n", "\n", "print(\"Classification Report:\\n\", report)" ] },
{ "cell_type": "code", "execution_count": null, "id": "ef002e29-d065-4825-a076-3d23fdfa7b59", "metadata": { "tags": [] }, "outputs": [], 
"source": [ "# Label rows (actual) and columns (predicted) with the classifier's classes;\n", "# a bare array does not show which row or column belongs to which class\n", "class_labels = sentiment_pipeline.classes_\n", "cm = confusion_matrix(test_labels, predictions, labels=class_labels)\n", "pd.DataFrame(cm, index=class_labels, columns=class_labels).rename_axis(index='actual', columns='predicted')" ] },
{ "cell_type": "markdown", "id": "6e7729bb-a833-4feb-bd2a-b04a2741bd70", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "## Hugging Face: Pre-trained sentiment analysis model\n", "\n", "https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english" ] },
{ "cell_type": "code", "execution_count": null, "id": "9afad444-c2cc-4f3d-b49d-07a723be6154", "metadata": { "tags": [] }, "outputs": [], "source": [ "from transformers import pipeline\n", "\n", "# To pin the exact model version, add e.g. revision=\"af0f99b\" to the call below\n", "sentiment_analyzer = pipeline('sentiment-analysis', model=\"distilbert-base-uncased-finetuned-sst-2-english\")\n", "data = [\"I love you\", \"I hate you\"]\n", "sentiment_analyzer(data)\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "4987efd9-8ca8-40b1-90cc-ff361207fb8f", "metadata": { "tags": [] }, "outputs": [], "source": [ "result = sentiment_analyzer(\"I love using this model!\")\n", "print(result)" ] },
{ "cell_type": "markdown", "id": "68436dda-e3c3-499d-b390-60443f9a1796", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "## Hugging Face: Thai" ] },
{ "cell_type": "markdown", "id": "72a9f8e0-12bf-403b-8b78-e381a65e9eaa", "metadata": {}, "source": [ "### model=\"poom-sci/WangchanBERTa-finetuned-sentiment\"\n", "\n", "https://huggingface.co/poom-sci/WangchanBERTa-finetuned-sentiment" ] },
{ "cell_type": "code", "execution_count": null, "id": "d698825b-3bd7-4370-871f-ac6e5fe5fe47", "metadata": { "tags": [] }, "outputs": [], "source": [ "from transformers import pipeline\n", "\n", "# From here on `sentiment_analyzer` refers to the Thai model.\n", "# To pin the exact model version, add e.g. revision=\"b78d071\" to the call below\n", "sentiment_analyzer = pipeline('sentiment-analysis', model=\"poom-sci/WangchanBERTa-finetuned-sentiment\")\n", "\n", "data = [\"อร่อยจัดๆ\", \"รอนานแท้\"]\n", "sentiment_analyzer(data)\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "87d815d4-135c-471e-93ee-cacc93653d4e", "metadata": { "tags": [] }, "outputs": [], "source": [ 
"sentiment_analyzer(\"ข้าวบูด\")" ] },
{ "cell_type": "code", "execution_count": null, "id": "60f5c43a-6cb7-47f1-85c5-751e91599ad9", "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "markdown", "id": "f894a4bd-1f04-4126-aa8d-e0211b41687e", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "## Deploy on Streamlit Sharing\n", "\n", "https://share.streamlit.io/ or https://huggingface.co/spaces\n", "\n", "https://docs.streamlit.io/library/api-reference\n", "\n", "https://github.com/\n", "\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "dfd5baee-dc74-4f6d-84be-52a2b89d0f28", "metadata": { "tags": [] }, "outputs": [], "source": [ "%%writefile app_senti.py\n", "import streamlit as st\n", "from transformers import pipeline\n", "\n", "MODEL_NAME = \"poom-sci/WangchanBERTa-finetuned-sentiment\"\n", "\n", "\n", "@st.cache_resource  # requires Streamlit >= 1.18\n", "def load_sentiment_analyzer(model_name):\n", "    \"\"\"Create the sentiment-analysis pipeline for `model_name`.\n", "\n", "    Cached because Streamlit re-runs the whole script on every interaction;\n", "    without the cache the model would be loaded again on each rerun.\n", "    \"\"\"\n", "    return pipeline('sentiment-analysis', model=model_name)\n", "\n", "\n", "sentiment_analyzer = load_sentiment_analyzer(MODEL_NAME)\n", "\n", "# Streamlit app\n", "st.title(\"Thai Sentiment Analysis App\")\n", "\n", "# Input text\n", "text_input = st.text_area(\"Enter Thai text for sentiment analysis\", \"ขอความเห็นหน่อย... 
\")\n", "\n", "# Button to trigger analysis\n", "if st.button(\"Analyze Sentiment\"):\n", "    if not text_input.strip():\n", "        # Nothing to classify: ask for input instead of scoring an empty string\n", "        st.warning(\"Please enter some text to analyze.\")\n", "    else:\n", "        # One result dict per input text; truncation keeps very long inputs\n", "        # within the model's maximum sequence length\n", "        result = sentiment_analyzer([text_input], truncation=True)[0]\n", "        sentiment = result['label']\n", "        score = result['score']\n", "\n", "        st.subheader(\"Sentiment Analysis Result:\")\n", "\n", "        # Any label other than 'pos' / 'neg' is reported as neutral\n", "        if sentiment == 'pos':\n", "            st.success(f\"Positive Sentiment (Score: {score:.2f})\")\n", "        elif sentiment == 'neg':\n", "            st.error(f\"Negative Sentiment (Score: {score:.2f})\")\n", "        else:\n", "            st.warning(f\"Neutral Sentiment (Score: {score:.2f})\")\n", "\n", "        # Score of the predicted label shown as a progress bar\n", "        st.progress(score)\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "70111967-b904-4f18-a8d0-0c8701ec35ab", "metadata": {}, "outputs": [], "source": [ "%%writefile requirements.txt\n", "streamlit\n", "transformers\n", "torch\n", "sentencepiece\n" ] },
{ "cell_type": "code", "execution_count": null, "id": "88001002-587d-403d-ab65-d060bde9d42d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" } }, "nbformat": 4, "nbformat_minor": 5 }