{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Review dataset preparation and sentiment scoring\n",
    "\n",
    "Selects reviews from `reviews_data.csv`, normalises their location, date and image links, and scores every review with `pysentimiento`.\n",
    "\n",
    "**Input** - `reviews_data.csv` (850 rows at the time of writing) with columns `name`, `location`, `Date`, `Rating` (705 non-null), `Review`, `Image_Links`.\n",
    "\n",
    "**Outputs** (written to `DATA_DIR`)\n",
    "- `reviews_raw.pkl`: selected reviews with the original review text.\n",
    "- `reviews_results.pkl`: same reviews with cleaned text, predicted sentiment and class probabilities (in %).\n",
    "\n",
    "The notebook is meant to be run top to bottom on a fresh kernel (Restart Kernel, Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "import os\n",
    "import re\n",
    "from datetime import datetime\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pysentimiento import create_analyzer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder holding reviews_data.csv; the pickles are written next to it.\n",
    "# Point REVIEWS_DATA_DIR at your data folder instead of hard-coding an absolute path.\n",
    "DATA_DIR = Path(os.environ.get(\"REVIEWS_DATA_DIR\", \".\"))\n",
    "\n",
    "SEED = 42                  # fixes the random sample drawn below\n",
    "N_REVIEWS_NO_IMAGE = 54    # number of image-less reviews added to the dataset\n",
    "\n",
    "# Placeholders used by the raw file for missing content\n",
    "NO_REVIEW_TEXT = \"No Review Text\"\n",
    "NO_IMAGES = \"No Images\"\n",
    "NO_IMAGES_RAW = \"['No Images']\"   # how the placeholder appears in Image_Links\n",
    "\n",
    "# States kept in the final dataset: code -> display name\n",
    "dict_states = {\"CA\": \"California\",\n",
    "               \"TX\": \"Texas\",\n",
    "               \"PA\": \"Pennsylvania\",\n",
    "               \"OR\": \"Oregon\",\n",
    "               \"MO\": \"Missouri\",\n",
    "               \"MN\": \"Minnesota\"}\n",
    "\n",
    "# Labels returned by the sentiment analyzer -> display name\n",
    "SENTIMENT_LABELS = {\"NEU\": \"Neutral\", \"NEG\": \"Negative\", \"POS\": \"Positive\"}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_raw = pd.read_csv(DATA_DIR / \"reviews_data.csv\")\n",
    "reviews_raw.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select reviews\n",
    "\n",
    "Drop reviews without text, keep every review that has images and add a seeded random sample of reviews without images."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_with_text = (\n",
    "    reviews_raw\n",
    "    .loc[reviews_raw[\"Review\"] != NO_REVIEW_TEXT]\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "has_images = reviews_with_text[\"Image_Links\"] != NO_IMAGES_RAW\n",
    "reviews_no_image = reviews_with_text.loc[~has_images]\n",
    "\n",
    "reviews_data_clean = pd.concat(\n",
    "    [\n",
    "        reviews_with_text.loc[has_images],\n",
    "        # random_state makes the selection identical on every run;\n",
    "        # min() avoids a ValueError when fewer rows are available\n",
    "        reviews_no_image.sample(\n",
    "            n=min(N_REVIEWS_NO_IMAGE, len(reviews_no_image)),\n",
    "            random_state=SEED,\n",
    "        ),\n",
    "    ],\n",
    "    ignore_index=True,  # avoid duplicated index labels in the combined frame\n",
    ")\n",
    "reviews_data_clean.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Location and date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "STATE_ALIASES = {\"CALIFORNIA\": \"CA\", \"ALBERTA\": \"OTHER\"}\n",
    "\n",
    "MONTH_ABBREVIATIONS = {\n",
    "    \"Jan.\": \"January\",\n",
    "    \"Feb.\": \"February\",\n",
    "    \"Mar.\": \"March\",\n",
    "    \"Apr.\": \"April\",\n",
    "    \"Jun.\": \"June\",\n",
    "    \"Jul.\": \"July\",\n",
    "    \"Aug.\": \"August\",\n",
    "    \"Sept.\": \"September\",\n",
    "    \"Sep.\": \"September\",\n",
    "    \"Oct.\": \"October\",\n",
    "    \"Nov.\": \"November\",\n",
    "    \"Dec.\": \"December\",\n",
    "}\n",
    "\n",
    "DATE_FORMAT = \"%m-%d-%Y\"   # format produced by format_date\n",
    "\n",
    "\n",
    "def clean_location(x):\n",
    "    \"\"\"Return the upper-cased state part of a 'City, State' location.\n",
    "\n",
    "    Known aliases are normalised ('CALIFORNIA' -> 'CA', 'ALBERTA' -> 'OTHER').\n",
    "    A location without a comma-separated state part returns 'OTHER', so it is\n",
    "    filtered out with the other unwanted locations instead of raising IndexError.\n",
    "    \"\"\"\n",
    "    parts = str(x).split(\",\")\n",
    "    if len(parts) < 2:\n",
    "        return \"OTHER\"\n",
    "    state = parts[1].strip().upper()\n",
    "    return STATE_ALIASES.get(state, state)\n",
    "\n",
    "\n",
    "def extract_review_date(raw):\n",
    "    \"\"\"Return the text that follows the word 'Reviewed' in the raw Date field.\n",
    "\n",
    "    If the word is missing, the stripped input is returned unchanged.\n",
    "    \"\"\"\n",
    "    _, marker, tail = raw.partition(\"Reviewed\")\n",
    "    return tail.strip() if marker else raw.strip()\n",
    "\n",
    "\n",
    "def format_date(date):\n",
    "    \"\"\"Convert a date such as 'Sept. 13, 2023' to an 'MM-DD-YYYY' string.\n",
    "\n",
    "    Abbreviated month names are expanded first because strptime's %B only\n",
    "    accepts full month names. Raises ValueError if the text is not a\n",
    "    '<Month> <day>, <year>' date.\n",
    "    \"\"\"\n",
    "    for abbreviation, month in MONTH_ABBREVIATIONS.items():\n",
    "        date = date.replace(abbreviation, month)\n",
    "\n",
    "    parsed_date = datetime.strptime(date.replace(\",\", \"\"), \"%B %d %Y\")\n",
    "    return parsed_date.strftime(DATE_FORMAT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_located = reviews_data_clean.assign(\n",
    "    location=reviews_data_clean[\"location\"].apply(clean_location),\n",
    "    Date=reviews_data_clean[\"Date\"].apply(extract_review_date),\n",
    ")\n",
    "\n",
    "# Keep only the states listed in dict_states. Dates are parsed after the\n",
    "# filter so that unusual dates in discarded rows cannot stop the run.\n",
    "in_kept_state = reviews_located[\"location\"].isin(list(dict_states))\n",
    "\n",
    "reviews_kept = (\n",
    "    reviews_located\n",
    "    .loc[in_kept_state]\n",
    "    .assign(\n",
    "        Date=lambda df: pd.to_datetime(df[\"Date\"].apply(format_date), format=DATE_FORMAT),\n",
    "        location=lambda df: df[\"location\"].map(dict_states),\n",
    "    )\n",
    "    .drop(columns=[\"name\"])\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "assert reviews_kept[\"location\"].notna().all(), \"a state code was not mapped to a name\"\n",
    "assert reviews_kept[\"Date\"].notna().all(), \"a review date was not parsed\"\n",
    "reviews_kept.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Image links\n",
    "\n",
    "`Image_Links` stores a Python list written as text (for example `['url_1', 'url_2']` or `['No Images']`). It is parsed as a list so that the `No Images` placeholder, which contains a space, is never mistaken for two URLs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_image_links(raw):\n",
    "    \"\"\"Parse the stringified list stored in Image_Links into a list of URLs.\n",
    "\n",
    "    The 'No Images' placeholder is dropped. Anything that cannot be parsed as\n",
    "    a list of strings yields an empty list.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        links = ast.literal_eval(raw)\n",
    "    except (ValueError, SyntaxError, TypeError):\n",
    "        return []\n",
    "    if not isinstance(links, (list, tuple)):\n",
    "        return []\n",
    "    return [link for link in links if isinstance(link, str) and link != NO_IMAGES]\n",
    "\n",
    "\n",
    "def nth_or_nan(items, position):\n",
    "    \"\"\"Return items[position], or NaN when the list is too short.\"\"\"\n",
    "    return items[position] if position < len(items) else np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "image_links = reviews_kept[\"Image_Links\"].map(parse_image_links)\n",
    "\n",
    "reviews_data_final = (\n",
    "    reviews_kept\n",
    "    .rename(columns={\"location\": \"State\"})\n",
    "    .assign(\n",
    "        **{\n",
    "            # only the first two images are kept, even when a review has more\n",
    "            \"Image 1\": image_links.map(lambda urls: nth_or_nan(urls, 0)),\n",
    "            \"Image 2\": image_links.map(lambda urls: nth_or_nan(urls, 1)),\n",
    "            \"Year\": lambda df: df[\"Date\"].dt.year,\n",
    "            \"ID\": lambda df: [str(i) for i in range(1, len(df) + 1)],\n",
    "        }\n",
    "    )\n",
    "    .loc[:, [\"ID\", \"Date\", \"State\", \"Review\", \"Rating\", \"Image 1\", \"Image 2\", \"Year\"]]\n",
    ")\n",
    "\n",
    "assert reviews_data_final[\"ID\"].is_unique\n",
    "reviews_data_final.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_data_final.to_pickle(DATA_DIR / \"reviews_raw.pkl\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean review text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PATTERN_DATE = re.compile(r\"\\b\\d{1,2}/\\d{1,2}/\\d{2,4}\\b\")\n",
    "PATTERN_PUNCT = re.compile(r\"[^\\w\\s.',:/]\")\n",
    "PATTERN_SPACES = re.compile(r\" {2,}\")\n",
    "\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Normalise a review before sentiment analysis.\n",
    "\n",
    "    Lower-cases the text, removes numeric dates (d/m/y style), removes every\n",
    "    character that is not a word character, whitespace or one of . ' , : /,\n",
    "    replaces 'ggg' with 'g' and collapses runs of spaces left by the removals.\n",
    "    \"\"\"\n",
    "    text = text.lower()\n",
    "    text = PATTERN_DATE.sub(\"\", text)\n",
    "    text = PATTERN_PUNCT.sub(\"\", text)\n",
    "    text = text.replace(\"ggg\", \"g\")\n",
    "    return PATTERN_SPACES.sub(\" \", text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_data_final_clean = reviews_data_final.assign(\n",
    "    Review=reviews_data_final[\"Review\"].apply(clean_text)\n",
    ")\n",
    "reviews_data_final_clean[\"Review\"].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sentiment analysis\n",
    "\n",
    "Each cleaned review is scored separately. The probabilities are stored as rounded percentages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def probability_percent(analyses, label):\n",
    "    \"\"\"Return the rounded probability (in %) of `label` for every analysed review.\"\"\"\n",
    "    return np.round(np.array([analysis.probas[label] for analysis in analyses]) * 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentiment_analyzer = create_analyzer(task=\"sentiment\", lang=\"en\")\n",
    "\n",
    "analyses = [\n",
    "    sentiment_analyzer.predict(review)\n",
    "    for review in reviews_data_final_clean[\"Review\"]\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Results\n",
    "df_results = reviews_data_final_clean.assign(\n",
    "    Result=[SENTIMENT_LABELS.get(analysis.output, np.nan) for analysis in analyses],\n",
    "    Negative=probability_percent(analyses, \"NEG\"),\n",
    "    Neutral=probability_percent(analyses, \"NEU\"),\n",
    "    Positive=probability_percent(analyses, \"POS\"),\n",
    ")\n",
    "\n",
    "assert len(df_results) == len(reviews_data_final)\n",
    "df_results.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_results.to_pickle(DATA_DIR / \"reviews_results.pkl\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}