{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_sa = r\"C:\\Users\\LaurèneDAVID\\Documents\\Teaching\\Educational_apps\\reviews_data.csv\"\n",
    "data = pd.read_csv(os.path.join(path_sa,\"reviews_data.csv\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 850 entries, 0 to 849\n",
      "Data columns (total 6 columns):\n",
      " #   Column       Non-Null Count  Dtype  \n",
      "---  ------       --------------  -----  \n",
      " 0   name         850 non-null    object \n",
      " 1   location     850 non-null    object \n",
      " 2   Date         850 non-null    object \n",
      " 3   Rating       705 non-null    float64\n",
      " 4   Review       850 non-null    object \n",
      " 5   Image_Links  850 non-null    object \n",
      "dtypes: float64(1), object(5)\n",
      "memory usage: 40.0+ KB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.loc[data[\"Review\"]!=\"No Review Text\"].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "index_out = data.loc[data[\"Image_Links\"]!=\"['No Images']\"].index\n",
    "index_in = [i for i in list(data.index) if i not in list(index_out)]\n",
    "add_data = data.iloc[index_in].sample(54)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_data_clean = pd.concat([data.iloc[index_out].reset_index(drop=True),\n",
    "                                add_data.reset_index(drop=True)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_location(x):\n",
    "    state = x.split(\",\")[1].strip().upper()\n",
    "    if state == \"CALIFORNIA\":\n",
    "        state = \"CA\"\n",
    "    if state == \"ALBERTA\":\n",
    "        state = \"OTHER\"\n",
    "\n",
    "    return state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_data_clean[\"location\"] = reviews_data_clean[\"location\"].apply(clean_location)\n",
    "reviews_data_clean[\"Date\"] = reviews_data_clean[\"Date\"].apply(lambda x: x.split(\"Reviewed\")[1].strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "# FINAL DATASET\n",
    "reviews_data_final = reviews_data_clean.loc[reviews_data_clean[\"location\"].isin([\"CA\",\"TX\",\"PA\",\"OR\",\"MO\",\"MN\"])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "\n",
    "def format_date(date):\n",
    "    if \"Jan.\" in date:\n",
    "        date = date.replace(\"Jan.\",\"January\")\n",
    "    if \"Feb.\" in date:\n",
    "        date = date.replace(\"Feb.\", \"February\")\n",
    "    if \"Aug.\" in date:\n",
    "        date = date.replace(\"Aug.\", \"August\")\n",
    "    if \"Sept.\" in date:\n",
    "        date = date.replace(\"Sept.\", \"September\")\n",
    "    if \"Oct.\" in date:\n",
    "        date = date.replace(\"Oct.\", \"October\")\n",
    "    if \"Nov.\" in date: \n",
    "        date = date.replace(\"Nov.\", \"November\")\n",
    "    if \"Dec.\" in date:\n",
    "        date = date.replace(\"Dec.\", \"December\")\n",
    "\n",
    "    date = date.replace(\",\",\"\")\n",
    "\n",
    "    parsed_date = datetime.strptime(date, \"%B %d %Y\")\n",
    "\n",
    "    # Format the date as MM/DD/YYYY\n",
    "    formatted_date = parsed_date.strftime(\"%m-%d-%Y\")\n",
    "    \n",
    "    return formatted_date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\LaurèneDAVID\\AppData\\Local\\Temp\\ipykernel_3128\\1306421503.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  reviews_data_final[\"Date\"] = pd.to_datetime(reviews_data_final[\"Date\"].apply(format_date))\n"
     ]
    }
   ],
   "source": [
    "reviews_data_final[\"Date\"] = pd.to_datetime(reviews_data_final[\"Date\"].apply(format_date))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "dict_states = {\"CA\":\"California\",\n",
    "               \"TX\":\"Texas\",\n",
    "               \"PA\":\"Pennsylvania\",\n",
    "               \"OR\":\"Oregon\",\n",
    "               \"MO\":\"Missouri\",\n",
    "               \"MN\":\"Minnesota\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\LaurèneDAVID\\AppData\\Local\\Temp\\ipykernel_3128\\3901736406.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  reviews_data_final[\"location\"] = reviews_data_final[\"location\"].map(dict_states)\n"
     ]
    }
   ],
   "source": [
    "reviews_data_final[\"location\"] = reviews_data_final[\"location\"].map(dict_states)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\LaurèneDAVID\\AppData\\Local\\Temp\\ipykernel_3128\\332411794.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  reviews_data_final.drop(columns=[\"name\"],inplace=True)\n"
     ]
    }
   ],
   "source": [
    "reviews_data_final.drop(columns=[\"name\"],inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re \n",
    "\n",
    "def clean_text(text):\n",
    "    pattern_punct = r\"[^\\w\\s.',:/]\"\n",
    "    pattern_date = r'\\b\\d{1,2}/\\d{1,2}/\\d{2,4}\\b'\n",
    "\n",
    "    text = text.lower()\n",
    "    text = re.sub(pattern_date, '', text)\n",
    "    text = re.sub(pattern_punct, '', text)\n",
    "    text = text.replace(\"ggg\",\"g\")\n",
    "    text = text.replace(\"  \",\" \")\n",
    "\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "clean_image_urls = [url.replace(\"['\",\"\").replace(\"']\",\"\").replace(\"',\",\" \").replace(\"'\",'') for url in reviews_data_final[\"Image_Links\"].to_list()]\n",
    "clean_image_urls = [elem.split(\"  \") for elem in clean_image_urls]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "images_1 = []\n",
    "images_2 = []\n",
    "\n",
    "for elem in clean_image_urls:\n",
    "\n",
    "    if elem[0] != \"No Images\":\n",
    "        images_1.append(elem[0])\n",
    "    else: \n",
    "        images_1.append(np.nan)\n",
    "\n",
    "    if len(elem) == 2:\n",
    "        images_2.append(elem[1])\n",
    "    else:\n",
    "        images_2.append(np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\LaurèneDAVID\\AppData\\Local\\Temp\\ipykernel_3128\\464558646.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  reviews_data_final[\"Image 1\"] = images_1\n",
      "C:\\Users\\LaurèneDAVID\\AppData\\Local\\Temp\\ipykernel_3128\\464558646.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  reviews_data_final[\"Image 2\"] = images_2\n"
     ]
    }
   ],
   "source": [
    "reviews_data_final[\"Image 1\"] = images_1\n",
    "reviews_data_final[\"Image 2\"] = images_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\LaurèneDAVID\\AppData\\Local\\Temp\\ipykernel_3128\\1924389021.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  reviews_data_final.rename({\"location\":\"State\"}, axis=1, inplace=True)\n"
     ]
    }
   ],
   "source": [
    "reviews_data_final.rename({\"location\":\"State\"}, axis=1, inplace=True)\n",
    "reviews_data_final.drop(columns=[\"Image_Links\"])\n",
    "reviews_data_final = reviews_data_final[[\"Date\",\"State\",\"Review\",\"Rating\",\"Image 1\",\"Image 2\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_data_final[\"Year\"] = reviews_data_final[\"Date\"].dt.year\n",
    "reviews_data_final.insert(0, \"ID\", [f\"{i}\" for i in np.arange(1, len(reviews_data_final)+1)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_data_final.to_pickle(os.path.join(path_sa,\"reviews_raw.pkl\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_data_final_clean = reviews_data_final.copy()\n",
    "reviews_data_final_clean[\"Review\"] = reviews_data_final_clean[\"Review\"].apply(clean_text)\n",
    "# reviews_data_final.to_pickle(os.path.join(path_sa,\"reviews_clean.pkl\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pysentimiento import create_analyzer\n",
    "\n",
    "list_reviews = reviews_data_final_clean[\"Review\"].to_list()\n",
    "sentiment_analyzer = create_analyzer(task=\"sentiment\", lang=\"en\")\n",
    "predictions = []\n",
    "positive = []\n",
    "negative = []\n",
    "neutral = []\n",
    "\n",
    "for review in list_reviews:\n",
    "    #if review.split(\" \")\n",
    "    q = sentiment_analyzer.predict(review)\n",
    "\n",
    "    predictions.append(q.output)\n",
    "    positive.append(q.probas[\"POS\"])\n",
    "    negative.append(q.probas[\"NEG\"])\n",
    "    neutral.append(q.probas[\"NEU\"])\n",
    "\n",
    "# Results\n",
    "df_results = reviews_data_final_clean.copy()\n",
    "df_results[\"Result\"] = predictions\n",
    "df_results[\"Result\"] = df_results[\"Result\"].map({\"NEU\":\"Neutral\", \"NEG\":\"Negative\", \"POS\":\"Positive\"})\n",
    "df_results[\"Negative\"] = np.round(np.array(negative)*100)\n",
    "df_results[\"Neutral\"] = np.round(np.array(neutral)*100)\n",
    "df_results[\"Positive\"] = np.round(np.array(positive)*100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_results.to_pickle(os.path.join(path_sa,\"reviews_results.pkl\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}