{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "7e59ad5c", "metadata": {}, "outputs": [], "source": [ "import json\n", "import random\n", "\n", "# Define the path to the full Yelp dataset file\n", "full_data_path = \"yelp_academic_dataset_review.json\"\n", "\n", "# Define the path to save the sampled dataset file\n", "sampled_data_path = \"yelp_academic_dataset_review_sampled.json\"\n", "\n", "# Define the number of reviews to sample (adjust as needed)\n", "num_reviews_to_sample = 10000 # Example: Sample 10,000 reviews\n", "\n", "# Load all reviews from the full dataset\n", "all_reviews = []\n", "with open(full_data_path, \"r\", encoding=\"utf-8\") as f:\n", " for line in f:\n", " review = json.loads(line)\n", " all_reviews.append(review)\n", "\n", "# Randomly sample a subset of reviews\n", "sampled_reviews = random.sample(all_reviews, num_reviews_to_sample)\n", "\n", "# Save the sampled reviews to a new JSON file\n", "with open(sampled_data_path, \"w\", encoding=\"utf-8\") as f:\n", " for review in sampled_reviews:\n", " json.dump(review, f)\n", " f.write(\"\\n\")\n", "\n", "print(f\"Sampled {num_reviews_to_sample} reviews and saved to {sampled_data_path}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f562ff04", "metadata": {}, "outputs": [], "source": [ "import gzip\n", "\n", "# Define the path to save the compressed dataset file\n", "compressed_data_path = \"yelp_academic_dataset_review_sampled.json.gz\"\n", "\n", "# Compress the sampled dataset file using gzip\n", "with open(sampled_data_path, \"rb\") as f_in:\n", " with gzip.open(compressed_data_path, \"wb\") as f_out:\n", " f_out.writelines(f_in)\n", "\n", "print(f\"Compressed file saved to {compressed_data_path}\")\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "337f6649", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import classification_report, accuracy_score\n", "\n", "# Load the preprocessed Yelp dataset (sampled and compressed if applicable)\n", "data_path = \"yelp_academic_dataset_review_sampled.json.gz\" # Adjust the path\n", "data = pd.read_json(data_path, lines=True)" ] }, { "cell_type": "code", "execution_count": 2, "id": "e0936968", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
review_iduser_idbusiness_idstarsusefulfunnycooltextdate
0f9khuhJxadQhg6CaI1cRdA4Qijwb2RDiUGc4SBjA2lJgnTBStZYJfHGdSZJbpaBiPA4101I had read about this place adding a second lo...2011-02-08 17:48:40
1WH0c1wEMu4XRTIysI7uMig7JeW4Mlvqdp7R-FAUBB_vAH3Tmgv94pbGvBIKZ4Rs9Cw5101I had dinner at Tin Angel on Saturday and was ...2012-04-16 13:30:02
2S1Lg07IGrupUDk7Uu9rnQQumUy5DTpVrvQDXLR4gywHAH7BikysfQbS9bMULQsCU_Q2410I was really excited to visit the store, havin...2019-10-05 00:17:15
3AH4_Pua0yzK4oU9FoU8hXQuwYw0KKj16lC_nq_HsQGVQXb6QfBbleg2aJT2cG807jQ1100I hired Two Men and a Truck for my recent move...2016-06-02 13:27:24
49_CIDS98p6ZsTRiCvmuIKAl9bVKgzvjjcU8Iang3TvtglqSJkyNSE1yPeux4PoR-pg1000i was very disappointed to this company. They ...2020-06-05 22:28:47
..............................
99955MknizHCBH3jpj5DJd-6Uwd2VrfngFJ1f1nvNAsojJzwhy-E7DdXbdgTbwphKUYW1w1100This was such a trash experience. We signed up...2021-07-29 16:10:10
9996mXFlaWuiCnyCkZ_SIAGqewcHWDGVf4LofBk9wZ2mnXQQAYWSFv6QxF5IjQSxITMUug5000I have been going to Goshen Nail Salon for the...2018-03-16 00:30:50
9997W1Ij-zC3ufRU5MTEgHLjmgaN9nWudz5rfar7rHr9lHfAoyJ3gXNkV0DO0YxcaTgtTg5000Ok. This place surprised me. I always thought ...2018-06-01 23:56:44
9998HNejB5H9iD1qe3MMKxg6sg6JejVLZl5M-IB3UkNTkXtQWJLKQTduGumxjlXelqiuKg3000Meets expectations, but quirky. The trucks re...2016-06-29 15:57:34
9999LSJGzHJ7whqNn5uPxidMjQ_Av1LaAAY0Y8YcPp7Ck7fgM983OPfVRnwvG7zEOzykCA5000Jordan was our waiter. He was very attentive a...2017-03-15 23:54:07
\n", "

10000 rows × 9 columns

\n", "
" ], "text/plain": [ " review_id user_id business_id \\\n", "0 f9khuhJxadQhg6CaI1cRdA 4Qijwb2RDiUGc4SBjA2lJg nTBStZYJfHGdSZJbpaBiPA \n", "1 WH0c1wEMu4XRTIysI7uMig 7JeW4Mlvqdp7R-FAUBB_vA H3Tmgv94pbGvBIKZ4Rs9Cw \n", "2 S1Lg07IGrupUDk7Uu9rnQQ umUy5DTpVrvQDXLR4gywHA H7BikysfQbS9bMULQsCU_Q \n", "3 AH4_Pua0yzK4oU9FoU8hXQ uwYw0KKj16lC_nq_HsQGVQ Xb6QfBbleg2aJT2cG807jQ \n", "4 9_CIDS98p6ZsTRiCvmuIKA l9bVKgzvjjcU8Iang3Tvtg lqSJkyNSE1yPeux4PoR-pg \n", "... ... ... ... \n", "9995 5MknizHCBH3jpj5DJd-6Uw d2VrfngFJ1f1nvNAsojJzw hy-E7DdXbdgTbwphKUYW1w \n", "9996 mXFlaWuiCnyCkZ_SIAGqew cHWDGVf4LofBk9wZ2mnXQQ AYWSFv6QxF5IjQSxITMUug \n", "9997 W1Ij-zC3ufRU5MTEgHLjmg aN9nWudz5rfar7rHr9lHfA oyJ3gXNkV0DO0YxcaTgtTg \n", "9998 HNejB5H9iD1qe3MMKxg6sg 6JejVLZl5M-IB3UkNTkXtQ WJLKQTduGumxjlXelqiuKg \n", "9999 LSJGzHJ7whqNn5uPxidMjQ _Av1LaAAY0Y8YcPp7Ck7fg M983OPfVRnwvG7zEOzykCA \n", "\n", " stars useful funny cool \\\n", "0 4 1 0 1 \n", "1 5 1 0 1 \n", "2 2 4 1 0 \n", "3 1 1 0 0 \n", "4 1 0 0 0 \n", "... ... ... ... ... \n", "9995 1 1 0 0 \n", "9996 5 0 0 0 \n", "9997 5 0 0 0 \n", "9998 3 0 0 0 \n", "9999 5 0 0 0 \n", "\n", " text date \n", "0 I had read about this place adding a second lo... 2011-02-08 17:48:40 \n", "1 I had dinner at Tin Angel on Saturday and was ... 2012-04-16 13:30:02 \n", "2 I was really excited to visit the store, havin... 2019-10-05 00:17:15 \n", "3 I hired Two Men and a Truck for my recent move... 2016-06-02 13:27:24 \n", "4 i was very disappointed to this company. They ... 2020-06-05 22:28:47 \n", "... ... ... \n", "9995 This was such a trash experience. We signed up... 2021-07-29 16:10:10 \n", "9996 I have been going to Goshen Nail Salon for the... 2018-03-16 00:30:50 \n", "9997 Ok. This place surprised me. I always thought ... 2018-06-01 23:56:44 \n", "9998 Meets expectations, but quirky. The trucks re... 2016-06-29 15:57:34 \n", "9999 Jordan was our waiter. He was very attentive a... 2017-03-15 23:54:07 \n", "\n", "[10000 rows x 9 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 3, "id": "466ef010", "metadata": {}, "outputs": [], "source": [ "# Map stars to sentiment labels\n", "def map_sentiment(stars):\n", " if stars >= 4:\n", " return \"positive\"\n", " elif stars <= 2:\n", " return \"negative\"\n", " else:\n", " return \"neutral\" # Optional: Handle neutral sentiment if needed\n", "\n", "# Apply sentiment mapping to stars\n", "data['sentiment'] = data['stars'].apply(map_sentiment)\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "756b3285", "metadata": {}, "outputs": [], "source": [ "# Apply sentiment mapping to stars\n", "data['sentiment'] = data['stars'].apply(map_sentiment)\n", "\n", "# Split the data into training and testing sets\n", "train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)\n", "\n", "# Save the preprocessed data\n", "train_data.to_csv(\"preprocessed_train_data.csv\", index=False)\n", "test_data.to_csv(\"preprocessed_test_data.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "7257dd9d", "metadata": {}, "outputs": [], "source": [ "pip install torch\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f03a2ad5", "metadata": {}, "outputs": [], "source": [ "pip install transformers" ] }, { "cell_type": "code", "execution_count": null, "id": "ecdcf9c9", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import torch\n", "from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments\n", "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n", "\n", "# Load the preprocessed training and testing data\n", "train_data = pd.read_csv(\"preprocessed_train_data.csv\") # Adjust the path\n", "test_data = pd.read_csv(\"preprocessed_test_data.csv\") # Adjust the path\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c83718d7", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }