{
"cells": [
{
"cell_type": "markdown",
"id": "6c97a769",
"metadata": {},
"source": [
"# Overview data\n",
"\n",
"**Note: In this notebook, I assume the dataset is cleaned and ignore EDA.**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a54afd58",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cdb44c97",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" User-ID | \n",
" Location | \n",
" Age | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" nyc, new york, usa | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" stockton, california, usa | \n",
" 18.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" moscow, yukon territory, russia | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" porto, v.n.gaia, portugal | \n",
" 17.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" farnborough, hants, united kingdom | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" User-ID Location Age\n",
"0 1 nyc, new york, usa NaN\n",
"1 2 stockton, california, usa 18.0\n",
"2 3 moscow, yukon territory, russia NaN\n",
"3 4 porto, v.n.gaia, portugal 17.0\n",
"4 5 farnborough, hants, united kingdom NaN"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path = \"./dataset\"\n",
"\n",
"# user dataset\n",
"user_df = pd.read_csv(f\"{path}/users.csv\", delimiter=';', encoding='ISO-8859-1')\n",
"user_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "fe62dfa3",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"b'Skipping line 6452: expected 8 fields, saw 9\\nSkipping line 43667: expected 8 fields, saw 10\\nSkipping line 51751: expected 8 fields, saw 9\\n'\n",
"b'Skipping line 92038: expected 8 fields, saw 9\\nSkipping line 104319: expected 8 fields, saw 9\\nSkipping line 121768: expected 8 fields, saw 9\\n'\n",
"b'Skipping line 144058: expected 8 fields, saw 9\\nSkipping line 150789: expected 8 fields, saw 9\\nSkipping line 157128: expected 8 fields, saw 9\\nSkipping line 180189: expected 8 fields, saw 9\\nSkipping line 185738: expected 8 fields, saw 9\\n'\n",
"b'Skipping line 209388: expected 8 fields, saw 9\\nSkipping line 220626: expected 8 fields, saw 9\\nSkipping line 227933: expected 8 fields, saw 11\\nSkipping line 228957: expected 8 fields, saw 10\\nSkipping line 245933: expected 8 fields, saw 9\\nSkipping line 251296: expected 8 fields, saw 9\\nSkipping line 259941: expected 8 fields, saw 9\\nSkipping line 261529: expected 8 fields, saw 9\\n'\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ISBN | \n",
" Book-Title | \n",
" Book-Author | \n",
" Year-Of-Publication | \n",
" Publisher | \n",
" Image-URL-S | \n",
" Image-URL-M | \n",
" Image-URL-L | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0195153448 | \n",
" Classical Mythology | \n",
" Mark P. O. Morford | \n",
" 2002 | \n",
" Oxford University Press | \n",
" http://images.amazon.com/images/P/0195153448.0... | \n",
" http://images.amazon.com/images/P/0195153448.0... | \n",
" http://images.amazon.com/images/P/0195153448.0... | \n",
"
\n",
" \n",
" 1 | \n",
" 0002005018 | \n",
" Clara Callan | \n",
" Richard Bruce Wright | \n",
" 2001 | \n",
" HarperFlamingo Canada | \n",
" http://images.amazon.com/images/P/0002005018.0... | \n",
" http://images.amazon.com/images/P/0002005018.0... | \n",
" http://images.amazon.com/images/P/0002005018.0... | \n",
"
\n",
" \n",
" 2 | \n",
" 0060973129 | \n",
" Decision in Normandy | \n",
" Carlo D'Este | \n",
" 1991 | \n",
" HarperPerennial | \n",
" http://images.amazon.com/images/P/0060973129.0... | \n",
" http://images.amazon.com/images/P/0060973129.0... | \n",
" http://images.amazon.com/images/P/0060973129.0... | \n",
"
\n",
" \n",
" 3 | \n",
" 0374157065 | \n",
" Flu: The Story of the Great Influenza Pandemic... | \n",
" Gina Bari Kolata | \n",
" 1999 | \n",
" Farrar Straus Giroux | \n",
" http://images.amazon.com/images/P/0374157065.0... | \n",
" http://images.amazon.com/images/P/0374157065.0... | \n",
" http://images.amazon.com/images/P/0374157065.0... | \n",
"
\n",
" \n",
" 4 | \n",
" 0393045218 | \n",
" The Mummies of Urumchi | \n",
" E. J. W. Barber | \n",
" 1999 | \n",
" W. W. Norton & Company | \n",
" http://images.amazon.com/images/P/0393045218.0... | \n",
" http://images.amazon.com/images/P/0393045218.0... | \n",
" http://images.amazon.com/images/P/0393045218.0... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ISBN Book-Title \\\n",
"0 0195153448 Classical Mythology \n",
"1 0002005018 Clara Callan \n",
"2 0060973129 Decision in Normandy \n",
"3 0374157065 Flu: The Story of the Great Influenza Pandemic... \n",
"4 0393045218 The Mummies of Urumchi \n",
"\n",
" Book-Author Year-Of-Publication Publisher \\\n",
"0 Mark P. O. Morford 2002 Oxford University Press \n",
"1 Richard Bruce Wright 2001 HarperFlamingo Canada \n",
"2 Carlo D'Este 1991 HarperPerennial \n",
"3 Gina Bari Kolata 1999 Farrar Straus Giroux \n",
"4 E. J. W. Barber 1999 W. W. Norton & Company \n",
"\n",
" Image-URL-S \\\n",
"0 http://images.amazon.com/images/P/0195153448.0... \n",
"1 http://images.amazon.com/images/P/0002005018.0... \n",
"2 http://images.amazon.com/images/P/0060973129.0... \n",
"3 http://images.amazon.com/images/P/0374157065.0... \n",
"4 http://images.amazon.com/images/P/0393045218.0... \n",
"\n",
" Image-URL-M \\\n",
"0 http://images.amazon.com/images/P/0195153448.0... \n",
"1 http://images.amazon.com/images/P/0002005018.0... \n",
"2 http://images.amazon.com/images/P/0060973129.0... \n",
"3 http://images.amazon.com/images/P/0374157065.0... \n",
"4 http://images.amazon.com/images/P/0393045218.0... \n",
"\n",
" Image-URL-L \n",
"0 http://images.amazon.com/images/P/0195153448.0... \n",
"1 http://images.amazon.com/images/P/0002005018.0... \n",
"2 http://images.amazon.com/images/P/0060973129.0... \n",
"3 http://images.amazon.com/images/P/0374157065.0... \n",
"4 http://images.amazon.com/images/P/0393045218.0... "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# book dataset\n",
"book_df = pd.read_csv(f\"{path}/books.csv\", delimiter=';', encoding='ISO-8859-1', error_bad_lines=False)\n",
"book_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d9fa4750",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" User-ID | \n",
" ISBN | \n",
" Book-Rating | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 276725 | \n",
" 034545104X | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 276726 | \n",
" 0155061224 | \n",
" 5 | \n",
"
\n",
" \n",
" 2 | \n",
" 276727 | \n",
" 0446520802 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 276729 | \n",
" 052165615X | \n",
" 3 | \n",
"
\n",
" \n",
" 4 | \n",
" 276729 | \n",
" 0521795028 | \n",
" 6 | \n",
"
\n",
" \n",
" 5 | \n",
" 276733 | \n",
" 2080674722 | \n",
" 0 | \n",
"
\n",
" \n",
" 6 | \n",
" 276736 | \n",
" 3257224281 | \n",
" 8 | \n",
"
\n",
" \n",
" 7 | \n",
" 276737 | \n",
" 0600570967 | \n",
" 6 | \n",
"
\n",
" \n",
" 8 | \n",
" 276744 | \n",
" 038550120X | \n",
" 7 | \n",
"
\n",
" \n",
" 9 | \n",
" 276745 | \n",
" 342310538 | \n",
" 10 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" User-ID ISBN Book-Rating\n",
"0 276725 034545104X 0\n",
"1 276726 0155061224 5\n",
"2 276727 0446520802 0\n",
"3 276729 052165615X 3\n",
"4 276729 0521795028 6\n",
"5 276733 2080674722 0\n",
"6 276736 3257224281 8\n",
"7 276737 0600570967 6\n",
"8 276744 038550120X 7\n",
"9 276745 342310538 10"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# rating dataset\n",
"rating_df = pd.read_csv(f\"{path}/ratings.csv\", delimiter=';', encoding='ISO-8859-1')\n",
"rating_df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "53c66ec4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rating_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "691767c0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Mean-Rating | \n",
" Num-Rating | \n",
"
\n",
" \n",
" ISBN | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0330299891 | \n",
" 3.0 | \n",
" 2 | \n",
"
\n",
" \n",
" 0375404120 | \n",
" 1.5 | \n",
" 2 | \n",
"
\n",
" \n",
" 0586045007 | \n",
" 0.0 | \n",
" 1 | \n",
"
\n",
" \n",
" 9022906116 | \n",
" 3.5 | \n",
" 2 | \n",
"
\n",
" \n",
" 9032803328 | \n",
" 0.0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Mean-Rating Num-Rating\n",
"ISBN \n",
" 0330299891 3.0 2\n",
" 0375404120 1.5 2\n",
" 0586045007 0.0 1\n",
" 9022906116 3.5 2\n",
" 9032803328 0.0 1"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function = {\n",
" \"Book-Rating\": \"mean\",\n",
" \"User-ID\": \"count\"\n",
"}\n",
"\n",
"summary_rating = rating_df.groupby(\"ISBN\").agg(function, axis=0)\n",
"summary_rating = summary_rating.rename(columns={\"Book-Rating\": \"Mean-Rating\", \"User-ID\": \"Num-Rating\"})\n",
"summary_rating.head()"
]
},
{
"cell_type": "markdown",
"id": "3e20611a",
"metadata": {},
"source": [
"**Note:** In this repo, I only consider `book_df` and `rating_df`."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "82e1b680",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ISBN | \n",
" Book-Title | \n",
" Book-Author | \n",
" Year-Of-Publication | \n",
" Publisher | \n",
" Mean-Rating | \n",
" Num-Rating | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0195153448 | \n",
" Classical Mythology | \n",
" Mark P. O. Morford | \n",
" 2002 | \n",
" Oxford University Press | \n",
" 0.000000 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0002005018 | \n",
" Clara Callan | \n",
" Richard Bruce Wright | \n",
" 2001 | \n",
" HarperFlamingo Canada | \n",
" 4.928571 | \n",
" 14.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0060973129 | \n",
" Decision in Normandy | \n",
" Carlo D'Este | \n",
" 1991 | \n",
" HarperPerennial | \n",
" 5.000000 | \n",
" 3.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0374157065 | \n",
" Flu: The Story of the Great Influenza Pandemic... | \n",
" Gina Bari Kolata | \n",
" 1999 | \n",
" Farrar Straus Giroux | \n",
" 4.272727 | \n",
" 11.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0393045218 | \n",
" The Mummies of Urumchi | \n",
" E. J. W. Barber | \n",
" 1999 | \n",
" W. W. Norton & Company | \n",
" 0.000000 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ISBN Book-Title \\\n",
"0 0195153448 Classical Mythology \n",
"1 0002005018 Clara Callan \n",
"2 0060973129 Decision in Normandy \n",
"3 0374157065 Flu: The Story of the Great Influenza Pandemic... \n",
"4 0393045218 The Mummies of Urumchi \n",
"\n",
" Book-Author Year-Of-Publication Publisher \\\n",
"0 Mark P. O. Morford 2002 Oxford University Press \n",
"1 Richard Bruce Wright 2001 HarperFlamingo Canada \n",
"2 Carlo D'Este 1991 HarperPerennial \n",
"3 Gina Bari Kolata 1999 Farrar Straus Giroux \n",
"4 E. J. W. Barber 1999 W. W. Norton & Company \n",
"\n",
" Mean-Rating Num-Rating \n",
"0 0.000000 1.0 \n",
"1 4.928571 14.0 \n",
"2 5.000000 3.0 \n",
"3 4.272727 11.0 \n",
"4 0.000000 1.0 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = book_df.merge(summary_rating, how=\"left\", left_on=\"ISBN\", right_on=\"ISBN\")\n",
"df.drop(columns=[\"Image-URL-S\", \"Image-URL-M\", \"Image-URL-L\"], inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "fb397a05",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ISBN 0\n",
"Book-Title 0\n",
"Book-Author 1\n",
"Year-Of-Publication 0\n",
"Publisher 2\n",
"Mean-Rating 1209\n",
"Num-Rating 1209\n",
"dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7c7139ed",
"metadata": {},
"outputs": [],
"source": [
"# Save\n",
"df.to_csv(f\"{path}/summary_book.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,
"autoclose": false,
"autocomplete": true,
"bibliofile": "biblio.bib",
"cite_by": "apalike",
"current_citInitial": 1,
"eqLabelWithNumbers": true,
"eqNumInitial": 1,
"hotkeys": {
"equation": "Ctrl-E",
"itemize": "Ctrl-I"
},
"labels_anchors": false,
"latex_user_defs": false,
"report_style_numbering": false,
"user_envs_cfg": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}