{ "cells": [ { "cell_type": "markdown", "id": "6c97a769", "metadata": {}, "source": [ "# Overview data\n", "\n", "**Note:** In this notebook, I assume the dataset is already clean and skip exploratory data analysis (EDA)." ] }, { "cell_type": "code", "execution_count": 1, "id": "a54afd58", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "cdb44c97", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
User-IDLocationAge
01nyc, new york, usaNaN
12stockton, california, usa18.0
23moscow, yukon territory, russiaNaN
34porto, v.n.gaia, portugal17.0
45farnborough, hants, united kingdomNaN
\n", "
" ], "text/plain": [ " User-ID Location Age\n", "0 1 nyc, new york, usa NaN\n", "1 2 stockton, california, usa 18.0\n", "2 3 moscow, yukon territory, russia NaN\n", "3 4 porto, v.n.gaia, portugal 17.0\n", "4 5 farnborough, hants, united kingdom NaN" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Directory holding the raw CSV files; later cells build their paths from it.\n", "path = \"./dataset\"\n", "\n", "# Users table: ';'-separated text decoded as ISO-8859-1.\n", "users_csv = f\"{path}/users.csv\"\n", "user_df = pd.read_csv(users_csv, sep=';', encoding='ISO-8859-1')\n", "user_df.head()" ] }, { "cell_type": "code", "execution_count": 3, "id": "fe62dfa3", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "b'Skipping line 6452: expected 8 fields, saw 9\\nSkipping line 43667: expected 8 fields, saw 10\\nSkipping line 51751: expected 8 fields, saw 9\\n'\n", "b'Skipping line 92038: expected 8 fields, saw 9\\nSkipping line 104319: expected 8 fields, saw 9\\nSkipping line 121768: expected 8 fields, saw 9\\n'\n", "b'Skipping line 144058: expected 8 fields, saw 9\\nSkipping line 150789: expected 8 fields, saw 9\\nSkipping line 157128: expected 8 fields, saw 9\\nSkipping line 180189: expected 8 fields, saw 9\\nSkipping line 185738: expected 8 fields, saw 9\\n'\n", "b'Skipping line 209388: expected 8 fields, saw 9\\nSkipping line 220626: expected 8 fields, saw 9\\nSkipping line 227933: expected 8 fields, saw 11\\nSkipping line 228957: expected 8 fields, saw 10\\nSkipping line 245933: expected 8 fields, saw 9\\nSkipping line 251296: expected 8 fields, saw 9\\nSkipping line 259941: expected 8 fields, saw 9\\nSkipping line 261529: expected 8 fields, saw 9\\n'\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISBNBook-TitleBook-AuthorYear-Of-PublicationPublisherImage-URL-SImage-URL-MImage-URL-L
00195153448Classical MythologyMark P. O. Morford2002Oxford University Presshttp://images.amazon.com/images/P/0195153448.0...http://images.amazon.com/images/P/0195153448.0...http://images.amazon.com/images/P/0195153448.0...
10002005018Clara CallanRichard Bruce Wright2001HarperFlamingo Canadahttp://images.amazon.com/images/P/0002005018.0...http://images.amazon.com/images/P/0002005018.0...http://images.amazon.com/images/P/0002005018.0...
20060973129Decision in NormandyCarlo D'Este1991HarperPerennialhttp://images.amazon.com/images/P/0060973129.0...http://images.amazon.com/images/P/0060973129.0...http://images.amazon.com/images/P/0060973129.0...
30374157065Flu: The Story of the Great Influenza Pandemic...Gina Bari Kolata1999Farrar Straus Girouxhttp://images.amazon.com/images/P/0374157065.0...http://images.amazon.com/images/P/0374157065.0...http://images.amazon.com/images/P/0374157065.0...
40393045218The Mummies of UrumchiE. J. W. Barber1999W. W. Norton & Companyhttp://images.amazon.com/images/P/0393045218.0...http://images.amazon.com/images/P/0393045218.0...http://images.amazon.com/images/P/0393045218.0...
\n", "
" ], "text/plain": [ " ISBN Book-Title \\\n", "0 0195153448 Classical Mythology \n", "1 0002005018 Clara Callan \n", "2 0060973129 Decision in Normandy \n", "3 0374157065 Flu: The Story of the Great Influenza Pandemic... \n", "4 0393045218 The Mummies of Urumchi \n", "\n", " Book-Author Year-Of-Publication Publisher \\\n", "0 Mark P. O. Morford 2002 Oxford University Press \n", "1 Richard Bruce Wright 2001 HarperFlamingo Canada \n", "2 Carlo D'Este 1991 HarperPerennial \n", "3 Gina Bari Kolata 1999 Farrar Straus Giroux \n", "4 E. J. W. Barber 1999 W. W. Norton & Company \n", "\n", " Image-URL-S \\\n", "0 http://images.amazon.com/images/P/0195153448.0... \n", "1 http://images.amazon.com/images/P/0002005018.0... \n", "2 http://images.amazon.com/images/P/0060973129.0... \n", "3 http://images.amazon.com/images/P/0374157065.0... \n", "4 http://images.amazon.com/images/P/0393045218.0... \n", "\n", " Image-URL-M \\\n", "0 http://images.amazon.com/images/P/0195153448.0... \n", "1 http://images.amazon.com/images/P/0002005018.0... \n", "2 http://images.amazon.com/images/P/0060973129.0... \n", "3 http://images.amazon.com/images/P/0374157065.0... \n", "4 http://images.amazon.com/images/P/0393045218.0... \n", "\n", " Image-URL-L \n", "0 http://images.amazon.com/images/P/0195153448.0... \n", "1 http://images.amazon.com/images/P/0002005018.0... \n", "2 http://images.amazon.com/images/P/0060973129.0... \n", "3 http://images.amazon.com/images/P/0374157065.0... \n", "4 http://images.amazon.com/images/P/0393045218.0... " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Book dataset: ';'-separated text decoded as ISO-8859-1.\n", "# Rows whose field count does not match the header are skipped;\n", "# `on_bad_lines=\"warn\"` asks pandas to report each skipped row. It replaces\n", "# `error_bad_lines=False`, which pandas deprecated in 1.3 and removed in 2.0.\n", "book_df = pd.read_csv(\n", "    f\"{path}/books.csv\",\n", "    delimiter=';',\n", "    encoding='ISO-8859-1',\n", "    on_bad_lines=\"warn\",\n", ")\n", "book_df.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "d9fa4750", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
User-IDISBNBook-Rating
0276725034545104X0
127672601550612245
227672704465208020
3276729052165615X3
427672905217950286
527673320806747220
627673632572242818
727673706005709676
8276744038550120X7
927674534231053810
\n", "
" ], "text/plain": [ " User-ID ISBN Book-Rating\n", "0 276725 034545104X 0\n", "1 276726 0155061224 5\n", "2 276727 0446520802 0\n", "3 276729 052165615X 3\n", "4 276729 0521795028 6\n", "5 276733 2080674722 0\n", "6 276736 3257224281 8\n", "7 276737 0600570967 6\n", "8 276744 038550120X 7\n", "9 276745 342310538 10" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Ratings table: ';'-separated text decoded as ISO-8859-1.\n", "ratings_csv = f\"{path}/ratings.csv\"\n", "rating_df = pd.read_csv(ratings_csv, sep=';', encoding='ISO-8859-1')\n", "\n", "# Preview the first ten rows.\n", "rating_df.head(10)" ] }, { "cell_type": "code", "execution_count": 5, "id": "53c66ec4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Column labels of the ratings table.\n", "rating_df.columns" ] }, { "cell_type": "code", "execution_count": 6, "id": "691767c0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Mean-RatingNum-Rating
ISBN
03302998913.02
03754041201.52
05860450070.01
90229061163.52
90328033280.01
\n", "
" ], "text/plain": [ " Mean-Rating Num-Rating\n", "ISBN \n", " 0330299891 3.0 2\n", " 0375404120 1.5 2\n", " 0586045007 0.0 1\n", " 9022906116 3.5 2\n", " 9032803328 0.0 1" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Per-ISBN rating summary, indexed by ISBN:\n", "#   Mean-Rating = mean of Book-Rating\n", "#   Num-Rating  = count of User-ID entries\n", "# Named aggregation sets the output column names directly, so no rename step\n", "# is needed. The stray `axis=0` keyword was dropped: it is not an option of\n", "# the \"mean\"/\"count\" aggregations.\n", "summary_rating = rating_df.groupby(\"ISBN\").agg(\n", "    **{\n", "        \"Mean-Rating\": (\"Book-Rating\", \"mean\"),\n", "        \"Num-Rating\": (\"User-ID\", \"count\"),\n", "    }\n", ")\n", "summary_rating.head()" ] }, { "cell_type": "markdown", "id": "3e20611a", "metadata": {}, "source": [ "**Note:** In this repo, I only consider `book_df` and `rating_df`." ] }, { "cell_type": "code", "execution_count": 7, "id": "82e1b680", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISBNBook-TitleBook-AuthorYear-Of-PublicationPublisherMean-RatingNum-Rating
00195153448Classical MythologyMark P. O. Morford2002Oxford University Press0.0000001.0
10002005018Clara CallanRichard Bruce Wright2001HarperFlamingo Canada4.92857114.0
20060973129Decision in NormandyCarlo D'Este1991HarperPerennial5.0000003.0
30374157065Flu: The Story of the Great Influenza Pandemic...Gina Bari Kolata1999Farrar Straus Giroux4.27272711.0
40393045218The Mummies of UrumchiE. J. W. Barber1999W. W. Norton & Company0.0000001.0
\n", "
" ], "text/plain": [ " ISBN Book-Title \\\n", "0 0195153448 Classical Mythology \n", "1 0002005018 Clara Callan \n", "2 0060973129 Decision in Normandy \n", "3 0374157065 Flu: The Story of the Great Influenza Pandemic... \n", "4 0393045218 The Mummies of Urumchi \n", "\n", " Book-Author Year-Of-Publication Publisher \\\n", "0 Mark P. O. Morford 2002 Oxford University Press \n", "1 Richard Bruce Wright 2001 HarperFlamingo Canada \n", "2 Carlo D'Este 1991 HarperPerennial \n", "3 Gina Bari Kolata 1999 Farrar Straus Giroux \n", "4 E. J. W. Barber 1999 W. W. Norton & Company \n", "\n", " Mean-Rating Num-Rating \n", "0 0.000000 1.0 \n", "1 4.928571 14.0 \n", "2 5.000000 3.0 \n", "3 4.272727 11.0 \n", "4 0.000000 1.0 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Attach the per-ISBN rating summary to every book (left join keeps all books),\n", "# then leave out the three cover-image URL columns.\n", "image_url_columns = [\"Image-URL-S\", \"Image-URL-M\", \"Image-URL-L\"]\n", "df = (\n", "    book_df\n", "    .merge(summary_rating, how=\"left\", on=\"ISBN\")\n", "    .drop(columns=image_url_columns)\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 9, "id": "fb397a05", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ISBN 0\n", "Book-Title 0\n", "Book-Author 1\n", "Year-Of-Publication 0\n", "Publisher 2\n", "Mean-Rating 1209\n", "Num-Rating 1209\n", "dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Missing values per column.\n", "df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 10, "id": "7c7139ed", "metadata": {}, "outputs": [], "source": [ "# Write the merged table into the dataset directory, without the row index.\n", "summary_csv = f\"{path}/summary_book.csv\"\n", "df.to_csv(summary_csv, index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autoclose": false, "autocomplete": 
true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false } }, "nbformat": 4, "nbformat_minor": 5 }