quyanh committed on
Commit
97ce7fb
1 Parent(s): 713f497

initial commit

Demo Data.ipynb ADDED
@@ -0,0 +1,718 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "6c97a769",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Overview data\n",
9
+ "\n",
10
+ "**Note: In this notebook, I assume the dataset is cleaned and ignore EDA.**"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "id": "a54afd58",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "import pandas as pd\n",
21
+ "import numpy as np\n",
22
+ "import warnings\n",
23
+ "\n",
24
+ "warnings.filterwarnings(\"ignore\")"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 2,
30
+ "id": "cdb44c97",
31
+ "metadata": {},
32
+ "outputs": [
33
+ {
34
+ "data": {
35
+ "text/html": [
36
+ "<div>\n",
37
+ "<style scoped>\n",
38
+ " .dataframe tbody tr th:only-of-type {\n",
39
+ " vertical-align: middle;\n",
40
+ " }\n",
41
+ "\n",
42
+ " .dataframe tbody tr th {\n",
43
+ " vertical-align: top;\n",
44
+ " }\n",
45
+ "\n",
46
+ " .dataframe thead th {\n",
47
+ " text-align: right;\n",
48
+ " }\n",
49
+ "</style>\n",
50
+ "<table border=\"1\" class=\"dataframe\">\n",
51
+ " <thead>\n",
52
+ " <tr style=\"text-align: right;\">\n",
53
+ " <th></th>\n",
54
+ " <th>User-ID</th>\n",
55
+ " <th>Location</th>\n",
56
+ " <th>Age</th>\n",
57
+ " </tr>\n",
58
+ " </thead>\n",
59
+ " <tbody>\n",
60
+ " <tr>\n",
61
+ " <th>0</th>\n",
62
+ " <td>1</td>\n",
63
+ " <td>nyc, new york, usa</td>\n",
64
+ " <td>NaN</td>\n",
65
+ " </tr>\n",
66
+ " <tr>\n",
67
+ " <th>1</th>\n",
68
+ " <td>2</td>\n",
69
+ " <td>stockton, california, usa</td>\n",
70
+ " <td>18.0</td>\n",
71
+ " </tr>\n",
72
+ " <tr>\n",
73
+ " <th>2</th>\n",
74
+ " <td>3</td>\n",
75
+ " <td>moscow, yukon territory, russia</td>\n",
76
+ " <td>NaN</td>\n",
77
+ " </tr>\n",
78
+ " <tr>\n",
79
+ " <th>3</th>\n",
80
+ " <td>4</td>\n",
81
+ " <td>porto, v.n.gaia, portugal</td>\n",
82
+ " <td>17.0</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>4</th>\n",
86
+ " <td>5</td>\n",
87
+ " <td>farnborough, hants, united kingdom</td>\n",
88
+ " <td>NaN</td>\n",
89
+ " </tr>\n",
90
+ " </tbody>\n",
91
+ "</table>\n",
92
+ "</div>"
93
+ ],
94
+ "text/plain": [
95
+ " User-ID Location Age\n",
96
+ "0 1 nyc, new york, usa NaN\n",
97
+ "1 2 stockton, california, usa 18.0\n",
98
+ "2 3 moscow, yukon territory, russia NaN\n",
99
+ "3 4 porto, v.n.gaia, portugal 17.0\n",
100
+ "4 5 farnborough, hants, united kingdom NaN"
101
+ ]
102
+ },
103
+ "execution_count": 2,
104
+ "metadata": {},
105
+ "output_type": "execute_result"
106
+ }
107
+ ],
108
+ "source": [
109
+ "path = \"./dataset\"\n",
110
+ "\n",
111
+ "# user dataset\n",
112
+ "user_df = pd.read_csv(f\"{path}/users.csv\", delimiter=';', encoding='ISO-8859-1')\n",
113
+ "user_df.head()"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 3,
119
+ "id": "fe62dfa3",
120
+ "metadata": {},
121
+ "outputs": [
122
+ {
123
+ "name": "stderr",
124
+ "output_type": "stream",
125
+ "text": [
126
+ "b'Skipping line 6452: expected 8 fields, saw 9\\nSkipping line 43667: expected 8 fields, saw 10\\nSkipping line 51751: expected 8 fields, saw 9\\n'\n",
127
+ "b'Skipping line 92038: expected 8 fields, saw 9\\nSkipping line 104319: expected 8 fields, saw 9\\nSkipping line 121768: expected 8 fields, saw 9\\n'\n",
128
+ "b'Skipping line 144058: expected 8 fields, saw 9\\nSkipping line 150789: expected 8 fields, saw 9\\nSkipping line 157128: expected 8 fields, saw 9\\nSkipping line 180189: expected 8 fields, saw 9\\nSkipping line 185738: expected 8 fields, saw 9\\n'\n",
129
+ "b'Skipping line 209388: expected 8 fields, saw 9\\nSkipping line 220626: expected 8 fields, saw 9\\nSkipping line 227933: expected 8 fields, saw 11\\nSkipping line 228957: expected 8 fields, saw 10\\nSkipping line 245933: expected 8 fields, saw 9\\nSkipping line 251296: expected 8 fields, saw 9\\nSkipping line 259941: expected 8 fields, saw 9\\nSkipping line 261529: expected 8 fields, saw 9\\n'\n"
130
+ ]
131
+ },
132
+ {
133
+ "data": {
134
+ "text/html": [
135
+ "<div>\n",
136
+ "<style scoped>\n",
137
+ " .dataframe tbody tr th:only-of-type {\n",
138
+ " vertical-align: middle;\n",
139
+ " }\n",
140
+ "\n",
141
+ " .dataframe tbody tr th {\n",
142
+ " vertical-align: top;\n",
143
+ " }\n",
144
+ "\n",
145
+ " .dataframe thead th {\n",
146
+ " text-align: right;\n",
147
+ " }\n",
148
+ "</style>\n",
149
+ "<table border=\"1\" class=\"dataframe\">\n",
150
+ " <thead>\n",
151
+ " <tr style=\"text-align: right;\">\n",
152
+ " <th></th>\n",
153
+ " <th>ISBN</th>\n",
154
+ " <th>Book-Title</th>\n",
155
+ " <th>Book-Author</th>\n",
156
+ " <th>Year-Of-Publication</th>\n",
157
+ " <th>Publisher</th>\n",
158
+ " <th>Image-URL-S</th>\n",
159
+ " <th>Image-URL-M</th>\n",
160
+ " <th>Image-URL-L</th>\n",
161
+ " </tr>\n",
162
+ " </thead>\n",
163
+ " <tbody>\n",
164
+ " <tr>\n",
165
+ " <th>0</th>\n",
166
+ " <td>0195153448</td>\n",
167
+ " <td>Classical Mythology</td>\n",
168
+ " <td>Mark P. O. Morford</td>\n",
169
+ " <td>2002</td>\n",
170
+ " <td>Oxford University Press</td>\n",
171
+ " <td>http://images.amazon.com/images/P/0195153448.0...</td>\n",
172
+ " <td>http://images.amazon.com/images/P/0195153448.0...</td>\n",
173
+ " <td>http://images.amazon.com/images/P/0195153448.0...</td>\n",
174
+ " </tr>\n",
175
+ " <tr>\n",
176
+ " <th>1</th>\n",
177
+ " <td>0002005018</td>\n",
178
+ " <td>Clara Callan</td>\n",
179
+ " <td>Richard Bruce Wright</td>\n",
180
+ " <td>2001</td>\n",
181
+ " <td>HarperFlamingo Canada</td>\n",
182
+ " <td>http://images.amazon.com/images/P/0002005018.0...</td>\n",
183
+ " <td>http://images.amazon.com/images/P/0002005018.0...</td>\n",
184
+ " <td>http://images.amazon.com/images/P/0002005018.0...</td>\n",
185
+ " </tr>\n",
186
+ " <tr>\n",
187
+ " <th>2</th>\n",
188
+ " <td>0060973129</td>\n",
189
+ " <td>Decision in Normandy</td>\n",
190
+ " <td>Carlo D'Este</td>\n",
191
+ " <td>1991</td>\n",
192
+ " <td>HarperPerennial</td>\n",
193
+ " <td>http://images.amazon.com/images/P/0060973129.0...</td>\n",
194
+ " <td>http://images.amazon.com/images/P/0060973129.0...</td>\n",
195
+ " <td>http://images.amazon.com/images/P/0060973129.0...</td>\n",
196
+ " </tr>\n",
197
+ " <tr>\n",
198
+ " <th>3</th>\n",
199
+ " <td>0374157065</td>\n",
200
+ " <td>Flu: The Story of the Great Influenza Pandemic...</td>\n",
201
+ " <td>Gina Bari Kolata</td>\n",
202
+ " <td>1999</td>\n",
203
+ " <td>Farrar Straus Giroux</td>\n",
204
+ " <td>http://images.amazon.com/images/P/0374157065.0...</td>\n",
205
+ " <td>http://images.amazon.com/images/P/0374157065.0...</td>\n",
206
+ " <td>http://images.amazon.com/images/P/0374157065.0...</td>\n",
207
+ " </tr>\n",
208
+ " <tr>\n",
209
+ " <th>4</th>\n",
210
+ " <td>0393045218</td>\n",
211
+ " <td>The Mummies of Urumchi</td>\n",
212
+ " <td>E. J. W. Barber</td>\n",
213
+ " <td>1999</td>\n",
214
+ " <td>W. W. Norton &amp;amp; Company</td>\n",
215
+ " <td>http://images.amazon.com/images/P/0393045218.0...</td>\n",
216
+ " <td>http://images.amazon.com/images/P/0393045218.0...</td>\n",
217
+ " <td>http://images.amazon.com/images/P/0393045218.0...</td>\n",
218
+ " </tr>\n",
219
+ " </tbody>\n",
220
+ "</table>\n",
221
+ "</div>"
222
+ ],
223
+ "text/plain": [
224
+ " ISBN Book-Title \\\n",
225
+ "0 0195153448 Classical Mythology \n",
226
+ "1 0002005018 Clara Callan \n",
227
+ "2 0060973129 Decision in Normandy \n",
228
+ "3 0374157065 Flu: The Story of the Great Influenza Pandemic... \n",
229
+ "4 0393045218 The Mummies of Urumchi \n",
230
+ "\n",
231
+ " Book-Author Year-Of-Publication Publisher \\\n",
232
+ "0 Mark P. O. Morford 2002 Oxford University Press \n",
233
+ "1 Richard Bruce Wright 2001 HarperFlamingo Canada \n",
234
+ "2 Carlo D'Este 1991 HarperPerennial \n",
235
+ "3 Gina Bari Kolata 1999 Farrar Straus Giroux \n",
236
+ "4 E. J. W. Barber 1999 W. W. Norton &amp; Company \n",
237
+ "\n",
238
+ " Image-URL-S \\\n",
239
+ "0 http://images.amazon.com/images/P/0195153448.0... \n",
240
+ "1 http://images.amazon.com/images/P/0002005018.0... \n",
241
+ "2 http://images.amazon.com/images/P/0060973129.0... \n",
242
+ "3 http://images.amazon.com/images/P/0374157065.0... \n",
243
+ "4 http://images.amazon.com/images/P/0393045218.0... \n",
244
+ "\n",
245
+ " Image-URL-M \\\n",
246
+ "0 http://images.amazon.com/images/P/0195153448.0... \n",
247
+ "1 http://images.amazon.com/images/P/0002005018.0... \n",
248
+ "2 http://images.amazon.com/images/P/0060973129.0... \n",
249
+ "3 http://images.amazon.com/images/P/0374157065.0... \n",
250
+ "4 http://images.amazon.com/images/P/0393045218.0... \n",
251
+ "\n",
252
+ " Image-URL-L \n",
253
+ "0 http://images.amazon.com/images/P/0195153448.0... \n",
254
+ "1 http://images.amazon.com/images/P/0002005018.0... \n",
255
+ "2 http://images.amazon.com/images/P/0060973129.0... \n",
256
+ "3 http://images.amazon.com/images/P/0374157065.0... \n",
257
+ "4 http://images.amazon.com/images/P/0393045218.0... "
258
+ ]
259
+ },
260
+ "execution_count": 3,
261
+ "metadata": {},
262
+ "output_type": "execute_result"
263
+ }
264
+ ],
265
+ "source": [
266
+ "# book dataset\n",
267
+ "book_df = pd.read_csv(f\"{path}/books.csv\", delimiter=';', encoding='ISO-8859-1', error_bad_lines=False)\n",
268
+ "book_df.head()"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 4,
274
+ "id": "d9fa4750",
275
+ "metadata": {},
276
+ "outputs": [
277
+ {
278
+ "data": {
279
+ "text/html": [
280
+ "<div>\n",
281
+ "<style scoped>\n",
282
+ " .dataframe tbody tr th:only-of-type {\n",
283
+ " vertical-align: middle;\n",
284
+ " }\n",
285
+ "\n",
286
+ " .dataframe tbody tr th {\n",
287
+ " vertical-align: top;\n",
288
+ " }\n",
289
+ "\n",
290
+ " .dataframe thead th {\n",
291
+ " text-align: right;\n",
292
+ " }\n",
293
+ "</style>\n",
294
+ "<table border=\"1\" class=\"dataframe\">\n",
295
+ " <thead>\n",
296
+ " <tr style=\"text-align: right;\">\n",
297
+ " <th></th>\n",
298
+ " <th>User-ID</th>\n",
299
+ " <th>ISBN</th>\n",
300
+ " <th>Book-Rating</th>\n",
301
+ " </tr>\n",
302
+ " </thead>\n",
303
+ " <tbody>\n",
304
+ " <tr>\n",
305
+ " <th>0</th>\n",
306
+ " <td>276725</td>\n",
307
+ " <td>034545104X</td>\n",
308
+ " <td>0</td>\n",
309
+ " </tr>\n",
310
+ " <tr>\n",
311
+ " <th>1</th>\n",
312
+ " <td>276726</td>\n",
313
+ " <td>0155061224</td>\n",
314
+ " <td>5</td>\n",
315
+ " </tr>\n",
316
+ " <tr>\n",
317
+ " <th>2</th>\n",
318
+ " <td>276727</td>\n",
319
+ " <td>0446520802</td>\n",
320
+ " <td>0</td>\n",
321
+ " </tr>\n",
322
+ " <tr>\n",
323
+ " <th>3</th>\n",
324
+ " <td>276729</td>\n",
325
+ " <td>052165615X</td>\n",
326
+ " <td>3</td>\n",
327
+ " </tr>\n",
328
+ " <tr>\n",
329
+ " <th>4</th>\n",
330
+ " <td>276729</td>\n",
331
+ " <td>0521795028</td>\n",
332
+ " <td>6</td>\n",
333
+ " </tr>\n",
334
+ " <tr>\n",
335
+ " <th>5</th>\n",
336
+ " <td>276733</td>\n",
337
+ " <td>2080674722</td>\n",
338
+ " <td>0</td>\n",
339
+ " </tr>\n",
340
+ " <tr>\n",
341
+ " <th>6</th>\n",
342
+ " <td>276736</td>\n",
343
+ " <td>3257224281</td>\n",
344
+ " <td>8</td>\n",
345
+ " </tr>\n",
346
+ " <tr>\n",
347
+ " <th>7</th>\n",
348
+ " <td>276737</td>\n",
349
+ " <td>0600570967</td>\n",
350
+ " <td>6</td>\n",
351
+ " </tr>\n",
352
+ " <tr>\n",
353
+ " <th>8</th>\n",
354
+ " <td>276744</td>\n",
355
+ " <td>038550120X</td>\n",
356
+ " <td>7</td>\n",
357
+ " </tr>\n",
358
+ " <tr>\n",
359
+ " <th>9</th>\n",
360
+ " <td>276745</td>\n",
361
+ " <td>342310538</td>\n",
362
+ " <td>10</td>\n",
363
+ " </tr>\n",
364
+ " </tbody>\n",
365
+ "</table>\n",
366
+ "</div>"
367
+ ],
368
+ "text/plain": [
369
+ " User-ID ISBN Book-Rating\n",
370
+ "0 276725 034545104X 0\n",
371
+ "1 276726 0155061224 5\n",
372
+ "2 276727 0446520802 0\n",
373
+ "3 276729 052165615X 3\n",
374
+ "4 276729 0521795028 6\n",
375
+ "5 276733 2080674722 0\n",
376
+ "6 276736 3257224281 8\n",
377
+ "7 276737 0600570967 6\n",
378
+ "8 276744 038550120X 7\n",
379
+ "9 276745 342310538 10"
380
+ ]
381
+ },
382
+ "execution_count": 4,
383
+ "metadata": {},
384
+ "output_type": "execute_result"
385
+ }
386
+ ],
387
+ "source": [
388
+ "# rating dataset\n",
389
+ "rating_df = pd.read_csv(f\"{path}/ratings.csv\", delimiter=';', encoding='ISO-8859-1')\n",
390
+ "rating_df.head(10)"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": 5,
396
+ "id": "53c66ec4",
397
+ "metadata": {},
398
+ "outputs": [
399
+ {
400
+ "data": {
401
+ "text/plain": [
402
+ "Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')"
403
+ ]
404
+ },
405
+ "execution_count": 5,
406
+ "metadata": {},
407
+ "output_type": "execute_result"
408
+ }
409
+ ],
410
+ "source": [
411
+ "rating_df.columns"
412
+ ]
413
+ },
414
+ {
415
+ "cell_type": "code",
416
+ "execution_count": 6,
417
+ "id": "691767c0",
418
+ "metadata": {},
419
+ "outputs": [
420
+ {
421
+ "data": {
422
+ "text/html": [
423
+ "<div>\n",
424
+ "<style scoped>\n",
425
+ " .dataframe tbody tr th:only-of-type {\n",
426
+ " vertical-align: middle;\n",
427
+ " }\n",
428
+ "\n",
429
+ " .dataframe tbody tr th {\n",
430
+ " vertical-align: top;\n",
431
+ " }\n",
432
+ "\n",
433
+ " .dataframe thead th {\n",
434
+ " text-align: right;\n",
435
+ " }\n",
436
+ "</style>\n",
437
+ "<table border=\"1\" class=\"dataframe\">\n",
438
+ " <thead>\n",
439
+ " <tr style=\"text-align: right;\">\n",
440
+ " <th></th>\n",
441
+ " <th>Mean-Rating</th>\n",
442
+ " <th>Num-Rating</th>\n",
443
+ " </tr>\n",
444
+ " <tr>\n",
445
+ " <th>ISBN</th>\n",
446
+ " <th></th>\n",
447
+ " <th></th>\n",
448
+ " </tr>\n",
449
+ " </thead>\n",
450
+ " <tbody>\n",
451
+ " <tr>\n",
452
+ " <th>0330299891</th>\n",
453
+ " <td>3.0</td>\n",
454
+ " <td>2</td>\n",
455
+ " </tr>\n",
456
+ " <tr>\n",
457
+ " <th>0375404120</th>\n",
458
+ " <td>1.5</td>\n",
459
+ " <td>2</td>\n",
460
+ " </tr>\n",
461
+ " <tr>\n",
462
+ " <th>0586045007</th>\n",
463
+ " <td>0.0</td>\n",
464
+ " <td>1</td>\n",
465
+ " </tr>\n",
466
+ " <tr>\n",
467
+ " <th>9022906116</th>\n",
468
+ " <td>3.5</td>\n",
469
+ " <td>2</td>\n",
470
+ " </tr>\n",
471
+ " <tr>\n",
472
+ " <th>9032803328</th>\n",
473
+ " <td>0.0</td>\n",
474
+ " <td>1</td>\n",
475
+ " </tr>\n",
476
+ " </tbody>\n",
477
+ "</table>\n",
478
+ "</div>"
479
+ ],
480
+ "text/plain": [
481
+ " Mean-Rating Num-Rating\n",
482
+ "ISBN \n",
483
+ " 0330299891 3.0 2\n",
484
+ " 0375404120 1.5 2\n",
485
+ " 0586045007 0.0 1\n",
486
+ " 9022906116 3.5 2\n",
487
+ " 9032803328 0.0 1"
488
+ ]
489
+ },
490
+ "execution_count": 6,
491
+ "metadata": {},
492
+ "output_type": "execute_result"
493
+ }
494
+ ],
495
+ "source": [
496
+ "function = {\n",
497
+ " \"Book-Rating\": \"mean\",\n",
498
+ " \"User-ID\": \"count\"\n",
499
+ "}\n",
500
+ "\n",
501
+ "summary_rating = rating_df.groupby(\"ISBN\").agg(function, axis=0)\n",
502
+ "summary_rating = summary_rating.rename(columns={\"Book-Rating\": \"Mean-Rating\", \"User-ID\": \"Num-Rating\"})\n",
503
+ "summary_rating.head()"
504
+ ]
505
+ },
506
+ {
507
+ "cell_type": "markdown",
508
+ "id": "3e20611a",
509
+ "metadata": {},
510
+ "source": [
511
+ "**Note:** In this repo, I only consider `book_df` and `rating_df`."
512
+ ]
513
+ },
514
+ {
515
+ "cell_type": "code",
516
+ "execution_count": 7,
517
+ "id": "82e1b680",
518
+ "metadata": {},
519
+ "outputs": [
520
+ {
521
+ "data": {
522
+ "text/html": [
523
+ "<div>\n",
524
+ "<style scoped>\n",
525
+ " .dataframe tbody tr th:only-of-type {\n",
526
+ " vertical-align: middle;\n",
527
+ " }\n",
528
+ "\n",
529
+ " .dataframe tbody tr th {\n",
530
+ " vertical-align: top;\n",
531
+ " }\n",
532
+ "\n",
533
+ " .dataframe thead th {\n",
534
+ " text-align: right;\n",
535
+ " }\n",
536
+ "</style>\n",
537
+ "<table border=\"1\" class=\"dataframe\">\n",
538
+ " <thead>\n",
539
+ " <tr style=\"text-align: right;\">\n",
540
+ " <th></th>\n",
541
+ " <th>ISBN</th>\n",
542
+ " <th>Book-Title</th>\n",
543
+ " <th>Book-Author</th>\n",
544
+ " <th>Year-Of-Publication</th>\n",
545
+ " <th>Publisher</th>\n",
546
+ " <th>Mean-Rating</th>\n",
547
+ " <th>Num-Rating</th>\n",
548
+ " </tr>\n",
549
+ " </thead>\n",
550
+ " <tbody>\n",
551
+ " <tr>\n",
552
+ " <th>0</th>\n",
553
+ " <td>0195153448</td>\n",
554
+ " <td>Classical Mythology</td>\n",
555
+ " <td>Mark P. O. Morford</td>\n",
556
+ " <td>2002</td>\n",
557
+ " <td>Oxford University Press</td>\n",
558
+ " <td>0.000000</td>\n",
559
+ " <td>1.0</td>\n",
560
+ " </tr>\n",
561
+ " <tr>\n",
562
+ " <th>1</th>\n",
563
+ " <td>0002005018</td>\n",
564
+ " <td>Clara Callan</td>\n",
565
+ " <td>Richard Bruce Wright</td>\n",
566
+ " <td>2001</td>\n",
567
+ " <td>HarperFlamingo Canada</td>\n",
568
+ " <td>4.928571</td>\n",
569
+ " <td>14.0</td>\n",
570
+ " </tr>\n",
571
+ " <tr>\n",
572
+ " <th>2</th>\n",
573
+ " <td>0060973129</td>\n",
574
+ " <td>Decision in Normandy</td>\n",
575
+ " <td>Carlo D'Este</td>\n",
576
+ " <td>1991</td>\n",
577
+ " <td>HarperPerennial</td>\n",
578
+ " <td>5.000000</td>\n",
579
+ " <td>3.0</td>\n",
580
+ " </tr>\n",
581
+ " <tr>\n",
582
+ " <th>3</th>\n",
583
+ " <td>0374157065</td>\n",
584
+ " <td>Flu: The Story of the Great Influenza Pandemic...</td>\n",
585
+ " <td>Gina Bari Kolata</td>\n",
586
+ " <td>1999</td>\n",
587
+ " <td>Farrar Straus Giroux</td>\n",
588
+ " <td>4.272727</td>\n",
589
+ " <td>11.0</td>\n",
590
+ " </tr>\n",
591
+ " <tr>\n",
592
+ " <th>4</th>\n",
593
+ " <td>0393045218</td>\n",
594
+ " <td>The Mummies of Urumchi</td>\n",
595
+ " <td>E. J. W. Barber</td>\n",
596
+ " <td>1999</td>\n",
597
+ " <td>W. W. Norton &amp;amp; Company</td>\n",
598
+ " <td>0.000000</td>\n",
599
+ " <td>1.0</td>\n",
600
+ " </tr>\n",
601
+ " </tbody>\n",
602
+ "</table>\n",
603
+ "</div>"
604
+ ],
605
+ "text/plain": [
606
+ " ISBN Book-Title \\\n",
607
+ "0 0195153448 Classical Mythology \n",
608
+ "1 0002005018 Clara Callan \n",
609
+ "2 0060973129 Decision in Normandy \n",
610
+ "3 0374157065 Flu: The Story of the Great Influenza Pandemic... \n",
611
+ "4 0393045218 The Mummies of Urumchi \n",
612
+ "\n",
613
+ " Book-Author Year-Of-Publication Publisher \\\n",
614
+ "0 Mark P. O. Morford 2002 Oxford University Press \n",
615
+ "1 Richard Bruce Wright 2001 HarperFlamingo Canada \n",
616
+ "2 Carlo D'Este 1991 HarperPerennial \n",
617
+ "3 Gina Bari Kolata 1999 Farrar Straus Giroux \n",
618
+ "4 E. J. W. Barber 1999 W. W. Norton &amp; Company \n",
619
+ "\n",
620
+ " Mean-Rating Num-Rating \n",
621
+ "0 0.000000 1.0 \n",
622
+ "1 4.928571 14.0 \n",
623
+ "2 5.000000 3.0 \n",
624
+ "3 4.272727 11.0 \n",
625
+ "4 0.000000 1.0 "
626
+ ]
627
+ },
628
+ "execution_count": 7,
629
+ "metadata": {},
630
+ "output_type": "execute_result"
631
+ }
632
+ ],
633
+ "source": [
634
+ "df = book_df.merge(summary_rating, how=\"left\", left_on=\"ISBN\", right_on=\"ISBN\")\n",
635
+ "df.drop(columns=[\"Image-URL-S\", \"Image-URL-M\", \"Image-URL-L\"], inplace=True)\n",
636
+ "df.head()"
637
+ ]
638
+ },
639
+ {
640
+ "cell_type": "code",
641
+ "execution_count": 9,
642
+ "id": "fb397a05",
643
+ "metadata": {},
644
+ "outputs": [
645
+ {
646
+ "data": {
647
+ "text/plain": [
648
+ "ISBN 0\n",
649
+ "Book-Title 0\n",
650
+ "Book-Author 1\n",
651
+ "Year-Of-Publication 0\n",
652
+ "Publisher 2\n",
653
+ "Mean-Rating 1209\n",
654
+ "Num-Rating 1209\n",
655
+ "dtype: int64"
656
+ ]
657
+ },
658
+ "execution_count": 9,
659
+ "metadata": {},
660
+ "output_type": "execute_result"
661
+ }
662
+ ],
663
+ "source": [
664
+ "df.isnull().sum()"
665
+ ]
666
+ },
667
+ {
668
+ "cell_type": "code",
669
+ "execution_count": 10,
670
+ "id": "7c7139ed",
671
+ "metadata": {},
672
+ "outputs": [],
673
+ "source": [
674
+ "# Save\n",
675
+ "df.to_csv(f\"{path}/summary_book.csv\", index=False)"
676
+ ]
677
+ }
678
+ ],
679
+ "metadata": {
680
+ "kernelspec": {
681
+ "display_name": "Python 3",
682
+ "language": "python",
683
+ "name": "python3"
684
+ },
685
+ "language_info": {
686
+ "codemirror_mode": {
687
+ "name": "ipython",
688
+ "version": 3
689
+ },
690
+ "file_extension": ".py",
691
+ "mimetype": "text/x-python",
692
+ "name": "python",
693
+ "nbconvert_exporter": "python",
694
+ "pygments_lexer": "ipython3",
695
+ "version": "3.8.10"
696
+ },
697
+ "latex_envs": {
698
+ "LaTeX_envs_menu_present": true,
699
+ "autoclose": false,
700
+ "autocomplete": true,
701
+ "bibliofile": "biblio.bib",
702
+ "cite_by": "apalike",
703
+ "current_citInitial": 1,
704
+ "eqLabelWithNumbers": true,
705
+ "eqNumInitial": 1,
706
+ "hotkeys": {
707
+ "equation": "Ctrl-E",
708
+ "itemize": "Ctrl-I"
709
+ },
710
+ "labels_anchors": false,
711
+ "latex_user_defs": false,
712
+ "report_style_numbering": false,
713
+ "user_envs_cfg": false
714
+ }
715
+ },
716
+ "nbformat": 4,
717
+ "nbformat_minor": 5
718
+ }
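The notebook's summary step is a groupby-aggregate followed by a left merge onto the book table. A minimal sketch of the same pattern on made-up toy ratings (the ISBNs below are reused from the examples above purely for illustration):

import pandas as pd

# Toy ratings: one book rated twice, one rated once
ratings = pd.DataFrame({
    "User-ID": [1, 2, 3],
    "ISBN": ["0195153448", "0195153448", "0002005018"],
    "Book-Rating": [4, 6, 10],
})

summary = ratings.groupby("ISBN").agg({"Book-Rating": "mean", "User-ID": "count"})
summary = summary.rename(columns={"Book-Rating": "Mean-Rating", "User-ID": "Num-Rating"})
print(summary)
#             Mean-Rating  Num-Rating
# ISBN
# 0002005018         10.0           1
# 0195153448          5.0           2

Books that never appear in the ratings table pick up NaN in both summary columns after the left merge, which is why df.isnull().sum() above reports 1209 missing values for Mean-Rating and Num-Rating.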
README.md CHANGED
@@ -1,12 +1,9 @@
  ---
  title: Book Recommender System
- emoji: 📉
- colorFrom: blue
- colorTo: gray
+ emoji: 👀
+ colorFrom: purple
+ colorTo: purple
  sdk: streamlit
- sdk_version: 1.21.0
  app_file: app.py
  pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
app.py ADDED
@@ -0,0 +1,57 @@
1
+ import json
2
+
3
+ import streamlit as st
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+
8
+ # Parameters
9
+ data_dir = './processed'
10
+ weight_dir = './weight'
11
+ info_path = './processed/summary_book.csv'
12
+ num = 10
13
+ lb = 0
14
+
15
+ # Load R matrix from file
16
+ R = np.load(f'{data_dir}/R.npy', allow_pickle=True)
17
+ # Load prediction
18
+ prediction = np.load(f'{weight_dir}/predicted.npy', allow_pickle=True)
19
+ # Load dictionaries from JSON files
20
+ with open(f'{data_dir}/user_id_map.json', 'r') as file:
21
+     user2id = json.load(file)
22
+ with open(f'{data_dir}/book_id_map.json', 'r') as file:
23
+     book2id = json.load(file)
24
+
25
+
26
+ # Define the recommendation function for the Streamlit app
27
+ def recommend_books(user_id):
28
+     # Recommend
29
+     user_idx = user2id[str(user_id)]
30
+     predict = prediction[:, user_idx]  # get prediction for user
31
+     predict_dict = {book: np.round(predict[idx], 2) for book, idx in book2id.items()}
32
+     # Load information about the books
33
+     book_df = pd.read_csv(info_path)
34
+     book_df = book_df[book_df["Num-Rating"] > lb]
35
+     book_df['predict'] = book_df["ISBN"].map(predict_dict)
36
+     df = book_df.nlargest(num, "predict").reset_index(drop=True)
37
+     df["context"] = df.apply(
38
+         lambda book: f"{book['Book-Title']} ({book['Year-Of-Publication']}) - by {book['Book-Author']}", axis=1
39
+     )
40
+
41
+     return df['context'].values
42
+
43
+ st.title('Book Recommender System')
44
+
45
+ # Display a selection box with the available user IDs
46
+ user_id = st.selectbox(
47
+     'Select your ID:',
48
+     user2id.keys()
49
+ )
50
+
51
+ # Set up a button
52
+ if st.button('Recommend'):
53
+     recommendations = recommend_books(user_id)
54
+     st.write('**_Your ID:_**', user_id)
55
+     st.write(f'**_Your top {num} recommendations:_**')
56
+     for rank, rec in enumerate(recommendations):
57
+         st.write(rank + 1, ':', rec)
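recommend_books ranks every book by the selected user's column of the precomputed prediction matrix; the lookup itself is plain NumPy indexing plus a dict comprehension. A minimal sketch with a fabricated 3-book-by-2-user prediction matrix (all values invented):

import numpy as np

prediction = np.array([[4.2, 1.0],   # rows: books
                       [2.7, 9.1],   # columns: users
                       [8.8, 0.3]])
book2id = {"0195153448": 0, "0002005018": 1, "0060973129": 2}

user_idx = 0  # column of the selected user
predict = prediction[:, user_idx]
predict_dict = {isbn: round(float(predict[idx]), 2) for isbn, idx in book2id.items()}
print(sorted(predict_dict.items(), key=lambda kv: kv[1], reverse=True))
# [('0060973129', 8.8), ('0195153448', 4.2), ('0002005018', 2.7)]

In the app the same ranking is done with book_df.nlargest(num, "predict") after filtering out books with Num-Rating <= lb; the Space itself is launched with streamlit run app.py.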
dataset/books.csv ADDED
Binary file (77.8 MB).
 
dataset/ratings.csv ADDED
The diff for this file is too large to render. See raw diff
 
dataset/users.csv ADDED
The diff for this file is too large to render. See raw diff
 
preprocessing.py ADDED
@@ -0,0 +1,79 @@
1
+ import json
2
+ import yaml
3
+ import pandas as pd
4
+ import numpy as np
5
+
6
+ from pathlib import Path
7
+ from jsonargparse import ArgumentParser
8
+
9
+
10
+ def parse_args():
11
+     """Parse command-line arguments."""
12
+     parser = ArgumentParser()
13
+     parser.add_argument("--rating_path", type=str, default="./dataset/ratings.csv")
14
+     parser.add_argument("--book_path", type=str, default="./dataset/books.csv")
15
+     parser.add_argument("--out_dir", type=str, default="./processed")
16
+     parser.add_argument("--limit", type=int, default=1000)
17
+
18
+     return vars(parser.parse_args())
19
+
20
+
21
+ def main(
22
+     rating_path,
23
+     book_path,
24
+     out_dir,
25
+     limit,
26
+     **kwargs
27
+ ):
28
+     data = pd.read_csv(rating_path, delimiter=';', nrows=limit, encoding='ISO-8859-1')
29
+
30
+     # Make Y
31
+     Y = data.pivot(index='ISBN', columns='User-ID', values='Book-Rating')
32
+     Y = Y.fillna(0)
33
+     Y = Y.values
34
+
35
+     # Make R
36
+     R = np.where(Y != 0, 1, 0)
37
+
38
+     # Save Y and R as dense matrices
39
+     out_dir_path = Path(out_dir)
40
+     if out_dir_path.exists():
41
+         assert out_dir_path.is_dir()
42
+     else:
43
+         out_dir_path.mkdir(parents=True)
44
+     np.save(f'{out_dir_path}/Y.npy', Y)
45
+     np.save(f'{out_dir_path}/R.npy', R)
46
+
47
+     # Create mappings for book and user IDs
48
+     book_lst = data['ISBN'].unique()
49
+     user_lst = data['User-ID'].unique()
50
+     book_id_map = {book_id: i for i, book_id in enumerate(book_lst)}
51
+     user_id_map = {user_id: i for i, user_id in enumerate(user_lst)}
52
+     # Convert keys to JSON-compatible string types
53
+     book_id_map = {str(key): value for key, value in book_id_map.items()}
54
+     user_id_map = {str(key): value for key, value in user_id_map.items()}
55
+
56
+     # Save book_id_map to file
57
+     with open(f'{out_dir_path}/book_id_map.json', 'w') as f:
58
+         json.dump(book_id_map, f)
59
+
60
+     # Save user_id_map to file
61
+     with open(f'{out_dir_path}/user_id_map.json', 'w') as f:
62
+         json.dump(user_id_map, f)
63
+
64
+     # Get summary
65
+     function = {
66
+         "Book-Rating": "mean",
67
+         "User-ID": "count"
68
+     }
69
+
70
+     book_df = pd.read_csv(book_path, delimiter=';', encoding='ISO-8859-1', on_bad_lines='skip')
71
+     summary_rating = data.groupby("ISBN").agg(function)
72
+     summary_rating = summary_rating.rename(columns={"Book-Rating": "Mean-Rating", "User-ID": "Num-Rating"})
73
+     df = book_df.merge(summary_rating, how="left", on="ISBN")
74
+     df.drop(columns=["Image-URL-S", "Image-URL-M", "Image-URL-L"], inplace=True)
75
+     df.to_csv(f"{out_dir_path}/summary_book.csv", index=False)
76
+
77
+
78
+ if __name__ == "__main__":
79
+     main(**parse_args())
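The pivot in preprocessing.py turns the long ratings table into a dense books-by-users matrix Y, and R marks which entries hold a real rating. A toy run of those two lines (invented IDs and ratings):

import numpy as np
import pandas as pd

data = pd.DataFrame({
    "User-ID": [1, 1, 2],
    "ISBN": ["A", "B", "A"],
    "Book-Rating": [5, 0, 8],
})

Y = data.pivot(index="ISBN", columns="User-ID", values="Book-Rating").fillna(0).values
R = np.where(Y != 0, 1, 0)
print(Y)  # [[5. 8.]
          #  [0. 0.]]
print(R)  # [[1 1]
          #  [0 0]]

One consequence worth noting: because missing entries are filled with 0 before R is built, an explicit rating of 0 (which this dataset uses for implicit feedback) becomes indistinguishable from "never rated" and is excluded from training by R.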
processed/R.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e1255c1daea3561d8e326acb7271127549923abed46da5de0e092a8664b227f
3
+ size 1293760
processed/Y.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1701a14103e9b7259e27b06c7eb9e0b71af75078e0eef6b2e4b6c163f281f7ee
3
+ size 1293760
processed/book_id_map.json ADDED
@@ -0,0 +1 @@
1
+ {"034545104X": 0, "0155061224": 1, "0446520802": 2, "052165615X": 3, "0521795028": 4, "2080674722": 5, "3257224281": 6, "0600570967": 7, "038550120X": 8, "342310538": 9, "0425115801": 10, "0449006522": 11, "0553561618": 12, "055356451X": 13, "0786013990": 14, "0786014512": 15, "0060517794": 16, "0451192001": 17, "0609801279": 18, "0671537458": 19, "0679776818": 20, "0943066433": 21, "1570231028": 22, "1885408226": 23, "0747558167": 24, "3442437407": 25, "033390804X": 26, "3596218098": 27, "0684867621": 28, "0451166892": 29, "8440682697": 30, "034544003X": 31, "0380000059": 32, "0380711524": 33, "0451167317": 34, "0451454952": 35, "0843920262": 36, "3404122879": 37, "3404182928": 38, "3404611306": 39, "342662429": 40, "3426690179": 41, "3442424216": 42, "3442425573": 43, "3453092007": 44, "3453157745": 45, "3453176944": 46, "3453185137": 47, "3453185323": 48, "3453213025": 49, "3453877241": 50, "3492226604": 51, "3517017442": 52, "3596125006": 53, "B0000BLD7X": 54, "N3453124715": 55, "9029716894": 56, "9057868059": 57, "0140279091": 58, "0553572369": 59, "0571058086": 60, "3499230933": 61, "3596151465": 62, "0099543818": 63, "3404147723": 64, "3423111321": 65, "3442136644": 66, "3492232000": 67, "8434811634": 68, "8484330478": 69, "8484332039": 70, "2864322102": 71, "8402065945": 72, "8423314901": 73, "842333533X": 74, "8427911769": 75, "8433914456": 76, "8437606322": 77, "8445072919": 78, "8466300821": 79, "847765011X": 80, "8478442588": 81, "8495368099": 82, "0345443683": 83, "043935806X": 84, "055310666X": 85, "0330332775": 86, "0330367358": 87, "0006379702": 88, "3423084049": 89, "3442131340": 90, "3446202102": 91, "3453073398": 92, "3453115783": 93, "3499134004": 94, "349915398X": 95, "3548603203": 96, "3764501383": 97, "3791535889": 98, "0061054143": 99, "0061054151": 100, "0061056774": 101, "0671021346": 102, "0671024108": 103, "1562827898": 104, "9726954835": 105, "0064405176": 106, "0439104769": 107, "0440498058": 108, "8807817144": 109, "8845915913": 110, "0395547032": 111, "0440414121": 112, "8879839993": 113, "8401328322": 114, "8401461189": 115, "8401471796": 116, "8423996565": 117, "8424130081": 118, "8426449476": 119, "8426449573": 120, "8478884831": 121, "8478885218": 122, "8478885463": 123, "8478886044": 124, "8495618052": 125, "0091830893": 126, "0586207414": 127, "0812571029": 128, "0671749609": 129, "0140062718": 130, "0140260498": 131, "0060096195": 132, "0141310340": 133, "0142302198": 134, "0156006065": 135, "0375821813": 136, "038076041X": 137, "0439087597": 138, "0439401399": 139, "0552546933": 140, "0689804458": 141, "0699854289": 142, "0786812508": 143, "0786817070": 144, "0805057706": 145, "1551925060": 146, "1573248533": 147, "000651118X": 148, "0385272324": 149, "2038701962": 150, "2070264564": 151, "2070334430": 152, "2070403734": 153, "2080680676": 154, "2232122263": 155, "2253044903": 156, "2253150711": 157, "2266076000": 158, "2277221678": 159, "2290321028": 160, "2842190009": 161, "0440225825": 162, "0316781266": 163, "0345446860": 164, "0671876244": 165, "3125785006": 166, "0380005239": 167, "1593080255": 168, "0330201700": 169, "0385729340": 170, "3809407536": 171, "0446364193": 172, "3257200552": 173, "3379015180": 174, "3404145909": 175, "3404148576": 176, "3404921178": 177, "3423071516": 178, "3423204885": 179, "3423205806": 180, "3426029553": 181, "3426622610": 182, "3426671298": 183, "344215121X": 184, "3442413508": 185, "3442422035": 186, "3442435773": 187, "3442437717": 188, "3442441080": 189, "3442442796": 190, "3442444020": 191, "3442446414": 
192, "3442448530": 193, "3442449820": 194, "3453137442": 195, "3453870190": 196, "3455077331": 197, "347354034X": 198, "3492231322": 199, "349912176X": 200, "3499222213": 201, "3499228297": 202, "3499232529": 203, "3499233436": 204, "3499264528": 205, "3499433443": 206, "3506464078": 207, "3548602967": 208, "3551551677": 209, "3551551685": 210, "3551551693": 211, "3551551936": 212, "359621078X": 213, "3608932240": 214, "360893541X": 215, "3608935428": 216, "3608935436": 217, "374661922X": 218, "3770131495": 219, "3809024589": 220, "3821815191": 221, "3932069234": 222, "3423100424": 223, "3442096596": 224, "3442440777": 225, "3453009304": 226, "3453042905": 227, "3453061187": 228, "3453071174": 229, "3453127013": 230, "3453211014": 231, "3462026062": 232, "349926028X": 233, "3596122279": 234, "3596287200": 235, "3922524443": 236, "0449217264": 237, "0140621741": 238, "055321358X": 239, "8420457477": 240, "8448034023": 241, "0671034944": 242, "074931012X": 243, "0843946415": 244, "2010173929": 245, "20103389": 246, "2012003494": 247, "2013218826": 248, "2013220162": 249, "207033015X": 250, "2070332985": 251, "2203142278": 252, "2205040561": 253, "221096900X": 254, "2253005274": 255, "2723402983": 256, "2747002748": 257, "2800108584": 258, "2800134259": 259, "2800134267": 260, "2800135522": 261, "2800135565": 262, "2800135719": 263, "2800135727": 264, "2803616998": 265, "2907572458": 266, "8439598459": 267, "014014899X": 268, "0140252517": 269, "0140269967": 270, "0140328742": 271, "0140366830": 272, "0140367446": 273, "0425131378": 274, "0517642689": 275, "0600571165": 276, "077104450X": 277, "0789706032": 278, "0836218833": 279, "0836220889": 280, "088365721X": 281, "1559712252": 282, "1567616089": 283, "2890510328": 284, "3257227264": 285, "0141011904": 286, "342313075X": 287, "3423201509": 288, "3423206616": 289, "344242529": 290, "3446200452": 291, "349223903X": 292, "3499231603": 293, "3704320196": 294, "3886807843": 295, "0345423402": 296, "1569312435": 297, "1892213141": 298, "8530805461": 299, "3257218516": 300, "3404126343": 301, "3404131606": 302, "3404139178": 303, "3404143299": 304, "3404614542": 305, "340645724X": 306, "3426192543": 307, "3426702266": 308, "3442421357": 309, "3442443806": 310, "345309221X": 311, "3453108361": 312, "3453126912": 313, "3453130901": 314, "3453132262": 315, "3596129389": 316, "3596237874": 317, "3596247500": 318, "3809410357": 319, "0345249372": 320, "044020562X": 321, "9026935722": 322, "0425167097": 323, "042518109X": 324, "0425188221": 325, "8500010452": 326, "8501023450": 327, "8571648972": 328, "8588615126": 329, "8589885291": 330, "009975181X": 331, "00273755": 332, "014366020444": 333, "0688172377": 334, "8481305464": 335, "8496075850": 336, "8804321008": 337, "8804375299": 338, "8804407808": 339, "8804464895": 340, "8804510579": 341, "8804512652": 342, "8806144146": 343, "8806155873": 344, "8807700735": 345, "8807806746": 346, "8807810751": 347, "880781112X": 348, "8807812495": 349, "8807813157": 350, "8807816059": 351, "8817106100": 352, "8817112917": 353, "8817877028": 354, "8820024381": 355, "883041915X": 356, "8831760122": 357, "8833908488": 358, "8838910987": 359, "8842806978": 360, "8845219747": 361, "8845249689": 362, "8846200624": 363, "88741800047": 364, "8876846565": 365, "8877825200": 366, "8879285513": 367, "8879285645": 368, "8879832905": 369, "8881110288": 370, "888634712X": 371, "8887432252": 372, "8888424121": 373, "0449210197": 374, "8817151068": 375, "0151446474": 376, "8433967606": 377, "8496280012": 378, "899792145": 379, 
"B158991965": 380, "0064430227": 381, "0671723650": 382, "0812533550": 383, "8806162160": 384, "884590184X": 385, "0446613843": 386, "0140270272": 387, "0440241537": 388, "0099460343": 389, "0375901582": 390, "0439317746": 391, "0440228840": 392, "0738205737": 393, "1566911605": 394, "0060542845": 395, "0449148831": 396, "1551666308": 397, "8420430943": 398, "8420636282": 399, "8432205311": 400, "000225669X": 401, "0099549611": 402, "0701162767": 403, "1852422580": 404, "042513976X": 405, "0441008291": 406, "0006511929": 407, "002542730X": 408, "0060520507": 409, "0060930934": 410, "0060951303": 411, "0099414732": 412, "0140154078": 413, "0140327592": 414, "0140367616": 415, "0141181222": 416, "0151010633": 417, "0192834312": 418, "0194216748": 419, "0240514866": 420, "0316666343": 421, "0345391810": 422, "0375400699": 423, "0385504209": 424, "043527242X": 425, "0439284031": 426, "0439286182": 427, "0439286239": 428, "0449221148": 429, "0451527747": 430, "0486282112": 431, "0486424499": 432, "0553275283": 433, "0582530431": 434, "0590502123": 435, "0595132189": 436, "0613329740": 437, "063403541X": 438, "067172939X": 439, "0671729438": 440, "0671746502": 441, "0679721851": 442, "0679745580": 443, "0691000980": 444, "0747545111": 445, "0749931434": 446, "0754000117": 447, "0804106304": 448, "0812583566": 449, "0843128240": 450, "0879517344": 451, "0971880107": 452, "1400001625": 453, "1400002672": 454, "1853262404": 455, "2061007074": 456, "2266095536": 457, "2290309494": 458, "2742739351": 459, "3522149904": 460, "3775713328": 461, "8401009421": 462, "8401327199": 463, "8401462231": 464, "840149236X": 465, "8408011200": 466, "8420427462": 467, "8420432113": 468, "8420444367": 469, "8420789895": 470, "8422655500": 471, "8422657104": 472, "842265783X": 473, "8422693445": 474, "8423325105": 475, "8423662152": 476, "8423951537": 477, "8423970647": 478, "8423976645": 479, "8426109799": 480, "8427007450": 481, "8429712372": 482, "8429714936": 483, "842975295": 484, "8429753419": 485, "8432227706": 486, "8434830809": 487, "8434840391": 488, "8440627203": 489, "8440696833": 490, "8445071572": 491, "8447306194": 492, "8447312054": 493, "8449416078": 494, "8472453723": 495, "8473068971": 496, "8474104823": 497, "8474263123": 498, "8475071163": 499, "8475961290": 500, "8476722338": 501, "8478809783": 502, "8482180088": 503, "8483221306": 504, "8489163499": 505, "8495501090": 506, "8495501198": 507, "8496077152": 508, "8496077209": 509, "8496246140": 510, "8497931467": 511, "950491036X": 512, "9508521481": 513, "9871138016": 514, "0460010239": 515, "0749336145": 516, "1899344705": 517, "0439135494": 518, "059030271X": 519, "0590453661": 520, "0590470108": 521, "0590483404": 522, "0671021354": 523, "0671026283": 524, "0671027506": 525, "0671727109": 526, "0060964049": 527, "0380807343": 528, "0439064864": 529, "2.02.032126.2": 530, "2.264.03602.8": 531, "2020058863": 532, "2020062399": 533, "2020101653": 534, "202011528X": 535, "20202006935": 536, "20203119888": 537, "2020386666": 538, "2020564777": 539, "2020591944": 540, "2070365832": 541, "2070378411": 542, "2070386023": 543, "2070394956": 544, "2070404587": 545, "2070404706": 546, "2070404722": 547, "2070406962": 548, "2070408450": 549, "2070425770": 550, "2070725804": 551, "2080680692": 552, "2226070109": 553, "2226126570": 554, "2226135022": 555, "2253030570": 556, "2253049417": 557, "2253050407": 558, "2253055972": 559, "225307659X": 560, "2253171670": 561, "2253172367": 562, "2264010991": 563, "2264013257": 564, "2264018194": 565, "2264024674": 
566, "226402593X": 567, "2264027568": 568, "2264029463": 569, "226403114": 570, "2264031158": 571, "2264033282": 572, "226403601X": 573, "2264036036": 574, "2266040820": 575, "2266047280": 576, "226604960": 577, "22660861003": 578, "2266096451": 579, "2266102028": 580, "2266102621": 581, "2266104535": 582, "2266105698": 583, "2266121367": 584, "2277302228": 585, "2290303488": 586, "2290308285": 587, "2290312924": 588, "2290315524": 589, "2290321559": 590, "2702424131": 591, "2742724028": 592, "2742724613": 593, "2742729038": 594, "2743602295": 595, "2841469824": 596, "2842611462": 597, "2868696627": 598, "2869304129": 599, "2869304560": 600, "2869304870": 601, "2869305583": 602, "2895400644": 603, "2907572121": 604, "290757213X": 605, "1566190096": 606, "0749317256": 607, "3453150538": 608, "3492224628": 609, "3492224768": 610, "3596147700": 611, "0060011939": 612, "0099283697": 613, "0140187758": 614, "0316154601": 615, "059035342X": 616, "0713628944": 617, "0752844040": 618, "1853260665": 619, "2070414256": 620, "2070419657": 621, "2253063339": 622, "2253137243": 623, "2253144452": 624, "2253152072": 625, "2264027134": 626, "226612269X": 627, "2290318329": 628, "2702400612": 629, "2702401694": 630, "2878580753": 631, "8408040383": 632, "8423310353": 633, "844140321X": 634, "8484601072": 635, "0330373269": 636, "8882461327": 637, "349202436X": 638, "8807813858": 639, "0140182551": 640, "0446310786": 641, "0886776783": 642, "3257008155": 643, "329300301X": 644, "3419528078": 645, "3423118709": 646, "3440054594": 647, "3442723078": 648, "3453034120": 649, "3480204015": 650, "3492238882": 651, "3499121808": 652, "3499153629": 653, "3502513333": 654, "3502517371": 655, "3502551685": 656, "3596119502": 657, "3596221234": 658, "3596850185": 659, "3772402542": 660, "3772420176": 661, "3794170180": 662, "3806852456": 663, "3808572612": 664, "3821812028": 665, "3922708072": 666, "7321578936": 667, "0312966970": 668, "680ISBN359623": 669, "0340818182": 670, "0061096261": 671, "0312169817": 672, "0312864590": 673, "0345348664": 674, "0345348672": 675, "0345354621": 676, "0345385764": 677, "0345435036": 678, "0385264356": 679, "0425104273": 680, "0425172546": 681, "0440218667": 682, "0440220602": 683, "0446522856": 684, "0451180054": 685, "0451198514": 686, "0515131520": 687, "0517093715": 688, "0553266306": 689, "0765342987": 690, "0812550153": 691, "0812550307": 692, "0843949163": 693, "0886773741": 694, "0886774802": 695, "0886774829": 696, "0886775426": 697, "0886775957": 698, "0886777178": 699, "1572971835": 700, "9722900684": 701, "3499221489": 702, "2422614189": 703, "0345425596": 704, "0590259970": 705, "0590260251": 706, "0590417827": 707, "0590426702": 708, "0590436422": 709, "0590436449": 710, "0590436457": 711, "0590436465": 712, "0590436481": 713, "059043649X": 714, "0590436503": 715, "0590442589": 716, "0590442988": 717, "0590442996": 718, "0590448234": 719, "0590448595": 720, "0590456458": 721, "0590456504": 722, "0590470485": 723, "0590483056": 724, "0060930187": 725, "0375760911": 726, "0689817851": 727, "0874869870": 728, "0061081450": 729, "0061083259": 730, "0061087017": 731, "0061094404": 732, "0312925689": 733, "0440295653": 734, "0671793489": 735, "0684845768": 736, "0701169176": 737, "0312244266": 738, "880701601X": 739, "8817134899": 740, "0099268345": 741, "3100970616": 742, "3257062354": 743, "3257228317": 744, "3423128879": 745, "3423202327": 746, "342677609X": 747, "3442054753": 748, "3442727073": 749, "3453132041": 750, "3478387507": 751, "3492045170": 752, "3492230814": 
753, "3499101505": 754, "3545202461": 755, "3548359698": 756, "3550075359": 757, "3596154766": 758, "3596214629": 759, "0671011367": 760, "0618045996": 761, "3822858617": 762, "0864425589": 763, "8817860751": 764, "0452282101": 765, "0671025368": 766, "3257060580": 767, "0553581112": 768, "0805047379": 769, "0892964456": 770, "0373250223": 771, "0451097009": 772, "0451179994": 773, "9681500830": 774, "8432087653": 775, "0553140779": 776, "0425182150": 777, "1883473004": 778, "0061007129": 779, "0061000027": 780, "0812511816": 781, "0833531654": 782, "0880381736": 783, "0880381744": 784, "0722536283": 785, "0060505885": 786, "0061097101": 787, "0299164942": 788, "0312283709": 789, "0312983271": 790, "0380731851": 791, "0446605484": 792, "0446611212": 793, "0451188454": 794, "0451207955": 795, "0609804138": 796, "0671003755": 797, "067104754X": 798, "0743407067": 799, "074343627X": 800, "0786013230": 801, "0812509560": 802, "3426615355": 803, "3442435838": 804, "3442455707": 805, "0075536498": 806, "0099287692": 807, "0099845008": 808, "0330262130": 809, "0385720920": 810, "0393319296": 811, "0553262505": 812, "06514251": 813, "0805062971": 814, "3257228007": 815, "3442430496": 816, "3442446325": 817, "3453171500": 818, "3464371506": 819, "9513098648": 820, "0590108395": 821, "3442451353": 822, "8425330866": 823, "8481301213": 824, "0460905589": 825, "0816704627": 826, "1573229571": 827, "0060595183": 828, "9782922145441": 829, "2830207904": 830, "3499263998": 831, "0434009407": 832, "1841193887": 833, "0375700668": 834, "457871971": 835, "840149768X": 836, "8401499917": 837, "8402007287": 838, "8420600369": 839, "8420603066": 840, "8423918335": 841, "8426105084": 842, "8426429807": 843, "843223138X": 844, "8437608570": 845, "8440630921": 846, "8440630922": 847, "8470394126": 848, "8471662531": 849, "8472230082": 850, "8474541913": 851, "8478091351": 852, "3426193310": 853, "0064404773": 854, "0064407667": 855, "0552545228": 856, "006054094X": 857, "0375706038": 858, "081297106X": 859, "0843951826": 860, "1400032628": 861, "1585861553": 862, "0066210151": 863, "0385334141": 864, "0425191184": 865, "0671027343": 866, "0751503894": 867, "3714500799": 868, "0385503822": 869, "3548208975": 870, "0399138684": 871, "0425189864": 872, "0440236053": 873, "0440241073": 874, "0553586122": 875, "0099935708": 876, "0140118608": 877, "0552998249": 878, "3518408127": 879, "1586609726": 880, "0865472807": 881, "0312960344": 882, "055358068X": 883, "0792270142": 884, "0688174590": 885, "031286504X": 886, "0345342968": 887, "0375756981": 888, "0553382411": 889, "0671027662": 890, "0767903382": 891, "3785527195": 892, "1558744592": 893, "0060002484": 894, "0060094117": 895, "0312253397": 896, "0312331754": 897, "0312874243": 898, "0312979517": 899, "0316152196": 900, "0316154059": 901, "0316287555": 902, "034541389X": 903, "0345450175": 904, "0345452550": 905, "0375727981": 906, "0380975017": 907, "0385305389": 908, "0393045390": 909, "0394543289": 910, "0399135804": 911, "0399138188": 912, "0399141340": 913, "0399146466": 914, "0399146504": 915, "0399146687": 916, "0399147101": 917, "0399147144": 918, "0399147322": 919, "0399147624": 920, "0399148337": 921, "0399148450": 922, "0399148639": 923, "0399148728": 924, "0399149783": 925, "0399150811": 926, "0399150870": 927, "0399151451": 928, "0399151478": 929, "039915177X": 930, "0399151885": 931, "0425183181": 932, "0440111323": 933, "0440122147": 934, "0440221463": 935, "0441005470": 936, "0446519480": 937, "0449221512": 938, "055358295X": 939, "0670894184": 
940, "0671024094": 941, "0679450408": 942, "0684801663": 943, "0684846608": 944, "0684871726": 945, "0743201604": 946, "074320607X": 947, "0743407377": 948, "0743486226": 949, "0804109990": 950, "0812571118": 951, "0969691319": 952, "8475251471": 953, "0380724987": 954, "0380726246": 955, "0380816059": 956, "0399139419": 957, "0425175405": 958, "0425182932": 959, "0440225701": 960, "0446602485": 961, "0446603406": 962, "0451091949": 963, "0553095439": 964, "0553227041": 965, "0553263226": 966, "0553295098": 967, "0553564994": 968, "067102423X": 969, "0671032658": 970, "0671653849": 971, "0722509049": 972, "0812500067": 973, "0812516001": 974, "0812568710": 975, "0843921609": 976, "0890875588": 977, "1557730091": 978, "1558172882": 979, "0425178102": 980, "0446609404": 981, "0446610038": 982, "0451178017": 983, "0553211056": 984, "0553285920": 985}
processed/summary_book.csv ADDED
The diff for this file is too large to render. See raw diff
 
processed/user_id_map.json ADDED
@@ -0,0 +1 @@
1
+ {"276725": 0, "276726": 1, "276727": 2, "276729": 3, "276733": 4, "276736": 5, "276737": 6, "276744": 7, "276745": 8, "276746": 9, "276747": 10, "276748": 11, "276751": 12, "276754": 13, "276755": 14, "276760": 15, "276762": 16, "276765": 17, "276768": 18, "276772": 19, "276774": 20, "276780": 21, "276786": 22, "276788": 23, "276796": 24, "276798": 25, "276800": 26, "276803": 27, "276804": 28, "276806": 29, "276808": 30, "276811": 31, "276812": 32, "276813": 33, "276814": 34, "276817": 35, "276820": 36, "276822": 37, "276827": 38, "276828": 39, "276830": 40, "276832": 41, "276833": 42, "276835": 43, "276837": 44, "276838": 45, "276840": 46, "276842": 47, "276847": 48, "276848": 49, "276850": 50, "276852": 51, "276853": 52, "276854": 53, "276856": 54, "276857": 55, "276859": 56, "276861": 57, "276862": 58, "276863": 59, "276866": 60, "276869": 61, "276870": 62, "276872": 63, "276873": 64, "276875": 65, "276878": 66, "276879": 67, "276884": 68, "276887": 69, "276888": 70, "276889": 71, "276890": 72, "276896": 73, "276904": 74, "276905": 75, "276911": 76, "276912": 77, "276915": 78, "276916": 79, "276925": 80, "276927": 81, "276928": 82, "276929": 83, "276934": 84, "276936": 85, "276939": 86, "276943": 87, "276946": 88, "276949": 89, "276950": 90, "276953": 91, "276954": 92, "276957": 93, "276959": 94, "276963": 95, "276964": 96, "276965": 97, "276975": 98, "276981": 99, "276984": 100, "276986": 101, "276988": 102, "276989": 103, "276990": 104, "276992": 105, "276994": 106, "276997": 107, "276998": 108, "277002": 109, "277007": 110, "277009": 111, "277010": 112, "277012": 113, "277018": 114, "277019": 115, "277022": 116, "277023": 117, "277028": 118, "277031": 119, "277032": 120, "277035": 121, "277036": 122, "277040": 123, "277042": 124, "277048": 125, "277051": 126, "277052": 127, "277053": 128, "277056": 129, "277058": 130, "277064": 131, "277065": 132, "277072": 133, "277073": 134, "277074": 135, "277075": 136, "277079": 137, "277085": 138, "277087": 139, "277090": 140, "277094": 141, "277096": 142, "277102": 143, "277107": 144, "277109": 145, "277114": 146, "277116": 147, "277123": 148, "277124": 149, "277128": 150, "277129": 151, "277134": 152, "277135": 153, "277139": 154, "277142": 155, "277143": 156, "277149": 157, "277155": 158, "277157": 159, "277159": 160, "277165": 161, "277168": 162, "277170": 163}
recommend.py ADDED
@@ -0,0 +1,55 @@
1
+ import json
2
+ import numpy as np
3
+ import pandas as pd
4
+
5
+ from jsonargparse import ArgumentParser
6
+
7
+
8
+ def parse_args():
9
+     """Parse command-line arguments."""
10
+     parser = ArgumentParser()
11
+     parser.add_argument("--data_dir", type=str, default="./processed")
12
+     parser.add_argument("--weight_dir", type=str, default="./weight")
13
+     parser.add_argument("--info_path", type=str, default="./processed/summary_book.csv")
14
+     parser.add_argument("--user_id", default="276729")
15
+     parser.add_argument("--num", type=int, default=10)
16
+     parser.add_argument("--lb", type=int, default=0)
17
+
18
+     return vars(parser.parse_args())
19
+
20
+ def main(
21
+     data_dir,
22
+     weight_dir,
23
+     info_path,
24
+     user_id,
25
+     num,
26
+     lb,
27
+     **kwargs
28
+ ):
29
+     # Load R matrix from file
30
+     R = np.load(f'{data_dir}/R.npy', allow_pickle=True)
31
+     # Load prediction
32
+     prediction = np.load(f'{weight_dir}/predicted.npy', allow_pickle=True)
33
+     # Load dictionaries from JSON files
34
+     with open(f'{data_dir}/user_id_map.json', 'r') as file:
35
+         user2id = json.load(file)
36
+     with open(f'{data_dir}/book_id_map.json', 'r') as file:
37
+         book2id = json.load(file)
38
+
39
+     # Recommend
40
+     user_idx = user2id[str(user_id)]
41
+     predict = prediction[:, user_idx]  # get prediction for user
42
+     predict_dict = {book: np.round(predict[idx], 2) for book, idx in book2id.items()}
43
+     # Load information about the books
44
+     book_df = pd.read_csv(info_path)
45
+     book_df = book_df[book_df["Num-Rating"] > lb]
46
+     book_df['predict'] = book_df["ISBN"].map(predict_dict)
47
+     recommendations = book_df.nlargest(num, "predict").reset_index(drop=True)
48
+     recommendations["context"] = recommendations.apply(
49
+         lambda book: f"{book['Book-Title']} ({book['Year-Of-Publication']}) - by {book['Book-Author']}", axis=1
50
+     )
51
+     print(recommendations)
52
+
53
+
54
+ if __name__ == "__main__":
55
+     main(**parse_args())
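With the defaults above, a typical invocation is python recommend.py --data_dir ./processed --weight_dir ./weight --info_path ./processed/summary_book.csv --user_id 276729 --num 10 --lb 0; since every argument carries a default, any of them can be omitted. The user ID must be a key of user_id_map.json, i.e. a user that appeared in the preprocessed ratings.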
requirements.txt ADDED
@@ -0,0 +1,51 @@
1
+ altair==4.2.2
2
+ attrs==23.1.0
3
+ backports.zoneinfo==0.2.1
4
+ blinker==1.6.2
5
+ cachetools==5.3.0
6
+ certifi==2023.5.7
7
+ charset-normalizer==3.1.0
8
+ click==8.1.3
9
+ decorator==5.1.1
10
+ entrypoints==0.4
11
+ gitdb==4.0.10
12
+ GitPython==3.1.31
13
+ idna==3.4
14
+ importlib-metadata==6.6.0
15
+ importlib-resources==5.12.0
16
+ Jinja2==3.1.2
17
+ jsonargparse==4.21.1
18
+ jsonschema==4.17.3
19
+ markdown-it-py==2.2.0
20
+ MarkupSafe==2.1.2
21
+ mdurl==0.1.2
22
+ numpy==1.24.3
23
+ packaging==23.1
24
+ pandas==2.0.1
25
+ Pillow==9.5.0
26
+ pkgutil-resolve-name==1.3.10
27
+ protobuf==3.20.3
28
+ pyarrow==12.0.0
29
+ pydeck==0.8.1b0
30
+ Pygments==2.15.1
31
+ Pympler==1.0.1
32
+ pyrsistent==0.19.3
33
+ python-dateutil==2.8.2
34
+ pytz==2023.3
35
+ PyYAML==6.0
36
+ requests==2.31.0
37
+ rich==13.3.5
38
+ six==1.16.0
39
+ smmap==5.0.0
40
+ streamlit==1.22.0
41
+ tenacity==8.2.2
42
+ toml==0.10.2
43
+ toolz==0.12.0
44
+ tornado==6.3.2
45
+ typing-extensions==4.6.2
46
+ tzdata==2023.3
47
+ tzlocal==5.0.1
48
+ urllib3==2.0.2
49
+ validators==0.20.0
50
+ watchdog==3.0.0
51
+ zipp==3.15.0
train.py ADDED
@@ -0,0 +1,86 @@
1
+ import numpy as np
2
+ import tensorflow as tf
3
+
4
+ from tensorflow import keras
5
+ from pathlib import Path
6
+ from jsonargparse import ArgumentParser
7
+ from utils_c import normalize, cost_function
8
+
9
+
10
+ def parse_args():
11
+     """Parse command-line arguments."""
12
+     parser = ArgumentParser()
13
+     parser.add_argument("--data_dir", type=str, default="./processed")
14
+     parser.add_argument("--out_dir", type=str, default="./weight")
15
+     parser.add_argument("--num_features", type=int, default=10)
16
+     parser.add_argument("--num_iterators", type=int, default=200)
17
+     parser.add_argument("--learning_rate", type=float, default=1e-1)
18
+     parser.add_argument("--lambda_", type=float, default=2.0)
19
+     parser.add_argument("--seed", type=int, default=1234)
20
+     parser.add_argument("--freq", type=int, default=20)
21
+
22
+     return vars(parser.parse_args())
23
+
24
+ def main(
25
+     data_dir,
26
+     out_dir,
27
+     num_features,
28
+     num_iterators,
29
+     learning_rate,
30
+     lambda_,
31
+     seed,
32
+     freq
33
+ ):
34
+     # Load R matrix from file
35
+     R = np.load(f'{data_dir}/R.npy', allow_pickle=True)
36
+     # Load Y matrix from file
37
+     Y = np.load(f'{data_dir}/Y.npy', allow_pickle=True)
38
+     # Normalize the dataset
39
+     Y_norm, Y_mean = normalize(Y, R)
40
+
41
+     num_books, num_users = Y.shape
42
+     # Set initial parameters (W, X); use tf.Variable to track these variables
43
+     tf.random.set_seed(seed)  # for consistent results
44
+
45
+     W = tf.Variable(tf.random.normal((num_users, num_features), dtype=tf.float64), name='W')
46
+     X = tf.Variable(tf.random.normal((num_books, num_features), dtype=tf.float64), name='X')
47
+     b = tf.Variable(tf.random.normal((1, num_users), dtype=tf.float64), name='b')
48
+
49
+     # Instantiate an optimizer.
50
+     optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
51
+     for step in range(num_iterators):
52
+         # Use TensorFlow's GradientTape
53
+         # to record the operations used to compute the cost
54
+         with tf.GradientTape() as tape:
55
+             # Compute the cost (forward pass included in cost)
56
+             cost_value = cost_function(X, W, b, Y_norm, R, lambda_)
57
+
58
+         # Use the gradient tape to automatically retrieve
59
+         # the gradients of the trainable variables with respect to the loss
60
+         grads = tape.gradient(cost_value, [X, W, b])
61
+
62
+         # Run one step of gradient descent by updating
63
+         # the value of the variables to minimize the loss.
64
+         optimizer.apply_gradients(zip(grads, [X, W, b]))
65
+
66
+         # Log periodically.
67
+         if step % freq == 0:
68
+             print(f"Training loss at iteration {step}: {cost_value:0.1f}")
69
+
70
+     predict = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()
71
+     predict = predict + Y_mean
72
+
73
+     # Save weights
74
+     out_dir = Path(out_dir)
75
+     if out_dir.exists():
76
+         assert out_dir.is_dir()
77
+     else:
78
+         out_dir.mkdir(parents=True)
79
+     np.save(f'{out_dir}/W.npy', W)
80
+     np.save(f'{out_dir}/X.npy', X)
81
+     np.save(f'{out_dir}/b.npy', b)
82
+     np.save(f'{out_dir}/predicted.npy', predict)
83
+
84
+
85
+ if __name__ == "__main__":
86
+     main(**parse_args())
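train.py factorizes the mean-normalized matrix, so the per-book mean has to be added back at prediction time. A small numeric sketch of that round trip, mirroring normalize() from utils_c.py on toy values:

import numpy as np

Y = np.array([[4.0, 6.0],
              [0.0, 0.0]])
R = np.array([[1, 1],
              [0, 0]])

# Same computation as normalize() in utils_c.py
Y_mean = (np.sum(Y * R, axis=1) / (np.sum(R, axis=1) + 1e-12)).reshape(-1, 1)
Y_norm = Y - Y_mean * R
print(Y_mean.ravel())  # [5. 0.] -> per-book mean over rated entries only
print(Y_norm[0])       # [-1. 1.] -> rated entries become deviations from the mean

# After training, X @ W.T + b approximates Y_norm, so the script computes
# predict = X @ W.T + b + Y_mean to return to the original rating scale.

The mean-normalization also gives a sensible default for users with no ratings: their raw prediction is near zero, so adding Y_mean back recommends each book at roughly its average rating.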
utils_c.py ADDED
@@ -0,0 +1,40 @@
1
+ import numpy as np
2
+ import tensorflow as tf
3
+
4
+
5
+ def normalize(Y, R):
6
+     """
7
+     Preprocess data by subtracting the mean rating for every book (every row),
8
+     using only real ratings, i.e. entries with R(i,j) = 1.
9
+
10
+     [Y_norm, Y_mean] = normalize(Y, R) normalizes Y so that each book
11
+     has an average rating of 0. Unrated books then sit at the mean rating (0).
12
+
13
+     Returns the per-book mean rating in Y_mean.
14
+     """
15
+     Y_mean = (np.sum(Y * R, axis=1) / (np.sum(R, axis=1) + 1e-12)).reshape(-1, 1)
16
+     Y_norm = Y - np.multiply(Y_mean, R)
17
+
18
+     return Y_norm, Y_mean
19
+
20
+ def cost_function(X, W, b, Y, R, lambda_):
21
+     """
22
+     Returns the cost for collaborative filtering.
23
+     Vectorized for speed; uses TensorFlow operations to be compatible with a custom training loop.
24
+
25
+     Args:
26
+         X (ndarray (num_books, num_features)): matrix of item features
27
+         W (ndarray (num_users, num_features)): matrix of user parameters
28
+         b (ndarray (1, num_users)): vector of user biases
29
+         Y (ndarray (num_books, num_users)): matrix of user ratings of books
30
+         R (ndarray (num_books, num_users)): matrix where R(i, j) = 1 if the i-th book was rated by the j-th user
31
+         lambda_ (float): regularization parameter
32
+
33
+     Returns:
34
+         J (float): cost
35
+     """
36
+
37
+     j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y) * R
38
+     J = 0.5 * tf.reduce_sum(j ** 2) + (lambda_ / 2) * (tf.reduce_sum(X ** 2) + tf.reduce_sum(W ** 2))
39
+
40
+     return J
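In equation form, cost_function computes the standard regularized collaborative-filtering objective, where multiplying by R restricts the sum to observed ratings r(i,j) = 1:

J = \frac{1}{2} \sum_{(i,j)\,:\,r(i,j)=1} \left( \mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b_j - y^{(i,j)} \right)^2 + \frac{\lambda}{2} \left( \sum_{j,k} \bigl( w_k^{(j)} \bigr)^2 + \sum_{i,k} \bigl( x_k^{(i)} \bigr)^2 \right)

Note that the user bias b_j is left out of the regularization term (biases are conventionally not regularized), which matches the code above.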
weight/W.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8df3995b7b6243c4b68b3cecabb10414d982c1cf1baf4533e6c8b8fadd3dc751
3
+ size 13248
weight/X.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9f0e114c605d0b60edbf581ae35380ec3f1a1271f280d3f17c415a52828358f
3
+ size 79008
weight/b.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d69013d3a83302293d5e37e4fbbb2dd026297f6499cd71d9501fc91adc0d817f
3
+ size 1440
weight/predicted.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d02e08da558c5ad31c1eb64d15a6227b570ef73c4d0597d5ab49a5aa7f0310f
3
+ size 1293760