Igor Santana committed
Commit: 9c58361
1 Parent(s): c9be590

RNN model sent from GitHub to Hugging Face

.editorconfig ADDED
@@ -0,0 +1,12 @@
+ # top-most EditorConfig file
+ root = true
+
+ # Unix-style newlines with a newline ending every file
+ [*]
+ end_of_line = lf
+ insert_final_newline = true
+
+ # 4-wide tab indentation for Python files
+ [*.py]
+ indent_style = tab
+ indent_size = 4
.gitignore ADDED
@@ -0,0 +1,12 @@
+ dataset/*
+ tmp/*
+ **/*.pyc
+ **/*.cpython-37.pyc
+ .ipynb_checkpoints/*
+ .history
+ .vscode
+ tmp
+ project/data/__pycache__/*.pyc
+ project/evaluation/__pycache__/*.pyc
+ project/recsys/__pycache__/*.pyc
+ project/__pycache__/*.pyc
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2020 Igor André P. Santana
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,83 @@
  ---
- license: unlicense
- ---
+ # RNN Embeddings
+
+ ## Jointly learning music embeddings with Recurrent Neural Networks
+
+ This repository contains all the code I wrote during my master's at the State University of Maringá. I do not intend to add new features to this project, as I will not be continuing it in a PhD. To better understand the goal of this project, here is a quote from my thesis that summarizes what I did:
+
+ > This work's goal is to use Recurrent Neural Networks to acquire contextual information for each song, given the sequence of songs that each user has listened to using embeddings.
+
+
+ If you have any doubts about the code, or want to use it in your project, let me know! I will be glad to help you with anything you need.
+
+ ### Installation and Setup
+
+ As this code was written in Python, I highly recommend using [conda](https://docs.conda.io/en/latest/) to install all the dependencies you'll need to run it. I have provided the [environment file](environment.yml) that I ended up with; to create the environment from this file, run the following command (assuming you already have conda):
+
+ ```
+ conda env create -f environment.yml
+ ```
+
+ It is important to know that I used TensorFlow 1.14.0, CUDA 9.2, and Python 3.6.9 to run the experiments. If you cannot run the project with the environment file I have provided, it is probably because of one of those versions.
+
+ ### Directory Structure and General Instructions
+
+ ```
+ .
+ |-- analysis
+ |-- configs
+ |-- dataset
+ |   |-- dataset #1
+ |   |-- dataset #2
+ |   `-- ...
+ |-- outputs
+ |-- project
+ |   |-- data
+ |   |-- evaluation
+ |   |-- models
+ |   `-- recsys
+ |-- tmp
+ ```
+
+ This project relies on this directory structure in order to work. The main Python files are in the **project** folder, and any changes you want to make to the code should be made to the files in this folder. The **outputs** folder will contain the output files for the models you build.
+
+ The **dataset** folder contains all the datasets you'll use in the project; for each dataset, you should create a separate folder inside the **dataset** folder. The project will then look for a `listening_history.csv` file inside that folder. This file **must be** comma-separated (a minimal sketch of its expected columns follows this README diff).
+
+ A temporary folder, **tmp**, will be created while the project runs. For each dataset you run this project with, a folder will be created inside the **tmp** folder. There you can find the cross-validation folds, the models you built, and the individual recommendations for each user, as well as some auxiliary matrices used in the UserKNN algorithm.
+
+ I have also included an **analysis** folder that I used to create some graphs with the results. You just have to point the `main.py` file in the analysis folder to where the results are, and it will show a graphical comparison between the models across all the metrics.
+
+ The project will only work if you provide a configuration file. In my case, I stored my configuration files in the **configs** folder, but feel free to delete the folder if you don't want it. The configuration file contains the parameters for the models, and I don't recommend deleting any parameter even if you are not going to use it. I've included a [sample configuration](configs/config_sample.yml) file that you can use as a guideline for your project.
+
+
+ To run the project, pass the configuration file to `main.py` as a parameter.
+
+ ```
+ $ python main.py --config=configs/config_sample.yml
+ ```
+
+
+ ###### DISCLAIMER:
+
+ The `model` and `bi` parameters in the `models/rnn` configuration object are not working, as I hardcoded them in my project. If you want to change the layer (to a GRU or a Simple RNN), you should do it [directly in the code](project/models/rnn.py#L147).
+
+
+ ### What is included in this project?
+
+ To better understand the project, I highly recommend checking the work that I used as a baseline for my model:
+
+ - [link](https://doi.org/10.1007/s10791-017-9317-7) - Wang, D., Deng, S. & Xu, G. Sequence-based context-aware music recommendation. Information Retrieval Journal (2018)
+
+ Their work, *music2vec*, is one of the baselines for my RNN model. The following embeddings are implemented in this project:
+
+ - music2vec
+ - doc2vec - [link](https://cs.stanford.edu/~quocle/paragraph_vector.pdf)
+ - GloVe - [link](https://nlp.stanford.edu/projects/glove/)
+
+ To evaluate these embedding models, the implemented context-aware recommender systems (CARS) are the ones proposed by Wang et al. (M-TN, SM-TN, CSM-TN, CSM-UK). Besides the metrics used in the paper, I have included MAP, NDCG@5, and Precision@5 as well. The cutoff of these metrics is not configurable, sorry.
+
+
+
+
  ---
+
+ If you have any doubts about this project, feel free to contact me!
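
To make the expected input concrete: based on the columns that `project/data/preprocess.py` reads from `listening_history.csv` (`user`, `song`, `timestamp`, with rows ordered by user and time, since the sessionizer uses consecutive time differences), a minimal sketch follows. All identifiers and timestamps below are made up.

```python
# Sketch of a minimal dataset/sample/listening_history.csv, assuming the
# three columns that sessionize_user() in project/data/preprocess.py reads:
# user, song, timestamp. The identifiers below are hypothetical.
import pandas as pd

rows = [
    ('u1', 'song_a', '2020-01-01 10:00:00'),
    ('u1', 'song_b', '2020-01-01 10:03:00'),
    ('u1', 'song_c', '2020-01-01 11:30:00'),  # >30 min gap: a new session
    ('u2', 'song_a', '2020-01-02 09:00:00'),
]
df = pd.DataFrame(rows, columns=['user', 'song', 'timestamp'])
df.to_csv('dataset/sample/listening_history.csv', index=False)  # comma-separated
```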
analysis/main.py ADDED
@@ -0,0 +1,70 @@
+ import pandas as pd
+ import numpy as np
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+
+ sns.set(font_scale=1, style='whitegrid', context='paper')
+ colors = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71", '#f1c40f']
+ palette = sns.color_palette(colors)
+
+ df = pd.read_csv('data/xiami.csv', sep='\t')
+ df['id'] = df.params
+
+ # Average precision, recall and F1 per parameter setup, for each algorithm
+ mtn = df[df.algo == 'm2vTN'][['id', 'prec', 'rec', 'f1']]
+ mtn = pd.DataFrame(mtn.groupby(by='id').mean())
+ mtn['id'] = mtn.index
+
+ smtn = df[df.algo == 'sm2vTN'][['id', 'prec', 'rec', 'f1']]
+ smtn = pd.DataFrame(smtn.groupby(by='id').mean())
+ smtn['id'] = smtn.index
+
+ csmtn = df[df.algo == 'csm2vTN'][['id', 'prec', 'rec', 'f1']]
+ csmtn = pd.DataFrame(csmtn.groupby(by='id').mean())
+ csmtn['id'] = csmtn.index
+
+ csmuk = df[df.algo == 'csm2vUK'][['id', 'prec', 'rec', 'f1']]
+ csmuk = pd.DataFrame(csmuk.groupby(by='id').mean())
+ csmuk['id'] = csmuk.index
+
+ mtn.sort_index(ascending=False, inplace=True)
+ smtn.sort_index(ascending=False, inplace=True)
+ csmtn.sort_index(ascending=False, inplace=True)
+ csmuk.sort_index(ascending=False, inplace=True)
+
+ melt_mtn = pd.melt(mtn, id_vars='id')
+ melt_smtn = pd.melt(smtn, id_vars='id')
+ melt_csmtn = pd.melt(csmtn, id_vars='id')
+ melt_csmuk = pd.melt(csmuk, id_vars='id')
+
+ fig, axes = plt.subplots(2, 2, figsize=(25, 25))
+
+ a1 = sns.catplot(x='variable', y='value', hue='id', data=melt_mtn, kind='bar', palette=palette, ax=axes[0][0])
+ a2 = sns.catplot(x='variable', y='value', hue='id', data=melt_smtn, kind='bar', palette=palette, ax=axes[0][1])
+ a3 = sns.catplot(x='variable', y='value', hue='id', data=melt_csmtn, kind='bar', palette=palette, ax=axes[1][0])
+ a4 = sns.catplot(x='variable', y='value', hue='id', data=melt_csmuk, kind='bar', palette=palette, ax=axes[1][1])
+
+ # catplot is figure-level and opens its own figures; close the extra ones
+ plt.close(2)
+ plt.close(3)
+ plt.close(4)
+ plt.close(5)
+
+ titles = ['M-TN', 'SM-TN', 'CSM-TN', 'CSM-UK']
+
+ last = axes.flatten()[-1]
+ handles, labels = last.get_legend_handles_labels()
+ fig.legend(handles, labels, loc='upper left')
+
+ i = 0
+ for ax in axes.flatten():
+     ax.get_legend().remove()
+     ax.set(yticks=np.arange(0, 0.21, 0.025))
+     ax.set(xlabel='Metrics Used', ylabel='Value')
+     ax.set(title=titles[i])
+     i += 1
+
+
+ plt.subplots_adjust(hspace=0.4)
+ fig.suptitle('Metrics', fontsize=18, y=.98)
+ plt.show()
configs/config_sample.yml ADDED
@@ -0,0 +1,46 @@
+ models:
+   rnn:
+     embedding_dim: [256]
+     batch: 64
+     epochs: [50]
+     model: ['LSTM']
+     window: [3]
+     bi: [False]
+     num_units: [512]
+   music2vec:
+     window: [5]
+     epochs: [5]
+     down_sample: [1e-3]
+     learning_rate: [0.025]
+     embedding_dim: [300]
+     negative_sample: [20]
+   doc2vec:
+     window: [10]
+     epochs: [10]
+     down_sample: [1e-4]
+     learning_rate: [0.025]
+     embedding_dim: [50]
+     negative_sample: [10]
+   glove:
+     window: [10]
+     embedding_dim: [100]
+     epochs: [15]
+     learning_rate: [0.025]
+ session:
+   interval: 30
+ evaluation:
+   dataset: 'sample'
+   cross-validation: 5
+   k: 5
+   topN: 5
+ results:
+   full: 'outputs/sample.csv'
+ embeddings:
+   music2vec:
+     usage: True
+   doc2vec:
+     usage: False
+   glove:
+     usage: False
+   rnn:
+     usage: False
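
For reference, here is how the pipeline consumes this file — a minimal sketch mirroring `main.py` and `project/data/preprocess.py`, using only keys that appear in the sample above:

```python
# Minimal sketch of reading the configuration the way main.py does.
# Only keys present in config_sample.yml are accessed here.
import yaml

with open('configs/config_sample.yml') as f:
    conf = yaml.safe_load(f)

ds = conf['evaluation']['dataset']      # e.g. 'sample'
interval = conf['session']['interval']  # session gap in minutes, e.g. 30
rnn_params = conf['models']['rnn']      # lists of values to grid over
print(ds, interval, rnn_params['embedding_dim'])
```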
environment.yml ADDED
@@ -0,0 +1,129 @@
+ name: rnn-embeddings
+ channels:
+   - anaconda
+   - defaults
+ dependencies:
+   - _libgcc_mutex=0.1=main
+   - _tflow_select=2.1.0=gpu
+   - absl-py=0.8.1=py36_0
+   - astor=0.8.0=py36_0
+   - astroid=2.3.3=py36_0
+   - blas=1.0=mkl
+   - c-ares=1.15.0=h7b6447c_1001
+   - ca-certificates=2019.11.27=0
+   - cairo=1.14.12=h8948797_3
+   - certifi=2019.11.28=py36_0
+   - cudatoolkit=9.2=0
+   - cudnn=7.6.4=cuda9.2_0
+   - cupti=9.2.148=0
+   - cycler=0.10.0=py36_0
+   - dbus=1.13.12=h746ee38_0
+   - expat=2.2.6=he6710b0_0
+   - fontconfig=2.13.0=h9420a91_0
+   - freetype=2.9.1=h8a8886c_1
+   - fribidi=1.0.5=h7b6447c_0
+   - gast=0.3.2=py_0
+   - glib=2.63.1=h5a9c865_0
+   - google-pasta=0.1.8=py_0
+   - graphite2=1.3.13=h23475e2_0
+   - graphviz=2.40.1=h21bd128_2
+   - grpcio=1.16.1=py36hf8bcb03_1
+   - gst-plugins-base=1.14.0=hbbd80ab_1
+   - gstreamer=1.14.0=hb453b48_1
+   - h5py=2.9.0=py36h7918eee_0
+   - harfbuzz=1.8.8=hffaf4a1_0
+   - hdf5=1.10.4=hb1b8bf9_0
+   - icu=58.2=h9c2bf20_1
+   - intel-openmp=2019.4=243
+   - isort=4.3.21=py36_0
+   - joblib=0.14.0=py_0
+   - jpeg=9b=h024ee3a_2
+   - keras=2.2.4=0
+   - keras-applications=1.0.8=py_0
+   - keras-base=2.2.4=py36_0
+   - keras-preprocessing=1.1.0=py_1
+   - kiwisolver=1.1.0=py36he6710b0_0
+   - lazy-object-proxy=1.4.3=py36h7b6447c_0
+   - libedit=3.1.20181209=hc058e9b_0
+   - libffi=3.2.1=hd88cf55_4
+   - libgcc-ng=9.1.0=hdf63c60_0
+   - libgfortran-ng=7.3.0=hdf63c60_0
+   - libpng=1.6.37=hbc83047_0
+   - libprotobuf=3.10.1=hd408876_0
+   - libstdcxx-ng=9.1.0=hdf63c60_0
+   - libtiff=4.1.0=h2733197_0
+   - libuuid=1.0.3=h1bed415_2
+   - libxcb=1.13=h1bed415_1
+   - libxml2=2.9.9=hea5a465_1
+   - markdown=3.1.1=py36_0
+   - matplotlib=3.1.1=py36h5429711_0
+   - mccabe=0.6.1=py36_1
+   - mkl=2019.4=243
+   - mkl-service=2.3.0=py36he904b0f_0
+   - mkl_fft=1.0.15=py36ha843d7b_0
+   - mkl_random=1.1.0=py36hd6b4f25_0
+   - mock=3.0.5=py36_0
+   - ncurses=6.1=he6710b0_1
+   - openssl=1.1.1=h7b6447c_0
+   - pandas=0.25.3=py36he6710b0_0
+   - pango=1.42.4=h049681c_0
+   - patsy=0.5.1=py36_0
+   - pcre=8.43=he6710b0_0
+   - pip=19.3.1=py36_0
+   - pixman=0.38.0=h7b6447c_0
+   - protobuf=3.10.1=py36he6710b0_0
+   - pylint=2.4.4=py36_0
+   - pyparsing=2.4.5=py_0
+   - pyqt=5.9.2=py36h05f1152_2
+   - python=3.6.9=h265db76_0
+   - pytz=2019.3=py_0
+   - qt=5.9.7=h5867ecd_1
+   - readline=7.0=h7b6447c_5
+   - scikit-learn=0.21.3=py36hd81dba3_0
+   - scipy=1.3.1=py36h7c811a0_0
+   - seaborn=0.9.0=pyh91ea838_1
+   - setuptools=42.0.2=py36_0
+   - sip=4.19.8=py36hf484d3e_0
+   - six=1.13.0=py36_0
+   - sqlite=3.30.1=h7b6447c_0
+   - statsmodels=0.10.1=py36hdd07704_0
+   - tensorboard=1.14.0=py36hf484d3e_0
+   - tensorflow=1.14.0=gpu_py36hfc5689a_0
+   - tensorflow-base=1.14.0=gpu_py36h611c6d2_0
+   - tensorflow-estimator=1.14.0=py_0
+   - tensorflow-gpu=1.14.0=h0d30ee6_0
+   - termcolor=1.1.0=py36_1
+   - tk=8.6.8=hbc83047_0
+   - tornado=6.0.3=py36h7b6447c_0
+   - typed-ast=1.4.0=py36h7b6447c_0
+   - werkzeug=0.16.0=py_0
+   - wheel=0.33.6=py36_0
+   - wrapt=1.11.2=py36h7b6447c_0
+   - xz=5.2.4=h14c3975_4
+   - yaml=0.1.7=had09818_2
+   - zlib=1.2.11=h7b6447c_3
+   - zstd=1.3.7=h0b5b093_0
+   - pip:
+     - bilm==0.1.post5
+     - blessings==1.7
+     - boto==2.49.0
+     - boto3==1.10.33
+     - botocore==1.13.33
+     - chardet==3.0.4
+     - docutils==0.15.2
+     - gensim==3.8.1
+     - glove-python==0.1.0
+     - gpustat==0.6.0
+     - idna==2.8
+     - jmespath==0.9.4
+     - ml-metrics==0.1.4
+     - numpy==1.16.4
+     - nvidia-ml-py3==7.352.0
+     - psutil==5.6.7
+     - pydot==1.4.1
+     - python-dateutil==2.8.0
+     - pyyaml>=3.11, <6.0
+     - requests==2.22.0
+     - s3transfer==0.2.1
+     - smart-open==1.9.0
+     - urllib3==1.25.7
main.py ADDED
@@ -0,0 +1,39 @@
+ import re
+ import os
+ import yaml
+ import pickle
+ import argparse
+ import pandas as pd
+ import numpy as np
+ import multiprocessing as mp
+ import project.evaluation.run as r
+ from os.path import exists
+ from datetime import datetime
+ from project.data.preprocess import preprocess, remove_sessions
+ from project.models.embeddings import embeddings
+ from project.evaluation.run import cross_validation
+
+
+ if __name__ == '__main__':
+
+     os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+     parser = argparse.ArgumentParser(description='RNN Embeddings')
+     parser.add_argument('--config', help='Configuration file', type=str)
+     args = parser.parse_args()
+     conf = yaml.safe_load(open(args.config))
+
+     print('The configuration file "%s" was read.' % args.config)
+     print('Pre-process started for dataset "%s"' % conf['evaluation']['dataset'])
+
+     preprocess(conf)
+
+     ds = conf['evaluation']['dataset']
+     df = pd.read_csv('dataset/{}/session_listening_history.csv'.format(ds), sep=',')
+
+     emb_path = 'tmp/{}/models/ids.npy'.format(ds)
+
+     # Train the embedding models only once; reuse the saved setup ids afterwards
+     if not exists(emb_path):
+         embeddings(df, conf)
+     ids = np.load(emb_path)
+     cross_validation(df, conf, ids)
project/__init__.py ADDED
File without changes
project/data/preparation.py ADDED
@@ -0,0 +1,87 @@
+
+ import pandas as pd
+ import random
+ import numpy as np
+ import pickle
+ from os import makedirs
+ from os.path import exists
+ from gensim.models import Word2Vec, Doc2Vec
+ from glove import Glove
+ from sklearn.model_selection import KFold
+
+ def _rnn_load(path, songs):
+     data = pickle.load(open(path, 'rb'))
+     emb_dict = {}
+     for song in songs:
+         emb_dict[song] = data[song]
+     return emb_dict
+
+ def __w2v_load(path, songs):
+     wv = Word2Vec.load(path).wv
+     emb_dict = {}
+     for song in songs:
+         emb_dict[song] = wv[song]
+     return emb_dict
+
+ def __g_load(path, songs):
+     glove = Glove.load(path)
+     emb_dict = {}
+     for song in songs:
+         emb_dict[song] = glove.word_vectors[glove.dictionary[song]]
+     return emb_dict
+
+ def __load_exp(path, songs):
+     data = pickle.load(open(path, 'rb'))
+     return data
+
+
+ def get_embeddings(path, songs):
+     # The session-level model sits next to the user-level one, with an 's' prefix
+     path_arr = path.split('/')
+     session_file = '/'.join(path_arr[:-1] + ['s' + path_arr[-1]])
+     user_file = path
+
+     # Dispatch on the model type encoded in the file path
+     if 'experiments' in path:
+         return __load_exp(user_file, songs), __load_exp(session_file, songs)
+     if 'glove' in path:
+         return __g_load(user_file, songs), __g_load(session_file, songs)
+     if 'music2vec' in path:
+         return __w2v_load(user_file, songs), __w2v_load(session_file, songs)
+     if 'doc2vec' in path:
+         return __w2v_load(user_file, songs), __w2v_load(session_file, songs)
+     if 'rnn' in path:
+         return _rnn_load(user_file, songs), _rnn_load(session_file, songs)
+     return {}, {}
+
+ def prepare_data(df, conf):
+     ds = conf['evaluation']['dataset']
+     path_kfold = 'tmp/{}/kfold/'.format(ds)
+     # Reuse previously generated folds if they exist
+     if exists(path_kfold):
+         kfold = []
+         for i in range(0, conf['evaluation']['k']):
+             j = i + 1
+             train = pd.read_pickle(path_kfold + 'train_{}.pkl'.format(j))
+             test = pd.read_pickle(path_kfold + 'test_{}.pkl'.format(j))
+             kfold.append((train, test))
+         return kfold
+     makedirs('tmp/{}/kfold/'.format(ds))
+     sessions = df.groupby('session')['song'].apply(lambda x: x.tolist())
+     users = df.groupby('user').agg(list)
+     users['history'] = users['session'].apply(lambda x: [sessions[session] for session in list(set(x))])
+     users = users.drop(['song', 'timestamp', 'session'], axis=1)
+     unique_users = df.user.unique()
+     # Split by user, so a user's whole history lands in a single fold
+     kf = KFold(n_splits=conf['evaluation']['k'], shuffle=True)
+     i = 1
+     kfold = []
+     for train, test in kf.split(unique_users):
+         train_df = users[users.index.isin(unique_users[train])]
+         test_df = users[users.index.isin(unique_users[test])]
+         train_df.to_pickle('tmp/{}/kfold/train_{}.pkl'.format(ds, i))
+         test_df.to_pickle('tmp/{}/kfold/test_{}.pkl'.format(ds, i))
+         kfold.append((train_df, test_df))
+         i += 1
+     return kfold
project/data/preprocess.py ADDED
@@ -0,0 +1,82 @@
+ from os import path
+ import csv
+ import math
+ import json
+ import yaml
+ import numpy as np
+ import pandas as pd
+ import multiprocessing as mp
+ from datetime import datetime, timedelta
+
+ def remove_sessions(df, leq=1):
+     # Drop sessions with `leq` songs or fewer
+     group = df.groupby(by='session').agg(list)
+     group = group['song'].apply(len)
+     to_stay = group[group > leq].index.values
+     return df[df.session.isin(to_stay)]
+
+
+ def sessionize_user(ds, session_time, s_path):
+     # A gap of `session_time` minutes (or a change of user) starts a new session
+     df = pd.read_csv('dataset/{}/listening_history.csv'.format(ds), sep=',')
+     df['timestamp'] = df['timestamp'].astype('datetime64')
+     df['dif'] = df['timestamp'].diff()
+     df['session'] = df.apply(lambda x: 'NEW_SESSION' if x.dif >= timedelta(minutes=session_time) else 'SAME_SESSION', axis=1)
+     s_no = 0
+     l_u = ''
+     f = open(s_path, 'w+')
+     print(','.join(['user', 'song', 'timestamp', 'session']), file=f)
+     print('Sessionized "%s" data file: %s' % (ds, s_path))
+     for row in df.values:
+         if s_no == 0:
+             l_u = row[0]
+         if (row[4] == 'NEW_SESSION' and l_u == row[0]) or (l_u != row[0]):
+             s_no += 1
+         row[3] = 's{}'.format(s_no)
+         l_u = row[0]
+         row[2] = str(row[2])
+         print(','.join(row[:-1]), file=f)
+
+ def gen_seq_files(df, pwd, window_size):
+     c_sessions = df.groupby('session')['song'].agg(list)
+     u_sessions = df.groupby('user')['song'].agg(list)
+     num_w = window_size // 2
+     fc = open(pwd + 'c_seqs.csv', 'w+')
+     fu = open(pwd + 'u_seqs.csv', 'w+')
+     dict_song = {}
+     for session in c_sessions:
+         for ix in range(len(session)):
+             # Context window around position ix, padded with '-' at the edges
+             b4 = list(range(ix - num_w, ix))
+             af = list(range(ix + 1, ix + num_w + 1))
+             b4 = [session[i] if i >= 0 else '-' for i in b4]
+             af = [session[i] if i < len(session) else '-' for i in af]
+             if session[ix] not in dict_song:
+                 dict_song[session[ix]] = []
+             dict_song[session[ix]].append(b4 + [session[ix]] + af)
+     for song, values in dict_song.items():
+         for seq in values:
+             print(song + '\t' + '{}'.format(seq), file=fc)
+
+     dict_song = {}
+     for session in u_sessions:
+         for ix in range(len(session)):
+             b4 = list(range(ix - num_w, ix))
+             af = list(range(ix + 1, ix + num_w + 1))
+             b4 = [session[i] if i >= 0 else '-' for i in b4]
+             af = [session[i] if i < len(session) else '-' for i in af]
+             if session[ix] not in dict_song:
+                 dict_song[session[ix]] = []
+             dict_song[session[ix]].append(b4 + [session[ix]] + af)
+     for song, values in dict_song.items():
+         for seq in values:
+             print(song + '\t' + '{}'.format(seq), file=fu)
+
+
+ def preprocess(conf):
+     ds = conf['evaluation']['dataset']
+     interval = conf['session']['interval']
+     if path.exists('dataset/{}/session_listening_history.csv'.format(ds)):
+         print('The "%s" dataset is already sessionized' % ds)
+         return
+     print('Started to sessionize dataset "%s"' % ds)
+     sessionize_user(ds, interval, 'dataset/{}/session_listening_history.csv'.format(ds))
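
To make the session rule concrete, here is a toy trace of the logic in `sessionize_user` (timestamps and song names are made up):

```python
# Toy illustration of the session rule used in sessionize_user(): a gap of
# `interval` minutes or more between consecutive plays starts a new session.
from datetime import datetime, timedelta

plays = [
    ('u1', 'song_a', datetime(2020, 1, 1, 10, 0)),
    ('u1', 'song_b', datetime(2020, 1, 1, 10, 3)),   # 3 min gap  -> same session
    ('u1', 'song_c', datetime(2020, 1, 1, 11, 30)),  # 87 min gap -> new session
]
interval = timedelta(minutes=30)
session = 1
prev = plays[0][2]
for user, song, ts in plays:
    if ts - prev >= interval:
        session += 1
    prev = ts
    print(user, song, 's{}'.format(session))
# -> u1 song_a s1 / u1 song_b s1 / u1 song_c s2
```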
project/evaluation/ResultReport.py ADDED
@@ -0,0 +1,27 @@
+ import pandas as pd
+ import numpy as np
+
+
+ class Results():
+     def __init__(self, setups, k):
+         self.metrics = {}
+         self.k = k
+         self.final_df = pd.DataFrame()
+
+     def fold_results(self, params, m2vTN, sm2vTN, csm2vTN, csm2vUK, fold):
+         # One row per algorithm, all sharing the same parameter setup and fold
+         metrics = np.vstack([m2vTN, sm2vTN, csm2vTN, csm2vUK])
+         print()
+         data = {
+             'params': [params] * 4,
+             'algo': ['m2vTN', 'sm2vTN', 'csm2vTN', 'csm2vUK'],
+             'folds': [fold] * 4,
+             'prec': metrics[:, 0],
+             'rec': metrics[:, 1],
+             'f1': metrics[:, 2],
+             'map': metrics[:, 3],
+             'ndcg@5': metrics[:, 4],
+             'p@5': metrics[:, 5]
+         }
+         df = pd.DataFrame(data)
+         return df
project/evaluation/metrics.py ADDED
@@ -0,0 +1,30 @@
+ from project.evaluation.ranking_metrics import mean_average_precision, ndcg_at, precision_at
+
+ def __Prec(topn, test):
+     num_intersect = len(set.intersection(set(topn), set(test)))
+     num_rec = len(topn)
+     return num_intersect / num_rec
+
+ def __Rec(topn, test):
+     num_intersect = len(set.intersection(set(topn), set(test)))
+     num_test = len(list(set(test)))
+     return num_intersect / num_test
+
+ def Hitrate(topn, test):
+     num_intersect = len([value for value in list(set(test)) if value in topn])
+     num_rec = len(topn)
+     return num_intersect / num_rec
+
+ def __F1(prec, rec):
+     return (2 * ((prec * rec) / (prec + rec))) if (prec + rec) > 0 else 0
+
+
+ def get_metrics(topn, test):
+     prec = __Prec(topn, test)
+     rec = __Rec(topn, test)
+     f = __F1(prec, rec)
+     # ranking_metrics expects predictions first, then labels
+     MAP = mean_average_precision([topn], [test], assume_unique=False)
+     ndcg_5 = ndcg_at([topn], [test], k=5, assume_unique=False)
+     p_5 = precision_at([topn], [test], k=5, assume_unique=False)
+
+     return [prec, rec, f, MAP, ndcg_5, p_5]
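
As a sanity check, here is a small worked example of the set-based metrics above (the song ids are made up):

```python
# Worked example of the set-based metrics in get_metrics().
# topn is the recommended list, test the ground truth; ids are hypothetical.
topn = ['a', 'b', 'c', 'd', 'e']
test = ['b', 'e', 'f']

inter = set(topn) & set(test)        # {'b', 'e'}
prec = len(inter) / len(topn)        # 2 / 5 = 0.4
rec = len(inter) / len(set(test))    # 2 / 3 ~= 0.667
f1 = 2 * prec * rec / (prec + rec)   # = 0.5
print(prec, rec, f1)
```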
project/evaluation/ranking_metrics.py ADDED
@@ -0,0 +1,240 @@
+ # -*- coding: utf-8 -*-
+ #
+ # Author: Taylor G Smith
+ #
+ # Recommender system ranking metrics derived from Spark source for use with
+ # Python-based recommender libraries (i.e., implicit,
+ # http://github.com/benfred/implicit/). These metrics are derived from the
+ # original Spark Scala source code for recommender metrics.
+ # https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
+
+ import numpy as np
+
+ import warnings
+
+ __all__ = [
+     'mean_average_precision',
+     'ndcg_at',
+     'precision_at',
+ ]
+
+ def _require_positive_k(k):
+     """Helper function to avoid copy/pasted code for validating K"""
+     if k <= 0:
+         raise ValueError("ranking position k should be positive")
+
+
+ def _mean_ranking_metric(predictions, labels, metric):
+     """Helper function for precision_at_k and mean_average_precision"""
+     # do not zip, as this will require an extra pass of O(N). Just assert
+     # equal length and index (compute in ONE pass of O(N)).
+     # if len(predictions) != len(labels):
+     #     raise ValueError("dim mismatch in predictions and labels!")
+     # return np.mean([
+     #     metric(np.asarray(predictions[i]), np.asarray(labels[i]))
+     #     for i in xrange(len(predictions))
+     # ])
+
+     # Actually probably want lazy evaluation in case preds is a
+     # generator, since preds can be very dense and could blow up
+     # memory... but how to assert lengths equal? FIXME
+     return np.mean([
+         metric(np.asarray(prd), np.asarray(labels[i]))
+         for i, prd in enumerate(predictions)  # lazy eval if generator
+     ])
+
+
+ def _warn_for_empty_labels():
+     """Helper for missing ground truth sets"""
+     warnings.warn("Empty ground truth set! Check input data")
+     return 0.
+
+
+ def precision_at(predictions, labels, k=10, assume_unique=True):
+     """Compute the precision at K.
+     Compute the average precision of all the queries, truncated at
+     ranking position k. If for a query, the ranking algorithm returns
+     n (n is less than k) results, the precision value will be computed
+     as #(relevant items retrieved) / k. This formula also applies when
+     the size of the ground truth set is less than k.
+     If a query has an empty ground truth set, zero will be used as
+     precision together with a warning.
+     Parameters
+     ----------
+     predictions : array-like, shape=(n_predictions,)
+         The prediction array. The items that were predicted, in descending
+         order of relevance.
+     labels : array-like, shape=(n_ratings,)
+         The labels (positively-rated items).
+     k : int, optional (default=10)
+         The rank at which to measure the precision.
+     assume_unique : bool, optional (default=True)
+         Whether to assume the items in the labels and predictions are each
+         unique. That is, the same item is not predicted multiple times or
+         rated multiple times.
+     Examples
+     --------
+     >>> # predictions for 3 users
+     >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
+     ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
+     ...          [1, 2, 3, 4, 5]]
+     >>> # labels for the 3 users
+     >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
+     >>> precision_at(preds, labels, 1)
+     0.33333333333333331
+     >>> precision_at(preds, labels, 5)
+     0.26666666666666666
+     >>> precision_at(preds, labels, 15)
+     0.17777777777777778
+     """
+     # validate K
+     _require_positive_k(k)
+
+     def _inner_pk(pred, lab):
+         # need to compute the count of the number of values in the predictions
+         # that are present in the labels. We'll use numpy in1d for this (set
+         # intersection in O(1))
+         if lab.shape[0] > 0:
+             n = min(pred.shape[0], k)
+             cnt = np.in1d(pred[:n], lab, assume_unique=assume_unique).sum()
+             return float(cnt) / k
+         else:
+             return _warn_for_empty_labels()
+
+     return _mean_ranking_metric(predictions, labels, _inner_pk)
+
+
+ def mean_average_precision(predictions, labels, assume_unique=True):
+     """Compute the mean average precision on predictions and labels.
+     Returns the mean average precision (MAP) of all the queries. If a query
+     has an empty ground truth set, the average precision will be zero and a
+     warning is generated.
+     Parameters
+     ----------
+     predictions : array-like, shape=(n_predictions,)
+         The prediction array. The items that were predicted, in descending
+         order of relevance.
+     labels : array-like, shape=(n_ratings,)
+         The labels (positively-rated items).
+     assume_unique : bool, optional (default=True)
+         Whether to assume the items in the labels and predictions are each
+         unique. That is, the same item is not predicted multiple times or
+         rated multiple times.
+     Examples
+     --------
+     >>> # predictions for 3 users
+     >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
+     ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
+     ...          [1, 2, 3, 4, 5]]
+     >>> # labels for the 3 users
+     >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
+     >>> mean_average_precision(preds, labels)
+     0.35502645502645497
+     """
+     def _inner_map(pred, lab):
+         if lab.shape[0]:
+             # compute the number of elements within the predictions that are
+             # present in the actual labels, and get the cumulative sum weighted
+             # by the index of the ranking
+             n = pred.shape[0]
+
+             # Scala code from Spark source:
+             # var i = 0
+             # var cnt = 0
+             # var precSum = 0.0
+             # val n = pred.length
+             # while (i < n) {
+             #     if (labSet.contains(pred(i))) {
+             #         cnt += 1
+             #         precSum += cnt.toDouble / (i + 1)
+             #     }
+             #     i += 1
+             # }
+             # precSum / labSet.size
+
+             arange = np.arange(n, dtype=np.float32) + 1.  # this is the denom
+             present = np.in1d(pred[:n], lab, assume_unique=assume_unique)
+             prec_sum = np.ones(present.sum()).cumsum()
+             denom = arange[present]
+             return (prec_sum / denom).sum() / lab.shape[0]
+
+         else:
+             return _warn_for_empty_labels()
+
+     return _mean_ranking_metric(predictions, labels, _inner_map)
+
+
+ def ndcg_at(predictions, labels, k=10, assume_unique=True):
+     """Compute the normalized discounted cumulative gain at K.
+     Compute the average NDCG value of all the queries, truncated at ranking
+     position k. The discounted cumulative gain at position k is computed as:
+         sum_{i=1}^{k} (2^{relevance of i-th item} - 1) / log(i + 1)
+     and the NDCG is obtained by dividing the DCG value on the ground truth set.
+     In the current implementation, the relevance value is binary.
+     If a query has an empty ground truth set, zero will be used as
+     NDCG together with a warning.
+     Parameters
+     ----------
+     predictions : array-like, shape=(n_predictions,)
+         The prediction array. The items that were predicted, in descending
+         order of relevance.
+     labels : array-like, shape=(n_ratings,)
+         The labels (positively-rated items).
+     k : int, optional (default=10)
+         The rank at which to measure the NDCG.
+     assume_unique : bool, optional (default=True)
+         Whether to assume the items in the labels and predictions are each
+         unique. That is, the same item is not predicted multiple times or
+         rated multiple times.
+     Examples
+     --------
+     >>> # predictions for 3 users
+     >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
+     ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
+     ...          [1, 2, 3, 4, 5]]
+     >>> # labels for the 3 users
+     >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
+     >>> ndcg_at(preds, labels, 3)
+     0.3333333432674408
+     >>> ndcg_at(preds, labels, 10)
+     0.48791273434956867
+     References
+     ----------
+     .. [1] K. Jarvelin and J. Kekalainen, "IR evaluation methods for
+            retrieving highly relevant documents."
+     """
+     # validate K
+     _require_positive_k(k)
+
+     def _inner_ndcg(pred, lab):
+         if lab.shape[0]:
+             # if we do NOT assume uniqueness, the set is a bit different here
+             if not assume_unique:
+                 lab = np.unique(lab)
+
+             n_lab = lab.shape[0]
+             n_pred = pred.shape[0]
+             n = min(max(n_pred, n_lab), k)  # min(min(p, l), k)?
+
+             # similar to mean_avg_prcsn, we need an arange, but this time +2
+             # since python is zero-indexed, and the denom typically needs +1.
+             # Also need the log base2...
+             arange = np.arange(n, dtype=np.float32)  # length n
+
+             # since we are only interested in the arange up to n_pred, truncate
+             # if necessary
+             arange = arange[:n_pred]
+             denom = np.log2(arange + 2.)  # length n
+             gains = 1. / denom  # length n
+
+             # compute the gains where the prediction is present in the labels
+             dcg_mask = np.in1d(pred[:n], lab, assume_unique=assume_unique)
+             dcg = gains[dcg_mask].sum()
+
+             # the max DCG is sum of gains where the index < the label set size
+             max_dcg = gains[arange < n_lab].sum()
+             return dcg / max_dcg
+
+         else:
+             return _warn_for_empty_labels()
+
+     return _mean_ranking_metric(predictions, labels, _inner_ndcg)
project/evaluation/run.py ADDED
@@ -0,0 +1,69 @@
+ import sys
+ import csv
+ import os
+ import yaml
+ import pickle
+ import numpy as np
+ import pandas as pd
+ import project.evaluation.metrics as m
+ from os.path import exists
+ from project.data.preparation import prepare_data, get_embeddings
+ from project.recsys.helper import Helper
+ from datetime import datetime
+ from project.recsys.algorithms import execute_algo
+ from project.evaluation.ResultReport import Results
+ from keras.models import model_from_yaml
+
+ def get_rnn():
+     model = model_from_yaml(open('training_model.yaml', 'r'))
+     model.load_weights('training_weights.h5')
+     return model
+
+ def skip_all(executed, params, k):
+     folds = executed[executed['params'] == params]['folds']
+     return folds.max() == k
+
+ def skip_fold(executed, params, fold):
+     folds = executed[executed['params'] == params]['folds']
+     return folds.max() >= fold
+
+ def cross_validation(df, conf, setups):
+     params = conf['evaluation']
+     r_paths = conf['results']
+
+     kfold = prepare_data(df, conf)
+     dataset = params['dataset']
+     topN = int(params['topN'])
+     k = int(params['k'])
+     results = Results(setups, k)
+     exec_path = r_paths['full']
+     pwd_rec = 'tmp/{}/rec/'.format(dataset)
+
+     if not exists(pwd_rec):
+         os.mkdir(pwd_rec)
+     if not exists(exec_path):
+         pd.DataFrame({}, columns=['params', 'algo', 'folds', 'prec', 'rec', 'f1', 'map', 'ndcg@5', 'p@5']).to_csv(exec_path, index=None, sep='\t')
+
+     executed = pd.read_csv(exec_path, sep='\t')
+
+     for setup in setups:
+         _, params, path = setup
+         if not exists(pwd_rec + params):
+             os.mkdir(pwd_rec + params)
+         # Skip setups and folds whose results are already in the output file
+         if skip_all(executed, params, k):
+             continue
+         songs = df['song'].unique().tolist()
+         m2v, sm2v = get_embeddings(path, songs)
+         songs = pd.DataFrame({'m2v': [m2v[x] for x in songs], 'sm2v': [sm2v[x] for x in songs]}, index=songs, columns=['m2v', 'sm2v'])
+         fold = 1
+         for train, test in kfold:
+             if skip_fold(executed, params, fold):
+                 fold += 1
+                 continue
+             time = datetime.now().strftime('%d/%m/%Y %H:%M')
+             print('%s | fold-%d | Running recsys w/ k-fold with the following params: %s' % (time, fold, params))
+             helper = Helper(train, test, songs, dataset)
+             m2vTN, sm2vTN, csm2vTN, csm2vUK = execute_algo(train.index, test.index, songs, topN, k, helper, pwd_rec + params)
+             res = results.fold_results(params, m2vTN, sm2vTN, csm2vTN, csm2vUK, fold)
+             res.to_csv(exec_path, sep='\t', mode='a', index=None, header=None)
+             fold += 1
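
Once a run finishes, the cumulative results file (the `results: full` path in the configuration) can be inspected directly — a minimal sketch, assuming the sample config's `outputs/sample.csv` path:

```python
# Minimal sketch of inspecting the cumulative results file written by
# cross_validation(). The path comes from the sample configuration
# (results -> full); the file is tab-separated despite the .csv suffix.
import pandas as pd

res = pd.read_csv('outputs/sample.csv', sep='\t')
# Average each metric over folds, per parameter setup and algorithm
summary = res.groupby(['params', 'algo'])[['prec', 'rec', 'f1', 'map', 'ndcg@5', 'p@5']].mean()
print(summary)
```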
project/models/embeddings.py ADDED
@@ -0,0 +1,166 @@
+ import sys
+
+ import pickle
+ import pandas as pd
+ import numpy as np
+ from os import makedirs
+ from os.path import exists
+ from gensim.models import Word2Vec, Doc2Vec
+ from gensim.models.doc2vec import TaggedDocument
+ from datetime import datetime
+ from glove import Glove, Corpus
+ from project.models.rnn import rnn
+ from project.models.setups import Setups
+ from project.models.seq2seq import start as rnn_start
+
+ def data_prep(model, df):
+     # Turn the listening log into song sequences, grouped by user or by session
+     if model == 'user':
+         return df.groupby(by='user')['song'].apply(list).values.tolist()
+     if model == 'user_doc':
+         return df.groupby(by='user')['song'].apply(lambda x: TaggedDocument(words=x.tolist(), tags=[x.name])).values.tolist()
+     if model == 'session':
+         return df.groupby(by='session')['song'].apply(list).values.tolist()
+     if model == 'session_doc':
+         return df.groupby(by='session')['song'].apply(lambda x: TaggedDocument(words=x.tolist(), tags=[x.name])).values.tolist()
+
+ def music2vec(data, w2v_type, dim, lr, window, down, neg_sample, epochs):
+     sentences = data_prep(w2v_type, data)
+     return Word2Vec(sentences, size=dim, alpha=lr, window=window, sample=down,
+                     sg=1, hs=0, negative=neg_sample, iter=epochs, min_count=1, compute_loss=True)
+
+ def doc2vec(data, d2v_type, dim, lr, window, down, neg_sample, epochs):
+     sequence = data_prep(d2v_type, data)
+     return Doc2Vec(sequence, dm=1, vector_size=dim, alpha=lr, window=window, sample=down,
+                    negative=neg_sample, epochs=epochs, min_count=1, compute_loss=True)
+
+ def glove(data, glove_type, window, dim, lr, epochs):
+     sentences = data_prep(glove_type, data)
+     corpus = Corpus()
+     corpus.fit(sentences, window=window)
+     glove = Glove(no_components=dim, learning_rate=lr)
+     glove.fit(corpus.matrix, epochs=epochs, no_threads=4, verbose=True)
+     glove.add_dictionary(corpus.dictionary)
+     return glove
+
+ def embeddings(df, conf):
+     ds = conf['evaluation']['dataset']
+     cwd = 'tmp/{}/models'.format(ds)
+
+     if not exists(cwd):
+         makedirs(cwd)
+
+     setups = Setups(conf)
+     generators = setups.get_generators()
+
+     c_id = 0
+     setups_id = []
+     for method, generator in generators:
+         if method == 'rnn':
+             for s in generator:
+                 to_str = setups.setup_to_string(c_id, s, method)
+                 print(to_str)
+
+                 path = '{}/{}__{}.pickle'.format(cwd, method, c_id)
+                 path_s = '{}/s{}__{}.pickle'.format(cwd, method, c_id)
+
+                 if not exists(path):
+                     user, session = rnn(df, ds, s['model'], s['window'], s['epochs'],
+                                         s['batch'], s['dim'], s['num_units'], s['bidi'])
+                     fu = open(path, 'wb')
+                     fs = open(path_s, 'wb')
+
+                     pickle.dump(user, fu, protocol=pickle.HIGHEST_PROTOCOL)
+                     pickle.dump(session, fs, protocol=pickle.HIGHEST_PROTOCOL)
+
+                     fu.close()
+                     fs.close()
+
+                 setups_id.append([c_id, to_str, path])
+                 c_id += 1
+         if method == 'music2vec':
+             for s in generator:
+                 to_str = setups.setup_to_string(c_id, s, method)
+                 print(to_str)
+
+                 path = '{}/{}__{}.model'.format(cwd, method, c_id)
+                 path_s = '{}/s{}__{}.model'.format(cwd, method, c_id)
+
+                 if not exists(path):
+                     m2v = music2vec(df, 'user', s['dim'], s['lr'], s['window'], s['down'], s['neg_sample'], s['epochs'])
+                     sm2v = music2vec(df, 'session', s['dim'], s['lr'], s['window'], s['down'], s['neg_sample'], s['epochs'])
+
+                     m2v.save(path)
+                     sm2v.save(path_s)
+
+                 setups_id.append([c_id, to_str, path])
+
+                 c_id += 1
+         if method == 'doc2vec':
+             for s in generator:
+                 to_str = setups.setup_to_string(c_id, s, method)
+                 path = '{}/{}__{}.model'.format(cwd, method, c_id)
+                 path_s = '{}/s{}__{}.model'.format(cwd, method, c_id)
+                 print(to_str)
+
+                 if not exists(path):
+                     d2v = doc2vec(df, 'user_doc', s['dim'], s['lr'], s['window'], s['down'], s['neg_sample'], s['epochs'])
+                     sd2v = doc2vec(df, 'session_doc', s['dim'], s['lr'], s['window'], s['down'], s['neg_sample'], s['epochs'])
+
+                     d2v.save(path)
+                     sd2v.save(path_s)
+
+                 setups_id.append([c_id, to_str, path])
+
+                 c_id += 1
+         if method == 'glove':
+             for s in generator:
+                 to_str = setups.setup_to_string(c_id, s, method)
+                 path = '{}/{}__{}.model'.format(cwd, method, c_id)
+                 path_s = '{}/s{}__{}.model'.format(cwd, method, c_id)
+                 print(to_str)
+
+                 if not exists(path):
+                     glv = glove(df, 'user', s['window'], s['dim'], s['lr'], s['epochs'])
+                     sglv = glove(df, 'session', s['window'], s['dim'], s['lr'], s['epochs'])
+
+                     glv.save(path)
+                     sglv.save(path_s)
+
+                 # Mirror the other branches so glove setups are also evaluated
+                 setups_id.append([c_id, to_str, path])
+
+                 c_id += 1
+         if method == 'genres':
+             for s in generator:
+                 to_str = s
+                 print(to_str)
+                 path = 'tmp/{}/experiments/'.format(ds)
+                 path_s = 'tmp/{}/experiments/'.format(ds)
+
+                 if s == 'add-all':
+                     path += 'all_genres/add/all_add.pickle'
+                     path_s += 'all_genres/add/sall_add.pickle'
+                 if s == 'mul-all':
+                     path += 'all_genres/mul/all_mul.pickle'
+                     path_s += 'all_genres/mul/sall_mul.pickle'
+                 if s == 'avg-all':
+                     path += 'all_genres/avg/all_avg.pickle'
+                     path_s += 'all_genres/avg/sall_avg.pickle'
+                 if s == 'add-ran':
+                     path += 'random_genres/add/ran_add.pickle'
+                     path_s += 'random_genres/add/sran_add.pickle'
+                 if s == 'mul-ran':
+                     path += 'random_genres/mul/ran_mul.pickle'
+                     path_s += 'random_genres/mul/sran_mul.pickle'
+                 if s == 'avg-ran':
+                     path += 'random_genres/avg/ran_avg.pickle'
+                     path_s += 'random_genres/avg/sran_avg.pickle'
+
+                 setups_id.append([c_id, to_str, path])
+
+                 c_id += 1
+
+     setups_id = np.stack(setups_id, axis=0)
+
+     np.save('{}/ids'.format(cwd), setups_id)
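
To illustrate what `data_prep` feeds the embedding models, here is a toy run of the `'user'` grouping (the data values are made up):

```python
# Toy illustration of data_prep(model='user', df): the listening log becomes
# one song sequence per user, which is what Word2Vec/GloVe consume as
# "sentences". The values below are hypothetical.
import pandas as pd

df = pd.DataFrame({
    'user':    ['u1', 'u1', 'u1', 'u2', 'u2'],
    'song':    ['a', 'b', 'c', 'b', 'd'],
    'session': ['s1', 's1', 's2', 's3', 's3'],
})
sentences = df.groupby(by='user')['song'].apply(list).values.tolist()
print(sentences)  # [['a', 'b', 'c'], ['b', 'd']]
```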
project/models/rnn.py ADDED
@@ -0,0 +1,180 @@
+ from os.path import exists
+ from keras.utils import to_categorical
+ from keras.models import Model
+ from keras.layers import Embedding, LSTM, Dense, CuDNNGRU, Input, Bidirectional, Dropout, Concatenate
+ from keras.models import Sequential, load_model
+ from keras.callbacks import EarlyStopping, ModelCheckpoint
+ from keras.preprocessing.sequence import TimeseriesGenerator
+ import concurrent.futures as fut
+ import os
+ import gc
+ import keras
+ import pickle
+ import time
+ import numpy as np
+ import pickle as pk
+ import pandas as pd
+ import tensorflow as tf
+ import matplotlib.pyplot as plt
+ from math import floor
+
+
+ tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+ def get_window(playlist, ix, window):
+     el = playlist[ix]
+
+     # This is the perfect case: full context on both sides
+     if (ix - window >= 0) and (ix + window + 1) < len(playlist):
+         window = playlist[ix - window:ix] + playlist[ix + 1:(ix + 1) + window]
+         return window
+
+     # Not in the perfect case; fall back to the damage-reduction clause:
+     b4 = []
+     after = []
+     # If the problem is in the before clause, prepend '0' until it meets the window size.
+     if (ix - window < 0):
+         b4 = (abs(ix - window) * ['0']) + playlist[0:ix]
+     else:
+         b4 = playlist[ix - window:ix]
+     # If the problem is in the after clause, append '0' until it meets the window size.
+     if (ix + window + 1) > len(playlist):
+         num = (ix + window + 1) - len(playlist)
+         after = playlist[ix + 1:ix + window + 1] + (num * ['0'])
+     else:
+         after = playlist[ix + 1:(ix + 1) + window]
+
+     return b4 + after
+
+
+ def window_seqs(sequence, w_size):
+     ix = 0
+     max_ix = (len(sequence) - 1) - w_size
+     x = []
+     y = []
+     while ix < max_ix:
+         x.append(sequence[ix:ix+w_size])
+         y.append([sequence[ix+w_size]])
+         ix += 1
+     return x, y
+
+ def rnn(df, DS, MODEL, W_SIZE, EPOCHS, BATCH_SIZE, EMBEDDING_DIM, NUM_UNITS, BIDIRECTIONAL):
+     pwd = 'dataset/{}/'.format(DS)
+     WINDOW = W_SIZE * 2
+
+     vocab = sorted(set(df.song.unique().tolist()))
+     vocab_size = len(vocab) + 1
+     song2ix = {u: i for i, u in enumerate(vocab, 1)}
+     pickle.dump(song2ix, open('{}_song2ix.pickle'.format(DS), 'wb'), pickle.HIGHEST_PROTOCOL)
+
+
+     if not exists(pwd + 'song_context_{}.txt'.format(W_SIZE)):
+         df['song'] = df.song.apply(lambda song: song2ix[song])
+         u_playlists = df[['user', 'song']].groupby('user').agg(tuple)['song'].values
+         u_playlists = [list(p) for p in u_playlists]
+         s_playlists = df[['session', 'song']].groupby('session').agg(tuple)['song'].values
+         s_playlists = [list(p) for p in s_playlists]
+
+         nou_playlists = len(u_playlists)
+         nos_playlists = len(s_playlists)
+
+         user_windows = dict()
+         session_windows = dict()
+
+
+         for song in vocab:
+             user_windows[song2ix[song]] = []
+             session_windows[song2ix[song]] = []
+
+         k4 = 1
+         for pl in u_playlists:
+             print('[{}/{}] [USER] Playlist'.format(k4, nou_playlists), flush=False, end='\r')
+             k4 += 1
+             ixes = range(0, len(pl))
+             s_windows = [(pl[ix], get_window(pl, ix, W_SIZE)) for ix in ixes]
+             for song, window in s_windows:
+                 user_windows[song].append(window)
+         print()
+         k4 = 1
+         for pl in s_playlists:
+             print('[{}/{}] [SESSION] Playlist'.format(k4, nos_playlists), flush=False, end='\r')
+             k4 += 1
+             ixes = range(0, len(pl))
+             s_windows = [(pl[ix], get_window(pl, ix, W_SIZE)) for ix in ixes]
+             for song, window in s_windows:
+                 session_windows[song].append(window)
+         print()
+
+         f = open(pwd + 'song_context_{}.txt'.format(W_SIZE), 'w')
+         for song in vocab:
+             u_occurrences = user_windows[song2ix[song]]
+             s_occurrences = session_windows[song2ix[song]]
+             for u_o, s_o in zip(u_occurrences, s_occurrences):
+                 print('{}\t{}\t{}'.format(','.join([str(i) for i in u_o]), ','.join([str(i) for i in s_o]), str(song2ix[song])), file=f)
+         f.close()
+
+     f = open(pwd + 'song_context_{}.txt'.format(W_SIZE), mode='r')
+
+     data = []
+     for line in f:
+         line = line.replace('\n', '')
+         input_user, input_session, target = line.split('\t')
+         line = [np.array([int(x) for x in input_user.split(',')]), np.array([int(x) for x in input_session.split(',')]), int(target)]
+         data.append(line)
+
+     data = np.vstack(data)
+
+     np.random.shuffle(data)
+
+     def batch(data, bs):
+         while True:
+             for ix in range(0, len(data), bs):
+                 u_input = data[ix:ix+bs, 0]
+                 s_input = data[ix:ix+bs, 1]
+                 target = data[ix:ix+bs, 2]
+                 yield [np.vstack(u_input), np.vstack(s_input)], to_categorical(target, num_classes=vocab_size)
+
+
+     train, test = data[int(len(data) * .20):], data[:int(len(data) * .20)]
+
+     # Two inputs: the session-level context window and the user-level one
+     input_session = Input(batch_shape=(None, WINDOW))
+     embedding_session = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, name='Session_Embeddings', mask_zero=True)(input_session)
+     drop_session = Dropout(0.2)(embedding_session)
+     rec_session = LSTM(NUM_UNITS, name='Session_LSTM')(drop_session)
+     drop_session = Dropout(0.2)(rec_session)
+
+     input_user = Input(batch_shape=(None, WINDOW))
+     embedding_user = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, name='User_Embeddings', mask_zero=True)(input_user)
+     drop_user = Dropout(0.2)(embedding_user)
+     rec_user = LSTM(NUM_UNITS, name='User_LSTM')(drop_user)
+     drop_user = Dropout(0.2)(rec_user)
+     combination = Concatenate()([drop_session, drop_user])
+     dense = Dense(vocab_size, activation='softmax', name='Densa')(combination)
+     model = Model(inputs=[input_session, input_user], outputs=dense)
+     checkpoint = ModelCheckpoint('{}_model_checkpoint.h5'.format(DS), monitor='loss', verbose=0, save_best_only=False, period=1)
+     es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=5)
+
+     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
+     model.summary()
+
+     if exists('{}_model_checkpoint.h5'.format(DS)):
+         model = load_model('{}_model_checkpoint.h5'.format(DS))
+
+     model.fit_generator(generator=batch(train, BATCH_SIZE), steps_per_epoch=len(train) // BATCH_SIZE, epochs=EPOCHS,
+                         validation_data=batch(test, BATCH_SIZE), validation_steps=len(test) // BATCH_SIZE, callbacks=[es, checkpoint])
+
+     # The learned embedding matrices are the product of interest
+     session_embeddings = model.get_layer('Session_Embeddings').get_weights()[0]
+     user_embeddings = model.get_layer('User_Embeddings').get_weights()[0]
+
+     u_emb = {}
+     s_emb = {}
+
+     for song in vocab:
+         u_emb[song] = user_embeddings[song2ix[song]]
+         s_emb[song] = session_embeddings[song2ix[song]]
+
+     del model
+     gc.collect()
+     return u_emb, s_emb
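
To make the padding behavior of `get_window` concrete, here is a small traced example (playlist values are made-up song indices):

```python
# Worked example of get_window() above with W_SIZE = 2.
# Positions past either end of the playlist are padded with '0',
# which is also the index that the Keras Embedding masks out (mask_zero=True).
playlist = [3, 7, 9, 4, 5]

# middle of the playlist: two songs before and two after
# get_window(playlist, 2, 2) -> [3, 7, 4, 5]

# near the start: one position is missing before ix, so one '0' is prepended
# get_window(playlist, 1, 2) -> ['0', 3, 9, 4]

# near the end: one position is missing after ix, so one '0' is appended
# get_window(playlist, 3, 2) -> [7, 9, 5, '0']
```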
project/models/seq2seq.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ from project.data.preprocess import gen_seq_files
5
+ from os.path import exists
6
+ from keras.models import Model
7
+ from keras.callbacks import EarlyStopping
8
+ from keras.layers import Dense, CuDNNLSTM, CuDNNGRU, Embedding, Input, SimpleRNN
9
+
10
+ def read_input_targets(path, win_size, t):
11
+ d = {}
12
+ if t == 'session':
13
+ f = open(path + 'c_seqs.csv')
14
+ s_i = []
15
+ for line in f:
16
+ l = line.rstrip('\n').split('\t')
17
+ x = ' '.join(l[1].replace('[', '').replace(']', '').split(','))
18
+ s_i.append(x)
19
+ if l[0] in d:
20
+ d[l[0]].append(x)
21
+ else:
22
+ d[l[0]] = [x]
23
+ f.close()
24
+ s_t = ['START_ ' + session + ' _END' for session in s_i]
25
+ return s_i, s_t, d
26
+ if t == 'listening':
27
+ f = open(path + 'u_seqs.csv')
28
+ s_i = []
29
+ for line in f:
30
+ l = line.rstrip('\n').split('\t')
31
+ x = ' '.join(l[1].replace('[', '').replace(']', '').split(','))
32
+ s_i.append(x)
33
+ if l[0] in d:
34
+ d[l[0]].append(x)
35
+ else:
36
+ d[l[0]] = [x]
37
+ f.close()
38
+ s_t = ['START_ ' + session + ' _END' for session in s_i]
39
+ return s_i, s_t, d
40
+
41
+ def get_unique_songs(s_i, s_t):
42
+ all_i = set()
43
+ all_t = set()
44
+ for songs in s_i:
45
+ for song in songs.split():
46
+ if song not in all_i:
47
+ all_i.add(song)
48
+ for songs in s_t:
49
+ for song in songs.split():
50
+ if song not in all_t:
51
+ all_t.add(song)
52
+ return sorted(list(all_i)), sorted(list(all_t))
53
+
54
+ def get_max_length(s_i, s_t):
55
+ max_i = np.max([len(session.split()) for session in s_i])
56
+ max_t = np.max([len(session.split()) for session in s_t])
57
+ return max_i, max_t
58
+
59
+ def get_dicts(i_songs, t_songs):
60
+     song_ix_i = dict((song, i + 1) for i, song in enumerate(i_songs))
+     song_ix_t = dict((song, i + 1) for i, song in enumerate(t_songs))
+     ix_song_i = dict((i, song) for song, i in song_ix_i.items())
+     ix_song_t = dict((i, song) for song, i in song_ix_t.items())
+     return song_ix_i, song_ix_t, ix_song_i, ix_song_t
+
+ def __run_s2s(sessions_i, sessions_t, num_songs, song_ix, max_l, NUM_DIM=128, BATCH_SIZE=128, EPOCHS=50, MODEL='RNN', WINDOW_SIZE=5):
+     X, y = sessions_i, sessions_t
+     num_encoder_songs, num_decoder_songs = num_songs
+     song_ix_i, song_ix_t = song_ix
+     max_length_i, max_length_t = max_l
+
+     def generate_batch(X, y, batch_size=128):
+         while True:
+             for j in range(0, len(X), batch_size):
+                 encoder_input_data = np.zeros((batch_size, max_length_i), dtype='float32')
+                 decoder_input_data = np.zeros((batch_size, max_length_t), dtype='float32')
+                 decoder_target_data = np.zeros((batch_size, max_length_t, num_decoder_songs), dtype='float32')
+                 for i, (input_sequence, target_sequence) in enumerate(zip(X[j:j + batch_size], y[j:j + batch_size])):
+                     for t, word in enumerate(input_sequence.split()):
+                         encoder_input_data[i, t] = song_ix_i[word] if word != '-' else 0
+                     # Teacher forcing: the decoder input is the target sequence without
+                     # its last token, and the one-hot target is the same sequence
+                     # shifted one step to the left.
+                     for t, word in enumerate(target_sequence.split()):
+                         if t < len(target_sequence.split()) - 1:
+                             decoder_input_data[i, t] = song_ix_t[word] if word != '-' else 0
+                         if t > 0:
+                             decoder_target_data[i, t - 1, song_ix_t[word] if word != '-' else 0] = 1
+                 yield ([encoder_input_data, decoder_input_data], decoder_target_data)
+
+     # Shuffle inputs and targets with a single shared permutation so that each
+     # input sequence stays paired with its target; shuffling X and y
+     # independently would break the alignment between them.
+     perm = np.random.permutation(len(X))
+     X = [X[i] for i in perm]
+     y = [y[i] for i in perm]
+
+     X_train, X_test = X[int(len(X) * .1):], X[:int(len(X) * .1)]
+     y_train, y_test = y[int(len(y) * .1):], y[:int(len(y) * .1)]
+
+     TRAIN_SAMPLES = len(X_train)
+     VAL_SAMPLES = len(X_test)
+
+     ENCODER_INPUT = Input(shape=(None,))
+     ENCODER_EMBEDDING = Embedding(num_encoder_songs, NUM_DIM)(ENCODER_INPUT)
+     if MODEL == 'LSTM':
+         ENCODER_NN = CuDNNLSTM(NUM_DIM, return_state=True)
+         _, state_h, state_c = ENCODER_NN(ENCODER_EMBEDDING)
+         ENCODER_STATE = [state_h, state_c]
+     elif MODEL == 'GRU':
+         ENCODER_NN = CuDNNGRU(NUM_DIM, return_state=True)
+         _, ENCODER_STATE = ENCODER_NN(ENCODER_EMBEDDING)
+     elif MODEL == 'RNN':
+         ENCODER_NN = SimpleRNN(NUM_DIM, return_state=True)
+         _, ENCODER_STATE = ENCODER_NN(ENCODER_EMBEDDING)
+
+     DECODER_INPUT = Input(shape=(None,))
+     DECODER_EMBEDDING = Embedding(num_decoder_songs, NUM_DIM)(DECODER_INPUT)
+     if MODEL == 'LSTM':
+         DECODER_NN = CuDNNLSTM(NUM_DIM, return_sequences=True, return_state=True)
+         DECODER_OUTPUT, _, _ = DECODER_NN(DECODER_EMBEDDING, initial_state=ENCODER_STATE)
+     elif MODEL == 'GRU':
+         DECODER_NN = CuDNNGRU(NUM_DIM, return_sequences=True, return_state=True)
+         DECODER_OUTPUT, _ = DECODER_NN(DECODER_EMBEDDING, initial_state=ENCODER_STATE)
+     elif MODEL == 'RNN':
+         DECODER_NN = SimpleRNN(NUM_DIM, return_sequences=True, return_state=True)
+         DECODER_OUTPUT, _ = DECODER_NN(DECODER_EMBEDDING, initial_state=ENCODER_STATE)
+     DENSE_DECODER = Dense(num_decoder_songs, activation='softmax')
+     DECODER_OUTPUT = DENSE_DECODER(DECODER_OUTPUT)
+
+     es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=5)
+
+     model = Model([ENCODER_INPUT, DECODER_INPUT], DECODER_OUTPUT)
+     model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
+     model.summary()
+     model.fit_generator(generator=generate_batch(X_train, y_train, batch_size=BATCH_SIZE),
+                         steps_per_epoch=TRAIN_SAMPLES // BATCH_SIZE,
+                         epochs=EPOCHS,
+                         validation_data=generate_batch(X_test, y_test, batch_size=BATCH_SIZE),
+                         validation_steps=VAL_SAMPLES // BATCH_SIZE, callbacks=[es])
+
+     # The trained encoder (inputs -> hidden state) is reused to extract song
+     # embeddings; generate_batch is returned so the caller can feed sequences
+     # through it one at a time.
+     return Model(ENCODER_INPUT, ENCODER_STATE), generate_batch
+
+ def start(df, conf, id, ds):
+     s2s = conf
+     if not exists('dataset/{}/u_seqs.csv'.format(ds)):
+         print('Files %s and %s will be written to "%s"' % ('u_seqs.csv', 'c_seqs.csv', 'dataset/{}/'.format(ds)))
+         gen_seq_files(df, 'dataset/{}/'.format(ds), conf['window_size'])
+     songs = df.song.unique()
+     del df
+
+     sessions_i, sessions_t, song_seqs_ses = read_input_targets('dataset/{}/'.format(ds), s2s['window_size'], 'session')
+     listening_i, listening_t, song_seqs_list = read_input_targets('dataset/{}/'.format(ds), s2s['window_size'], 'listening')
+     input_songs, target_songs = get_unique_songs(listening_i, listening_t)
+     max_length_i, max_length_t = get_max_length(listening_i, listening_t)
+     num_encoder_songs, num_decoder_songs = len(input_songs) + 1, len(target_songs) + 1
+     song_ix_i, song_ix_t, _, _ = get_dicts(input_songs, target_songs)
+
+     model, gen = __run_s2s(listening_i, listening_t, (num_encoder_songs, num_decoder_songs), (song_ix_i, song_ix_t),
+                            (max_length_i, max_length_t), NUM_DIM=s2s['vector_dim'], BATCH_SIZE=s2s['batch_size'], EPOCHS=s2s['epochs'],
+                            MODEL=s2s['model'], WINDOW_SIZE=s2s['window_size'])
+
+     # A song's embedding is the mean of the encoder states of every sequence
+     # in which the song appears.
+     embeddings = []
+     for song in songs:
+         seqs = song_seqs_list[song]
+         get_seq = gen(seqs, ['START_ ' + seq + ' _END' for seq in seqs], batch_size=1)
+         seq_embeddings = []
+         i = 0
+         for (input_seq, _), _ in get_seq:
+             if i == len(seqs):
+                 break
+             if s2s['model'] == 'LSTM':
+                 # The LSTM encoder returns two state tensors (h, c); only h is kept.
+                 state, _ = model.predict(input_seq)
+             else:
+                 state = model.predict(input_seq)
+             seq_embeddings.append(state[0])
+             i += 1
+         emb_final = np.mean(np.array(seq_embeddings), 0)
+         embeddings.append(emb_final)
+     emb_values = np.array([songs, embeddings])
+     np.save('tmp/{}/models/{}'.format(ds, id), emb_values)
+
+     ######################################################################################################################
+
+     # Repeat the procedure with the session sequences; the result is saved
+     # under an 's' prefix.
+     model, gen = __run_s2s(sessions_i, sessions_t, (num_encoder_songs, num_decoder_songs), (song_ix_i, song_ix_t),
+                            (max_length_i, max_length_t), NUM_DIM=s2s['vector_dim'], BATCH_SIZE=s2s['batch_size'], EPOCHS=s2s['epochs'],
+                            MODEL=s2s['model'], WINDOW_SIZE=s2s['window_size'])
+
+     embeddings = []
+     for song in songs:
+         seqs = song_seqs_ses[song]
+         get_seq = gen(seqs, ['START_ ' + seq + ' _END' for seq in seqs], batch_size=1)
+         seq_embeddings = []
+         i = 0
+         for (input_seq, _), _ in get_seq:
+             if i == len(seqs):
+                 break
+             if s2s['model'] == 'LSTM':
+                 state, _ = model.predict(input_seq)
+             else:
+                 state = model.predict(input_seq)
+             seq_embeddings.append(state[0])
+             i += 1
+         emb_final = np.mean(np.array(seq_embeddings), 0)
+         embeddings.append(emb_final)
+     emb_values = np.array([songs, embeddings])
+     np.save('tmp/{}/models/s{}'.format(ds, id), emb_values)
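For reference, here is a minimal sketch of the teacher-forcing layout that `generate_batch` builds for a single target sequence. The vocabulary and sequence below are made up for illustration:

```
import numpy as np

# Toy target vocabulary; indices start at 1 because 0 is used for padding,
# mirroring the +1 offset in get_dicts.
song_ix_t = {'START_': 1, 'a': 2, 'b': 3, '_END': 4}
target = 'START_ a b _END'

max_length_t, num_decoder_songs = 4, 5
decoder_input = np.zeros((1, max_length_t), dtype='float32')
decoder_target = np.zeros((1, max_length_t, num_decoder_songs), dtype='float32')

for t, word in enumerate(target.split()):
    if t < len(target.split()) - 1:       # decoder input: every token but the last
        decoder_input[0, t] = song_ix_t[word]
    if t > 0:                             # target: the same tokens shifted left by one
        decoder_target[0, t - 1, song_ix_t[word]] = 1

print(decoder_input[0])              # [1. 2. 3. 0.]  ->  START_, a, b
print(decoder_target[0].argmax(-1))  # [2 3 4 0]      ->  a, b, _END (0 = padding row)
```

So at every timestep the decoder sees the current token and is trained to predict the next one.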
project/models/setups.py ADDED
@@ -0,0 +1,65 @@
+ class Setups():
+     def __init__(self, config):
+         self.__config = config
+         self.models_config = config['models']
+
+     def get_config(self):
+         return self.__config
+
+     def rnn_setups(self):
+         c = self.models_config['rnn']
+         for m in c['model']:
+             for w in c['window']:
+                 for n in c['num_units']:
+                     for e in c['embedding_dim']:
+                         for ep in c['epochs']:
+                             for bi in c['bi']:
+                                 yield {'window': int(w), 'model': m, 'dim': int(e), 'batch': int(c['batch']),
+                                        'epochs': int(ep), 'num_units': int(n), 'bidi': bi}
+
+     def d2v_m2v_setups(self, model):
+         c = self.models_config[model]
+         for w in c['window']:
+             for sample in c['negative_sample']:
+                 for down in c['down_sample']:
+                     for lr in c['learning_rate']:
+                         for ep in c['epochs']:
+                             for dim in c['embedding_dim']:
+                                 yield {'window': w, 'dim': int(dim), 'lr': float(lr), 'down': float(down), 'epochs': int(ep), 'neg_sample': float(sample)}
+
+     def glove_setups(self):
+         c = self.models_config['glove']
+         for w in c['window']:
+             for dim in c['embedding_dim']:
+                 for lr in c['learning_rate']:
+                     for ep in c['epochs']:
+                         yield {'window': int(w), 'dim': int(dim), 'lr': float(lr), 'epochs': int(ep)}
+
+     def genre_setups(self):
+         c = self.models_config['genres']
+         for a in c['all']:
+             yield '{}-{}'.format(a, 'all')
+         for r in c['ran']:
+             yield '{}-{}'.format(r, 'ran')
+
+     def __return_gen(self, model):
+         if model == 'rnn':
+             return self.rnn_setups()
+         if model in ('music2vec', 'doc2vec'):
+             return self.d2v_m2v_setups(model)
+         if model == 'glove':
+             return self.glove_setups()
+         if model == 'genres':
+             return self.genre_setups()
+
+     def get_generators(self):
+         generators = []
+         for k, v in self.__config['embeddings'].items():
+             if v['usage']:
+                 generators.append((k, self.__return_gen(k)))
+         return generators
+
+     def setup_to_string(self, id, setup_obj, model_type):
+         setup_str = '--'.join([x + ':' + str(y) for x, y in list(setup_obj.items())])
+         return '{}--{}--{}'.format(model_type, id, setup_str)
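As an illustration, this is the shape of config that `Setups` appears to expect. In the repository the config presumably comes from a YAML file; every key below is an assumption inferred from how the generators read it:

```
from project.models.setups import Setups

# Hypothetical config: keys inferred from rnn_setups and get_generators.
config = {
    'models': {
        'rnn': {
            'model': ['LSTM', 'GRU'],
            'window': [5],
            'num_units': [128],
            'embedding_dim': [128],
            'epochs': [50],
            'bi': [False],
            'batch': 128,
        },
    },
    'embeddings': {
        # Only methods whose 'usage' flag is true get a generator.
        'rnn': {'usage': True},
    },
}

s = Setups(config)
for name, gen in s.get_generators():
    for i, setup in enumerate(gen):
        print(s.setup_to_string(i, setup, name))
        # e.g. rnn--0--window:5--model:LSTM--dim:128--batch:128--epochs:50--num_units:128--bidi:False
```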
project/recsys/algorithms.py ADDED
@@ -0,0 +1,92 @@
+ import os
+ import sys
+ import time
+ import yaml
+ import pickle
+ import multiprocessing as mp
+ import numpy as np
+ from project.evaluation.metrics import get_metrics
+ from datetime import datetime
+ from sklearn.metrics.pairwise import cosine_similarity
+
+
+ def write_rec(pwd, sessions):
+     f = open(pwd, 'wb')
+     pickle.dump(sessions, f, protocol=pickle.HIGHEST_PROTOCOL)
+     f.close()
+
+ def recs(session, original, mtn_rec, smtn_rec, csmtn_rec, csmuk_rec):
+     return {'session': session, 'original': original, 'mtn_rec': mtn_rec.tolist(), 'smtn_rec': smtn_rec.tolist(),
+             'csmtn_rec': csmtn_rec.tolist(), 'csmuk_rec': csmuk_rec.tolist()}
+
+ def execute_algo(train, test, songs, topN, k_sim, data, pwd):
+
+     m2vTN = []
+     sm2vTN = []
+     csm2vTN = []
+     csm2vUK = []
+
+     u_songs = data.us_matrix()
+     users = data.uu_matrix()
+
+     def report_users(num_users):
+         def f_aux(ix_user, user_id, algo):
+             return '[{}/{}] Running algorithm {} for user {}!'.format(ix_user, num_users, algo, user_id)
+         return f_aux
+
+     num_users = len(test)
+     rep = report_users(num_users)
+     u = 1
+
+     def pref(u, k_similar, song):
+         # Similarity-weighted preference of user u for a song, based on which
+         # of the k most similar users have listened to it.
+         listened_to = [(k, u_songs[k, data.song_ix(song)] == 1) for k in k_similar]
+         num_listeners = [v[1] for v in listened_to].count(True)
+         sum_sims = 0
+         for u_k, listen in listened_to:
+             if listen:
+                 sum_sims += users[u][u_k] / num_listeners
+         return sum_sims
+
+
+     for user in test:
+         # Create a placeholder file for this user; it is overwritten by
+         # write_rec once all of the user's sessions have been processed.
+         f = open(pwd + '/' + user.replace('/', '_'), 'wb')
+         pickle.dump({}, f, protocol=pickle.HIGHEST_PROTOCOL)
+         f.close()
+
+         print(rep(u, user, 'M-TN'), flush=True, end='\r')  # flush so the '\r' progress line updates in place
+         user_cos = cosine_similarity(data.u_pref(user).reshape(1, -1), data.m2v_songs)[0]
+         user_tn = data.get_n_largest(user_cos, topN)
+
+         sim_ix = np.argpartition(users[data.ix_user(user)], -k_sim)[-k_sim:]
+         song_sim = np.array([pref(data.ix_user(user), sim_ix, s) for s in songs.index.values])
+         to_write = []
+         s = 1
+
+         sessions = data.user_sessions(user)
+         for (train_songs, test_songs) in sessions:
+             if len(train_songs) > 0:
+                 m2vTN.append(get_metrics(user_tn, test_songs))
+                 c_pref = data.c_pref(train_songs)
+
+                 print(rep(u, user, 'SM-TN'), flush=True, end='\r')
+                 con_cos = cosine_similarity(c_pref.reshape(1, -1), data.sm2v_songs)[0]
+                 cos_tn = data.get_n_largest(con_cos, topN)
+                 sm2vTN.append(get_metrics(cos_tn, test_songs))
+
+                 print(rep(u, user, 'CSM-TN'), flush=True, end='\r')
+                 f_cos = np.sum([user_cos, con_cos], axis=0)
+                 both_tn = data.get_n_largest(f_cos, topN)
+                 csm2vTN.append(get_metrics(both_tn, test_songs))
+
+                 print(rep(u, user, 'CSM-UK'), flush=True, end='\r')
+                 UK_cos = np.sum([song_sim, con_cos], axis=0)
+                 uk_tn = data.get_n_largest(UK_cos, topN)
+                 csm2vUK.append(get_metrics(uk_tn, test_songs))
+                 to_write.append(recs(s, test_songs, user_tn, cos_tn, both_tn, uk_tn))
+             s += 1
+         write_rec(pwd + '/' + user.replace('/', '_'), to_write)
+         u += 1
+
+     m_m2vTN = np.mean(m2vTN, axis=0).tolist()
+     m_sm2vTN = np.mean(sm2vTN, axis=0).tolist()
+     m_csm2vTN = np.mean(csm2vTN, axis=0).tolist()
+     m_csm2vUK = np.mean(csm2vUK, axis=0).tolist()
+     return (m_m2vTN, m_sm2vTN, m_csm2vTN, m_csm2vUK)
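All four recommenders above share one primitive: score every song against a preference vector with cosine similarity and keep the N best. A self-contained sketch of that step, with random stand-in embeddings:

```
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.RandomState(0)
song_vecs = rng.rand(1000, 128)    # one embedding per song (stand-in data)
user_pref = rng.rand(128)          # e.g. the mean of the user's song embeddings

# Cosine similarity between the preference vector and every song at once.
scores = cosine_similarity(user_pref.reshape(1, -1), song_vecs)[0]

# argpartition places the topN largest scores in the last topN slots without
# a full sort, which is all a top-N recommender needs.
topN = 10
top_ix = np.argpartition(scores, -topN)[-topN:]
print(top_ix)
```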
project/recsys/helper.py ADDED
@@ -0,0 +1,92 @@
+ import os
+ import numpy as np
+ import math
+ from sklearn.metrics.pairwise import cosine_similarity
+ import warnings
+
+ class Helper():
+     def __init__(self, train, test, songs, ds):
+         self.ds = ds
+         self.train = train
+         self.test = test
+         self.songs = songs
+         self.m2v_songs = self.songs.m2v.tolist()
+         self.sm2v_songs = self.songs.sm2v.tolist()
+         self.songs_ix = {v: k for k, v in enumerate(songs.index, 0)}
+         self.ix_songs = {k: v for k, v in enumerate(songs.index, 0)}
+         self.ix_users = {v: k for k, v in enumerate(np.concatenate([train.index.values, test.index.values]).tolist(), 0)}
+         self.num_users = len(self.ix_users)
+         self.num_songs = len(songs.index)
+         self.ix_pref = {v: self.u_pref(k) for (k, v) in self.ix_users.items()}
+         self.ix_u_songs = {v: self.unique_songs(k) for (k, v) in self.ix_users.items()}
+
+     def user_sessions(self, user):
+         # Split each test session in half: the first half is the known context,
+         # the second half is the ground truth to predict.
+         history = self.test.loc[user, 'history']
+         return [(s[:len(s)//2], s[len(s)//2:]) for s in history]
+
+     def song_ix(self, song):
+         return self.songs_ix[song]
+
+     def ix_user(self, ix):
+         return self.ix_users[ix]
+
+     def unique_songs(self, user):
+         if user in self.train.index:
+             history = self.train[self.train.index == user]['history'].values[0]
+         if user in self.test.index:
+             history = self.test[self.test.index == user]['history'].values[0]
+         flat_history = [song for session in history for song in session]
+         unique_songs = list(set(flat_history))
+         return unique_songs
+
+     def u_pref(self, user):
+         if user in self.train.index:
+             history = self.train[self.train.index == user]['history'].values[0]
+         if user in self.test.index:
+             history = self.test[self.test.index == user]['history'].values[0]
+         # Only the first half of each session is treated as known listening.
+         history = [s[:len(s)//2] for s in history]
+         flat_history = [song for session in history for song in session]
+         flat_history = [self.songs.loc[song, 'm2v'] for song in flat_history]
+         mean = np.mean(flat_history, axis=0)
+         return mean
+
+     def c_pref(self, songs):
+         flat_vecs = self.songs.loc[songs, 'sm2v'].tolist()
+         return np.mean(np.array(flat_vecs), axis=0)
+
+     def get_n_largest(self, cos, n):
+         songs = self.songs.index.values
+         index = np.argpartition(cos, -n)[-n:]
+         return songs[index]
+
+     def uu_matrix(self):
+         if os.path.isfile('tmp/{}/matrix_users.npy'.format(self.ds)):
+             return np.load('tmp/{}/matrix_users.npy'.format(self.ds))
+
+         matrix_users = np.zeros((self.num_users, self.num_users))
+
+         # The stacked preference matrix does not depend on ix, so build it once.
+         u_array = np.array([self.ix_pref[i] for i in range(self.num_users)])
+         for ix in range(self.num_users):
+             y_array = np.zeros(self.num_users)
+             for j in range(self.num_users):
+                 y_array[j] = math.sqrt(len(self.ix_u_songs[ix]) + len(self.ix_u_songs[j]))
+             cos = cosine_similarity(self.ix_pref[ix].reshape(1, -1), u_array)
+             val = np.sum([cos, y_array], axis=0)
+             matrix_users[ix] = np.divide(np.ones(val.shape), val)
+         np.save('tmp/{}/matrix_users'.format(self.ds), matrix_users)
+         return matrix_users
+
+     def us_matrix(self):
+         if os.path.isfile('tmp/{}/matrix_user_songs.npy'.format(self.ds)):
+             return np.load('tmp/{}/matrix_user_songs.npy'.format(self.ds))
+
+         # Binary user x song matrix: 1 if the user has the song in their history.
+         matrix_u_songs = np.zeros((self.num_users, self.num_songs))
+         for u in list(self.ix_u_songs.keys()):
+             songs = self.ix_u_songs[u]
+             songs_ids = [self.songs_ix[s] for s in songs]
+             y_array = np.zeros(self.num_songs)
+             y_array[songs_ids] = 1
+             matrix_u_songs[u] = y_array
+         np.save('tmp/{}/matrix_user_songs'.format(self.ds), matrix_u_songs)
+         return matrix_u_songs
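To make the expected data layout concrete, here is a toy usage sketch of `Helper`. The DataFrame shapes are inferred from the code above, not documented: songs are indexed by id with one vector column per embedding model, and users are indexed by id with a 'history' column holding a list of sessions, each a list of song ids.

```
import numpy as np
import pandas as pd
from project.recsys.helper import Helper

songs = pd.DataFrame({
    'm2v':  [np.array([1.0, 0.0]), np.array([0.0, 1.0]), np.array([1.0, 1.0])],
    'sm2v': [np.array([1.0, 0.0]), np.array([0.0, 1.0]), np.array([1.0, 1.0])],
}, index=['s1', 's2', 's3'])

# One train user with two sessions, one test user with a single session.
train = pd.DataFrame({'history': [[['s1', 's2'], ['s3', 's1']]]}, index=['u1'])
test = pd.DataFrame({'history': [[['s2', 's3']]]}, index=['u2'])

h = Helper(train, test, songs, ds='toy')
print(h.user_sessions('u2'))   # [(['s2'], ['s3'])]: known half vs. ground truth
print(h.u_pref('u1'))          # mean m2v vector over the first half of each session
print(h.get_n_largest(np.array([0.1, 0.9, 0.5]), 2))  # the 2 best-scored song ids
```

Note that uu_matrix and us_matrix cache their results under tmp/<ds>/, so a tmp/toy directory would need to exist before calling them.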