amirhosseinkarami committed on
Commit 6879b6f • 1 Parent(s): 6b2cf04

Added codes

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
App/app.py ADDED
File without changes
App/tfidfrecommender.py ADDED
@@ -0,0 +1,204 @@
+ # -*- coding: utf-8 -*-
+ """TfidfRecommender.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1pgFsGrn_MiauSCowY6fVgY1yq8vM3WRJ
+ """
+
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
+ from transformers import BertTokenizer
+ import re
+ import unicodedata
+ import pandas as pd
+ import numpy as np
+
+ import nltk
+ from nltk.stem.porter import PorterStemmer
+
+ class TfidfRecommender:
+     def __init__(self, df, id_col, text_col, tokenization_method):
+         """Initialize model parameters
+
+         Args:
+             id_col (str): Name of column containing item IDs.
+             tokenization_method (str): ['none','nltk','bert','scibert'] option for tokenization method.
+         """
+         self.id_col = id_col
+         self.text_col = text_col
+         self.df = df
+
+         if tokenization_method.lower() not in ["none", "nltk", "bert", "scibert"]:
+             raise ValueError(
+                 'Tokenization method must be one of ["none" | "nltk" | "bert" | "scibert"]'
+             )
+         self.tokenization_method = tokenization_method.lower()
+
+         # Initialize other variables used in this class
+         self.tf = TfidfVectorizer()
+         self.tfidf_matrix = dict()
+         self.tokens = dict()
+         self.stop_words = frozenset()
+         self.recommendations = dict()
+         self.top_k_recommendations = pd.DataFrame()
+
+     def __clean_text(self, text, for_Bert=False, verbose=False):
+         try:
+             # Remove new lines and tabs
+             clean = text.replace("\n", " ")
+             clean = clean.replace("\t", " ")
+             clean = clean.replace("\r", " ")
+             clean = clean.replace("Â\xa0", "")  # non-breaking space
+
+             # Remove all punctuation and special characters
+             # clean = re.sub(
+             #     r"([^\s\w]|_)+", "", clean
+             # )  # noqa W695 invalid escape sequence '\s'
+
+             # If you want to keep some punctuation, see below commented out example
+             clean = re.sub(r'([^,.:\s\w\-]|_)+', '', clean)
+
+             # Skip further processing if the text will be used in BERT tokenization
+             if for_Bert is False:
+                 # Lower case
+                 clean = clean.lower()
+                 clean = re.sub(
+                     r"([^\s\w]|_)+", "", clean
+                 )
+         except Exception:
+             if verbose:
+                 print("Cannot clean non-existent text")
+             clean = ""
+
+         return clean
+
+     def _clean_df(self):
+         self.df = self.df.replace(np.nan, "", regex=True)
+         # df[new_col_name] = df[cols_to_clean].apply(lambda cols: " ".join(cols), axis=1)
+
+         # Check if for BERT tokenization
+         if self.tokenization_method in ["bert", "scibert"]:
+             for_BERT = True
+         else:
+             for_BERT = False
+
+         # Clean the text in the dataframe
+         self.df[self.text_col] = self.df[self.text_col].map(
+             lambda x: self.__clean_text(x, for_BERT)
+         )
+
+     def tokenize_text(self, ngram_range=(1, 3), min_df=0):
+         """Tokenize the input text.
+
+         Args:
+             df_clean (pandas.DataFrame): Dataframe with cleaned text in the new column.
+             text_col (str): Name of column containing the cleaned text.
+             ngram_range (tuple of int): The lower and upper boundary of the range of n-values for different n-grams to be extracted.
+             min_df (int): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
+
+         Returns:
+             TfidfVectorizer, pandas.Series:
+             - Scikit-learn TfidfVectorizer object defined in `.tokenize_text()`.
+             - Each row contains tokens for respective documents separated by spaces.
+         """
+         self._clean_df()
+         vectors = self.df[self.text_col]
+
+         if self.tokenization_method in ["bert", "scibert"]:
+             # vectorizer
+             tf = TfidfVectorizer(
+                 analyzer="word",
+                 ngram_range=ngram_range,
+                 min_df=min_df,
+                 stop_words="english",
+             )
+
+             if self.tokenization_method == "bert":
+                 bert_method = "bert-base-cased"
+             elif self.tokenization_method == "scibert":
+                 bert_method = "allenai/scibert_scivocab_cased"
+
+             # Load pre-trained bert model (vocabulary)
+             tokenizer = BertTokenizer.from_pretrained(bert_method)
+
+             # tokenization
+             vectors_tokenized = vectors.copy()
+             for i in range(0, len(vectors)):
+                 vectors_tokenized[i] = " ".join(tokenizer.tokenize(vectors[i]))
+
+         elif self.tokenization_method == "nltk":
+             # NLTK Stemming
+             token_dict = {}  # noqa: F841
+             stemmer = PorterStemmer()
+
+             def stem_tokens(tokens, stemmer):
+                 stemmed = []
+                 for item in tokens:
+                     stemmed.append(stemmer.stem(item))
+                 return stemmed
+
+             def tokenize(text):
+                 tokens = nltk.word_tokenize(text)
+                 stems = stem_tokens(tokens, stemmer)
+                 return stems
+
+             # The tokenization using a custom tokenizer is applied in the fit function
+             tf = TfidfVectorizer(
+                 tokenizer=tokenize,
+                 analyzer="word",
+                 ngram_range=ngram_range,
+                 min_df=min_df,
+                 stop_words="english",
+             )
+             vectors_tokenized = vectors
+
+         elif self.tokenization_method == "none":
+             # No tokenization applied
+             tf = TfidfVectorizer(
+                 analyzer="word",
+                 ngram_range=ngram_range,
+                 min_df=min_df,
+                 stop_words="english",
+             )
+             vectors_tokenized = vectors
+
+         # Save to class variable
+         self.tf = tf
+
+         return tf, vectors_tokenized
+
+
+     def fit(self, tf, vectors_tokenized):
+         self.tfidf_matrix = tf.fit_transform(vectors_tokenized)
+
+     def get_tokens(self):
+         try:
+             self.tokens = self.tf.vocabulary_
+         except Exception:
+             self.tokens = "Run .tokenize_text() and .fit() first"
+         return self.tokens
+
+     def get_stop_words(self):
+         try:
+             self.stop_words = self.tf.get_stop_words()
+         except Exception:
+             self.stop_words = "Run .tokenize_text() and .fit() first"
+         return self.stop_words
+
+     def recommend_k_items(self, title, k):
+         idx = self.df[self.df['title'] == title].index[0]
+         cosine_sim = cosine_similarity(self.tfidf_matrix[int(idx)], self.tfidf_matrix)
+         similarity_scores = list(enumerate(cosine_sim[0]))
+         similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
+         similarity_scores = similarity_scores[1: k + 1]
+         movie_indices = [i[0] for i in similarity_scores]
+         return self.df.iloc[movie_indices]['id']
+
+ d = pd.read_csv('/content/drive/MyDrive/Rec/data/cleaned/descriptions.csv')
+ model = TfidfRecommender(d, 'id', 'description', 'bert')
+ tf, vec = model.tokenize_text()
+ model.fit(tf, vec)
+ model.recommend_k_items('Toy Story', 5)
+
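The module above ends with example lines that read descriptions.csv from a Colab drive path at import time, so a quick way to exercise the class elsewhere is a small self-contained smoke test. The following is a minimal sketch, not part of the commit: the toy DataFrame and its values are illustrative assumptions, tokenization_method="none" is chosen only to avoid downloading a BERT tokenizer, and it presumes the module-level example lines are removed or guarded before TfidfRecommender is imported into the session.

# Hypothetical smoke test for TfidfRecommender (illustrative only, not part of the commit).
# Assumes the TfidfRecommender class from App/tfidfrecommender.py is already defined/imported.
import pandas as pd

toy = pd.DataFrame({
    "id": [862, 8844, 15602],
    "title": ["Toy Story", "Jumanji", "Grumpier Old Men"],
    "description": [
        "Title: Toy Story. Genres: Animation, Comedy, Family. Overview: toys come to life.",
        "Title: Jumanji. Genres: Adventure, Fantasy, Family. Overview: a magical board game.",
        "Title: Grumpier Old Men. Genres: Romance, Comedy. Overview: an old feud reignites.",
    ],
})

# "none" skips BERT/NLTK tokenization; "bert" mirrors the usage at the bottom of the module.
model = TfidfRecommender(toy, "id", "description", "none")
tf, vec = model.tokenize_text(ngram_range=(1, 2), min_df=1)
model.fit(tf, vec)
print(model.recommend_k_items("Toy Story", 2))  # ids of the two most similar items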
Data_Analysis/ColaborativeFiltering.ipynb ADDED
@@ -0,0 +1 @@
+ {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyO0vypYbutZtCfRSUrje+SF"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"WySw99phdaSV"},"outputs":[],"source":["from surprise import KNNWithMeans, SVD\n","from surprise import Dataset, Reader\n","from surprise.model_selection import GridSearchCV, train_test_split\n","from surprise.accuracy import rmse\n","\n","class SVD :\n"," def __init__(self) :\n"]}]}
Data_Analysis/metadata_information_extraction.ipynb ADDED
@@ -0,0 +1 @@
+ {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"collapsed_sections":["fQcMREQvThwU","ugf5R7Ihi2eU","fmm6lJZH27-5","8SMtZaf6EkMD","uYO9OW7sXZMF"],"mount_file_id":"1nkLBMUOcoheh7EH5xe3uev4aZqAvPdVG","authorship_tag":"ABX9TyNIByCmZPZzgkENHepyEbKv"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# Initial instructions"],"metadata":{"id":"fQcMREQvThwU"}},{"cell_type":"code","source":["! pip install kaggle"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"opfjSdEKbA6q","executionInfo":{"status":"ok","timestamp":1688566779143,"user_tz":-210,"elapsed":4727,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"9a722f41-7e7f-48e5-b73a-a6d3766296e4"},"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: kaggle in /usr/local/lib/python3.10/dist-packages (1.5.13)\n","Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.10/dist-packages (from kaggle) (1.16.0)\n","Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from kaggle) (2023.5.7)\n","Requirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.8.2)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.27.1)\n","Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from kaggle) (4.65.0)\n","Requirement already satisfied: python-slugify in /usr/local/lib/python3.10/dist-packages (from kaggle) (8.0.1)\n","Requirement already satisfied: urllib3 in /usr/local/lib/python3.10/dist-packages (from kaggle) (1.26.16)\n","Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.10/dist-packages (from python-slugify->kaggle) (1.3)\n","Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->kaggle) (2.0.12)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->kaggle) (3.4)\n"]}]},{"cell_type":"code","source":["from google.colab import files"],"metadata":{"id":"9gjOv_WUJBzz","executionInfo":{"status":"ok","timestamp":1688566817902,"user_tz":-210,"elapsed":467,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["files.upload()\n","! mkdir ~/.kaggle\n","! cp kaggle.json ~/.kaggle/\n","! chmod 600 ~/.kaggle/kaggle.json\n","! kaggle datasets download -d rounakbanik/the-movies-dataset"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":401},"id":"cQoNEnDDJOvf","executionInfo":{"status":"error","timestamp":1688566832628,"user_tz":-210,"elapsed":12771,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"26b37601-be8f-4ba9-a01c-1e21bba10e66"},"execution_count":3,"outputs":[{"output_type":"display_data","data":{"text/plain":["<IPython.core.display.HTML object>"],"text/html":["\n"," <input type=\"file\" id=\"files-b2ffe85c-4b2c-4a7a-82b0-5fa9df40ca2d\" name=\"files[]\" multiple disabled\n"," style=\"border:none\" />\n"," <output id=\"result-b2ffe85c-4b2c-4a7a-82b0-5fa9df40ca2d\">\n"," Upload widget is only available when the cell has been executed in the\n"," current browser session. 
Please rerun this cell to enable.\n"," </output>\n"," <script>// Copyright 2017 Google LLC\n","//\n","// Licensed under the Apache License, Version 2.0 (the \"License\");\n","// you may not use this file except in compliance with the License.\n","// You may obtain a copy of the License at\n","//\n","// http://www.apache.org/licenses/LICENSE-2.0\n","//\n","// Unless required by applicable law or agreed to in writing, software\n","// distributed under the License is distributed on an \"AS IS\" BASIS,\n","// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n","// See the License for the specific language governing permissions and\n","// limitations under the License.\n","\n","/**\n"," * @fileoverview Helpers for google.colab Python module.\n"," */\n","(function(scope) {\n","function span(text, styleAttributes = {}) {\n"," const element = document.createElement('span');\n"," element.textContent = text;\n"," for (const key of Object.keys(styleAttributes)) {\n"," element.style[key] = styleAttributes[key];\n"," }\n"," return element;\n","}\n","\n","// Max number of bytes which will be uploaded at a time.\n","const MAX_PAYLOAD_SIZE = 100 * 1024;\n","\n","function _uploadFiles(inputId, outputId) {\n"," const steps = uploadFilesStep(inputId, outputId);\n"," const outputElement = document.getElementById(outputId);\n"," // Cache steps on the outputElement to make it available for the next call\n"," // to uploadFilesContinue from Python.\n"," outputElement.steps = steps;\n","\n"," return _uploadFilesContinue(outputId);\n","}\n","\n","// This is roughly an async generator (not supported in the browser yet),\n","// where there are multiple asynchronous steps and the Python side is going\n","// to poll for completion of each step.\n","// This uses a Promise to block the python side on completion of each step,\n","// then passes the result of the previous step as the input to the next step.\n","function _uploadFilesContinue(outputId) {\n"," const outputElement = document.getElementById(outputId);\n"," const steps = outputElement.steps;\n","\n"," const next = steps.next(outputElement.lastPromiseValue);\n"," return Promise.resolve(next.value.promise).then((value) => {\n"," // Cache the last promise value to make it available to the next\n"," // step of the generator.\n"," outputElement.lastPromiseValue = value;\n"," return next.value.response;\n"," });\n","}\n","\n","/**\n"," * Generator function which is called between each async step of the upload\n"," * process.\n"," * @param {string} inputId Element ID of the input file picker element.\n"," * @param {string} outputId Element ID of the output display.\n"," * @return {!Iterable<!Object>} Iterable of next steps.\n"," */\n","function* uploadFilesStep(inputId, outputId) {\n"," const inputElement = document.getElementById(inputId);\n"," inputElement.disabled = false;\n","\n"," const outputElement = document.getElementById(outputId);\n"," outputElement.innerHTML = '';\n","\n"," const pickedPromise = new Promise((resolve) => {\n"," inputElement.addEventListener('change', (e) => {\n"," resolve(e.target.files);\n"," });\n"," });\n","\n"," const cancel = document.createElement('button');\n"," inputElement.parentElement.appendChild(cancel);\n"," cancel.textContent = 'Cancel upload';\n"," const cancelPromise = new Promise((resolve) => {\n"," cancel.onclick = () => {\n"," resolve(null);\n"," };\n"," });\n","\n"," // Wait for the user to pick the files.\n"," const files = yield {\n"," promise: Promise.race([pickedPromise, cancelPromise]),\n"," 
response: {\n"," action: 'starting',\n"," }\n"," };\n","\n"," cancel.remove();\n","\n"," // Disable the input element since further picks are not allowed.\n"," inputElement.disabled = true;\n","\n"," if (!files) {\n"," return {\n"," response: {\n"," action: 'complete',\n"," }\n"," };\n"," }\n","\n"," for (const file of files) {\n"," const li = document.createElement('li');\n"," li.append(span(file.name, {fontWeight: 'bold'}));\n"," li.append(span(\n"," `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n"," `last modified: ${\n"," file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n"," 'n/a'} - `));\n"," const percent = span('0% done');\n"," li.appendChild(percent);\n","\n"," outputElement.appendChild(li);\n","\n"," const fileDataPromise = new Promise((resolve) => {\n"," const reader = new FileReader();\n"," reader.onload = (e) => {\n"," resolve(e.target.result);\n"," };\n"," reader.readAsArrayBuffer(file);\n"," });\n"," // Wait for the data to be ready.\n"," let fileData = yield {\n"," promise: fileDataPromise,\n"," response: {\n"," action: 'continue',\n"," }\n"," };\n","\n"," // Use a chunked sending to avoid message size limits. See b/62115660.\n"," let position = 0;\n"," do {\n"," const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n"," const chunk = new Uint8Array(fileData, position, length);\n"," position += length;\n","\n"," const base64 = btoa(String.fromCharCode.apply(null, chunk));\n"," yield {\n"," response: {\n"," action: 'append',\n"," file: file.name,\n"," data: base64,\n"," },\n"," };\n","\n"," let percentDone = fileData.byteLength === 0 ?\n"," 100 :\n"," Math.round((position / fileData.byteLength) * 100);\n"," percent.textContent = `${percentDone}% done`;\n","\n"," } while (position < fileData.byteLength);\n"," }\n","\n"," // All done.\n"," yield {\n"," response: {\n"," action: 'complete',\n"," }\n"," };\n","}\n","\n","scope.google = scope.google || {};\n","scope.google.colab = scope.google.colab || {};\n","scope.google.colab._files = {\n"," _uploadFiles,\n"," _uploadFilesContinue,\n","};\n","})(self);\n","</script> "]},"metadata":{}},{"output_type":"error","ename":"KeyboardInterrupt","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)","\u001b[0;32m<ipython-input-3-dd9d5815f8a5>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfiles\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' mkdir ~/.kaggle'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' cp kaggle.json ~/.kaggle/'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' chmod 600 
~/.kaggle/kaggle.json'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' kaggle datasets download -d rounakbanik/the-movies-dataset'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/google/colab/files.py\u001b[0m in \u001b[0;36mupload\u001b[0;34m()\u001b[0m\n\u001b[1;32m 67\u001b[0m \"\"\"\n\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m \u001b[0muploaded_files\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_upload_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmultiple\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 70\u001b[0m \u001b[0;31m# Mapping from original filename to filename as saved locally.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0mlocal_filenames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/google/colab/files.py\u001b[0m in \u001b[0;36m_upload_files\u001b[0;34m(multiple)\u001b[0m\n\u001b[1;32m 151\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[0;31m# First result is always an indication that the file picker has completed.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 153\u001b[0;31m result = _output.eval_js(\n\u001b[0m\u001b[1;32m 154\u001b[0m 'google.colab._files._uploadFiles(\"{input_id}\", \"{output_id}\")'.format(\n\u001b[1;32m 155\u001b[0m \u001b[0minput_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minput_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_id\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/google/colab/output/_js.py\u001b[0m in \u001b[0;36meval_js\u001b[0;34m(script, ignore_result, timeout_sec)\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mignore_result\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_message\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_reply_from_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout_sec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 41\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/google/colab/_message.py\u001b[0m in \u001b[0;36mread_reply_from_input\u001b[0;34m(message_id, timeout_sec)\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_next_input_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0m_NOT_READY\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;32mnot\u001b[0m 
\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreply\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0.025\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m if (\n","\u001b[0;31mKeyboardInterrupt\u001b[0m: "]}]},{"cell_type":"code","source":["import os\n","if not os.path.isdir('/content/data/cleaned') :\n"," os.mkdir('/content/data/cleaned')"],"metadata":{"id":"HaEx9oF3Li7f"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["! unzip '/content/the-movies-dataset.zip' -d '/content/data/raw'"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pLeGJcfvLwwJ","executionInfo":{"status":"ok","timestamp":1688546148833,"user_tz":-210,"elapsed":10746,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"debf1aaf-4932-47f9-b076-49457210b967"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Archive: /content/the-movies-dataset.zip\n"," inflating: /content/data/raw/credits.csv \n"," inflating: /content/data/raw/keywords.csv \n"," inflating: /content/data/raw/links.csv \n"," inflating: /content/data/raw/links_small.csv \n"," inflating: /content/data/raw/movies_metadata.csv \n"," inflating: /content/data/raw/ratings.csv \n"," inflating: /content/data/raw/ratings_small.csv \n"]}]},{"cell_type":"markdown","source":["# Needed Imports"],"metadata":{"id":"a0M_2XYkbY_O"}},{"cell_type":"code","source":["import numpy as np\n","import pandas as pd\n","import matplotlib.pyplot as plt\n","import re"],"metadata":{"id":"z2eLwbMCbbxB","executionInfo":{"status":"ok","timestamp":1688584736915,"user_tz":-210,"elapsed":1969,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":4,"outputs":[]},{"cell_type":"markdown","source":["# Load datasets"],"metadata":{"id":"ugf5R7Ihi2eU"}},{"cell_type":"code","source":["metadata = pd.read_csv('/content/drive/MyDrive/Rec/data/cleaned/clean_metadata.csv')\n","credits = pd.read_csv('/content/drive/MyDrive/Rec/data/cleaned/clean_credits.csv')\n","keywords = pd.read_csv('/content/drive/MyDrive/Rec/data/cleaned/clean_keywords.csv')\n","links = pd.read_csv('/content/drive/MyDrive/Rec/data/raw/links.csv')\n","links = links[links['tmdbId'].notnull()]['tmdbId'].astype('int')"],"metadata":{"id":"pW6Sw6dCi4aX","executionInfo":{"status":"ok","timestamp":1688574375594,"user_tz":-210,"elapsed":3626,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":28,"outputs":[]},{"cell_type":"code","source":["print('shape: {}'.format(metadata.shape))\n","print('columns: \\n {}'.format(metadata.columns))\n","metadata.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":723},"id":"N_nTAGZUjZTc","executionInfo":{"status":"ok","timestamp":1688571998020,"user_tz":-210,"elapsed":809,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"3d810160-7989-4473-f969-bddaeea13eb7"},"execution_count":22,"outputs":[{"output_type":"stream","name":"stdout","text":["shape: (45447, 30)\n","columns: \n"," Index(['adult', 'budget', 'homepage', 'id', 'imdb_id', 
'original_language',\n"," 'original_title', 'overview', 'popularity', 'poster_path',\n"," 'production_companies', 'release_date', 'revenue', 'runtime', 'status',\n"," 'tagline', 'title', 'video', 'vote_average', 'vote_count',\n"," 'name_belongs_to_collection', 'id_belongs_to_collection',\n"," 'poster_path_belongs_to_collection',\n"," 'backdrop_path_belongs_to_collection', 'name_genres', 'id_genres',\n"," 'name_production_countries', 'iso_3166_1_production_companies',\n"," 'name_production_companies', 'id_production_companies'],\n"," dtype='object')\n"]},{"output_type":"execute_result","data":{"text/plain":[" adult budget homepage id imdb_id \\\n","0 False 30000000 http://toystory.disney.com/toy-story 862 tt0114709 \n","1 False 65000000 NaN 8844 tt0113497 \n","2 False 0 NaN 15602 tt0113228 \n","\n"," original_language original_title \\\n","0 en Toy Story \n","1 en Jumanji \n","2 en Grumpier Old Men \n","\n"," overview popularity \\\n","0 Led by Woody, Andy's toys live happily in his ... 21.946943 \n","1 When siblings Judy and Peter discover an encha... 17.015539 \n","2 A family wedding reignites the ancient feud be... 11.712900 \n","\n"," poster_path ... name_belongs_to_collection \\\n","0 /rhIRbceoE9lR4veEXuwCC2wARtG.jpg ... Toy Story Collection \n","1 /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg ... NaN \n","2 /6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg ... Grumpy Old Men Collection \n","\n"," id_belongs_to_collection poster_path_belongs_to_collection \\\n","0 10194.0 /7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg \n","1 NaN NaN \n","2 119050.0 /nLvUdqgPgm3F85NMCii9gVFUcet.jpg \n","\n"," backdrop_path_belongs_to_collection name_genres \\\n","0 /9FBwqcd9IRruEDUrTdcaafOMKUq.jpg Animation, Comedy, Family \n","1 NaN Adventure, Fantasy, Family \n","2 /hypTnLot2z8wpFS7qwsQHW1uV8u.jpg Romance, Comedy \n","\n"," id_genres name_production_countries iso_3166_1_production_companies \\\n","0 16, 35, 10751 United States of America US \n","1 12, 14, 10751 United States of America US \n","2 10749, 35 United States of America US \n","\n"," name_production_companies id_production_companies \n","0 Pixar Animation Studios 3 \n","1 TriStar Pictures, Teitler Film, Interscope Com... 
559, 2550, 10201 \n","2 Warner Bros., Lancaster Gate 6194, 19464 \n","\n","[3 rows x 30 columns]"],"text/html":["\n"," <div id=\"df-3fa5403e-5fb8-409e-8a96-2f930c77c93e\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>adult</th>\n"," <th>budget</th>\n"," <th>homepage</th>\n"," <th>id</th>\n"," <th>imdb_id</th>\n"," <th>original_language</th>\n"," <th>original_title</th>\n"," <th>overview</th>\n"," <th>popularity</th>\n"," <th>poster_path</th>\n"," <th>...</th>\n"," <th>name_belongs_to_collection</th>\n"," <th>id_belongs_to_collection</th>\n"," <th>poster_path_belongs_to_collection</th>\n"," <th>backdrop_path_belongs_to_collection</th>\n"," <th>name_genres</th>\n"," <th>id_genres</th>\n"," <th>name_production_countries</th>\n"," <th>iso_3166_1_production_companies</th>\n"," <th>name_production_companies</th>\n"," <th>id_production_companies</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>False</td>\n"," <td>30000000</td>\n"," <td>http://toystory.disney.com/toy-story</td>\n"," <td>862</td>\n"," <td>tt0114709</td>\n"," <td>en</td>\n"," <td>Toy Story</td>\n"," <td>Led by Woody, Andy's toys live happily in his ...</td>\n"," <td>21.946943</td>\n"," <td>/rhIRbceoE9lR4veEXuwCC2wARtG.jpg</td>\n"," <td>...</td>\n"," <td>Toy Story Collection</td>\n"," <td>10194.0</td>\n"," <td>/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg</td>\n"," <td>/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg</td>\n"," <td>Animation, Comedy, Family</td>\n"," <td>16, 35, 10751</td>\n"," <td>United States of America</td>\n"," <td>US</td>\n"," <td>Pixar Animation Studios</td>\n"," <td>3</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>False</td>\n"," <td>65000000</td>\n"," <td>NaN</td>\n"," <td>8844</td>\n"," <td>tt0113497</td>\n"," <td>en</td>\n"," <td>Jumanji</td>\n"," <td>When siblings Judy and Peter discover an encha...</td>\n"," <td>17.015539</td>\n"," <td>/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg</td>\n"," <td>...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>Adventure, Fantasy, Family</td>\n"," <td>12, 14, 10751</td>\n"," <td>United States of America</td>\n"," <td>US</td>\n"," <td>TriStar Pictures, Teitler Film, Interscope Com...</td>\n"," <td>559, 2550, 10201</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>False</td>\n"," <td>0</td>\n"," <td>NaN</td>\n"," <td>15602</td>\n"," <td>tt0113228</td>\n"," <td>en</td>\n"," <td>Grumpier Old Men</td>\n"," <td>A family wedding reignites the ancient feud be...</td>\n"," <td>11.712900</td>\n"," <td>/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg</td>\n"," <td>...</td>\n"," <td>Grumpy Old Men Collection</td>\n"," <td>119050.0</td>\n"," <td>/nLvUdqgPgm3F85NMCii9gVFUcet.jpg</td>\n"," <td>/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg</td>\n"," <td>Romance, Comedy</td>\n"," <td>10749, 35</td>\n"," <td>United States of America</td>\n"," <td>US</td>\n"," <td>Warner Bros., Lancaster Gate</td>\n"," <td>6194, 19464</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>3 rows Γ— 30 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-3fa5403e-5fb8-409e-8a96-2f930c77c93e')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg 
xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-3fa5403e-5fb8-409e-8a96-2f930c77c93e button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-3fa5403e-5fb8-409e-8a96-2f930c77c93e');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":22}]},{"cell_type":"code","source":["a = metadata['original_language'].value_counts()\n","a[a>10]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QPmkUByIlo1G","executionInfo":{"status":"ok","timestamp":1688572010282,"user_tz":-210,"elapsed":438,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"de2a56b6-96a2-461c-f4bb-56bd552c714e"},"execution_count":23,"outputs":[{"output_type":"execute_result","data":{"text/plain":["en 32260\n","fr 2437\n","it 1529\n","ja 1349\n","de 1079\n","es 994\n","ru 826\n","hi 508\n","ko 444\n","zh 409\n","sv 384\n","pt 316\n","cn 313\n","fi 295\n","nl 248\n","da 224\n","pl 219\n","tr 150\n","cs 130\n","el 113\n","no 106\n","fa 101\n","hu 100\n","ta 78\n","th 75\n","he 67\n","sr 63\n","ro 57\n","te 45\n","ar 39\n","ml 36\n","xx 33\n","hr 29\n","bn 29\n","mr 25\n","et 24\n","is 24\n","tl 23\n","id 20\n","lv 18\n","ka 18\n","sl 17\n","uk 16\n","bs 14\n","ca 12\n","Name: original_language, dtype: int64"]},"metadata":{},"execution_count":23}]},{"cell_type":"markdown","source":["# Constructing the dataset used for item based recommendation"],"metadata":{"id":"fmm6lJZH27-5"}},{"cell_type":"code","source":["keywords['id'] = keywords['id'].astype('int')\n","credits['id'] = credits['id'].astype('int')"],"metadata":{"id":"qHk24Ai_l_tH","executionInfo":{"status":"ok","timestamp":1688574139845,"user_tz":-210,"elapsed":589,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":24,"outputs":[]},{"cell_type":"code","source":["metadata = metadata.merge(credits, on='id')\n","metadata = metadata.merge(keywords, on='id')"],"metadata":{"id":"2MNqsMX13JZJ","executionInfo":{"status":"ok","timestamp":1688574651033,"user_tz":-210,"elapsed":503,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":32,"outputs":[]},{"cell_type":"markdown","source":["## Use only the available movies in TMDB"],"metadata":{"id":"32SR5rG64Vxy"}},{"cell_type":"code","source":["rec_data = metadata[metadata['id'].isin(links)].copy()\n","rec_data.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"GLHc1UW-3NDu","executionInfo":{"status":"ok","timestamp":1688574670536,"user_tz":-210,"elapsed":593,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"f66c62c1-7700-4a92-e249-56e145731564"},"execution_count":33,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(45459, 45)"]},"metadata":{},"execution_count":33}]},{"cell_type":"code","source":["rec_data = rec_data.drop_duplicates(subset='id')"],"metadata":{"id":"hoaS3X9ma9F-","executionInfo":{"status":"ok","timestamp":1688583554915,"user_tz":-210,"elapsed":579,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":140,"outputs":[]},{"cell_type":"markdown","source":["## Adding 'director' column"],"metadata":{"id":"Ax8jTRG05rWa"}},{"cell_type":"code","source":["def is_float(string):\n"," 
try:\n"," float(string)\n"," return True\n"," except ValueError:\n"," return False"],"metadata":{"id":"JorU4WBB40kq","executionInfo":{"status":"ok","timestamp":1688574823282,"user_tz":-210,"elapsed":505,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":34,"outputs":[]},{"cell_type":"code","source":["def directors_names(job_crew, index):\n"," if not isinstance(job_crew, float):\n"," jobs = job_crew.split(', ')\n"," director_indices = [i for i, job in enumerate(jobs) if job == 'Director']\n"," if director_indices:\n"," names = rec_data.loc[index, 'name_crew']\n"," if not isinstance(names, float):\n"," names = names.split(', ')\n"," director_names = [names[i] for i in director_indices]\n"," return ', '.join(director_names)\n"," return np.nan"],"metadata":{"id":"4Wv7jSsX5wS7","executionInfo":{"status":"ok","timestamp":1688575844013,"user_tz":-210,"elapsed":467,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":37,"outputs":[]},{"cell_type":"code","source":["for i in rec_data.index:\n"," rec_data.loc[i, 'director'] = directors_names(rec_data.loc[i, 'job_crew'], i)\n","rec_data['director']"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"RJZBCGZS62pU","executionInfo":{"status":"ok","timestamp":1688577018900,"user_tz":-210,"elapsed":7640,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"9e4659dd-06ad-4334-9192-53b2f2f33640"},"execution_count":38,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0 John Lasseter\n","1 Joe Johnston\n","2 Howard Deutch\n","3 Forest Whitaker\n","4 Charles Shyer\n"," ... \n","45454 Hamid Nematollah\n","45455 Lav Diaz\n","45456 Mark L. Lester\n","45457 Yakov Protazanov\n","45458 Daisy Asquith\n","Name: director, Length: 45459, dtype: object"]},"metadata":{},"execution_count":38}]},{"cell_type":"markdown","source":["# Generating the final dataset"],"metadata":{"id":"8SMtZaf6EkMD"}},{"cell_type":"code","source":["rec_data = rec_data [['id', 'original_language', 'overview','tagline', 'title', 'name_genres', 'name_cast', 'name_keywords', 'director']]"],"metadata":{"id":"DhM20iQwHV_1","executionInfo":{"status":"ok","timestamp":1688579331516,"user_tz":-210,"elapsed":613,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":84,"outputs":[]},{"cell_type":"code","source":["rec_data = rec_data.replace(np.nan, '')"],"metadata":{"id":"yawZztUkFTD1","executionInfo":{"status":"ok","timestamp":1688580086363,"user_tz":-210,"elapsed":1800,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":87,"outputs":[]},{"cell_type":"code","source":["rec_data = rec_data.replace('', 'Not mentioned')"],"metadata":{"id":"F_96ZhozO--3","executionInfo":{"status":"ok","timestamp":1688580411504,"user_tz":-210,"elapsed":498,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":95,"outputs":[]},{"cell_type":"code","source":["rec_data['description'] = 'Title: ' + rec_data['title'] + '. Director: ' + rec_data['director'] + '. Genres: ' + rec_data['name_genres'] + '. Overview: ' + rec_data['overview'] + ' Keywords: ' + rec_data['name_keywords'] + '. 
Language: ' + rec_data['original_language'] + '.'"],"metadata":{"id":"fywEpVC9N1ae","executionInfo":{"status":"ok","timestamp":1688582041083,"user_tz":-210,"elapsed":448,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":115,"outputs":[]},{"cell_type":"code","source":["def clean_text (text) :\n"," try:\n"," # Remove new line and tabs\n"," clean = text.replace(\"\\n\", \" \")\n"," clean = clean.replace(\"\\t\", \" \")\n"," clean = clean.replace(\"\\r\", \" \")\n"," clean = clean.replace(\"Γ‚\\xa0\", \"\") # non-breaking space\n","\n"," # Remove all punctuation and special characters\n"," # clean = re.sub(\n"," # r\"([^\\s\\w]|_)+\", \"\", clean\n"," # ) # noqa W695 invalid escape sequence '\\s'\n","\n"," # If you want to keep some punctuation, see below commented out example\n"," clean = re.sub(r'([^,.:\\s\\w\\-]|_)+','', clean)\n","\n"," # Skip further processing if the text will be used in BERT tokenization\n","\n"," except Exception:\n"," print(\"Cannot clean non-existent text\")\n"," clean = \"\"\n","\n"," return clean"],"metadata":{"id":"EWUD2uTEGdTa","executionInfo":{"status":"ok","timestamp":1688582398315,"user_tz":-210,"elapsed":472,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":128,"outputs":[]},{"cell_type":"code","source":["rec_data['description'] = rec_data['description'].apply(clean_text)"],"metadata":{"id":"yQPx5nkFGlb-","executionInfo":{"status":"ok","timestamp":1688582648519,"user_tz":-210,"elapsed":2256,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":130,"outputs":[]},{"cell_type":"code","source":["final_rec_data = rec_data[['id', 'title', 'description']]\n","final_rec_data = final_rec_data.drop_duplicates(subset='id')"],"metadata":{"id":"4YjLmb1TXtvr","executionInfo":{"status":"ok","timestamp":1688584254391,"user_tz":-210,"elapsed":988,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":141,"outputs":[]},{"cell_type":"code","source":["final_rec_data.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vEbcByGnZWLY","executionInfo":{"status":"ok","timestamp":1688584258284,"user_tz":-210,"elapsed":931,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"20c3aea6-aa57-4e5f-c2e6-94e884b98d26"},"execution_count":142,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(45429, 3)"]},"metadata":{},"execution_count":142}]},{"cell_type":"markdown","source":["## Saving the dataset"],"metadata":{"id":"uYO9OW7sXZMF"}},{"cell_type":"code","source":["final_rec_data.to_csv('/content/drive/MyDrive/Rec/data/cleaned/descriptions.csv', index=False)"],"metadata":{"id":"SVR8F9QaZZXL","executionInfo":{"status":"ok","timestamp":1688584265792,"user_tz":-210,"elapsed":1909,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":143,"outputs":[]},{"cell_type":"markdown","source":["# REC"],"metadata":{"id":"ty8JI6oFhQ_d"}},{"cell_type":"code","source":["d = pd.read_csv('/content/drive/MyDrive/Rec/data/cleaned/descriptions.csv')"],"metadata":{"id":"gAthfffufZXF","executionInfo":{"status":"ok","timestamp":1688584746369,"user_tz":-210,"elapsed":1187,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":5,"outputs":[]},{"cell_type":"code","source":["from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.metrics.pairwise import linear_kernel, 
cosine_similarity\n","\n","tfidf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')\n","tfidf_matrix = tfidf.fit_transform(d['description'])\n","# cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)\n","# cosine_sim = cosine_sim.astype(np.float16)\n"],"metadata":{"id":"4nGOUldZY6X7","executionInfo":{"status":"ok","timestamp":1688584778153,"user_tz":-210,"elapsed":17021,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":7,"outputs":[]},{"cell_type":"code","source":["def recommender (title, num_recommends):\n"," idx = d[d['title'] == title].index[0]\n"," cosine_sim = cosine_similarity(tfidf_matrix[int(idx)], tfidf_matrix)\n"," similarity_scores = list(enumerate(cosine_sim[0]))\n"," similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)\n"," similarity_scores = similarity_scores[1: num_recommends + 1]\n"," movie_indices = [i[0] for i in similarity_scores]\n"," return d.iloc[movie_indices]['title']"],"metadata":{"id":"M40vz12Oajc3","executionInfo":{"status":"ok","timestamp":1688585022061,"user_tz":-210,"elapsed":848,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":23,"outputs":[]},{"cell_type":"markdown","source":[],"metadata":{"id":"zpcUEJrDhZLY"}},{"cell_type":"code","source":["! pip install scikit-surprise"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"nx9G3cXEkbCZ","executionInfo":{"status":"ok","timestamp":1688598659233,"user_tz":-210,"elapsed":51093,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"3433f6f6-58f4-4625-8ad7-3b565166eee0"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting scikit-surprise\n"," Downloading scikit-surprise-1.1.3.tar.gz (771 kB)\n","\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/772.0 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m772.0/772.0 kB\u001b[0m \u001b[31m43.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n","Requirement already satisfied: joblib>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-surprise) (1.2.0)\n","Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-surprise) (1.22.4)\n","Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-surprise) (1.10.1)\n","Building wheels for collected packages: scikit-surprise\n"," Building wheel for scikit-surprise (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n"," Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3096320 sha256=ab360f2850ab501540eeccaf1058521f2c63a69cb989d9308e7f3d63bc789795\n"," Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445\n","Successfully built scikit-surprise\n","Installing collected packages: scikit-surprise\n","Successfully installed scikit-surprise-1.1.3\n"]}]},{"cell_type":"code","source":["import pandas as pd\n","from surprise import Dataset, SVD ,Reader\n","from sklearn.metrics.pairwise import linear_kernel, cosine_similarity\n","from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n","from surprise.model_selection import cross_validate , KFold\n","from surprise import model_selection"],"metadata":{"id":"9IJpu0c3f7ub","executionInfo":{"status":"ok","timestamp":1688598692049,"user_tz":-210,"elapsed":1775,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["from surprise import Dataset, Reader, SVD\n","from surprise.model_selection import train_test_split\n","from surprise.accuracy import rmse\n","\n","# load data from a CSV file\n","data = pd.read_csv('/content/drive/MyDrive/Rec/data/raw/ratings_small.csv')\n","\n","# define the Reader object\n","reader = Reader(rating_scale=(1, 5))\n","\n","# load the data into the Dataset object\n","dataset = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)\n","\n","# split the data into training and testing sets\n","trainset, testset = train_test_split(dataset, test_size=0.2)\n","\n","# define the SVD algorithm\n","algo = SVD(n_factors=100, n_epochs=20)\n","\n","# train the algorithm on the training set\n","algo.fit(trainset)\n","\n","# make predictions on the testing set\n","predictions = algo.test(testset)\n","\n","# evaluate the performance of the algorithm\n","rmse_score = rmse(predictions)\n","print('RMSE:', rmse_score)\n","\n","# make recommendations for a given user\n","user_id = 24256\n","items_to_recommend = []\n","for item_id in data['movieId'].unique():\n"," predicted_rating = algo.predict(user_id, item_id).est\n"," if predicted_rating >= 4.8:\n"," items_to_recommend.append(item_id)\n","print('Items to recommend:', items_to_recommend)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"s5T_lyVHoDOg","executionInfo":{"status":"ok","timestamp":1688599195314,"user_tz":-210,"elapsed":1948,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"845856e2-33ef-4459-9bb1-e81dca281acf"},"execution_count":9,"outputs":[{"output_type":"stream","name":"stdout","text":["RMSE: 0.8969\n","RMSE: 0.8968864510559503\n","Items to recommend: []\n"]}]},{"cell_type":"code","source":["links = links.rename(columns={'imdbId' : 'imdb_id'})\n","links['movieId']"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"DYxbl8NX79Dr","executionInfo":{"status":"ok","timestamp":1688610513785,"user_tz":-210,"elapsed":661,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"afb9d051-07a2-441a-dd84-04f8c1a56530"},"execution_count":34,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" movieId imdb_id tmdbId\n","0 1 114709 862.0\n","1 2 113497 8844.0\n","2 3 113228 15602.0\n","3 4 114885 31357.0\n","4 5 113041 11862.0"],"text/html":["\n"," <div id=\"df-e4283956-5412-49ab-9168-53e860d30c68\">\n"," <div 
class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>movieId</th>\n"," <th>imdb_id</th>\n"," <th>tmdbId</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1</td>\n"," <td>114709</td>\n"," <td>862.0</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>2</td>\n"," <td>113497</td>\n"," <td>8844.0</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>3</td>\n"," <td>113228</td>\n"," <td>15602.0</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>4</td>\n"," <td>114885</td>\n"," <td>31357.0</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>5</td>\n"," <td>113041</td>\n"," <td>11862.0</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-e4283956-5412-49ab-9168-53e860d30c68')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-e4283956-5412-49ab-9168-53e860d30c68 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-e4283956-5412-49ab-9168-53e860d30c68');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":34}]},{"cell_type":"code","source":["links['imdb_id']=links['imdb_id'].astype(int)"],"metadata":{"id":"Oyogeoo--vRr","executionInfo":{"status":"ok","timestamp":1688609761885,"user_tz":-210,"elapsed":662,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":27,"outputs":[]},{"cell_type":"code","source":["2 in cr['id']"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Sil04xEt_INh","executionInfo":{"status":"ok","timestamp":1688610594282,"user_tz":-210,"elapsed":719,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"87b039fd-7f7f-4a01-e6d7-75d5234e2e15"},"execution_count":36,"outputs":[{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":36}]},{"cell_type":"code","source":["cr = cr.merge(links, on='imdb_id')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":311},"id":"R7IM3WBY8Fbh","executionInfo":{"status":"error","timestamp":1688609767176,"user_tz":-210,"elapsed":715,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"1eb8c903-36cf-4c15-9cb8-c4b3229b7edd"},"execution_count":28,"outputs":[{"output_type":"error","ename":"ValueError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)","\u001b[0;32m<ipython-input-28-367d6dcecdf8>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlinks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mon\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'imdb_id'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mmerge\u001b[0;34m(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001b[0m\n\u001b[1;32m 10091\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmerge\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10092\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m> 10093\u001b[0;31m return merge(\n\u001b[0m\u001b[1;32m 10094\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10095\u001b[0m \u001b[0mright\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/core/reshape/merge.py\u001b[0m in \u001b[0;36mmerge\u001b[0;34m(left, right, how, on, left_on, right_on, 
left_index, right_index, sort, suffixes, copy, indicator, validate)\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[0mvalidate\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 109\u001b[0m ) -> DataFrame:\n\u001b[0;32m--> 110\u001b[0;31m op = _MergeOperation(\n\u001b[0m\u001b[1;32m 111\u001b[0m \u001b[0mleft\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[0mright\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/core/reshape/merge.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, indicator, validate)\u001b[0m\n\u001b[1;32m 705\u001b[0m \u001b[0;31m# validate the merge keys dtypes. We may need to coerce\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 706\u001b[0m \u001b[0;31m# to avoid incompatible dtypes\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 707\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_coerce_merge_keys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 708\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 709\u001b[0m \u001b[0;31m# If argument passed to validate,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/core/reshape/merge.py\u001b[0m in \u001b[0;36m_maybe_coerce_merge_keys\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1338\u001b[0m \u001b[0minferred_right\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mstring_types\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0minferred_left\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mstring_types\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1339\u001b[0m ):\n\u001b[0;32m-> 1340\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1341\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1342\u001b[0m \u001b[0;31m# datetimelikes must match exactly\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mValueError\u001b[0m: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat"]}]},{"cell_type":"code","source":[],"metadata":{"id":"tkpi2hXk-gXH"},"execution_count":null,"outputs":[]}]}
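The last executed cell above fails with "ValueError: You are trying to merge on object and int64 columns", i.e. cr['imdb_id'] and links['imdb_id'] have different dtypes. cr itself is not shown in this diff, so the exact format of its imdb_id values is an assumption (IMDb-style strings such as 'tt0114709'); under that assumption, the usual remedy is to coerce both key columns to a common dtype before merging, roughly as follows.

# Sketch of a dtype fix for the failing merge; the 'tt'-prefixed ids in cr are an assumption.
import pandas as pd

cr["imdb_id"] = (
    cr["imdb_id"].astype(str).str.replace("tt", "", regex=False)  # 'tt0114709' -> '0114709'
)
cr["imdb_id"] = pd.to_numeric(cr["imdb_id"], errors="coerce")     # non-numeric ids become NaN
cr = cr.dropna(subset=["imdb_id"])
cr["imdb_id"] = cr["imdb_id"].astype("int64")
links["imdb_id"] = links["imdb_id"].astype("int64")

cr = cr.merge(links, on="imdb_id")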
Data_Analysis/preprocess.ipynb ADDED
The diff for this file is too large to render. See raw diff