trangannh commited on
Commit
b90cd32
1 Parent(s): c7bc9f4

Upload Job_Recommendation_System.ipynb

Browse files
Files changed (1) hide show
  1. Job_Recommendation_System.ipynb +1330 -0
Job_Recommendation_System.ipynb ADDED
@@ -0,0 +1,1330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "_NrjL2ccH3yp"
7
+ },
8
+ "source": [
9
+ "RECOMMENDATION MODEL"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 7,
15
+ "metadata": {
16
+ "id": "IZfnA6W_GDyf"
17
+ },
18
+ "outputs": [],
19
+ "source": [
20
+ "import numpy as np\n",
21
+ "import pandas as pd\n",
22
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
23
+ "from sklearn.metrics.pairwise import cosine_similarity"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 8,
29
+ "metadata": {
30
+ "id": "MV-7idG1F_NU"
31
+ },
32
+ "outputs": [],
33
+ "source": [
34
+ "# Mock data creation\n",
35
+ "def create_mock_data():\n",
36
+ " users_data = \"rematch_train_candidate_field.csv\"\n",
37
+ " applicants = pd.read_csv(users_data)\n",
38
+ "\n",
39
+ " jobs_data = \"jobs_data.csv\"\n",
40
+ " companies = pd.read_csv(jobs_data)\n",
41
+ "\n",
42
+ " train_applicants = applicants\n",
43
+ " test_data = \"1st_test.csv\"\n",
44
+ " # \"/content/sample_data/test_train.csv\"\n",
45
+ " test_applicants = pd.read_csv(test_data)\n",
46
+ "\n",
47
+ " return train_applicants, test_applicants, companies"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 9,
53
+ "metadata": {
54
+ "id": "wF1oZ6Ez96BE"
55
+ },
56
+ "outputs": [],
57
+ "source": [
58
+ "train_user, test_user, jobs = create_mock_data()"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 10,
64
+ "metadata": {},
65
+ "outputs": [
66
+ {
67
+ "name": "stdout",
68
+ "output_type": "stream",
69
+ "text": [
70
+ "<class 'pandas.core.frame.DataFrame'>\n"
71
+ ]
72
+ }
73
+ ],
74
+ "source": [
75
+ "print(type(train_user))"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 11,
81
+ "metadata": {
82
+ "colab": {
83
+ "base_uri": "https://localhost:8080/"
84
+ },
85
+ "id": "Gj8tJNrph8Go",
86
+ "outputId": "a44b8cf0-a56f-4cd2-bbda-ca9bcabf35a0"
87
+ },
88
+ "outputs": [
89
+ {
90
+ "name": "stdout",
91
+ "output_type": "stream",
92
+ "text": [
93
+ "Training data size: 23724\n",
94
+ "Test data size: 4745\n"
95
+ ]
96
+ }
97
+ ],
98
+ "source": [
99
+ "print(\"Training data size:\", train_user.shape[0])\n",
100
+ "print(\"Test data size:\", test_user.shape[0])"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 12,
106
+ "metadata": {
107
+ "id": "d0XY4al7K0UT"
108
+ },
109
+ "outputs": [],
110
+ "source": [
111
+ "list_hard_skill = [test_user[\"hard_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(test_user))]\n",
112
+ "list_soft_skill = [test_user[\"soft_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(test_user))]"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": 13,
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "name": "stdout",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "<class 'list'>\n"
125
+ ]
126
+ }
127
+ ],
128
+ "source": [
129
+ "print(type(list_hard_skill))"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 14,
135
+ "metadata": {
136
+ "colab": {
137
+ "base_uri": "https://localhost:8080/",
138
+ "height": 213
139
+ },
140
+ "id": "JOZ9_NlLK8uS",
141
+ "outputId": "17d09f55-192f-4486-bb47-b56f525d44a3"
142
+ },
143
+ "outputs": [
144
+ {
145
+ "data": {
146
+ "text/html": [
147
+ "<div>\n",
148
+ "<style scoped>\n",
149
+ " .dataframe tbody tr th:only-of-type {\n",
150
+ " vertical-align: middle;\n",
151
+ " }\n",
152
+ "\n",
153
+ " .dataframe tbody tr th {\n",
154
+ " vertical-align: top;\n",
155
+ " }\n",
156
+ "\n",
157
+ " .dataframe thead th {\n",
158
+ " text-align: right;\n",
159
+ " }\n",
160
+ "</style>\n",
161
+ "<table border=\"1\" class=\"dataframe\">\n",
162
+ " <thead>\n",
163
+ " <tr style=\"text-align: right;\">\n",
164
+ " <th></th>\n",
165
+ " <th>User ID</th>\n",
166
+ " <th>candidate_field</th>\n",
167
+ " <th>label</th>\n",
168
+ " <th>hard_skill</th>\n",
169
+ " <th>soft_skill</th>\n",
170
+ " <th>final_hard_skill</th>\n",
171
+ " <th>final_soft_skill</th>\n",
172
+ " </tr>\n",
173
+ " </thead>\n",
174
+ " <tbody>\n",
175
+ " <tr>\n",
176
+ " <th>0</th>\n",
177
+ " <td>14649</td>\n",
178
+ " <td>it jobs</td>\n",
179
+ " <td>1</td>\n",
180
+ " <td>['act', 'advertising sales', 'algorithms', 'bu...</td>\n",
181
+ " <td>['collaboration', 'decision making', 'operatio...</td>\n",
182
+ " <td>act, advertising sales, algorithms, business, ...</td>\n",
183
+ " <td>collaboration, decision making, operations, wr...</td>\n",
184
+ " </tr>\n",
185
+ " <tr>\n",
186
+ " <th>1</th>\n",
187
+ " <td>801</td>\n",
188
+ " <td>marketing</td>\n",
189
+ " <td>0</td>\n",
190
+ " <td>['act', 'brand communication', 'business', 'bu...</td>\n",
191
+ " <td>['collaboration', 'customer service', 'managem...</td>\n",
192
+ " <td>act, brand communication, business, business d...</td>\n",
193
+ " <td>collaboration, customer service, management</td>\n",
194
+ " </tr>\n",
195
+ " <tr>\n",
196
+ " <th>2</th>\n",
197
+ " <td>4393</td>\n",
198
+ " <td>accounting</td>\n",
199
+ " <td>0</td>\n",
200
+ " <td>['application', 'balance sheet', 'finance', 'p...</td>\n",
201
+ " <td>['filing', 'management']</td>\n",
202
+ " <td>application, balance sheet, finance, property ...</td>\n",
203
+ " <td>filing, management</td>\n",
204
+ " </tr>\n",
205
+ " </tbody>\n",
206
+ "</table>\n",
207
+ "</div>"
208
+ ],
209
+ "text/plain": [
210
+ " User ID candidate_field label \\\n",
211
+ "0 14649 it jobs 1 \n",
212
+ "1 801 marketing 0 \n",
213
+ "2 4393 accounting 0 \n",
214
+ "\n",
215
+ " hard_skill \\\n",
216
+ "0 ['act', 'advertising sales', 'algorithms', 'bu... \n",
217
+ "1 ['act', 'brand communication', 'business', 'bu... \n",
218
+ "2 ['application', 'balance sheet', 'finance', 'p... \n",
219
+ "\n",
220
+ " soft_skill \\\n",
221
+ "0 ['collaboration', 'decision making', 'operatio... \n",
222
+ "1 ['collaboration', 'customer service', 'managem... \n",
223
+ "2 ['filing', 'management'] \n",
224
+ "\n",
225
+ " final_hard_skill \\\n",
226
+ "0 act, advertising sales, algorithms, business, ... \n",
227
+ "1 act, brand communication, business, business d... \n",
228
+ "2 application, balance sheet, finance, property ... \n",
229
+ "\n",
230
+ " final_soft_skill \n",
231
+ "0 collaboration, decision making, operations, wr... \n",
232
+ "1 collaboration, customer service, management \n",
233
+ "2 filing, management "
234
+ ]
235
+ },
236
+ "execution_count": 14,
237
+ "metadata": {},
238
+ "output_type": "execute_result"
239
+ }
240
+ ],
241
+ "source": [
242
+ "test_user[\"final_hard_skill\"] = pd.DataFrame(list_hard_skill)\n",
243
+ "test_user[\"final_soft_skill\"] = pd.DataFrame(list_soft_skill)\n",
244
+ "test_user.head(3)"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": 15,
250
+ "metadata": {
251
+ "id": "kYbjYsDjABda"
252
+ },
253
+ "outputs": [],
254
+ "source": [
255
+ "list_hard_skill = [train_user[\"hard_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(train_user))]\n",
256
+ "list_soft_skill = [train_user[\"soft_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(train_user))]"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 16,
262
+ "metadata": {
263
+ "colab": {
264
+ "base_uri": "https://localhost:8080/",
265
+ "height": 213
266
+ },
267
+ "id": "GC8bn3cjB8D5",
268
+ "outputId": "436e843d-425e-4ce2-e551-e4f249bdd10b"
269
+ },
270
+ "outputs": [
271
+ {
272
+ "data": {
273
+ "text/html": [
274
+ "<div>\n",
275
+ "<style scoped>\n",
276
+ " .dataframe tbody tr th:only-of-type {\n",
277
+ " vertical-align: middle;\n",
278
+ " }\n",
279
+ "\n",
280
+ " .dataframe tbody tr th {\n",
281
+ " vertical-align: top;\n",
282
+ " }\n",
283
+ "\n",
284
+ " .dataframe thead th {\n",
285
+ " text-align: right;\n",
286
+ " }\n",
287
+ "</style>\n",
288
+ "<table border=\"1\" class=\"dataframe\">\n",
289
+ " <thead>\n",
290
+ " <tr style=\"text-align: right;\">\n",
291
+ " <th></th>\n",
292
+ " <th>User ID</th>\n",
293
+ " <th>candidate_field</th>\n",
294
+ " <th>label</th>\n",
295
+ " <th>hard_skill</th>\n",
296
+ " <th>soft_skill</th>\n",
297
+ " <th>final_hard_skill</th>\n",
298
+ " <th>final_soft_skill</th>\n",
299
+ " </tr>\n",
300
+ " </thead>\n",
301
+ " <tbody>\n",
302
+ " <tr>\n",
303
+ " <th>0</th>\n",
304
+ " <td>1</td>\n",
305
+ " <td>retail &amp; consumer products</td>\n",
306
+ " <td>0</td>\n",
307
+ " <td>['business', 'merchandising', 'sales', 'service']</td>\n",
308
+ " <td>['customer service']</td>\n",
309
+ " <td>business, merchandising, sales, service</td>\n",
310
+ " <td>customer service</td>\n",
311
+ " </tr>\n",
312
+ " <tr>\n",
313
+ " <th>1</th>\n",
314
+ " <td>2</td>\n",
315
+ " <td>sales</td>\n",
316
+ " <td>0</td>\n",
317
+ " <td>['application', 'business', 'business requirem...</td>\n",
318
+ " <td>['accountability', 'collaboration', 'innovatio...</td>\n",
319
+ " <td>application, business, business requirements, ...</td>\n",
320
+ " <td>accountability, collaboration, innovation, man...</td>\n",
321
+ " </tr>\n",
322
+ " <tr>\n",
323
+ " <th>2</th>\n",
324
+ " <td>3</td>\n",
325
+ " <td>healthcare &amp; medical</td>\n",
326
+ " <td>0</td>\n",
327
+ " <td>['application', 'cancer', 'endocrinology', 'hy...</td>\n",
328
+ " <td>['research', 'training and development']</td>\n",
329
+ " <td>application, cancer, endocrinology, hydrothera...</td>\n",
330
+ " <td>research, training and development</td>\n",
331
+ " </tr>\n",
332
+ " </tbody>\n",
333
+ "</table>\n",
334
+ "</div>"
335
+ ],
336
+ "text/plain": [
337
+ " User ID candidate_field label \\\n",
338
+ "0 1 retail & consumer products 0 \n",
339
+ "1 2 sales 0 \n",
340
+ "2 3 healthcare & medical 0 \n",
341
+ "\n",
342
+ " hard_skill \\\n",
343
+ "0 ['business', 'merchandising', 'sales', 'service'] \n",
344
+ "1 ['application', 'business', 'business requirem... \n",
345
+ "2 ['application', 'cancer', 'endocrinology', 'hy... \n",
346
+ "\n",
347
+ " soft_skill \\\n",
348
+ "0 ['customer service'] \n",
349
+ "1 ['accountability', 'collaboration', 'innovatio... \n",
350
+ "2 ['research', 'training and development'] \n",
351
+ "\n",
352
+ " final_hard_skill \\\n",
353
+ "0 business, merchandising, sales, service \n",
354
+ "1 application, business, business requirements, ... \n",
355
+ "2 application, cancer, endocrinology, hydrothera... \n",
356
+ "\n",
357
+ " final_soft_skill \n",
358
+ "0 customer service \n",
359
+ "1 accountability, collaboration, innovation, man... \n",
360
+ "2 research, training and development "
361
+ ]
362
+ },
363
+ "execution_count": 16,
364
+ "metadata": {},
365
+ "output_type": "execute_result"
366
+ }
367
+ ],
368
+ "source": [
369
+ "train_user[\"final_hard_skill\"] = pd.DataFrame(list_hard_skill)\n",
370
+ "train_user[\"final_soft_skill\"] = pd.DataFrame(list_soft_skill)\n",
371
+ "train_user.head(3)"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "code",
376
+ "execution_count": 17,
377
+ "metadata": {
378
+ "id": "znBy9q8XDcM7"
379
+ },
380
+ "outputs": [],
381
+ "source": [
382
+ "list_hard_skill = [jobs[\"Hard Skills\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(jobs))]\n",
383
+ "list_soft_skill = [jobs[\"Soft Skills\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(jobs))]"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 18,
389
+ "metadata": {
390
+ "colab": {
391
+ "base_uri": "https://localhost:8080/",
392
+ "height": 213
393
+ },
394
+ "id": "knFii8o3EQmv",
395
+ "outputId": "47afb484-0765-4ad9-8765-d084673450ac"
396
+ },
397
+ "outputs": [
398
+ {
399
+ "data": {
400
+ "text/html": [
401
+ "<div>\n",
402
+ "<style scoped>\n",
403
+ " .dataframe tbody tr th:only-of-type {\n",
404
+ " vertical-align: middle;\n",
405
+ " }\n",
406
+ "\n",
407
+ " .dataframe tbody tr th {\n",
408
+ " vertical-align: top;\n",
409
+ " }\n",
410
+ "\n",
411
+ " .dataframe thead th {\n",
412
+ " text-align: right;\n",
413
+ " }\n",
414
+ "</style>\n",
415
+ "<table border=\"1\" class=\"dataframe\">\n",
416
+ " <thead>\n",
417
+ " <tr style=\"text-align: right;\">\n",
418
+ " <th></th>\n",
419
+ " <th>Job ID</th>\n",
420
+ " <th>Major</th>\n",
421
+ " <th>Hard Skills</th>\n",
422
+ " <th>Soft Skills</th>\n",
423
+ " <th>final_hard_skill</th>\n",
424
+ " <th>final_soft_skill</th>\n",
425
+ " </tr>\n",
426
+ " </thead>\n",
427
+ " <tbody>\n",
428
+ " <tr>\n",
429
+ " <th>0</th>\n",
430
+ " <td>1</td>\n",
431
+ " <td>accounting</td>\n",
432
+ " <td>['business', 'finance', 'excel', 'tax', 'servi...</td>\n",
433
+ " <td>['management', 'planning', 'operations', 'lead...</td>\n",
434
+ " <td>business, finance, excel, tax, service, data, ...</td>\n",
435
+ " <td>management, planning, operations, leadership, ...</td>\n",
436
+ " </tr>\n",
437
+ " <tr>\n",
438
+ " <th>1</th>\n",
439
+ " <td>2</td>\n",
440
+ " <td>administration &amp; office support</td>\n",
441
+ " <td>['service', 'business', 'data', 'excel', 'appl...</td>\n",
442
+ " <td>['management', 'customer service', 'microsoft ...</td>\n",
443
+ " <td>service, business, data, excel, application, s...</td>\n",
444
+ " <td>management, customer service, microsoft office...</td>\n",
445
+ " </tr>\n",
446
+ " <tr>\n",
447
+ " <th>2</th>\n",
448
+ " <td>3</td>\n",
449
+ " <td>advertising, arts &amp; media</td>\n",
450
+ " <td>['business', 'digital', 'sales', 'service', 'a...</td>\n",
451
+ " <td>['management', 'social media', 'writing', 'com...</td>\n",
452
+ " <td>business, digital, sales, service, application...</td>\n",
453
+ " <td>management, social media, writing, communicati...</td>\n",
454
+ " </tr>\n",
455
+ " </tbody>\n",
456
+ "</table>\n",
457
+ "</div>"
458
+ ],
459
+ "text/plain": [
460
+ " Job ID Major \\\n",
461
+ "0 1 accounting \n",
462
+ "1 2 administration & office support \n",
463
+ "2 3 advertising, arts & media \n",
464
+ "\n",
465
+ " Hard Skills \\\n",
466
+ "0 ['business', 'finance', 'excel', 'tax', 'servi... \n",
467
+ "1 ['service', 'business', 'data', 'excel', 'appl... \n",
468
+ "2 ['business', 'digital', 'sales', 'service', 'a... \n",
469
+ "\n",
470
+ " Soft Skills \\\n",
471
+ "0 ['management', 'planning', 'operations', 'lead... \n",
472
+ "1 ['management', 'customer service', 'microsoft ... \n",
473
+ "2 ['management', 'social media', 'writing', 'com... \n",
474
+ "\n",
475
+ " final_hard_skill \\\n",
476
+ "0 business, finance, excel, tax, service, data, ... \n",
477
+ "1 service, business, data, excel, application, s... \n",
478
+ "2 business, digital, sales, service, application... \n",
479
+ "\n",
480
+ " final_soft_skill \n",
481
+ "0 management, planning, operations, leadership, ... \n",
482
+ "1 management, customer service, microsoft office... \n",
483
+ "2 management, social media, writing, communicati... "
484
+ ]
485
+ },
486
+ "execution_count": 18,
487
+ "metadata": {},
488
+ "output_type": "execute_result"
489
+ }
490
+ ],
491
+ "source": [
492
+ "jobs[\"final_hard_skill\"] = pd.DataFrame(list_hard_skill)\n",
493
+ "jobs[\"final_soft_skill\"] = pd.DataFrame(list_soft_skill)\n",
494
+ "jobs.head(3)"
495
+ ]
496
+ },
497
+ {
498
+ "cell_type": "code",
499
+ "execution_count": 19,
500
+ "metadata": {
501
+ "id": "wiDiHL6lStnd"
502
+ },
503
+ "outputs": [],
504
+ "source": [
505
+ "# Feature Engineering\n",
506
+ "def feature_engineering(applicants, companies):\n",
507
+ " # Vectorize skills and majors\n",
508
+ " tfidf_vectorizer_skills = TfidfVectorizer()\n",
509
+ " tfidf_vectorizer_majors = TfidfVectorizer()\n",
510
+ "\n",
511
+ " all_skills = pd.concat([applicants['final_hard_skill'], applicants['final_soft_skill'],\n",
512
+ " companies['final_hard_skill'], companies['final_soft_skill']])\n",
513
+ " all_majors = pd.concat([applicants['candidate_field'], companies['Major']])\n",
514
+ "\n",
515
+ " all_skills_vectorized = tfidf_vectorizer_skills.fit_transform(all_skills)\n",
516
+ " all_majors_vectorized = tfidf_vectorizer_majors.fit_transform(all_majors)\n",
517
+ "\n",
518
+ " num_applicants = len(applicants)\n",
519
+ " num_companies = len(companies)\n",
520
+ "\n",
521
+ " # Split the TF-IDF vectors back into applicants and companies\n",
522
+ " applicants_skills_vectorized = all_skills_vectorized[:num_applicants*2] # because each applicant has 2 skill entries\n",
523
+ " companies_skills_vectorized = all_skills_vectorized[num_applicants*2:]\n",
524
+ "\n",
525
+ " applicants_majors_vectorized = all_majors_vectorized[:num_applicants]\n",
526
+ " companies_majors_vectorized = all_majors_vectorized[num_applicants:]\n",
527
+ "\n",
528
+ " return (applicants_skills_vectorized, applicants_majors_vectorized,\n",
529
+ " companies_skills_vectorized, companies_majors_vectorized, tfidf_vectorizer_skills, tfidf_vectorizer_majors)"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": 20,
535
+ "metadata": {
536
+ "id": "THM0mszQGNyD"
537
+ },
538
+ "outputs": [],
539
+ "source": [
540
+ "def compute_similarity(applicants_skills_vectorized, applicants_majors_vectorized,\n",
541
+ " companies_skills_vectorized, companies_majors_vectorized):\n",
542
+ " # Calculate similarity based on skills (averaging hard and soft skills similarities)\n",
543
+ " applicants_skills = (applicants_skills_vectorized[0::2] + applicants_skills_vectorized[1::2]) / 2\n",
544
+ " companies_skills = (companies_skills_vectorized[0::2] + companies_skills_vectorized[1::2]) / 2\n",
545
+ "\n",
546
+ " skills_similarity = cosine_similarity(applicants_skills, companies_skills)\n",
547
+ "\n",
548
+ " # Calculate similarity based on majors\n",
549
+ " majors_similarity = cosine_similarity(applicants_majors_vectorized, companies_majors_vectorized)\n",
550
+ "\n",
551
+ " # Ensure the number of companies in both similarities is aligned\n",
552
+ " if skills_similarity.shape[1] != majors_similarity.shape[1]:\n",
553
+ " min_dim = min(skills_similarity.shape[1], majors_similarity.shape[1])\n",
554
+ " skills_similarity = skills_similarity[:, :min_dim]\n",
555
+ " majors_similarity = majors_similarity[:, :min_dim]\n",
556
+ "\n",
557
+ " # Combine these similarities (simple average for this example)\n",
558
+ " combined_similarity = (skills_similarity + majors_similarity) / 2\n",
559
+ " return combined_similarity"
560
+ ]
561
+ },
562
+ {
563
+ "cell_type": "code",
564
+ "execution_count": 21,
565
+ "metadata": {
566
+ "id": "ter3YAzxoelD"
567
+ },
568
+ "outputs": [],
569
+ "source": [
570
+ "# Recommendation Function\n",
571
+ "def recommend_jobs(applicants, companies, similarity_scores):\n",
572
+ " recommendations = {}\n",
573
+ " for i, applicant in enumerate(applicants['User ID']):\n",
574
+ " if i < len(similarity_scores):\n",
575
+ " sorted_company_indices = np.argsort(-similarity_scores[i]) # Descending sort of scores\n",
576
+ " recommended_companies = companies.iloc[sorted_company_indices]['Major'].values[:3] # Top 3 recommendations\n",
577
+ " recommendations[applicant] = recommended_companies\n",
578
+ " return recommendations\n",
579
+ "\n",
580
+ "# Testing and Evaluation Function\n",
581
+ "def print_recommendations(applicants, companies, recommendations):\n",
582
+ " # This is a mock function since we don't have ground truth to compare to.\n",
583
+ " # In a real scenario, we would compare against actual matches or use some form of feedback.\n",
584
+ " print(\"Recommendations for each applicant:\")\n",
585
+ " for applicant in recommendations:\n",
586
+ " print(f\"{applicant}: {recommendations[applicant]}\")"
587
+ ]
588
+ },
589
+ {
590
+ "cell_type": "code",
591
+ "execution_count": null,
592
+ "metadata": {
593
+ "colab": {
594
+ "base_uri": "https://localhost:8080/"
595
+ },
596
+ "collapsed": true,
597
+ "id": "Ajxp0xelIrl2",
598
+ "outputId": "08bafc5b-73cc-4695-924a-931840047dd5"
599
+ },
600
+ "outputs": [],
601
+ "source": [
602
+ "# Let's create and process the data, and compute recommendations\n",
603
+ "# train_applicants, test_applicants, companies = create_mock_data()\n",
604
+ "applicants_skills_vec, applicants_majors_vec, companies_skills_vec, companies_majors_vec, tfidf_vectorizer_skills, tfidf_vectorizer_majors = feature_engineering(train_user, jobs)\n",
605
+ "\n",
606
+ "similarity_scores = compute_similarity(applicants_skills_vec, applicants_majors_vec, companies_skills_vec, companies_majors_vec)\n",
607
+ "recommendations = recommend_jobs(test_user, jobs, similarity_scores)\n",
608
+ "\n",
609
+ "# Output the recommendations to observe the results\n",
610
+ "print_recommendations(test_user, jobs, recommendations)"
611
+ ]
612
+ },
613
+ {
614
+ "cell_type": "code",
615
+ "execution_count": 23,
616
+ "metadata": {
617
+ "colab": {
618
+ "base_uri": "https://localhost:8080/"
619
+ },
620
+ "id": "nj-HEdyJlYNY",
621
+ "outputId": "063b84bc-5717-4a0c-8367-939a054657bc"
622
+ },
623
+ "outputs": [],
624
+ "source": [
625
+ "# Process input skills and recommend jobs\n",
626
+ "def recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec):\n",
627
+ " input_hard_skills_vec = tfidf_vectorizer_skills.transform([input_hard_skills])\n",
628
+ " input_soft_skills_vec = tfidf_vectorizer_skills.transform([input_soft_skills])\n",
629
+ " input_major_vec = tfidf_vectorizer_majors.transform([input_major])\n",
630
+ "\n",
631
+ " # Average the vectorized hard and soft skills\n",
632
+ " input_skills_vec = (input_hard_skills_vec + input_soft_skills_vec) / 2\n",
633
+ "\n",
634
+ " # Compute similarities\n",
635
+ " skills_similarity = cosine_similarity(input_skills_vec, companies_skills_vec)\n",
636
+ " major_similarity = cosine_similarity(input_major_vec, companies_majors_vec)\n",
637
+ "\n",
638
+ " # Ensure the number of companies in both similarities is aligned\n",
639
+ " if skills_similarity.shape[1] != major_similarity.shape[1]:\n",
640
+ " min_dim = min(skills_similarity.shape[1], major_similarity.shape[1])\n",
641
+ " skills_similarity = skills_similarity[:, :min_dim]\n",
642
+ " major_similarity = major_similarity[:, :min_dim]\n",
643
+ "\n",
644
+ " # Combine similarities\n",
645
+ " combined_similarity = (skills_similarity + major_similarity) / 2\n",
646
+ "\n",
647
+ " # Get top 3 job recommendations\n",
648
+ " sorted_company_indices = np.argsort(-combined_similarity[0])\n",
649
+ " recommended_companies = jobs.iloc[sorted_company_indices]['Major'].values[:3]\n",
650
+ "\n",
651
+ " return recommended_companies"
652
+ ]
653
+ },
654
+ {
655
+ "cell_type": "markdown",
656
+ "metadata": {
657
+ "id": "IMTilMnQINZC"
658
+ },
659
+ "source": [
660
+ "TEST RECOMMENDED SYSTEM"
661
+ ]
662
+ },
663
+ {
664
+ "cell_type": "code",
665
+ "execution_count": 24,
666
+ "metadata": {},
667
+ "outputs": [
668
+ {
669
+ "name": "stdout",
670
+ "output_type": "stream",
671
+ "text": [
672
+ "Recommended Jobs based on input skills and major:\n",
673
+ "['it jobs' 'sales' 'administration & office support']\n"
674
+ ]
675
+ }
676
+ ],
677
+ "source": [
678
+ "input_hard_skills = \"Java, Excel, Python\"\n",
679
+ "input_soft_skills = \"Communication, Teamwork\"\n",
680
+ "input_major = \"Economy\"\n",
681
+ "\n",
682
+ "recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)\n",
683
+ "print(\"Recommended Jobs based on input skills and major:\")\n",
684
+ "print(recommended_jobs)"
685
+ ]
686
+ },
687
+ {
688
+ "cell_type": "markdown",
689
+ "metadata": {
690
+ "id": "kShd99z_NiTa"
691
+ },
692
+ "source": [
693
+ "Evaluating (PENDING)"
694
+ ]
695
+ },
696
+ {
697
+ "cell_type": "code",
698
+ "execution_count": 19,
699
+ "metadata": {
700
+ "id": "WfEgjqw9JE3l"
701
+ },
702
+ "outputs": [],
703
+ "source": [
704
+ "def create_ground_truth(csv_file_path):\n",
705
+ " data = pd.read_csv(csv_file_path)\n",
706
+ "\n",
707
+ " # Tạo dictionary `ground_truth`\n",
708
+ " ground_truth = {}\n",
709
+ " for index, row in data.iterrows():\n",
710
+ " user_id = row['User ID']\n",
711
+ " actual_major = row['candidate_field']\n",
712
+ "\n",
713
+ " # Thêm vào dictionary, giả sử mỗi ứng viên chỉ chọn một công việc\n",
714
+ " ground_truth[user_id] = [actual_major]\n",
715
+ "\n",
716
+ " return ground_truth\n",
717
+ "\n",
718
+ "# Sử dụng hàm trên để tạo `ground_truth`\n",
719
+ "csv_file_path = '1st_test.csv'\n",
720
+ "ground_truth = create_ground_truth(csv_file_path)"
721
+ ]
722
+ },
723
+ {
724
+ "cell_type": "code",
725
+ "execution_count": null,
726
+ "metadata": {
727
+ "colab": {
728
+ "base_uri": "https://localhost:8080/",
729
+ "height": 1000
730
+ },
731
+ "collapsed": true,
732
+ "id": "TRiD4oS-AKFE",
733
+ "outputId": "256fadeb-b250-4602-affb-005cb9c658eb"
734
+ },
735
+ "outputs": [],
736
+ "source": [
737
+ "display(ground_truth)"
738
+ ]
739
+ },
740
+ {
741
+ "cell_type": "code",
742
+ "execution_count": 40,
743
+ "metadata": {
744
+ "colab": {
745
+ "base_uri": "https://localhost:8080/"
746
+ },
747
+ "id": "pXsa_wbANjmb",
748
+ "outputId": "9bd4fc1e-781b-439c-fe35-c28769f6714c"
749
+ },
750
+ "outputs": [
751
+ {
752
+ "name": "stdout",
753
+ "output_type": "stream",
754
+ "text": [
755
+ "Average Precision@3 with 18979 trains and 4745 tests: 0.12764313312258516\n"
756
+ ]
757
+ }
758
+ ],
759
+ "source": [
760
+ "def precision_at_k(recommendations, ground_truth, k=3):\n",
761
+ " \"\"\"\n",
762
+ " Calculate the precision at k for recommendation system.\n",
763
+ "\n",
764
+ " Parameters:\n",
765
+ " - recommendations (dict): Dictionary where keys are user IDs and values are lists of recommended majors.\n",
766
+ " - ground_truth (dict): Dictionary where keys are user IDs and values are lists of truly suitable majors.\n",
767
+ " - k (int): The number of top recommendations to consider for calculating precision.\n",
768
+ "\n",
769
+ " Returns:\n",
770
+ " - float: The average precision at k for all users.\n",
771
+ " \"\"\"\n",
772
+ " precision_scores = []\n",
773
+ "\n",
774
+ " for applicant, recommended_major in recommendations.items():\n",
775
+ " if applicant in ground_truth:\n",
776
+ " # Get top k recommendations\n",
777
+ " top_k_recs = recommended_major[:k]\n",
778
+ " # Calculate the number of relevant recommendations\n",
779
+ " relevant_recs = sum(1 for major in top_k_recs if major in ground_truth[applicant])\n",
780
+ " # Precision at k for this user\n",
781
+ " precision = relevant_recs / k\n",
782
+ " precision_scores.append(precision)\n",
783
+ "\n",
784
+ " # Average precision at k over all users\n",
785
+ " average_precision = np.mean(precision_scores) if precision_scores else 0\n",
786
+ " return average_precision\n",
787
+ "\n",
788
+ "avg_precision = precision_at_k(recommendations, ground_truth)\n",
789
+ "print(\"Average Precision@3 with 18979 trains and 4745 tests:\", avg_precision)"
790
+ ]
791
+ },
792
+ {
793
+ "cell_type": "code",
794
+ "execution_count": 41,
795
+ "metadata": {
796
+ "colab": {
797
+ "base_uri": "https://localhost:8080/"
798
+ },
799
+ "id": "KAIvtKEaRQml",
800
+ "outputId": "7dd82dc6-0e1b-43d5-bc95-cb457cde5d72"
801
+ },
802
+ "outputs": [
803
+ {
804
+ "name": "stdout",
805
+ "output_type": "stream",
806
+ "text": [
807
+ "Average Recall@3 with 18979 trains and 4745 tests: 0.38292939936775555\n"
808
+ ]
809
+ }
810
+ ],
811
+ "source": [
812
+ "def recall_at_k(recommendations, ground_truth, k=3):\n",
813
+ " recall_scores = []\n",
814
+ "\n",
815
+ " for user_id, recommended_majors in recommendations.items():\n",
816
+ " if user_id in ground_truth:\n",
817
+ " # Get top k recommendations\n",
818
+ " top_k_recs = recommended_majors[:k]\n",
819
+ " # Calculate the number of relevant recommendations\n",
820
+ " relevant_recs = sum(1 for major in top_k_recs if major in ground_truth[user_id])\n",
821
+ " # Calculate the total number of relevant items\n",
822
+ " total_relevant = len(ground_truth[user_id])\n",
823
+ " # Recall at k for this user\n",
824
+ " recall = relevant_recs / total_relevant if total_relevant else 0\n",
825
+ " recall_scores.append(recall)\n",
826
+ "\n",
827
+ " # Average recall at k over all users\n",
828
+ " average_recall = sum(recall_scores) / len(recall_scores) if recall_scores else 0\n",
829
+ " return average_recall\n",
830
+ "\n",
831
+ "# Example usage:\n",
832
+ "avg_recall = recall_at_k(recommendations, ground_truth)\n",
833
+ "print(\"Average Recall@3 with 18979 trains and 4745 tests:\", avg_recall)\n"
834
+ ]
835
+ },
836
+ {
837
+ "cell_type": "code",
838
+ "execution_count": 42,
839
+ "metadata": {
840
+ "colab": {
841
+ "base_uri": "https://localhost:8080/"
842
+ },
843
+ "id": "QUHBsQS_-5Eu",
844
+ "outputId": "fdab3075-dab8-458e-e663-2564b20da97c"
845
+ },
846
+ "outputs": [
847
+ {
848
+ "name": "stdout",
849
+ "output_type": "stream",
850
+ "text": [
851
+ "Average F1 Score@3: 0.19146469968387775\n"
852
+ ]
853
+ }
854
+ ],
855
+ "source": [
856
+ "def f1_score_at_k(recommendations, ground_truth, k=3):\n",
857
+ " precision = precision_at_k(recommendations, ground_truth, k)\n",
858
+ " recall = recall_at_k(recommendations, ground_truth, k)\n",
859
+ "\n",
860
+ " if precision + recall == 0:\n",
861
+ " return 0\n",
862
+ "\n",
863
+ " f1_score = 2 * (precision * recall) / (precision + recall)\n",
864
+ " return f1_score\n",
865
+ "\n",
866
+ "avg_f1_score = f1_score_at_k(recommendations, ground_truth)\n",
867
+ "\n",
868
+ "print(\"Average F1 Score@3:\", avg_f1_score)"
869
+ ]
870
+ },
871
+ {
872
+ "cell_type": "markdown",
873
+ "metadata": {},
874
+ "source": [
875
+        "Create pipeline"
876
+ ]
877
+ },
878
+ {
879
+ "cell_type": "code",
880
+ "execution_count": 25,
881
+ "metadata": {},
882
+ "outputs": [],
883
+ "source": [
884
+ "import numpy as np\n",
885
+ "import pandas as pd\n",
886
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
887
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
888
+ "from sklearn.pipeline import Pipeline\n",
889
+ "from sklearn.base import BaseEstimator, TransformerMixin"
890
+ ]
891
+ },
892
+ {
893
+ "cell_type": "code",
894
+ "execution_count": 26,
895
+ "metadata": {},
896
+ "outputs": [],
897
+ "source": [
898
+ "class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):\n",
899
+ " def __init__(self):\n",
900
+ " self.tfidf_vectorizer_skills = TfidfVectorizer()\n",
901
+ " self.tfidf_vectorizer_majors = TfidfVectorizer()\n",
902
+ "\n",
903
+ " def fit(self, X, y=None):\n",
904
+ " all_skills = pd.concat([X['final_hard_skill'], X['final_soft_skill']])\n",
905
+ " all_majors = X['candidate_field']\n",
906
+ " \n",
907
+ " self.tfidf_vectorizer_skills.fit(all_skills)\n",
908
+ " self.tfidf_vectorizer_majors.fit(all_majors)\n",
909
+ " return self\n",
910
+ " \n",
911
+ " def transform(self, X):\n",
912
+ " all_skills = pd.concat([X['final_hard_skill'], X['final_soft_skill']])\n",
913
+ " all_majors = X['candidate_field']\n",
914
+ " \n",
915
+ " applicants_skills_vec = self.tfidf_vectorizer_skills.transform(all_skills)\n",
916
+ " applicants_majors_vec = self.tfidf_vectorizer_majors.transform(all_majors)\n",
917
+ " \n",
918
+ " return applicants_skills_vec, applicants_majors_vec"
919
+ ]
920
+ },
921
+ {
922
+ "cell_type": "code",
923
+ "execution_count": 27,
924
+ "metadata": {},
925
+ "outputs": [],
926
+ "source": [
927
+ "class JobRecommender(BaseEstimator, TransformerMixin):\n",
928
+ " def __init__(self, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec):\n",
929
+ " self.jobs = jobs\n",
930
+ " self.tfidf_vectorizer_skills = tfidf_vectorizer_skills\n",
931
+ " self.tfidf_vectorizer_majors = tfidf_vectorizer_majors\n",
932
+ " self.companies_skills_vec = companies_skills_vec\n",
933
+ " self.companies_majors_vec = companies_majors_vec\n",
934
+ "\n",
935
+ " def fit(self, X, y=None):\n",
936
+ " return self\n",
937
+ "\n",
938
+ " def transform(self, X):\n",
939
+ " input_hard_skills_vec = self.tfidf_vectorizer_skills.transform(X['final_hard_skill'])\n",
940
+ " input_soft_skills_vec = self.tfidf_vectorizer_skills.transform(X['final_soft_skill'])\n",
941
+ " input_major_vec = self.tfidf_vectorizer_majors.transform(X['candidate_field'])\n",
942
+ "\n",
943
+ " input_skills_vec = (input_hard_skills_vec + input_soft_skills_vec) / 2\n",
944
+ "\n",
945
+ " skills_similarity = cosine_similarity(input_skills_vec, self.companies_skills_vec)\n",
946
+ " major_similarity = cosine_similarity(input_major_vec, self.companies_majors_vec)\n",
947
+ "\n",
948
+ " if skills_similarity.shape[1] != major_similarity.shape[1]:\n",
949
+ " min_dim = min(skills_similarity.shape[1], major_similarity.shape[1])\n",
950
+ " skills_similarity = skills_similarity[:, :min_dim]\n",
951
+ " major_similarity = major_similarity[:, :min_dim]\n",
952
+ "\n",
953
+ " combined_similarity = (skills_similarity + major_similarity) / 2\n",
954
+ "\n",
955
+ " recommendations = []\n",
956
+ " for i in range(combined_similarity.shape[0]):\n",
957
+ " sorted_company_indices = np.argsort(-combined_similarity[i])\n",
958
+ " recommended_companies = self.jobs.iloc[sorted_company_indices]['Major'].values[:3]\n",
959
+ " recommendations.append(recommended_companies)\n",
960
+ "\n",
961
+ " return recommendations"
962
+ ]
963
+ },
964
+ {
965
+ "cell_type": "code",
966
+ "execution_count": 28,
967
+ "metadata": {},
968
+ "outputs": [],
969
+ "source": [
970
+ "def create_recommendation_pipeline():\n",
971
+ " # Instantiate the feature engineering transformer\n",
972
+ " feature_engineering = FeatureEngineeringTransformer()\n",
973
+ "\n",
974
+ " # Define the recommendation function as a callable estimator\n",
975
+ " def recommend_jobs_function(X, y=None):\n",
976
+ " applicants_skills_vec, applicants_majors_vec = feature_engineering.fit_transform(X)\n",
977
+ " companies_skills_vec, companies_majors_vec = feature_engineering.tfidf_vectorizer_skills.transform(jobs['final_hard_skill']), feature_engineering.tfidf_vectorizer_majors.transform(jobs['Major'])\n",
978
+ " \n",
979
+ " return recommend_jobs_for_input_skills(X['final_hard_skill'], X['final_soft_skill'], X['candidate_field'], jobs, feature_engineering.tfidf_vectorizer_skills, feature_engineering.tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)\n",
980
+ "\n",
981
+ " pipeline = Pipeline([\n",
982
+ " ('feature_engineering', feature_engineering),\n",
983
+ " ('recommendation', recommend_jobs_function)\n",
984
+ " ])\n",
985
+ " \n",
986
+ " return pipeline\n",
987
+ "recommendation_pipeline = create_recommendation_pipeline()"
988
+ ]
989
+ },
990
+ {
991
+ "cell_type": "code",
992
+ "execution_count": 29,
993
+ "metadata": {},
994
+ "outputs": [
995
+ {
996
+ "name": "stdout",
997
+ "output_type": "stream",
998
+ "text": [
999
+ "Model components saved successfully!\n"
1000
+ ]
1001
+ }
1002
+ ],
1003
+ "source": [
1004
+ "import pickle\n",
1005
+ "def create_recommendation_pipeline(jobs):\n",
1006
+ " feature_engineering = FeatureEngineeringTransformer()\n",
1007
+ "\n",
1008
+ " # Fit feature engineering transformer to get the vectorizers and company vectors\n",
1009
+ " applicants_skills_vec, applicants_majors_vec = feature_engineering.fit_transform(train_user)\n",
1010
+ " companies_skills_vec = feature_engineering.tfidf_vectorizer_skills.transform(jobs['final_hard_skill'])\n",
1011
+ " companies_majors_vec = feature_engineering.tfidf_vectorizer_majors.transform(jobs['Major'])\n",
1012
+ "\n",
1013
+ " recommender = JobRecommender(jobs, feature_engineering.tfidf_vectorizer_skills, feature_engineering.tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)\n",
1014
+ "\n",
1015
+ " pipeline = Pipeline([\n",
1016
+ " ('feature_engineering', feature_engineering),\n",
1017
+ " ('recommendation', recommender)\n",
1018
+ " ])\n",
1019
+ " \n",
1020
+ " return pipeline\n",
1021
+ "\n",
1022
+ "# Create the pipeline\n",
1023
+ "recommendation_pipeline = create_recommendation_pipeline(jobs)\n",
1024
+ "\n",
1025
+ "# Save the pipeline using pickle\n",
1026
+ "model_path = \"recommendation_pipeline.pkl\"\n",
1027
+ "with open(model_path, mode=\"bw\") as f:\n",
1028
+ " pickle.dump(recommendation_pipeline, f)\n",
1029
+ "print(\"Model components saved successfully!\")\n"
1030
+ ]
1031
+ },
1032
+ {
1033
+ "cell_type": "markdown",
1034
+ "metadata": {},
1035
+ "source": [
1036
+        "Push to Hugging Face"
1037
+ ]
1038
+ },
1039
+ {
1040
+ "cell_type": "code",
1041
+ "execution_count": 48,
1042
+ "metadata": {},
1043
+ "outputs": [
1044
+ {
1045
+ "data": {
1046
+ "application/vnd.jupyter.widget-view+json": {
1047
+ "model_id": "1c9a071d0a244c4a8e8fe7403a96295c",
1048
+ "version_major": 2,
1049
+ "version_minor": 0
1050
+ },
1051
+ "text/plain": [
1052
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
1053
+ ]
1054
+ },
1055
+ "metadata": {},
1056
+ "output_type": "display_data"
1057
+ }
1058
+ ],
1059
+ "source": [
1060
+ "from huggingface_hub import notebook_login\n",
1061
+ "notebook_login()"
1062
+ ]
1063
+ },
1064
+ {
1065
+ "cell_type": "code",
1066
+ "execution_count": 50,
1067
+ "metadata": {},
1068
+ "outputs": [
1069
+ {
1070
+ "name": "stderr",
1071
+ "output_type": "stream",
1072
+ "text": [
1073
+ "c:\\Program Files\\Python311\\Lib\\site-packages\\skops\\hub_utils\\_hf_hub.py:577: FutureWarning: Creating repos on hf.co is subject to strict rate limits now and therefore this feature is to be removed from this library in version 0.10. You can use tools directly available in the huggingface_hub library instead to create and push files.\n",
1074
+ " warnings.warn(\n"
1075
+ ]
1076
+ },
1077
+ {
1078
+ "data": {
1079
+ "application/vnd.jupyter.widget-view+json": {
1080
+ "model_id": "2d4d813e6bf0451c9dbef4b9ba67b808",
1081
+ "version_major": 2,
1082
+ "version_minor": 0
1083
+ },
1084
+ "text/plain": [
1085
+ "recommendation_pipeline.pkl: 0%| | 0.00/163k [00:00<?, ?B/s]"
1086
+ ]
1087
+ },
1088
+ "metadata": {},
1089
+ "output_type": "display_data"
1090
+ }
1091
+ ],
1092
+ "source": [
1093
+ "import shutil\n",
1094
+ "import os\n",
1095
+ "from skops import card, hub_utils\n",
1096
+ "from pathlib import Path\n",
1097
+ "\n",
1098
+ "model_path = \"recommendation_pipeline.pkl\"\n",
1099
+ "local_repo = \"job-recommendation-model\"\n",
1100
+ "# Clear the existing directory if it exists\n",
1101
+ "if os.path.exists(local_repo):\n",
1102
+ " shutil.rmtree(local_repo)\n",
1103
+ "\n",
1104
+ "sample_data = pd.DataFrame({\n",
1105
+ " 'final_hard_skill': [\"Python, Java, Finance, Excel\"],\n",
1106
+ " 'final_soft_skill': [\"Communication, Teamwork\"],\n",
1107
+ " 'candidate_field': [\"\"]\n",
1108
+ "})\n",
1109
+ "\n",
1110
+ "# Initialize the local repository\n",
1111
+ "hub_utils.init(\n",
1112
+ " model=model_path,\n",
1113
+ " requirements=[\"scikit-learn\", \"pandas\", \"numpy\"],\n",
1114
+ " dst=local_repo,\n",
1115
+ " task=\"tabular-classification\",\n",
1116
+ " data=sample_data,\n",
1117
+ ")\n",
1118
+ "\n",
1119
+ "# # Create model card metadata manually\n",
1120
+ "# metadata = {\n",
1121
+ "# \"model_type\": \"Custom Recommendation Model\",\n",
1122
+ "# \"model_description\": \"This is a recommendation model for job applicants based on their skills and majors.\",\n",
1123
+ "# \"author\": \"trangannh\",\n",
1124
+ "# \"license\": \"mit\",\n",
1125
+ "# \"citation\": \"\"\"\n",
1126
+ "# @misc{example2024recommendation,\n",
1127
+ "# author = {trangannh},\n",
1128
+ "# title = {Job Recommendation Model},\n",
1129
+ "# year = {2024},\n",
1130
+ "# howpublished = {\\\\url{https://huggingface.co/job-recommendation-model}},\n",
1131
+ "# }\n",
1132
+ "# \"\"\",\n",
1133
+ "# \"limitations\": \"This model is not ready to be used in production.\",\n",
1134
+ "# }\n",
1135
+ "\n",
1136
+ "# # Create and save the model card\n",
1137
+ "# model_card = card.Card(model=model_path, metadata=metadata)\n",
1138
+ "\n",
1139
+ "# # Add the get started code\n",
1140
+ "# get_started_code = \"\"\"\n",
1141
+ "# import pickle\n",
1142
+ "# import pandas as pd\n",
1143
+ "\n",
1144
+ "# with open('recommendation_model.pkl', 'rb') as file:\n",
1145
+ "# tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec = pickle.load(file)\n",
1146
+ "\n",
1147
+ "# input_hard_skills = \"Python, Java, Finance, Excel\"\n",
1148
+ "# input_soft_skills = \"Communication, Teamwork\"\n",
1149
+ "# input_major = \"\"\n",
1150
+ "# jobs_data = pd.read_csv(\"/content/sample_data/jobs_data.csv\")\n",
1151
+ "\n",
1152
+ "# recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs_data, 'recommendation_model.pkl')\n",
1153
+ "# print(\"Recommended Jobs based on input skills and major:\")\n",
1154
+ "# print(recommended_jobs)\n",
1155
+ "# \"\"\"\n",
1156
+ "\n",
1157
+ "# model_card.add(\n",
1158
+ "# get_started_code=get_started_code,\n",
1159
+ "# model_card_authors=\"trangannh\",\n",
1160
+ "# model_description=\"This is a recommendation model for job applicants based on their skills and majors.\",\n",
1161
+ "# limitations=\"This model is not ready to be used in production.\"\n",
1162
+ "# )\n",
1163
+ "\n",
1164
+ "# # Save the model card\n",
1165
+ "# model_card.save(Path(local_repo) / \"README.md\")\n",
1166
+ "\n",
1167
+ "# Push the repository to Hugging Face Hub\n",
1168
+ "repo_id = \"trangannh/job-recommendation-model\"\n",
1169
+ "token = \"\"\n",
1170
+ "\n",
1171
+ "hub_utils.push(\n",
1172
+ " repo_id=repo_id,\n",
1173
+ " source=local_repo,\n",
1174
+ " token=token,\n",
1175
+ " commit_message=\"Initial commit of the job recommendation model\",\n",
1176
+ " create_remote=True,\n",
1177
+ ")\n"
1178
+ ]
1179
+ },
1180
+ {
1181
+ "cell_type": "code",
1182
+ "execution_count": 30,
1183
+ "metadata": {},
1184
+ "outputs": [
1185
+ {
1186
+ "name": "stdout",
1187
+ "output_type": "stream",
1188
+ "text": [
1189
+ "Recommended Jobs based on input skills and major:\n",
1190
+ "['sales' 'it jobs' 'administration & office support']\n"
1191
+ ]
1192
+ }
1193
+ ],
1194
+ "source": [
1195
+ "import pickle\n",
1196
+ "import pandas as pd\n",
1197
+ "\n",
1198
+ "# Load the model (pipeline)\n",
1199
+ "with open('recommendation_pipeline.pkl', 'rb') as file:\n",
1200
+ " recommendation_pipeline = pickle.load(file)\n",
1201
+ "\n",
1202
+ "# Example input data\n",
1203
+ "input_hard_skills = \"Python, Java, Finance, Excel\"\n",
1204
+ "input_soft_skills = \"Communication, Teamwork\"\n",
1205
+ "input_major = \"Data Science\"\n",
1206
+ "recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)\n",
1207
+ "print(\"Recommended Jobs based on input skills and major:\")\n",
1208
+ "print(recommended_jobs)"
1209
+ ]
1210
+ },
1211
+ {
1212
+ "cell_type": "markdown",
1213
+ "metadata": {},
1214
+ "source": [
1215
+ "Test API"
1216
+ ]
1217
+ },
1218
+ {
1219
+ "cell_type": "code",
1220
+ "execution_count": 31,
1221
+ "metadata": {},
1222
+ "outputs": [
1223
+ {
1224
+ "name": "stdout",
1225
+ "output_type": "stream",
1226
+ "text": [
1227
+ "Error: 503\n",
1228
+ "{'error': 'Model trangannh/job-recommendation-model is currently loading', 'estimated_time': 20.0}\n"
1229
+ ]
1230
+ }
1231
+ ],
1232
+ "source": [
1233
+ "import requests\n",
1234
+ "\n",
1235
+ "# Set up the endpoint URL and token\n",
1236
+ "endpoint = \"https://api-inference.huggingface.co/models/trangannh/job-recommendation-model\"\n",
1237
+ "token = \"\"\n",
1238
+ "\n",
1239
+ "# Prepare data\n",
1240
+ "data = {\n",
1241
+ " \"inputs\": {\n",
1242
+ " \"input_hard_skills\": \"Python, Java, Finance, Excel\",\n",
1243
+ " \"input_soft_skills\": \"Communication, Teamwork\",\n",
1244
+ " \"input_major\": \"Data Science\"\n",
1245
+ " }\n",
1246
+ "}\n",
1247
+ "\n",
1248
+ "# Send POST request\n",
1249
+ "headers = {\n",
1250
+ " \"Authorization\": f\"Bearer {token}\",\n",
1251
+ " \"Content-Type\": \"application/json\"\n",
1252
+ "}\n",
1253
+ "response = requests.post(endpoint, headers=headers, json=data)\n",
1254
+ "\n",
1255
+ "# Print the response\n",
1256
+ "if response.status_code == 200:\n",
1257
+ " print(response.json())\n",
1258
+ "else:\n",
1259
+ " print(f\"Error: {response.status_code}\")\n",
1260
+ " print(response.json())\n"
1261
+ ]
1262
+ },
1263
+ {
1264
+ "cell_type": "code",
1265
+ "execution_count": 32,
1266
+ "metadata": {},
1267
+ "outputs": [
1268
+ {
1269
+ "ename": "TypeError",
1270
+ "evalue": "tuple indices must be integers or slices, not str",
1271
+ "output_type": "error",
1272
+ "traceback": [
1273
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
1274
+ "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
1275
+ "Cell \u001b[1;32mIn[32], line 16\u001b[0m\n\u001b[0;32m 9\u001b[0m input_data \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame({\n\u001b[0;32m 10\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfinal_hard_skill\u001b[39m\u001b[38;5;124m'\u001b[39m: [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPython, Java, Finance, Excel\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m 11\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfinal_soft_skill\u001b[39m\u001b[38;5;124m'\u001b[39m: [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCommunication, Teamwork\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m 12\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcandidate_field\u001b[39m\u001b[38;5;124m'\u001b[39m: [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mData Science\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m 13\u001b[0m })\n\u001b[0;32m 15\u001b[0m \u001b[38;5;66;03m# Make recommendations\u001b[39;00m\n\u001b[1;32m---> 16\u001b[0m recommended_jobs \u001b[38;5;241m=\u001b[39m \u001b[43mrecommendation_pipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtransform\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_data\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 18\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRecommended Jobs based on input skills and major:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 19\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m rec \u001b[38;5;129;01min\u001b[39;00m recommended_jobs:\n",
1276
+ "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\sklearn\\pipeline.py:658\u001b[0m, in \u001b[0;36mPipeline.transform\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m 656\u001b[0m Xt \u001b[38;5;241m=\u001b[39m X\n\u001b[0;32m 657\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _, _, transform \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_iter():\n\u001b[1;32m--> 658\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[43mtransform\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtransform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mXt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 659\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Xt\n",
1277
+ "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\sklearn\\utils\\_set_output.py:140\u001b[0m, in \u001b[0;36m_wrap_method_output.<locals>.wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 138\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 139\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 140\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 141\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 142\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 143\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (\n\u001b[0;32m 144\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 145\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 146\u001b[0m )\n",
1278
+ "Cell \u001b[1;32mIn[27], line 13\u001b[0m, in \u001b[0;36mJobRecommender.transform\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtransform\u001b[39m(\u001b[38;5;28mself\u001b[39m, X):\n\u001b[1;32m---> 13\u001b[0m input_hard_skills_vec \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtfidf_vectorizer_skills\u001b[38;5;241m.\u001b[39mtransform(\u001b[43mX\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mfinal_hard_skill\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m)\n\u001b[0;32m 14\u001b[0m input_soft_skills_vec \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtfidf_vectorizer_skills\u001b[38;5;241m.\u001b[39mtransform(X[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfinal_soft_skill\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 15\u001b[0m input_major_vec \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtfidf_vectorizer_majors\u001b[38;5;241m.\u001b[39mtransform(X[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcandidate_field\u001b[39m\u001b[38;5;124m'\u001b[39m])\n",
1279
+ "\u001b[1;31mTypeError\u001b[0m: tuple indices must be integers or slices, not str"
1280
+ ]
1281
+ }
1282
+ ],
1283
+ "source": [
1284
+ "import pickle\n",
1285
+ "import pandas as pd\n",
1286
+ "\n",
1287
+ "# Load the model (pipeline)\n",
1288
+ "with open('recommendation_pipeline.pkl', 'rb') as file:\n",
1289
+ " recommendation_pipeline = pickle.load(file)\n",
1290
+ "\n",
1291
+ "# Example input data, converting list to string\n",
1292
+ "input_data = pd.DataFrame({\n",
1293
+ " 'final_hard_skill': [\"Python, Java, Finance, Excel\"],\n",
1294
+ " 'final_soft_skill': [\"Communication, Teamwork\"],\n",
1295
+ " 'candidate_field': [\"Data Science\"]\n",
1296
+ "})\n",
1297
+ "\n",
1298
+ "# Make recommendations\n",
1299
+ "recommended_jobs = recommendation_pipeline.transform(input_data)\n",
1300
+ "\n",
1301
+ "print(\"Recommended Jobs based on input skills and major:\")\n",
1302
+ "for rec in recommended_jobs:\n",
1303
+ " print(rec)\n"
1304
+ ]
1305
+ }
1306
+ ],
1307
+ "metadata": {
1308
+ "colab": {
1309
+ "provenance": []
1310
+ },
1311
+ "kernelspec": {
1312
+ "display_name": "Python 3",
1313
+ "name": "python3"
1314
+ },
1315
+ "language_info": {
1316
+ "codemirror_mode": {
1317
+ "name": "ipython",
1318
+ "version": 3
1319
+ },
1320
+ "file_extension": ".py",
1321
+ "mimetype": "text/x-python",
1322
+ "name": "python",
1323
+ "nbconvert_exporter": "python",
1324
+ "pygments_lexer": "ipython3",
1325
+ "version": "3.11.2"
1326
+ }
1327
+ },
1328
+ "nbformat": 4,
1329
+ "nbformat_minor": 0
1330
+ }