Michelangiolo committed on
Commit
0928f44
1 Parent(s): b468dba

Upload 1_data_processing.ipynb

Browse files
Files changed (1) hide show
  1. 1_data_processing.ipynb +356 -0
1_data_processing.ipynb ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%pip install sentence-transformers==2.0.0"
10
+ ]
11
+ },
12
+ {
13
+ "attachments": {},
14
+ "cell_type": "markdown",
15
+ "metadata": {},
16
+ "source": [
17
+ "Dataset source (Kaggle): https://www.kaggle.com/datasets/dataranch/upwork-1"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "markdown",
22
+ "metadata": {},
23
+ "source": [
24
+ "1. Load dataset with pandas"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 135,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "import ast  # literal_eval parses the stringified dicts/lists stored in the CSV without executing them (unlike eval)\n",
34
+ "import pandas as pd\n",
35
+ "\n",
36
+ "df = pd.read_csv('freelancers.csv')\n",
37
+ "df = df[['shortName', 'title', 'description', 'location', 'hourlyRate', 'avgFeedbackScore', 'skills']]\n",
38
+ "df = df.dropna(subset=['skills', 'title'])  # skills and title are both used by the later cells\n",
39
+ "df['location'] = df['location'].apply(lambda x : ast.literal_eval(x)['state'])  # keep only the 'state' entry\n",
40
+ "df['skills'] = df['skills'].apply(lambda x : [entry['skill']['name'] for entry in ast.literal_eval(x)])  # flatten to a list of skill names\n",
41
+ "df['hourlyRate'] = df['hourlyRate'].apply(lambda x : ast.literal_eval(x)['amount'])  # keep only the 'amount' entry"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "def keyword_extractor(total_keywords, str1):\n",
51
+ "    \"\"\"Return the tokens of str1 (split on single spaces) that equal a keyword, grouped in keyword order.\"\"\"\n",
52
+ "    # Comparison is exact and case-sensitive; str1 is re-split for every keyword.\n",
53
+ "    return [\n",
54
+ "        token for keyword in total_keywords\n",
55
+ "        for token in str1.split(' ') if token == keyword\n",
56
+ "    ]\n",
57
+ "total_keywords = df['skills'].explode().unique().tolist()  # every distinct skill name in the dataset\n",
58
+ "# Demo query: keep the rows whose skill list contains every keyword found in the request text\n",
59
+ "skill_keywords = keyword_extractor(total_keywords, 'I want to hire a wordpress')\n",
60
+ "df[df['skills'].map(lambda skills: all(keyword in skills for keyword in skill_keywords))]"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "markdown",
65
+ "metadata": {},
66
+ "source": [
67
+ "2. Encode every freelancer title into a vector (the `title` column holds the text, the new `text_vector_` column holds the vectors)"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 136,
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "name": "stderr",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "100%|██████████| 5815/5815 [12:10<00:00, 7.96it/s]\n"
80
+ ]
81
+ },
82
+ {
83
+ "data": {
84
+ "text/html": [
85
+ "<div>\n",
86
+ "<style scoped>\n",
87
+ " .dataframe tbody tr th:only-of-type {\n",
88
+ " vertical-align: middle;\n",
89
+ " }\n",
90
+ "\n",
91
+ " .dataframe tbody tr th {\n",
92
+ " vertical-align: top;\n",
93
+ " }\n",
94
+ "\n",
95
+ " .dataframe thead th {\n",
96
+ " text-align: right;\n",
97
+ " }\n",
98
+ "</style>\n",
99
+ "<table border=\"1\" class=\"dataframe\">\n",
100
+ " <thead>\n",
101
+ " <tr style=\"text-align: right;\">\n",
102
+ " <th></th>\n",
103
+ " <th>shortName</th>\n",
104
+ " <th>title</th>\n",
105
+ " <th>description</th>\n",
106
+ " <th>location</th>\n",
107
+ " <th>hourlyRate</th>\n",
108
+ " <th>avgFeedbackScore</th>\n",
109
+ " <th>skills</th>\n",
110
+ " <th>text_vector_</th>\n",
111
+ " </tr>\n",
112
+ " </thead>\n",
113
+ " <tbody>\n",
114
+ " <tr>\n",
115
+ " <th>0</th>\n",
116
+ " <td>Jason V.</td>\n",
117
+ " <td>Expert WordPress Developer</td>\n",
118
+ " <td>Hello! Welcome to my profile!\\n\\nMy name is Ja...</td>\n",
119
+ " <td>IL</td>\n",
120
+ " <td>60.00</td>\n",
121
+ " <td>4.925208</td>\n",
122
+ " <td>[wordpress, seo, wp-ecommerce, woocommerce, bo...</td>\n",
123
+ " <td>[0.078628771007061, 0.024731114506721497, -0.0...</td>\n",
124
+ " </tr>\n",
125
+ " <tr>\n",
126
+ " <th>1</th>\n",
127
+ " <td>Miranda S.</td>\n",
128
+ " <td>Social Media Manager</td>\n",
129
+ " <td>I am a Social Media Manager who specializes in...</td>\n",
130
+ " <td>NY</td>\n",
131
+ " <td>20.00</td>\n",
132
+ " <td>4.675676</td>\n",
133
+ " <td>[social-media-content-creation, video-editing,...</td>\n",
134
+ " <td>[0.07423530519008636, -0.022386642172932625, -...</td>\n",
135
+ " </tr>\n",
136
+ " <tr>\n",
137
+ " <th>2</th>\n",
138
+ " <td>Gagan S J.</td>\n",
139
+ " <td>Solution Architect</td>\n",
140
+ " <td>More than 25 years in IT with 20 years in US h...</td>\n",
141
+ " <td>NJ</td>\n",
142
+ " <td>65.00</td>\n",
143
+ " <td>0.000000</td>\n",
144
+ " <td>[oracle-java-ee, spring-framework, hibernate, ...</td>\n",
145
+ " <td>[0.04637446999549866, 0.03554175794124603, -0....</td>\n",
146
+ " </tr>\n",
147
+ " <tr>\n",
148
+ " <th>3</th>\n",
149
+ " <td>Roxana L.</td>\n",
150
+ " <td>Procurement, Logistics and Supply Chain profes...</td>\n",
151
+ " <td>I work FAST - I am COST EFFICIENT - I deliver ...</td>\n",
152
+ " <td>NC</td>\n",
153
+ " <td>70.00</td>\n",
154
+ " <td>4.916684</td>\n",
155
+ " <td>[procurement-function, pharmaceutical-industry...</td>\n",
156
+ " <td>[0.026502298191189766, -0.02052873745560646, -...</td>\n",
157
+ " </tr>\n",
158
+ " <tr>\n",
159
+ " <th>4</th>\n",
160
+ " <td>John M.</td>\n",
161
+ " <td>3d generalist, post production artist and Vide...</td>\n",
162
+ " <td>I am a 3D artist, animator, and designer with ...</td>\n",
163
+ " <td>OK</td>\n",
164
+ " <td>50.00</td>\n",
165
+ " <td>5.000000</td>\n",
166
+ " <td>[animation, motion-graphics, video-editing, vi...</td>\n",
167
+ " <td>[0.05356863886117935, 0.032190944999456406, -0...</td>\n",
168
+ " </tr>\n",
169
+ " <tr>\n",
170
+ " <th>...</th>\n",
171
+ " <td>...</td>\n",
172
+ " <td>...</td>\n",
173
+ " <td>...</td>\n",
174
+ " <td>...</td>\n",
175
+ " <td>...</td>\n",
176
+ " <td>...</td>\n",
177
+ " <td>...</td>\n",
178
+ " <td>...</td>\n",
179
+ " </tr>\n",
180
+ " <tr>\n",
181
+ " <th>5818</th>\n",
182
+ " <td>Megan D.</td>\n",
183
+ " <td>Web Research/Content Curation/Data Entry/Socia...</td>\n",
184
+ " <td>Extremely driven, yet easy-going gal who refus...</td>\n",
185
+ " <td>CA</td>\n",
186
+ " <td>22.37</td>\n",
187
+ " <td>4.574029</td>\n",
188
+ " <td>[data-entry, internet-research, virtual-assist...</td>\n",
189
+ " <td>[0.07812528312206268, -0.018792806193232536, -...</td>\n",
190
+ " </tr>\n",
191
+ " <tr>\n",
192
+ " <th>5819</th>\n",
193
+ " <td>Austin V.</td>\n",
194
+ " <td>Product Manager</td>\n",
195
+ " <td>Experienced in building and growing digital pl...</td>\n",
196
+ " <td>AZ</td>\n",
197
+ " <td>100.00</td>\n",
198
+ " <td>0.000000</td>\n",
199
+ " <td>[atlassian-jira, atlassian-confluence, project...</td>\n",
200
+ " <td>[0.056266412138938904, -0.007661229465156794, ...</td>\n",
201
+ " </tr>\n",
202
+ " <tr>\n",
203
+ " <th>5820</th>\n",
204
+ " <td>Eric M.</td>\n",
205
+ " <td>Web Developer/Designer And Graphic Designer</td>\n",
206
+ " <td>4 YEARS experience in Wordpress / Shopify / D...</td>\n",
207
+ " <td>OH</td>\n",
208
+ " <td>25.00</td>\n",
209
+ " <td>4.451507</td>\n",
210
+ " <td>[html, css, wordpress, shopify, joomla, drupal...</td>\n",
211
+ " <td>[0.048749279230833054, -0.013894445262849331, ...</td>\n",
212
+ " </tr>\n",
213
+ " <tr>\n",
214
+ " <th>5821</th>\n",
215
+ " <td>Kristina A.</td>\n",
216
+ " <td>Voice Over Talent, video editing, video produc...</td>\n",
217
+ " <td>A musician from birth. I studied music educati...</td>\n",
218
+ " <td>VA</td>\n",
219
+ " <td>60.00</td>\n",
220
+ " <td>5.000000</td>\n",
221
+ " <td>[articulate]</td>\n",
222
+ " <td>[0.03207482025027275, -0.027680398896336555, -...</td>\n",
223
+ " </tr>\n",
224
+ " <tr>\n",
225
+ " <th>5822</th>\n",
226
+ " <td>Ashley C.</td>\n",
227
+ " <td>Print and Digital Layout Designer</td>\n",
228
+ " <td>Hello, I'm Ashley, a professional designer and...</td>\n",
229
+ " <td>OR</td>\n",
230
+ " <td>25.00</td>\n",
231
+ " <td>4.951186</td>\n",
232
+ " <td>[print-layout-design, brochure-design, flyer-d...</td>\n",
233
+ " <td>[0.04141489043831825, -0.04500063508749008, -0...</td>\n",
234
+ " </tr>\n",
235
+ " </tbody>\n",
236
+ "</table>\n",
237
+ "<p>5815 rows × 8 columns</p>\n",
238
+ "</div>"
239
+ ],
240
+ "text/plain": [
241
+ " shortName title \\\n",
242
+ "0 Jason V. Expert WordPress Developer \n",
243
+ "1 Miranda S. Social Media Manager \n",
244
+ "2 Gagan S J. Solution Architect \n",
245
+ "3 Roxana L. Procurement, Logistics and Supply Chain profes... \n",
246
+ "4 John M. 3d generalist, post production artist and Vide... \n",
247
+ "... ... ... \n",
248
+ "5818 Megan D. Web Research/Content Curation/Data Entry/Socia... \n",
249
+ "5819 Austin V. Product Manager \n",
250
+ "5820 Eric M. Web Developer/Designer And Graphic Designer \n",
251
+ "5821 Kristina A. Voice Over Talent, video editing, video produc... \n",
252
+ "5822 Ashley C. Print and Digital Layout Designer \n",
253
+ "\n",
254
+ " description location hourlyRate \\\n",
255
+ "0 Hello! Welcome to my profile!\\n\\nMy name is Ja... IL 60.00 \n",
256
+ "1 I am a Social Media Manager who specializes in... NY 20.00 \n",
257
+ "2 More than 25 years in IT with 20 years in US h... NJ 65.00 \n",
258
+ "3 I work FAST - I am COST EFFICIENT - I deliver ... NC 70.00 \n",
259
+ "4 I am a 3D artist, animator, and designer with ... OK 50.00 \n",
260
+ "... ... ... ... \n",
261
+ "5818 Extremely driven, yet easy-going gal who refus... CA 22.37 \n",
262
+ "5819 Experienced in building and growing digital pl... AZ 100.00 \n",
263
+ "5820 4 YEARS experience in Wordpress / Shopify / D... OH 25.00 \n",
264
+ "5821 A musician from birth. I studied music educati... VA 60.00 \n",
265
+ "5822 Hello, I'm Ashley, a professional designer and... OR 25.00 \n",
266
+ "\n",
267
+ " avgFeedbackScore skills \\\n",
268
+ "0 4.925208 [wordpress, seo, wp-ecommerce, woocommerce, bo... \n",
269
+ "1 4.675676 [social-media-content-creation, video-editing,... \n",
270
+ "2 0.000000 [oracle-java-ee, spring-framework, hibernate, ... \n",
271
+ "3 4.916684 [procurement-function, pharmaceutical-industry... \n",
272
+ "4 5.000000 [animation, motion-graphics, video-editing, vi... \n",
273
+ "... ... ... \n",
274
+ "5818 4.574029 [data-entry, internet-research, virtual-assist... \n",
275
+ "5819 0.000000 [atlassian-jira, atlassian-confluence, project... \n",
276
+ "5820 4.451507 [html, css, wordpress, shopify, joomla, drupal... \n",
277
+ "5821 5.000000 [articulate] \n",
278
+ "5822 4.951186 [print-layout-design, brochure-design, flyer-d... \n",
279
+ "\n",
280
+ " text_vector_ \n",
281
+ "0 [0.078628771007061, 0.024731114506721497, -0.0... \n",
282
+ "1 [0.07423530519008636, -0.022386642172932625, -... \n",
283
+ "2 [0.04637446999549866, 0.03554175794124603, -0.... \n",
284
+ "3 [0.026502298191189766, -0.02052873745560646, -... \n",
285
+ "4 [0.05356863886117935, 0.032190944999456406, -0... \n",
286
+ "... ... \n",
287
+ "5818 [0.07812528312206268, -0.018792806193232536, -... \n",
288
+ "5819 [0.056266412138938904, -0.007661229465156794, ... \n",
289
+ "5820 [0.048749279230833054, -0.013894445262849331, ... \n",
290
+ "5821 [0.03207482025027275, -0.027680398896336555, -... \n",
291
+ "5822 [0.04141489043831825, -0.04500063508749008, -0... \n",
292
+ "\n",
293
+ "[5815 rows x 8 columns]"
294
+ ]
295
+ },
296
+ "execution_count": 136,
297
+ "metadata": {},
298
+ "output_type": "execute_result"
299
+ }
300
+ ],
301
+ "source": [
302
+ "import pandas as pd\n",
303
+ "from tqdm import tqdm\n",
304
+ "from sentence_transformers import SentenceTransformer\n",
305
+ "tqdm.pandas()  # kept so Series.progress_apply stays available\n",
306
+ "\n",
307
+ "model = SentenceTransformer('all-mpnet-base-v2')  # alternative checkpoint: all-MiniLM-L6-v2\n",
308
+ "\n",
309
+ "# Encode all titles in one batched call instead of one encode() per row; each vector is stored as a plain list of floats\n",
310
+ "df['text_vector_'] = model.encode(df['title'].tolist(), show_progress_bar=True).tolist()\n",
311
+ "df"
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": 133,
317
+ "metadata": {},
318
+ "outputs": [],
319
+ "source": [
320
+ "import random\n",
321
+ "\n",
322
+ "random.seed(42)  # fixed seed so the random cities written to the parquet file are reproducible\n",
323
+ "df = df.drop_duplicates(subset=['shortName', 'location', 'title', 'hourlyRate', 'avgFeedbackScore', 'description']).reset_index(drop=True)\n",
324
+ "df['location'] = df['location'].apply(lambda x : random.choice(['New York', 'Chicago', 'Washington']))  # overwrite the parsed state with a random city\n",
325
+ "df.to_parquet('df_encoded.parquet', index=None)"
326
+ ]
327
+ }
328
+ ],
329
+ "metadata": {
330
+ "kernelspec": {
331
+ "display_name": "Python 3.9.0 64-bit",
332
+ "language": "python",
333
+ "name": "python3"
334
+ },
335
+ "language_info": {
336
+ "codemirror_mode": {
337
+ "name": "ipython",
338
+ "version": 3
339
+ },
340
+ "file_extension": ".py",
341
+ "mimetype": "text/x-python",
342
+ "name": "python",
343
+ "nbconvert_exporter": "python",
344
+ "pygments_lexer": "ipython3",
345
+ "version": "3.9.13"
346
+ },
347
+ "orig_nbformat": 4,
348
+ "vscode": {
349
+ "interpreter": {
350
+ "hash": "fdf377d643bc1cb065454f0ad2ceac75d834452ecf289e7ba92c6b3f59a7cee1"
351
+ }
352
+ }
353
+ },
354
+ "nbformat": 4,
355
+ "nbformat_minor": 2
356
+ }