{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install sentence-transformers==2.0.0"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://www.kaggle.com/datasets/dataranch/upwork-1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "1. Load dataset with pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "from collections import Counter\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Keep only the columns this notebook works with.\n",
    "df = pd.read_csv('freelancers.csv')[\n",
    "    ['shortName', 'title', 'description', 'location', 'hourlyRate', 'avgFeedbackScore', 'skills']\n",
    "]\n",
    "# One pass over both required columns; a list for `subset` is accepted by every\n",
    "# pandas version, whereas a bare string is only accepted by newer releases.\n",
    "df = df.dropna(subset=['skills', 'title'])\n",
    "\n",
    "# These columns hold dict/list literals serialised as strings. ast.literal_eval\n",
    "# parses literals only, so content coming from the CSV can never execute code\n",
    "# the way eval() would.\n",
    "df['location'] = df['location'].apply(lambda raw: ast.literal_eval(raw)['state'])\n",
    "df['skills'] = df['skills'].apply(\n",
    "    lambda raw: [entry['skill']['name'] for entry in ast.literal_eval(raw)]\n",
    ")\n",
    "df['hourlyRate'] = df['hourlyRate'].apply(lambda raw: ast.literal_eval(raw)['amount'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def keyword_extractor(total_keywords, str1):\n",
    "    \"\"\"Return the known keywords that occur as whole words in ``str1``.\n",
    "\n",
    "    Args:\n",
    "        total_keywords: iterable of known keywords.\n",
    "        str1: free-text query; it is split on whitespace and each word is\n",
    "            compared to the keywords exactly (case-sensitive).\n",
    "\n",
    "    Returns:\n",
    "        list: matched keywords in ``total_keywords`` order, repeated once per\n",
    "        occurrence in ``str1``.\n",
    "    \"\"\"\n",
    "    # Count the query words once so every keyword is a single lookup instead\n",
    "    # of a rescan of the whole sentence.\n",
    "    word_counts = Counter(str1.split())\n",
    "    word_list = []\n",
    "    for keyword in total_keywords:\n",
    "        word_list.extend([keyword] * word_counts[keyword])\n",
    "    return word_list\n",
    "\n",
    "\n",
    "# Every distinct skill name in the dataset. Rows with an empty skills list\n",
    "# explode to NaN, which can never equal a query word, so they are dropped.\n",
    "total_keywords = df['skills'].explode().dropna().unique().tolist()\n",
    "\n",
    "skill_keywords = keyword_extractor(total_keywords, 'I want to hire a wordpress')\n",
    "# Keep freelancers who list every requested skill. If no keyword was\n",
    "# recognised, all() is vacuously True and every row is returned.\n",
    "df[df['skills'].apply(lambda skills: all(keyword in skills for keyword in skill_keywords))]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "2. Encode 100 samples into vectors (1 column with product text, 1 column with vectors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 5815/5815 [12:10<00:00, 7.96it/s]\n"
     ]
    },
    {
     "data": {
      "text/html": [ "
\n", " | shortName | \n", "title | \n", "description | \n", "location | \n", "hourlyRate | \n", "avgFeedbackScore | \n", "skills | \n", "text_vector_ | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "Jason V. | \n", "Expert WordPress Developer | \n", "Hello! Welcome to my profile!\\n\\nMy name is Ja... | \n", "IL | \n", "60.00 | \n", "4.925208 | \n", "[wordpress, seo, wp-ecommerce, woocommerce, bo... | \n", "[0.078628771007061, 0.024731114506721497, -0.0... | \n", "
1 | \n", "Miranda S. | \n", "Social Media Manager | \n", "I am a Social Media Manager who specializes in... | \n", "NY | \n", "20.00 | \n", "4.675676 | \n", "[social-media-content-creation, video-editing,... | \n", "[0.07423530519008636, -0.022386642172932625, -... | \n", "
2 | \n", "Gagan S J. | \n", "Solution Architect | \n", "More than 25 years in IT with 20 years in US h... | \n", "NJ | \n", "65.00 | \n", "0.000000 | \n", "[oracle-java-ee, spring-framework, hibernate, ... | \n", "[0.04637446999549866, 0.03554175794124603, -0.... | \n", "
3 | \n", "Roxana L. | \n", "Procurement, Logistics and Supply Chain profes... | \n", "I work FAST - I am COST EFFICIENT - I deliver ... | \n", "NC | \n", "70.00 | \n", "4.916684 | \n", "[procurement-function, pharmaceutical-industry... | \n", "[0.026502298191189766, -0.02052873745560646, -... | \n", "
4 | \n", "John M. | \n", "3d generalist, post production artist and Vide... | \n", "I am a 3D artist, animator, and designer with ... | \n", "OK | \n", "50.00 | \n", "5.000000 | \n", "[animation, motion-graphics, video-editing, vi... | \n", "[0.05356863886117935, 0.032190944999456406, -0... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
5818 | \n", "Megan D. | \n", "Web Research/Content Curation/Data Entry/Socia... | \n", "Extremely driven, yet easy-going gal who refus... | \n", "CA | \n", "22.37 | \n", "4.574029 | \n", "[data-entry, internet-research, virtual-assist... | \n", "[0.07812528312206268, -0.018792806193232536, -... | \n", "
5819 | \n", "Austin V. | \n", "Product Manager | \n", "Experienced in building and growing digital pl... | \n", "AZ | \n", "100.00 | \n", "0.000000 | \n", "[atlassian-jira, atlassian-confluence, project... | \n", "[0.056266412138938904, -0.007661229465156794, ... | \n", "
5820 | \n", "Eric M. | \n", "Web Developer/Designer And Graphic Designer | \n", "4 YEARS experience in Wordpress / Shopify / D... | \n", "OH | \n", "25.00 | \n", "4.451507 | \n", "[html, css, wordpress, shopify, joomla, drupal... | \n", "[0.048749279230833054, -0.013894445262849331, ... | \n", "
5821 | \n", "Kristina A. | \n", "Voice Over Talent, video editing, video produc... | \n", "A musician from birth. I studied music educati... | \n", "VA | \n", "60.00 | \n", "5.000000 | \n", "[articulate] | \n", "[0.03207482025027275, -0.027680398896336555, -... | \n", "
5822 | \n", "Ashley C. | \n", "Print and Digital Layout Designer | \n", "Hello, I'm Ashley, a professional designer and... | \n", "OR | \n", "25.00 | \n", "4.951186 | \n", "[print-layout-design, brochure-design, flyer-d... | \n", "[0.04141489043831825, -0.04500063508749008, -0... | \n", "
5815 rows × 8 columns
\n", "