{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "_NrjL2ccH3yp" }, "source": [ "RECOMMENDATION MODEL" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "IZfnA6W_GDyf" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "MV-7idG1F_NU" }, "outputs": [], "source": [ "# Mock data creation\n", "def create_mock_data():\n", " users_data = \"rematch_train_candidate_field.csv\"\n", " applicants = pd.read_csv(users_data)\n", "\n", " jobs_data = \"jobs_data.csv\"\n", " companies = pd.read_csv(jobs_data)\n", "\n", " train_applicants = applicants\n", " test_data = \"1st_test.csv\"\n", " # \"/content/sample_data/test_train.csv\"\n", " test_applicants = pd.read_csv(test_data)\n", "\n", " return train_applicants, test_applicants, companies" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "id": "wF1oZ6Ez96BE" }, "outputs": [], "source": [ "train_user, test_user, jobs = create_mock_data()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "print(type(train_user))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Gj8tJNrph8Go", "outputId": "a44b8cf0-a56f-4cd2-bbda-ca9bcabf35a0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data size: 23724\n", "Test data size: 4745\n" ] } ], "source": [ "print(\"Training data size:\", train_user.shape[0])\n", "print(\"Test data size:\", test_user.shape[0])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "id": "d0XY4al7K0UT" }, "outputs": [], "source": [ "list_hard_skill = [test_user[\"hard_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(test_user))]\n", "list_soft_skill = [test_user[\"soft_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(test_user))]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "print(type(list_hard_skill))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 213 }, "id": "JOZ9_NlLK8uS", "outputId": "17d09f55-192f-4486-bb47-b56f525d44a3" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
User IDcandidate_fieldlabelhard_skillsoft_skillfinal_hard_skillfinal_soft_skill
014649it jobs1['act', 'advertising sales', 'algorithms', 'bu...['collaboration', 'decision making', 'operatio...act, advertising sales, algorithms, business, ...collaboration, decision making, operations, wr...
1801marketing0['act', 'brand communication', 'business', 'bu...['collaboration', 'customer service', 'managem...act, brand communication, business, business d...collaboration, customer service, management
24393accounting0['application', 'balance sheet', 'finance', 'p...['filing', 'management']application, balance sheet, finance, property ...filing, management
\n", "
" ], "text/plain": [ " User ID candidate_field label \\\n", "0 14649 it jobs 1 \n", "1 801 marketing 0 \n", "2 4393 accounting 0 \n", "\n", " hard_skill \\\n", "0 ['act', 'advertising sales', 'algorithms', 'bu... \n", "1 ['act', 'brand communication', 'business', 'bu... \n", "2 ['application', 'balance sheet', 'finance', 'p... \n", "\n", " soft_skill \\\n", "0 ['collaboration', 'decision making', 'operatio... \n", "1 ['collaboration', 'customer service', 'managem... \n", "2 ['filing', 'management'] \n", "\n", " final_hard_skill \\\n", "0 act, advertising sales, algorithms, business, ... \n", "1 act, brand communication, business, business d... \n", "2 application, balance sheet, finance, property ... \n", "\n", " final_soft_skill \n", "0 collaboration, decision making, operations, wr... \n", "1 collaboration, customer service, management \n", "2 filing, management " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_user[\"final_hard_skill\"] = pd.DataFrame(list_hard_skill)\n", "test_user[\"final_soft_skill\"] = pd.DataFrame(list_soft_skill)\n", "test_user.head(3)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "kYbjYsDjABda" }, "outputs": [], "source": [ "list_hard_skill = [train_user[\"hard_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(train_user))]\n", "list_soft_skill = [train_user[\"soft_skill\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(train_user))]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 213 }, "id": "GC8bn3cjB8D5", "outputId": "436e843d-425e-4ce2-e551-e4f249bdd10b" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
User IDcandidate_fieldlabelhard_skillsoft_skillfinal_hard_skillfinal_soft_skill
01retail & consumer products0['business', 'merchandising', 'sales', 'service']['customer service']business, merchandising, sales, servicecustomer service
12sales0['application', 'business', 'business requirem...['accountability', 'collaboration', 'innovatio...application, business, business requirements, ...accountability, collaboration, innovation, man...
23healthcare & medical0['application', 'cancer', 'endocrinology', 'hy...['research', 'training and development']application, cancer, endocrinology, hydrothera...research, training and development
\n", "
" ], "text/plain": [ " User ID candidate_field label \\\n", "0 1 retail & consumer products 0 \n", "1 2 sales 0 \n", "2 3 healthcare & medical 0 \n", "\n", " hard_skill \\\n", "0 ['business', 'merchandising', 'sales', 'service'] \n", "1 ['application', 'business', 'business requirem... \n", "2 ['application', 'cancer', 'endocrinology', 'hy... \n", "\n", " soft_skill \\\n", "0 ['customer service'] \n", "1 ['accountability', 'collaboration', 'innovatio... \n", "2 ['research', 'training and development'] \n", "\n", " final_hard_skill \\\n", "0 business, merchandising, sales, service \n", "1 application, business, business requirements, ... \n", "2 application, cancer, endocrinology, hydrothera... \n", "\n", " final_soft_skill \n", "0 customer service \n", "1 accountability, collaboration, innovation, man... \n", "2 research, training and development " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_user[\"final_hard_skill\"] = pd.DataFrame(list_hard_skill)\n", "train_user[\"final_soft_skill\"] = pd.DataFrame(list_soft_skill)\n", "train_user.head(3)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "id": "znBy9q8XDcM7" }, "outputs": [], "source": [ "list_hard_skill = [jobs[\"Hard Skills\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(jobs))]\n", "list_soft_skill = [jobs[\"Soft Skills\"].iloc[i].replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\") for i in range(len(jobs))]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 213 }, "id": "knFii8o3EQmv", "outputId": "47afb484-0765-4ad9-8765-d084673450ac" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Job IDMajorHard SkillsSoft Skillsfinal_hard_skillfinal_soft_skill
01accounting['business', 'finance', 'excel', 'tax', 'servi...['management', 'planning', 'operations', 'lead...business, finance, excel, tax, service, data, ...management, planning, operations, leadership, ...
12administration & office support['service', 'business', 'data', 'excel', 'appl...['management', 'customer service', 'microsoft ...service, business, data, excel, application, s...management, customer service, microsoft office...
23advertising, arts & media['business', 'digital', 'sales', 'service', 'a...['management', 'social media', 'writing', 'com...business, digital, sales, service, application...management, social media, writing, communicati...
\n", "
" ], "text/plain": [ " Job ID Major \\\n", "0 1 accounting \n", "1 2 administration & office support \n", "2 3 advertising, arts & media \n", "\n", " Hard Skills \\\n", "0 ['business', 'finance', 'excel', 'tax', 'servi... \n", "1 ['service', 'business', 'data', 'excel', 'appl... \n", "2 ['business', 'digital', 'sales', 'service', 'a... \n", "\n", " Soft Skills \\\n", "0 ['management', 'planning', 'operations', 'lead... \n", "1 ['management', 'customer service', 'microsoft ... \n", "2 ['management', 'social media', 'writing', 'com... \n", "\n", " final_hard_skill \\\n", "0 business, finance, excel, tax, service, data, ... \n", "1 service, business, data, excel, application, s... \n", "2 business, digital, sales, service, application... \n", "\n", " final_soft_skill \n", "0 management, planning, operations, leadership, ... \n", "1 management, customer service, microsoft office... \n", "2 management, social media, writing, communicati... " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "jobs[\"final_hard_skill\"] = pd.DataFrame(list_hard_skill)\n", "jobs[\"final_soft_skill\"] = pd.DataFrame(list_soft_skill)\n", "jobs.head(3)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "id": "wiDiHL6lStnd" }, "outputs": [], "source": [ "# Feature Engineering\n", "def feature_engineering(applicants, companies):\n", " # Vectorize skills and majors\n", " tfidf_vectorizer_skills = TfidfVectorizer()\n", " tfidf_vectorizer_majors = TfidfVectorizer()\n", "\n", " all_skills = pd.concat([applicants['final_hard_skill'], applicants['final_soft_skill'],\n", " companies['final_hard_skill'], companies['final_soft_skill']])\n", " all_majors = pd.concat([applicants['candidate_field'], companies['Major']])\n", "\n", " all_skills_vectorized = tfidf_vectorizer_skills.fit_transform(all_skills)\n", " all_majors_vectorized = tfidf_vectorizer_majors.fit_transform(all_majors)\n", "\n", " num_applicants = len(applicants)\n", " num_companies = len(companies)\n", "\n", " # Split the TF-IDF vectors back into applicants and companies\n", " applicants_skills_vectorized = all_skills_vectorized[:num_applicants*2] # because each applicant has 2 skill entries\n", " companies_skills_vectorized = all_skills_vectorized[num_applicants*2:]\n", "\n", " applicants_majors_vectorized = all_majors_vectorized[:num_applicants]\n", " companies_majors_vectorized = all_majors_vectorized[num_applicants:]\n", "\n", " return (applicants_skills_vectorized, applicants_majors_vectorized,\n", " companies_skills_vectorized, companies_majors_vectorized, tfidf_vectorizer_skills, tfidf_vectorizer_majors)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "id": "THM0mszQGNyD" }, "outputs": [], "source": [ "def compute_similarity(applicants_skills_vectorized, applicants_majors_vectorized,\n", " companies_skills_vectorized, companies_majors_vectorized):\n", " # Calculate similarity based on skills (averaging hard and soft skills similarities)\n", " applicants_skills = (applicants_skills_vectorized[0::2] + applicants_skills_vectorized[1::2]) / 2\n", " companies_skills = (companies_skills_vectorized[0::2] + companies_skills_vectorized[1::2]) / 2\n", "\n", " skills_similarity = cosine_similarity(applicants_skills, companies_skills)\n", "\n", " # Calculate similarity based on majors\n", " majors_similarity = cosine_similarity(applicants_majors_vectorized, companies_majors_vectorized)\n", "\n", " # Ensure the number of companies in both similarities is aligned\n", " if skills_similarity.shape[1] 
!= majors_similarity.shape[1]:\n", " min_dim = min(skills_similarity.shape[1], majors_similarity.shape[1])\n", " skills_similarity = skills_similarity[:, :min_dim]\n", " majors_similarity = majors_similarity[:, :min_dim]\n", "\n", " # Combine these similarities (simple average for this example)\n", " combined_similarity = (skills_similarity + majors_similarity) / 2\n", " return combined_similarity" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "id": "ter3YAzxoelD" }, "outputs": [], "source": [ "# Recommendation Function\n", "def recommend_jobs(applicants, companies, similarity_scores):\n", " recommendations = {}\n", " for i, applicant in enumerate(applicants['User ID']):\n", " if i < len(similarity_scores):\n", " sorted_company_indices = np.argsort(-similarity_scores[i]) # Descending sort of scores\n", " recommended_companies = companies.iloc[sorted_company_indices]['Major'].values[:3] # Top 3 recommendations\n", " recommendations[applicant] = recommended_companies\n", " return recommendations\n", "\n", "# Testing and Evaluation Function\n", "def print_recommendations(applicants, companies, recommendations):\n", " # This is a mock function since we don't have ground truth to compare to.\n", " # In a real scenario, we would compare against actual matches or use some form of feedback.\n", " print(\"Recommendations for each applicant:\")\n", " for applicant in recommendations:\n", " print(f\"{applicant}: {recommendations[applicant]}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "collapsed": true, "id": "Ajxp0xelIrl2", "outputId": "08bafc5b-73cc-4695-924a-931840047dd5" }, "outputs": [], "source": [ "# Let's create and process the data, and compute recommendations\n", "# train_applicants, test_applicants, companies = create_mock_data()\n", "applicants_skills_vec, applicants_majors_vec, companies_skills_vec, companies_majors_vec, tfidf_vectorizer_skills, tfidf_vectorizer_majors = feature_engineering(train_user, jobs)\n", "\n", "similarity_scores = compute_similarity(applicants_skills_vec, applicants_majors_vec, companies_skills_vec, companies_majors_vec)\n", "recommendations = recommend_jobs(test_user, jobs, similarity_scores)\n", "\n", "# Output the recommendations to observe the results\n", "print_recommendations(test_user, jobs, recommendations)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nj-HEdyJlYNY", "outputId": "063b84bc-5717-4a0c-8367-939a054657bc" }, "outputs": [], "source": [ "# Process input skills and recommend jobs\n", "def recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec):\n", " input_hard_skills_vec = tfidf_vectorizer_skills.transform([input_hard_skills])\n", " input_soft_skills_vec = tfidf_vectorizer_skills.transform([input_soft_skills])\n", " input_major_vec = tfidf_vectorizer_majors.transform([input_major])\n", "\n", " # Average the vectorized hard and soft skills\n", " input_skills_vec = (input_hard_skills_vec + input_soft_skills_vec) / 2\n", "\n", " # Compute similarities\n", " skills_similarity = cosine_similarity(input_skills_vec, companies_skills_vec)\n", " major_similarity = cosine_similarity(input_major_vec, companies_majors_vec)\n", "\n", " # Ensure the number of companies in both similarities is aligned\n", " if skills_similarity.shape[1] != 
major_similarity.shape[1]:\n", "        min_dim = min(skills_similarity.shape[1], major_similarity.shape[1])\n", "        skills_similarity = skills_similarity[:, :min_dim]\n", "        major_similarity = major_similarity[:, :min_dim]\n", "\n", "    # Combine similarities\n", "    combined_similarity = (skills_similarity + major_similarity) / 2\n", "\n", "    # Get top 3 job recommendations\n", "    sorted_company_indices = np.argsort(-combined_similarity[0])\n", "    recommended_companies = jobs.iloc[sorted_company_indices]['Major'].values[:3]\n", "\n", "    return recommended_companies" ] },
{ "cell_type": "markdown", "metadata": { "id": "IMTilMnQINZC" }, "source": [ "TEST RECOMMENDATION SYSTEM" ] },
{ "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Recommended Jobs based on input skills and major:\n", "['it jobs' 'sales' 'administration & office support']\n" ] } ], "source": [ "input_hard_skills = \"Java, Excel, Python\"\n", "input_soft_skills = \"Communication, Teamwork\"\n", "input_major = \"Economy\"\n", "\n", "recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)\n", "print(\"Recommended Jobs based on input skills and major:\")\n", "print(recommended_jobs)" ] },
{ "cell_type": "markdown", "metadata": { "id": "kShd99z_NiTa" }, "source": [ "Evaluation (PENDING)" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": { "id": "WfEgjqw9JE3l" }, "outputs": [], "source": [ "def create_ground_truth(csv_file_path):\n", "    data = pd.read_csv(csv_file_path)\n", "\n", "    # Build the `ground_truth` dictionary\n", "    ground_truth = {}\n", "    for index, row in data.iterrows():\n", "        user_id = row['User ID']\n", "        actual_major = row['candidate_field']\n", "\n", "        # Add to the dictionary, assuming each candidate chooses only one job\n", "        ground_truth[user_id] = [actual_major]\n", "\n", "    return ground_truth\n", "\n", "# Use the function above to build `ground_truth`\n", "csv_file_path = '1st_test.csv'\n", "ground_truth = create_ground_truth(csv_file_path)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "collapsed": true, "id": "TRiD4oS-AKFE", "outputId": "256fadeb-b250-4602-affb-005cb9c658eb" }, "outputs": [], "source": [ "display(ground_truth)" ] },
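{ "cell_type": "markdown", "metadata": {}, "source": [ "Before the metric cells below, a minimal self-contained sketch of what Precision@3 measures: for each user, count how many of the top-3 recommended majors appear in that user's ground-truth list, divide by 3, and average over users. The `recs` and `truth` dictionaries in the sketch are hypothetical toy values, not project data." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hedged sketch: hand-computed Precision@3 on two hypothetical users.\n", "# `recs` and `truth` are made-up toy examples; the real dictionaries are the\n", "# `recommendations` and `ground_truth` objects built in the surrounding cells.\n", "recs = {101: ['it jobs', 'sales', 'marketing'], 102: ['accounting', 'legal', 'sales']}\n", "truth = {101: ['sales'], 102: ['engineering']}\n", "\n", "scores = []\n", "for user, top3 in recs.items():\n", "    hits = sum(1 for major in top3[:3] if major in truth.get(user, []))\n", "    scores.append(hits / 3)\n", "\n", "print('Toy Precision@3:', sum(scores) / len(scores))  # (1/3 + 0/3) / 2 = 0.1666...\n" ] },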
{ "cell_type": "code", "execution_count": 40, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pXsa_wbANjmb", "outputId": "9bd4fc1e-781b-439c-fe35-c28769f6714c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average Precision@3 with 18979 trains and 4745 tests: 0.12764313312258516\n" ] } ], "source": [ "def precision_at_k(recommendations, ground_truth, k=3):\n", "    \"\"\"\n", "    Calculate the precision at k for the recommendation system.\n", "\n", "    Parameters:\n", "    - recommendations (dict): Dictionary where keys are user IDs and values are lists of recommended majors.\n", "    - ground_truth (dict): Dictionary where keys are user IDs and values are lists of truly suitable majors.\n", "    - k (int): The number of top recommendations to consider for calculating precision.\n", "\n", "    Returns:\n", "    - float: The average precision at k for all users.\n", "    \"\"\"\n", "    precision_scores = []\n", "\n", "    for applicant, recommended_major in recommendations.items():\n", "        if applicant in ground_truth:\n", "            # Get top k recommendations\n", "            top_k_recs = recommended_major[:k]\n", "            # Calculate the number of relevant recommendations\n", "            relevant_recs = sum(1 for major in top_k_recs if major in ground_truth[applicant])\n", "            # Precision at k for this user\n", "            precision = relevant_recs / k\n", "            precision_scores.append(precision)\n", "\n", "    # Average precision at k over all users\n", "    average_precision = np.mean(precision_scores) if precision_scores else 0\n", "    return average_precision\n", "\n", "avg_precision = precision_at_k(recommendations, ground_truth)\n", "print(\"Average Precision@3 with 18979 trains and 4745 tests:\", avg_precision)" ] },
{ "cell_type": "code", "execution_count": 41, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KAIvtKEaRQml", "outputId": "7dd82dc6-0e1b-43d5-bc95-cb457cde5d72" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average Recall@3 with 18979 trains and 4745 tests: 0.38292939936775555\n" ] } ], "source": [ "def recall_at_k(recommendations, ground_truth, k=3):\n", "    recall_scores = []\n", "\n", "    for user_id, recommended_majors in recommendations.items():\n", "        if user_id in ground_truth:\n", "            # Get top k recommendations\n", "            top_k_recs = recommended_majors[:k]\n", "            # Calculate the number of relevant recommendations\n", "            relevant_recs = sum(1 for major in top_k_recs if major in ground_truth[user_id])\n", "            # Calculate the total number of relevant items\n", "            total_relevant = len(ground_truth[user_id])\n", "            # Recall at k for this user\n", "            recall = relevant_recs / total_relevant if total_relevant else 0\n", "            recall_scores.append(recall)\n", "\n", "    # Average recall at k over all users\n", "    average_recall = sum(recall_scores) / len(recall_scores) if recall_scores else 0\n", "    return average_recall\n", "\n", "# Example usage:\n", "avg_recall = recall_at_k(recommendations, ground_truth)\n", "print(\"Average Recall@3 with 18979 trains and 4745 tests:\", avg_recall)\n" ] },
{ "cell_type": "code", "execution_count": 42, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QUHBsQS_-5Eu", "outputId": "fdab3075-dab8-458e-e663-2564b20da97c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average F1 Score@3: 0.19146469968387775\n" ] } ], "source": [ "def f1_score_at_k(recommendations, ground_truth, k=3):\n", "    precision = precision_at_k(recommendations, ground_truth, k)\n", "    recall = recall_at_k(recommendations, ground_truth, k)\n", "\n", "    if precision + recall == 0:\n", "        return 0\n", "\n", "    f1_score = 2 * (precision * recall) / (precision + recall)\n", "    return f1_score\n", "\n", "avg_f1_score = f1_score_at_k(recommendations, ground_truth)\n", "\n", "print(\"Average F1 Score@3:\", avg_f1_score)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Create pipeline" ] },
{ "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.base import BaseEstimator, TransformerMixin" ] },
{ "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):\n", "    def __init__(self):\n", "        self.tfidf_vectorizer_skills = TfidfVectorizer()\n", "        self.tfidf_vectorizer_majors = TfidfVectorizer()\n", "\n", "    def fit(self, X, y=None):\n", "        all_skills = 
pd.concat([X['final_hard_skill'], X['final_soft_skill']])\n", " all_majors = X['candidate_field']\n", " \n", " self.tfidf_vectorizer_skills.fit(all_skills)\n", " self.tfidf_vectorizer_majors.fit(all_majors)\n", " return self\n", " \n", " def transform(self, X):\n", " all_skills = pd.concat([X['final_hard_skill'], X['final_soft_skill']])\n", " all_majors = X['candidate_field']\n", " \n", " applicants_skills_vec = self.tfidf_vectorizer_skills.transform(all_skills)\n", " applicants_majors_vec = self.tfidf_vectorizer_majors.transform(all_majors)\n", " \n", " return applicants_skills_vec, applicants_majors_vec" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "class JobRecommender(BaseEstimator, TransformerMixin):\n", " def __init__(self, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec):\n", " self.jobs = jobs\n", " self.tfidf_vectorizer_skills = tfidf_vectorizer_skills\n", " self.tfidf_vectorizer_majors = tfidf_vectorizer_majors\n", " self.companies_skills_vec = companies_skills_vec\n", " self.companies_majors_vec = companies_majors_vec\n", "\n", " def fit(self, X, y=None):\n", " return self\n", "\n", " def transform(self, X):\n", " input_hard_skills_vec = self.tfidf_vectorizer_skills.transform(X['final_hard_skill'])\n", " input_soft_skills_vec = self.tfidf_vectorizer_skills.transform(X['final_soft_skill'])\n", " input_major_vec = self.tfidf_vectorizer_majors.transform(X['candidate_field'])\n", "\n", " input_skills_vec = (input_hard_skills_vec + input_soft_skills_vec) / 2\n", "\n", " skills_similarity = cosine_similarity(input_skills_vec, self.companies_skills_vec)\n", " major_similarity = cosine_similarity(input_major_vec, self.companies_majors_vec)\n", "\n", " if skills_similarity.shape[1] != major_similarity.shape[1]:\n", " min_dim = min(skills_similarity.shape[1], major_similarity.shape[1])\n", " skills_similarity = skills_similarity[:, :min_dim]\n", " major_similarity = major_similarity[:, :min_dim]\n", "\n", " combined_similarity = (skills_similarity + major_similarity) / 2\n", "\n", " recommendations = []\n", " for i in range(combined_similarity.shape[0]):\n", " sorted_company_indices = np.argsort(-combined_similarity[i])\n", " recommended_companies = self.jobs.iloc[sorted_company_indices]['Major'].values[:3]\n", " recommendations.append(recommended_companies)\n", "\n", " return recommendations" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "def create_recommendation_pipeline():\n", " # Instantiate the feature engineering transformer\n", " feature_engineering = FeatureEngineeringTransformer()\n", "\n", " # Define the recommendation function as a callable estimator\n", " def recommend_jobs_function(X, y=None):\n", " applicants_skills_vec, applicants_majors_vec = feature_engineering.fit_transform(X)\n", " companies_skills_vec, companies_majors_vec = feature_engineering.tfidf_vectorizer_skills.transform(jobs['final_hard_skill']), feature_engineering.tfidf_vectorizer_majors.transform(jobs['Major'])\n", " \n", " return recommend_jobs_for_input_skills(X['final_hard_skill'], X['final_soft_skill'], X['candidate_field'], jobs, feature_engineering.tfidf_vectorizer_skills, feature_engineering.tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)\n", "\n", " pipeline = Pipeline([\n", " ('feature_engineering', feature_engineering),\n", " ('recommendation', recommend_jobs_function)\n", " ])\n", " \n", " return pipeline\n", 
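"\n", "# Note: `recommend_jobs_function` above is a plain function, not an estimator with\n", "# fit/transform, so this draft Pipeline cannot actually be fitted or applied as built;\n", "# the create_recommendation_pipeline(jobs) version in the next cell supersedes it.\n",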
"recommendation_pipeline = create_recommendation_pipeline()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model components saved successfully!\n" ] } ], "source": [ "import pickle\n", "def create_recommendation_pipeline(jobs):\n", " feature_engineering = FeatureEngineeringTransformer()\n", "\n", " # Fit feature engineering transformer to get the vectorizers and company vectors\n", " applicants_skills_vec, applicants_majors_vec = feature_engineering.fit_transform(train_user)\n", " companies_skills_vec = feature_engineering.tfidf_vectorizer_skills.transform(jobs['final_hard_skill'])\n", " companies_majors_vec = feature_engineering.tfidf_vectorizer_majors.transform(jobs['Major'])\n", "\n", " recommender = JobRecommender(jobs, feature_engineering.tfidf_vectorizer_skills, feature_engineering.tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)\n", "\n", " pipeline = Pipeline([\n", " ('feature_engineering', feature_engineering),\n", " ('recommendation', recommender)\n", " ])\n", " \n", " return pipeline\n", "\n", "# Create the pipeline\n", "recommendation_pipeline = create_recommendation_pipeline(jobs)\n", "\n", "# Save the pipeline using pickle\n", "model_path = \"recommendation_pipeline.pkl\"\n", "with open(model_path, mode=\"bw\") as f:\n", " pickle.dump(recommendation_pipeline, f)\n", "print(\"Model components saved successfully!\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Push to Hugging face" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1c9a071d0a244c4a8e8fe7403a96295c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='
16\u001b[0m recommended_jobs \u001b[38;5;241m=\u001b[39m \u001b[43mrecommendation_pipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtransform\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_data\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 18\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRecommended Jobs based on input skills and major:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 19\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m rec \u001b[38;5;129;01min\u001b[39;00m recommended_jobs:\n", "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\sklearn\\pipeline.py:658\u001b[0m, in \u001b[0;36mPipeline.transform\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m 656\u001b[0m Xt \u001b[38;5;241m=\u001b[39m X\n\u001b[0;32m 657\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _, _, transform \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_iter():\n\u001b[1;32m--> 658\u001b[0m Xt \u001b[38;5;241m=\u001b[39m \u001b[43mtransform\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtransform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mXt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 659\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Xt\n", "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\sklearn\\utils\\_set_output.py:140\u001b[0m, in \u001b[0;36m_wrap_method_output..wrapped\u001b[1;34m(self, X, *args, **kwargs)\u001b[0m\n\u001b[0;32m 138\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(f)\n\u001b[0;32m 139\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapped\u001b[39m(\u001b[38;5;28mself\u001b[39m, X, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 140\u001b[0m data_to_wrap \u001b[38;5;241m=\u001b[39m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 141\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_to_wrap, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[0;32m 142\u001b[0m \u001b[38;5;66;03m# only wrap the first output for cross decomposition\u001b[39;00m\n\u001b[0;32m 143\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (\n\u001b[0;32m 144\u001b[0m _wrap_data_with_container(method, data_to_wrap[\u001b[38;5;241m0\u001b[39m], X, \u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m 145\u001b[0m \u001b[38;5;241m*\u001b[39mdata_to_wrap[\u001b[38;5;241m1\u001b[39m:],\n\u001b[0;32m 146\u001b[0m )\n", "Cell \u001b[1;32mIn[27], line 13\u001b[0m, in \u001b[0;36mJobRecommender.transform\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtransform\u001b[39m(\u001b[38;5;28mself\u001b[39m, X):\n\u001b[1;32m---> 13\u001b[0m input_hard_skills_vec \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtfidf_vectorizer_skills\u001b[38;5;241m.\u001b[39mtransform(\u001b[43mX\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mfinal_hard_skill\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m)\n\u001b[0;32m 14\u001b[0m input_soft_skills_vec \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtfidf_vectorizer_skills\u001b[38;5;241m.\u001b[39mtransform(X[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfinal_soft_skill\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 15\u001b[0m input_major_vec \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtfidf_vectorizer_majors\u001b[38;5;241m.\u001b[39mtransform(X[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcandidate_field\u001b[39m\u001b[38;5;124m'\u001b[39m])\n", "\u001b[1;31mTypeError\u001b[0m: tuple indices must be integers or slices, not str" ] } ], "source": [ "import pickle\n", "import pandas as pd\n", "\n", "# Load the model (pipeline)\n", "with open('recommendation_pipeline.pkl', 'rb') as file:\n", " recommendation_pipeline = pickle.load(file)\n", "\n", "# Example input data, converting list to string\n", "input_data = pd.DataFrame({\n", " 'final_hard_skill': [\"Python, Java, Finance, Excel\"],\n", " 'final_soft_skill': [\"Communication, Teamwork\"],\n", " 'candidate_field': [\"Data Science\"]\n", "})\n", "\n", "# Make recommendations\n", "recommended_jobs = recommendation_pipeline.transform(input_data)\n", "\n", "print(\"Recommended Jobs based on input skills and major:\")\n", "for rec in recommended_jobs:\n", " print(rec)\n" ] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" } }, "nbformat": 4, "nbformat_minor": 0 }