{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "The main purpose of this notebook is to parse the file with the scraped jobs into a json file per job." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "from dotenv import load_dotenv\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'/Users/luisrodrigues/Documents/Projects/PERSONAL/resume-worth'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Change the current working directory to the pachage root\n", "# That's step is due to the way settings.py is defined\n", "ROOT_DIR = os.path.join(*os.path.split(os.getcwd())[:-1])\n", "os.chdir(ROOT_DIR)\n", "os.getcwd()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "conf_dir = os.path.join(\"conf\")\n", "conf_file = \".env\"\n", "\n", "data_dir = os.path.join(\"data\", \"01_raw\")\n", "\n", "res_dir = os.path.join(\"data\", \"02_processed\")\n", "\n", "res_metadata_dir = os.path.join(\"data\", \"02_processed\", \"metadata\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Load OpenAI API Key\n", "# https://openai.com/pricing\n", "\n", "conf_path = os.path.join(conf_dir, conf_file)\n", "\n", "_ = load_dotenv(conf_path)\n", "\n", "#API_KEY = os.getenv('API_KEY')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Get list of file names in ingestion folder\n", "file_names = [file for file in os.listdir(data_dir) if \".csv\" in file]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['data_scientist_jobstreet_scraped_v2 2.csv',\n", " 'data_jobs_scraped_from_jobstreet_2024-04-07.csv']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" 
} ], "source": [ "file_names" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "columns = ['job_id', 'job_title', 'company', 'location', 'salary', 'description']\n", "\n", "for file_name in file_names:\n", " data_path = os.path.join(data_dir, file_name)\n", " if 'data_df' not in locals():\n", " data_df = pd.read_csv(data_path, sep=\",\")[columns]\n", " else:\n", " data_file = pd.read_csv(data_path, sep=\",\")[columns]\n", " data_df = pd.concat([data_df, data_file])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 635 entries, 0 to 28\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 job_id 635 non-null int64 \n", " 1 job_title 635 non-null object\n", " 2 company 635 non-null object\n", " 3 location 635 non-null object\n", " 4 salary 227 non-null object\n", " 5 description 617 non-null object\n", "dtypes: int64(1), object(5)\n", "memory usage: 34.7+ KB\n" ] } ], "source": [ "data_df.info()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
job_idjob_titlecompanylocationsalarydescription
16772376611Head of Advanced Data AnalyticsAgensi Pekerjaan Hays (Malaysia) Sdn BhdKuala LumpurRM 20,000 – RM 30,000 per monthBeing the point-of-contact for advanced analyt...
10172761527Data EngineerANHSIN TECHNOLOGY SDN BHDKuala LumpurNaNDesign, develop, and maintain scalable and rob...
37672531370ERP Analyst / Application AnalystAmazon Papyrus (M) Sdn BhdKuala LumpurNaNBased in Malaysia, Work from home arrangement\\...
43072683703Data EngineerDATABRICKS TECH SDN. BHD.Bukit JalilNaNMaintain, backup, secure and transform data fr...
16172403257Data EngineerKINESSO MALAYSIA SDN. BHD. (fka Mediabrands Gl...PetalingNaNPerform daily, weekly and monthly monitoring o...
\n", "
" ], "text/plain": [ " job_id job_title \\\n", "167 72376611 Head of Advanced Data Analytics \n", "101 72761527 Data Engineer \n", "376 72531370 ERP Analyst / Application Analyst \n", "430 72683703 Data Engineer \n", "161 72403257 Data Engineer \n", "\n", " company location \\\n", "167 Agensi Pekerjaan Hays (Malaysia) Sdn Bhd Kuala Lumpur \n", "101 ANHSIN TECHNOLOGY SDN BHD Kuala Lumpur \n", "376 Amazon Papyrus (M) Sdn Bhd Kuala Lumpur \n", "430 DATABRICKS TECH SDN. BHD. Bukit Jalil \n", "161 KINESSO MALAYSIA SDN. BHD. (fka Mediabrands Gl... Petaling \n", "\n", " salary \\\n", "167 RM 20,000 – RM 30,000 per month \n", "101 NaN \n", "376 NaN \n", "430 NaN \n", "161 NaN \n", "\n", " description \n", "167 Being the point-of-contact for advanced analyt... \n", "101 Design, develop, and maintain scalable and rob... \n", "376 Based in Malaysia, Work from home arrangement\\... \n", "430 Maintain, backup, secure and transform data fr... \n", "161 Perform daily, weekly and monthly monitoring o... " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(data_df.sample(n=5))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "data_clean_df = data_df[['job_id', 'location', 'company', 'job_title', 'salary', 'description']].dropna().drop_duplicates().copy()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "61" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(data_clean_df)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
job_titlejob_id
5Data Analyst10
13Data Engineer6
26Junior Data Analyst3
17Data Scientist3
28Machine Learning Engineer (Artificial Intellig...2
0BI Developer - Immediate Hiring1
31MicroStrategy Developers (MNC)1
25Intern - Data Science1
27Machine Learning Engineer (AI)1
29Manager - Data Engineer1
30Manager, RPA Developer1
32Principal Data Engineer1
23Head of Advanced Data Analytics1
33Principal Engineer, Automation Project1
34RPA Manager (UiPath)1
35Risk Operation Data Analyst ( Online Payment)1
36Scientist (Bioinformatics)1
37Senior Business Intelligence Developer / Data ...1
38Senior Data Analyst1
39Senior Data Scientist (Full Stack)1
40System Analyst1
24IT Business Analyst1
21E-commerce Data Analyst1
22ERP System Analyst1
10Data Analyst Internship1
2Big Data Engineer - Cloud1
3Contract Business Administration Analyst1
4DATA ANALYST1
6Data Analyst (E-Commerce)1
7Data Analyst (Intern)1
8Data Analyst - Internal Audit1
9Data Analyst Executive1
11Data Analyst/Scientist1
1BUSINESS SYSTEM ANALYST1
12Data Analysts/ Data Engineers (Bangsar South- ...1
14Data Engineer (Python)1
15Data Engineer Lead1
16Data Engineer/Data Cloud Engineer/Salesforce1
18Data Scientist Assistant Manager1
19Database Administrator1
20Database Administrator (DBA)1
41System Analyst - IT1
\n", "
" ], "text/plain": [ " job_title job_id\n", "5 Data Analyst 10\n", "13 Data Engineer 6\n", "26 Junior Data Analyst 3\n", "17 Data Scientist 3\n", "28 Machine Learning Engineer (Artificial Intellig... 2\n", "0 BI Developer - Immediate Hiring 1\n", "31 MicroStrategy Developers (MNC) 1\n", "25 Intern - Data Science 1\n", "27 Machine Learning Engineer (AI) 1\n", "29 Manager - Data Engineer 1\n", "30 Manager, RPA Developer 1\n", "32 Principal Data Engineer 1\n", "23 Head of Advanced Data Analytics 1\n", "33 Principal Engineer, Automation Project 1\n", "34 RPA Manager (UiPath) 1\n", "35 Risk Operation Data Analyst ( Online Payment) 1\n", "36 Scientist (Bioinformatics) 1\n", "37 Senior Business Intelligence Developer / Data ... 1\n", "38 Senior Data Analyst 1\n", "39 Senior Data Scientist (Full Stack) 1\n", "40 System Analyst 1\n", "24 IT Business Analyst 1\n", "21 E-commerce Data Analyst 1\n", "22 ERP System Analyst 1\n", "10 Data Analyst Internship 1\n", "2 Big Data Engineer - Cloud 1\n", "3 Contract Business Administration Analyst 1\n", "4 DATA ANALYST 1\n", "6 Data Analyst (E-Commerce) 1\n", "7 Data Analyst (Intern) 1\n", "8 Data Analyst - Internal Audit 1\n", "9 Data Analyst Executive 1\n", "11 Data Analyst/Scientist 1\n", "1 BUSINESS SYSTEM ANALYST 1\n", "12 Data Analysts/ Data Engineers (Bangsar South- ... 
1\n", "14 Data Engineer (Python) 1\n", "15 Data Engineer Lead 1\n", "16 Data Engineer/Data Cloud Engineer/Salesforce 1\n", "18 Data Scientist Assistant Manager 1\n", "19 Database Administrator 1\n", "20 Database Administrator (DBA) 1\n", "41 System Analyst - IT 1" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_clean_df.groupby('job_title')[['job_id']].count().reset_index().sort_values(by=\"job_id\",ascending=False)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "for index, row in data_clean_df.iterrows():\n", "\n", " job_vacancy = {\n", " \"id\": row['job_id'],\n", " \"location\": row['location'].lower(),\n", " \"company\": row['company'].lower(),\n", " \"job_title\": row['job_title'].lower(),\n", " \"salary\": row['salary'],\n", " \"description\": row['description'],\n", " }\n", "\n", " res_file = \"{job_id}.json\".format(job_id=row['job_id'])\n", " res_path = os.path.join(res_dir, res_file)\n", "\n", " with open(res_path, \"w\") as f:\n", " json.dump(job_vacancy, f)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'data engineer': ['senior business intelligence developer / data engineer',\n", " 'data engineer (python)',\n", " 'big data engineer - cloud',\n", " 'principal data engineer',\n", " 'data engineer lead',\n", " 'data engineer',\n", " 'data engineer/data cloud engineer/salesforce',\n", " 'manager - data engineer',\n", " 'data analysts/ data engineers (bangsar south- kerinchi lrt)'],\n", " 'data scientist': ['senior data scientist (full stack)',\n", " 'data scientist',\n", " 'data scientist assistant manager'],\n", " 'data analyst': ['e-commerce data analyst',\n", " 'risk operation data analyst ( online payment)',\n", " 'data analyst/scientist',\n", " 'data analyst (e-commerce)',\n", " 'senior data analyst',\n", " 'data analyst (intern)',\n", " 'junior data analyst',\n", " 'data analyst',\n", " 
'data analyst - internal audit',\n", " 'data analyst internship',\n", " 'data analyst executive',\n", " 'data analysts/ data engineers (bangsar south- kerinchi lrt)'],\n", " 'machine learning engineer': ['machine learning engineer (artificial intelligence)',\n", " 'machine learning engineer (ai)']}" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "JOB_TITLES = [\"data engineer\", \"data scientist\", \"data analyst\", \"machine learning engineer\"]\n", "\n", "scraped_job_titles = data_clean_df['job_title'].values\n", "\n", "job_vacancy_metadata = {}\n", "for job_title in JOB_TITLES:\n", " related_job_vacancies = set([job.lower() for job in scraped_job_titles if job_title.lower() in job.lower()])\n", " job_vacancy_metadata[job_title] = list(related_job_vacancies)\n", "\n", "job_vacancy_metadata" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Create the metadata folder (including parents) if it does not exist yet.\n", "# Replaces a bare try/except pass around os.mkdir, which silently swallowed\n", "# every error, not just 'directory already exists'.\n", "os.makedirs(res_metadata_dir, exist_ok=True)\n", "\n", "res_file = \"job_vacancy_metadata.json\"\n", "res_path = os.path.join(res_metadata_dir, res_file)\n", "\n", "with open(res_path, \"w\") as f:\n", " json.dump(job_vacancy_metadata, f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "resume-worth", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.8" } }, "nbformat": 4, "nbformat_minor": 2 }