The main purpose of this notebook is to parse the file with the scraped jobs into a json file per job.

In [1]:
import os
import json
from dotenv import load_dotenv
import pandas as pd

In [2]:
# Change the current working directory to the pachage root
# That's step is due to the way settings.py is defined
ROOT_DIR = os.path.join(*os.path.split(os.getcwd())[:-1])
os.chdir(ROOT_DIR)
os.getcwd()

'/Users/luisrodrigues/Documents/Projects/PERSONAL/resume-worth'

In [3]:
conf_dir = os.path.join("conf")
conf_file = ".env"

data_dir = os.path.join("data", "01_raw")

res_dir = os.path.join("data", "02_processed")

res_metadata_dir = os.path.join("data", "02_processed", "metadata")

In [4]:
# Load OpenAI API Key
# https://openai.com/pricing

conf_path = os.path.join(conf_dir, conf_file)

_ = load_dotenv(conf_path)

#API_KEY = os.getenv('API_KEY')

In [5]:
# Get list of file names in ingestion folder
file_names = [file for file in os.listdir(data_dir) if ".csv" in file]

In [6]:
file_names

['data_scientist_jobstreet_scraped_v2 2.csv',
 'data_jobs_scraped_from_jobstreet_2024-04-07.csv']

In [16]:
columns = ['job_id', 'job_title', 'company', 'location', 'salary', 'description']

for file_name in file_names:
    data_path = os.path.join(data_dir, file_name)
    if 'data_df' not in locals():
        data_df = pd.read_csv(data_path, sep=",")[columns]
    else:
        data_file = pd.read_csv(data_path, sep=",")[columns]
        data_df = pd.concat([data_df, data_file])

In [8]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 635 entries, 0 to 28
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   job_id       635 non-null    int64 
 1   job_title    635 non-null    object
 2   company      635 non-null    object
 3   location     635 non-null    object
 4   salary       227 non-null    object
 5   description  617 non-null    object
dtypes: int64(1), object(5)
memory usage: 34.7+ KB


In [9]:
display(data_df.sample(n=5))

Unnamed: 0,job_id,job_title,company,location,salary,description
167,72376611,Head of Advanced Data Analytics,Agensi Pekerjaan Hays (Malaysia) Sdn Bhd,Kuala Lumpur,"RM 20,000 – RM 30,000 per month",Being the point-of-contact for advanced analyt...
101,72761527,Data Engineer,ANHSIN TECHNOLOGY SDN BHD,Kuala Lumpur,,"Design, develop, and maintain scalable and rob..."
376,72531370,ERP Analyst / Application Analyst,Amazon Papyrus (M) Sdn Bhd,Kuala Lumpur,,"Based in Malaysia, Work from home arrangement\..."
430,72683703,Data Engineer,DATABRICKS TECH SDN. BHD.,Bukit Jalil,,"Maintain, backup, secure and transform data fr..."
161,72403257,Data Engineer,KINESSO MALAYSIA SDN. BHD. (fka Mediabrands Gl...,Petaling,,"Perform daily, weekly and monthly monitoring o..."


In [10]:
data_clean_df = data_df[['job_id', 'location', 'company', 'job_title', 'salary', 'description']].dropna().drop_duplicates().copy()

In [11]:
len(data_clean_df)

61

In [12]:
data_clean_df.groupby('job_title')[['job_id']].count().reset_index().sort_values(by="job_id",ascending=False)

Unnamed: 0,job_title,job_id
5,Data Analyst,10
13,Data Engineer,6
26,Junior Data Analyst,3
17,Data Scientist,3
28,Machine Learning Engineer (Artificial Intellig...,2
0,BI Developer - Immediate Hiring,1
31,MicroStrategy Developers (MNC),1
25,Intern - Data Science,1
27,Machine Learning Engineer (AI),1
29,Manager - Data Engineer,1


In [13]:
for index, row in data_clean_df.iterrows():

    job_vacancy = {
        "id": row['job_id'],
        "location": row['location'].lower(),
        "company": row['company'].lower(),
        "job_title": row['job_title'].lower(),
        "salary": row['salary'],
        "description": row['description'],
    }

    res_file = "{job_id}.json".format(job_id=row['job_id'])
    res_path = os.path.join(res_dir, res_file)

    with open(res_path, "w") as f:
            json.dump(job_vacancy, f)

In [14]:
JOB_TITLES = ["data engineer", "data scientist", "data analyst", "machine learning engineer"]

scraped_job_titles = data_clean_df['job_title'].values

job_vacancy_metadata = {}
for job_title in JOB_TITLES:
    related_job_vacancies = set([job.lower() for job in scraped_job_titles if job_title.lower() in job.lower()])
    job_vacancy_metadata[job_title] = list(related_job_vacancies)

job_vacancy_metadata

{'data engineer': ['senior business intelligence developer / data engineer',
  'data engineer (python)',
  'big data engineer - cloud',
  'principal data engineer',
  'data engineer lead',
  'data engineer',
  'data engineer/data cloud engineer/salesforce',
  'manager - data engineer',
  'data analysts/ data engineers (bangsar south- kerinchi lrt)'],
 'data scientist': ['senior data scientist (full stack)',
  'data scientist',
  'data scientist assistant manager'],
 'data analyst': ['e-commerce data analyst',
  'risk operation data analyst ( online payment)',
  'data analyst/scientist',
  'data analyst (e-commerce)',
  'senior data analyst',
  'data analyst (intern)',
  'junior data analyst',
  'data analyst',
  'data analyst - internal audit',
  'data analyst internship',
  'data analyst executive',
  'data analysts/ data engineers (bangsar south- kerinchi lrt)'],
 'machine learning engineer': ['machine learning engineer (artificial intelligence)',
  'machine learning engineer (ai)']}

In [15]:
try:
    os.mkdir(res_metadata_dir)
except:
    pass

res_file = "job_vacancy_metadata.json"
res_path = os.path.join(res_metadata_dir, res_file)

with open(res_path, "w") as f:
        json.dump(job_vacancy_metadata, f)