The main goal of this notebook is to scrape Data Scientist job vacancies from [Jobstreet Malaysia](https://www.jobstreet.com.my/) (a job search website).

We'll combine two techniques: Jobstreet's search API to list the vacancies, and manual HTML scraping of each job page to get its description.

In [1]:
import os
import requests
from bs4 import BeautifulSoup
from resume_worth.utils.utils import get_params
import pandas as pd
pd.set_option('display.max_colwidth', 200)

In [2]:
# Move the working directory one level up, to the package root.
# This step is required because of the way settings.py is defined.
# NOTE: this cell is not idempotent - every re-run climbs one more directory level.
ROOT_DIR = os.path.dirname(os.getcwd())
os.chdir(ROOT_DIR)
os.getcwd()

'/Users/luisrodrigues/Documents/Projects/PERSONAL/resume-worth'

In [3]:
# Highest API result page number that will be requested.
max_pages = 15
# Search API URL, copied from the browser dev tools (Network -> Headers).
# The query string pins keywords=data+scientist and pageSize=99; scrape_article_ids
# appends "&page=<n>" to it for every request.
# NOTE(review): the URL embeds user/session identifiers captured from one browser
# session - confirm the endpoint keeps answering with them, and whether they should be committed.
api_url = 'https://www.jobstreet.com.my/api/chalice-search/v4/search?siteKey=MY-Main&sourcesystem=houston&userqueryid=2b00edd417ec163434fca9421e24c97a-7428735&userid=e56c7e89-1d1a-42b9-b7a5-37c12653d6b9&usersessionid=e56c7e89-1d1a-42b9-b7a5-37c12653d6b9&eventCaptureSessionId=e56c7e89-1d1a-42b9-b7a5-37c12653d6b9&seekSelectAllPages=true&keywords=data+scientist&pageSize=99&include=seodata&locale=en-MY&solId=568d2fe8-e8ef-4998-8e24-3e1ccfb1348b'

# Output folder (relative to the current working directory) and a date-stamped file name.
data_dir = os.path.join("data", "01_raw")
ingestion_date = pd.to_datetime('today').strftime('%Y-%m-%d')  # formatted as YYYY-MM-DD
data_file = f"data_jobs_scraped_from_jobstreet_{ingestion_date}.csv"

## Scrape available job articles

In [4]:
def _nested_description(item, key):
    """Return ``item[key]['description']``, or ``''`` when the nested object is missing or null."""
    return (item.get(key) or {}).get('description', '')


def scrape_article_ids(api_url, max_pages, timeout=30):
    """Collect basic job attributes from the paginated Jobstreet search API.

    Pages ``1..max_pages`` are requested by appending ``&page=<n>`` to ``api_url``.
    Scraping stops early (keeping what was collected so far) when a request fails,
    a non-200 status is returned, or a page contains no items.

    Args:
        api_url: Search endpoint URL including its query string, without a ``page`` parameter.
        max_pages: Highest page number to request.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A tuple of eight parallel lists, in this order: job ids, titles, companies,
        locations, categories, sub-categories, job types and salaries. Optional
        fields that are absent from an item are returned as ``''``.

    Raises:
        KeyError: If an item lacks the mandatory ``id`` or ``title`` field.
    """
    rows = []
    for page_number in range(1, max_pages + 1):
        page_url = f'{api_url}&page={page_number}'

        # A timeout keeps a stalled connection from hanging the notebook forever
        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.exceptions.RequestException as error:
            print(f"Failed to retrieve data from the API. Error: {error}")
            break

        if response.status_code != 200:
            print(f"Failed to retrieve data from the API. Status Code: {response.status_code}")
            break

        items = response.json().get('data') or []
        if not items:
            # An empty page means the results are exhausted; no need to keep paging
            break

        for item in items:
            rows.append((
                item['id'],
                item['title'],
                _nested_description(item, 'advertiser'),
                item.get('location', ''),
                _nested_description(item, 'classification'),
                _nested_description(item, 'subClassification'),
                item.get('workType', ''),
                item.get('salary', ''),
            ))

    if not rows:
        return [], [], [], [], [], [], [], []

    # Transpose the per-job rows into the eight parallel lists callers unpack
    job_id, titles, companies, locations, categorys, subCategorys, job_types, salarys = (
        list(column) for column in zip(*rows)
    )
    return job_id, titles, companies, locations, categorys, subCategorys, job_types, salarys

In [5]:
job_ids, titles, companies, locations, categorys, sub_categorys, job_types, salarys = scrape_article_ids(api_url, max_pages)

# Zip the parallel lists returned by the scraper into one record (dict) per job
job_fields = ('job_id', 'job_title', 'company', 'location', 'category', 'sub_category', 'job_type', 'salary')
jobs = [
    dict(zip(job_fields, job_values))
    for job_values in zip(job_ids, titles, companies, locations, categorys, sub_categorys, job_types, salarys)
]

In [6]:
print(f"Number of retrieved jobs: {len(jobs)}")

Number of retrieved jobs: 1515


## Filter retrieved jobs by job title and salary availability

In [7]:
params = get_params()
target_job_titles = params['job_titles']

In [8]:
target_job_titles

['Data Engineer',
 'Data Scientist',
 'Data Analyst',
 'Machine Learning Engineer']

In [9]:
# Keep a job only if it advertises a salary and its title contains
# (case-insensitively) at least one of the target job titles
target_jobs = [
    job
    for job in jobs
    if len(job['salary']) > 0
    and any(wanted_title.lower() in job['job_title'].lower() for wanted_title in target_job_titles)
]

In [10]:
print(f"Number of filtered jobs: {len(target_jobs)}")

Number of filtered jobs: 31


## Fetch job descriptions from individual job link

In [11]:
def fetch_job_article(job_id, timeout=30):
    """Download the HTML of a single Jobstreet job page.

    Args:
        job_id: Identifier of the job; it is interpolated into the job page URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The response body as text on HTTP 200. ``None`` when the request fails or
        any other status code is returned (a message is printed in both cases).
    """
    article_url = f'https://www.jobstreet.com.my/job/{job_id}'

    # One unreachable page should not abort a whole scraping loop: report it and
    # fall back to the same "None on failure" result used for bad status codes
    try:
        response = requests.get(article_url, timeout=timeout)
    except requests.exceptions.RequestException as error:
        print(f"Failed to retrieve job article. Error: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve job article. Status Code: {response.status_code}")
        return None

    return response.text

def extract_text_from_ul(html_content):
    """Return the text of every ``<ul>`` element in the HTML, newline-separated.

    The text nodes inside each list are joined with a newline, and the lists
    themselves are joined with a newline as well. An empty string is returned
    when the document has no ``<ul>`` element.
    """
    parsed_page = BeautifulSoup(html_content, 'html.parser')
    list_texts = []
    for ul_tag in parsed_page.find_all('ul'):
        list_texts.append(ul_tag.get_text(separator='\n'))
    return '\n'.join(list_texts)

def scrape_and_store_text(job_id):
    """Fetch a job page and return the text of its ``<ul>`` lists.

    Returns an empty string when the page could not be fetched or came back empty.
    Despite the name, nothing is stored here; the text is only returned.
    """
    page_html = fetch_job_article(job_id)
    if not page_html:
        return ''
    return extract_text_from_ul(page_html)

In [12]:
# Attach a description to every filtered job and drop the ones without any text.
# NOTE: the job dicts are updated in place, so the matching entries of
# target_jobs (and jobs) gain a 'description' key as well.
completed_jobs = []
for job in target_jobs:
    description = scrape_and_store_text(job['job_id'])
    if not description:
        continue
    job['description'] = description
    completed_jobs.append(job)

In [13]:
print(f"Number of completly retrieved jobs: {len(completed_jobs)}")

Number of completly retrieved jobs: 29


In [14]:
result_df = pd.DataFrame(completed_jobs)

In [15]:
result_df.head(3)

Unnamed: 0,job_id,job_title,company,location,category,sub_category,job_type,salary,description
0,74727704,Machine Learning Engineer (Artificial Intelligence),Private Advertiser,Kuala Lumpur,Science & Technology,"Mathematics, Statistics & Information Sciences",Full time,"RM 4,000 – RM 6,000 per month","Design, develop, and deploy machine learning models and algorithms for complex and unique datasets, using various techniques such as mathematical modeling, scikit-learn, NLP, CNN, RNN, DL, RL, Tra..."
1,74850440,Data Scientist,PHENOMENAL UAM Sdn Bhd,Kuala Lumpur,Science & Technology,"Mathematics, Statistics & Information Sciences",Full time,"RM 5,000 – RM 7,500 per month",Utilizing machine learning techniques to analyze and interpret complex data sets.\nDeveloping AI models and algorithms to solve specific business problems and improve operational efficiency.\nColl...
2,74946043,Data Engineer Lead,KK Group Of Companies,Bandar Tasik Selatan,Information & Communication Technology,Engineering - Software,Full time,"RM 7,000 – RM 8,000 per month","Making large and/or complex data more accessible, understandable and usable\nTransforming, improving and integrating data, depending on the business requirements\nDelivering the data in a useful a..."


In [16]:
# Number of postings per job title, most frequent first
(
    result_df
    .groupby('job_title')[['job_id']]
    .count()
    .reset_index()
    .sort_values(by="job_id", ascending=False)
)

Unnamed: 0,job_title,job_id
1,Data Analyst,5
11,Data Scientist,3
7,Data Engineer,2
13,Junior Data Analyst,2
0,Big Data Engineer - Cloud,1
12,Data Scientist Assistant Manager,1
19,Senior Data Analyst,1
18,Senior Business Intelligence Developer / Data Engineer,1
17,Risk Operation Data Analyst ( Online Payment),1
16,Principal Data Engineer,1


In [17]:
data_path = os.path.join(data_dir, data_file)

# Create the output folder if needed so the export also works on a fresh checkout
os.makedirs(data_dir, exist_ok=True)

# DataFrame.to_csv returns None when it is given a path, so its result is not
# bound to a name (the former `data_df` was always None).
# NOTE: the DataFrame index is written as the first, unnamed column.
result_df.to_csv(data_path, sep=",")