{"cells":[{"cell_type":"markdown","metadata":{},"source":["The main goal of this notebook is to scrape Data Scientist job vacancy from [Jobstreet Malaysia](https://www.jobstreet.com.my/)(a job searching website).\n","\n","We'll be using the combination API and manual HTML scraping technique from the Jobstreet for our webscraping."]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2024-04-07T09:21:59.667260Z","iopub.status.busy":"2024-04-07T09:21:59.666761Z","iopub.status.idle":"2024-04-07T09:22:00.422635Z","shell.execute_reply":"2024-04-07T09:22:00.421244Z","shell.execute_reply.started":"2024-04-07T09:21:59.667217Z"},"tags":[],"trusted":true},"outputs":[],"source":["import os\n","import requests\n","from bs4 import BeautifulSoup\n","from resume_worth.utils.utils import get_params\n","import pandas as pd\n","pd.set_option('display.max_colwidth', 200)"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[{"data":{"text/plain":["'/Users/luisrodrigues/Documents/Projects/PERSONAL/resume-worth'"]},"execution_count":2,"metadata":{},"output_type":"execute_result"}],"source":["# Change the current working directory to the pachage root\n","# That's step is due to the way settings.py is defined\n","ROOT_DIR = os.path.join(*os.path.split(os.getcwd())[:-1])\n","os.chdir(ROOT_DIR)\n","os.getcwd()"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["max_pages = 15\n","# api url taken from Network -> Header \n","api_url = 'https://www.jobstreet.com.my/api/chalice-search/v4/search?siteKey=MY-Main&sourcesystem=houston&userqueryid=2b00edd417ec163434fca9421e24c97a-7428735&userid=e56c7e89-1d1a-42b9-b7a5-37c12653d6b9&usersessionid=e56c7e89-1d1a-42b9-b7a5-37c12653d6b9&eventCaptureSessionId=e56c7e89-1d1a-42b9-b7a5-37c12653d6b9&seekSelectAllPages=true&keywords=data+scientist&pageSize=99&include=seodata&locale=en-MY&solId=568d2fe8-e8ef-4998-8e24-3e1ccfb1348b'\n","\n","data_dir = os.path.join(\"data\", \"01_raw\")\n","ingestion_date = pd.to_datetime('today').strftime('%Y-%m-%d')\n","data_file = f\"data_jobs_scraped_from_jobstreet_{ingestion_date}.csv\""]},{"cell_type":"markdown","metadata":{},"source":["## Scrape available job articles"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2024-04-07T09:22:00.425276Z","iopub.status.busy":"2024-04-07T09:22:00.424497Z","iopub.status.idle":"2024-04-07T09:22:01.980509Z","shell.execute_reply":"2024-04-07T09:22:01.979252Z","shell.execute_reply.started":"2024-04-07T09:22:00.425240Z"},"tags":[],"trusted":true},"outputs":[],"source":["def scrape_article_ids(api_url, max_pages):\n","    job_id =[]\n","    titles = []\n","    companies = []\n","    locations = []\n","    categorys= []\n","    subCategorys= []\n","    job_types=[]\n","    salarys=[]\n","    for page_number in range(1, max_pages + 1):\n","        page_url = f'{api_url}&page={page_number}'\n","        \n","        # Send an HTTP request to the API endpoint\n","        response = requests.get(page_url)\n","        if response.status_code == 200:\n","            # Parse the JSON response\n","            data = response.json()\n","\n","            # Extract advertiser IDs from each item in the 'data' list\n","            for item in data['data']:\n","                #print(item)\n","                jid = item['id']\n","                title = item['title']\n","                company = item['advertiser'].get('description', '')\n","                location = item.get('location', '')\n","                category = item['classification'].get('description', '')\n","                subCategory= item['subClassification'].get('description', '')\n","                job_type = item.get('workType', '')\n","                salary = item.get('salary', '')\n","\n","                job_id.append(jid)\n","                titles.append(title)\n","                companies.append(company)\n","                locations.append(location)\n","                categorys.append(category)\n","                subCategorys.append(subCategory)\n","                job_types.append(job_type)\n","                salarys.append(salary)\n","                #print(f\"Job ID: {job_id}\")\n","\n","        else:\n","            print(f\"Failed to retrieve data from the API. Status Code: {response.status_code}\")\n","            break\n","\n","    return job_id, titles, companies, locations, categorys, subCategorys,job_types,salarys"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["job_ids, titles, companies, locations, categorys, sub_categorys, job_types, salarys = scrape_article_ids(api_url, max_pages)\n","\n","jobs = []\n","for idx, job_id in enumerate(job_ids):\n","    job_info = {\n","        'job_id': job_ids[idx], \n","        'job_title': titles[idx], \n","        'company': companies[idx], \n","        'location': locations[idx], \n","        'category': categorys[idx], \n","        'sub_category': sub_categorys[idx], \n","        'job_type': job_types[idx], \n","        'salary': salarys[idx]\n","    }\n","    jobs.append(job_info)"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-04-07T09:24:27.229473Z","iopub.status.busy":"2024-04-07T09:24:27.228315Z","iopub.status.idle":"2024-04-07T09:24:27.246251Z","shell.execute_reply":"2024-04-07T09:24:27.244966Z","shell.execute_reply.started":"2024-04-07T09:24:27.229409Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Number of retrieved jobs: 1515\n"]}],"source":["print(f\"Number of retrieved jobs: {len(jobs)}\")"]},{"cell_type":"markdown","metadata":{},"source":["# Filter retrieved jobs by job title and salary availability"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[],"source":["params = get_params()\n","target_job_titles = params['job_titles']"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"data":{"text/plain":["['Data Engineer',\n"," 'Data Scientist',\n"," 'Data Analyst',\n"," 'Machine Learning Engineer']"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["target_job_titles"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[],"source":["target_jobs = []\n","for job in jobs:\n","    if len(job['salary']) > 0:\n","        for target_job_title in target_job_titles:\n","            if target_job_title.lower() in job['job_title'].lower():\n","                target_jobs.append(job)\n","                break"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Number of filtered jobs: 31\n"]}],"source":["print(f\"Number of filtered jobs: {len(target_jobs)}\")"]},{"cell_type":"markdown","metadata":{},"source":["## Fetch job descriptions from individual job link"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2024-04-07T09:22:07.307243Z","iopub.status.busy":"2024-04-07T09:22:07.306836Z","iopub.status.idle":"2024-04-07T09:22:07.319745Z","shell.execute_reply":"2024-04-07T09:22:07.318194Z","shell.execute_reply.started":"2024-04-07T09:22:07.307211Z"},"tags":[],"trusted":true},"outputs":[],"source":["def fetch_job_article(job_id):\n","    article_url = f'https://www.jobstreet.com.my/job/{job_id}'\n","    response = requests.get(article_url)\n","    if response.status_code == 200:\n","        return response.text\n","    else:\n","        print(f\"Failed to retrieve job article. Status Code: {response.status_code}\")\n","        return None\n","\n","def extract_text_from_ul(html_content):\n","    soup = BeautifulSoup(html_content, 'html.parser')\n","    ul_tags = soup.find_all('ul')\n","    text_list = [ul.get_text(separator='\\n') for ul in ul_tags]\n","    return '\\n'.join(text_list)\n","\n","def scrape_and_store_text(job_id):\n","    job_article_content = fetch_job_article(job_id)\n","        \n","    if job_article_content:\n","        text_from_ul = extract_text_from_ul(job_article_content)\n","        return text_from_ul\n","    \n","    return ''"]},{"cell_type":"code","execution_count":12,"metadata":{"execution":{"iopub.execute_input":"2024-04-07T09:22:09.177829Z","iopub.status.busy":"2024-04-07T09:22:09.177433Z","iopub.status.idle":"2024-04-07T09:24:23.150504Z","shell.execute_reply":"2024-04-07T09:24:23.148399Z","shell.execute_reply.started":"2024-04-07T09:22:09.177798Z"},"tags":[],"trusted":true},"outputs":[],"source":["completed_jobs = []\n","for job in target_jobs:\n","    description = scrape_and_store_text(job['job_id'])\n","    if len(description) > 0:\n","        job['description'] = description\n","        completed_jobs.append(job)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Number of completly retrieved jobs: 29\n"]}],"source":["print(f\"Number of completly retrieved jobs: {len(completed_jobs)}\")"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[],"source":["result_df = pd.DataFrame(completed_jobs)"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>job_id</th>\n","      <th>job_title</th>\n","      <th>company</th>\n","      <th>location</th>\n","      <th>category</th>\n","      <th>sub_category</th>\n","      <th>job_type</th>\n","      <th>salary</th>\n","      <th>description</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>74727704</td>\n","      <td>Machine Learning Engineer (Artificial Intelligence)</td>\n","      <td>Private Advertiser</td>\n","      <td>Kuala Lumpur</td>\n","      <td>Science &amp; Technology</td>\n","      <td>Mathematics, Statistics &amp; Information Sciences</td>\n","      <td>Full time</td>\n","      <td>RM 4,000 – RM 6,000 per month</td>\n","      <td>Design, develop, and deploy machine learning models and algorithms for complex and unique datasets, using various techniques such as mathematical modeling, scikit-learn, NLP, CNN, RNN, DL, RL, Tra...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>74850440</td>\n","      <td>Data Scientist</td>\n","      <td>PHENOMENAL UAM Sdn Bhd</td>\n","      <td>Kuala Lumpur</td>\n","      <td>Science &amp; Technology</td>\n","      <td>Mathematics, Statistics &amp; Information Sciences</td>\n","      <td>Full time</td>\n","      <td>RM 5,000 – RM 7,500 per month</td>\n","      <td>Utilizing machine learning techniques to analyze and interpret complex data sets.\\nDeveloping AI models and algorithms to solve specific business problems and improve operational efficiency.\\nColl...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>74946043</td>\n","      <td>Data Engineer Lead</td>\n","      <td>KK Group Of Companies</td>\n","      <td>Bandar Tasik Selatan</td>\n","      <td>Information &amp; Communication Technology</td>\n","      <td>Engineering - Software</td>\n","      <td>Full time</td>\n","      <td>RM 7,000 – RM 8,000 per month</td>\n","      <td>Making large and/or complex data more accessible, understandable and usable\\nTransforming, improving and integrating data, depending on the business requirements\\nDelivering the data in a useful a...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["     job_id                                            job_title  \\\n","0  74727704  Machine Learning Engineer (Artificial Intelligence)   \n","1  74850440                                       Data Scientist   \n","2  74946043                                   Data Engineer Lead   \n","\n","                  company              location  \\\n","0      Private Advertiser          Kuala Lumpur   \n","1  PHENOMENAL UAM Sdn Bhd          Kuala Lumpur   \n","2   KK Group Of Companies  Bandar Tasik Selatan   \n","\n","                                 category  \\\n","0                    Science & Technology   \n","1                    Science & Technology   \n","2  Information & Communication Technology   \n","\n","                                     sub_category   job_type  \\\n","0  Mathematics, Statistics & Information Sciences  Full time   \n","1  Mathematics, Statistics & Information Sciences  Full time   \n","2                          Engineering - Software  Full time   \n","\n","                          salary  \\\n","0  RM 4,000 – RM 6,000 per month   \n","1  RM 5,000 – RM 7,500 per month   \n","2  RM 7,000 – RM 8,000 per month   \n","\n","                                                                                                                                                                                               description  \n","0  Design, develop, and deploy machine learning models and algorithms for complex and unique datasets, using various techniques such as mathematical modeling, scikit-learn, NLP, CNN, RNN, DL, RL, Tra...  \n","1  Utilizing machine learning techniques to analyze and interpret complex data sets.\\nDeveloping AI models and algorithms to solve specific business problems and improve operational efficiency.\\nColl...  \n","2  Making large and/or complex data more accessible, understandable and usable\\nTransforming, improving and integrating data, depending on the business requirements\\nDelivering the data in a useful a...  "]},"execution_count":15,"metadata":{},"output_type":"execute_result"}],"source":["result_df.head(3)"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>job_title</th>\n","      <th>job_id</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>1</th>\n","      <td>Data Analyst</td>\n","      <td>5</td>\n","    </tr>\n","    <tr>\n","      <th>11</th>\n","      <td>Data Scientist</td>\n","      <td>3</td>\n","    </tr>\n","    <tr>\n","      <th>7</th>\n","      <td>Data Engineer</td>\n","      <td>2</td>\n","    </tr>\n","    <tr>\n","      <th>13</th>\n","      <td>Junior Data Analyst</td>\n","      <td>2</td>\n","    </tr>\n","    <tr>\n","      <th>0</th>\n","      <td>Big Data Engineer - Cloud</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>12</th>\n","      <td>Data Scientist Assistant Manager</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>19</th>\n","      <td>Senior Data Analyst</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>18</th>\n","      <td>Senior Business Intelligence Developer / Data Engineer</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>17</th>\n","      <td>Risk Operation Data Analyst ( Online Payment)</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>16</th>\n","      <td>Principal Data Engineer</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>15</th>\n","      <td>Manager - Data Engineer</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>14</th>\n","      <td>Machine Learning Engineer (Artificial Intelligence)</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>10</th>\n","      <td>Data Engineer/Data Cloud Engineer/Salesforce</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>9</th>\n","      <td>Data Engineer Lead</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>8</th>\n","      <td>Data Engineer (Python)</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>6</th>\n","      <td>Data Analyst/Scientist</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>5</th>\n","      <td>Data Analyst Internship</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>Data Analyst - Internal Audit</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>Data Analyst (Intern)</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>Data Analyst (E-Commerce)</td>\n","      <td>1</td>\n","    </tr>\n","    <tr>\n","      <th>20</th>\n","      <td>Senior Data Scientist (Full Stack)</td>\n","      <td>1</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["                                                 job_title  job_id\n","1                                             Data Analyst       5\n","11                                          Data Scientist       3\n","7                                            Data Engineer       2\n","13                                     Junior Data Analyst       2\n","0                                Big Data Engineer - Cloud       1\n","12                        Data Scientist Assistant Manager       1\n","19                                     Senior Data Analyst       1\n","18  Senior Business Intelligence Developer / Data Engineer       1\n","17           Risk Operation Data Analyst ( Online Payment)       1\n","16                                 Principal Data Engineer       1\n","15                                 Manager - Data Engineer       1\n","14     Machine Learning Engineer (Artificial Intelligence)       1\n","10            Data Engineer/Data Cloud Engineer/Salesforce       1\n","9                                       Data Engineer Lead       1\n","8                                   Data Engineer (Python)       1\n","6                                   Data Analyst/Scientist       1\n","5                                  Data Analyst Internship       1\n","4                            Data Analyst - Internal Audit       1\n","3                                    Data Analyst (Intern)       1\n","2                                Data Analyst (E-Commerce)       1\n","20                      Senior Data Scientist (Full Stack)       1"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["result_df.groupby('job_title')[['job_id']].count().reset_index().sort_values(by=\"job_id\",ascending=False)"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[],"source":["data_path = os.path.join(data_dir, data_file)\n","\n","data_df = result_df.to_csv(data_path, sep=\",\")"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"datasetId":4284628,"sourceId":7654855,"sourceType":"datasetVersion"}],"dockerImageVersionId":30626,"isGpuEnabled":false,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.8"}},"nbformat":4,"nbformat_minor":4}