{"cells":[{"cell_type":"markdown","metadata":{},"source":["The main goal of this notebook is to scrape Data Scientist job vacancies from [Jobstreet Malaysia](https://www.jobstreet.com.my/), a job search website.\n","\n","We'll use a combination of Jobstreet's search API and manual HTML scraping of the individual job pages for our web scraping."]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2024-04-07T09:21:59.667260Z","iopub.status.busy":"2024-04-07T09:21:59.666761Z","iopub.status.idle":"2024-04-07T09:22:00.422635Z","shell.execute_reply":"2024-04-07T09:22:00.421244Z","shell.execute_reply.started":"2024-04-07T09:21:59.667217Z"},"tags":[],"trusted":true},"outputs":[],"source":["import os\n","import requests\n","from bs4 import BeautifulSoup\n","from resume_worth.utils.utils import get_params\n","import pandas as pd\n","pd.set_option('display.max_colwidth', 200)"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[{"data":{"text/plain":["'/Users/luisrodrigues/Documents/Projects/PERSONAL/resume-worth'"]},"execution_count":2,"metadata":{},"output_type":"execute_result"}],"source":["# Change the current working directory to the package root\n","# This step is needed because of the way settings.py is defined\n","ROOT_DIR = os.path.join(*os.path.split(os.getcwd())[:-1])\n","os.chdir(ROOT_DIR)\n","os.getcwd()"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["max_pages = 15\n","# API URL taken from the browser dev tools (Network -> Headers)\n","api_url = 'https://www.jobstreet.com.my/api/chalice-search/v4/search?siteKey=MY-Main&sourcesystem=houston&userqueryid=2b00edd417ec163434fca9421e24c97a-7428735&userid=e56c7e89-1d1a-42b9-b7a5-37c12653d6b9&usersessionid=e56c7e89-1d1a-42b9-b7a5-37c12653d6b9&eventCaptureSessionId=e56c7e89-1d1a-42b9-b7a5-37c12653d6b9&seekSelectAllPages=true&keywords=data+scientist&pageSize=99&include=seodata&locale=en-MY&solId=568d2fe8-e8ef-4998-8e24-3e1ccfb1348b'\n","\n","data_dir = os.path.join(\"data\", \"01_raw\")\n","ingestion_date = pd.to_datetime('today').strftime('%Y-%m-%d')\n","data_file = f\"data_jobs_scraped_from_jobstreet_{ingestion_date}.csv\""]},{"cell_type":"markdown","metadata":{},"source":["## Scrape available job articles"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2024-04-07T09:22:00.425276Z","iopub.status.busy":"2024-04-07T09:22:00.424497Z","iopub.status.idle":"2024-04-07T09:22:01.980509Z","shell.execute_reply":"2024-04-07T09:22:01.979252Z","shell.execute_reply.started":"2024-04-07T09:22:00.425240Z"},"tags":[],"trusted":true},"outputs":[],"source":["def scrape_article_ids(api_url, max_pages):\n","    # Accumulators for the fields extracted from each job posting\n","    job_ids = []\n","    titles = []\n","    companies = []\n","    locations = []\n","    categories = []\n","    sub_categories = []\n","    job_types = []\n","    salaries = []\n","    for page_number in range(1, max_pages + 1):\n","        page_url = f'{api_url}&page={page_number}'\n","\n","        # Send an HTTP request to the API endpoint\n","        response = requests.get(page_url)\n","        if response.status_code == 200:\n","            # Parse the JSON response\n","            data = response.json()\n","\n","            # Extract the job fields from each item in the 'data' list\n","            for item in data['data']:\n","                jid = item['id']\n","                title = item['title']\n","                company = item['advertiser'].get('description', '')\n","                location = item.get('location', '')\n","                category = item['classification'].get('description', '')\n","                sub_category = item['subClassification'].get('description', '')\n","                job_type = item.get('workType', '')\n","                salary = item.get('salary', '')\n","\n","                job_ids.append(jid)\n","                titles.append(title)\n","                companies.append(company)\n","                locations.append(location)\n","                categories.append(category)\n","                sub_categories.append(sub_category)\n","                job_types.append(job_type)\n","                salaries.append(salary)\n","\n","        else:\n","            print(f\"Failed to retrieve data from the API. Status Code: {response.status_code}\")\n","            break\n","\n","    return job_ids, titles, companies, locations, categories, sub_categories, job_types, salaries"]},
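{"cell_type":"markdown","metadata":{},"source":["The paginated loop above fires its requests back to back and gives up on the first non-200 response. Below is a minimal, untested sketch of a more polite variant: the helper name `get_page_json` and the retry, backoff, and timeout values are illustrative choices, not part of Jobstreet's API."]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["import time\n","\n","import requests\n","\n","\n","def get_page_json(page_url, retries=3, backoff=2.0, timeout=10):\n","    \"\"\"Fetch one API page with a timeout and a simple retry/backoff.\n","\n","    Returns the parsed JSON dict, or None if every attempt fails.\n","    \"\"\"\n","    for attempt in range(1, retries + 1):\n","        try:\n","            response = requests.get(page_url, timeout=timeout)\n","            response.raise_for_status()\n","            return response.json()\n","        except requests.RequestException as exc:\n","            print(f\"Attempt {attempt} failed for {page_url}: {exc}\")\n","            time.sleep(backoff * attempt)  # linear backoff before retrying\n","    return None\n","\n","\n","# Hypothetical usage: fetch the first page, then pause before requesting the next one\n","# data = get_page_json(f'{api_url}&page=1')\n","# time.sleep(1)"]},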
{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["job_ids, titles, companies, locations, categories, sub_categories, job_types, salaries = scrape_article_ids(api_url, max_pages)\n","\n","# Collect the scraped fields into one dictionary per job\n","jobs = []\n","for idx, job_id in enumerate(job_ids):\n","    job_info = {\n","        'job_id': job_id,\n","        'job_title': titles[idx],\n","        'company': companies[idx],\n","        'location': locations[idx],\n","        'category': categories[idx],\n","        'sub_category': sub_categories[idx],\n","        'job_type': job_types[idx],\n","        'salary': salaries[idx]\n","    }\n","    jobs.append(job_info)"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-04-07T09:24:27.229473Z","iopub.status.busy":"2024-04-07T09:24:27.228315Z","iopub.status.idle":"2024-04-07T09:24:27.246251Z","shell.execute_reply":"2024-04-07T09:24:27.244966Z","shell.execute_reply.started":"2024-04-07T09:24:27.229409Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Number of retrieved jobs: 1515\n"]}],"source":["print(f\"Number of retrieved jobs: {len(jobs)}\")"]},{"cell_type":"markdown","metadata":{},"source":["## Filter retrieved jobs by job title and salary availability"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[],"source":["params = get_params()\n","target_job_titles = params['job_titles']"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"data":{"text/plain":["['Data Engineer',\n"," 'Data Scientist',\n"," 'Data Analyst',\n"," 'Machine Learning Engineer']"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["target_job_titles"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[],"source":["# Keep only jobs that advertise a salary and match one of the target titles\n","target_jobs = []\n","for job in jobs:\n","    if len(job['salary']) > 0:\n","        for target_job_title in target_job_titles:\n","            if target_job_title.lower() in job['job_title'].lower():\n","                target_jobs.append(job)\n","                break"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Number of filtered jobs: 31\n"]}],"source":["print(f\"Number of filtered jobs: {len(target_jobs)}\")"]},{"cell_type":"markdown","metadata":{},"source":["## Fetch job descriptions from individual job links"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2024-04-07T09:22:07.307243Z","iopub.status.busy":"2024-04-07T09:22:07.306836Z","iopub.status.idle":"2024-04-07T09:22:07.319745Z","shell.execute_reply":"2024-04-07T09:22:07.318194Z","shell.execute_reply.started":"2024-04-07T09:22:07.307211Z"},"tags":[],"trusted":true},"outputs":[],"source":["def fetch_job_article(job_id):\n","    # Download the raw HTML of a single job posting page\n","    article_url = f'https://www.jobstreet.com.my/job/{job_id}'\n","    response = requests.get(article_url)\n","    if response.status_code == 200:\n","        return response.text\n","    else:\n","        print(f\"Failed to retrieve job article. Status Code: {response.status_code}\")\n","        return None\n","\n","def extract_text_from_ul(html_content):\n","    # The bullet-point job description lives in <ul> blocks on the page\n","    soup = BeautifulSoup(html_content, 'html.parser')\n","    ul_tags = soup.find_all('ul')\n","    text_list = [ul.get_text(separator='\\n') for ul in ul_tags]\n","    return '\\n'.join(text_list)\n","\n","def scrape_and_store_text(job_id):\n","    job_article_content = fetch_job_article(job_id)\n","\n","    if job_article_content:\n","        text_from_ul = extract_text_from_ul(job_article_content)\n","        return text_from_ul\n","\n","    return ''"]},
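{"cell_type":"markdown","metadata":{},"source":["As a quick sanity check of the parsing helper, the sketch below runs `extract_text_from_ul` on a small invented snippet (not real Jobstreet markup); it should print one bullet per line."]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Toy HTML (invented markup, not real Jobstreet HTML) to sanity-check the helper\n","sample_html = '<div><ul><li>Build ML models</li><li>Deploy and monitor them</li></ul></div>'\n","\n","# Expected: the two bullet texts, one per line\n","print(extract_text_from_ul(sample_html))"]},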
{"cell_type":"code","execution_count":12,"metadata":{"execution":{"iopub.execute_input":"2024-04-07T09:22:09.177829Z","iopub.status.busy":"2024-04-07T09:22:09.177433Z","iopub.status.idle":"2024-04-07T09:24:23.150504Z","shell.execute_reply":"2024-04-07T09:24:23.148399Z","shell.execute_reply.started":"2024-04-07T09:22:09.177798Z"},"tags":[],"trusted":true},"outputs":[],"source":["# Fetch the description for each filtered job and keep only jobs with a non-empty description\n","completed_jobs = []\n","for job in target_jobs:\n","    description = scrape_and_store_text(job['job_id'])\n","    if len(description) > 0:\n","        job['description'] = description\n","        completed_jobs.append(job)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Number of completely retrieved jobs: 29\n"]}],"source":["print(f\"Number of completely retrieved jobs: {len(completed_jobs)}\")"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[],"source":["result_df = pd.DataFrame(completed_jobs)"]},
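{"cell_type":"markdown","metadata":{},"source":["The `salary` field comes back as free text such as `RM 4,000 – RM 6,000 per month`. If numeric bounds are ever needed downstream, here is a rough, hedged sketch of a regex-based parser; the helper `parse_salary_range` and the `salary_min`/`salary_max` labels are illustrative names, and the regex assumes the `RM X – RM Y per month` format seen in the scraped data."]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["import re\n","\n","\n","def parse_salary_range(salary_text):\n","    \"\"\"Extract (min, max) amounts from a string like 'RM 4,000 – RM 6,000 per month'.\n","\n","    Returns (None, None) when no number is found.\n","    \"\"\"\n","    amounts = [float(a.replace(',', '')) for a in re.findall(r'\\d[\\d,]*(?:\\.\\d+)?', str(salary_text))]\n","    if not amounts:\n","        return None, None\n","    return min(amounts), max(amounts)\n","\n","\n","# Preview the parsed bounds without modifying result_df (column labels are illustrative)\n","salary_bounds = result_df['salary'].apply(\n","    lambda s: pd.Series(parse_salary_range(s), index=['salary_min', 'salary_max'])\n",")\n","salary_bounds.head(3)"]},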
{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[{"data":{"text/html":["
"],"text/plain":[" job_id job_title \\\n","0 74727704 Machine Learning Engineer (Artificial Intelligence) \n","1 74850440 Data Scientist \n","2 74946043 Data Engineer Lead \n","\n"," company location \\\n","0 Private Advertiser Kuala Lumpur \n","1 PHENOMENAL UAM Sdn Bhd Kuala Lumpur \n","2 KK Group Of Companies Bandar Tasik Selatan \n","\n"," category \\\n","0 Science & Technology \n","1 Science & Technology \n","2 Information & Communication Technology \n","\n"," sub_category job_type \\\n","0 Mathematics, Statistics & Information Sciences Full time \n","1 Mathematics, Statistics & Information Sciences Full time \n","2 Engineering - Software Full time \n","\n"," salary \\\n","0 RM 4,000 – RM 6,000 per month \n","1 RM 5,000 – RM 7,500 per month \n","2 RM 7,000 – RM 8,000 per month \n","\n"," description \n","0 Design, develop, and deploy machine learning models and algorithms for complex and unique datasets, using various techniques such as mathematical modeling, scikit-learn, NLP, CNN, RNN, DL, RL, Tra... \n","1 Utilizing machine learning techniques to analyze and interpret complex data sets.\\nDeveloping AI models and algorithms to solve specific business problems and improve operational efficiency.\\nColl... \n","2 Making large and/or complex data more accessible, understandable and usable\\nTransforming, improving and integrating data, depending on the business requirements\\nDelivering the data in a useful a... "]},"execution_count":15,"metadata":{},"output_type":"execute_result"}],"source":["result_df.head(3)"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/html":["
"],"text/plain":[" job_title job_id\n","1 Data Analyst 5\n","11 Data Scientist 3\n","7 Data Engineer 2\n","13 Junior Data Analyst 2\n","0 Big Data Engineer - Cloud 1\n","12 Data Scientist Assistant Manager 1\n","19 Senior Data Analyst 1\n","18 Senior Business Intelligence Developer / Data Engineer 1\n","17 Risk Operation Data Analyst ( Online Payment) 1\n","16 Principal Data Engineer 1\n","15 Manager - Data Engineer 1\n","14 Machine Learning Engineer (Artificial Intelligence) 1\n","10 Data Engineer/Data Cloud Engineer/Salesforce 1\n","9 Data Engineer Lead 1\n","8 Data Engineer (Python) 1\n","6 Data Analyst/Scientist 1\n","5 Data Analyst Internship 1\n","4 Data Analyst - Internal Audit 1\n","3 Data Analyst (Intern) 1\n","2 Data Analyst (E-Commerce) 1\n","20 Senior Data Scientist (Full Stack) 1"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["result_df.groupby('job_title')[['job_id']].count().reset_index().sort_values(by=\"job_id\",ascending=False)"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[],"source":["data_path = os.path.join(data_dir, data_file)\n","\n","data_df = result_df.to_csv(data_path, sep=\",\")"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"datasetId":4284628,"sourceId":7654855,"sourceType":"datasetVersion"}],"dockerImageVersionId":30626,"isGpuEnabled":false,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.8"}},"nbformat":4,"nbformat_minor":4}