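"""Scrape vacancy details from vacatures.jobfixers.be and append them to vacancies.csv.

The script pages through the search results with Selenium, opens every vacancy in the
list, extracts the position, location, description, profile, competences and offer with
BeautifulSoup, and appends one row per vacancy to a local CSV file. The OpenAI API key
is read from the OPENAI environment variable, but the OpenAI client is not used further
in this script.
"""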
import os
import openai
import pandas as pd
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import time
# Set up OpenAI API
openai.api_key = os.getenv("OPENAI")
# Initialize followup number
followup_number = 0
# Page number to start scraping from
page_number = 4
# Get the current working directory
cwd = os.getcwd()
# Path to the CSV file
csv_file = os.path.join(cwd, "vacancies.csv")
while True:
    try:
        # Set up the webdriver
        s = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=s)
        # The URL of the list page with the vacancy buttons
        url = f"https://vacatures.jobfixers.be/zoek?page={page_number}&size=50&sortBy=updated:desc&initial=true"
        # Navigate to the page
        driver.get(url)
        # Wait for the page to load
        time.sleep(5)
        # Find the buttons
        buttons = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (
                    By.XPATH,
                    '//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
                )
            )
        )
        for i in range(len(buttons)):
            # Find the buttons again to avoid a StaleElementReferenceException
            buttons = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located(
                    (
                        By.XPATH,
                        '//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
                    )
                )
            )
            # Click the button to open the vacancy detail page
            buttons[i].click()
            # Wait for the new page to load
            time.sleep(5)
            # Get the page source
            html = driver.page_source
            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")
            # Extract the relevant items of the vacancy
            vacancy_detail = {}
            # Extract the job position
            vacancy_detail["Position"] = soup.select_one(
                ".vacancy-detail__content__header__position"
            ).text.strip()
            # Extract the location
            vacancy_detail["Location"] = soup.select_one(
                ".vacancy-detail__content__header__location a"
            ).text.strip()
            # Extract the description
            description = soup.select_one(
                ".vacancy-detail__content__body__description__details"
            ).get_text(separator=" ")
            vacancy_detail["Description"] = description.strip()
            # Extract the profile details
            profile_details = soup.select(
                ".vacancy-detail__content__body__profile__details li"
            )
            vacancy_detail["Profile"] = [
                detail.text.strip() for detail in profile_details
            ]
            # Extract the list of competences
            competences = soup.select(
                ".vacancy-detail__content__body__competences__details li"
            )
            vacancy_detail["Competences"] = [
                competence.text.strip() for competence in competences
            ]
            # Extract the offer details
            offer_details = soup.select(
                ".vacancy-detail__content__body__offer__details li"
            )
            vacancy_detail["Offer"] = [offer.text.strip() for offer in offer_details]
            # Add the webpage and the follow-up number
            vacancy_detail["Webpage"] = driver.current_url
            vacancy_detail["Followup_Number"] = followup_number
            # Use the final part of the URL as the vacancy id
            parsed_url = urlparse(driver.current_url)
            vacancy_detail["Vacancy_Id"] = os.path.basename(parsed_url.path)
            # Add the full URL of the webpage
            vacancy_detail["Full_URL"] = driver.current_url
            # Concatenate all the vacancy details into a single string
            vacancy_detail[
                "Vacancy"
            ] = f"Position: {vacancy_detail['Position']}\nLocation: {vacancy_detail['Location']}\nDescription: {vacancy_detail['Description']}\nProfile: {' '.join(vacancy_detail['Profile'])}\nCompetences: {' '.join(vacancy_detail['Competences'])}\nOffer: {' '.join(vacancy_detail['Offer'])}"
            # Add the entire dictionary as a JSON string under the key "Vacancy_JSON"
            vacancy_detail["Vacancy_JSON"] = json.dumps(vacancy_detail)
            # Increment the follow-up number
            followup_number += 1
            # Print the page number, the follow-up number and the vacancy detail
            print(f"Page Number: {page_number}, Follow-up Number: {followup_number}")
            print(vacancy_detail)
            # Append the vacancy detail to the CSV file
            df = pd.DataFrame([vacancy_detail])
            df.to_csv(
                csv_file, mode="a", header=not os.path.exists(csv_file), index=False
            )
            # Go back to the list page
            driver.back()
            time.sleep(5)
        # Go to the next page
        page_number += 1
    except WebDriverException as e:
        print(
            f"WebDriverException occurred: {e}. Restarting the browser and waiting for 1 minute before trying the next page."
        )
        driver.quit()
        time.sleep(60)
        page_number += 1
    except Exception as e:
        print(
            f"Exception occurred: {e}. Waiting for 1 minute before trying the next page."
        )
        time.sleep(60)
        page_number += 1
    # Close the driver before starting the next iteration
    driver.quit()