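"""Scrape vacancy details from vacatures.jobfixers.be and append them to vacancies.csv.

The script pages through the search results with Selenium, opens every vacancy in the
list, extracts the position, location, description, profile, competences and offer with
BeautifulSoup, and appends one row per vacancy to a local CSV file. The OpenAI API key
is read from the OPENAI environment variable, but the OpenAI client is not used further
in this script.
"""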
import os
import openai
import pandas as pd
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import time
# Set up OpenAI API
openai.api_key = os.getenv("OPENAI")
# Initialize followup number
followup_number = 0
# Page number to start scraping from
page_number = 4
# Get the current working directory
cwd = os.getcwd()
# Path to the CSV file
csv_file = os.path.join(cwd, "vacancies.csv")
while True:
    try:
        # Set up the webdriver
        s = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=s)
        # The URL of the list page with the vacancy buttons
        url = f"https://vacatures.jobfixers.be/zoek?page={page_number}&size=50&sortBy=updated:desc&initial=true"
        # Navigate to the page
        driver.get(url)
        # Wait for the page to load
        time.sleep(5)
        # Find the buttons
        buttons = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (
                    By.XPATH,
                    '//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
                )
            )
        )
        for i in range(len(buttons)):
            # Find the buttons again to avoid a StaleElementReferenceException
            buttons = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located(
                    (
                        By.XPATH,
                        '//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
                    )
                )
            )
            # Click the button to open the vacancy detail page
            buttons[i].click()
            # Wait for the new page to load
            time.sleep(5)
            # Get the page source
            html = driver.page_source
            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")
            # Extract the relevant items of the vacancy
            vacancy_detail = {}
            # Extract the job position
            vacancy_detail["Position"] = soup.select_one(
                ".vacancy-detail__content__header__position"
            ).text.strip()
            # Extract the location
            vacancy_detail["Location"] = soup.select_one(
                ".vacancy-detail__content__header__location a"
            ).text.strip()
            # Extract the description
            description = soup.select_one(
                ".vacancy-detail__content__body__description__details"
            ).get_text(separator=" ")
            vacancy_detail["Description"] = description.strip()
            # Extract the profile details
            profile_details = soup.select(
                ".vacancy-detail__content__body__profile__details li"
            )
            vacancy_detail["Profile"] = [
                detail.text.strip() for detail in profile_details
            ]
            # Extract the list of competences
            competences = soup.select(
                ".vacancy-detail__content__body__competences__details li"
            )
            vacancy_detail["Competences"] = [
                competence.text.strip() for competence in competences
            ]
            # Extract the offer details
            offer_details = soup.select(
                ".vacancy-detail__content__body__offer__details li"
            )
            vacancy_detail["Offer"] = [offer.text.strip() for offer in offer_details]
            # Add the webpage and the follow-up number
            vacancy_detail["Webpage"] = driver.current_url
            vacancy_detail["Followup_Number"] = followup_number
            # Use the final part of the URL as the vacancy id
            parsed_url = urlparse(driver.current_url)
            vacancy_detail["Vacancy_Id"] = os.path.basename(parsed_url.path)
            # Add the full URL of the webpage
            vacancy_detail["Full_URL"] = driver.current_url
            # Concatenate all the vacancy details into a single string
            vacancy_detail[
                "Vacancy"
            ] = f"Position: {vacancy_detail['Position']}\nLocation: {vacancy_detail['Location']}\nDescription: {vacancy_detail['Description']}\nProfile: {' '.join(vacancy_detail['Profile'])}\nCompetences: {' '.join(vacancy_detail['Competences'])}\nOffer: {' '.join(vacancy_detail['Offer'])}"
            # Add the entire dictionary as a JSON string under the key "Vacancy_JSON"
            vacancy_detail["Vacancy_JSON"] = json.dumps(vacancy_detail)
            # Increment the follow-up number
            followup_number += 1
            # Print the page number, the follow-up number and the vacancy detail
            print(f"Page Number: {page_number}, Follow-up Number: {followup_number}")
            print(vacancy_detail)
            # Append the vacancy detail to the CSV file
            df = pd.DataFrame([vacancy_detail])
            df.to_csv(
                csv_file, mode="a", header=not os.path.exists(csv_file), index=False
            )
            # Go back to the list page
            driver.back()
            time.sleep(5)
        # Go to the next page
        page_number += 1
    except WebDriverException as e:
        print(
            f"WebDriverException occurred: {e}. Restarting the browser and waiting for 1 minute before trying the next page."
        )
        driver.quit()
        time.sleep(60)
        page_number += 1
    except Exception as e:
        print(
            f"Exception occurred: {e}. Waiting for 1 minute before trying the next page."
        )
        time.sleep(60)
        page_number += 1
    # Close the driver before starting the next iteration
    driver.quit()