import json
import os
import time
from urllib.parse import urlparse

import openai
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    WebDriverException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Set up OpenAI API
openai.api_key = os.getenv("OPENAI")

# Initialize followup number
followup_number = 0

# Page number to start scraping from
page_number = 4

# Get the current working directory
cwd = os.getcwd()

# Path to the CSV file
csv_file = os.path.join(cwd, "vacancies.csv")

while True:
    try:
        # Set up the webdriver
        s = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=s)

        # The URL of the page with the buttons
        url = f"https://vacatures.jobfixers.be/zoek?page={page_number}&size=50&sortBy=updated:desc&initial=true"

        # Navigate to the page
        driver.get(url)

        # Wait for the page to load
        time.sleep(5)

        # Find the buttons
        buttons = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (
                    By.XPATH,
                    '//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
                )
            )
        )

        for i in range(len(buttons)):
            # Find the buttons again to avoid StaleElementReferenceException
            buttons = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located(
                    (
                        By.XPATH,
                        '//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
                    )
                )
            )

            # Click the button
            buttons[i].click()

            # Wait for the new page to load
            time.sleep(5)

            # Get the page source
            html = driver.page_source

            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")

            # Extract relevant items related to the vacancy
            vacancy_detail = {}

            # Extract the job position
            vacancy_detail["Position"] = soup.select_one(
                ".vacancy-detail__content__header__position"
            ).text.strip()

            # Extract the location
            vacancy_detail["Location"] = soup.select_one(
                ".vacancy-detail__content__header__location a"
            ).text.strip()

            # Extract the description
            description = soup.select_one(
                ".vacancy-detail__content__body__description__details"
            ).get_text(separator=" ")
            vacancy_detail["Description"] = description.strip()

            # Extract the profile details
            profile_details = soup.select(
                ".vacancy-detail__content__body__profile__details li"
            )
            vacancy_detail["Profile"] = [
                detail.text.strip() for detail in profile_details
            ]

            # Extract the list of competences
            competences = soup.select(
                ".vacancy-detail__content__body__competences__details li"
            )
            vacancy_detail["Competences"] = [
                competence.text.strip() for competence in competences
            ]

            # Extract the offer details
            offer_details = soup.select(
                ".vacancy-detail__content__body__offer__details li"
            )
            vacancy_detail["Offer"] = [offer.text.strip() for offer in offer_details]

            # Add the webpage and followup number
            vacancy_detail["Webpage"] = driver.current_url
            vacancy_detail["Followup_Number"] = followup_number

            # Get the final part of the URL as the vacancy id
            parsed_url = urlparse(driver.current_url)
            vacancy_detail["Vacancy_Id"] = os.path.basename(parsed_url.path)

            # Add the full URL of the webpage
            vacancy_detail["Full_URL"] = driver.current_url

            # Concatenate all the vacancy details into a single string
            vacancy_detail["Vacancy"] = (
                f"Position: {vacancy_detail['Position']}\n"
                f"Location: {vacancy_detail['Location']}\n"
                f"Description: {vacancy_detail['Description']}\n"
                f"Profile: {' '.join(vacancy_detail['Profile'])}\n"
                f"Competences: {' '.join(vacancy_detail['Competences'])}\n"
                f"Offer: {' '.join(vacancy_detail['Offer'])}"
            )

            # Add the entire dictionary as a JSON string under "Vacancy_JSON"
            vacancy_detail["Vacancy_JSON"] = json.dumps(vacancy_detail)

            # Increment the followup number
            followup_number += 1

            # Print the vacancy detail, page number and follow-up number
            print(f"Page Number: {page_number}, Follow-up Number: {followup_number}")
            print(vacancy_detail)

            # Append the vacancy detail to the CSV file
            df = pd.DataFrame([vacancy_detail])
            df.to_csv(
                csv_file, mode="a", header=not os.path.exists(csv_file), index=False
            )

            # Go back to the list page
            driver.back()
            time.sleep(5)

        # Close this browser instance before moving on, so Chrome processes
        # do not accumulate across pages
        driver.quit()

        # Go to the next page
        page_number += 1

    except WebDriverException as e:
        print(
            f"WebDriverException occurred: {e}. Restarting the browser and waiting for 1 minute before trying the next page."
        )
        driver.quit()
        time.sleep(60)
        page_number += 1

    except Exception as e:
        print(
            f"Exception occurred: {e}. Waiting for 1 minute before trying the next page."
        )
        time.sleep(60)
        page_number += 1

# Close the driver
driver.quit()