import json
import os
import time
from urllib.parse import urlparse
import openai
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
WebDriverException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Set up OpenAI API
openai.api_key = os.getenv("OPENAI")
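# The key is read from the OPENAI environment variable; openai.api_key stays None if the variable is not set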
# Initialize followup number
followup_number = 0
# Page of the vacancy list to start from
page_number = 4
# Get the current working directory
cwd = os.getcwd()
# Path to the CSV file
csv_file = os.path.join(cwd, "vacancies.csv")
while True:
try:
# Setup webdriver
s = Service(ChromeDriverManager().install())
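        # webdriver_manager downloads a matching ChromeDriver binary if one is not already cached locally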
driver = webdriver.Chrome(service=s)
# The URL of the page with the buttons
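        # The query string requests 50 vacancies per page, sorted by most recent update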
url = f"https://vacatures.jobfixers.be/zoek?page={page_number}&size=50&sortBy=updated:desc&initial=true"
# Navigate to the page
driver.get(url)
# Wait for the page to load
time.sleep(5)
# Find the buttons
buttons = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located(
(
By.XPATH,
'//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
)
)
)
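        # Each matching Material button is expected to open one vacancy from the result list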
for i in range(len(buttons)):
# Find the buttons again to avoid StaleElementReferenceException
buttons = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located(
(
By.XPATH,
'//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
)
)
)
# Click the button
buttons[i].click()
# Wait for the new page to load
time.sleep(5)
# Get the page source
html = driver.page_source
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
# Extract relevant items related to the vacancy
vacancy_detail = {}
# Extracting the job position
vacancy_detail["Position"] = soup.select_one(
".vacancy-detail__content__header__position"
).text.strip()
# Extracting the location
vacancy_detail["Location"] = soup.select_one(
".vacancy-detail__content__header__location a"
).text.strip()
# Extracting the description
description = soup.select_one(
".vacancy-detail__content__body__description__details"
).get_text(separator=" ")
vacancy_detail["Description"] = description.strip()
# Extracting the profile details
profile_details = soup.select(
".vacancy-detail__content__body__profile__details li"
)
vacancy_detail["Profile"] = [
detail.text.strip() for detail in profile_details
]
# Extracting the list of competences
competences = soup.select(
".vacancy-detail__content__body__competences__details li"
)
vacancy_detail["Competences"] = [
competence.text.strip() for competence in competences
]
# Extracting the offer details
offer_details = soup.select(
".vacancy-detail__content__body__offer__details li"
)
vacancy_detail["Offer"] = [offer.text.strip() for offer in offer_details]
# Add the webpage and followup number
vacancy_detail["Webpage"] = driver.current_url
vacancy_detail["Followup_Number"] = followup_number
# Get the final part of the URL
parsed_url = urlparse(driver.current_url)
vacancy_detail["Vacancy_Id"] = os.path.basename(parsed_url.path)
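            # e.g. a detail URL ending in ".../12345" gives Vacancy_Id "12345" (assuming the id is the last path segment)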
# Add the full URL of the webpage
vacancy_detail["Full_URL"] = driver.current_url
# Concatenate all the vacancy details into a single string
vacancy_detail[
"Vacancy"
] = f"Position: {vacancy_detail['Position']}\nLocation: {vacancy_detail['Location']}\nDescription: {vacancy_detail['Description']}\nProfile: {' '.join(vacancy_detail['Profile'])}\nCompetences: {' '.join(vacancy_detail['Competences'])}\nOffer: {' '.join(vacancy_detail['Offer'])}"
# Add the entire dictionary as a JSON string to a new key "Vacancy_JSON"
vacancy_detail["Vacancy_JSON"] = json.dumps(vacancy_detail)
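            # The JSON snapshot contains every field collected above, including the combined "Vacancy" text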
# Increment the followup number
followup_number += 1
# Print the vacancy detail, page number and follow-up number
print(f"Page Number: {page_number}, Follow-up Number: {followup_number}")
print(vacancy_detail)
# Append the vacancy detail to the CSV file
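            # (the header row is written only when the CSV file does not exist yet)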
df = pd.DataFrame([vacancy_detail])
df.to_csv(
csv_file, mode="a", header=not os.path.exists(csv_file), index=False
)
# Go back to the list page
driver.back()
time.sleep(5)
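            # Navigating back invalidates the previously located button elements, hence the re-fetch at the top of this loop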
# Go to the next page
page_number += 1
    except WebDriverException as e:
        print(
            f"WebDriverException occurred: {e}. Restarting the browser and waiting for 1 minute before trying the next page."
        )
        # Best-effort cleanup: the browser may never have started or may already be gone
        try:
            driver.quit()
        except Exception:
            pass
        time.sleep(60)
        page_number += 1
except Exception as e:
print(
f"Exception occurred: {e}. Waiting for 1 minute before trying the next page."
)
time.sleep(60)
page_number += 1
# Close the driver
driver.quit()