import json
import os
import time
from urllib.parse import urlparse
import openai
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
WebDriverException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Set up OpenAI API
openai.api_key = os.getenv("OPENAI")
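# The key is read from the OPENAI environment variable; openai.api_key stays None if the variable is not set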
# Initialize followup number
followup_number = 0
# Page of the vacancy list to start from
page_number = 4
# Get the current working directory
cwd = os.getcwd()
# Path to the CSV file
csv_file = os.path.join(cwd, "vacancies.csv")
while True:
try:
# Setup webdriver
s = Service(ChromeDriverManager().install())
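        # webdriver_manager downloads a matching ChromeDriver binary if one is not already cached locally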
driver = webdriver.Chrome(service=s)
# The URL of the page with the buttons
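        # The query string requests 50 vacancies per page, sorted by most recent update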
url = f"https://vacatures.jobfixers.be/zoek?page={page_number}&size=50&sortBy=updated:desc&initial=true"
# Navigate to the page
driver.get(url)
# Wait for the page to load
time.sleep(5)
# Find the buttons
buttons = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located(
(
By.XPATH,
'//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
)
)
)
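        # Each matching Material button is expected to open one vacancy from the result list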
for i in range(len(buttons)):
# Find the buttons again to avoid StaleElementReferenceException
buttons = WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located(
(
By.XPATH,
'//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]',
)
)
)
# Click the button
buttons[i].click()
# Wait for the new page to load
time.sleep(5)
# Get the page source
html = driver.page_source
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
# Extract relevant items related to the vacancy
vacancy_detail = {}
# Extracting the job position
vacancy_detail["Position"] = soup.select_one(
".vacancy-detail__content__header__position"
).text.strip()
# Extracting the location
vacancy_detail["Location"] = soup.select_one(
".vacancy-detail__content__header__location a"
).text.strip()
# Extracting the description
description = soup.select_one(
".vacancy-detail__content__body__description__details"
).get_text(separator=" ")
vacancy_detail["Description"] = description.strip()
# Extracting the profile details
profile_details = soup.select(
".vacancy-detail__content__body__profile__details li"
)
vacancy_detail["Profile"] = [
detail.text.strip() for detail in profile_details
]
# Extracting the list of competences
competences = soup.select(
".vacancy-detail__content__body__competences__details li"
)
vacancy_detail["Competences"] = [
competence.text.strip() for competence in competences
]
# Extracting the offer details
offer_details = soup.select(
".vacancy-detail__content__body__offer__details li"
)
vacancy_detail["Offer"] = [offer.text.strip() for offer in offer_details]
# Add the webpage and followup number
vacancy_detail["Webpage"] = driver.current_url
vacancy_detail["Followup_Number"] = followup_number
# Get the final part of the URL
parsed_url = urlparse(driver.current_url)
vacancy_detail["Vacancy_Id"] = os.path.basename(parsed_url.path)
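            # e.g. a detail URL ending in ".../12345" gives Vacancy_Id "12345" (assuming the id is the last path segment)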
# Add the full URL of the webpage
vacancy_detail["Full_URL"] = driver.current_url
# Concatenate all the vacancy details into a single string
vacancy_detail[
"Vacancy"
] = f"Position: {vacancy_detail['Position']}\nLocation: {vacancy_detail['Location']}\nDescription: {vacancy_detail['Description']}\nProfile: {' '.join(vacancy_detail['Profile'])}\nCompetences: {' '.join(vacancy_detail['Competences'])}\nOffer: {' '.join(vacancy_detail['Offer'])}"
# Add the entire dictionary as a JSON string to a new key "Vacancy_JSON"
vacancy_detail["Vacancy_JSON"] = json.dumps(vacancy_detail)
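            # The JSON snapshot contains every field collected above, including the combined "Vacancy" text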
# Increment the followup number
followup_number += 1
# Print the vacancy detail, page number and follow-up number
print(f"Page Number: {page_number}, Follow-up Number: {followup_number}")
print(vacancy_detail)
# Append the vacancy detail to the CSV file
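            # (the header row is written only when the CSV file does not exist yet)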
df = pd.DataFrame([vacancy_detail])
df.to_csv(
csv_file, mode="a", header=not os.path.exists(csv_file), index=False
)
# Go back to the list page
driver.back()
time.sleep(5)
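            # Navigating back invalidates the previously located button elements, hence the re-fetch at the top of this loop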
# Go to the next page
page_number += 1
    except WebDriverException as e:
        print(
            f"WebDriverException occurred: {e}. Restarting the browser and waiting for 1 minute before trying the next page."
        )
        # Best-effort cleanup: the browser may never have started or may already be gone
        try:
            driver.quit()
        except Exception:
            pass
        time.sleep(60)
        page_number += 1
except Exception as e:
print(
f"Exception occurred: {e}. Waiting for 1 minute before trying the next page."
)
time.sleep(60)
page_number += 1
# Close the driver
driver.quit()