import json
import os
import time
from urllib.parse import urlparse

import openai
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# ---------------------------------------------------------------------------
# Scraper configuration
# ---------------------------------------------------------------------------

# Value stored as "Followup_Number" on the first scraped vacancy; it grows by
# one for every vacancy scraped afterwards.
followup_number = 0

# Listing page the scrape starts from.
page_number = 4

cwd = os.getcwd()

# Every scraped vacancy is appended to this CSV file as one row.
csv_file = os.path.join(cwd, "vacancies.csv")

# XPath matching the buttons on a listing page that are clicked to open a
# vacancy detail page.
VACANCY_BUTTON_XPATH = '//button[@mat-button and contains(@class, "mat-focus-indicator mat-button mat-button-base")]'

# Fixed pause after each navigation so the page can finish rendering.
PAGE_SETTLE_SECONDS = 5

# Upper bound for the explicit wait on the vacancy buttons.
BUTTON_WAIT_SECONDS = 10

# Pause before moving on after a page failed or turned out to be empty.
RETRY_DELAY_SECONDS = 60

# The scrape stops once this many listing pages in a row showed no vacancy
# buttons.  NOTE(review): this assumes pages past the end of the listing
# render without such buttons - confirm against the site.
MAX_CONSECUTIVE_EMPTY_PAGES = 3


def listing_url(page):
    """Return the URL of listing page number *page*."""
    return f"https://vacatures.jobfixers.be/zoek?page={page}&size=50&sortBy=updated:desc&initial=true"


def vacancy_id_from_url(url):
    """Return the last path segment of *url* (query string ignored)."""
    return os.path.basename(urlparse(url).path)


def build_vacancy_text(detail):
    """Return the multi-line plain-text summary stored under "Vacancy".

    *detail* must provide the string fields "Position", "Location" and
    "Description" and the list-of-string fields "Profile", "Competences"
    and "Offer".  List fields are joined with single spaces.
    """
    return (
        f"Position: {detail['Position']}\n"
        f"Location: {detail['Location']}\n"
        f"Description: {detail['Description']}\n"
        f"Profile: {' '.join(detail['Profile'])}\n"
        f"Competences: {' '.join(detail['Competences'])}\n"
        f"Offer: {' '.join(detail['Offer'])}"
    )


def parse_vacancy(html, url, followup):
    """Extract one vacancy record from the HTML of a vacancy detail page.

    Args:
        html: Page source of the vacancy detail page.
        url: URL of that page; stored as "Webpage" and "Full_URL" and used
            to derive "Vacancy_Id".
        followup: Number stored as "Followup_Number".

    Returns:
        A dict whose key order defines the CSV column order.

    Raises:
        AttributeError: If the position, location or description element
            is missing from the page.
    """
    soup = BeautifulSoup(html, "html.parser")

    def list_items(selector):
        # Stripped text of every element matching *selector*.
        return [item.text.strip() for item in soup.select(selector)]

    detail = {
        "Position": soup.select_one(
            ".vacancy-detail__content__header__position"
        ).text.strip(),
        "Location": soup.select_one(
            ".vacancy-detail__content__header__location a"
        ).text.strip(),
        "Description": soup.select_one(
            ".vacancy-detail__content__body__description__details"
        )
        .get_text(separator=" ")
        .strip(),
        "Profile": list_items(".vacancy-detail__content__body__profile__details li"),
        "Competences": list_items(
            ".vacancy-detail__content__body__competences__details li"
        ),
        "Offer": list_items(".vacancy-detail__content__body__offer__details li"),
        "Webpage": url,
        "Followup_Number": followup,
        "Vacancy_Id": vacancy_id_from_url(url),
        "Full_URL": url,
    }
    detail["Vacancy"] = build_vacancy_text(detail)
    # Serialized last so the JSON holds every field above, but not itself.
    detail["Vacancy_JSON"] = json.dumps(detail)
    return detail


def append_vacancy(detail, path):
    """Append *detail* as one row to the CSV at *path*.

    The header row is written only when the file does not exist yet.
    """
    pd.DataFrame([detail]).to_csv(
        path, mode="a", header=not os.path.exists(path), index=False
    )


def find_vacancy_buttons(driver):
    """Wait for the vacancy buttons on the current page and return them.

    Raises:
        TimeoutException: If no matching button is present within
            BUTTON_WAIT_SECONDS.
    """
    return WebDriverWait(driver, BUTTON_WAIT_SECONDS).until(
        EC.presence_of_all_elements_located((By.XPATH, VACANCY_BUTTON_XPATH))
    )


def iter_vacancy_pages(driver, page):
    """Open listing page *page* and yield ``(html, url)`` per vacancy.

    Each vacancy button is clicked in turn; the page source and URL of the
    resulting detail page are yielded, then the browser navigates back to
    the listing.  Nothing is yielded when the listing shows no vacancy
    buttons.
    """
    driver.get(listing_url(page))
    time.sleep(PAGE_SETTLE_SECONDS)

    try:
        button_count = len(find_vacancy_buttons(driver))
    except TimeoutException:
        # No vacancy buttons on this listing page: report it as empty.
        return

    for index in range(button_count):
        # Elements located before a navigation cannot be reused after
        # coming back to the listing, so the buttons are located again on
        # every iteration and addressed by position.
        buttons = find_vacancy_buttons(driver)
        buttons[index].click()
        time.sleep(PAGE_SETTLE_SECONDS)

        yield driver.page_source, driver.current_url

        driver.back()
        time.sleep(PAGE_SETTLE_SECONDS)


def close_browser(driver):
    """Quit *driver* if one was started, reporting (not raising) failures."""
    if driver is None:
        return
    try:
        driver.quit()
    except WebDriverException as e:
        print(f"Could not close the browser cleanly: {e}")


def main():
    """Scrape listing pages one by one, appending each vacancy to the CSV.

    A fresh browser is started for every listing page and is always closed
    again before the next page is attempted.  A page that raises is
    reported and skipped after a one-minute pause.  The run ends once
    MAX_CONSECUTIVE_EMPTY_PAGES listing pages in a row had no vacancies.
    """
    openai.api_key = os.getenv("OPENAI")

    page = page_number
    followup = followup_number
    empty_pages = 0

    while empty_pages < MAX_CONSECUTIVE_EMPTY_PAGES:
        driver = None  # stays None when the browser itself fails to start
        scraped = 0
        failed = False

        try:
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

            for html, url in iter_vacancy_pages(driver, page):
                detail = parse_vacancy(html, url, followup)
                followup += 1

                print(f"Page Number: {page}, Follow-up Number: {followup}")
                print(detail)

                append_vacancy(detail, csv_file)
                scraped += 1

        except WebDriverException as e:
            print(
                f"WebDriverException occurred: {e}. Restarting the browser and waiting for 1 minute before trying the next page."
            )
            failed = True

        except Exception as e:
            # Top-level boundary: one broken page must not end the run.
            print(
                f"Exception occurred: {e}. Waiting for 1 minute before trying the next page."
            )
            failed = True

        finally:
            # Runs on every path, so no browser outlives its page.
            close_browser(driver)

        if scraped:
            empty_pages = 0
        elif not failed:
            empty_pages += 1
            print(
                f"No vacancies found on page {page} "
                f"({empty_pages}/{MAX_CONSECUTIVE_EMPTY_PAGES} consecutive empty pages)."
            )

        if failed or not scraped:
            time.sleep(RETRY_DELAY_SECONDS)

        page += 1

    print(
        f"Stopping: {MAX_CONSECUTIVE_EMPTY_PAGES} consecutive listing pages had no vacancies."
    )


if __name__ == "__main__":
    main()