import streamlit as st
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import quote_plus
# Global DataFrame that accumulates one row per scraped job posting
df = pd.DataFrame(columns=["Title", "Location", "Company", "Link", "Description"])
# Get user input
inputJobTitle = st.text_input("Enter Job Title:")
inputJobLocation = st.text_input("Enter Job Location:")
totalPages = st.number_input("Enter Total Pages:", min_value=1, value=1)
submit_button = st.button("Submit")
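# Note: with Selenium 4.6+ the bare webdriver.Chrome() calls below resolve a
# matching chromedriver automatically (Selenium Manager); on older releases
# the driver binary must already be on PATH.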
def scrapeJobDescription(url):
    # Fetch a single job-detail page in its own headless browser and return
    # the description text, or an empty string if the page has none.
    options = Options()
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    html = driver.page_source
    driver.quit()
    soup = BeautifulSoup(html, "html.parser")
    try:
        jobDescription = soup.find(
            "div", class_="show-more-less-html__markup"
        ).text.strip()
        return jobDescription
    except AttributeError:
        # soup.find() returned None: the description block is missing
        # (e.g. LinkedIn served an auth wall instead of the posting).
        return ""
def scrapeLinkedin():
    global df
    counter = 0
    pageCounter = 1
    options = Options()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    while pageCounter <= totalPages:
        try:
            # quote_plus() makes multi-word titles/locations URL-safe
            driver.get(
                "https://www.linkedin.com/jobs/search/"
                f"?&keywords={quote_plus(inputJobTitle)}"
                f"&location={quote_plus(inputJobLocation)}"
                f"&refresh=true&start={counter}"
            )
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            ulElement = soup.find("ul", class_="jobs-search__results-list")
            liElements = ulElement.find_all("li")
            for item in liElements:
                jobTitle = item.find(
                    "h3", class_="base-search-card__title"
                ).text.strip()
                jobLocation = item.find(
                    "span", class_="job-search-card__location"
                ).text.strip()
                jobCompany = item.find(
                    "h4", class_="base-search-card__subtitle"
                ).text.strip()
                jobLink = item.find_all("a")[0]["href"]
                jobDescription = scrapeJobDescription(jobLink)
                if jobTitle and jobLocation and jobCompany and jobLink:
                    df = pd.concat(
                        [
                            df,
                            pd.DataFrame(
                                {
                                    "Title": [jobTitle],
                                    "Location": [jobLocation],
                                    "Company": [jobCompany],
                                    "Link": [jobLink],
                                    "Description": [jobDescription],
                                }
                            ),
                        ],
                        ignore_index=True,
                    )
            counter += 25
            pageCounter += 1
        except Exception:
            # A missing results list usually means LinkedIn stopped serving
            # guest results (rate limit or end of listings): stop paginating.
            break
    driver.quit()
def convert_df(df):
    # Streamlit's download button takes bytes, so serialize the CSV as UTF-8.
    return df.to_csv(index=False).encode("utf-8")
if submit_button:
    with st.spinner("Operation in progress. Please wait..."):
        scrapeLinkedin()
        time.sleep(1)
    st.write(df)
    csv = convert_df(df)
    st.download_button(
        "Press to Download",
        csv,
        "file.csv",
        "text/csv",
        key="download-csv",
    )
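# Minimal usage sketch, assuming this file is saved as app.py:
#   streamlit run app.py
# Then open the local URL Streamlit prints, fill in the job title, location,
# and page count, and press Submit to scrape and download the CSV.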