import gradio as gr
from docx import Document
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import csv
import time
import pickle
import logging
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from urllib.parse import urlencode
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
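
# NLTK needs its tokenizer and stopword data locally. Assumption: they may not be
# installed yet; nltk.download is a no-op when the data is already present.
import nltk
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)

# Configure logging so the logging.info(...) calls below are emitted; by default
# only warnings and errors reach the console.
logging.basicConfig(level=logging.INFO)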

class LinkedInBot:
    def __init__(self, delay=5):
        os.makedirs("data", exist_ok=True)
        self.delay = delay
        self.driver = webdriver.Chrome()

    def login(self, email, password):
        """Go to LinkedIn and login"""
        self.driver.maximize_window()
        self.driver.get('https://www.linkedin.com/login')
        self.driver.find_element(By.ID, 'username').send_keys(email)
        self.driver.find_element(By.ID, 'password').send_keys(password)
        self.driver.find_element(By.XPATH, "//button[@type='submit']").click()

    def save_cookie(self, path):
        with open(path, 'wb') as filehandler:
            pickle.dump(self.driver.get_cookies(), filehandler)

    def load_cookie(self, path):
        with open(path, 'rb') as cookiesfile:
            cookies = pickle.load(cookiesfile)
            for cookie in cookies:
                self.driver.add_cookie(cookie)

    def search_linkedin(self, keywords, location, date_posted):
        """Run a job search for the given keywords, location, and posting window."""
        # URL-encode the query so spaces and special characters in user input
        # survive the query string; f_TPR filters by time posted (e.g. r86400 = 24h).
        query = urlencode({"keywords": keywords, "location": location, "f_TPR": date_posted})
        self.driver.get(f"https://www.linkedin.com/jobs/search/?{query}")

    def wait(self, by=By.CLASS_NAME, text="occludable-update", t_delay=None, max_retries=3):
        """Wait until a specific element is present on the page.

        Defaults to the job-list item class ("occludable-update") so that the
        bare self.wait() calls in run() have a concrete locator to wait on.
        """
        delay = self.delay if t_delay is None else t_delay
        retries = 0
        while retries < max_retries:
            try:
                WebDriverWait(self.driver, delay).until(EC.presence_of_element_located((by, text)))
                return  # Element found, exit the loop
            except TimeoutException:
                retries += 1
                logging.warning(f"Element not found, retrying... ({retries}/{max_retries})")
                time.sleep(delay)  # Wait before retrying
        logging.error("Element not found after retries.")

    def scroll_to(self, job_list_item):
        """Scroll to the list item in the column and click on it."""
        self.driver.execute_script("arguments[0].scrollIntoView();", job_list_item)
        job_list_item.click()

    def extract_additional_details(self, job):
        """Extracts additional details like company size, position level, salary, job type, industry, and skills if available."""
        company_size = None
        position_level = None
        salary = None
        job_type = None
        industry = None
        skills = None

        try:
            additional_info = job.find_element(By.CLASS_NAME, "job-details-jobs-unified-top-card__job-insight")

            # Extract salary
            salary_element = additional_info.find_element(By.XPATH, ".//span[contains(@class, 'job-details-jobs-unified-top-card__job-insight-view-model-secondary')]")
            salary = salary_element.text.strip()
            
            # Extract job type, position level, and industry
            for span in additional_info.find_elements(By.XPATH, ".//span[contains(@class, 'job-details-jobs-unified-top-card__job-insight-view-model-secondary')]"):
                text = span.text.strip()
                if "Hybrid" in text:
                    job_type = text
                elif "Full-time" in text:
                    job_type = text
                elif "Mid-Senior level" in text:
                    position_level = text
                else:
                    industry = text

            # Extract company size and industry
            company_info = additional_info.find_element(By.XPATH, ".//span")
            company_info_text = company_info.text.strip()
            if "employees" in company_info_text:
                company_size = company_info_text.split(" · ")[0]
                industry = company_info_text.split(" · ")[1]
            else:
                industry = company_info_text

            # Extract skills
            skills_button = additional_info.find_element(By.CLASS_NAME, "job-details-jobs-unified-top-card__job-insight-text-button")
            skills_link = skills_button.find_element(By.TAG_NAME, "a")
            skills = skills_link.text.split(": ")[1]

        except NoSuchElementException:
            pass

        return company_size, position_level, salary, job_type, industry, skills
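
    # NOTE: run() currently writes only position, company, location, and description
    # to data.csv; the extra details extracted above are collected but unused.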

    def get_position_data(self, job):
        """Gets the position data for a posting."""
        job_info = job.text.split('\n')
        if len(job_info) < 3:
            logging.warning("Incomplete job information, skipping...")
            return None

        position, company, *details = job_info
        location = details[0] if details else None
        description = self.get_job_description(job)

        return [position, company, location, description]


    def extract_card_details(self, job):
        """Extracts company size, position level, salary, and job type from a
        job-card (list view) element, as opposed to the unified top card parsed
        by extract_additional_details above."""
        company_size = None
        position_level = None
        salary = None
        job_type = None

        try:
            additional_info = job.find_element(By.CLASS_NAME, "job-card-search__company-size").text
            if "employees" in additional_info:
                company_size = additional_info.strip()
        except NoSuchElementException:
            pass

        try:
            position_level = job.find_element(By.CLASS_NAME, "job-card-search__badge").text
        except NoSuchElementException:
            pass

        try:
            salary = job.find_element(By.CLASS_NAME, "job-card-search__salary").text
        except NoSuchElementException:
            pass

        try:
            job_type = job.find_element(By.CLASS_NAME, "job-card-search__job-type").text
        except NoSuchElementException:
            pass

        return company_size, position_level, salary, job_type

    def get_job_description(self, job):
        """Gets the job description."""
        self.scroll_to(job)
        try:
            description_element = self.driver.find_element(By.CLASS_NAME, "jobs-description")
            description = description_element.text
        except NoSuchElementException:
            description = None
        return description

    def get_application_link(self, job):
        """Gets the job application link."""
        try:
            application_link_element = job.find_element(By.CLASS_NAME, "job-card-search__apply-button-container").find_element(By.TAG_NAME, "a")
            application_link = application_link_element.get_attribute("href")
        except NoSuchElementException:
            application_link = None
        return application_link

    def run(self, email, password, keywords, location, date_posted):
        if os.path.exists("data/cookies.txt"):
            self.driver.get("https://www.linkedin.com/")
            self.load_cookie("data/cookies.txt")
            self.driver.get("https://www.linkedin.com/")
        else:
            self.login(email=email, password=password)
            self.save_cookie("data/cookies.txt")

        logging.info("Begin LinkedIn keyword search")
        self.search_linkedin(keywords, location, date_posted)
        self.wait()

        csv_file_path = os.path.join("data", "data.csv")
        with open(csv_file_path, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["Position", "Company", "Location", "Description"])

            page = 1
            while True:
                jobs = self.driver.find_elements(By.CLASS_NAME, "occludable-update")
                for job in jobs:
                    job_data = self.get_position_data(job)
                    if job_data:
                        position, company, location, description = job_data
                        writer.writerow([position, company, location, description])

                next_button_xpath = f"//button[@aria-label='Page {page + 1}']"
                next_button = self.driver.find_elements(By.XPATH, next_button_xpath)
                if next_button:
                    next_button[0].click()
                    self.wait()
                    page += 1
                else:
                    break

        logging.info("Done scraping.")
        logging.info("Closing session.")
        self.close_session()

    def close_session(self):
        """End the WebDriver session."""
        logging.info("Closing session")
        # quit() closes all windows and ends the driver process;
        # close() would only close the current window.
        self.driver.quit()
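
# A minimal standalone sketch for running the bot without the Gradio UI.
# Assumptions: a ChromeDriver compatible with the local Chrome is on PATH, the
# credentials are valid, and "r86400" is the f_TPR value for the past 24 hours.
#
#   bot = LinkedInBot(delay=5)
#   bot.run("me@example.com", "password", "python developer", "Remote", "r86400")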

# Function to extract keywords from text
def extract_keywords(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove stopwords and punctuation
    stopwords_list = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stopwords_list and token not in string.punctuation]
    return tokens
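
# For example, extract_keywords("Looking for a senior Python developer (remote).")
# returns ['looking', 'senior', 'python', 'developer', 'remote']: the stopwords
# "for" and "a" and all punctuation tokens are dropped.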

# Function to process uploaded resume
def process_resume(uploaded_file):
    """Read all paragraph text out of the uploaded .docx resume."""
    docx = Document(uploaded_file.name)
    return "\n".join(paragraph.text for paragraph in docx.paragraphs)

def keyword_similarity_check(resume_text, df, keywords):
    """Score each job description by the share of distinct resume keywords it contains.

    `resume_text` is accepted for symmetry with cosine_similarity_check; the
    caller passes the precomputed resume keywords in `keywords`.
    """
    job_descriptions = df["Description"].fillna("")
    keyword_set = set(keywords)

    # Count the distinct resume keywords appearing in each description; using
    # distinct matches keeps every score in the 0-100% range.
    job_description_keywords = [extract_keywords(desc) for desc in job_descriptions]
    common_keyword_counts = [len(set(job_keywords) & keyword_set)
                             for job_keywords in job_description_keywords]

    # Similarity = percentage of the resume's distinct keywords found in the posting.
    denominator = max(len(keyword_set), 1)  # guard against an empty resume
    df["Keyword Similarity (%)"] = [count / denominator * 100 for count in common_keyword_counts]
    df.to_csv("data/data.csv", index=False)
    return df
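
# For example, if the resume yields the keyword set {'python', 'sql', 'aws'} and a
# posting's description contains 'python' and 'sql', that row scores 2/3 * 100 ≈ 66.7%.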

def cosine_similarity_check(resume_text, df):
    """Score each job description by TF-IDF cosine similarity to the resume text."""
    vectorizer = TfidfVectorizer()
    job_descriptions = df["Description"].fillna("")
    tfidf_matrix = vectorizer.fit_transform(job_descriptions)
    resume_tfidf = vectorizer.transform([resume_text])
    similarity_scores = cosine_similarity(resume_tfidf, tfidf_matrix)[0]
    # A separate column from the keyword score, so both measures survive in the CSV.
    df["Cosine Similarity (%)"] = similarity_scores * 100
    df.to_csv("data/data.csv", index=False)
    return df
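
# Note: the vectorizer is fitted on the job descriptions only, so resume terms that
# appear in no posting fall outside the vocabulary and are ignored when the resume
# is transformed; scores reflect overlap in the postings' own vocabulary.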

def main(email, password, keywords, location, date_posted, resume_file):
    bot = LinkedInBot()
    bot.run(email, password, keywords, location, date_posted)

    df = pd.read_csv("data/data.csv")

    if resume_file:
        resume_text = process_resume(resume_file)
        # Use a separate name so the search keywords are not shadowed.
        resume_keywords = extract_keywords(resume_text)
        df = keyword_similarity_check(resume_text, df, resume_keywords)
        df = cosine_similarity_check(resume_text, df)

    return df

iface = gr.Interface(
    fn=main,
    inputs=[
        gr.Textbox(label="Email"),
        gr.Textbox(label="Password", type="password"),  # mask the credential in the UI
        gr.Textbox(label="Keywords"),
        gr.Textbox(label="Location"),
        gr.Textbox(label="Date Posted (f_TPR value, e.g. r86400 for past 24 hours)"),
        gr.File(label="Resume (.docx)"),
    ],
    outputs="dataframe",  # main() returns a pandas DataFrame
    title="LinkedIn Job Analysis",
    description="Enter your LinkedIn credentials and search criteria to scrape job postings. Upload a resume to check for job similarity.")

if __name__ == "__main__":
    iface.launch()
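
# By default Gradio serves the UI locally at http://127.0.0.1:7860; passing
# share=True to iface.launch() would create a temporary public link instead.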