import json

import requests
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.exceptions import OutputParserException
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

from src.chat_model import ChatModel
from src.utils import clean_text


class JobExtractor:
    """
    A class responsible for extracting job posting details from a given job listing URL.
    It uses a prompt-based approach to process scraped text and extract relevant job details.

    Attributes:
    -----------
    chat_model : ChatModel
        An instance of ChatModel used to run the extraction prompt.
    extract_prompt : PromptTemplate
        The template used to instruct the model on how to process the scraped text.
    json_parser : JsonOutputParser
        The output parser that converts model responses into structured JSON.

    Methods:
    --------
    parse_job_from_web(url: str) -> str | None:
        Scrapes and cleans the content from a given job listing URL.
    extract_jobdata(text: str) -> dict:
        Extracts and parses the job data from the cleaned text into a structured JSON format.
    """
    def __init__(self):
        """
        Initializes the JobExtractor instance with the necessary model, prompt template,
        and output parser.
        """
        self.chat_model = ChatModel()

        # Prompt that instructs the language model to turn scraped page text into JSON.
        self.extract_prompt = PromptTemplate.from_template(
            """
            ### SCRAPED TEXT FROM WEBSITE:
            {page_data}
            ### INSTRUCTION:
            The scraped text is from the careers page of a website.
            Your job is to extract the job postings and return them in JSON format containing the following keys:
            `role`, `experience`, `skills`, `responsibilities`, `basic qualifications`,
            `preferred qualifications`, and `description`.
            Only return the valid JSON.
            If you do not find any data to form a JSON, return
            ```json{{"job_postings": []}}```
            ### VALID JSON (NO PREAMBLE):
            """
        )

        self.json_parser = JsonOutputParser()

    def parse_job_from_web(self, url):
        """
        Scrapes and cleans the content from a given job listing URL.

        Parameters:
        -----------
        url : str
            The URL of the job listing page.

        Returns:
        --------
        str or None:
            The cleaned text content extracted from the job listing page, or None if
            the page could not be fetched or cleaned.
        """
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
            }
            loader = WebBaseLoader(url, header_template=headers)
            page_data = loader.load().pop().page_content

            if not page_data:
                raise ValueError(f"Failed to fetch content from the URL {url}.")

            # Some sites serve a block page instead of the job listing.
            if "unsupported browser" in page_data.lower():
                raise ValueError("Unsupported browser message detected.")

            print(f"=== Page Data ===\n{page_data}")

            cleaned_data = clean_text(page_data)
            print(f"=== Scraped and cleaned data ===\n{cleaned_data}")  # Debug output of the cleaned page text

            return cleaned_data
        except Exception as e:
            print(f"WebBaseLoader Error: {e}")
            return None

    def extract_jobdata(self, text):
        """
        Extracts and parses the job data from the cleaned text into a structured JSON format.

        Parameters:
        -----------
        text : str
            The cleaned text content from the job listing page.

        Returns:
        --------
        dict:
            A dictionary containing the extracted job information in JSON format.

        Raises:
        -------
        OutputParserException: If the extracted response cannot be parsed as valid JSON.
        ValueError: If the extraction process fails.
        """
        try:
            extract_chain = self.extract_prompt | self.chat_model.groq
            res = extract_chain.invoke(input={"page_data": text})
            print(f"=== Result Content ===\n{res.content}")

            if not res.content.strip():  # Check if the response is empty
                raise ValueError("No valid job data extracted.")

            try:
                job_data = self.json_parser.parse(res.content)
                print(f"=== JSON Job Data ===\n{job_data}")
                return job_data
            except (OutputParserException, json.JSONDecodeError):
                # JsonOutputParser raises OutputParserException on malformed JSON;
                # fail gracefully with an empty result instead of propagating.
                print("Invalid JSON received. Returning empty job data.")
                return {"job_postings": []}
        except requests.exceptions.HTTPError as http_err:
            if http_err.response.status_code == 413:
                raise ValueError("The input is too large. Please reduce the size and try again.") from http_err
            elif http_err.response.status_code == 429:
                raise ValueError("Too many requests. Please try again later.") from http_err
            else:
                raise ValueError(f"HTTP error occurred: {http_err}") from http_err
        except OutputParserException as e:
            raise OutputParserException("Unable to parse job data as valid JSON.") from e
        except Exception as e:
            raise ValueError(f"An error occurred during job extraction: {e}") from e