Spaces:
Running
Running
from src.chat_model import ChatModel | |
from langchain_community.document_loaders import WebBaseLoader | |
from langchain_core.prompts import PromptTemplate | |
from langchain_core.output_parsers import JsonOutputParser | |
from langchain_core.exceptions import OutputParserException | |
from src.utils import clean_text | |
import json | |
import requests | |
class JobExtractor: | |
""" | |
A class responsible for extracting job posting details from a given job listing URL. The class uses | |
a prompt-based approach to process scraped text and extract relevant job details. | |
Attributes: | |
----------- | |
chat_model : ChatModel | |
An instance of the ChatModel to handle processing and extraction. | |
extract_prompt : PromptTemplate | |
The template used to instruct the model on how to process the scraped text. | |
json_parser : JsonOutputParser | |
The output parser to convert model responses into structured JSON format. | |
Methods: | |
-------- | |
parse_job_from_web(url: str) -> str: | |
Scrapes and cleans the content from a given job listing URL. | |
extract_jobdata(text: str) -> dict: | |
Extracts and parses the job data from the cleaned text into a structured JSON format. | |
""" | |
def __init__(self): | |
""" | |
Initializes the JobExtractor instance with the necessary models, prompt templates, | |
and output parsers. | |
""" | |
self.chat_model = ChatModel() | |
# Define the template to extract job data using the language model | |
self.extract_prompt = PromptTemplate.from_template( | |
""" | |
### SCRAPED TEXT FROM WEBSITE: | |
{page_data} | |
### INSTRUCTION: | |
The scraped text is from the career's page of a website. | |
Your job is to extract the job postings and return them in JSON format containing the following keys: | |
`role`, `experience`, `skills`, `responsibilities`, `basic qualifications`, | |
`preferred qualifications`, and `description`. | |
Only return the valid JSON. | |
If you do not find any data to form a JSON, return | |
```json{{'job_postings': []}}``` | |
### VALID JSON (NO PREAMBLE): | |
""" | |
) | |
self.json_parser = JsonOutputParser() | |
def parse_job_from_web(self, url): | |
""" | |
Scrapes and cleans the content from a given job listing URL. | |
Parameters: | |
----------- | |
url : str | |
The URL of the job listing page. | |
Returns: | |
-------- | |
str: | |
The cleaned text content extracted from the job listing page. | |
Raises: | |
------- | |
ValueError: If the content could not be loaded or cleaned properly. | |
""" | |
try: | |
headers = { | |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36" | |
} | |
loader = WebBaseLoader(url, headers) | |
page_data = loader.load().pop().page_content | |
# Check for blocking or unsupported browser messages | |
if "unsupported browser" in page_data.lower(): | |
raise ValueError(f"Unsupported browser message detected.") | |
# return None | |
if not page_data: | |
raise ValueError(f"Failed to fetch content from the URL {url}.") | |
print(f"===Page Data===\n {page_data}") | |
cleaned_data = clean_text(page_data) | |
print(f"=== Scraped and cleaned data ===\n {cleaned_data}...") # Displaying a snippet of data for debugging | |
return cleaned_data | |
except Exception as e: | |
print(f"WebBaseLoader Error: {e}") | |
# raise ValueError(f"Failed to fetch content from the URL {url}.") | |
return None | |
def extract_jobdata(self, text): | |
""" | |
Extracts and parses the job data from the cleaned text into a structured JSON format. | |
Parameters: | |
----------- | |
text : str | |
The cleaned text content from the job listing page. | |
Returns: | |
-------- | |
dict: | |
A dictionary containing the extracted job information in JSON format. | |
Raises: | |
------- | |
OutputParserException: If the extracted response cannot be parsed as valid JSON. | |
ValueError: If the extraction process fails. | |
""" | |
try: | |
extract_chain = self.extract_prompt | self.chat_model.groq | |
res = extract_chain.invoke(input={"page_data": text}) | |
print(f"=== Result Content ===\n {res.content}") | |
if not res.content.strip(): # Check if response is empty | |
raise ValueError("No valid job data extracted.") | |
try: | |
job_data = self.json_parser.parse(res.content) | |
print(f"=== JSON Job Data ===\n {job_data}") | |
return job_data | |
except json.decoder.JSONDecodeError: | |
print("Invalid JSON received. Returning empty job data.") | |
return {"job_postings": []} # Fail gracefully | |
except requests.exceptions.HTTPError as http_err: | |
if http_err.response.status_code == 413: | |
raise ValueError("The input is too large. Please reduce the size and try again.") | |
elif http_err.response.status_code == 429: | |
raise ValueError("Too many requests. Please try again later.") | |
else: | |
raise ValueError(f"HTTP error occurred: {http_err}") from http_err | |
except OutputParserException as e: | |
raise OutputParserException("Unable to parse job data as valid JSON.") from e | |
except Exception as e: | |
raise ValueError(f"An error occurred during job extraction: {e}") from e | |