Spaces:

tejacherukuri
/

ProSpectAI

Running

ProSpectAI / src /job_extractor.py

TejaCherukuri

bug fixes and added feature

2544e0a 5 months ago

5.83 kB

	from src.chat_model import ChatModel
	from langchain_community.document_loaders import WebBaseLoader
	from langchain_core.prompts import PromptTemplate
	from langchain_core.output_parsers import JsonOutputParser
	from langchain_core.exceptions import OutputParserException
	from src.utils import clean_text
	import json
	import requests


	class JobExtractor:
	"""
	A class responsible for extracting job posting details from a given job listing URL. The class uses
	a prompt-based approach to process scraped text and extract relevant job details.

	Attributes:
	-----------
	chat_model : ChatModel
	An instance of the ChatModel to handle processing and extraction.
	extract_prompt : PromptTemplate
	The template used to instruct the model on how to process the scraped text.
	json_parser : JsonOutputParser
	The output parser to convert model responses into structured JSON format.

	Methods:
	--------
	parse_job_from_web(url: str) -> str:
	Scrapes and cleans the content from a given job listing URL.

	extract_jobdata(text: str) -> dict:
	Extracts and parses the job data from the cleaned text into a structured JSON format.
	"""

	def __init__(self):
	"""
	Initializes the JobExtractor instance with the necessary models, prompt templates,
	and output parsers.
	"""
	self.chat_model = ChatModel()

	# Define the template to extract job data using the language model
	self.extract_prompt = PromptTemplate.from_template(
	"""
	### SCRAPED TEXT FROM WEBSITE:
	{page_data}
	### INSTRUCTION:
	The scraped text is from the career's page of a website.
	Your job is to extract the job postings and return them in JSON format containing the following keys:
	`role`, `experience`, `skills`, `responsibilities`, `basic qualifications`,
	`preferred qualifications`, and `description`.
	Only return the valid JSON.
	If you do not find any data to form a JSON, return
	```json{{'job_postings': []}}```
	### VALID JSON (NO PREAMBLE):
	"""
	)

	self.json_parser = JsonOutputParser()

	def parse_job_from_web(self, url):
	"""
	Scrapes and cleans the content from a given job listing URL.

	Parameters:
	-----------
	url : str
	The URL of the job listing page.

	Returns:
	--------
	str:
	The cleaned text content extracted from the job listing page.

	Raises:
	-------
	ValueError: If the content could not be loaded or cleaned properly.
	"""
	try:
	headers = {
	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
	}
	loader = WebBaseLoader(url, headers)
	page_data = loader.load().pop().page_content

	# Check for blocking or unsupported browser messages
	if "unsupported browser" in page_data.lower():
	raise ValueError(f"Unsupported browser message detected.")
	# return None

	if not page_data:
	raise ValueError(f"Failed to fetch content from the URL {url}.")

	print(f"===Page Data===\n {page_data}")

	cleaned_data = clean_text(page_data)
	print(f"=== Scraped and cleaned data ===\n {cleaned_data}...") # Displaying a snippet of data for debugging
	return cleaned_data
	except Exception as e:
	print(f"WebBaseLoader Error: {e}")
	# raise ValueError(f"Failed to fetch content from the URL {url}.")
	return None


	def extract_jobdata(self, text):
	"""
	Extracts and parses the job data from the cleaned text into a structured JSON format.

	Parameters:
	-----------
	text : str
	The cleaned text content from the job listing page.

	Returns:
	--------
	dict:
	A dictionary containing the extracted job information in JSON format.

	Raises:
	-------
	OutputParserException: If the extracted response cannot be parsed as valid JSON.
	ValueError: If the extraction process fails.
	"""
	try:
	extract_chain = self.extract_prompt \| self.chat_model.groq
	res = extract_chain.invoke(input={"page_data": text})

	print(f"=== Result Content ===\n {res.content}")

	if not res.content.strip(): # Check if response is empty
	raise ValueError("No valid job data extracted.")

	try:
	job_data = self.json_parser.parse(res.content)
	print(f"=== JSON Job Data ===\n {job_data}")
	return job_data
	except json.decoder.JSONDecodeError:
	print("Invalid JSON received. Returning empty job data.")
	return {"job_postings": []} # Fail gracefully

	except requests.exceptions.HTTPError as http_err:
	if http_err.response.status_code == 413:
	raise ValueError("The input is too large. Please reduce the size and try again.")
	elif http_err.response.status_code == 429:
	raise ValueError("Too many requests. Please try again later.")
	else:
	raise ValueError(f"HTTP error occurred: {http_err}") from http_err
	except OutputParserException as e:
	raise OutputParserException("Unable to parse job data as valid JSON.") from e
	except Exception as e:
	raise ValueError(f"An error occurred during job extraction: {e}") from e