Spaces:
Running
Running
File size: 5,832 Bytes
3d74a95 2544e0a 3d74a95 2544e0a 3d74a95 2544e0a 3d74a95 2544e0a 3d74a95 2544e0a 3d74a95 2544e0a 3d74a95 2544e0a 3d74a95 2544e0a 3d74a95 2544e0a 3d74a95 2544e0a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
from src.chat_model import ChatModel
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.exceptions import OutputParserException
from src.utils import clean_text
import json
import requests
class JobExtractor:
    """
    Extracts job posting details from a given job listing URL.

    Scrapes the page, cleans the text, and feeds it through a prompt-driven
    LLM chain that returns the postings as structured JSON.

    Attributes:
    -----------
    chat_model : ChatModel
        Wrapper around the LLM used for extraction (``.groq`` is the runnable).
    extract_prompt : PromptTemplate
        Template instructing the model how to turn scraped text into JSON.
    json_parser : JsonOutputParser
        Parses the model's raw response into a Python dict.

    Methods:
    --------
    parse_job_from_web(url: str) -> str | None:
        Scrapes and cleans the content from a given job listing URL;
        returns None on failure.
    extract_jobdata(text: str) -> dict:
        Extracts and parses the job data from the cleaned text into a
        structured JSON format.
    """
    def __init__(self):
        """
        Initializes the JobExtractor with the chat model, the extraction
        prompt template, and the JSON output parser.
        """
        self.chat_model = ChatModel()
        # Define the template to extract job data using the language model.
        # The model is told to emit ONLY valid JSON with the listed keys.
        self.extract_prompt = PromptTemplate.from_template(
            """
            ### SCRAPED TEXT FROM WEBSITE:
            {page_data}
            ### INSTRUCTION:
            The scraped text is from the career's page of a website.
            Your job is to extract the job postings and return them in JSON format containing the following keys:
            `role`, `experience`, `skills`, `responsibilities`, `basic qualifications`,
            `preferred qualifications`, and `description`.
            Only return the valid JSON.
            If you do not find any data to form a JSON, return
            ```json{{'job_postings': []}}```
            ### VALID JSON (NO PREAMBLE):
            """
        )
        self.json_parser = JsonOutputParser()

    def parse_job_from_web(self, url):
        """
        Scrapes and cleans the content from a given job listing URL.

        Parameters:
        -----------
        url : str
            The URL of the job listing page.

        Returns:
        --------
        str or None:
            The cleaned text content extracted from the page, or None if
            fetching/cleaning failed (errors are logged, not propagated).
        """
        try:
            # Browser-like User-Agent to avoid trivial bot blocking.
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
            }
            # Explicit keyword: the second WebBaseLoader parameter is
            # header_template (the original passed it positionally).
            loader = WebBaseLoader(url, header_template=headers)
            page_data = loader.load().pop().page_content
            # Check emptiness BEFORE inspecting the text (original inspected
            # first; harmless on "", but the ordering was misleading).
            if not page_data:
                raise ValueError(f"Failed to fetch content from the URL {url}.")
            # Some sites return a block page instead of the listing.
            if "unsupported browser" in page_data.lower():
                raise ValueError("Unsupported browser message detected.")
            print(f"===Page Data===\n {page_data}")
            cleaned_data = clean_text(page_data)
            print(f"=== Scraped and cleaned data ===\n {cleaned_data}...")  # Displaying a snippet of data for debugging
            return cleaned_data
        except Exception as e:
            # Best-effort scraping: log and signal failure with None so the
            # caller decides how to proceed.
            print(f"WebBaseLoader Error: {e}")
            return None

    def extract_jobdata(self, text):
        """
        Extracts and parses the job data from the cleaned text into a
        structured JSON format.

        Parameters:
        -----------
        text : str
            The cleaned text content from the job listing page.

        Returns:
        --------
        dict:
            The extracted job information, or ``{"job_postings": []}`` when
            the model's response cannot be parsed as JSON.

        Raises:
        -------
        ValueError: On empty model output, HTTP errors (413/429 mapped to
            friendly messages), or any other extraction failure.
        OutputParserException: Re-raised for parse failures outside the
            graceful-fallback path.
        """
        try:
            # Chain: fill the prompt with the page text, then invoke the LLM.
            extract_chain = self.extract_prompt | self.chat_model.groq
            res = extract_chain.invoke(input={"page_data": text})
            print(f"=== Result Content ===\n {res.content}")
            if not res.content.strip():  # Check if response is empty
                raise ValueError("No valid job data extracted.")
            try:
                job_data = self.json_parser.parse(res.content)
                print(f"=== JSON Job Data ===\n {job_data}")
                return job_data
            # BUG FIX: JsonOutputParser raises OutputParserException, not a
            # bare json.JSONDecodeError, so the original fallback below was
            # unreachable. Catch both so we actually fail gracefully.
            except (OutputParserException, json.JSONDecodeError):
                print("Invalid JSON received. Returning empty job data.")
                return {"job_postings": []}  # Fail gracefully
        except requests.exceptions.HTTPError as http_err:
            # Map common HTTP failures to actionable user-facing messages.
            if http_err.response.status_code == 413:
                raise ValueError("The input is too large. Please reduce the size and try again.")
            elif http_err.response.status_code == 429:
                raise ValueError("Too many requests. Please try again later.")
            else:
                raise ValueError(f"HTTP error occurred: {http_err}") from http_err
        except OutputParserException as e:
            raise OutputParserException("Unable to parse job data as valid JSON.") from e
        except Exception as e:
            raise ValueError(f"An error occurred during job extraction: {e}") from e
|