File size: 5,832 Bytes
3d74a95
 
 
 
 
 
2544e0a
 
 
3d74a95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2544e0a
 
3d74a95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2544e0a
 
 
 
3d74a95
2544e0a
 
 
 
 
 
3d74a95
2544e0a
 
 
 
3d74a95
2544e0a
3d74a95
 
2544e0a
 
 
 
3d74a95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2544e0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d74a95
2544e0a
3d74a95
 
 
2544e0a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from src.chat_model import ChatModel
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.exceptions import OutputParserException
from src.utils import clean_text
import json
import requests


class JobExtractor:
    """
    Extracts job-posting details from a job listing URL.

    The page is scraped with `WebBaseLoader`, cleaned with `clean_text`,
    then pushed through a prompt + LLM chain whose response is parsed
    into structured JSON.

    Attributes:
    -----------
    chat_model : ChatModel
        Wrapper exposing the LLM endpoint (`.groq`) used for extraction.
    extract_prompt : PromptTemplate
        Template instructing the model how to turn scraped text into JSON.
    json_parser : JsonOutputParser
        Parses the raw model response into a Python dict.

    Methods:
    --------
    parse_job_from_web(url: str) -> str | None:
        Scrapes and cleans the content from a given job listing URL.

    extract_jobdata(text: str) -> dict:
        Extracts and parses the job data from the cleaned text into a
        structured JSON format.
    """

    def __init__(self):
        """
        Initializes the JobExtractor instance with the chat model, the
        extraction prompt template, and the JSON output parser.
        """
        self.chat_model = ChatModel()

        # Prompt contract: scraped page text in, strict JSON out (no preamble).
        self.extract_prompt = PromptTemplate.from_template(
            """
            ### SCRAPED TEXT FROM WEBSITE:
            {page_data}
            ### INSTRUCTION:
            The scraped text is from the career's page of a website.
            Your job is to extract the job postings and return them in JSON format containing the following keys: 
            `role`, `experience`, `skills`, `responsibilities`, `basic qualifications`, 
            `preferred qualifications`, and `description`.
            Only return the valid JSON.
            If you do not find any data to form a JSON, return 
            ```json{{'job_postings': []}}```
            ### VALID JSON (NO PREAMBLE):
            """
        )

        self.json_parser = JsonOutputParser()

    def parse_job_from_web(self, url):
        """
        Scrapes and cleans the content from a given job listing URL.

        Parameters:
        -----------
        url : str
            The URL of the job listing page.

        Returns:
        --------
        str or None:
            The cleaned text content extracted from the page, or None if
            fetching/cleaning failed. Failures are logged and swallowed
            (best-effort scrape) rather than raised, so the caller decides
            how to handle a bad URL.
        """
        try:
            # Many career sites block the default python User-Agent outright,
            # so present a realistic browser UA.
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
            }
            loader = WebBaseLoader(url, headers)
            page_data = loader.load().pop().page_content

            # BUGFIX: validate that content exists *before* inspecting its
            # text (the original checked the blocking message first).
            if not page_data:
                raise ValueError(f"Failed to fetch content from the URL {url}.")

            # Some sites serve a block page with a 200 status instead of 4xx.
            if "unsupported browser" in page_data.lower():
                raise ValueError("Unsupported browser message detected.")

            print(f"===Page Data===\n {page_data}")

            cleaned_data = clean_text(page_data)
            print(f"=== Scraped and cleaned data ===\n {cleaned_data}...")  # Displaying a snippet of data for debugging
            return cleaned_data
        except Exception as e:
            # Best-effort: log the failure and return None instead of raising.
            print(f"WebBaseLoader Error: {e}")
            return None

    def extract_jobdata(self, text):
        """
        Extracts and parses the job data from the cleaned text into a
        structured JSON format.

        Parameters:
        -----------
        text : str
            The cleaned text content from the job listing page.

        Returns:
        --------
        dict:
            The extracted job information, or ``{"job_postings": []}`` when
            the model response is not valid JSON (graceful fallback).

        Raises:
        -------
        ValueError: If the extraction process fails (including HTTP 413/429
            from the model endpoint, translated to user-facing messages).
        """
        try:
            extract_chain = self.extract_prompt | self.chat_model.groq
            res = extract_chain.invoke(input={"page_data": text})

            print(f"=== Result Content ===\n {res.content}")

            if not res.content.strip():  # Empty model response: nothing to parse
                raise ValueError("No valid job data extracted.")

            try:
                job_data = self.json_parser.parse(res.content)
                print(f"=== JSON Job Data ===\n {job_data}")
                return job_data
            except (OutputParserException, json.decoder.JSONDecodeError):
                # BUGFIX: JsonOutputParser raises OutputParserException (not
                # JSONDecodeError), so the original fallback never fired and
                # malformed model output escalated instead of degrading
                # gracefully. Catch both to be safe.
                print("Invalid JSON received. Returning empty job data.")
                return {"job_postings": []}  # Fail gracefully

        except requests.exceptions.HTTPError as http_err:
            # BUGFIX: http_err.response can be None (connection-level error);
            # guard before dereferencing status_code.
            status = http_err.response.status_code if http_err.response is not None else None
            if status == 413:
                raise ValueError("The input is too large. Please reduce the size and try again.") from http_err
            elif status == 429:
                raise ValueError("Too many requests. Please try again later.") from http_err
            else:
                raise ValueError(f"HTTP error occurred: {http_err}") from http_err
        except ValueError:
            # Preserve our own specific messages instead of re-wrapping them
            # in the generic "An error occurred" text.
            raise
        except Exception as e:
            raise ValueError(f"An error occurred during job extraction: {e}") from e