abdo-Mansour's picture
constructor comment (#2)
cae5a7f verified
from web2json.ai_extractor import *
from web2json.postprocessor import *
from web2json.preprocessor import *
from pydantic import BaseModel
class Pipeline:
    """Three-stage pipeline: preprocess, AI extraction, then post-processing.

    The three stages are injected collaborators; this class only sequences
    their calls and prints a short progress trace after each stage.
    """

    # Divider printed after each stage's trace line.
    _SEPARATOR = '+' * 80
    # Maximum number of characters of intermediate data shown in the trace.
    _PREVIEW_CHARS = 100

    # constructor
    def __init__(self,
                 preprocessor: Preprocessor,
                 ai_extractor: AIExtractor,
                 postprocessor: PostProcessor):
        """Store the stage objects used by ``run``.

        Args:
            preprocessor: Object whose ``preprocess(content, is_url)`` is
                called first.
            ai_extractor: Object whose ``extract(preprocessed, schema)`` is
                called second.
            postprocessor: Object whose ``process(extracted)`` is called last.
        """
        self.preprocessor = preprocessor
        self.ai_extractor = ai_extractor
        self.postprocessor = postprocessor

    @staticmethod
    def _preview(value, limit: int = 100) -> str:
        """Return at most *limit* characters of *value* for trace output.

        Strings are sliced directly. Any other object is converted with
        ``str()`` first, so a non-sliceable stage result (for example a
        dict) cannot make a purely diagnostic print abort the pipeline.
        """
        text = value if isinstance(value, str) else str(value)
        return text[:limit]

    def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
        """
        Run the entire pipeline: preprocess, extract, and postprocess.

        A truncated preview of each intermediate result, and the full final
        result, is printed to stdout, each followed by a separator line.

        Args:
            content (str): The raw content to process.
            is_url (bool): Whether the content is a URL or raw text.
            schema (BaseModel): The schema defining the structure of the
                expected output; forwarded unchanged to the extractor.
        Returns:
            dict: The final structured data after processing, i.e. whatever
            ``postprocessor.process`` returns.
        """
        # Step 1: Preprocess the content
        preprocessed_content = self.preprocessor.preprocess(content, is_url)
        preview = self._preview(preprocessed_content, self._PREVIEW_CHARS)
        print(f"Preprocessed content: {preview}...")
        print(self._SEPARATOR)

        # Step 2: Extract structured information using AI
        extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
        # Preview via the helper: the extractor's result is not guaranteed to
        # be a string, and slicing it directly would raise for dict-like data.
        preview = self._preview(extracted_data, self._PREVIEW_CHARS)
        print(f"Extracted data: {preview}...")
        print(self._SEPARATOR)

        # Step 3: Post-process the extracted data
        final_output = self.postprocessor.process(extracted_data)
        print(f"Final output: {final_output}")
        print(self._SEPARATOR)

        return final_output