"""Application configuration: database, scraping and extraction settings.

Importing this module calls ``load_dotenv()`` and builds the module-level
``settings`` object, so the configuration is ready to use right after import.
"""

import os
from typing import Dict, List, Optional

from dotenv import load_dotenv
from pydantic import BaseModel, Field

# Must run before any configuration object is created so that values supplied
# through python-dotenv are visible to the os.getenv() calls below.
load_dotenv()

# Environment variable names double as the DatabaseConfig attribute names.
_DATABASE_ENV_VARS = (
    "mongo_uri",
    "mongo_db",
    "neo4j_uri",
    "neo4j_user",
    "neo4j_password",
)


def _env_default(name: str):
    """Return a pydantic field whose default is ``os.getenv(name)``.

    The lookup is deferred to model instantiation (via ``default_factory``)
    instead of being frozen when the class body is executed.
    """
    return Field(default_factory=lambda: os.getenv(name))


class DatabaseConfig(BaseModel):
    """Connection settings for MongoDB and Neo4j, sourced from the environment.

    Every field is ``None`` when its environment variable is not set, hence
    the ``Optional[str]`` annotations.
    """

    mongo_uri: Optional[str] = _env_default("mongo_uri")
    mongo_db: Optional[str] = _env_default("mongo_db")
    neo4j_uri: Optional[str] = _env_default("neo4j_uri")
    neo4j_user: Optional[str] = _env_default("neo4j_user")
    neo4j_password: Optional[str] = _env_default("neo4j_password")


class ScrapingConfig(BaseModel):
    """Tunable parameters for the page scraper."""

    # NOTE(review): unit is not stated here; 30000 looks like milliseconds - confirm.
    timeout: int = 30000
    wait_for_selector: str = "body"
    headless: bool = True
    user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    max_retries: int = 3
    # NOTE(review): presumably seconds - confirm against the consumer.
    delay_between_requests: float = 1.0


class ExtractionConfig(BaseModel):
    """Selectors and thresholds used when extracting content from a page."""

    content_selectors: List[str] = [
        "article",
        "main",
        ".content",
        "#content",
        ".post",
        ".article-body",
        "p",
        "h1",
        "h2",
        "h3",
    ]
    ignore_selectors: List[str] = [
        "script",
        "style",
        "nav",
        "footer",
        "header",
        ".advertisement",
        ".ads",
        ".sidebar",
    ]
    min_text_length: int = 50
    extract_images: bool = True
    extract_links: bool = True


class Settings:
    """Aggregate of all configuration sections.

    Attributes:
        database: ``DatabaseConfig`` instance.
        scraping: ``ScrapingConfig`` instance.
        extraction: ``ExtractionConfig`` instance.
    """

    def __init__(self):
        self.database = DatabaseConfig()
        self.scraping = ScrapingConfig()
        self.extraction = ExtractionConfig()

    def update_from_env(self):
        """Overwrite database settings with the current environment values.

        A variable that is unset or empty leaves the existing value untouched.
        """
        for name in _DATABASE_ENV_VARS:
            value = os.getenv(name)
            if value:
                setattr(self.database, name, value)


settings = Settings()
settings.update_from_env()