# Spaces: Sleeping  (web-page residue captured with this file; not part of the module)
import os
from typing import Dict, List, Optional

from dotenv import load_dotenv
from pydantic import BaseModel
# Populate the process environment from a .env file *before* the config
# classes below are defined: their field defaults call os.getenv() while the
# class bodies execute, so the variables must already be present by then.
load_dotenv()
class DatabaseConfig(BaseModel):
    """Database settings (``mongo_*`` and ``neo4j_*`` fields) read from the environment.

    Each field defaults to the value of the environment variable with the
    same name. The defaults are computed once, while this class body is
    executed, so the environment has to be populated before this module is
    imported.

    ``os.getenv`` returns ``None`` for a variable that is not set, so every
    field is declared ``Optional[str]``. Consumers should check for ``None``
    before handing a value to a database driver.
    """

    # Default: value of the ``mongo_uri`` environment variable, or None.
    mongo_uri: Optional[str] = os.getenv("mongo_uri")
    # Default: value of the ``mongo_db`` environment variable, or None.
    mongo_db: Optional[str] = os.getenv("mongo_db")
    # Default: value of the ``neo4j_uri`` environment variable, or None.
    neo4j_uri: Optional[str] = os.getenv("neo4j_uri")
    # Default: value of the ``neo4j_user`` environment variable, or None.
    neo4j_user: Optional[str] = os.getenv("neo4j_user")
    # Default: value of the ``neo4j_password`` environment variable, or None.
    neo4j_password: Optional[str] = os.getenv("neo4j_password")
class ScrapingConfig(BaseModel):
    """Default settings for fetching pages.

    All values are plain literals; nothing here is read from the environment.
    """

    # NOTE(review): unit is not stated in this file -- presumably
    # milliseconds; confirm against the code that consumes it.
    timeout: int = 30_000
    # CSS selector stored for the consumer to wait on.
    wait_for_selector: str = "body"
    headless: bool = True
    user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    max_retries: int = 3
    # NOTE(review): presumably seconds -- confirm against the consumer.
    delay_between_requests: float = 1.0
class ExtractionConfig(BaseModel):
    """Default settings for extracting content from a fetched page.

    Holds two lists of CSS selectors plus a length threshold and two feature
    flags. How they are applied is decided by the consumer of this config.
    """

    # Selectors listed as content candidates.
    content_selectors: List[str] = [
        "article",
        "main",
        ".content",
        "#content",
        ".post",
        ".article-body",
        "p",
        "h1",
        "h2",
        "h3",
    ]
    # Selectors listed as elements to ignore.
    ignore_selectors: List[str] = [
        "script",
        "style",
        "nav",
        "footer",
        "header",
        ".advertisement",
        ".ads",
        ".sidebar",
    ]
    min_text_length: int = 50
    extract_images: bool = True
    extract_links: bool = True
class Settings:
    """Bundle of the database, scraping and extraction configuration objects."""

    # Names of the environment variables consulted by ``update_from_env``.
    # Each name is also the ``self.database`` attribute that it overrides.
    _ENV_DATABASE_FIELDS = (
        "mongo_uri",
        "mongo_db",
        "neo4j_uri",
        "neo4j_user",
        "neo4j_password",
    )

    def __init__(self):
        """Create each configuration section with its default values."""
        self.database = DatabaseConfig()
        self.scraping = ScrapingConfig()
        self.extraction = ExtractionConfig()

    def update_from_env(self):
        """Overwrite database settings with values currently in the environment.

        For every name in ``_ENV_DATABASE_FIELDS`` the environment variable is
        read once. A non-empty value replaces the attribute of the same name
        on ``self.database``; an unset or empty variable leaves the existing
        value untouched.
        """
        for field_name in self._ENV_DATABASE_FIELDS:
            env_value = os.getenv(field_name)
            # Truthiness check on purpose: both "unset" (None) and "set but
            # empty" ("") must keep the current value.
            if env_value:
                setattr(self.database, field_name, env_value)
# Module-level settings object, created on import. The explicit refresh
# re-reads the database variables from the current environment and overrides
# any non-empty ones on top of the defaults.
settings = Settings()
settings.update_from_env()