NeerajCodz committed on
Commit
ff3e1be
·
1 Parent(s): bf7914e

chore: initialize backend project structure with FastAPI

Browse files
backend/app/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """ScrapeRL Backend - FastAPI-based RL environment for web scraping."""
2
+
3
+ __version__ = "0.1.0"
backend/app/config.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration management using Pydantic Settings."""
2
+
3
+ from functools import lru_cache
4
+ from typing import Literal
5
+
6
+ from pydantic import Field, SecretStr
7
+ from pydantic_settings import BaseSettings, SettingsConfigDict
8
+
9
+
10
class Settings(BaseSettings):
    """Typed application configuration built on pydantic-settings.

    Each attribute below is a setting with a default value. The model config
    points at a UTF-8 ``.env`` file, matches names case-insensitively and
    ignores keys that do not correspond to a declared field.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # --- Application -------------------------------------------------------
    app_name: str = "ScrapeRL"
    app_version: str = "0.1.0"
    debug: bool = Field(default=False, description="Enable debug mode")
    log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO"

    # --- Server ------------------------------------------------------------
    host: str = "0.0.0.0"
    port: int = 8000
    reload: bool = False
    workers: int = 1

    # --- CORS --------------------------------------------------------------
    cors_origins: list[str] = Field(
        default=["http://localhost:3000", "http://localhost:5173"],
        description="Allowed CORS origins",
    )
    cors_allow_credentials: bool = True
    cors_allow_methods: list[str] = ["*"]
    cors_allow_headers: list[str] = ["*"]

    # --- LLM providers (a provider counts as configured when its key is set) -
    openai_api_key: SecretStr | None = Field(default=None, description="OpenAI API key")
    anthropic_api_key: SecretStr | None = Field(default=None, description="Anthropic API key")
    google_api_key: SecretStr | None = Field(default=None, description="Google AI API key")
    groq_api_key: SecretStr | None = Field(default=None, description="Groq API key")

    # --- Model defaults ----------------------------------------------------
    default_model: str = "gpt-4o-mini"
    default_temperature: float = 0.7
    max_tokens: int = 4096

    # --- Search providers --------------------------------------------------
    google_search_api_key: SecretStr | None = None
    google_search_engine_id: str | None = None
    bing_search_api_key: SecretStr | None = None

    # --- ChromaDB ----------------------------------------------------------
    chroma_persist_directory: str = "./data/chroma"
    chroma_collection_name: str = "scraperl_memory"

    # --- Episode settings --------------------------------------------------
    max_steps_per_episode: int = 50
    default_timeout_seconds: float = 30.0

    # --- Browser settings --------------------------------------------------
    headless_browser: bool = True
    browser_timeout_ms: int = 30000

    # --- Memory settings ---------------------------------------------------
    short_term_memory_size: int = 100
    working_memory_size: int = 20
    long_term_memory_top_k: int = 10

    # --- Reward weights (the defaults add up to 1.0) -----------------------
    reward_accuracy_weight: float = 0.4
    reward_efficiency_weight: float = 0.2
    reward_cost_weight: float = 0.2
    reward_completeness_weight: float = 0.2

    @property
    def available_providers(self) -> list[str]:
        """Return the names of LLM providers whose API key is set.

        Providers are reported in a fixed order: openai, anthropic, google,
        groq. A key counts as set when it is truthy.
        """
        candidates = (
            ("openai", self.openai_api_key),
            ("anthropic", self.anthropic_api_key),
            ("google", self.google_api_key),
            ("groq", self.groq_api_key),
        )
        return [provider for provider, api_key in candidates if api_key]
93
+
94
+
95
@lru_cache
def get_settings() -> Settings:
    """Return the shared ``Settings`` object.

    The function takes no arguments, so ``lru_cache`` builds ``Settings()``
    on the first call and hands back that same instance afterwards.
    """
    instance = Settings()
    return instance
backend/app/main.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI application entry point with CORS and lifespan management."""
2
+
3
+ import logging
4
+ from contextlib import asynccontextmanager
5
+ from typing import AsyncGenerator
6
+
7
+ import uvicorn
8
+ from fastapi import FastAPI
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+
11
+ from app.api.routes import agents, episode, health, memory, tasks, tools
12
+ from app.config import get_settings
13
+ from app.memory.manager import MemoryManager
14
+ from app.models.router import SmartModelRouter
15
+ from app.tools.registry import MCPToolRegistry
16
+ from app.utils.logging import setup_logging
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # Module-level singletons for dependency injection: assigned by lifespan() at startup and read through the get_* helpers below; None until then
21
+ _memory_manager: MemoryManager | None = None
22
+ _model_router: SmartModelRouter | None = None
23
+ _tool_registry: MCPToolRegistry | None = None
24
+
25
+
26
def get_memory_manager() -> MemoryManager:
    """Return the module-wide memory manager.

    Raises:
        RuntimeError: If the module-level instance is still ``None``.
    """
    manager = _memory_manager
    if manager is not None:
        return manager
    raise RuntimeError("Memory manager not initialized")
31
+
32
+
33
def get_model_router() -> SmartModelRouter:
    """Return the module-wide model router.

    Raises:
        RuntimeError: If the module-level instance is still ``None``.
    """
    router = _model_router
    if router is not None:
        return router
    raise RuntimeError("Model router not initialized")
38
+
39
+
40
def get_tool_registry() -> MCPToolRegistry:
    """Return the module-wide tool registry.

    Raises:
        RuntimeError: If the module-level instance is still ``None``.
    """
    registry = _tool_registry
    if registry is not None:
        return registry
    raise RuntimeError("Tool registry not initialized")
45
+
46
+
47
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """Manage application lifespan - startup and shutdown events.

    Startup constructs and initializes the memory manager, the model router
    and the tool registry, in that order, storing each in its module-level
    global. Shutdown runs in a ``finally`` block, so it also happens when
    startup fails partway through or an exception is raised at the ``yield``.
    Only components whose ``initialize()`` completed are shut down; a failure
    in one ``shutdown()`` is logged and does not stop the others. The globals
    are reset to ``None`` afterwards.

    Args:
        app: The FastAPI application this lifespan is attached to. It is not
            referenced in the body.

    Yields:
        None, once all three components have been initialized.
    """
    global _memory_manager, _model_router, _tool_registry

    settings = get_settings()
    logger.info(f"Starting {settings.app_name} v{settings.app_version}")

    # Components whose initialize() finished, in start order. Tracking them
    # separately from the globals avoids calling shutdown() on an object
    # whose initialization raised.
    started: list[tuple[str, MemoryManager | SmartModelRouter | MCPToolRegistry]] = []

    try:
        logger.info("Initializing memory manager...")
        _memory_manager = MemoryManager(settings)
        await _memory_manager.initialize()
        started.append(("memory manager", _memory_manager))

        logger.info("Initializing model router...")
        _model_router = SmartModelRouter(settings)
        await _model_router.initialize()
        started.append(("model router", _model_router))

        logger.info("Initializing tool registry...")
        _tool_registry = MCPToolRegistry()
        await _tool_registry.initialize()
        started.append(("tool registry", _tool_registry))

        logger.info("Application startup complete")

        yield
    finally:
        logger.info("Shutting down application...")

        for label, component in started:
            try:
                await component.shutdown()
            except Exception:
                # Keep going so one failing component cannot leak the rest.
                logger.exception("Failed to shut down %s", label)

        # Make the get_* accessors raise instead of returning shut-down objects.
        _memory_manager = None
        _model_router = None
        _tool_registry = None

        logger.info("Application shutdown complete")
83
+
84
+
85
def create_app() -> FastAPI:
    """Build the FastAPI application.

    Configures logging from the settings' log level, creates the app with the
    ``lifespan`` handler, adds the CORS middleware from the CORS settings and
    mounts every API router under the ``/api`` prefix.

    Returns:
        The configured FastAPI instance.
    """
    cfg = get_settings()
    setup_logging(cfg.log_level)

    application = FastAPI(
        title=cfg.app_name,
        description="FastAPI-based RL environment for intelligent web scraping",
        version=cfg.app_version,
        debug=cfg.debug,
        lifespan=lifespan,
        docs_url="/docs",
        redoc_url="/redoc",
        openapi_url="/openapi.json",
    )

    application.add_middleware(
        CORSMiddleware,
        allow_origins=cfg.cors_origins,
        allow_credentials=cfg.cors_allow_credentials,
        allow_methods=cfg.cors_allow_methods,
        allow_headers=cfg.cors_allow_headers,
    )

    # (route module, OpenAPI tag) pairs, mounted in this order.
    routed_modules = (
        (health, "Health"),
        (episode, "Episode"),
        (tasks, "Tasks"),
        (agents, "Agents"),
        (tools, "Tools"),
        (memory, "Memory"),
    )
    for module, tag in routed_modules:
        application.include_router(module.router, prefix="/api", tags=[tag])

    return application
120
+
121
+
122
+ # Module-level application instance; run() passes uvicorn the import string "app.main:app", which refers to this name
123
+ app = create_app()
124
+
125
+
126
def run() -> None:
    """Start uvicorn for the ``app.main:app`` import string.

    Host, port, reload flag and worker count come straight from the settings;
    the log level is passed to uvicorn in lower case.
    """
    cfg = get_settings()
    uvicorn_log_level = cfg.log_level.lower()
    uvicorn.run(
        "app.main:app",
        host=cfg.host,
        port=cfg.port,
        reload=cfg.reload,
        workers=cfg.workers,
        log_level=uvicorn_log_level,
    )


# Start the server when this module is executed as a script.
if __name__ == "__main__":
    run()
backend/pyproject.toml ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "scraperl-backend"
7
+ version = "0.1.0"
8
+ description = "FastAPI-based RL environment for intelligent web scraping"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.11"
12
+ authors = [
13
+ { name = "ScrapeRL Team" }
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Framework :: FastAPI",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
24
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
25
+ ]
26
+ dependencies = [
27
+ "fastapi>=0.109.0",
28
+ "uvicorn[standard]>=0.27.0",
29
+ "pydantic>=2.5.0",
30
+ "pydantic-settings>=2.1.0",
31
+ "httpx>=0.26.0",
32
+ "chromadb>=0.4.22",
33
+ "beautifulsoup4>=4.12.0",
34
+ "lxml>=5.1.0",
35
+ "openai>=1.10.0",
36
+ "anthropic>=0.18.0",
37
+ "google-generativeai>=0.4.0",
38
+ "groq>=0.4.0",
39
+ "playwright>=1.41.0",
40
+ "tiktoken>=0.5.0",
41
+ "numpy>=1.26.0",
42
+ "tenacity>=8.2.0",
43
+ "structlog>=24.1.0",
44
+ "python-dotenv>=1.0.0",
45
+ ]
46
+
47
+ [project.optional-dependencies]
48
+ dev = [
49
+ "pytest>=8.0.0",
50
+ "pytest-asyncio>=0.23.0",
51
+ "pytest-cov>=4.1.0",
52
+ "httpx>=0.26.0",
53
+ "ruff>=0.2.0",
54
+ "mypy>=1.8.0",
55
+ "pre-commit>=3.6.0",
56
+ ]
57
+
58
+ [project.scripts]
59
+ scraperl = "app.main:run"
60
+
61
+ [tool.hatch.build.targets.wheel]
62
+ packages = ["app"]
63
+
64
+ [tool.pytest.ini_options]
65
+ asyncio_mode = "auto"
66
+ testpaths = ["tests"]
67
+ addopts = "-v --tb=short"
68
+
69
+ [tool.ruff]
70
+ target-version = "py311"
71
+ line-length = 100
72
+
73
+ [tool.ruff.lint]
74
+ select = ["E", "F", "W", "I", "UP", "B", "C4", "SIM"]
75
+ ignore = ["E501"]
76
+
77
+ [tool.mypy]
78
+ python_version = "3.11"
79
+ strict = true
80
+ warn_return_any = true
81
+ warn_unused_ignores = true
backend/requirements.txt ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Framework
2
+ fastapi>=0.109.0
3
+ uvicorn[standard]>=0.27.0
4
+ pydantic>=2.5.0
5
+ pydantic-settings>=2.1.0
6
+
7
+ # HTTP Client
8
+ httpx>=0.26.0
9
+
10
+ # Vector Database
11
+ chromadb>=0.4.22
12
+
13
+ # HTML Processing
14
+ beautifulsoup4>=4.12.0
15
+ lxml>=5.1.0
16
+
17
+ # Search Providers
18
+ duckduckgo-search>=6.0.0
19
+
20
+ # LLM Providers
21
+ openai>=1.10.0
22
+ anthropic>=0.18.0
23
+ google-generativeai>=0.4.0
24
+ groq>=0.4.0
25
+
26
+ # Browser Automation
27
+ playwright>=1.41.0
28
+
29
+ # Tokenization
30
+ tiktoken>=0.5.0
31
+
32
+ # Utilities
33
+ numpy>=1.26.0
34
+ tenacity>=8.2.0
35
+ structlog>=24.1.0
36
+ python-dotenv>=1.0.0
37
+
38
+ # Development
39
+ pytest>=8.0.0
40
+ pytest-asyncio>=0.23.0
41
+ pytest-cov>=4.1.0
42
+ ruff>=0.2.0
43
+ mypy>=1.8.0