| import os |
| import asyncio |
| from fastapi import FastAPI, HTTPException |
| from pydantic import BaseModel |
| from typing import Optional, Dict |
| from uuid import uuid4 |
| from src.web_extractor import WebExtractor |
| from src.scrapers.playwright_scraper import ScraperConfig |
|
|
app = FastAPI()


# In-memory session registry: maps a session UUID -> its WebExtractor.
# NOTE(review): process-local only — sessions are lost on restart and are not
# shared across workers; confirm this service runs single-worker.
sessions: Dict[str, WebExtractor] = {}
|
|
class ScrapeRequest(BaseModel):
    """Request body for /api/scrape and /api/session/{session_id}/scrape."""

    url: str  # page to scrape
    query: str  # natural-language extraction instruction
    model_name: Optional[str] = "alias-fast"  # AI model alias used by WebExtractor
|
|
class SessionCreateRequest(BaseModel):
    """Request body for POST /api/session."""

    model_name: Optional[str] = "alias-fast"  # AI model alias used by WebExtractor
|
|
@app.get("/health")
async def health():
    """Liveness probe: report that the API process is up."""
    payload = {"status": "ok", "message": "CyberScraper 2077 API is running"}
    return payload
|
|
@app.get("/api-docs")
async def api_docs():
    """Comprehensive API documentation with examples"""
    # NOTE(review): this payload is hand-maintained; keep it in sync with the
    # routes defined below. The base_url is hard-coded to one deployment.
    return {
        "title": "CyberScraper 2077 API Documentation",
        "version": "1.0.0",
        "description": "Advanced web scraping API with session management and AI-powered content extraction",
        "base_url": "https://grazieprego-scrapling.hf.space",
        # One entry per route exposed by this module.
        "endpoints": {
            "health": {
                "method": "GET",
                "path": "/health",
                "description": "Check if the API is running",
                "response": {
                    "status": "ok",
                    "message": "CyberScraper 2077 API is running"
                },
                "example": "curl https://grazieprego-scrapling.hf.space/health"
            },
            "scrape": {
                "method": "POST",
                "path": "/api/scrape",
                "description": "Stateless scrape request - creates a new extractor for each request",
                "request_body": {
                    "url": "string - The URL to scrape",
                    "query": "string - The extraction query/instruction",
                    "model_name": "string (optional) - AI model to use (default: 'alias-fast')"
                },
                "response": {
                    "url": "string - The scraped URL",
                    "query": "string - The query used",
                    "response": "any - The extracted content"
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/scrape -H 'Content-Type: application/json' -d '{\"url\": \"https://example.com\", \"query\": \"Extract all product prices\"}'",
                    "python": "import requests\nresponse = requests.post('https://grazieprego-scrapling.hf.space/api/scrape', json={'url': 'https://example.com', 'query': 'Extract prices'})\nprint(response.json())"
                }
            },
            "create_session": {
                "method": "POST",
                "path": "/api/session",
                "description": "Create a persistent scraping session for multiple requests",
                "request_body": {
                    "model_name": "string (optional) - AI model to use (default: 'alias-fast')"
                },
                "response": {
                    "session_id": "string - UUID of the created session",
                    "message": "string - Confirmation message",
                    "model": "string - Model used"
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/session -H 'Content-Type: application/json' -d '{\"model_name\": \"alias-fast\"}'",
                    "python": "import requests\nsession = requests.post('https://grazieprego-scrapling.hf.space/api/session', json={'model_name': 'alias-fast'})\nsession_id = session.json()['session_id']"
                }
            },
            "session_scrape": {
                "method": "POST",
                "path": "/api/session/{session_id}/scrape",
                "description": "Scrape using an existing session context (more efficient for multiple requests)",
                "path_parameters": {
                    "session_id": "string - UUID of the session"
                },
                "request_body": {
                    "url": "string - The URL to scrape",
                    "query": "string - The extraction query",
                    "model_name": "string (optional)"
                },
                "response": {
                    "session_id": "string - The session ID",
                    "url": "string - The scraped URL",
                    "query": "string - The query used",
                    "response": "any - The extracted content"
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/session/uuid-here/scrape -H 'Content-Type: application/json' -d '{\"url\": \"https://example.com/page1\", \"query\": \"Extract titles\"}'",
                    "python": "import requests\nresponse = requests.post(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}/scrape', json={'url': 'https://example.com', 'query': 'Extract data'})\nprint(response.json())"
                }
            },
            "close_session": {
                "method": "DELETE",
                "path": "/api/session/{session_id}",
                "description": "Close a session and release resources",
                "path_parameters": {
                    "session_id": "string - UUID of the session to close"
                },
                "response": {
                    "message": "string - Confirmation message",
                    "session_id": "string - The closed session ID"
                },
                "example": {
                    "curl": "curl -X DELETE https://grazieprego-scrapling.hf.space/api/session/uuid-here",
                    "python": "import requests\nresponse = requests.delete(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}')\nprint(response.json())"
                }
            }
        },
        # Prose guidance for consumers of the API.
        "usage_guide": {
            "quick_start": [
                "1. Make a simple scrape request to /api/scrape",
                "2. For multiple requests, create a session first",
                "3. Use the session ID for subsequent requests",
                "4. Close sessions when done to free resources"
            ],
            "best_practices": [
                "Use stateless /api/scrape for one-off requests",
                "Use sessions for batch processing multiple URLs",
                "Always close sessions when finished",
                "Handle errors gracefully (500 errors may occur on complex sites)",
                "Set appropriate timeouts for slow-loading pages"
            ],
            "error_handling": {
                "404": "Session not found (for session endpoints)",
                "500": "Internal server error - check the detail message",
                "Common issues": [
                    "URL unreachable or timeout",
                    "JavaScript-heavy sites may require different approaches",
                    "Bot protection may block requests"
                ]
            }
        },
        # Copy-pasteable client snippets, embedded verbatim as strings.
        "integration_examples": {
            "python_script": """
import requests

# Stateless scrape
response = requests.post(
    'https://grazieprego-scrapling.hf.space/api/scrape',
    json={
        'url': 'https://example.com',
        'query': 'Extract all headings and prices'
    }
)
print("Result:", response.json())

# Session-based workflow
session_response = requests.post(
    'https://grazieprego-scrapling.hf.space/api/session',
    json={'model_name': 'alias-fast'}
)
session_id = session_response.json()['session_id']

try:
    # Multiple requests using the same session
    for url in ['https://example.com/page1', 'https://example.com/page2']:
        result = requests.post(
            f'https://grazieprego-scrapling.hf.space/api/session/{session_id}/scrape',
            json={'url': url, 'query': 'Extract product data'}
        )
        print(f"Scraped {url}:", result.json())
finally:
    # Always close the session
    requests.delete(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}')
""",
            "javascript": """
// Fetch API example
async function scrapeUrl(url, query) {
    const response = await fetch('https://grazieprego-scrapling.hf.space/api/scrape', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ url, query })
    });
    return await response.json();
}

// Usage
scrapeUrl('https://example.com', 'Extract all links').then(console.log);
"""
        },
        "rate_limits": {
            "note": "Rate limits may apply. Please use responsibly.",
            "recommendation": "For high-volume scraping, use session-based approach and implement retry logic"
        }
    }
|
|
@app.post("/api/scrape")
async def scrape(request: ScrapeRequest):
    """Stateless scrape request (creates a new extractor for each request).

    Builds a fresh WebExtractor, runs a single query against the requested
    URL, and tears the browser down before returning.

    Returns:
        dict with the echoed ``url``/``query`` and the extracted ``response``.

    Raises:
        HTTPException: 500 carrying the underlying error message on failure.
    """
    scraper_config = ScraperConfig(
        headless=True,
        max_retries=3,
        delay_after_load=5
    )

    extractor = WebExtractor(model_name=request.model_name, scraper_config=scraper_config)
    try:
        # WebExtractor expects the URL and the instruction as one query string.
        full_query = f"{request.url} {request.query}"
        response = await extractor.process_query(full_query)

        # Some extractor versions return (content, metadata); keep the content only.
        if isinstance(response, tuple):
            response = response[0]

        return {
            "url": request.url,
            "query": request.query,
            "response": response
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Single guarded cleanup for both success and error paths. The original
        # duplicated this in two places and the success path omitted the
        # hasattr(extractor, 'playwright_scraper') guard, risking AttributeError.
        scraper = getattr(extractor, 'playwright_scraper', None)
        if scraper is not None and hasattr(scraper, 'close'):
            await scraper.close()
|
|
@app.post("/api/session")
async def create_session(request: SessionCreateRequest):
    """Create a persistent scraping session and return its identifier."""
    session_id = str(uuid4())
    try:
        config = ScraperConfig(
            headless=True,
            max_retries=3,
            delay_after_load=5
        )
        # Register the extractor under the fresh UUID for later requests.
        sessions[session_id] = WebExtractor(
            model_name=request.model_name,
            scraper_config=config
        )
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Failed to create session: {str(exc)}")
    return {"session_id": session_id, "message": "Session created", "model": request.model_name}
|
|
@app.post("/api/session/{session_id}/scrape")
async def session_scrape(session_id: str, request: ScrapeRequest):
    """Scrape using an existing session's extractor context."""
    extractor = sessions.get(session_id)
    if extractor is None:
        raise HTTPException(status_code=404, detail="Session not found")

    try:
        # The extractor takes the URL and instruction as one combined query.
        result = await extractor.process_query(f"{request.url} {request.query}")
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))

    # Some extractor versions return (content, metadata); keep the content only.
    if isinstance(result, tuple):
        result = result[0]

    return {
        "session_id": session_id,
        "url": request.url,
        "query": request.query,
        "response": result
    }
|
|
@app.delete("/api/session/{session_id}")
async def close_session(session_id: str):
    """Close a session and release resources.

    Returns:
        dict confirming the closed ``session_id``.

    Raises:
        HTTPException: 404 if the session does not exist.
    """
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    # Remove the registry entry first so a failing close() cannot leave a
    # dead session behind in the map (the original deleted only after close).
    extractor = sessions.pop(session_id)
    # Mirror the defensive checks used elsewhere: some extractor variants may
    # not expose a playwright_scraper attribute at all.
    scraper = getattr(extractor, 'playwright_scraper', None)
    if scraper is not None and hasattr(scraper, 'close'):
        await scraper.close()
    return {"message": "Session closed", "session_id": session_id}
|
|
if __name__ == "__main__":
    import uvicorn

    # Honor the PORT environment variable (standard on container/PaaS hosts
    # such as Hugging Face Spaces); fall back to the original 8000 when unset.
    # This also puts the file-level `import os` to use.
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "8000")))
|
|