# scrapling / api.py
# (GraziePrego — commit ce40d2a, verified:
#  "Add comprehensive /api-docs endpoint with examples and usage guide")
import os
import asyncio
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, Dict
from uuid import uuid4
from src.web_extractor import WebExtractor
from src.scrapers.playwright_scraper import ScraperConfig
# FastAPI application instance that serves all endpoints below.
app = FastAPI()
# Store active sessions: session UUID -> its persistent WebExtractor.
# NOTE(review): module-level dict, so state is per-process only — confirm
# the app is deployed with a single worker, or sessions will not be shared.
sessions: Dict[str, WebExtractor] = {}
class ScrapeRequest(BaseModel):
    """Request body for scrape endpoints: target URL, extraction query, optional model."""

    # URL of the page to scrape.
    url: str
    # Natural-language extraction instruction passed to the extractor.
    query: str
    # AI model alias to use; defaults to the fast alias.
    model_name: Optional[str] = "alias-fast"
class SessionCreateRequest(BaseModel):
    """Request body for creating a persistent scraping session."""

    # AI model alias to use for the session; defaults to the fast alias.
    model_name: Optional[str] = "alias-fast"
@app.get("/health")
async def health():
    """Liveness probe: confirm the API process is up and responding."""
    payload = {"status": "ok", "message": "CyberScraper 2077 API is running"}
    return payload
@app.get("/api-docs")
async def api_docs():
    """Comprehensive API documentation with examples"""
    # Hand-maintained documentation payload mirroring the endpoints defined
    # in this module. NOTE(review): keep in sync with the actual routes —
    # FastAPI's auto-generated /docs is the authoritative schema.
    return {
        "title": "CyberScraper 2077 API Documentation",
        "version": "1.0.0",
        "description": "Advanced web scraping API with session management and AI-powered content extraction",
        "base_url": "https://grazieprego-scrapling.hf.space",
        # One entry per route: method, path, request/response shape, examples.
        "endpoints": {
            "health": {
                "method": "GET",
                "path": "/health",
                "description": "Check if the API is running",
                "response": {
                    "status": "ok",
                    "message": "CyberScraper 2077 API is running"
                },
                "example": "curl https://grazieprego-scrapling.hf.space/health"
            },
            "scrape": {
                "method": "POST",
                "path": "/api/scrape",
                "description": "Stateless scrape request - creates a new extractor for each request",
                "request_body": {
                    "url": "string - The URL to scrape",
                    "query": "string - The extraction query/instruction",
                    "model_name": "string (optional) - AI model to use (default: 'alias-fast')"
                },
                "response": {
                    "url": "string - The scraped URL",
                    "query": "string - The query used",
                    "response": "any - The extracted content"
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/scrape -H 'Content-Type: application/json' -d '{\"url\": \"https://example.com\", \"query\": \"Extract all product prices\"}'",
                    "python": "import requests\nresponse = requests.post('https://grazieprego-scrapling.hf.space/api/scrape', json={'url': 'https://example.com', 'query': 'Extract prices'})\nprint(response.json())"
                }
            },
            "create_session": {
                "method": "POST",
                "path": "/api/session",
                "description": "Create a persistent scraping session for multiple requests",
                "request_body": {
                    "model_name": "string (optional) - AI model to use (default: 'alias-fast')"
                },
                "response": {
                    "session_id": "string - UUID of the created session",
                    "message": "string - Confirmation message",
                    "model": "string - Model used"
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/session -H 'Content-Type: application/json' -d '{\"model_name\": \"alias-fast\"}'",
                    "python": "import requests\nsession = requests.post('https://grazieprego-scrapling.hf.space/api/session', json={'model_name': 'alias-fast'})\nsession_id = session.json()['session_id']"
                }
            },
            "session_scrape": {
                "method": "POST",
                "path": "/api/session/{session_id}/scrape",
                "description": "Scrape using an existing session context (more efficient for multiple requests)",
                "path_parameters": {
                    "session_id": "string - UUID of the session"
                },
                "request_body": {
                    "url": "string - The URL to scrape",
                    "query": "string - The extraction query",
                    "model_name": "string (optional)"
                },
                "response": {
                    "session_id": "string - The session ID",
                    "url": "string - The scraped URL",
                    "query": "string - The query used",
                    "response": "any - The extracted content"
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/session/uuid-here/scrape -H 'Content-Type: application/json' -d '{\"url\": \"https://example.com/page1\", \"query\": \"Extract titles\"}'",
                    "python": "import requests\nresponse = requests.post(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}/scrape', json={'url': 'https://example.com', 'query': 'Extract data'})\nprint(response.json())"
                }
            },
            "close_session": {
                "method": "DELETE",
                "path": "/api/session/{session_id}",
                "description": "Close a session and release resources",
                "path_parameters": {
                    "session_id": "string - UUID of the session to close"
                },
                "response": {
                    "message": "string - Confirmation message",
                    "session_id": "string - The closed session ID"
                },
                "example": {
                    "curl": "curl -X DELETE https://grazieprego-scrapling.hf.space/api/session/uuid-here",
                    "python": "import requests\nresponse = requests.delete(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}')\nprint(response.json())"
                }
            }
        },
        # Prose guidance for consumers: quick start, best practices, errors.
        "usage_guide": {
            "quick_start": [
                "1. Make a simple scrape request to /api/scrape",
                "2. For multiple requests, create a session first",
                "3. Use the session ID for subsequent requests",
                "4. Close sessions when done to free resources"
            ],
            "best_practices": [
                "Use stateless /api/scrape for one-off requests",
                "Use sessions for batch processing multiple URLs",
                "Always close sessions when finished",
                "Handle errors gracefully (500 errors may occur on complex sites)",
                "Set appropriate timeouts for slow-loading pages"
            ],
            "error_handling": {
                "404": "Session not found (for session endpoints)",
                "500": "Internal server error - check the detail message",
                "Common issues": [
                    "URL unreachable or timeout",
                    "JavaScript-heavy sites may require different approaches",
                    "Bot protection may block requests"
                ]
            }
        },
        # Copy-pasteable client snippets (Python and JavaScript).
        "integration_examples": {
            "python_script": """
import requests
# Stateless scrape
response = requests.post(
'https://grazieprego-scrapling.hf.space/api/scrape',
json={
'url': 'https://example.com',
'query': 'Extract all headings and prices'
}
)
print("Result:", response.json())
# Session-based workflow
session_response = requests.post(
'https://grazieprego-scrapling.hf.space/api/session',
json={'model_name': 'alias-fast'}
)
session_id = session_response.json()['session_id']
try:
# Multiple requests using the same session
for url in ['https://example.com/page1', 'https://example.com/page2']:
result = requests.post(
f'https://grazieprego-scrapling.hf.space/api/session/{session_id}/scrape',
json={'url': url, 'query': 'Extract product data'}
)
print(f"Scraped {url}:", result.json())
finally:
# Always close the session
requests.delete(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}')
""",
            "javascript": """
// Fetch API example
async function scrapeUrl(url, query) {
const response = await fetch('https://grazieprego-scrapling.hf.space/api/scrape', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ url, query })
});
return await response.json();
}
// Usage
scrapeUrl('https://example.com', 'Extract all links').then(console.log);
"""
        },
        "rate_limits": {
            "note": "Rate limits may apply. Please use responsibly.",
            "recommendation": "For high-volume scraping, use session-based approach and implement retry logic"
        }
    }
@app.post("/api/scrape")
async def scrape(request: ScrapeRequest):
    """Stateless scrape request (creates a new extractor for each request).

    Builds a fresh WebExtractor, runs the query against the URL, and always
    releases the browser resources before returning.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    scraper_config = ScraperConfig(
        headless=True,
        max_retries=3,
        delay_after_load=5
    )
    extractor = WebExtractor(model_name=request.model_name, scraper_config=scraper_config)
    try:
        # The extractor takes a single query string combining URL and instruction.
        full_query = f"{request.url} {request.query}"
        response = await extractor.process_query(full_query)
        # csv/excel extractions come back as a tuple; keep only the content part.
        if isinstance(response, tuple):
            response = response[0]
        return {
            "url": request.url,
            "query": request.query,
            "response": response
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Cleanup was previously duplicated in the success and error paths;
        # a single `finally` guarantees the browser is closed either way.
        if hasattr(extractor, 'playwright_scraper') and hasattr(extractor.playwright_scraper, 'close'):
            await extractor.playwright_scraper.close()
@app.post("/api/session")
async def create_session(request: SessionCreateRequest):
    """Create a persistent scraping session and return its UUID."""
    session_id = str(uuid4())
    try:
        # Each session owns one extractor, kept alive in the module-level map.
        config = ScraperConfig(headless=True, max_retries=3, delay_after_load=5)
        sessions[session_id] = WebExtractor(
            model_name=request.model_name, scraper_config=config
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to create session: {str(e)}")
    return {"session_id": session_id, "message": "Session created", "model": request.model_name}
@app.post("/api/session/{session_id}/scrape")
async def session_scrape(session_id: str, request: ScrapeRequest):
    """Scrape a URL using the extractor of a previously created session."""
    # Guard clause: unknown session ids are a client error, not a crash.
    extractor = sessions.get(session_id)
    if extractor is None:
        raise HTTPException(status_code=404, detail="Session not found")
    try:
        result = await extractor.process_query(f"{request.url} {request.query}")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    # csv/excel extractions come back as a tuple; keep only the content part.
    if isinstance(result, tuple):
        result = result[0]
    return {
        "session_id": session_id,
        "url": request.url,
        "query": request.query,
        "response": result
    }
@app.delete("/api/session/{session_id}")
async def close_session(session_id: str):
    """Close a session and release resources.

    Removes the session's extractor from the registry and closes its browser.

    Raises:
        HTTPException: 404 when the session id is unknown.
    """
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    # pop() removes the session even if closing the scraper fails below.
    extractor = sessions.pop(session_id)
    # Guard against extractors lacking a playwright_scraper attribute, matching
    # the defensive check used in the scrape endpoint (the original checked
    # only for 'close' and would raise AttributeError here).
    if hasattr(extractor, 'playwright_scraper') and hasattr(extractor.playwright_scraper, 'close'):
        await extractor.playwright_scraper.close()
    return {"message": "Session closed", "session_id": session_id}
if __name__ == "__main__":
    # Run the API directly with uvicorn, listening on all interfaces.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)