# Spaces: Running
# routers/scraping_router.py
import hmac
import logging
import os
import re
from typing import Optional

from fastapi import APIRouter, HTTPException, Header, Depends
from fastapi.responses import JSONResponse
from fastapi.security import APIKeyHeader
from pydantic import BaseModel

from helpers.ai_client import AIClient
# Module-wide logging: INFO level on the root logger, plus a logger named
# after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Router for the endpoints in this module, mounted under /api/v1.
router = APIRouter(prefix="/api/v1", tags=["Web Scraping"])

# Shared AI client instance, created once at import time.
ai_client = AIClient()

# Expected API key, read from the environment (None when the variable is unset).
AI_SCRAPER_API_KEY = os.environ.get("AI_SCRAPER_API_KEY")

# API key security scheme: the key is read from the "X-API-Key" request header.
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=True)
async def verify_api_key(api_key: str = Depends(api_key_header)):
    """Validate the API key supplied in the ``X-API-Key`` header.

    Args:
        api_key: Key extracted from the request by ``api_key_header``.

    Returns:
        The supplied key, unchanged, when it matches ``AI_SCRAPER_API_KEY``.

    Raises:
        HTTPException: 401 when the key does not match, or when no key is
            configured on the server (the check fails closed).
    """
    expected = AI_SCRAPER_API_KEY
    # Compare as bytes in constant time: a plain `!=` short-circuits on the
    # first differing character and can leak key contents through timing.
    # Encoding first also keeps compare_digest from raising on non-ASCII input.
    key_matches = bool(expected) and hmac.compare_digest(
        api_key.encode("utf-8"), expected.encode("utf-8")
    )
    if not key_matches:
        raise HTTPException(
            status_code=401,
            detail="Invalid API key",
        )
    return api_key
class CheerioScriptRequest(BaseModel):
    """Request body for Cheerio script generation."""

    # Raw HTML document the generated script should be able to process.
    html: str
    # Free-text description of what to extract from the HTML.
    user_input: str
class CheerioScriptResponse(BaseModel):
    """Shape of the JSON payload describing a script-generation result."""

    # Generated Cheerio.js source; empty string when generation failed.
    cheerio_script: str
    # Outcome marker: "success" or "error".
    status: str
    # Human-readable description of the outcome.
    message: str
# Worked example embedded in the system prompt so the model sees the exact
# output shape (an `extract` function wrapped in <cheerio_script> tags).
_CHEERIO_EXAMPLE = """
Input HTML:
<html>
<div class="product-card">
<h2 class="title">iPhone 14</h2>
<span class="price">$999</span>
</div>
</html>
Input Request: "extract product title and price"
Expected Output:
<cheerio_script>
function extract(input, cheerio) {
let result = {
success: false,
data: null,
error: null
};
try {
let $ = cheerio.load(input);
result.data = {
title: $('.product-card .title').text().trim() || null,
price: $('.product-card .price').text().trim() || null
};
result.success = true;
} catch (error) {
result.error = error.message;
}
return result;
}
</cheerio_script>
"""

# System prompt template; doubled braces render as literal braces.
_SYSTEM_PROMPT_TEMPLATE = """You are an expert at writing Cheerio.js web scraping scripts.
Task: Generate a Cheerio.js script to extract {user_input} from the provided HTML.
Requirements:
- Script must be wrapped in a function named 'extract' that takes (input, cheerio) parameters
- Return object must include: {{ success: boolean, data: object|null, error: string|null }}
- Use modern JavaScript syntax
- Include try-catch error handling
- Make the script reusable and efficient
- Enclose the entire script in <cheerio_script> tags
Here's an example of the expected format:
{example}
HTML to process:
{html}"""

# User prompt template restating the output contract.
_USER_PROMPT_TEMPLATE = """Generate a Cheerio.js script to extract {user_input}.
The script must:
1. Be wrapped in a function named 'extract' that takes (input, cheerio) parameters
2. Return an object with success, data, and error fields
3. Handle missing elements by returning null
4. Use proper Cheerio selectors
5. Include error handling
6. Be enclosed in <cheerio_script> tags"""

# Non-greedy so only the first <cheerio_script>...</cheerio_script> pair is
# captured; DOTALL lets the script span multiple lines.
_CHEERIO_SCRIPT_RE = re.compile(r'<cheerio_script>(.*?)</cheerio_script>', re.DOTALL)


def _extract_cheerio_script(response: str) -> Optional[str]:
    """Return the stripped text of the first <cheerio_script> block, or None if absent."""
    match = _CHEERIO_SCRIPT_RE.search(response)
    if match is None:
        return None
    return match.group(1).strip()


# NOTE(review): no @router decorator is visible on this function in this view;
# confirm the route is registered elsewhere.
async def generate_cheerio_script(
    request: CheerioScriptRequest,
    # NOTE(review): API-key enforcement is commented out, so this handler does
    # not require a key — confirm that is intentional.
    #api_key: str = Depends(verify_api_key)
):
    """Ask the AI client for a Cheerio.js extraction script for the given HTML.

    Builds a system prompt (task, requirements, worked example, and the
    caller's HTML) and a user prompt, sends both through ``ai_client.chat``,
    and pulls the script out of the ``<cheerio_script>`` tags in the reply.

    Args:
        request: Carries the HTML to process and the description of what to
            extract.

    Returns:
        JSONResponse with keys ``cheerio_script``, ``status`` and ``message``:
        * 200 / ``status="success"`` when a non-empty script was extracted;
        * 200 / ``status="error"`` when the reply contains no script, or only
          an empty one;
        * 500 / ``status="error"`` when any exception is raised along the way.
    """
    try:
        system_prompt = _SYSTEM_PROMPT_TEMPLATE.format(
            user_input=request.user_input,
            example=_CHEERIO_EXAMPLE,
            html=request.html,
        )
        user_prompt = _USER_PROMPT_TEMPLATE.format(user_input=request.user_input)

        # NOTE(review): chat() is called without await; if it performs
        # blocking I/O it will stall the event loop — TODO confirm.
        response = ai_client.chat(
            prompt=user_prompt,
            system_message=system_prompt,
            # Alternatives previously tried: "deepseek/deepseek-chat",
            # "google/gemini-pro-1.5".
            model_id="openai/gpt-4o-mini",
        )

        cheerio_script = _extract_cheerio_script(response)
        # An empty tag pair yields "", which is not a usable script, so it is
        # reported through the same path as a missing tag pair.
        if cheerio_script:
            return JSONResponse(
                status_code=200,
                content={
                    "cheerio_script": cheerio_script,
                    "status": "success",
                    "message": "Cheerio script generated successfully",
                },
            )
        return JSONResponse(
            status_code=200,
            content={
                "cheerio_script": "",
                "status": "error",
                "message": f"No valid Cheerio script found in response: {response}",
            },
        )
    except Exception as e:
        # logger.exception logs at ERROR level and keeps the traceback.
        logger.exception("Error generating Cheerio script: %s", e)
        return JSONResponse(
            status_code=500,
            content={
                "cheerio_script": "",
                "status": "error",
                "message": f"Error generating script: {str(e)}",
            },
        )