# Source: ai-web-scraper-chat/routers/scraping_router.py
# Uploaded by: pvanand
# Commit 92ee903 (verified): "Update routers/scraping_router.py"
# routers/scraping_router.py
from fastapi import APIRouter, HTTPException, Header, Depends
from fastapi.responses import JSONResponse
from fastapi.security import APIKeyHeader
from pydantic import BaseModel
from typing import Optional
import logging
import re
import os
from helpers.ai_client import AIClient
# Logging is configured at import time; `logger` is the module-level logger
# used by the handlers in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Routes registered on this router are mounted under /api/v1 and tagged
# "Web Scraping".
router = APIRouter(prefix="/api/v1", tags=["Web Scraping"])

# Single AIClient instance shared by the handlers in this module.
ai_client = AIClient()

# Expected API key, read once from the environment when the module is
# imported. It is None when the variable is not set.
AI_SCRAPER_API_KEY = os.environ.get("AI_SCRAPER_API_KEY")

# API key security scheme: the key is carried in the X-API-Key header.
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=True)
async def verify_api_key(api_key: str = Depends(api_key_header)) -> str:
    """Check the key from the ``X-API-Key`` header against the configured key.

    Args:
        api_key: Value supplied by the ``api_key_header`` security scheme.

    Returns:
        The supplied key, unchanged, when it matches ``AI_SCRAPER_API_KEY``.

    Raises:
        HTTPException: 401 with detail ``"Invalid API key"`` when the key does
            not match, or when no key is configured on the server.
    """
    # Function-scope import so this block does not depend on a file-level
    # import being added elsewhere.
    import secrets

    expected = AI_SCRAPER_API_KEY

    # Fail closed: an unset AI_SCRAPER_API_KEY (None) or a missing candidate
    # must never authenticate anyone.
    key_ok = (
        expected is not None
        and api_key is not None
        # Constant-time comparison avoids leaking, via response timing, how
        # many leading characters of the key were correct. Comparing bytes
        # (not str) keeps compare_digest from raising on non-ASCII input.
        and secrets.compare_digest(
            api_key.encode("utf-8"), expected.encode("utf-8")
        )
    )
    if not key_ok:
        raise HTTPException(
            status_code=401,
            detail="Invalid API key",
        )
    return api_key
class CheerioScriptRequest(BaseModel):
    """Request body accepted by the Cheerio script generation endpoint."""

    # HTML document that is embedded in the prompt as the page to process.
    html: str

    # Free-text description of what the generated script should extract.
    user_input: str
class CheerioScriptResponse(BaseModel):
    """Response body returned by the Cheerio script generation endpoint."""

    # Generated script text; the handler sends an empty string on failure.
    cheerio_script: str

    # Outcome marker; the handler emits either "success" or "error".
    status: str

    # Human-readable description of the outcome.
    message: str
# Worked example embedded in the system prompt to show the model the exact
# output format (an `extract` function wrapped in <cheerio_script> tags).
_CHEERIO_EXAMPLE = """
Input HTML:
<html>
<div class="product-card">
<h2 class="title">iPhone 14</h2>
<span class="price">$999</span>
</div>
</html>
Input Request: "extract product title and price"
Expected Output:
<cheerio_script>
function extract(input, cheerio) {
let result = {
success: false,
data: null,
error: null
};
try {
let $ = cheerio.load(input);
result.data = {
title: $('.product-card .title').text().trim() || null,
price: $('.product-card .price').text().trim() || null
};
result.success = true;
} catch (error) {
result.error = error.message;
}
return result;
}
</cheerio_script>
"""

# Captures the text between the first <cheerio_script>...</cheerio_script>
# pair; DOTALL lets the script span multiple lines.
_CHEERIO_SCRIPT_RE = re.compile(
    r'<cheerio_script>(.*?)</cheerio_script>', re.DOTALL
)


def _build_system_prompt(html: str, user_input: str) -> str:
    """Return the system prompt: task, requirements, example and the HTML."""
    return f"""You are an expert at writing Cheerio.js web scraping scripts.
Task: Generate a Cheerio.js script to extract {user_input} from the provided HTML.
Requirements:
- Script must be wrapped in a function named 'extract' that takes (input, cheerio) parameters
- Return object must include: {{ success: boolean, data: object|null, error: string|null }}
- Use modern JavaScript syntax
- Include try-catch error handling
- Make the script reusable and efficient
- Enclose the entire script in <cheerio_script> tags
Here's an example of the expected format:
{_CHEERIO_EXAMPLE}
HTML to process:
{html}"""


def _build_user_prompt(user_input: str) -> str:
    """Return the user prompt restating the extraction request and rules."""
    return f"""Generate a Cheerio.js script to extract {user_input}.
The script must:
1. Be wrapped in a function named 'extract' that takes (input, cheerio) parameters
2. Return an object with success, data, and error fields
3. Handle missing elements by returning null
4. Use proper Cheerio selectors
5. Include error handling
6. Be enclosed in <cheerio_script> tags"""


def _extract_cheerio_script(response: str) -> Optional[str]:
    """Return the stripped script inside the first tag pair, or None."""
    found = _CHEERIO_SCRIPT_RE.search(response)
    if found is None:
        return None
    return found.group(1).strip()


@router.post("/generate-cheerio-script", response_model=CheerioScriptResponse)
async def generate_cheerio_script(
    request: CheerioScriptRequest,
    # NOTE(review): API-key authentication is disabled for this endpoint, so
    # it is reachable without an X-API-Key header. Re-enable the dependency
    # below once callers send the header.
    # api_key: str = Depends(verify_api_key)
):
    """Ask the AI client for a Cheerio.js extraction script and return it.

    Builds a system prompt (requirements, a worked example and the submitted
    HTML) and a user prompt from ``request.user_input``, sends both through
    ``ai_client.chat``, and extracts the text between ``<cheerio_script>``
    tags from the reply.

    Args:
        request: The HTML to process and a description of what to extract.

    Returns:
        A ``JSONResponse`` with ``cheerio_script``, ``status`` and ``message``:
        * 200, ``status="success"`` when a tagged script is found;
        * 200, ``status="error"`` when the reply contains no tagged script
          (the raw reply is included in ``message``);
        * 500, ``status="error"`` when any exception is raised.
    """
    try:
        system_prompt = _build_system_prompt(request.html, request.user_input)
        user_prompt = _build_user_prompt(request.user_input)

        # NOTE(review): chat() is called without `await` inside an async
        # handler; if it performs blocking I/O it will stall the event loop
        # for the duration of the call -- confirm against AIClient.
        # Alternatives previously noted here: "deepseek/deepseek-chat",
        # "google/gemini-pro-1.5".
        response = ai_client.chat(
            prompt=user_prompt,
            system_message=system_prompt,
            model_id="openai/gpt-4o-mini",
        )

        cheerio_script = _extract_cheerio_script(response)
        if cheerio_script is None:
            return JSONResponse(
                status_code=200,
                content={
                    "cheerio_script": "",
                    "status": "error",
                    "message": f"No valid Cheerio script found in response: {response}",
                },
            )

        return JSONResponse(
            status_code=200,
            content={
                "cheerio_script": cheerio_script,
                "status": "success",
                "message": "Cheerio script generated successfully",
            },
        )
    except Exception as e:
        # Top-level boundary for this handler: log with the traceback and
        # convert the failure into a structured 500 response.
        logger.exception("Error generating Cheerio script: %s", e)
        return JSONResponse(
            status_code=500,
            content={
                "cheerio_script": "",
                "status": "error",
                "message": f"Error generating script: {str(e)}",
            },
        )