# routers/scraping_router.py
"""Web-scraping routes: ask an AI model to write a Cheerio.js extraction script."""

import logging
import os
import re
import secrets
from typing import Optional

from fastapi import APIRouter, Depends, Header, HTTPException
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse
from fastapi.security import APIKeyHeader
from pydantic import BaseModel

from helpers.ai_client import AIClient

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

router = APIRouter(
    prefix="/api/v1",
    tags=["Web Scraping"],
)

ai_client = AIClient()
AI_SCRAPER_API_KEY = os.getenv("AI_SCRAPER_API_KEY")

# API Key security scheme
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=True)

# Model used for script generation.
_MODEL_ID = "openai/gpt-4o-mini"

# Tag the model is told to wrap its script in. The prompts and the extraction
# pattern are derived from this name; _EXAMPLE spells it out literally, so keep
# the two in sync if the tag is ever renamed.
_SCRIPT_TAG = "cheerio_script"
_SCRIPT_PATTERN = re.compile(
    rf"<{_SCRIPT_TAG}>(.*?)</{_SCRIPT_TAG}>",
    re.DOTALL,  # generated scripts span multiple lines
)

# Few-shot example embedded in the system prompt. The markup matches the
# selectors used in the expected output below.
_EXAMPLE = """
Input HTML:
<div class="product-card">
    <h2 class="title">iPhone 14</h2>
    <span class="price">$999</span>
</div>

Input Request: "extract product title and price"

Expected Output:
<cheerio_script>
function extract(input, cheerio) {
    let result = { success: false, data: null, error: null };
    try {
        let $ = cheerio.load(input);
        result.data = {
            title: $('.product-card .title').text().trim() || null,
            price: $('.product-card .price').text().trim() || null
        };
        result.success = true;
    } catch (error) {
        result.error = error.message;
    }
    return result;
}
</cheerio_script>
"""


async def verify_api_key(api_key: str = Depends(api_key_header)):
    """Validate the ``X-API-Key`` header against ``AI_SCRAPER_API_KEY``.

    Returns:
        The presented key when it matches the configured one.

    Raises:
        HTTPException: 401 when the key does not match, or when no key is
            configured on the server (fail closed rather than accept anything).
    """
    expected = AI_SCRAPER_API_KEY
    # compare_digest avoids leaking, through response timing, how many leading
    # characters of a guess were correct. Bytes are used because the str form
    # only accepts ASCII.
    if not expected or not secrets.compare_digest(
        api_key.encode("utf-8"), expected.encode("utf-8")
    ):
        raise HTTPException(
            status_code=401,
            detail="Invalid API key",
        )
    return api_key


class CheerioScriptRequest(BaseModel):
    """Request body: the HTML to scrape and a description of what to extract."""

    html: str
    user_input: str


class CheerioScriptResponse(BaseModel):
    """Response body: the generated script plus a status/message pair."""

    cheerio_script: str
    status: str
    message: str


def _system_prompt(user_input: str, html: str) -> str:
    """Build the system message: task, output contract, example and the HTML."""
    return f"""You are an expert at writing Cheerio.js web scraping scripts.

Task: Generate a Cheerio.js script to extract {user_input} from the provided HTML.

Requirements:
- Script must be wrapped in a function named 'extract' that takes (input, cheerio) parameters
- Return object must include: {{ success: boolean, data: object|null, error: string|null }}
- Use modern JavaScript syntax
- Include try-catch error handling
- Make the script reusable and efficient
- Enclose the entire script in <{_SCRIPT_TAG}> tags

Here's an example of the expected format:
{_EXAMPLE}

HTML to process:
{html}"""


def _user_prompt(user_input: str) -> str:
    """Build the user message restating the extraction request and its rules."""
    return f"""Generate a Cheerio.js script to extract {user_input}.
The script must:
1. Be wrapped in a function named 'extract' that takes (input, cheerio) parameters
2. Return an object with success, data, and error fields
3. Handle missing elements by returning null
4. Use proper Cheerio selectors
5. Include error handling
6. Be enclosed in <{_SCRIPT_TAG}> tags"""


def _extract_script(response: str) -> Optional[str]:
    """Return the script found between the script tags, or None if absent/empty."""
    match = _SCRIPT_PATTERN.search(response)
    if match is None:
        return None
    # An empty tag pair is not a usable script; treat it like a missing one.
    return match.group(1).strip() or None


def _script_response(
    status_code: int, cheerio_script: str, status: str, message: str
) -> JSONResponse:
    """Wrap the three response fields in a JSONResponse with the given HTTP code."""
    return JSONResponse(
        status_code=status_code,
        content={
            "cheerio_script": cheerio_script,
            "status": status,
            "message": message,
        },
    )


@router.post("/generate-cheerio-script", response_model=CheerioScriptResponse)
async def generate_cheerio_script(
    request: CheerioScriptRequest,
    # NOTE(review): authentication is currently disabled for this route.
    # Re-enable the dependency below once callers send X-API-Key.
    # api_key: str = Depends(verify_api_key)
):
    """Generate a Cheerio.js ``extract(input, cheerio)`` script for the given HTML.

    The request's HTML and extraction description are sent to the AI client,
    and the script is pulled out of the ``<cheerio_script>`` tags in its reply.

    Returns:
        JSONResponse with ``cheerio_script``, ``status`` and ``message``:
        - 200 / ``"success"`` when a script was found in the reply;
        - 200 / ``"error"`` when the reply contained no (non-empty) script;
        - 500 / ``"error"`` when generating or parsing the reply raised.
    """
    try:
        # ai_client.chat is called synchronously (no await), so it is presumably
        # a blocking network call; run it in the thread pool so the event loop
        # keeps serving other requests. TODO confirm AIClient.chat is sync.
        response = await run_in_threadpool(
            ai_client.chat,
            prompt=_user_prompt(request.user_input),
            system_message=_system_prompt(request.user_input, request.html),
            model_id=_MODEL_ID,
        )

        cheerio_script = _extract_script(response)
        if cheerio_script:
            return _script_response(
                200,
                cheerio_script,
                "success",
                "Cheerio script generated successfully",
            )
        return _script_response(
            200,
            "",
            "error",
            f"No valid Cheerio script found in response: {response}",
        )
    except Exception as e:
        # Route-level boundary: log with traceback and report a 500 to the client.
        logger.exception("Error generating Cheerio script: %s", e)
        return _script_response(
            500,
            "",
            "error",
            f"Error generating script: {str(e)}",
        )