Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	| # routers/scraping_router.py | |
| from fastapi import APIRouter, HTTPException, Header, Depends | |
| from fastapi.responses import JSONResponse | |
| from fastapi.security import APIKeyHeader | |
| from pydantic import BaseModel | |
| from typing import Optional | |
| import logging | |
| import re | |
| import os | |
| from helpers.ai_client import AIClient | |
# Module-wide logging: INFO level on the root logger, plus a logger named
# after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Router for this module: every path is prefixed with /api/v1 and grouped
# under the "Web Scraping" tag.
router = APIRouter(prefix="/api/v1", tags=["Web Scraping"])

# One AI client instance, created at import time and shared by the handlers.
ai_client = AIClient()

# Expected API key, read from the environment (None when the variable is unset).
AI_SCRAPER_API_KEY = os.environ.get("AI_SCRAPER_API_KEY")

# API Key security scheme: the key is read from the X-API-Key request header.
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=True)
async def verify_api_key(api_key: str = Depends(api_key_header)) -> str:
    """Validate the ``X-API-Key`` header against the configured key.

    Args:
        api_key: Value of the ``X-API-Key`` request header, supplied by the
            ``api_key_header`` security dependency.

    Returns:
        The presented key, unchanged, when it matches ``AI_SCRAPER_API_KEY``.

    Raises:
        HTTPException: 401 with detail ``"Invalid API key"`` when the key does
            not match or when ``AI_SCRAPER_API_KEY`` is not configured.
    """
    # Function-scope import so the module's import block stays untouched.
    from secrets import compare_digest

    expected = AI_SCRAPER_API_KEY
    # Fail closed when no key is configured or presented. The comparison is
    # done in constant time so response timing does not reveal how much of a
    # guessed key was correct, and on bytes because compare_digest rejects
    # non-ASCII ``str`` arguments.
    if (
        not expected
        or not api_key
        or not compare_digest(api_key.encode("utf-8"), expected.encode("utf-8"))
    ):
        raise HTTPException(
            status_code=401,
            detail="Invalid API key",
        )
    return api_key
class CheerioScriptRequest(BaseModel):
    """Request body for Cheerio script generation."""

    # HTML document that is embedded in the system prompt sent to the model.
    html: str
    # Free-text description of what should be extracted from the HTML.
    user_input: str
class CheerioScriptResponse(BaseModel):
    """Response shape for Cheerio script generation.

    The field names match the keys of the JSON payload that
    ``generate_cheerio_script`` returns.
    """

    # Generated script text; empty string when generation failed.
    cheerio_script: str
    # "success" or "error".
    status: str
    # Human-readable description of the outcome.
    message: str
# Compiled once at import time. Captures everything between the
# <cheerio_script> tags; DOTALL lets the captured script span multiple lines.
_CHEERIO_SCRIPT_RE = re.compile(r'<cheerio_script>(.*?)</cheerio_script>', re.DOTALL)

# Worked example that is embedded verbatim in the system prompt.
_CHEERIO_EXAMPLE = """
Input HTML:
<html>
<div class="product-card">
<h2 class="title">iPhone 14</h2>
<span class="price">$999</span>
</div>
</html>
Input Request: "extract product title and price"
Expected Output:
<cheerio_script>
function extract(input, cheerio) {
let result = {
success: false,
data: null,
error: null
};
try {
let $ = cheerio.load(input);
result.data = {
title: $('.product-card .title').text().trim() || null,
price: $('.product-card .price').text().trim() || null
};
result.success = true;
} catch (error) {
result.error = error.message;
}
return result;
}
</cheerio_script>
"""


def _build_system_prompt(user_input: str, html: str) -> str:
    """Return the system message: task, requirements, example and the HTML."""
    return f"""You are an expert at writing Cheerio.js web scraping scripts.
Task: Generate a Cheerio.js script to extract {user_input} from the provided HTML.
Requirements:
- Script must be wrapped in a function named 'extract' that takes (input, cheerio) parameters
- Return object must include: {{ success: boolean, data: object|null, error: string|null }}
- Use modern JavaScript syntax
- Include try-catch error handling
- Make the script reusable and efficient
- Enclose the entire script in <cheerio_script> tags
Here's an example of the expected format:
{_CHEERIO_EXAMPLE}
HTML to process:
{html}"""


def _build_user_prompt(user_input: str) -> str:
    """Return the user message restating the extraction goal and output rules."""
    return f"""Generate a Cheerio.js script to extract {user_input}.
The script must:
1. Be wrapped in a function named 'extract' that takes (input, cheerio) parameters
2. Return an object with success, data, and error fields
3. Handle missing elements by returning null
4. Use proper Cheerio selectors
5. Include error handling
6. Be enclosed in <cheerio_script> tags"""


def _script_payload(cheerio_script: str, status: str, message: str) -> dict:
    """Return the JSON body shared by every response of the handler."""
    return {
        "cheerio_script": cheerio_script,
        "status": status,
        "message": message,
    }


# NOTE(review): no @router decorator is visible on this handler in this view;
# confirm that it is registered on `router` (or elsewhere) as intended.
async def generate_cheerio_script(
    request: CheerioScriptRequest,
    # NOTE(review): API-key enforcement is currently disabled; re-enable the
    # parameter below to require a valid X-API-Key header.
    # api_key: str = Depends(verify_api_key)
):
    """Ask the AI client for a Cheerio.js extraction script for the given HTML.

    Builds a system prompt (task, requirements, worked example, the HTML) and
    a user prompt from ``request``, sends both to ``ai_client.chat`` and pulls
    the text found between ``<cheerio_script>`` tags out of the reply.

    Args:
        request: Carries ``html`` (the document to scrape) and ``user_input``
            (a description of what to extract).

    Returns:
        A ``JSONResponse`` whose body has the keys ``cheerio_script``,
        ``status`` and ``message``:

        * HTTP 200, ``status="success"``: a script was found in the reply.
        * HTTP 200, ``status="error"``: the reply had no ``<cheerio_script>``
          section; ``message`` includes the raw reply.
        * HTTP 500, ``status="error"``: any exception was raised; ``message``
          includes the exception text.
    """
    # Function-scope import so the module's import block stays untouched.
    from fastapi.concurrency import run_in_threadpool

    try:
        # ai_client.chat is a plain (non-awaited) call; running it in the
        # thread pool keeps it from blocking the event loop while it runs.
        # NOTE(review): assumes AIClient.chat may be called from a worker
        # thread — confirm.
        response = await run_in_threadpool(
            ai_client.chat,
            prompt=_build_user_prompt(request.user_input),
            system_message=_build_system_prompt(request.user_input, request.html),
            # Alternatives noted by the original author:
            # "deepseek/deepseek-chat", "google/gemini-pro-1.5".
            model_id="openai/gpt-4o-mini",
        )

        found = _CHEERIO_SCRIPT_RE.search(response)
        if found is None:
            # Kept as HTTP 200 with status "error", as callers may rely on it.
            return JSONResponse(
                status_code=200,
                content=_script_payload(
                    "",
                    "error",
                    f"No valid Cheerio script found in response: {response}",
                ),
            )

        return JSONResponse(
            status_code=200,
            content=_script_payload(
                found.group(1).strip(),
                "success",
                "Cheerio script generated successfully",
            ),
        )
    except Exception as e:
        # Top-level boundary of the handler: log with traceback and report
        # the failure in the same JSON shape as the other outcomes.
        logger.exception("Error generating Cheerio script: %s", e)
        return JSONResponse(
            status_code=500,
            content=_script_payload(
                "",
                "error",
                f"Error generating script: {str(e)}",
            ),
        )

