web-crawling / main.py
import os
import asyncio
from fastapi import FastAPI, HTTPException, Security, Depends, Query
from fastapi.security import APIKeyHeader
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field, create_model
from typing import List, Optional
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
import json
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI()
# API key configuration
CHAT_AUTH_KEY = os.getenv("CHAT_AUTH_KEY")
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)

async def verify_api_key(api_key: str = Security(api_key_header)):
    """Reject any request whose X-API-Key header does not match CHAT_AUTH_KEY."""
    if api_key != CHAT_AUTH_KEY:
        logger.warning("Invalid API key used")
        raise HTTPException(status_code=403, detail="Could not validate credentials")
    return api_key
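
# Every protected endpoint resolves this dependency, so clients must send the key
# in an "X-API-Key" request header. A minimal client-side sketch (hypothetical
# host/port and key value, using the httpx package, which this file does not require):
#
#   import httpx
#   resp = httpx.get(
#       "http://localhost:8000/test",
#       headers={"X-API-Key": "<CHAT_AUTH_KEY value>"},
#   )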

class CrawlerInput(BaseModel):
    url: str = Field(..., description="URL to crawl")
    columns: List[str] = Field(..., description="List of required columns")
    descriptions: List[str] = Field(..., description="Descriptions for each column")


class CrawlerOutput(BaseModel):
    data: List[dict]

async def simple_crawl(url: str):
    """Crawl a single URL and return the raw crawl4ai result (markdown included)."""
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=url)
        logger.info("Crawled %s: %d characters of markdown", url, len(result.markdown))
        return result
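
# simple_crawl can also be exercised outside FastAPI, e.g. from a script or REPL.
# A minimal sketch (assumes crawl4ai is installed and the URL is reachable):
#
#   result = asyncio.run(simple_crawl("https://example.com"))
#   print(result.markdown[:500])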
@app.post("/crawl", response_model=CrawlerOutput)
async def crawl(input: CrawlerInput, api_key: str = Depends(verify_api_key)):
if len(input.columns) != len(input.descriptions):
raise HTTPException(status_code=400, detail="Number of columns must match number of descriptions")
extraction_info = {col: desc for col, desc in zip(input.columns, input.descriptions)}
dynamic_model = create_model(
'DynamicModel',
**{col: (str, Field(..., description=desc)) for col, desc in extraction_info.items()}
)
instruction = f"Extract the following information: {json.dumps(extraction_info)}"
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url=input.url,
extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o-mini",
api_token=os.getenv('OPENAI_API_KEY'),
schema=dynamic_model.schema(),
extraction_type="schema",
verbose=True,
instruction=instruction
)
)
extracted_data = json.loads(result.extracted_content)
return CrawlerOutput(data=extracted_data)
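
# Example request body for POST /crawl: "columns" and "descriptions" are zipped into
# the dynamic schema above, so they must have the same length. A hedged client sketch
# (hypothetical host/port and field names; uses the httpx package):
#
#   payload = {
#       "url": "https://example.com/products",
#       "columns": ["title", "price"],
#       "descriptions": ["Product title", "Listed price"],
#   }
#   resp = httpx.post(
#       "http://localhost:8000/crawl",
#       json=payload,
#       headers={"X-API-Key": "<CHAT_AUTH_KEY value>"},
#       timeout=120,
#   )
#   print(resp.json()["data"])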
@app.get("/basic-crawl")
async def test_url(api_key: str = Depends(verify_api_key), url: str = Query(..., description="URL to crawl")):
"""
A test endpoint that takes a URL as input and returns the result of crawling it.
"""
result = await simple_crawl(url=url)
return {"markdown": result.markdown}
@app.get("/test")
async def test(api_key: str = Depends(verify_api_key)):
result = await simple_crawl("https://www.nbcnews.com/business")
return {"markdown": result.markdown}

# CORS middleware setup
# Origins must match the browser's Origin header exactly, so no trailing slashes.
app.add_middleware(
    CORSMiddleware,
    # allow_origins=["*"],
    allow_origins=[
        "http://127.0.0.1:5501",
        "http://localhost:5501",
        "http://localhost:3000",
        "https://www.elevaticsai.com",
        "https://www.elevatics.cloud",
        "https://www.elevatics.online",
        "https://www.elevatics.ai",
        "https://elevaticsai.com",
        "https://elevatics.cloud",
        "https://elevatics.online",
        "https://elevatics.ai",
        "https://pvanand-specialized-agents.hf.space",
        "https://pvanand-audio-chat.hf.space",
    ],
    allow_credentials=True,
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)