"""Minimal FastAPI wrapper around crawl4ai's AsyncWebCrawler.

Exposes a liveness probe plus GET and POST /crawl endpoints that pull a
target URL out of the query string, a JSON body, form data, or the raw
request body.
"""

import re
from collections.abc import Mapping, Sequence

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from fastapi import Body, FastAPI, HTTPException, Request

app = FastAPI(title="Crawl4AI API")

# Loose URL matcher: an http(s) scheme followed by everything up to the
# first whitespace, quote, '>', or ')'.
URL_RE = re.compile(r"https?://[^\s\"'>)]+", re.IGNORECASE)


def find_url_anywhere(obj):
    """Recursively hunt for the first URL in a string, mapping, or sequence."""
    if isinstance(obj, str):
        m = URL_RE.search(obj)
        return m.group(0) if m else None
    if isinstance(obj, Mapping):
        # Prefer conventional keys so an explicit "url" field wins over
        # URLs buried deeper in the payload.
        for k in ("url", "link", "q", "input"):
            v = obj.get(k)
            if isinstance(v, str) and v.startswith("http"):
                return v
        for v in obj.values():
            u = find_url_anywhere(v)
            if u:
                return u
    if isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
        for it in obj:
            u = find_url_anywhere(it)
            if u:
                return u
    return None


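# Quick sanity check for find_url_anywhere above (hypothetical input,
# shown as a comment for illustration rather than a test):
#   find_url_anywhere({"data": ["see https://example.com/page"]})
#   -> "https://example.com/page"

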
@app.get("/healthz") |
|
|
def health(): |
|
|
return {"status": "ok"} |
|
|
|
|
|
@app.get("/crawl") |
|
|
async def crawl_get(url: str | None = None): |
|
|
if not url: |
|
|
raise HTTPException(status_code=400, detail="Provide ?url=https://...") |
|
|
return await do_crawl(url) |
|
|
|
|
|
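# Example request for crawl_get above (host/port assume the uvicorn
# runner at the bottom of this file):
#   curl "http://localhost:8000/crawl?url=https://example.com"

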
@app.post("/crawl") |
|
|
async def crawl_post( |
|
|
request: Request, |
|
|
payload: dict | None = Body(None, example={"url": "https://example.com"}) |
|
|
): |
|
|
url = request.query_params.get("url") |
|
|
if not url and isinstance(payload, dict): |
|
|
url = find_url_anywhere(payload) |
|
|
if not url: |
|
|
try: |
|
|
form = await request.form() |
|
|
url = find_url_anywhere(dict(form)) |
|
|
except Exception: |
|
|
pass |
|
|
if not url: |
|
|
raw = await request.body() |
|
|
url = find_url_anywhere(raw.decode("utf-8", errors="ignore")) |
|
|
if not url: |
|
|
raise HTTPException(status_code=400, detail="No URL found. Send {'url':'https://...'}") |
|
|
|
|
|
return await do_crawl(url) |
|
|
|
|
|
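# The POST endpoint above accepts the URL in several shapes; all of
# these are equivalent (host/port assume the uvicorn runner below):
#   curl -X POST "http://localhost:8000/crawl?url=https://example.com"
#   curl -X POST http://localhost:8000/crawl \
#        -H "Content-Type: application/json" -d '{"url": "https://example.com"}'
#   curl -X POST http://localhost:8000/crawl -d "url=https://example.com"

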
async def do_crawl(url: str):
    try:
        cfg = CrawlerRunConfig()
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url, config=cfg)

        # Attribute names differ across crawl4ai versions, so read them
        # defensively rather than assuming either one exists.
        text_val = getattr(result, "cleaned_text", None)
        md_val = getattr(result, "markdown", None)
        content = md_val or text_val or ""

        return {
            "url": url,
            "status": "ok",
            "text": text_val,
            "markdown": md_val,
            # Mirror the content in a list-shaped "results" field for
            # clients that expect one entry per crawled page.
            "results": [
                {
                    "url": url,
                    "content": content,
                }
            ],
            "success": True,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Crawl error: {e}")
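

# Local entry point: a minimal sketch assuming uvicorn is installed
# (pip install uvicorn); any ASGI server works just as well.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)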