# crawl4ai / app.py
# Source: DarmacSEO's Hugging Face Space ("Update app.py", commit 40488af, verified)
from fastapi import FastAPI, Request, HTTPException, Body
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
import re
from collections.abc import Mapping, Sequence
# Single FastAPI application instance; all endpoints below are registered on it.
app = FastAPI(title="Crawl4AI API")
# First http(s) URL embedded in a string; stops at whitespace, quotes,
# '>' and ')' so URLs pasted inside HTML/JSON stay clean.
URL_RE = re.compile(r"https?://[^\s\"'>)]+", re.IGNORECASE)


def find_url_anywhere(obj):
    """Recursively search *obj* for the first http(s) URL and return it.

    Accepts strings, mappings, and non-string sequences (nested arbitrarily).
    In mappings, conventional URL-carrying keys ("url", "link", "q", "input")
    are checked first so an explicit URL field wins over URLs buried elsewhere.
    Returns the matched URL as a string, or None when nothing URL-like exists.
    """
    if isinstance(obj, str):
        m = URL_RE.search(obj)
        return m.group(0) if m else None
    if isinstance(obj, Mapping):
        # Fast path over well-known keys. Extract with URL_RE instead of the
        # old bare startswith("http") check, which wrongly accepted junk
        # values such as "httpfoo" and returned the raw value rather than
        # the URL itself (bug fix).
        for key in ("url", "link", "q", "input"):
            value = obj.get(key)
            if isinstance(value, str):
                m = URL_RE.search(value)
                if m:
                    return m.group(0)
        for value in obj.values():
            found = find_url_anywhere(value)
            if found:
                return found
    if isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
        for item in obj:
            found = find_url_anywhere(item)
            if found:
                return found
    return None
@app.get("/healthz")
def health():
    """Liveness probe: always reports the service as up."""
    payload = {"status": "ok"}
    return payload
@app.get("/crawl")
async def crawl_get(url: str | None = None):
    """GET endpoint: the target must arrive via the ``?url=`` query parameter."""
    if url:
        return await do_crawl(url)
    raise HTTPException(status_code=400, detail="Provide ?url=https://...")
@app.post("/crawl")
async def crawl_post(
    request: Request,
    payload: dict | None = Body(None, example={"url": "https://example.com"})
):
    """POST endpoint: accept the target URL from any of several places.

    Sources are tried in priority order: query string, parsed JSON payload,
    form-encoded body, and finally the raw request body as free text.
    Raises 400 when no URL can be located anywhere.
    """
    # 1. Query string takes precedence over every body format.
    candidate = request.query_params.get("url")

    # 2. Recursive search through the parsed JSON payload.
    if not candidate and isinstance(payload, dict):
        candidate = find_url_anywhere(payload)

    # 3. Form-encoded body — best effort; parse failures are ignored.
    if not candidate:
        try:
            form = await request.form()
            candidate = find_url_anywhere(dict(form))
        except Exception:
            pass

    # 4. Last resort: scan the raw bytes as UTF-8 text.
    if not candidate:
        raw = await request.body()
        candidate = find_url_anywhere(raw.decode("utf-8", errors="ignore"))

    if not candidate:
        raise HTTPException(status_code=400, detail="No URL found. Send {'url':'https://...'}")
    return await do_crawl(candidate)
async def do_crawl(url: str):
    """Run the crawler against *url* and return a dual-format JSON payload.

    The response keeps the legacy top-level ``text``/``markdown`` fields and
    also carries a ``results`` list whose first item's ``content`` is what the
    Dify marketplace plugin reads. Any crawler failure surfaces as HTTP 500.
    """
    try:
        run_config = CrawlerRunConfig()  # defaults only — compatible with the installed crawl4ai
        async with AsyncWebCrawler() as crawler:
            crawl_result = await crawler.arun(url=url, config=run_config)

        # Attribute availability depends on the crawl4ai version, hence getattr.
        text_value = getattr(crawl_result, "cleaned_text", None)
        markdown_value = getattr(crawl_result, "markdown", None)
        body = markdown_value or text_value or ""

        return {
            "url": url,
            "status": "ok",
            "text": text_value,
            "markdown": markdown_value,
            # The Dify marketplace plugin reads results[0].content.
            "results": [
                {
                    "url": url,
                    "content": body
                }
            ],
            "success": True
        }
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Crawl error: {exc}")