Commit 59bbc6d
Parent(s): 060a692

init repos

Files changed:
- Dockerfile +27 -0
- README.md +27 -0
- app.py +65 -0
- requirements.txt +6 -0
Dockerfile
ADDED
@@ -0,0 +1,27 @@
+FROM python:3.10-slim
+
+WORKDIR /code
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    wget \
+    gnupg \
+    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
+    && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list \
+    && apt-get update \
+    && apt-get install -y google-chrome-stable \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements and install Python packages
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Install Playwright and its browser dependencies
+RUN playwright install chromium
+RUN playwright install-deps
+
+# Copy application code
+COPY app.py .
+
+# Run the application
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -9,3 +9,30 @@ short_description: my Crawl4ai server
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+# Crawl4AI API
+
+This is a FastAPI-based web API that exposes the Crawl4AI web scraping library as an HTTP service.
+
+## API Endpoints
+
+### POST /crawl
+Crawls the specified URL and returns its content as Markdown.
+
+Example request:
+```json
+{
+    "url": "https://example.com",
+    "cache_mode": "ENABLED",
+    "excluded_tags": ["nav", "footer", "aside"],
+    "remove_overlay_elements": true
+}
+```
+
+### GET /
+Returns basic information about the API.
+
+## Documentation
+- Interactive API documentation: `/docs`
+- ReDoc documentation: `/redoc`
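For illustration, here is a minimal Python client for the `POST /crawl` endpoint documented above. It is a sketch, not part of the commit: it assumes the server is reachable at `http://localhost:7860` (the port from the Dockerfile's CMD) and that the `requests` package is installed, which is not listed in requirements.txt.

```python
import requests  # assumed to be installed separately; not in requirements.txt

# Request body mirrors the CrawlRequest model / README example above.
payload = {
    "url": "https://example.com",
    "cache_mode": "ENABLED",
    "excluded_tags": ["nav", "footer", "aside"],
    "remove_overlay_elements": True,
}

# Base URL is an assumption: uvicorn listens on port 7860 per the Dockerfile CMD.
resp = requests.post("http://localhost:7860/crawl", json=payload, timeout=120)
resp.raise_for_status()

# Response fields follow the CrawlResponse model: url, markdown, success, error.
data = resp.json()
if data["success"]:
    print(data["markdown"][:500])
```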
app.py
ADDED
@@ -0,0 +1,65 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, HttpUrl
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+import uvicorn
+import asyncio
+import nest_asyncio
+
+# Apply nest_asyncio to allow nested event loops
+nest_asyncio.apply()
+
+app = FastAPI(
+    title="Crawl4AI API",
+    description="A web API for the Crawl4AI web scraping service",
+    version="1.0.0"
+)
+
+class CrawlRequest(BaseModel):
+    url: HttpUrl
+    cache_mode: str = "ENABLED"
+    excluded_tags: list[str] = ["nav", "footer", "aside"]
+    remove_overlay_elements: bool = True
+
+class CrawlResponse(BaseModel):
+    url: str
+    markdown: str
+    success: bool
+    error: str | None = None
+
+@app.post("/crawl", response_model=CrawlResponse)
+async def crawl_url(request: CrawlRequest):
+    try:
+        # Convert the cache_mode string to the CacheMode enum
+        cache_mode = getattr(CacheMode, request.cache_mode)
+
+        async with AsyncWebCrawler() as crawler:
+            config = CrawlerRunConfig(
+                cache_mode=cache_mode,
+                excluded_tags=request.excluded_tags,
+                remove_overlay_elements=request.remove_overlay_elements
+            )
+
+            result = await crawler.arun(
+                url=str(request.url),
+                config=config
+            )
+
+            return CrawlResponse(
+                url=str(request.url),
+                markdown=result.markdown_v2.raw_markdown,
+                success=result.success
+            )
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/")
+def read_root():
+    return {
+        "message": "Welcome to the Crawl4AI API",
+        "docs": "/docs",
+        "redoc": "/redoc"
+    }
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)
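As a quick sanity check of the route wiring in app.py, the app can be exercised in-process with FastAPI's bundled TestClient. This is a sketch under a few assumptions: app.py is importable from the working directory, the httpx test dependency is available, and no real crawl is triggered (the root route is static, and an invalid cache_mode fails before the crawler starts).

```python
from fastapi.testclient import TestClient

from app import app  # assumes app.py is on the import path

client = TestClient(app)

# GET / returns static metadata, so no crawling or browser is involved.
resp = client.get("/")
assert resp.status_code == 200
assert resp.json()["docs"] == "/docs"

# An unknown cache_mode makes getattr(CacheMode, ...) raise AttributeError,
# which the handler converts into an HTTP 500 before any crawl begins.
resp = client.post("/crawl", json={"url": "https://example.com", "cache_mode": "BOGUS"})
assert resp.status_code == 500
```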
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+fastapi
+uvicorn
+crawl4ai
+nest-asyncio
+pydantic
+python-multipart