NightFury2710 committed
Commit 59bbc6d · 1 Parent(s): 060a692

init repos
Files changed (4)
  1. Dockerfile +27 -0
  2. README.md +27 -0
  3. app.py +65 -0
  4. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ FROM python:3.10-slim
+
+ WORKDIR /code
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     wget \
+     gnupg \
+     && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
+     && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list \
+     && apt-get update \
+     && apt-get install -y google-chrome-stable \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install Python packages
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Install playwright and browsers
+ RUN playwright install chromium
+ RUN playwright install-deps
+
+ # Copy application code
+ COPY app.py .
+
+ # Run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
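
For reference, a minimal sketch of building and testing this image locally (assumes Docker is installed; the image tag `crawl4ai-api` is illustrative):

```bash
# Build the image from the repository root
docker build -t crawl4ai-api .

# Run it, publishing the port the CMD binds (7860) to the host
docker run --rm -p 7860:7860 crawl4ai-api

# The API should then be reachable at http://localhost:7860
```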
README.md CHANGED
@@ -9,3 +9,30 @@ short_description: my Crawl4ai server
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+ # Crawl4AI API
+
+ A FastAPI-based web API that exposes Crawl4AI as a web-scraping service.
+
+ ## API Endpoints
+
+ ### POST /crawl
+ Crawls a specified URL and returns its content in markdown format.
+
+ Example request:
+ ```json
+ {
+   "url": "https://example.com",
+   "cache_mode": "ENABLED",
+   "excluded_tags": ["nav", "footer", "aside"],
+   "remove_overlay_elements": true
+ }
+ ```
+
+ ### GET /
+ Returns basic information about the API.
+
+ ## Documentation
+ - Interactive API documentation: `/docs`
+ - ReDoc documentation: `/redoc`
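
For quick testing from a shell, a minimal sketch using curl (assumes the server is running locally on port 7860, as configured in the Dockerfile):

```bash
# Crawl a page and get its markdown back
curl -X POST http://localhost:7860/crawl \
  -H "Content-Type: application/json" \
  -d '{"url": "https://example.com", "cache_mode": "ENABLED", "excluded_tags": ["nav", "footer", "aside"], "remove_overlay_elements": true}'

# Basic API information
curl http://localhost:7860/
```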
app.py ADDED
@@ -0,0 +1,65 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel, HttpUrl
+ from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+ import uvicorn
+ import asyncio
+ import nest_asyncio
+
+ # Apply nest_asyncio to allow nested event loops
+ nest_asyncio.apply()
+
+ app = FastAPI(
+     title="Crawl4AI API",
+     description="A web API for the Crawl4AI web-scraping service",
+     version="1.0.0"
+ )
+
+ class CrawlRequest(BaseModel):
+     url: HttpUrl
+     cache_mode: str = "ENABLED"
+     excluded_tags: list[str] = ["nav", "footer", "aside"]
+     remove_overlay_elements: bool = True
+
+ class CrawlResponse(BaseModel):
+     url: str
+     markdown: str
+     success: bool
+     error: str | None = None
+
+ @app.post("/crawl", response_model=CrawlResponse)
+ async def crawl_url(request: CrawlRequest):
+     try:
+         # Convert the cache_mode string to the CacheMode enum
+         cache_mode = getattr(CacheMode, request.cache_mode)
+
+         async with AsyncWebCrawler() as crawler:
+             config = CrawlerRunConfig(
+                 cache_mode=cache_mode,
+                 excluded_tags=request.excluded_tags,
+                 remove_overlay_elements=request.remove_overlay_elements
+             )
+
+             result = await crawler.arun(
+                 url=str(request.url),
+                 config=config
+             )
+
+             return CrawlResponse(
+                 url=str(request.url),
+                 markdown=result.markdown_v2.raw_markdown,
+                 success=result.success
+             )
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.get("/")
+ def read_root():
+     return {
+         "message": "Welcome to the Crawl4AI API",
+         "docs": "/docs",
+         "redoc": "/redoc"
+     }
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=7860)
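
For completeness, a minimal Python client sketch against this endpoint; it assumes the server is up on localhost:7860 and uses the `requests` library, which is not in requirements.txt and is only for illustration:

```python
import requests

# Illustrative base URL; matches the host/port uvicorn binds in app.py
BASE_URL = "http://localhost:7860"

payload = {
    "url": "https://example.com",
    "cache_mode": "ENABLED",  # must name a crawl4ai CacheMode member
    "excluded_tags": ["nav", "footer", "aside"],
    "remove_overlay_elements": True,
}

resp = requests.post(f"{BASE_URL}/crawl", json=payload, timeout=120)
resp.raise_for_status()

data = resp.json()
if data["success"]:
    print(data["markdown"][:500])  # first 500 characters of the extracted markdown
else:
    print("Crawl failed:", data.get("error"))
```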
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ fastapi
+ uvicorn
+ crawl4ai
+ nest-asyncio
+ pydantic
+ python-multipart