Commit · 85f58d9
Parent(s): 6c4f9d7
update api handle 3
app.py CHANGED

@@ -23,7 +23,6 @@ class CrawlRequest(BaseModel):
     cache_mode: str = "ENABLED"
     excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
     remove_overlay_elements: bool = True
-    max_pages: int = 1  # Limit number of pages to crawl
     timeout: int = 30  # Timeout in seconds
 
 class Article(BaseModel):
@@ -169,15 +168,14 @@ async def crawl_url(request: CrawlRequest):
     try:
         cache_mode = getattr(CacheMode, request.cache_mode)
 
-        # Create crawler with
+        # Create crawler with correct configuration parameters
         async with AsyncWebCrawler() as crawler:
             config = CrawlerRunConfig(
                 cache_mode=cache_mode,
                 excluded_tags=request.excluded_tags,
                 remove_overlay_elements=request.remove_overlay_elements,
-                max_pages=request.max_pages,
                 timeout=request.timeout,
-                #
+                # Core features from documentation
                 remove_ads=True,
                 extract_text=True,
                 extract_links=True,
@@ -189,11 +187,11 @@ async def crawl_url(request: CrawlRequest):
                 config=config
             )
 
-            # Use both markdown and HTML results
+            # Use both markdown and HTML results
             markdown = result.markdown_v2.raw_markdown
             html = result.html
 
-            # Extract content
+            # Extract content
             articles = extract_articles(markdown)
             metadata = extract_metadata(markdown, html)
 
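For orientation, below is a consolidated sketch of how the touched code reads after this commit. It is assembled only from the lines visible in the diff; the url field on CrawlRequest, the crawler.arun(...) call that sits between the second and third hunks, the stubbed extract_articles / extract_metadata helpers, the placeholder return value, and the error handling are assumptions added so the sketch runs on its own. Whether CrawlerRunConfig actually accepts remove_ads, extract_text, and extract_links depends on the installed crawl4ai version; this only consolidates what the diff shows.

# Sketch only: reconstructed from the diff, not the full app.py.
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from pydantic import BaseModel


class CrawlRequest(BaseModel):
    url: str  # assumed field; the diff shows only the options below
    cache_mode: str = "ENABLED"
    excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
    remove_overlay_elements: bool = True
    timeout: int = 30  # Timeout in seconds


def extract_articles(markdown: str) -> list:
    # Defined elsewhere in app.py; stubbed here so the sketch is self-contained.
    return []


def extract_metadata(markdown: str, html: str) -> dict:
    # Defined elsewhere in app.py; stubbed here so the sketch is self-contained.
    return {}


async def crawl_url(request: CrawlRequest):
    try:
        cache_mode = getattr(CacheMode, request.cache_mode)

        # Create crawler with correct configuration parameters
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                cache_mode=cache_mode,
                excluded_tags=request.excluded_tags,
                remove_overlay_elements=request.remove_overlay_elements,
                timeout=request.timeout,
                # Core features from documentation
                remove_ads=True,
                extract_text=True,
                extract_links=True,
            )

            # The arun(...) call itself sits between the diff hunks; url kwarg assumed.
            result = await crawler.arun(url=request.url, config=config)

            # Use both markdown and HTML results
            markdown = result.markdown_v2.raw_markdown
            html = result.html

            # Extract content
            articles = extract_articles(markdown)
            metadata = extract_metadata(markdown, html)

            # Response construction continues past the end of the diff; placeholder return.
            return {"articles": articles, "metadata": metadata}
    except AttributeError:
        # getattr() raises this for an unknown cache_mode name; the real
        # error handling in app.py is outside the diff.
        raise

One practical note on the first hunk: assuming app.py is a FastAPI service, as the Pydantic request model suggests, clients that still send max_pages in the request body will have the field silently ignored under Pydantic's default extra-field handling, so removing it does not break existing payloads; it only means the crawl is no longer page-limited on the server side.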