Commit c0e3878
Parent(s): f300b39
update api handle 3

Files changed:
- app.py +77 -4
- requirements.txt +8 -6
app.py
CHANGED
@@ -4,6 +4,10 @@ from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
 import uvicorn
 import asyncio
 import nest_asyncio
+import re
+from typing import Optional, List, Dict
+from bs4 import BeautifulSoup
+from datetime import datetime
 
 # Apply nest_asyncio to allow nested event loops
 nest_asyncio.apply()
@@ -20,11 +24,72 @@ class CrawlRequest(BaseModel):
     excluded_tags: list[str] = ["nav", "footer", "aside"]
     remove_overlay_elements: bool = True
 
+class Article(BaseModel):
+    title: str
+    url: str
+    description: Optional[str] = None
+    image_url: Optional[str] = None
+    timestamp: Optional[str] = None
+    category: Optional[str] = None
+
 class CrawlResponse(BaseModel):
     url: str
-    markdown: str
     success: bool
-    error: str = None
+    error: Optional[str] = None
+    metadata: Dict = {}
+    articles: List[Article] = []
+    raw_markdown: Optional[str] = None
+
+def extract_articles(markdown: str) -> List[Article]:
+    articles = []
+
+    # Extract articles using regex
+    article_pattern = r'\[(.*?)\]\((.*?)\)(.*?)(?=\[|$)'
+    matches = re.finditer(article_pattern, markdown, re.DOTALL)
+
+    for match in matches:
+        title = match.group(1).strip()
+        url = match.group(2).replace('<', '').replace('>', '')
+        description = match.group(3).strip()
+
+        # Skip navigation links and other non-article content
+        if any(skip in title.lower() for skip in ['...', 'navigation', 'menu', 'logo', 'existing code']):
+            continue
+
+        # Extract image URL if present
+        image_url = None
+        image_match = re.search(r'!\[(.*?)\]\((.*?)\)', description)
+        if image_match:
+            image_url = image_match.group(2)
+            description = description.replace(image_match.group(0), '').strip()
+
+        # Clean up description
+        description = re.sub(r'\[(.*?)\]\((.*?)\)', '', description).strip()
+        if description and len(description) > 3:  # Only include if description is meaningful
+            article = Article(
+                title=title,
+                url=url,
+                description=description,
+                image_url=image_url
+            )
+            articles.append(article)
+
+    return articles
+
+def extract_metadata(markdown: str) -> Dict:
+    metadata = {
+        "timestamp": datetime.now().isoformat(),
+        "categories": [],
+        "total_articles": 0
+    }
+
+    # Extract categories
+    category_pattern = r'##\s+\[(.*?)\]'
+    categories = re.findall(category_pattern, markdown)
+    if categories:
+        metadata["categories"] = [cat.strip() for cat in categories]
+
+    return metadata
 
 @app.post("/crawl", response_model=CrawlResponse)
 async def crawl_url(request: CrawlRequest):
@@ -44,10 +109,18 @@ async def crawl_url(request: CrawlRequest):
                 config=config
             )
 
+            # Extract articles and metadata
+            markdown = result.markdown_v2.raw_markdown
+            articles = extract_articles(markdown)
+            metadata = extract_metadata(markdown)
+            metadata["total_articles"] = len(articles)
+
             return CrawlResponse(
                 url=str(request.url),
-
-
+                success=result.success,
+                metadata=metadata,
+                articles=articles,
+                raw_markdown=markdown if result.success else None
            )
 
     except Exception as e:
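The new extract_articles and extract_metadata helpers are plain functions of the crawler's markdown output, so their behavior can be checked without running the server. Below is a minimal standalone sketch of the same regex logic; sample_markdown, the URLs in it, and the printed results are invented for illustration and are not part of this commit.

import re

# Invented sample of crawler markdown output (not from this commit).
sample_markdown = (
    "## [World](https://example.com/world)\n"
    "[Flood warnings issued](https://example.com/world/floods) "
    "Heavy rain is expected across the region through Friday. "
    "[Menu](https://example.com/menu)"
)

# Same article pattern as extract_articles: link text, link target,
# then everything up to the next "[" (or end of input) as description.
article_pattern = r'\[(.*?)\]\((.*?)\)(.*?)(?=\[|$)'
for m in re.finditer(article_pattern, sample_markdown, re.DOTALL):
    title = m.group(1).strip()
    description = re.sub(r'\[(.*?)\]\((.*?)\)', '', m.group(3)).strip()
    # Mirror the commit's filters: drop chrome-like titles and entries
    # without a meaningful description.
    if any(s in title.lower() for s in ['...', 'navigation', 'menu', 'logo']):
        continue
    if description and len(description) > 3:
        print(f"{title} -> {m.group(2)}: {description}")

# extract_metadata's category pattern collects "## [World](...)" headers.
print(re.findall(r'##\s+\[(.*?)\]', sample_markdown))
# Prints:
#   Flood warnings issued -> https://example.com/world/floods: Heavy rain is expected across the region through Friday.
#   ['World']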
requirements.txt
CHANGED
@@ -1,6 +1,8 @@
-fastapi
-uvicorn
-
-
-
-
+fastapi>=0.109.0
+uvicorn>=0.27.0
+pydantic>=2.5.3
+beautifulsoup4>=4.12.0
+crawl4ai>=0.1.0
+nest-asyncio>=1.6.0
+python-multipart>=0.0.6
+typing-extensions>=4.9.0
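With the pinned dependencies installed, the reshaped /crawl response can be smoke-tested end to end. A hypothetical client call, assuming the app is served locally on port 8000 and that the requests package (not listed in requirements.txt) is available:

# Hypothetical smoke test for the updated /crawl response shape; host,
# port, and target URL are assumptions, not part of the commit.
import requests

resp = requests.post(
    "http://localhost:8000/crawl",
    json={"url": "https://example.com/news"},
)
data = resp.json()
print(data["success"], data["metadata"].get("total_articles"))
for article in data["articles"]:
    print(article["title"], "->", article["url"])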