Commit · 85f58d9
Parent(s): 6c4f9d7
update api handle 3
app.py CHANGED

@@ -23,7 +23,6 @@ class CrawlRequest(BaseModel):
     cache_mode: str = "ENABLED"
     excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
     remove_overlay_elements: bool = True
-    max_pages: int = 1  # Limit number of pages to crawl
     timeout: int = 30  # Timeout in seconds
 
 class Article(BaseModel):
@@ -169,15 +168,14 @@ async def crawl_url(request: CrawlRequest):
     try:
         cache_mode = getattr(CacheMode, request.cache_mode)
 
-        # Create crawler with
+        # Create crawler with correct configuration parameters
         async with AsyncWebCrawler() as crawler:
             config = CrawlerRunConfig(
                 cache_mode=cache_mode,
                 excluded_tags=request.excluded_tags,
                 remove_overlay_elements=request.remove_overlay_elements,
-                max_pages=request.max_pages,
                 timeout=request.timeout,
-                #
+                # Core features from documentation
                 remove_ads=True,
                 extract_text=True,
                 extract_links=True,
@@ -189,11 +187,11 @@ async def crawl_url(request: CrawlRequest):
                 config=config
             )
 
-            # Use both markdown and HTML results
+            # Use both markdown and HTML results
             markdown = result.markdown_v2.raw_markdown
             html = result.html
 
-            # Extract content
+            # Extract content
             articles = extract_articles(markdown)
             metadata = extract_metadata(markdown, html)
 
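For orientation, below is a consolidated sketch of how the touched code reads after this commit. It is assembled only from the lines visible in the diff; the url field on CrawlRequest, the crawler.arun(...) call that sits between the second and third hunks, the stubbed extract_articles / extract_metadata helpers, the placeholder return value, and the error handling are assumptions added so the sketch runs on its own. Whether CrawlerRunConfig actually accepts remove_ads, extract_text, and extract_links depends on the installed crawl4ai version; this only consolidates what the diff shows.

# Sketch only: reconstructed from the diff, not the full app.py.
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from pydantic import BaseModel


class CrawlRequest(BaseModel):
    url: str  # assumed field; the diff shows only the options below
    cache_mode: str = "ENABLED"
    excluded_tags: list[str] = ["nav", "footer", "aside", "header", "script", "style"]
    remove_overlay_elements: bool = True
    timeout: int = 30  # Timeout in seconds


def extract_articles(markdown: str) -> list:
    # Defined elsewhere in app.py; stubbed here so the sketch is self-contained.
    return []


def extract_metadata(markdown: str, html: str) -> dict:
    # Defined elsewhere in app.py; stubbed here so the sketch is self-contained.
    return {}


async def crawl_url(request: CrawlRequest):
    try:
        cache_mode = getattr(CacheMode, request.cache_mode)

        # Create crawler with correct configuration parameters
        async with AsyncWebCrawler() as crawler:
            config = CrawlerRunConfig(
                cache_mode=cache_mode,
                excluded_tags=request.excluded_tags,
                remove_overlay_elements=request.remove_overlay_elements,
                timeout=request.timeout,
                # Core features from documentation
                remove_ads=True,
                extract_text=True,
                extract_links=True,
            )

            # The arun(...) call itself sits between the diff hunks; url kwarg assumed.
            result = await crawler.arun(url=request.url, config=config)

            # Use both markdown and HTML results
            markdown = result.markdown_v2.raw_markdown
            html = result.html

            # Extract content
            articles = extract_articles(markdown)
            metadata = extract_metadata(markdown, html)

            # Response construction continues past the end of the diff; placeholder return.
            return {"articles": articles, "metadata": metadata}
    except AttributeError:
        # getattr() raises this for an unknown cache_mode name; the real
        # error handling in app.py is outside the diff.
        raise

One practical note on the first hunk: assuming app.py is a FastAPI service, as the Pydantic request model suggests, clients that still send max_pages in the request body will have the field silently ignored under Pydantic's default extra-field handling, so removing it does not break existing payloads; it only means the crawl is no longer page-limited on the server side.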