Spaces:
Runtime error
Runtime error
| from pathlib import Path | |
| import aiofiles | |
| import crawl4ai | |
| import httpx | |
| import pytest | |
async def crawl4ai_extract_markdown_of_url_async(url: str) -> str:
    """Crawl ``url`` with crawl4ai and return the markdown of the result.

    Args:
        url: Address handed to ``AsyncWebCrawler.arun``.

    Returns:
        The ``markdown`` attribute of the crawl result.
    """
    # The crawler is used as an async context manager so it is torn down
    # when the block exits, including when ``arun`` raises.
    async with crawl4ai.AsyncWebCrawler() as web_crawler:
        crawl_result = await web_crawler.arun(url=url)
        # Read the attribute while the crawler is still open, as before.
        page_markdown = crawl_result.markdown
        return page_markdown
async def download_pdf_async(url: str, output_path: Path) -> Path:
    """Fetch ``url`` with an HTTP GET and write the response body to a file.

    The complete body is held in memory and written only after the response
    status has been checked, so a failed request does not touch
    ``output_path``. The content is not inspected: whatever the server
    returns is written as-is, whether or not it is actually a PDF.

    Args:
        url: Address to download; redirects are followed.
        output_path: Destination file, opened in binary write mode.

    Returns:
        ``output_path``, exactly as it was passed in.

    Raises:
        httpx.HTTPStatusError: If the final response has an error status
            (raised by ``response.raise_for_status()``).
    """
    # 30 s for the general timeouts, but only 10 s to establish the connection.
    timeout = httpx.Timeout(30.0, connect=10.0)
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        response = await client.get(url)
        # Fail before opening the output file so no file is created on error.
        response.raise_for_status()
        async with aiofiles.open(output_path, "wb") as f:
            await f.write(response.content)
    return output_path
def _resolve_pdf_url(url: str) -> str:
    """Map an arXiv abstract URL to its PDF URL; return any other URL as is."""
    # Local import: the only stdlib name this block adds to the file.
    from urllib.parse import urlsplit

    try:
        parts = urlsplit(url)
        if not parts.netloc:
            # Scheme-less input such as "arxiv.org/abs/2301.00001": re-parse
            # it as a network location so the host can still be recognised.
            parts = urlsplit(f"//{url}")
        host = (parts.hostname or "").lower()
    except ValueError:
        # Not parseable as a URL; leave it for the downloader to reject.
        return url

    # Accept arxiv.org itself and its subdomains (e.g. export.arxiv.org),
    # but not unrelated hosts whose path merely contains "/abs/".
    on_arxiv = host == "arxiv.org" or host.endswith(".arxiv.org")
    if not (on_arxiv and parts.path.startswith("/abs/")):
        return url

    # Take the ID from the path only, so a query string or fragment on the
    # abstract URL never ends up in front of the ".pdf" suffix.
    arxiv_id = parts.path[len("/abs/"):].rstrip("/")
    return f"https://arxiv.org/pdf/{arxiv_id}.pdf"


async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> Path:
    """Download a PDF, converting an arXiv abstract URL to its PDF URL first.

    An arXiv abstract URL (``.../abs/<id>`` on ``arxiv.org`` or one of its
    subdomains) is rewritten to ``https://arxiv.org/pdf/<id>.pdf``. Any other
    URL, including one on another host that happens to contain ``/abs/``, is
    downloaded unchanged.

    Args:
        url: arXiv abstract URL, or a direct URL of the file to download.
        output_path: Destination file, forwarded to ``download_pdf_async``.

    Returns:
        Whatever ``download_pdf_async`` returns for the resolved URL.
    """
    pdf_url = _resolve_pdf_url(url)
    return await download_pdf_async(pdf_url, output_path)