from pathlib import Path

import aiofiles
import crawl4ai
import httpx
import pytest


async def crawl4ai_extract_markdown_of_url_async(url: str) -> str:
    """Extract markdown content from a URL using crawl4ai.

    Args:
        url: The web page to crawl.

    Returns:
        The page content rendered as markdown.
    """
    async with crawl4ai.AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url)
        return result.markdown


async def download_pdf_async(url: str, output_path: Path) -> Path:
    """Download a file from *url* and write it to *output_path*.

    Args:
        url: Direct URL of the file to download.
        output_path: Destination path for the downloaded bytes.

    Returns:
        The path the file was written to (the same *output_path* object).

    Raises:
        httpx.HTTPStatusError: If the server responds with a 4xx/5xx status.
        httpx.TimeoutException: If the request exceeds the configured timeout.
    """
    # Generous overall timeout for large PDFs, but fail fast on connect.
    timeout = httpx.Timeout(30.0, connect=10.0)
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()
        async with aiofiles.open(output_path, "wb") as f:
            await f.write(response.content)
    return output_path


def _arxiv_abs_to_pdf_url(url: str) -> str:
    """Convert an arXiv abstract URL (``.../abs/<id>``) to its PDF URL.

    URLs that do not contain ``/abs/`` are returned unchanged (assumed to
    already point at a PDF or other directly downloadable resource).
    """
    if "/abs/" not in url:
        return url
    arxiv_id = url.split("/abs/")[1].rstrip("/")
    return f"https://arxiv.org/pdf/{arxiv_id}.pdf"


async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> Path:
    """Download a PDF from arXiv by converting the abstract URL to a PDF URL.

    Works also for non-arXiv URLs, which are downloaded as-is.

    Args:
        url: An arXiv abstract URL or a direct PDF URL.
        output_path: Destination path for the downloaded PDF.

    Returns:
        The path the PDF was written to.
    """
    return await download_pdf_async(_arxiv_abs_to_pdf_url(url), output_path)