Spaces:

re-mind
/

Crawl4AI

Running

Crawl4AI / crawl4ai /docs_manager.py

amaye15

test

03c0888 about 1 month ago

2.93 kB

	import requests
	import shutil
	from pathlib import Path
	from crawl4ai.async_logger import AsyncLogger
	from crawl4ai.llmtxt import AsyncLLMTextManager

	class DocsManager:
	    """Manage the on-disk cache of Crawl4AI LLM documentation.

	    Markdown docs are cached under ``~/.crawl4ai/docs``. They are copied
	    from the ``docs/llm.txt`` directory found in the parent of this file's
	    directory when that directory holds docs, and downloaded from the
	    project's GitHub repository otherwise. Generation and search are
	    delegated to an ``AsyncLLMTextManager`` built on the cache directory.
	    """

	    # Seconds to wait on each GitHub request, so that a stalled connection
	    # cannot hang the caller indefinitely.
	    _REQUEST_TIMEOUT = 30

	    def __init__(self, logger=None):
	        """Create the cache directory and wire up the helpers.

	        Args:
	            logger: Logger used for error reporting. When omitted (or
	                falsy) a verbose ``AsyncLogger`` is created.
	        """
	        self.docs_dir = Path.home() / ".crawl4ai" / "docs"
	        self.local_docs = Path(__file__).parent.parent / "docs" / "llm.txt"
	        self.docs_dir.mkdir(parents=True, exist_ok=True)
	        self.logger = logger or AsyncLogger(verbose=True)
	        self.llm_text = AsyncLLMTextManager(self.docs_dir, self.logger)

	    async def ensure_docs_exist(self):
	        """Fetch the docs when the cache directory has no entries at all."""
	        if not any(self.docs_dir.iterdir()):
	            await self.fetch_docs()

	    async def fetch_docs(self) -> bool:
	        """Refresh the cached docs from the local directory or from GitHub.

	        NOTE(review): ``requests`` is synchronous, so the GitHub fallback
	        blocks the event loop while it downloads.

	        Returns:
	            True once the docs have been copied or downloaded.

	        Raises:
	            Exception: Any failure is logged through ``self.logger`` and
	                re-raised unchanged.
	        """
	        try:
	            # Try local first. The patterns need the "*" wildcard: a bare
	            # ".md" pattern only matches a file literally named ".md".
	            if self.local_docs.exists() and (
	                any(self.local_docs.glob("*.md"))
	                or any(self.local_docs.glob("*.tokens"))
	            ):
	                # Drop the cached markdown first so docs that no longer
	                # exist locally do not linger in the cache.
	                for stale in self.docs_dir.glob("*.md"):
	                    stale.unlink()
	                # NOTE: only markdown is mirrored; *.tokens files are
	                # neither removed from nor copied into the cache.
	                for source in self.local_docs.glob("*.md"):
	                    shutil.copy2(source, self.docs_dir / source.name)
	                return True

	            # Fallback to GitHub
	            response = requests.get(
	                "https://api.github.com/repos/unclecode/crawl4ai/contents/docs/llm.txt",
	                headers={'Accept': 'application/vnd.github.v3+json'},
	                timeout=self._REQUEST_TIMEOUT,
	            )
	            response.raise_for_status()

	            for item in response.json():
	                if item['type'] == 'file' and item['name'].endswith('.md'):
	                    download = requests.get(
	                        item['download_url'],
	                        timeout=self._REQUEST_TIMEOUT,
	                    )
	                    # Fail loudly rather than caching an HTTP error page
	                    # as if it were documentation.
	                    download.raise_for_status()
	                    target = self.docs_dir / item['name']
	                    target.write_text(download.text, encoding='utf-8')
	            return True

	        except Exception as e:
	            self.logger.error(f"Failed to fetch docs: {str(e)}")
	            raise

	    # NOTE: this method shadows the builtin ``list`` inside the class body,
	    # so methods defined below it must not use ``list[...]`` annotations.
	    def list(self) -> list[str]:
	        """List available topics.

	        Topics are the stems of the cached ``*.md`` files. For a stem that
	        starts with a digit, everything up to the first underscore is
	        removed. The ``.xs.md`` and ``.q.md`` variants are left out.

	        Returns:
	            Topic names, in the order the files are yielded by ``glob``.
	        """
	        topics = []
	        for doc_path in self.docs_dir.glob("*.md"):
	            name = doc_path.stem
	            # Strip the numeric ordering prefix. The "_" check keeps a
	            # digit-leading stem without an underscore from raising
	            # IndexError.
	            if name[:1].isdigit() and "_" in name:
	                name = name.split("_", 1)[1]
	            # Exclude those that end with .xs.md and .q.md
	            if name.endswith((".xs", ".q")):
	                continue
	            topics.append(name)
	        return topics

	    def generate(self, sections, mode="extended"):
	        """Delegate to ``self.llm_text.generate`` and return its result."""
	        return self.llm_text.generate(sections, mode)

	    def search(self, query: str, top_k: int = 5):
	        """Delegate to ``self.llm_text.search`` and return its result."""
	        return self.llm_text.search(query, top_k)