import gradio as gr
import requests
from bs4 import BeautifulSoup
import json
from typing import List, Dict, Any, Optional
import re
from urllib.parse import urljoin
import time
import functools
import logging
from datetime import datetime, timedelta
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Short class name keeps the generated MCP tool names from getting too long
class HF_API:
def __init__(self):
self.base_url = "https://huggingface.co"
self.docs_url = "https://huggingface.co/docs"
self.api_url = "https://huggingface.co/api"
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'HF-Info-Server/1.0 (Educational Purpose)',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
})
self.cache = {}
self.cache_ttl = 3600 # 1 hour cache TTL
def _is_cache_valid(self, cache_key: str) -> bool:
if cache_key not in self.cache:
return False
cache_time = self.cache[cache_key].get('timestamp', 0)
return time.time() - cache_time < self.cache_ttl
def _get_from_cache(self, cache_key: str) -> Optional[str]:
if self._is_cache_valid(cache_key):
return self.cache[cache_key]['content']
return None
def _store_in_cache(self, cache_key: str, content: str):
self.cache[cache_key] = {
'content': content,
'timestamp': time.time()
}
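# Illustrative cache round-trip (hypothetical key; real keys are derived from
# the URL hash in _fetch_with_retry below):
#   api = HF_API()
#   api._store_in_cache("url_123", "<html>...</html>")
#   api._get_from_cache("url_123")  # -> "<html>...</html>" while the 1-hour TTL holds
#   # once cache_ttl seconds elapse, the same call returns None and the URL is re-fetched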
def _fetch_with_retry(self, url: str, max_retries: int = 3) -> Optional[str]:
cache_key = f"url_{hash(url)}"
cached_content = self._get_from_cache(cache_key)
if cached_content:
logger.info(f"Cache hit for {url}")
return cached_content
for attempt in range(max_retries):
try:
logger.info(f"Fetching {url} (attempt {attempt + 1})")
response = self.session.get(url, timeout=20)
response.raise_for_status()
content = response.text
self._store_in_cache(cache_key, content)
return content
except requests.exceptions.RequestException as e:
logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
if attempt < max_retries - 1:
time.sleep(2 ** attempt)
else:
logger.error(f"All attempts failed for {url}")
return None
return None
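# Note on the retry schedule above: with max_retries=3 the exponential backoff
# sleeps 2**0 = 1s, then 2**1 = 2s between attempts; after the last failure the
# method returns None so callers can degrade gracefully instead of raising.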
def _extract_code_examples(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
code_blocks = []
code_elements = soup.find_all(['code', 'pre'])
for code_elem in code_elements:
lang_class = code_elem.get('class', [])
language = 'python'
for cls in lang_class:
if 'language-' in str(cls):
language = str(cls).replace('language-', '')
break
elif any(lang in str(cls).lower() for lang in ['python', 'bash', 'javascript', 'json']):
language = str(cls).lower()
break
code_text = code_elem.get_text(strip=True)
if len(code_text) > 20 and any(keyword in code_text.lower() for keyword in ['import', 'from', 'def', 'class', 'pip install', 'transformers']):
code_blocks.append({'code': code_text, 'language': language, 'type': 'usage' if any(word in code_text.lower() for word in ['import', 'load', 'pipeline']) else 'example'})
highlight_blocks = soup.find_all('div', class_=re.compile(r'highlight|code-block|language'))
for block in highlight_blocks:
code_text = block.get_text(strip=True)
if len(code_text) > 20:
code_blocks.append({'code': code_text, 'language': 'python', 'type': 'example'})
seen = set()
unique_blocks = []
for block in code_blocks:
code_hash = hash(block['code'][:100])
if code_hash not in seen:
seen.add(code_hash)
unique_blocks.append(block)
if len(unique_blocks) >= 5:
break
return unique_blocks
def _extract_practical_content(self, soup: BeautifulSoup, topic: str) -> Dict[str, Any]:
content = {'overview': '', 'code_examples': [], 'usage_instructions': [], 'parameters': [], 'methods': [], 'installation': '', 'quickstart': ''}
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile(r'content|docs|prose'))
if not main_content:
return content
overview_sections = main_content.find_all('p', limit=5)
overview_texts = []
for p in overview_sections:
text = p.get_text(strip=True)
if len(text) > 30 and not text.startswith('Table of contents'):
overview_texts.append(text)
if overview_texts:
overview = ' '.join(overview_texts)
content['overview'] = overview[:1000] + "..." if len(overview) > 1000 else overview
content['code_examples'] = self._extract_code_examples(main_content)
install_headings = main_content.find_all(['h1', 'h2', 'h3', 'h4'], string=re.compile(r'install|setup|getting started', re.IGNORECASE))
for heading in install_headings:
next_elem = heading.find_next_sibling()
install_text = []
while next_elem and next_elem.name not in ['h1', 'h2', 'h3', 'h4'] and len(install_text) < 3:
if next_elem.name in ['p', 'pre', 'code']:
text = next_elem.get_text(strip=True)
if text and len(text) > 10:
install_text.append(text)
next_elem = next_elem.find_next_sibling()
if install_text:
content['installation'] = ' '.join(install_text)
break
usage_headings = main_content.find_all(['h1', 'h2', 'h3', 'h4'])
for heading in usage_headings:
heading_text = heading.get_text(strip=True).lower()
if any(keyword in heading_text for keyword in ['usage', 'example', 'how to', 'quickstart', 'getting started']):
next_elem = heading.find_next_sibling()
instruction_parts = []
while next_elem and next_elem.name not in ['h1', 'h2', 'h3', 'h4']:
if next_elem.name in ['p', 'li', 'div', 'ol', 'ul']:
text = next_elem.get_text(strip=True)
if text and len(text) > 15:
instruction_parts.append(text)
next_elem = next_elem.find_next_sibling()
if len(instruction_parts) >= 5:
break
if instruction_parts:
content['usage_instructions'].extend(instruction_parts)
tables = main_content.find_all('table')
for table in tables:
headers = [th.get_text(strip=True).lower() for th in table.find_all('th')]
if any(keyword in ' '.join(headers) for keyword in ['parameter', 'argument', 'option', 'attribute', 'name', 'type']):
rows = table.find_all('tr')[1:]
for row in rows[:8]:
cells = [td.get_text(strip=True) for td in row.find_all('td')]
if len(cells) >= 2:
param_info = {'name': cells[0], 'description': cells[1] if len(cells) > 1 else '', 'type': cells[2] if len(cells) > 2 else '', 'default': cells[3] if len(cells) > 3 else ''}
content['parameters'].append(param_info)
return content
def search_documentation(self, query: str, max_results: int = 3) -> str:
"""
Searches the official Hugging Face documentation for a specific topic and returns a summary.
This tool is useful for finding how-to guides, explanations of concepts like 'pipeline' or 'tokenizer', and usage examples.
Args:
query (str): The topic or keyword to search for in the documentation (e.g., 'fine-tuning', 'peft', 'datasets').
max_results (int): The maximum number of documentation pages to retrieve and summarize. Defaults to 3.
"""
try:
max_results = int(max_results) if str(max_results).isdigit() else 3
max_results = min(max(max_results, 1), 5)
query_lower = query.lower().strip()
if not query_lower:
return "Please provide a search query."
doc_sections = {
'transformers': {'base_url': 'https://huggingface.co/docs/transformers', 'topics': {'pipeline': '/main_classes/pipelines', 'tokenizer': '/main_classes/tokenizer', 'trainer': '/main_classes/trainer', 'model': '/main_classes/model', 'quicktour': '/quicktour', 'installation': '/installation', 'fine-tuning': '/training', 'training': '/training', 'inference': '/main_classes/pipelines', 'preprocessing': '/preprocessing', 'tutorial': '/tutorials', 'configuration': '/main_classes/configuration', 'peft': '/peft', 'lora': '/peft', 'quantization': '/main_classes/quantization', 'generation': '/main_classes/text_generation', 'optimization': '/perf_train_gpu_one', 'deployment': '/deployment', 'custom': '/custom_models'}},
'datasets': {'base_url': 'https://huggingface.co/docs/datasets', 'topics': {'loading': '/load_hub', 'load': '/load_hub', 'processing': '/process', 'streaming': '/stream', 'audio': '/audio_process', 'image': '/image_process', 'text': '/nlp_process', 'arrow': '/about_arrow', 'cache': '/cache', 'upload': '/upload_dataset', 'custom': '/dataset_script'}},
'diffusers': {'base_url': 'https://huggingface.co/docs/diffusers', 'topics': {'pipeline': '/using-diffusers/loading', 'stable diffusion': '/using-diffusers/stable_diffusion', 'controlnet': '/using-diffusers/controlnet', 'inpainting': '/using-diffusers/inpaint', 'training': '/training/overview', 'optimization': '/optimization/fp16', 'schedulers': '/using-diffusers/schedulers'}},
'hub': {'base_url': 'https://huggingface.co/docs/hub', 'topics': {'repositories': '/repositories', 'git': '/repositories-getting-started', 'spaces': '/spaces', 'models': '/models', 'datasets': '/datasets'}}
}
relevant_urls = []
for section_name, section_data in doc_sections.items():
base_url = section_data['base_url']
topics = section_data['topics']
for topic, path in topics.items():
relevance = 0
if query_lower == topic.lower(): relevance = 1.0
elif query_lower in topic.lower(): relevance = 0.9
elif any(word in topic.lower() for word in query_lower.split()): relevance = 0.7
elif any(word in query_lower for word in topic.lower().split()): relevance = 0.6
if relevance > 0:
full_url = base_url + path
relevant_urls.append({'url': full_url, 'topic': topic, 'section': section_name, 'relevance': relevance})
relevant_urls.sort(key=lambda x: x['relevance'], reverse=True)
relevant_urls = relevant_urls[:max_results]
if not relevant_urls:
return f"β No documentation found for '{query}'. Try: pipeline, tokenizer, trainer, model, fine-tuning, datasets, diffusers, or peft."
result = f"# π Hugging Face Documentation: {query}\n\n"
for i, url_info in enumerate(relevant_urls, 1):
section_emoji = {'transformers': 'π€', 'datasets': 'π', 'diffusers': 'π¨', 'hub': 'π'}.get(url_info['section'], 'π')
result += f"## {i}. {section_emoji} {url_info['topic'].title()} ({url_info['section'].title()})\n\n"
content = self._fetch_with_retry(url_info['url'])
if content:
soup = BeautifulSoup(content, 'html.parser')
practical_content = self._extract_practical_content(soup, url_info['topic'])
if practical_content['overview']: result += f"**📋 Overview:**\n{practical_content['overview']}\n\n"
if practical_content['installation']: result += f"**⚙️ Installation:**\n{practical_content['installation']}\n\n"
if practical_content['code_examples']:
result += "**💻 Code Examples:**\n\n"
for j, code_block in enumerate(practical_content['code_examples'][:3], 1):
lang = code_block.get('language', 'python')
code_type = code_block.get('type', 'example')
result += f"*{code_type.title()} {j}:*\n```{lang}\n{code_block['code']}\n```\n\n"
if practical_content['usage_instructions']:
result += "**🛠️ Usage Instructions:**\n"
for idx, instruction in enumerate(practical_content['usage_instructions'][:4], 1):
result += f"{idx}. {instruction}\n"
result += "\n"
if practical_content['parameters']:
result += "**⚙️ Parameters:**\n"
for param in practical_content['parameters'][:6]:
param_type = f" (`{param['type']}`)" if param.get('type') else ""
default_val = f" *Default: {param['default']}*" if param.get('default') else ""
result += f"• **{param['name']}**{param_type}: {param['description']}{default_val}\n"
result += "\n"
result += f"**🔗 Full Documentation:** {url_info['url']}\n\n"
else:
result += f"⚠️ Could not fetch content. Visit directly: {url_info['url']}\n\n"
result += "---\n\n"
return result
except Exception as e:
logger.error(f"Error in search_documentation: {e}")
return f"β Error searching documentation: {str(e)}\n\nTry a simpler search term or check your internet connection."
def get_model_info(self, model_name: str) -> str:
"""
Fetches comprehensive information about a specific model from the Hugging Face Hub.
Provides statistics like downloads and likes, a description, usage examples, and a quick-start code snippet.
Args:
model_name (str): The full identifier of the model on the Hub, such as 'bert-base-uncased' or 'meta-llama/Llama-2-7b-hf'.
"""
try:
model_name = model_name.strip()
if not model_name: return "Please provide a model name."
api_url = f"{self.api_url}/models/{model_name}"
response = self.session.get(api_url, timeout=15)
if response.status_code == 404: return f"❌ Model '{model_name}' not found. Please check the model name."
elif response.status_code != 200: return f"❌ Error fetching model info (Status: {response.status_code})"
model_data = response.json()
result = f"# 🤖 Model: {model_name}\n\n"
downloads = model_data.get('downloads', 0)
likes = model_data.get('likes', 0)
task = model_data.get('pipeline_tag', 'N/A')
library = model_data.get('library_name', 'N/A')
result += f"**📊 Statistics:**\n• **Downloads:** {downloads:,}\n• **Likes:** {likes:,}\n• **Task:** {task}\n• **Library:** {library}\n• **Created:** {model_data.get('createdAt', 'N/A')[:10]}\n• **Updated:** {model_data.get('lastModified', 'N/A')[:10]}\n\n"
if 'tags' in model_data and model_data['tags']: result += f"**🏷️ Tags:** {', '.join(model_data['tags'][:10])}\n\n"
model_url = f"{self.base_url}/{model_name}"
page_content = self._fetch_with_retry(model_url)
if page_content:
soup = BeautifulSoup(page_content, 'html.parser')
readme_content = soup.find('div', class_=re.compile(r'prose|readme|model-card'))
if readme_content:
paragraphs = readme_content.find_all('p')[:3]
description_parts = []
for p in paragraphs:
text = p.get_text(strip=True)
if len(text) > 30 and not any(skip in text.lower() for skip in ['table of contents', 'toc']):
description_parts.append(text)
if description_parts:
description = ' '.join(description_parts)
result += f"**π Description:**\n{description[:800]}{'...' if len(description) > 800 else ''}\n\n"
code_examples = self._extract_code_examples(soup)
if code_examples:
result += "**π» Usage Examples:**\n\n"
for i, code_block in enumerate(code_examples[:3], 1):
lang = code_block.get('language', 'python')
result += f"*Example {i}:*\n```{lang}\n{code_block['code']}\n```\n\n"
if task and task != 'N/A':
result += f"**π Quick Start Template:**\n"
if library == 'transformers':
result += f"```python\nfrom transformers import pipeline\n\n# Load the model\nmodel = pipeline('{task}', model='{model_name}')\n\n# Use the model\n# result = model(your_input_here)\n# print(result)\n```\n\n"
else:
result += f"```python\n# Load and use {model_name}\n# Refer to the documentation for specific usage\n```\n\n"
if 'siblings' in model_data:
files = [f['rfilename'] for f in model_data['siblings'][:10]]
if files:
result += f"**π Model Files:** {', '.join(files)}\n\n"
result += f"**π Model Page:** {model_url}\n"
return result
except requests.exceptions.RequestException as e: return f"❌ Network error: {str(e)}"
except Exception as e:
logger.error(f"Error in get_model_info: {e}")
return f"❌ Error fetching model info: {str(e)}"
def get_dataset_info(self, dataset_name: str) -> str:
"""
Retrieves detailed information about a specific dataset from the Hugging Face Hub.
Includes statistics, a description, and a quick-start code snippet showing how to load the dataset.
Args:
dataset_name (str): The full identifier of the dataset on the Hub, for example 'squad' or 'imdb'.
"""
try:
dataset_name = dataset_name.strip()
if not dataset_name: return "Please provide a dataset name."
api_url = f"{self.api_url}/datasets/{dataset_name}"
response = self.session.get(api_url, timeout=15)
if response.status_code == 404: return f"❌ Dataset '{dataset_name}' not found. Please check the dataset name."
elif response.status_code != 200: return f"❌ Error fetching dataset info (Status: {response.status_code})"
dataset_data = response.json()
result = f"# 📊 Dataset: {dataset_name}\n\n"
downloads = dataset_data.get('downloads', 0)
likes = dataset_data.get('likes', 0)
result += f"**📊 Statistics:**\n• **Downloads:** {downloads:,}\n• **Likes:** {likes:,}\n• **Created:** {dataset_data.get('createdAt', 'N/A')[:10]}\n• **Updated:** {dataset_data.get('lastModified', 'N/A')[:10]}\n\n"
if 'tags' in dataset_data and dataset_data['tags']: result += f"**🏷️ Tags:** {', '.join(dataset_data['tags'][:10])}\n\n"
dataset_url = f"{self.base_url}/datasets/{dataset_name}"
page_content = self._fetch_with_retry(dataset_url)
if page_content:
soup = BeautifulSoup(page_content, 'html.parser')
readme_content = soup.find('div', class_=re.compile(r'prose|readme|dataset-card'))
if readme_content:
paragraphs = readme_content.find_all('p')[:3]
description_parts = []
for p in paragraphs:
text = p.get_text(strip=True)
if len(text) > 30: description_parts.append(text)
if description_parts:
description = ' '.join(description_parts)
result += f"**π Description:**\n{description[:800]}{'...' if len(description) > 800 else ''}\n\n"
code_examples = self._extract_code_examples(soup)
if code_examples:
result += "**π» Usage Examples:**\n\n"
for i, code_block in enumerate(code_examples[:3], 1):
lang = code_block.get('language', 'python')
result += f"*Example {i}:*\n```{lang}\n{code_block['code']}\n```\n\n"
result += f"**π Quick Start Template:**\n"
result += f"```python\nfrom datasets import load_dataset\n\n# Load the dataset\ndataset = load_dataset('{dataset_name}')\n\n# Explore the dataset\n# print(dataset)\n# print(f\"Dataset keys: {{list(dataset.keys())}}\")\n\n# Access first example\n# if 'train' in dataset:\n# print(\"First example:\")\n# print(dataset['train'][0])\n```\n\n"
result += f"**π Dataset Page:** {dataset_url}\n"
return result
except requests.exceptions.RequestException as e: return f"β Network error: {str(e)}"
except Exception as e:
logger.error(f"Error in get_dataset_info: {e}")
return f"β Error fetching dataset info: {str(e)}"
def search_models(self, task: str, limit: str = "5") -> str:
"""
Searches the Hugging Face Hub for models based on a specified task or keyword and returns a list of top models.
Each result includes statistics and a quick usage example.
Args:
task (str): The task to search for, such as 'text-classification', 'image-generation', or 'question-answering'.
limit (str): The maximum number of models to return. Defaults to '5'.
"""
try:
task = task.strip()
if not task: return "Please provide a search task or keyword."
limit = int(limit) if str(limit).isdigit() else 5
limit = min(max(limit, 1), 10)
params = {'search': task, 'limit': limit * 3, 'sort': 'downloads', 'direction': -1}
response = self.session.get(f"{self.api_url}/models", params=params, timeout=20)
response.raise_for_status()
models = response.json()
if not models: return f"β No models found for task: '{task}'. Try different keywords."
filtered_models = []
for model in models:
if (model.get('downloads', 0) > 0 or model.get('likes', 0) > 0 or 'pipeline_tag' in model):
filtered_models.append(model)
if len(filtered_models) >= limit: break
if not filtered_models: filtered_models = models[:limit]
result = f"# π Top {len(filtered_models)} Models for '{task}'\n\n"
for i, model in enumerate(filtered_models, 1):
model_id = model.get('id', 'Unknown')
downloads = model.get('downloads', 0)
likes = model.get('likes', 0)
task_type = model.get('pipeline_tag', 'N/A')
library = model.get('library_name', 'N/A')
quality_score = ""
if downloads > 10000: quality_score = "⭐ Popular"
elif downloads > 1000: quality_score = "🔥 Active"
elif likes > 10: quality_score = "👍 Liked"
result += f"## {i}. {model_id} {quality_score}\n\n"
result += f"**📊 Stats:**\n• **Downloads:** {downloads:,}\n• **Likes:** {likes}\n• **Task:** {task_type}\n• **Library:** {library}\n\n"
if task_type and task_type != 'N/A':
result += f"**🚀 Quick Usage:**\n"
if library == 'transformers':
result += f"```python\nfrom transformers import pipeline\n\n# Load model\nmodel = pipeline('{task_type}', model='{model_id}')\n\n# Use model\n# result = model(\"Your input here\")\n# print(result)\n```\n\n"
else:
result += f"```python\n# Load and use {model_id}\n# Check model page for specific usage instructions\n```\n\n"
result += f"**π Model Page:** {self.base_url}/{model_id}\n\n---\n\n"
return result
except requests.exceptions.RequestException as e: return f"❌ Network error: {str(e)}"
except Exception as e:
logger.error(f"Error in search_models: {e}")
return f"❌ Error searching models: {str(e)}"
def get_transformers_docs(self, topic: str) -> str:
"""
Fetches detailed documentation specifically for the Hugging Face Transformers library on a given topic.
This provides in-depth explanations, code examples, and parameter descriptions for core library components.
Args:
topic (str): The Transformers library topic to look up, such as 'pipeline', 'tokenizer', 'trainer', or 'generation'.
"""
try:
topic = topic.strip().lower()
if not topic: return "Please provide a topic to search for."
docs_url = "https://huggingface.co/docs/transformers"
topic_map = {'pipeline': f"{docs_url}/main_classes/pipelines", 'pipelines': f"{docs_url}/main_classes/pipelines", 'tokenizer': f"{docs_url}/main_classes/tokenizer", 'tokenizers': f"{docs_url}/main_classes/tokenizer", 'trainer': f"{docs_url}/main_classes/trainer", 'training': f"{docs_url}/training", 'model': f"{docs_url}/main_classes/model", 'models': f"{docs_url}/main_classes/model", 'configuration': f"{docs_url}/main_classes/configuration", 'config': f"{docs_url}/main_classes/configuration", 'quicktour': f"{docs_url}/quicktour", 'quick': f"{docs_url}/quicktour", 'installation': f"{docs_url}/installation", 'install': f"{docs_url}/installation", 'tutorial': f"{docs_url}/tutorials", 'tutorials': f"{docs_url}/tutorials", 'generation': f"{docs_url}/main_classes/text_generation", 'text_generation': f"{docs_url}/main_classes/text_generation", 'preprocessing': f"{docs_url}/preprocessing", 'preprocess': f"{docs_url}/preprocessing", 'peft': f"{docs_url}/peft", 'lora': f"{docs_url}/peft", 'quantization': f"{docs_url}/main_classes/quantization", 'optimization': f"{docs_url}/perf_train_gpu_one", 'performance': f"{docs_url}/perf_train_gpu_one", 'deployment': f"{docs_url}/deployment", 'custom': f"{docs_url}/custom_models", 'fine-tuning': f"{docs_url}/training", 'finetuning': f"{docs_url}/training"}
url = topic_map.get(topic)
if not url:
for key, value in topic_map.items():
if topic in key or key in topic:
url = value
topic = key
break
if not url:
url = f"{docs_url}/quicktour"
topic = "quicktour"
content = self._fetch_with_retry(url)
if not content: return f"❌ Could not fetch documentation for '{topic}'. Please try again or visit: {url}"
soup = BeautifulSoup(content, 'html.parser')
practical_content = self._extract_practical_content(soup, topic)
result = f"# π Transformers Documentation: {topic.replace('_', ' ').title()}\n\n"
if practical_content['overview']: result += f"**π Overview:**\n{practical_content['overview']}\n\n"
if practical_content['installation']: result += f"**βοΈ Installation:**\n{practical_content['installation']}\n\n"
if practical_content['code_examples']:
result += "**π» Code Examples:**\n\n"
for i, code_block in enumerate(practical_content['code_examples'][:4], 1):
lang = code_block.get('language', 'python')
code_type = code_block.get('type', 'example')
result += f"### {code_type.title()} {i}:\n```{lang}\n{code_block['code']}\n```\n\n"
if practical_content['usage_instructions']:
result += "**π οΈ Step-by-Step Usage:**\n"
for i, instruction in enumerate(practical_content['usage_instructions'][:6], 1):
result += f"{i}. {instruction}\n"
result += "\n"
if practical_content['parameters']:
result += "**βοΈ Key Parameters:**\n"
for param in practical_content['parameters'][:10]:
param_type = f" (`{param['type']}`)" if param.get('type') else ""
default_val = f" *Default: `{param['default']}`*" if param.get('default') else ""
result += f"β’ **`{param['name']}`**{param_type}: {param['description']}{default_val}\n"
result += "\n"
related_topics = [k for k in topic_map.keys() if k != topic][:5]
if related_topics: result += f"**🔍 Related Topics:** {', '.join(related_topics)}\n\n"
result += f"**🔗 Full Documentation:** {url}\n"
return result
except Exception as e:
logger.error(f"Error in get_transformers_docs: {e}")
return f"β Error fetching Transformers documentation: {str(e)}"
def get_trending_models(self, limit: str = "10") -> str:
"""
Fetches a list of the most downloaded models currently trending on the Hugging Face Hub.
This is useful for discovering popular and widely-used models.
Args:
limit (str): The number of trending models to return. Defaults to '10'.
"""
try:
limit = int(limit) if str(limit).isdigit() else 10
limit = min(max(limit, 1), 20)
params = {'sort': 'downloads', 'direction': -1, 'limit': limit}
response = self.session.get(f"{self.api_url}/models", params=params, timeout=20)
response.raise_for_status()
models = response.json()
if not models: return "❌ Could not fetch trending models."
result = f"# 🔥 Trending Models (Top {len(models)})\n\n"
for i, model in enumerate(models, 1):
model_id = model.get('id', 'Unknown')
downloads = model.get('downloads', 0)
likes = model.get('likes', 0)
task = model.get('pipeline_tag', 'N/A')
if downloads > 1000000: trend = "🚀 Mega Popular"
elif downloads > 100000: trend = "🔥 Very Popular"
elif downloads > 10000: trend = "⭐ Popular"
else: trend = "📈 Trending"
result += f"## {i}. {model_id} {trend}\n"
result += f"• **Downloads:** {downloads:,} | **Likes:** {likes} | **Task:** {task}\n"
result += f"• **Link:** {self.base_url}/{model_id}\n\n"
return result
except Exception as e:
logger.error(f"Error in get_trending_models: {e}")
return f"β Error fetching trending models: {str(e)}"
# Initialize the API server
hf_api = HF_API()
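# Optional smoke test (commented out; hypothetical calls, require network access):
# print(hf_api.get_trending_models("3")[:400])
# print(hf_api.search_documentation("pipeline", max_results=1)[:400])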
# --- Named Functions for Gradio UI ---
def clear_output():
"""Clears a Gradio output component."""
return ""
def set_textbox_value(text):
"""Sets a Gradio Textbox to a specific value."""
return text
# --- Doc Search Tab Functions ---
def run_doc_search(query, max_results):
# gr.Number yields a float (e.g. 2.0), so str(...).isdigit() was always False; coerce via float
return hf_api.search_documentation(query, int(float(max_results or 2)))
# --- Model Info Tab Functions ---
def run_model_info(model_name):
return hf_api.get_model_info(model_name)
# --- Dataset Info Tab Functions ---
def run_dataset_info(dataset_name):
return hf_api.get_dataset_info(dataset_name)
# --- Model Search Tab Functions ---
def run_model_search(task, limit):
# gr.Number yields a float, so coerce via float before int
return hf_api.search_models(task, int(float(limit or 5)))
# --- Transformers Docs Tab Functions ---
def run_transformers_docs(topic):
return hf_api.get_transformers_docs(topic)
# --- Trending Models Tab Functions ---
def run_trending_models(limit):
return hf_api.get_trending_models(int(float(limit or 10)))
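# Named module-level wrappers (rather than lambdas) give each event handler a
# stable, descriptive function name, which is presumably what the MCP layer
# surfaces as the tool name.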
# --- Create Gradio Interface ---
with gr.Blocks(
title="π€ Hugging Face Information Server",
theme=gr.themes.Soft(),
css="""
.gradio-container { font-family: 'Inter', sans-serif; }
.main-header { text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px; }
""") as demo:
# Header
with gr.Row():
gr.HTML("""
<div class="main-header">
<h1>🤗 Hugging Face Information Server</h1>
<p>Get comprehensive documentation with <strong>real code examples</strong>, <strong>usage instructions</strong>, and <strong>practical content</strong></p>
</div>
""")
with gr.Tab("π Documentation Search", elem_id="docs"):
gr.Markdown("### Search for documentation with **comprehensive code examples** and **step-by-step instructions**")
with gr.Row():
with gr.Column(scale=3):
doc_query = gr.Textbox(label="🔍 Search Query", placeholder="e.g., tokenizer, pipeline, fine-tuning, peft, trainer, quantization")
with gr.Column(scale=1):
doc_max_results = gr.Number(label="Max Results", value=2, minimum=1, maximum=5)
doc_output = gr.Textbox(label="📖 Documentation with Examples", lines=25, max_lines=30)
with gr.Row():
doc_btn = gr.Button("🔍 Search Documentation", variant="primary", size="lg")
doc_clear = gr.Button("🗑️ Clear", variant="secondary")
gr.Markdown("**Quick Examples:**")
with gr.Row():
gr.Button("Pipeline", size="sm").click(functools.partial(set_textbox_value, "pipeline"), outputs=doc_query)
gr.Button("Tokenizer", size="sm").click(functools.partial(set_textbox_value, "tokenizer"), outputs=doc_query)
gr.Button("Fine-tuning", size="sm").click(functools.partial(set_textbox_value, "fine-tuning"), outputs=doc_query)
gr.Button("PEFT", size="sm").click(functools.partial(set_textbox_value, "peft"), outputs=doc_query)
doc_btn.click(run_doc_search, inputs=[doc_query, doc_max_results], outputs=doc_output)
doc_clear.click(clear_output, outputs=doc_output)
with gr.Tab("π€ Model Information", elem_id="models"):
gr.Markdown("### Get detailed model information with **usage examples** and **code snippets**")
model_name = gr.Textbox(label="π€ Model Name", placeholder="e.g., bert-base-uncased, gpt2, microsoft/DialoGPT-medium, meta-llama/Llama-2-7b-hf")
model_output = gr.Textbox(label="π Model Information + Usage Examples", lines=25, max_lines=30)
with gr.Row():
model_btn = gr.Button("π Get Model Info", variant="primary", size="lg")
model_clear = gr.Button("ποΈ Clear", variant="secondary")
gr.Markdown("**Popular Models:**")
with gr.Row():
gr.Button("BERT", size="sm").click(functools.partial(set_textbox_value, "bert-base-uncased"), outputs=model_name)
gr.Button("GPT-2", size="sm").click(functools.partial(set_textbox_value, "gpt2"), outputs=model_name)
gr.Button("T5", size="sm").click(functools.partial(set_textbox_value, "t5-small"), outputs=model_name)
gr.Button("DistilBERT", size="sm").click(functools.partial(set_textbox_value, "distilbert-base-uncased"), outputs=model_name)
model_btn.click(run_model_info, inputs=model_name, outputs=model_output)
model_clear.click(clear_output, outputs=model_output)
with gr.Tab("π Dataset Information", elem_id="datasets"):
gr.Markdown("### Get dataset information with **loading examples** and **usage code**")
dataset_name = gr.Textbox(label="π Dataset Name", placeholder="e.g., squad, imdb, glue, common_voice, wikitext")
dataset_output = gr.Textbox(label="π Dataset Information + Usage Examples", lines=25, max_lines=30)
with gr.Row():
dataset_btn = gr.Button("π Get Dataset Info", variant="primary", size="lg")
dataset_clear = gr.Button("ποΈ Clear", variant="secondary")
gr.Markdown("**Popular Datasets:**")
with gr.Row():
gr.Button("SQuAD", size="sm").click(functools.partial(set_textbox_value, "squad"), outputs=dataset_name)
gr.Button("IMDB", size="sm").click(functools.partial(set_textbox_value, "imdb"), outputs=dataset_name)
gr.Button("GLUE", size="sm").click(functools.partial(set_textbox_value, "glue"), outputs=dataset_name)
gr.Button("Common Voice", size="sm").click(functools.partial(set_textbox_value, "common_voice"), outputs=dataset_name)
dataset_btn.click(run_dataset_info, inputs=dataset_name, outputs=dataset_output)
dataset_clear.click(clear_output, outputs=dataset_output)
with gr.Tab("π Model Search", elem_id="search"):
gr.Markdown("### Search models with **quick usage examples** and **quality indicators**")
with gr.Row():
with gr.Column(scale=3):
search_task = gr.Textbox(label="π Task or Keyword", placeholder="e.g., text-classification, image-generation, question-answering, sentiment-analysis")
with gr.Column(scale=1):
search_limit = gr.Number(label="Max Results", value=5, minimum=1, maximum=10)
search_output = gr.Textbox(label="π Models with Usage Examples", lines=25, max_lines=30)
with gr.Row():
search_btn = gr.Button("π Search Models", variant="primary", size="lg")
search_clear = gr.Button("ποΈ Clear", variant="secondary")
gr.Markdown("**Popular Tasks:**")
with gr.Row():
gr.Button("Text Classification", size="sm").click(functools.partial(set_textbox_value, "text-classification"), outputs=search_task)
gr.Button("Question Answering", size="sm").click(functools.partial(set_textbox_value, "question-answering"), outputs=search_task)
gr.Button("Text Generation", size="sm").click(functools.partial(set_textbox_value, "text-generation"), outputs=search_task)
gr.Button("Image Classification", size="sm").click(functools.partial(set_textbox_value, "image-classification"), outputs=search_task)
search_btn.click(run_model_search, inputs=[search_task, search_limit], outputs=search_output)
search_clear.click(clear_output, outputs=search_output)
with gr.Tab("β‘ Transformers Docs", elem_id="transformers"):
gr.Markdown("### Get comprehensive Transformers documentation with **detailed examples** and **parameters**")
transformers_topic = gr.Textbox(label="π Topic", placeholder="e.g., pipeline, tokenizer, trainer, model, peft, generation, quantization")
transformers_output = gr.Textbox(label="π Comprehensive Documentation", lines=25, max_lines=30)
with gr.Row():
transformers_btn = gr.Button("π Get Documentation", variant="primary", size="lg")
transformers_clear = gr.Button("ποΈ Clear", variant="secondary")
gr.Markdown("**Core Topics:**")
with gr.Row():
gr.Button("Pipeline", size="sm").click(functools.partial(set_textbox_value, "pipeline"), outputs=transformers_topic)
gr.Button("Tokenizer", size="sm").click(functools.partial(set_textbox_value, "tokenizer"), outputs=transformers_topic)
gr.Button("Trainer", size="sm").click(functools.partial(set_textbox_value, "trainer"), outputs=transformers_topic)
gr.Button("Generation", size="sm").click(functools.partial(set_textbox_value, "generation"), outputs=transformers_topic)
transformers_btn.click(run_transformers_docs, inputs=transformers_topic, outputs=transformers_output)
transformers_clear.click(clear_output, outputs=transformers_output)
with gr.Tab("π₯ Trending Models", elem_id="trending"):
gr.Markdown("### Discover the most popular and trending models")
trending_limit = gr.Number(label="Number of Models", value=10, minimum=1, maximum=20)
trending_output = gr.Textbox(label="π₯ Trending Models", lines=20, max_lines=25)
with gr.Row():
trending_btn = gr.Button("π₯ Get Trending Models", variant="primary", size="lg")
trending_clear = gr.Button("ποΈ Clear", variant="secondary")
trending_btn.click(run_trending_models, inputs=trending_limit, outputs=trending_output)
trending_clear.click(clear_output, outputs=trending_output)
# Footer
with gr.Row():
gr.HTML("""
<div style="text-align: center; padding: 20px; color: #666;">
<h3>💡 Features</h3>
<p><strong>✅ Real code examples</strong> • <strong>✅ Step-by-step instructions</strong> • <strong>✅ Parameter documentation</strong> • <strong>✅ Quality indicators</strong></p>
<p><em>Get practical, actionable information directly from the source.</em></p>
<p><a href="https://huggingface.co/spaces/Agents-MCP-Hackathon/HuggingFaceDoc/blob/main/README.md" target="_blank">📖 Read the Guide on Hugging Face Spaces</a></p>
</div>
""")
if __name__ == "__main__":
print("π Starting Hugging Face Information Server...")
print("π Features: Code examples, usage instructions, comprehensive documentation")
# Kept your original launch parameters
demo.launch(
mcp_server=True
) |