# hugging-research / scripts / report_generator.py
# daqc - "Update scripts/report_generator.py"
# commit e061bc4 (verified)
from typing import List, Optional, Dict
from smolagents import Tool
class HFLinkReportTool(Tool):
    """Generate a single-layout HTML report (cards + counters) from a final textual answer.

    The tool extracts links from the provided text, categorizes them (HF models/datasets/spaces/papers,
    blogs, repos, videos, news), and renders a consistent link report. Always returns a full HTML
    document (starts with <!DOCTYPE html>).
    """

    name = "hf_links_to_report"
    description = (
        "Create an HTML report from a final answer text. The tool parses links, groups them into categories "
        "(Hugging Face models/datasets/spaces/papers and external resources like blogs/repos/videos/news), and renders cards. "
        "Inputs: final_answer (string, required), query (string, optional), title (string, optional). Returns an HTML document."
    )
    inputs = {
        "final_answer": {"type": "string", "description": "Final answer text containing inline links"},
        "query": {"type": "string", "description": "Original user intent or topic", "nullable": True},
        "title": {"type": "string", "description": "Dashboard title", "nullable": True},
    }
    output_type = "string"

    def forward(self, final_answer: str, query: Optional[str] = None, title: Optional[str] = None) -> str:
        """Render the links found in ``final_answer`` as a standalone HTML report.

        Args:
            final_answer: Text to scan for ``http://`` / ``https://`` links.
            query: Accepted for interface compatibility; it is not rendered
                anywhere in the report.
            title: Optional heading shown at the top of the page and used for
                the ``<title>`` tag (which falls back to "Report").

        Returns:
            A complete HTML document. If rendering fails, a minimal HTML page
            carrying the escaped error message is returned instead of raising.
        """
        # Imports stay function-local, as in the original implementation.
        # NOTE(review): presumably this keeps the tool self-contained for
        # smolagents tool serialization - confirm before hoisting them.
        import re
        from html import escape
        from urllib.parse import quote

        try:
            text = final_answer if isinstance(final_answer, str) else str(final_answer or "")

            def split_url(url: str):
                """Return (lower-cased host, lower-cased non-empty path segments)."""
                rest = re.sub(r"^https?://", "", url)
                authority = re.split(r"[/?#]", rest, maxsplit=1)[0]
                # Drop optional userinfo and port so domain matching sees only the host.
                host = authority.rsplit("@", 1)[-1].split(":", 1)[0].lower()
                path = re.split(r"[?#]", rest[len(authority):], maxsplit=1)[0]
                segments = [part for part in path.lower().split("/") if part]
                return host, segments

            def on_domain(host: str, *domains: str) -> bool:
                """True when host is one of the domains or a subdomain of one."""
                return any(host == d or host.endswith("." + d) for d in domains)

            def classify(url: str) -> str:
                """Map a URL to one of the category keys used in ``cats``."""
                host, segments = split_url(url)
                if on_domain(host, "huggingface.co"):
                    head = segments[0] if segments else ""
                    # Explicit sections are keyed on the FIRST path segment so that
                    # e.g. /docs/datasets/... is not mistaken for a dataset.
                    if head in ("datasets", "spaces", "papers", "models"):
                        return head
                    # Bare owner/repo is a model repo unless the first segment is a
                    # known site section (blog, docs, ...).
                    site_sections = {"blog", "learn", "docs", "organizations", "collections"}
                    if len(segments) == 2 and head not in site_sections:
                        return "models"
                    return "blogs"
                if on_domain(host, "github.com"):
                    return "repos"
                if on_domain(host, "youtube.com", "youtu.be"):
                    return "videos"
                if on_domain(host, "arxiv.org", "medium.com", "towardsdatascience.com"):
                    return "blogs"
                if on_domain(host, "theverge.com", "techcrunch.com", "venturebeat.com", "wired.com", "mit.edu"):
                    return "news"
                return "other"

            # Extract URLs. Characters that cannot legally appear in a URL (<, >, ")
            # terminate the match; sentence/markdown punctuation stuck to the end of
            # a link is trimmed; duplicates are dropped while keeping first-seen order.
            urls: List[str] = []
            seen = set()
            for raw in re.findall(r"https?://[^\s)\]<>\"]+", text):
                url = raw.rstrip(".,;:!?'*`")
                if url in seen or not split_url(url)[0]:
                    continue
                seen.add(url)
                urls.append(url)

            # Categorize
            cats = {
                "models": [], "datasets": [], "spaces": [], "papers": [],
                "blogs": [], "repos": [], "videos": [], "news": [], "other": [],
            }
            for url in urls:
                cats[classify(url)].append(url)

            def chips_section() -> str:
                """Counter chips, one per category ("other" has no chip)."""
                chips = [
                    ("Models", len(cats["models"])),
                    ("Datasets", len(cats["datasets"])),
                    ("Spaces", len(cats["spaces"])),
                    ("Papers", len(cats["papers"])),
                    ("Blogs/Docs", len(cats["blogs"])),
                    ("Repos", len(cats["repos"])),
                    ("Videos", len(cats["videos"])),
                    ("News", len(cats["news"])),
                ]
                return "\n".join(f"<div class=stat-chip>{label}: {count}</div>" for label, count in chips)

            def card_list(links: List[str], data_cat: str) -> str:
                """One card per link; every interpolated value is HTML-escaped."""
                items = []
                for link in links:
                    host = split_url(link)[0]
                    safe_link = escape(link)
                    safe_host = escape(host)
                    favicon = escape("https://www.google.com/s2/favicons?sz=32&domain=" + quote(host, safe=""))
                    items.append(
                        f"<div class=card data-cat='{data_cat}'>"
                        f"<div class=card-title><img class=\"fav\" src=\"{favicon}\" alt=\"\"/> "
                        f"<a href='{safe_link}' target=_blank rel=noopener>{safe_link}</a></div>"
                        f"<div class=card-subtitle>{safe_host}</div>"
                        # The URL travels in a data attribute rather than an inline JS
                        # string literal, so quotes in the URL cannot inject script.
                        f"<div class=card-actions><button data-url='{safe_link}' "
                        "onclick=\"copyLink(this.dataset.url)\">Copy</button></div>"
                        "</div>"
                    )
                return "\n".join(items)

            def section(title_text: str, links: List[str], key: str) -> str:
                """A titled card grid, or an empty string when there are no links."""
                if not links:
                    return ""
                return (
                    f"<section data-key='{key}'><h2>{title_text}</h2>"
                    f"<div class=cards>{card_list(links, key)}</div></section>"
                )

            safe_title = escape(str(title)) if title else ""
            doc_title = safe_title or "Report"
            header_html = (
                f"<div class=\"header\"><div><div class=\"title\">{safe_title}</div></div></div>"
                if title else ""
            )
            chips_html = chips_section()
            sections_html = "\n".join(
                section(label, cats[key], key)
                for label, key in (
                    ("Models", "models"),
                    ("Datasets", "datasets"),
                    ("Spaces", "spaces"),
                    ("Papers", "papers"),
                    ("Blogs / Docs", "blogs"),
                    ("Repositories", "repos"),
                    ("Videos", "videos"),
                    ("News", "news"),
                    ("Other", "other"),
                )
            )

            html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>{doc_title}</title>
<style>
:root {{ --bg:#0b0d12; --fg:#e6e9ef; --muted:#9aa4b2; --card:#121621; --accent:#5ac8fa; }}
body {{ background:var(--bg); color:var(--fg); font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Inter, Arial, sans-serif; margin:0; padding:24px; }}
.container {{ max-width: 1200px; margin: 0 auto; }}
.header {{ display:flex; justify-content:space-between; align-items:center; gap:12px; margin-bottom: 12px; }}
.title {{ font-size: 22px; margin: 0; }}
.subtitle {{ color: var(--muted); }}
.stats {{ display:flex; gap:10px; flex-wrap:wrap; margin: 8px 0 18px; }}
.stat-chip {{ background: var(--card); border: 1px solid rgba(255,255,255,0.08); border-radius: 999px; padding: 6px 10px; font-size: 12px; color: var(--muted); }}
h2 {{ font-size: 16px; margin: 18px 0 8px; color: var(--accent); }}
.cards {{ display: grid; grid-template-columns: repeat(auto-fill, minmax(280px,1fr)); gap: 12px; }}
.card {{ background: var(--card); border: 1px solid rgba(255,255,255,0.06); border-radius: 10px; padding: 12px; }}
.card-title {{ font-weight: 600; margin-bottom: 4px; overflow-wrap:anywhere; }}
.card-subtitle {{ color: var(--muted); font-size: 12px; }}
.answer {{ line-height:1.55; color:#d2d7df; }}
.card-actions button {{ background:#1f2937;color:#e5e7eb;border:1px solid rgba(255,255,255,0.08);border-radius:6px;padding:4px 8px;cursor:pointer;font-size:12px; }}
.fav {{ width:14px; height:14px; vertical-align:middle; margin-right:6px; border-radius:4px; }}
.warn {{ margin-left:6px; cursor: help; }}
</style>
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/dompurify@3.1.6/dist/purify.min.js"></script>
</head>
<body>
<div class="container">{header_html}
<h2>You may be interested <span class="warn" title="Links may be AI‑generated and might not resolve.">⚠️</span></h2>
<div class="stats">{chips_html}</div>
{sections_html}
</div>
<script>
function copyLink(url){{ try{{navigator.clipboard && navigator.clipboard.writeText(url);}}catch(e){{}} }}
</script>
</body>
</html>
"""
            return html
        except Exception as e:
            # Deliberate best-effort: callers always receive an HTML document.
            return f"<!DOCTYPE html><html><body><pre>Error generating report: {escape(str(e))}</pre></body></html>"