Spaces:
Running
Running
| import json | |
| import glob | |
| import ssl | |
| import time | |
| from datetime import datetime, timedelta, timezone | |
| from pathlib import Path | |
| from urllib.request import urlopen, Request | |
| from urllib.error import HTTPError | |
| import streamlit as st | |
# ---------------------------------------------------------------------------
# Page config
# ---------------------------------------------------------------------------
# Sets the browser-tab title/icon, uses the wide layout and starts with the
# sidebar collapsed (the CSS below additionally hides the sidebar entirely).
st.set_page_config(
    page_title="Daily Paper Reader",
    page_icon="📰",
    layout="wide",
    initial_sidebar_state="collapsed",
)
# ---------------------------------------------------------------------------
# Custom CSS – HuggingFace-inspired design
# ---------------------------------------------------------------------------
# One <style> block is injected as raw HTML (hence unsafe_allow_html=True).
# It contains two kinds of rules:
#   * app-owned classes used by the HTML snippets this file renders
#     (.upvote-badge, .paper-rank, .paper-authors, .paper-links, .stats-bar,
#     .pros-box / .cons-box / .point, .card-topics);
#   * overrides keyed on Streamlit's own data-testid / role attributes
#     (stColumn, stImage, stVerticalBlockBorderWrapper, stHorizontalBlock,
#     stBaseButton-secondary, div[role="dialog"]).
#     NOTE(review): these selectors presumably depend on the installed
#     Streamlit version's DOM — re-check them when upgrading.
# .paper-authors is declared twice on purpose or by accident: the second rule
# only adds the 2-line clamp and does not repeat the first rule's properties.
st.markdown(
    """
<style>
/* ---------- global ---------- */
[data-testid="stAppViewContainer"] { background: #f6f8fa; }
[data-testid="stHeader"] { background: #f6f8fa; }
.block-container { padding-top: 3rem !important; }
h1, h2, h3, h4 { color: #1f2328 !important; }
p, li, span, label { color: #424a53; }
/* ---------- upvote / rank ---------- */
.upvote-badge {
display: inline-flex; align-items: center; gap: 5px;
background: #fff8e1;
border: 1px solid #f0d060;
padding: 4px 12px; border-radius: 20px;
font-size: 13px; font-weight: 700; color: #9a6700;
flex-shrink: 0;
}
.paper-rank {
display: inline-flex; align-items: center; justify-content: center;
width: 28px; height: 28px; border-radius: 8px;
font-weight: 700; font-size: 13px;
background: #eef1f5; color: #656d76;
flex-shrink: 0;
}
.paper-rank.top3 {
background: linear-gradient(135deg, #dbeafe, #ede9fe);
color: #2563eb;
}
.paper-authors {
font-size: 13px;
color: #656d76;
margin-bottom: 12px;
line-height: 1.5;
}
.paper-links {
display: flex; gap: 8px; flex-wrap: wrap;
}
.paper-links a {
display: inline-flex; align-items: center; gap: 4px;
padding: 4px 12px; border-radius: 8px;
border: 1px solid #d1d9e0; color: #656d76;
text-decoration: none; font-size: 12px; font-weight: 500;
transition: all 0.2s;
}
.paper-links a:hover {
border-color: #2563eb; color: #2563eb;
background: rgba(37,99,235,0.05);
}
/* ---------- stats bar ---------- */
.stats-bar {
display: flex; gap: 32px; padding: 16px 24px;
background: #ffffff; border: 1px solid #d1d9e0; border-radius: 14px;
margin-bottom: 28px; flex-wrap: wrap;
}
.stat-item { font-size: 13px; color: #656d76; }
.stat-value { font-weight: 700; color: #1f2328; font-size: 18px; margin-right: 6px; }
/* ---------- dialog styles ---------- */
div[role="dialog"] {
background: #ffffff !important;
border: 1px solid #d1d9e0 !important;
border-radius: 16px !important;
}
div[role="dialog"] h3, div[role="dialog"] h4 { color: #1f2328 !important; }
div[role="dialog"] p, div[role="dialog"] li { color: #424a53 !important; }
div[role="dialog"] hr { border-color: #d1d9e0 !important; }
/* pros / cons in dialog */
.pros-box, .cons-box { padding: 14px 16px; border-radius: 10px; margin-bottom: 12px; }
.pros-box { background: #f0fdf4; border: 1px solid #bbf7d0; }
.cons-box { background: #fef2f2; border: 1px solid #fecaca; }
.section-label {
font-size: 11px; font-weight: 700; text-transform: uppercase;
letter-spacing: .8px; margin-bottom: 10px;
}
.pros-box .section-label { color: #16a34a; }
.cons-box .section-label { color: #dc2626; }
.point {
font-size: 13px; line-height: 1.6; color: #424a53;
padding: 6px 0 6px 18px; position: relative;
border-bottom: 1px solid rgba(0,0,0,.05);
}
.point:last-child { border-bottom: none; }
.point::before {
content: ''; position: absolute; left: 0; top: 14px;
width: 6px; height: 6px; border-radius: 50%;
}
.pros-box .point::before { background: #16a34a; }
.cons-box .point::before { background: #dc2626; }
/* card image – full width flush to container */
div[data-testid="stColumn"] div[data-testid="stImage"] {
aspect-ratio: 2 / 1;
overflow: hidden !important;
margin: 0 !important;
padding: 0 !important;
}
div[data-testid="stColumn"] div[data-testid="stImage"] img {
width: 100% !important;
height: 100% !important;
object-fit: cover !important;
border-radius: 14px 14px 0 0 !important;
}
/* ---------- hide streamlit defaults ---------- */
.stDeployButton, footer, #MainMenu,
[data-testid="stSidebar"], [data-testid="collapsedControl"] { display: none !important; }
/* style the card button (title) – max 3 lines */
div[data-testid="stColumn"] button[data-testid="stBaseButton-secondary"] {
background: transparent !important;
border: none !important;
padding: 0 !important;
text-align: left !important;
color: #1f2328 !important;
font-size: 16px !important;
font-weight: 700 !important;
line-height: 1.4 !important;
width: 100% !important;
display: -webkit-box !important;
-webkit-line-clamp: 3 !important;
-webkit-box-orient: vertical !important;
overflow: hidden !important;
min-height: calc(16px * 1.4 * 3) !important;
max-height: calc(16px * 1.4 * 3) !important;
}
div[data-testid="stColumn"] button[data-testid="stBaseButton-secondary"]:hover {
color: #2563eb !important;
background: transparent !important;
border: none !important;
}
/* authors – max 2 lines */
.paper-authors {
display: -webkit-box;
-webkit-line-clamp: 2;
-webkit-box-orient: vertical;
overflow: hidden;
min-height: calc(13px * 1.5 * 2);
max-height: calc(13px * 1.5 * 2);
}
/* card topic tags – max 2 lines, reserve space for 2 rows */
.card-topics {
display: flex;
align-items: flex-start;
align-content: flex-start;
gap: 4px;
flex-wrap: wrap;
padding: 0 4px;
margin-top: 4px;
margin-bottom: 8px;
overflow: hidden;
min-height: 42px;
max-height: 42px;
}
/* container styling – equal height + clear border */
div[data-testid="stVerticalBlockBorderWrapper"] {
border: 2px solid #d1d9e0 !important;
border-radius: 16px !important;
background: #ffffff !important;
overflow: hidden !important;
height: 100%;
padding: 0 !important;
}
/* remove all inner padding from bordered container */
div[data-testid="stColumn"] div[data-testid="stVerticalBlockBorderWrapper"] > div {
padding: 0 !important;
}
div[data-testid="stColumn"] div[data-testid="stVerticalBlockBorderWrapper"] > div > div {
padding: 0 !important;
gap: 0 !important;
}
div[data-testid="stColumn"] div[data-testid="stVerticalBlockBorderWrapper"] > div > div > div {
padding: 0 !important;
gap: 0.25rem !important;
}
/* add padding back to non-image elements */
div[data-testid="stColumn"] div[data-testid="stVerticalBlockBorderWrapper"] button,
div[data-testid="stColumn"] div[data-testid="stVerticalBlockBorderWrapper"] div[data-testid="stMarkdownContainer"] {
margin-left: 1rem !important;
margin-right: 1rem !important;
}
div[data-testid="stVerticalBlockBorderWrapper"]:hover {
border-color: #2563eb !important;
box-shadow: 0 4px 16px rgba(0,0,0,0.08);
}
/* make columns stretch to equal height */
div[data-testid="stHorizontalBlock"] {
align-items: stretch !important;
}
div[data-testid="stHorizontalBlock"] > div[data-testid="stColumn"] {
display: flex !important;
flex-direction: column !important;
}
div[data-testid="stHorizontalBlock"] > div[data-testid="stColumn"] > div {
flex: 1 !important;
display: flex !important;
flex-direction: column !important;
}
div[data-testid="stHorizontalBlock"] > div[data-testid="stColumn"] > div > div[data-testid="stVerticalBlockBorderWrapper"] {
flex: 1 !important;
}
</style>
""",
    unsafe_allow_html=True,
)
| # --------------------------------------------------------------------------- | |
| # Data helpers | |
| # --------------------------------------------------------------------------- | |
# Local cache directory: a "data" folder that sits next to this file's parent
# directory (i.e. <dir-of-this-file>/../data).
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
# HuggingFace dataset repo holding per-day paper summaries (one split per date).
HF_DATASET_REPO = "Elfsong/hf_paper_summary"
# HuggingFace dataset repo holding the generated trending summaries.
HF_TRENDING_REPO = "Elfsong/hf_paper_trending"
| def _get_hf_token() -> str | None: | |
| import os | |
| token = os.getenv("HF_TOKEN", "") | |
| if token: | |
| return token | |
| env_path = Path(__file__).resolve().parent.parent / ".env" | |
| if env_path.exists(): | |
| for line in env_path.read_text().splitlines(): | |
| if line.startswith("HF_TOKEN="): | |
| return line.split("=", 1)[1].strip() | |
| return None | |
| def _date_to_split(date_str: str) -> str: | |
| """Convert '2026-03-11' to 'date_2026_03_11' for valid split name.""" | |
| return "date_" + date_str.replace("-", "_") | |
| def _split_to_date(split_name: str) -> str: | |
| """Convert 'date_2026_03_11' back to '2026-03-11'.""" | |
| return split_name.replace("date_", "", 1).replace("_", "-") | |
def push_to_hf_dataset(papers: list[dict], date_str: str):
    """Upload one day's papers to the HF summary dataset as a date-named split.

    Nested fields (analyses, topics, keywords) are stored as JSON strings;
    scalar fields and the author list are stored as-is. Does nothing when no
    HuggingFace token is available.

    Args:
        papers: Paper dicts; missing keys fall back to empty defaults.
        date_str: Date in ``YYYY-MM-DD`` form, used to derive the split name.
    """
    from datasets import Dataset

    hf_token = _get_hf_token()
    if not hf_token:
        return

    def _as_row(paper: dict) -> dict:
        # Insertion order below defines the column order of the dataset.
        row = {
            field: paper.get(field, "")
            for field in ("title", "paper_id", "hf_url", "arxiv_url", "pdf_url")
        }
        row["authors"] = paper.get("authors", [])
        row["summary"] = paper.get("summary", "")
        row["upvotes"] = paper.get("upvotes", 0)
        for field in ("published_at", "concise_summary", "concise_summary_zh"):
            row[field] = paper.get(field, "")
        for field in ("detailed_analysis", "detailed_analysis_zh"):
            row[field] = json.dumps(paper.get(field, {}), ensure_ascii=False)
        for field in ("topics", "topics_zh", "keywords", "keywords_zh"):
            row[field] = json.dumps(paper.get(field, []), ensure_ascii=False)
        return row

    dataset = Dataset.from_list([_as_row(entry) for entry in papers])
    dataset.push_to_hub(
        HF_DATASET_REPO, split=_date_to_split(date_str), token=hf_token
    )
def _list_dataset_splits() -> list[str]:
    """Discover date splits of the summary dataset from its file listing.

    Only file names are inspected; no dataset content is downloaded. For each
    file, the base name (minus ``.parquet`` / ``.arrow``) is split on ``-``
    and the first piece starting with ``date_`` is taken as a split name.

    Returns:
        Unique split names sorted in descending order, or an empty list when
        the repo listing fails for any reason.
    """
    from huggingface_hub import HfApi

    hf_token = _get_hf_token()
    client = HfApi(token=hf_token)
    try:
        repo_files = client.list_repo_files(HF_DATASET_REPO, repo_type="dataset")
    except Exception:
        return []

    # Files look like: data/date_2026_03_11-*.parquet or date_2026_03_11/...
    discovered = set()
    for repo_path in repo_files:
        base_name = repo_path.rsplit("/", 1)[-1]
        stem = base_name.replace(".parquet", "").replace(".arrow", "")
        split_name = next(
            (piece for piece in stem.split("-") if piece.startswith("date_")),
            None,
        )
        if split_name is not None:
            discovered.add(split_name)
    return sorted(discovered, reverse=True)
def pull_from_hf_dataset(target_date: str | None = None) -> dict[str, list[dict]]:
    """Load one date split from the HF summary dataset.

    Args:
        target_date: Date in ``YYYY-MM-DD`` form. When ``None``, the newest
            available split (first in descending name order) is loaded.

    Returns:
        ``{date_str: papers}`` for the loaded split, or an empty dict when no
        splits exist, the requested date is not available, or loading fails.
        JSON-encoded columns (analyses, topics, keywords) are decoded back
        into dicts/lists.
    """
    from datasets import load_dataset

    def _decode(raw, make_default):
        """Decode one JSON-encoded cell without letting a bad cell abort the load."""
        if raw is None or raw == "":
            # Null / blank cell: json.loads would raise, so use the default.
            return make_default()
        if not isinstance(raw, (str, bytes, bytearray)):
            # Already structured data; nothing to decode.
            return raw
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return make_default()

    token = _get_hf_token()
    splits = _list_dataset_splits()
    if not splits:
        return {}

    if target_date:
        split_to_load = _date_to_split(target_date)
        if split_to_load not in splits:
            return {}
    else:
        split_to_load = splits[0]

    date_str = _split_to_date(split_to_load)
    try:
        ds = load_dataset(HF_DATASET_REPO, split=split_to_load, token=token)
    except Exception:
        return {}

    # Column name -> factory for the value used when the cell is unusable.
    json_columns = {
        "detailed_analysis": dict,
        "detailed_analysis_zh": dict,
        "topics": list,
        "topics_zh": list,
        "keywords": list,
        "keywords_zh": list,
    }
    papers = []
    for row in ds:
        paper = dict(row)
        for column, make_default in json_columns.items():
            paper[column] = _decode(paper.get(column), make_default)
        papers.append(paper)
    return {date_str: papers}
def list_available_dates() -> list[str]:
    """Collect every date (YYYY-MM-DD) that has data, newest first.

    Dates come from two places: the split names of the HF summary dataset and
    the summarized JSON files found locally. Duplicates are merged.
    """
    remote_dates = {_split_to_date(name) for name in _list_dataset_splits()}
    local_dates = set(find_json_files())
    return sorted(remote_dates | local_dates, reverse=True)
def find_json_files() -> dict[str, Path]:
    """Map each date to its local summarized JSON file.

    Scans ``DATA_DIR`` for ``hf_papers_*_summarized.json``. The date is the
    first underscore-separated piece of the file stem shaped like
    ``YYYY-MM-DD`` (10 characters with hyphens at positions 4 and 7); files
    without such a piece are ignored.

    Returns:
        ``{date_str: path}`` ordered by date, newest first.
    """

    def _looks_like_date(token: str) -> bool:
        return len(token) == 10 and token[4] == "-" and token[7] == "-"

    by_date: dict[str, Path] = {}
    pattern = str(DATA_DIR / "hf_papers_*_summarized.json")
    for match in glob.glob(pattern):
        candidate = Path(match)
        date_token = next(
            (tok for tok in candidate.stem.split("_") if _looks_like_date(tok)),
            None,
        )
        if date_token is not None:
            by_date[date_token] = candidate
    return {day: by_date[day] for day in sorted(by_date, reverse=True)}
def load_papers(source) -> list[dict]:
    """Parse a JSON papers list from a path or from a readable object.

    Args:
        source: Either a ``str``/``Path`` naming a UTF-8 JSON file, or any
            object with a ``read()`` method returning JSON text/bytes.

    Returns:
        The decoded JSON content.
    """
    if not isinstance(source, (str, Path)):
        # File-like input: parse whatever read() yields.
        return json.loads(source.read())
    with open(source, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
| # --------------------------------------------------------------------------- | |
| # Crawl & summarize | |
| # --------------------------------------------------------------------------- | |
# Shared TLS context for every outbound urllib request in this module.
# create_default_context() already enables certificate and hostname
# verification against the system trust store.
SSL_CTX = ssl.create_default_context()
try:
    import certifi
except ImportError:
    # certifi is optional. Without it we keep the system CA store and leave
    # verification ON — silently switching to CERT_NONE would expose every
    # request (including PDF downloads) to man-in-the-middle tampering.
    pass
else:
    # Additionally trust certifi's CA bundle when the package is installed.
    SSL_CTX.load_verify_locations(certifi.where())
# Endpoint queried with a ?date=YYYY-MM-DD parameter to list that day's papers.
HF_API_URL = "https://huggingface.co/api/daily_papers"
# Thumbnail URL template; {pid} is a placeholder to be filled with a paper id.
HF_THUMB = "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/{pid}.png"
# System instruction for per-paper summarization. It asks the model for a JSON
# object with eight keys (English + Chinese summary, analysis, topics,
# keywords); those are the keys the summarization code reads from the reply.
SUMMARY_SYSTEM_PROMPT = """\
You are a senior AI researcher. Given a paper's title and abstract, produce a JSON object \
with exactly eight keys — English and Chinese versions of analyses, plus keywords and topics:
1. "concise_summary": A 2-4 sentence plain-language summary in English explaining WHAT the paper does \
and WHY it matters. Avoid jargon; end with the key result or takeaway.
2. "concise_summary_zh": The same concise summary translated into Chinese (简体中文).
3. "detailed_analysis": A longer analysis in English, structured as:
- "summary": 4-6 sentences. Go beyond restating the abstract — interpret the approach \
and explain how it fits into the broader research landscape.
- "pros": A list of 3-4 strengths (novelty, practical impact, methodology, etc.)
- "cons": A list of 2-3 weaknesses or limitations (scope, assumptions, scalability, etc.)
4. "detailed_analysis_zh": The same detailed analysis translated into Chinese (简体中文), \
with the same structure: "summary", "pros", "cons".
5. "topics": A list of 2-3 short topic labels categorizing the paper's research area \
(e.g. "Multimodal LLMs", "Efficient Fine-tuning", "Code Generation", "Vision-Language Models"). \
Use concise, recognizable labels.
6. "topics_zh": The same topic labels translated into Chinese (简体中文).
7. "keywords": A list of 4-6 specific technical keywords or terms central to the paper \
(e.g. "LoRA", "RLHF", "diffusion", "chain-of-thought", "MoE", "RAG", "DPO", "transformer"). \
Use canonical technical terms, not paraphrases. Include method names, model names, and key techniques.
8. "keywords_zh": The same keywords translated into Chinese where applicable \
(keep English acronyms and proper nouns as-is, e.g. "LoRA", "RLHF", "扩散模型", "思维链").
Reply with ONLY valid JSON — no markdown fences, no extra text."""
# System instruction for the multi-day trending digest. It asks for a JSON
# object with six keys (summary, top topics, keywords — each in English and
# Chinese).
TRENDING_SYSTEM_PROMPT = """\
You are a senior AI researcher. Given a collection of top papers from the last several days, \
identify the key research trends and produce a JSON object with exactly six keys:
1. "trending_summary": A 2-3 sentence English summary of the dominant research trends \
and themes across these papers. Focus on emerging patterns, hot topics, and notable shifts.
2. "trending_summary_zh": The same trending summary translated into Chinese (简体中文).
3. "top_topics": A list of 3-5 short topic labels (e.g. "Multimodal LLMs", "Efficient Fine-tuning") \
representing the most prominent themes, in English.
4. "top_topics_zh": The same topic labels translated into Chinese (简体中文).
5. "keywords": A list of 5-10 specific technical keywords or terms that appear frequently \
or are central to the papers (e.g. "LoRA", "RLHF", "diffusion", "chain-of-thought", "MoE", \
"RAG", "MLLM", "DPO"). Use the canonical technical term, not a paraphrase.
6. "keywords_zh": The same technical keywords translated into Chinese where applicable \
(keep English acronyms as-is, e.g. "LoRA", "RLHF", "扩散模型", "思维链").
Reply with ONLY valid JSON — no markdown fences, no extra text."""
def fetch_daily_papers(date_str: str) -> list[dict]:
    """Download the HuggingFace daily-papers listing for one date.

    Args:
        date_str: Date passed verbatim as the ``date`` query parameter.

    Returns:
        One dict per paper (title, id, derived HF/arXiv/PDF URLs, author
        names, abstract, upvotes, publication timestamp), ordered by upvotes
        from highest to lowest. An empty list when the API answers with an
        HTTP error.
    """
    request = Request(
        f"{HF_API_URL}?date={date_str}", headers={"User-Agent": "Mozilla/5.0"}
    )
    try:
        with urlopen(request, timeout=30, context=SSL_CTX) as response:
            payload = json.loads(response.read().decode())
    except HTTPError:
        return []

    def _to_record(entry: dict) -> dict:
        meta = entry.get("paper", {})
        pid = meta.get("id", "")
        return {
            "title": meta.get("title", ""),
            "paper_id": pid,
            "hf_url": f"https://huggingface.co/papers/{pid}",
            "arxiv_url": f"https://arxiv.org/abs/{pid}",
            "pdf_url": f"https://arxiv.org/pdf/{pid}",
            "authors": [person.get("name", "") for person in meta.get("authors", [])],
            "summary": meta.get("summary", ""),
            "upvotes": meta.get("upvotes", 0),
            "published_at": meta.get("publishedAt", ""),
        }

    records = [_to_record(entry) for entry in payload]
    return sorted(records, key=lambda rec: rec["upvotes"], reverse=True)
| def _get_gemini_key() -> str: | |
| import os | |
| api_key = os.getenv("GEMINI_API_KEY", "") | |
| if api_key: | |
| return api_key | |
| env_path = Path(__file__).resolve().parent.parent / ".env" | |
| if env_path.exists(): | |
| for line in env_path.read_text().splitlines(): | |
| if line.startswith("GEMINI_API_KEY="): | |
| return line.split("=", 1)[1].strip() | |
| raise RuntimeError( | |
| "GEMINI_API_KEY not found. Set it as a HF Space secret or in .env" | |
| ) | |
def summarize_paper_gemini(
    title: str, abstract: str, pdf_url: str = ""
) -> dict:
    """Ask Gemini for a structured summary of one paper.

    The title and abstract are always sent. When ``pdf_url`` is given, the
    PDF is downloaded and attached as well; if the download or attachment
    fails, the request silently falls back to text only.

    Args:
        title: Paper title.
        abstract: Paper abstract.
        pdf_url: Optional URL of the paper's PDF.

    Returns:
        The first JSON value decoded from the model's reply.

    Raises:
        ValueError: When the model returns no text, or text that is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        RuntimeError: When no Gemini API key is configured.
    """
    from google import genai

    api_key = _get_gemini_key()
    client = genai.Client(api_key=api_key)
    text_part = genai.types.Part.from_text(
        text=f"Title: {title}\n\nAbstract: {abstract}"
    )
    contents = [text_part]
    if pdf_url:
        try:
            # Context manager guarantees the HTTP response is closed even if
            # read() fails part-way through.
            with urlopen(pdf_url, context=SSL_CTX, timeout=30) as pdf_resp:
                pdf_data = pdf_resp.read()
            pdf_part = genai.types.Part.from_bytes(
                data=pdf_data, mime_type="application/pdf"
            )
            contents.append(pdf_part)
        except Exception:
            pass  # deliberate best-effort: fall back to text-only
    resp = client.models.generate_content(
        model="gemini-3.1-pro-preview",
        contents=contents,
        config=genai.types.GenerateContentConfig(
            system_instruction=SUMMARY_SYSTEM_PROMPT,
            temperature=0.3,
            max_output_tokens=16384,
            response_mime_type="application/json",
        ),
    )
    raw_text = (resp.text or "").strip()
    if not raw_text:
        # Surface a clear message instead of an AttributeError on None.
        raise ValueError("Gemini returned an empty response")
    # raw_decode tolerates trailing text after the first JSON value.
    result, _ = json.JSONDecoder().raw_decode(raw_text)
    return result
| def _paper_has_summary(paper: dict) -> bool: | |
| """Check if a paper already has a valid summary (not an error).""" | |
| cs = paper.get("concise_summary", "") | |
| return bool(cs) and not cs.startswith("Error:") | |
| def _save_papers_local(papers: list[dict], path: Path): | |
| """Atomically save papers list to local JSON.""" | |
| tmp = path.with_suffix(".tmp") | |
| with open(tmp, "w", encoding="utf-8") as f: | |
| json.dump(papers, f, ensure_ascii=False, indent=2) | |
| tmp.replace(path) | |
def crawl_and_summarize(date_str: str) -> Path | None:
    """Fetch one day's papers, summarize each with Gemini, and persist them.

    Progress is reported through a Streamlit progress bar. Results are written
    to the local JSON file after every newly processed paper so an interrupted
    run can resume: papers that already have a usable summary in that file are
    reused instead of being summarized again. After the loop the full list is
    pushed to the HF dataset; a failed push only produces a warning.

    Args:
        date_str: Date in ``YYYY-MM-DD`` form.

    Returns:
        Path of the local summarized JSON file, or ``None`` when no papers
        were found for the date.
    """

    def _blank_summary_fields() -> dict:
        # Built fresh on every call so papers never share mutable containers.
        return {
            "concise_summary": "",
            "concise_summary_zh": "",
            "detailed_analysis": {},
            "detailed_analysis_zh": {},
            "topics": [],
            "topics_zh": [],
            "keywords": [],
            "keywords_zh": [],
        }

    DATA_DIR.mkdir(parents=True, exist_ok=True)
    output_path = DATA_DIR / f"hf_papers_{date_str}_summarized.json"
    progress = st.progress(0, text="Fetching papers from HuggingFace...")
    papers = fetch_daily_papers(date_str)
    if not papers:
        progress.empty()
        st.error(f"No papers found for {date_str}")
        return None

    # Resume: load existing partial results and merge
    if output_path.exists():
        try:
            with open(output_path, "r", encoding="utf-8") as f:
                cached = {
                    p["paper_id"]: p for p in json.load(f) if _paper_has_summary(p)
                }
            for paper in papers:
                pid = paper.get("paper_id", "")
                if pid in cached:
                    paper.update(cached[pid])
        except Exception:
            pass  # corrupted cache, start fresh

    total = len(papers)
    skipped = sum(1 for p in papers if _paper_has_summary(p))
    if skipped:
        st.toast(f"Resuming: {skipped}/{total} papers already summarized.", icon="⏩")

    for i, paper in enumerate(papers):
        # total + 1 keeps the bar below 100% until the upload step finishes.
        fraction = (i + 1) / (total + 1)
        # Skip already summarized papers
        if _paper_has_summary(paper):
            progress.progress(
                fraction,
                text=f"Cached ({i+1}/{total}): {paper['title'][:60]}...",
            )
            continue
        progress.progress(
            fraction,
            text=f"Summarizing ({i+1}/{total}): {paper['title'][:60]}...",
        )
        abstract = paper.get("summary", "")
        pdf_url = paper.get("pdf_url", "")
        if not abstract and not pdf_url:
            # Nothing to summarize from: record empty fields.
            paper.update(_blank_summary_fields())
        else:
            try:
                result = summarize_paper_gemini(paper["title"], abstract, pdf_url)
                paper.update(
                    {
                        key: result.get(key, default)
                        for key, default in _blank_summary_fields().items()
                    }
                )
            except Exception as e:
                # The "Error:" prefix marks the paper as not summarized so a
                # later run retries it (see _paper_has_summary).
                paper.update(_blank_summary_fields())
                paper["concise_summary"] = f"Error: {e}"
        # Save after each paper for resume support
        _save_papers_local(papers, output_path)
        if i < total - 1:
            time.sleep(1)

    # Push to HuggingFace only after all papers are done
    progress.progress(0.95, text="Uploading to HuggingFace Dataset...")
    try:
        push_to_hf_dataset(papers, date_str)
    except Exception as e:
        st.warning(f"Failed to push to HF dataset: {e}")
    progress.progress(1.0, text="Done!")
    time.sleep(0.5)
    progress.empty()
    return output_path
| # --------------------------------------------------------------------------- | |
| # Trending summary | |
| # --------------------------------------------------------------------------- | |
def _load_recent_papers(n_days: int = 5) -> tuple[list[dict], str, str]:
    """Gather papers from the newest ``n_days`` splits of the summary dataset.

    Each paper dict gets an extra ``_date`` key naming the split's date.
    Splits that fail to load are skipped.

    Returns:
        ``(papers, earliest_date, latest_date)`` with papers ordered by
        upvotes, highest first. Both dates are empty strings when no split
        could be loaded.
    """
    from datasets import load_dataset

    hf_token = _get_hf_token()
    recent_splits = _list_dataset_splits()[:n_days]

    collected: list[dict] = []
    dates_seen: list[str] = []
    for split_name in recent_splits:
        try:
            dataset = load_dataset(HF_DATASET_REPO, split=split_name, token=hf_token)
            day = _split_to_date(split_name)
            dates_seen.append(day)
            for record in dataset:
                entry = dict(record)
                entry["_date"] = day
                collected.append(entry)
        except Exception:
            continue

    collected.sort(key=lambda item: item.get("upvotes", 0), reverse=True)
    if not dates_seen:
        return collected, "", ""
    return collected, min(dates_seen), max(dates_seen)
def generate_trending_summary(papers: list[dict]) -> dict:
    """Produce a trending digest from recent papers via Gemini.

    Every paper contributes a text section (date, upvotes, title, summary and,
    when available, its detailed analysis, strengths and limitations);
    sections are separated by blank lines and sent as a single prompt.

    Returns:
        The first JSON value decoded from the model's reply.
    """
    from google import genai

    api_key = _get_gemini_key()
    client = genai.Client(api_key=api_key)

    def _render(paper: dict) -> str:
        # Prefer the generated concise summary; fall back to the raw abstract.
        blurb = paper.get("concise_summary", "") or paper.get("summary", "")
        header = (
            f"[{paper.get('_date', '')}] "
            f"(upvotes: {paper.get('upvotes', 0)}) {paper.get('title', '')}"
        )
        section = [header, blurb]

        detail = paper.get("detailed_analysis", {})
        if isinstance(detail, str):
            # Rows straight from the dataset carry the analysis as JSON text.
            try:
                detail = json.loads(detail)
            except Exception:
                detail = {}
        if detail:
            if detail.get("summary"):
                section.append(f"Analysis: {detail['summary']}")
            strengths = detail.get("pros", [])
            if strengths:
                section.append("Strengths: " + "; ".join(strengths))
            weaknesses = detail.get("cons", [])
            if weaknesses:
                section.append("Limitations: " + "; ".join(weaknesses))
        return "\n".join(section)

    prompt_body = "\n\n".join([_render(paper) for paper in papers])
    generation_config = genai.types.GenerateContentConfig(
        system_instruction=TRENDING_SYSTEM_PROMPT,
        temperature=0.3,
        max_output_tokens=4096*6,
        response_mime_type="application/json",
    )
    reply = client.models.generate_content(
        model="gemini-3.1-pro-preview",
        contents=prompt_body,
        config=generation_config,
    )
    parsed, _ = json.JSONDecoder().raw_decode(reply.text.strip())
    return parsed
def push_trending_to_hf(trending: dict, date_str: str):
    """Upload a trending digest to the HF trending dataset as a date split.

    List-valued fields are stored as JSON strings. The split holds a single
    row. Does nothing when no HuggingFace token is available.

    Args:
        trending: Digest dict; missing keys fall back to empty defaults.
        date_str: Generation date (``YYYY-MM-DD``), also used for the split name.
    """
    from datasets import Dataset

    hf_token = _get_hf_token()
    if not hf_token:
        return

    # Insertion order below defines the column order of the dataset.
    record = {
        field: trending.get(field, "")
        for field in ("trending_summary", "trending_summary_zh")
    }
    for field in ("top_topics", "top_topics_zh", "keywords", "keywords_zh"):
        record[field] = json.dumps(trending.get(field, []), ensure_ascii=False)
    record["date_range"] = trending.get("date_range", "")
    record["generated_date"] = date_str

    dataset = Dataset.from_list([record])
    dataset.push_to_hub(
        HF_TRENDING_REPO, split=_date_to_split(date_str), token=hf_token
    )
def pull_trending_from_hf(target_date: str | None = None) -> dict | None:
    """Load a trending digest from the HF trending dataset.

    Args:
        target_date: Date in ``YYYY-MM-DD`` form. When ``None``, the newest
            available split (first in descending name order) is used.

    Returns:
        The first row of the split with its JSON-encoded list columns decoded,
        or ``None`` when the repo cannot be listed, has no date splits, lacks
        the requested date, fails to load, or the split contains no rows.
    """
    from huggingface_hub import HfApi
    from datasets import load_dataset

    token = _get_hf_token()
    api = HfApi(token=token)
    try:
        files = api.list_repo_files(HF_TRENDING_REPO, repo_type="dataset")
    except Exception:
        return None

    # Derive split names from file names such as date_2026_03_11-*.parquet.
    splits = set()
    for f in files:
        name = f.split("/")[-1]
        for part in name.replace(".parquet", "").replace(".arrow", "").split("-"):
            if part.startswith("date_"):
                splits.add(part)
                break
    splits = sorted(splits, reverse=True)
    if not splits:
        return None

    if target_date:
        split_to_load = _date_to_split(target_date)
        if split_to_load not in splits:
            return None
    else:
        split_to_load = splits[0]

    try:
        ds = load_dataset(HF_TRENDING_REPO, split=split_to_load, token=token)
    except Exception:
        return None

    try:
        row = dict(ds[0])
    except IndexError:
        # An empty split means there is no cached digest; treat it as a miss.
        return None

    for column in ("top_topics", "top_topics_zh", "keywords", "keywords_zh"):
        # `or "[]"` covers a missing column as well as null / blank cells,
        # on which json.loads would otherwise raise.
        row[column] = json.loads(row.get(column) or "[]")
    return row
def get_or_generate_trending(date_str: str, status=None) -> tuple[dict | None, str]:
    """Return the trending digest for a date, generating it when not cached.

    The HF trending dataset is checked first. On a miss, recent papers are
    loaded, Gemini generates a digest, and the result is pushed back to HF
    (a failed push only produces a warning).

    Args:
        date_str: Date in ``YYYY-MM-DD`` form.
        status: Optional object with ``info`` / ``warning`` / ``error``
            methods used for progress messages.

    Returns:
        ``(trending_dict, date_range_str)``; ``(None, "")`` when there are no
        recent papers or generation fails.
    """

    def _notify(level: str, message: str) -> None:
        # Progress reporting is optional; stay silent without a status object.
        if status:
            getattr(status, level)(message)

    _notify("info", "Checking cached trending summary...")
    cached = pull_trending_from_hf(target_date=date_str)
    if cached:
        return cached, cached.get("date_range", "")

    # Generate fresh trending
    _notify("info", "Loading recent papers for trending analysis...")
    recent_papers, earliest, latest = _load_recent_papers(n_days=5)
    if not recent_papers:
        _notify("warning", "No recent papers available for trending analysis.")
        return None, ""

    span = f"{earliest} ~ {latest}" if earliest and latest else ""
    try:
        _notify("info", "Generating trending summary with Gemini...")
        fresh = generate_trending_summary(recent_papers)
        fresh["date_range"] = span
    except Exception as e:
        _notify("error", f"Trending generation failed: {e}")
        return None, ""

    try:
        _notify("info", "Saving trending summary to HuggingFace...")
        push_trending_to_hf(fresh, date_str)
    except Exception as e:
        _notify("warning", f"HF push failed: {e}")
    return fresh, span
| # --------------------------------------------------------------------------- | |
| # Summary dialog | |
| # --------------------------------------------------------------------------- | |
def show_summary(paper: dict):
    """Render the detail view of one paper.

    Emits, in order: the title, the author list, the resource links, the
    topic/keyword chips, the TL;DR, and the detailed analysis (summary plus
    side-by-side strengths / limitations boxes). Sections whose data is
    missing or empty are skipped.

    When the ``global_lang_toggle`` session flag is truthy, the ``*_zh``
    variant of each field is preferred and the base field is used as a
    fallback when the ``*_zh`` value is missing or empty.

    Every paper-derived value that is interpolated into raw HTML is escaped
    first, because those fragments are rendered with
    ``unsafe_allow_html=True``.

    Args:
        paper: Paper record. All keys are read with ``.get``, so a missing
            field never raises.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    use_zh = st.session_state.get("global_lang_toggle", False)

    def _pick(key: str, default):
        """Return ``<key>_zh`` when Chinese is on and non-empty, else ``<key>``."""
        if use_zh:
            return paper.get(f"{key}_zh", default) or paper.get(key, default)
        return paper.get(key, default)

    def _chips(values, style: str) -> str:
        """Return one escaped ``<span>`` chip per value, concatenated."""
        return "".join(
            f'<span style="{style}">{escape(str(v))}</span>' for v in values
        )

    def _points_box(css_class: str, label: str, points) -> str:
        """Return a labelled box with one escaped ``.point`` div per entry."""
        body = "".join(
            f'<div class="point">{escape(str(p))}</div>' for p in points
        )
        return (
            f'<div class="{css_class}">'
            f'<div class="section-label">{label}</div>{body}</div>'
        )

    st.markdown(f"### {paper.get('title', '')}")

    # Authors
    authors = paper.get("authors", [])
    if authors:
        st.caption(", ".join(authors))

    # Resource links. Escaping with quote=True stops a stray quote in a URL
    # from breaking out of the href attribute; missing/empty URLs become "#".
    hf_url = escape(str(paper.get("hf_url") or "#"), quote=True)
    arxiv_url = escape(str(paper.get("arxiv_url") or "#"), quote=True)
    pdf_url = escape(str(paper.get("pdf_url") or "#"), quote=True)
    links_html = f"""<div class="paper-links" style="margin-bottom:12px;">
<a href="{hf_url}" target="_blank">🤗 HuggingFace</a>
<a href="{arxiv_url}" target="_blank">📄 arXiv</a>
<a href="{pdf_url}" target="_blank">📥 PDF</a>
</div>"""
    st.markdown(links_html, unsafe_allow_html=True)

    # Topics & Keywords
    topics = _pick("topics", [])
    kws = _pick("keywords", [])
    if topics or kws:
        rows = []
        if topics:
            topic_spans = _chips(
                topics,
                "background:#eef1f5;padding:3px 10px;border-radius:12px;"
                "font-size:12px;font-weight:600;color:#2563eb;",
            )
            rows.append(
                f'<div style="display:flex;gap:6px;flex-wrap:wrap;">{topic_spans}</div>'
            )
        if kws:
            kw_spans = _chips(
                kws,
                "background:#fff8e1;padding:3px 10px;border-radius:12px;"
                "font-size:11px;font-weight:500;color:#9a6700;border:1px solid #f0d060;",
            )
            rows.append(
                f'<div style="display:flex;gap:6px;flex-wrap:wrap;">{kw_spans}</div>'
            )
        st.markdown(
            f'<div style="display:flex;flex-direction:column;gap:8px;margin-bottom:12px;">{"".join(rows)}</div>',
            unsafe_allow_html=True,
        )

    # TL;DR (rendered as markdown, not raw HTML, so it is not escaped here)
    concise = _pick("concise_summary", "")
    if concise:
        st.markdown("#### 📝 TL;DR")
        st.markdown(concise)

    # Detailed Analysis
    analysis = _pick("detailed_analysis", {})
    if analysis:
        st.divider()
        st.markdown("#### 🔬 详细分析" if use_zh else "#### 🔬 Detailed Analysis")
        st.markdown(analysis.get("summary", ""))
        st.divider()
        col_a, col_b = st.columns(2)
        with col_a:
            st.markdown(
                _points_box(
                    "pros-box",
                    "✓ 优势" if use_zh else "✓ Strengths",
                    analysis.get("pros", []),
                ),
                unsafe_allow_html=True,
            )
        with col_b:
            st.markdown(
                _points_box(
                    "cons-box",
                    "✗ 不足" if use_zh else "✗ Limitations",
                    analysis.get("cons", []),
                ),
                unsafe_allow_html=True,
            )
| # --------------------------------------------------------------------------- | |
| # Render paper card | |
| # --------------------------------------------------------------------------- | |
def render_card(paper: dict, rank: int):
    """Render one paper as a bordered card.

    The card shows the thumbnail built from ``HF_THUMB`` and the paper id,
    the title as a full-width button (clicking it calls ``show_summary``),
    the author line, and the topic chips. Topics use the ``topics_zh`` field
    when the ``global_lang_toggle`` session flag is truthy and that field is
    non-empty, otherwise ``topics``.

    Authors and topics are HTML-escaped before being interpolated, because
    the fragment is rendered with ``unsafe_allow_html=True``.

    Args:
        paper: Paper record; all keys are read with ``.get``.
        rank: 1-based position of the card; used to build the button's
            widget key (``card-<rank>``).
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    pid = paper.get("paper_id", "")
    title = paper.get("title", "Untitled")
    authors = paper.get("authors", [])
    thumb_url = HF_THUMB.format(pid=pid)
    authors_str = ", ".join(authors) if authors else "Unknown authors"

    with st.container(border=True):
        # Thumbnail
        st.image(thumb_url, width="stretch")

        # Title as clickable button
        if st.button(f"**{title}**", key=f"card-{rank}", use_container_width=True):
            show_summary(paper)

        # Topics follow the global language toggle
        lang = st.session_state.get("global_lang_toggle", False)
        if lang:
            topics = paper.get("topics_zh", []) or paper.get("topics", [])
        else:
            topics = paper.get("topics", [])

        topic_spans = "".join(
            f'<span style="background:#eef1f5;padding:2px 8px;border-radius:10px;'
            f'font-size:11px;font-weight:600;color:#2563eb;white-space:nowrap;">'
            f"{escape(str(t))}</span>"
            for t in topics
        )

        # Authors + topic chips. Named card_html (not html) so the local
        # cannot be confused with the stdlib module of the same name.
        safe_authors = escape(str(authors_str))
        card_html = f"""
<div style="padding: 0 4px;">
<div class="paper-authors">{safe_authors}</div>
</div>
<div class="card-topics">{topic_spans}</div>"""
        st.markdown(card_html, unsafe_allow_html=True)
| # --------------------------------------------------------------------------- | |
| # Main content | |
| # --------------------------------------------------------------------------- | |
# Papers for the selected date; populated by the loading step further down.
papers: list[dict] = []
yesterday_str = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y-%m-%d")

# --- Header row: date selector + language toggle ---
col_date, col_lang = st.columns([0.1, 0.9])

with col_date:
    available_dates = list_available_dates()
    # Default to the first listed date; with no dates, fall back to
    # yesterday in UTC.
    if available_dates:
        default_date = datetime.strptime(available_dates[0], "%Y-%m-%d").date()
    else:
        default_date = (datetime.now(timezone.utc) - timedelta(days=1)).date()
    selected_date = st.date_input(
        "Select date",
        value=default_date,
        format="YYYY-MM-DD",
        label_visibility="collapsed",
    )
    selected_date_str = selected_date.strftime("%Y-%m-%d")

with col_lang:
    # The toggle's value is also stored in st.session_state under
    # "global_lang_toggle", which is where the render functions read it.
    use_zh = st.toggle("中文", key="global_lang_toggle")

latest_date = selected_date_str
# --- Load papers for the selected date ---
# Order of sources: HuggingFace dataset first, then local JSON files, and
# finally a fresh crawl when nothing was found or summaries are incomplete.
with st.spinner("Loading papers..."):
    hf_data = pull_from_hf_dataset(target_date=selected_date_str)
    if hf_data:
        # .get instead of indexing: a result that lacks the requested date
        # (or maps it to None) should fall through to the local fallback
        # rather than raise KeyError.
        # NOTE(review): assumes hf_data is a dict keyed by date string — confirm.
        papers = hf_data.get(selected_date_str) or []
    if not papers:
        json_files = find_json_files()
        if selected_date_str in json_files:
            papers = load_papers(json_files[selected_date_str])

# Check if loaded papers have incomplete summaries (interrupted collection).
# bool(...) keeps this a real flag instead of possibly holding the list itself.
needs_summarization = bool(papers) and any(
    not _paper_has_summary(p) for p in papers
)

if not papers or needs_summarization:
    if not papers:
        st.balloons()
        st.toast(f"You are the first one to read papers on {selected_date_str}! We are collecting papers for you.", icon="📰")
    else:
        summarized = sum(1 for p in papers if _paper_has_summary(p))
        st.toast(f"Resuming summarization: {summarized}/{len(papers)} papers done.", icon="⏩")
    result_path = crawl_and_summarize(selected_date_str)
    if result_path:
        papers = load_papers(result_path)

if not papers:
    st.error("No papers found. Please check back later.")
    st.stop()

# Most-upvoted first. `or 0` also covers an explicit None, which would
# otherwise make the comparison raise TypeError.
papers.sort(key=lambda p: p.get("upvotes") or 0, reverse=True)

date_label = latest_date
lang = st.session_state.get("global_lang_toggle", False)
# --- Trending status (spinner under title, filled later) ---
trending_spinner = st.empty()

# --- Trending summary placeholder (filled after papers render) ---
trending_placeholder = st.empty()

# --- Render paper grid (3 columns) ---
# Each pass creates a full row of NUM_COLS columns and fills them left to
# right; zip stops at the shorter side, so the trailing columns of a final
# partial row are simply left empty.
NUM_COLS = 3
for row_start in range(0, len(papers), NUM_COLS):
    row_papers = papers[row_start:row_start + NUM_COLS]
    row_cols = st.columns(NUM_COLS, gap="medium")
    for offset, (cell, row_paper) in enumerate(zip(row_cols, row_papers)):
        with cell:
            # rank is the 1-based position in the sorted list
            render_card(row_paper, rank=row_start + offset + 1)
# --- Trending summary (loaded after papers are displayed) ---
# Script-level import keeps this block self-contained; the aliased name
# avoids clashing with anything else called `escape` or `html` in the file.
from html import escape as _escape_html

# Show a spinner in the reserved slot while the summary is fetched or
# generated, then clear that slot again.
with trending_spinner.container():
    with st.spinner("Loading trending summary..."):
        trending, trending_date_range = get_or_generate_trending(
            selected_date_str, status=None
        )
trending_spinner.empty()

if trending:
    # Prefer the *_zh fields when the language toggle is on, falling back to
    # the base fields when a *_zh value is missing or empty.
    if lang:
        summary_text = trending.get("trending_summary_zh", "") or trending.get(
            "trending_summary", ""
        )
        topics = trending.get("top_topics_zh", []) or trending.get("top_topics", [])
        keywords = trending.get("keywords_zh", []) or trending.get("keywords", [])
    else:
        summary_text = trending.get("trending_summary", "")
        topics = trending.get("top_topics", [])
        keywords = trending.get("keywords", [])

    # Everything below is interpolated into raw HTML that is rendered with
    # unsafe_allow_html=True, so all dynamic text is escaped first.
    topics_html = " ".join(
        f'<span style="background:#eef1f5;padding:2px 10px;border-radius:12px;'
        f'font-size:12px;font-weight:600;color:#2563eb;">'
        f"{_escape_html(str(t))}</span>"
        for t in topics
    )
    keywords_html = " ".join(
        f'<span style="background:#fff8e1;padding:2px 10px;border-radius:12px;'
        f'font-size:11px;font-weight:500;color:#9a6700;border:1px solid #f0d060;">'
        f"{_escape_html(str(k))}</span>"
        for k in keywords
    )
    date_range_label = (
        f'<span style="font-size:12px;color:#9a6700;font-weight:600;">({_escape_html(str(trending_date_range))})</span>'
        if trending_date_range
        else ""
    )
    heading = "🔥 趋势" if lang else "🔥 Trending"
    safe_summary = _escape_html(str(summary_text))

    trending_placeholder.markdown(
        f"""<div class="stats-bar">
<div style="flex:1;min-width:200px;">
<div style="font-size:13px;color:#656d76;margin-bottom:4px;">
{heading} {date_range_label}
</div>
<div style="font-size:13px;color:#424a53;line-height:1.5;">{safe_summary}</div>
<div style="display:flex;gap:6px;flex-wrap:wrap;margin-top:8px;">{topics_html}</div>
<div style="display:flex;gap:6px;flex-wrap:wrap;margin-top:8px;">{keywords_html}</div>
</div>
</div>""",
        unsafe_allow_html=True,
    )