Spaces:

Daniel0315
/

cithub_website

Sleeping

App Files Files Community

Daniel0315 committed 6 days ago

Commit

1c0f18c

verified ·

1 Parent(s): c089330

Upload app.py

Browse files

Files changed (1) hide show

src/app.py +3 -72

src/app.py CHANGED Viewed

@@ -15,7 +15,6 @@ import streamlit.components.v1 as components
 HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
 def csv_download_link(data: bytes, filename: str, label: str) -> None:
     """st.download_button 대신 base64 HTML 링크로 다운로드 — 서버 연결 불필요."""
     b64 = base64.b64encode(data).decode()
@@ -56,12 +55,10 @@ DEFAULT_DATA_DIR = Path(os.environ.get(
     r"C:\Users\user\OneDrive\바탕 화면\Citehub_huggingface\data",
 ))
 def fmt_num(x):
     """Format ``x`` as an integer string with thousands separators.

     Returns ``"-"`` when ``x`` cannot be converted with ``int()``, for
     example ``None``, NaN, infinity, or non-numeric text.
     """
     try:
         return f"{int(x):,}"
     except (TypeError, ValueError, OverflowError):
         # Only conversion failures map to the placeholder; a bare
         # ``except`` would also swallow KeyboardInterrupt / SystemExit.
         return "-"
 def _hf_download(filename: str) -> str:
     from huggingface_hub import hf_hub_download
     return hf_hub_download(
@@ -69,13 +66,11 @@ def _hf_download(filename: str) -> str:
         filename=f"data/{filename}", token=HF_TOKEN or None,
     )
 def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
     """Load one parquet table into a DataFrame.

     When ``HF_REPO_ID`` is set, the file is obtained through
     ``_hf_download`` and ``data_dir`` is not used; otherwise the file is
     read from ``data_dir / filename``.
     """
     if not HF_REPO_ID:
         return pd.read_parquet(data_dir / filename)
     local_path = _hf_download(filename)
     return pd.read_parquet(local_path)
 def plotly_network_fig(
     nodes_df: pd.DataFrame,
     edges_df: pd.DataFrame,
@@ -102,7 +97,6 @@ def plotly_network_fig(
     k = max(1.5, 3.0 / (len(G.nodes) ** 0.4))
     pos = nx.spring_layout(G, seed=42, k=k, iterations=60)
-    # ── edges ─────────────────────────────────
     ex, ey = [], []
     for src, tgt in G.edges():
         x0, y0 = pos.get(src, (0, 0))
@@ -118,7 +112,6 @@ def plotly_network_fig(
         )
     ]
-    # ── nodes grouped by type ─────────────────
     for ntype, color in NODE_TYPE_COLORS.items():
         subset = nodes_df[nodes_df["node_type"] == ntype]
         if subset.empty:
@@ -170,11 +163,9 @@ def plotly_network_fig(
     )
     return fig
 def plotly_ontology_fig(height: int = 820) -> go.Figure:
     """CitationHub 온톨로지 구조 — Plotly SVG. 각 노드에 속성값 표시."""
-    # 각 노드 타입의 주요 속성 (노드 아래에 작게 표시)
     NODE_PROPS = {
         "seed_paper":     "doi · title · journal\nauthor · affiliation\ncountry · field · citedby_count",
         "citation_event": "event_id · citing_year\nprimary_intent · context\nis_influential",
@@ -215,7 +206,6 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
     pos = nx.spring_layout(G, seed=7, k=2.5, iterations=80)
-    # ── 엣지 라인 + 엣지 레이블 ───────────────────────────────
     ex, ey = [], []
     ann = []
     for s, t, lbl in edge_defs:
@@ -234,13 +224,11 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
                    hoverinfo="none", showlegend=False)
     ]
-    # ── 노드 + 속성 annotation ────────────────────────────────
     for nid, label, ntype in node_defs:
         x, y = pos[nid]
         color = NODE_TYPE_COLORS.get(ntype, "#94a3b8")
         props = NODE_PROPS.get(ntype, "")
-        # 노드 마커 + 상단 이름 레이블
         traces.append(go.Scatter(
             x=[x], y=[y], mode="markers+text",
             text=[f"<b>{label}</b>"], textposition="top center",
@@ -253,7 +241,6 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
             textfont=dict(size=11, color="#1e293b"),
         ))
-        # 속성값: 노드 아래에 작은 글씨 annotation
         if props:
             prop_html = props.replace("\n", "<br>")
             ann.append(dict(
@@ -262,7 +249,7 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
                 showarrow=False,
                 xanchor="center",
                 yanchor="top",
-                yshift=-22,          # 노드 마커 아래로 오프셋
                 font=dict(size=8, color="#64748b"),
                 bgcolor="rgba(248,250,252,0.85)",
                 borderpad=2,
@@ -279,7 +266,6 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
     )
     return fig
 def inject_fullscreen(html: str) -> str:
     extra = """
     <button onclick="var el=document.getElementById('mynetwork');
@@ -311,8 +297,6 @@ def inject_fullscreen(html: str) -> str:
     """
     return html.replace("</body>", extra + "</body>")
-# ── 메인 데이터 로드 (핵심 3개 — 빠른 초기 기동) ──────────────
 @st.cache_data(show_spinner=False)
 def load_data(data_dir_str: str):
     d = None if HF_REPO_ID else Path(data_dir_str)
@@ -389,38 +373,31 @@ def load_data(data_dir_str: str):
     }
     return seed, events, citing, filters, overview
-# ── 보조 데이터: 해당 탭 접근 시에만 로드 (lazy) ───────────────
 @st.cache_data(show_spinner=False)
 def load_authors_data(data_dir_str: str) -> pd.DataFrame:
     """Load ``authors.parquet``; used only by the Analytics tab, on tab entry."""
     if HF_REPO_ID:
         # No local directory is passed when an HF repo is configured.
         local_dir = None
     else:
         local_dir = Path(data_dir_str)
     return _read("authors.parquet", local_dir)
 @st.cache_data(show_spinner=False)
 def load_geo_data(data_dir_str: str) -> pd.DataFrame:
     """Load ``affiliation_geo.parquet``; used only by the Geographic Map tab, on tab entry."""
     if HF_REPO_ID:
         # No local directory is passed when an HF repo is configured.
         local_dir = None
     else:
         local_dir = Path(data_dir_str)
     return _read("affiliation_geo.parquet", local_dir)
-# ── KG 데이터: DuckDB 방식으로 분리 로드 ─────────────────────
 @st.cache_data(show_spinner=False)
 def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
     """Load the full ``kg_nodes.parquet`` table (noted as 3.4M rows, ~160MB file)."""
     if HF_REPO_ID:
         # No local directory is passed when an HF repo is configured.
         local_dir = None
     else:
         local_dir = Path(data_dir_str)
     return _read("kg_nodes.parquet", local_dir)
 @st.cache_data(show_spinner=False)
 def get_parquet_path(filename: str, data_dir_str: str) -> str:
     """파일 경로 반환 (HF면 로컬 캐시에 다운로드 후 경로 반환)"""
     if HF_REPO_ID:
         return _hf_download(filename)
-    # DuckDB용: 역슬래시 → 슬래시 변환
-    return str(Path(data_dir_str) / filename).replace("\\", "/")
 @st.cache_data(show_spinner=False)
 def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 80) -> pd.DataFrame:
@@ -436,7 +413,6 @@ def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 8
     """
     return duckdb.execute(q).df()
 @st.cache_data(show_spinner=False)
 def query_enriched_stats(enriched_path: str):
     """DuckDB: enriched 전체 로드 없이 집계 통계만 쿼리"""
@@ -461,7 +437,6 @@ def query_enriched_stats(enriched_path: str):
     return sem_df, field_df
 @st.cache_data(show_spinner=False)
 def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60) -> pd.DataFrame:
     """DuckDB: KG Explorer용 임의 노드 엣지 쿼리"""
@@ -476,8 +451,6 @@ def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60)
     """
     return duckdb.execute(q).df()
-# ── 헬퍼 ───────────────────────────────────────────────────────
 def filter_seed_papers(seed, q, fields, countries, journals):
     df = seed.copy()
     q = (q or "").strip().lower()
@@ -488,20 +461,17 @@ def filter_seed_papers(seed, q, fields, countries, journals):
     if journals:  df = df[df["journal"].str.lower().isin({x.lower() for x in journals})]
     return df.reset_index(drop=True)
 def event_subset(events, seed_paper_id, year_min, year_max):
     """Return the events of one seed paper whose citing year is in range.

     Rows are kept when ``seed_paper_id`` matches and ``citing_year`` lies in
     ``[year_min, year_max]``. Missing years are replaced by sentinel values
     (-99999 for the lower bound, 99999 for the upper bound) before
     comparing. The result has a fresh 0-based index.
     """
     selected = events.loc[events["seed_paper_id"] == seed_paper_id].copy()
     years = selected["citing_year"]
     in_range = (years.fillna(-99999) >= year_min) & (years.fillna(99999) <= year_max)
     return selected.loc[in_range].reset_index(drop=True)
 def build_intent_summary(df):
     """Count rows per ``primary_intent`` for every entry of ``ALLOWED_INTENTS``.

     Returns a DataFrame with columns ``intent`` and ``count``, one row per
     allowed intent in ``ALLOWED_INTENTS`` order; intents absent from ``df``
     get a count of 0.
     """
     per_intent = df.groupby("primary_intent").size().to_dict()
     tallies = []
     for intent in ALLOWED_INTENTS:
         tallies.append(int(per_intent.get(intent, 0)))
     return pd.DataFrame({"intent": ALLOWED_INTENTS, "count": tallies})
 def build_context_rows(df, limit=20):
     rows = []
     df = df.sort_values(["context_count","intent_count","citing_year"],
@@ -518,7 +488,6 @@ def build_context_rows(df, limit=20):
         if len(rows) >= limit: break
     return pd.DataFrame(rows[:limit])
 def build_citing_table(df, limit=30):
     if df.empty:
         return pd.DataFrame(columns=["citing_title","citing_year","primary_intent","context_count"])
@@ -527,7 +496,6 @@ def build_citing_table(df, limit=30):
             [["citing_paper_id","citing_title","citing_doi","citing_year","primary_intent","context_count"]]
             .drop_duplicates(subset=["citing_paper_id"]).head(limit))
 def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
     """선택된 seed paper를 인용한 논문들이 함께 인용한 다른 seed papers"""
     citing_ids = events[events["seed_paper_id"] == selected_seed_id]["citing_paper_id"].unique()
@@ -540,7 +508,6 @@ def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
     return cocited.merge(seed[["seed_paper_id","title","field","journal","citedby_count"]],
                          on="seed_paper_id", how="left")
 def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
     """선택된 seed paper의 KG 1-hop 서브그래프 반환"""
     node_id = f"seed:{seed_doi}"
@@ -552,7 +519,6 @@ def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
     nodes = kg_nodes[kg_nodes["node_id"].isin(all_node_ids)]
     return nodes, edges
 def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
     """KG Explorer: 임의 노드 기준 서브그래프"""
     edges = kg_edges[(kg_edges["source"] == search_node_id) |
@@ -563,8 +529,6 @@ def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60)
     nodes = kg_nodes[kg_nodes["node_id"].isin(all_ids)]
     return nodes, edges
-# ── pyvis 빌더 ─────────────────────────────────────────────────
 def pyvis_citation_graph(seed_row, events_df):
     net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
     sid = seed_row["seed_paper_id"]
@@ -583,7 +547,6 @@ def pyvis_citation_graph(seed_row, events_df):
     net.barnes_hut()
     return inject_fullscreen(net.generate_html())
 def pyvis_ontology():
     net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
     for nid, label, typ in [
@@ -605,7 +568,6 @@ def pyvis_ontology():
     net.barnes_hut()
     return inject_fullscreen(net.generate_html())
 def pyvis_from_kg(nodes_df, edges_df, height="780px"):
     """kg_nodes / kg_edges DataFrame으로 pyvis 그래프 생성"""
     net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
@@ -624,14 +586,9 @@ def pyvis_from_kg(nodes_df, edges_df, height="780px"):
     net.barnes_hut()
     return inject_fullscreen(net.generate_html())
-# ═══════════════════════════════════════════════════════════════
-#  메인 UI
-# ═══════════════════════════════════════════════════════════════
 st.title("CitationHub")
 st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")
-# ── Sidebar ────────────────────────────────────────────────────
 with st.sidebar:
     st.subheader("Data source")
     if HF_REPO_ID:
@@ -688,15 +645,12 @@ intent_summary = build_intent_summary(seed_events)
 contexts_df    = build_context_rows(seed_events)
 citing_table   = build_citing_table(seed_events)
-# ── 탭 ─────────────────────────────────────────────────────────
 (tab_overview, tab_cnet,
  tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
     "Overview","Citation Network",
     "Knowledge Graph","Geographic Map","Analytics",
 ])
-# ═══ 1. OVERVIEW ═══════════════════════════════════════════════
 with tab_overview:
     col1, col2 = st.columns(2)
     with col1:
@@ -768,8 +722,6 @@ with tab_overview:
                 <div>{row['context']}</div></div>""",
                 unsafe_allow_html=True)
-# ═══ 2. CITATION NETWORK ════════════════════════════════════════
 with tab_cnet:
     st.subheader("Citation Network")
     st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
@@ -778,13 +730,9 @@ with tab_cnet:
     else:
         components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True)
-# ═══ 3. ONTOLOGY ════════════════════════════════════════════════
-# ═══ 3. KNOWLEDGE GRAPH (KG Explorer + Ontology) ════════════════
 with tab_kg_exp:
     st.subheader("Knowledge Graph")
-    # ── CitationHub Ontology ─────────────────────────────────────
     st.subheader("CitationHub Ontology — Concepts, Instances & Relationships")
     st.caption("🔍 Scroll/pinch: zoom  |  Drag: pan  |  Hover node: details  |  ⛶ (top-right toolbar): fullscreen")
     st.plotly_chart(plotly_ontology_fig(height=820), use_container_width=True)
@@ -796,7 +744,6 @@ with tab_kg_exp:
             kg_nodes_exp  = load_kg_nodes(data_dir_val)
             kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
-        # ── 노드/엣지 타입 분포 통계
         import duckdb as _ddb
         nt = kg_nodes_exp["node_type"].value_counts().reset_index()
@@ -828,14 +775,12 @@ with tab_kg_exp:
                                  yaxis_title="Count", xaxis_tickangle=-35)
             st.plotly_chart(et_fig, use_container_width=True)
-        # ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
         st.markdown("---")
         st.subheader("Multi-Node Knowledge Graph")
         st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
         n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
-        # edge type당 고정 샘플 수 — 10 types × 10 = 최대 100 edges
         EDGES_PER_TYPE = 10
         with st.spinner("Querying graph..."):
@@ -847,8 +792,6 @@ with tab_kg_exp:
             if seed_ids:
                 ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
-                # 1-hop: seed paper에 연결된 모든 edge type
-                # → journal / author / affiliation / city / country / field / citation_event
                 hop1 = _ddb.execute(f"""
                     WITH ranked AS (
                         SELECT source, target, edge_type,
@@ -862,8 +805,6 @@ with tab_kg_exp:
                     WHERE rn <= {EDGES_PER_TYPE}
                 """).df()
-                # 2-hop: kg_nodes_exp 타입 기반으로 citation_event 노드 ID 추출
-                # (prefix 가정 없이 실제 node_type 컬럼으로 확인)
                 hop1_all_ids = set(hop1["source"].tolist()) | set(hop1["target"].tolist())
                 event_node_ids = (
                     kg_nodes_exp[
@@ -874,8 +815,7 @@ with tab_kg_exp:
                 if event_node_ids:
                     ev_sql = ", ".join(f"'{eid}'" for eid in event_node_ids)
-                    # citation_event → HAS_CITING_PAPER → citing_paper
-                    # citation_event → HAS_PRIMARY_INTENT → intent
                     hop2 = _ddb.execute(f"""
                         WITH ranked AS (
                             SELECT source, target, edge_type,
@@ -914,8 +854,6 @@ with tab_kg_exp:
     except Exception as e:
         st.error(str(e))
-# ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
 with tab_geo:
     st.subheader("Geographic Distribution of Seed Papers")
     with st.spinner("Loading geographic data..."):
@@ -948,7 +886,6 @@ with tab_geo:
             .update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
             use_container_width=True)
-    # ── Affiliation 시각화 ──────────────────────────────────────
     st.subheader("Top Affiliations")
     geo_col1, geo_col2 = st.columns(2)
@@ -989,8 +926,6 @@ with tab_geo:
                                legend_title="Country", height=520),
                 use_container_width=True)
-# ═══ 7. ANALYTICS ═══════════════════════════════════════════════
 with tab_analytics:
     try:
         with st.spinner("Loading analytics data..."):
@@ -1063,7 +998,6 @@ with tab_analytics:
                        title="Influential vs Non-influential"),
                 use_container_width=True)
-    # ── Intent Evolution over Years ────────────────────────────
     st.markdown("---")
     st.subheader("CitationHub Intent Evolution over Years")
     st.caption("How citation intents have changed across all papers over time")
@@ -1088,7 +1022,6 @@ with tab_analytics:
             use_container_width=True,
         )
-    # ── Top Citing Venues ───────────────────────────────────────
     st.markdown("---")
     col_v1, col_v2 = st.columns(2)
@@ -1136,7 +1069,6 @@ with tab_analytics:
                 use_container_width=True,
             )
-    # ── Citation Trend over Time ────────────────────────────────
     st.markdown("---")
     st.subheader("Citation Trend over Time (selected paper)")
     st.caption("How citations to the selected seed paper have changed year by year")
@@ -1154,7 +1086,6 @@ with tab_analytics:
     else:
         st.info("No citation trend data for the selected paper.")
-    # ── Export ─────────────────────────────────────────────────
     st.markdown("---")
     st.subheader("Export Data")
     col_e1, col_e2, col_e3 = st.columns(3)

 HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
 def csv_download_link(data: bytes, filename: str, label: str) -> None:
     """st.download_button 대신 base64 HTML 링크로 다운로드 — 서버 연결 불필요."""
     b64 = base64.b64encode(data).decode()
     r"C:\Users\user\OneDrive\바탕 화면\Citehub_huggingface\data",
 ))
 def fmt_num(x):
     """Format ``x`` as an integer string with thousands separators.

     Returns ``"-"`` when ``x`` cannot be converted with ``int()``, for
     example ``None``, NaN, infinity, or non-numeric text.
     """
     try:
         return f"{int(x):,}"
     except (TypeError, ValueError, OverflowError):
         # Only conversion failures map to the placeholder; a bare
         # ``except`` would also swallow KeyboardInterrupt / SystemExit.
         return "-"
 def _hf_download(filename: str) -> str:
     from huggingface_hub import hf_hub_download
     return hf_hub_download(
         filename=f"data/{filename}", token=HF_TOKEN or None,
     )
 def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
     """Load one parquet table into a DataFrame.

     When ``HF_REPO_ID`` is set, the file is obtained through
     ``_hf_download`` and ``data_dir`` is not used; otherwise the file is
     read from ``data_dir / filename``.
     """
     if not HF_REPO_ID:
         return pd.read_parquet(data_dir / filename)
     local_path = _hf_download(filename)
     return pd.read_parquet(local_path)
 def plotly_network_fig(
     nodes_df: pd.DataFrame,
     edges_df: pd.DataFrame,
     k = max(1.5, 3.0 / (len(G.nodes) ** 0.4))
     pos = nx.spring_layout(G, seed=42, k=k, iterations=60)
     ex, ey = [], []
     for src, tgt in G.edges():
         x0, y0 = pos.get(src, (0, 0))
         )
     ]
     for ntype, color in NODE_TYPE_COLORS.items():
         subset = nodes_df[nodes_df["node_type"] == ntype]
         if subset.empty:
     )
     return fig
 def plotly_ontology_fig(height: int = 820) -> go.Figure:
     """CitationHub 온톨로지 구조 — Plotly SVG. 각 노드에 속성값 표시."""
     NODE_PROPS = {
         "seed_paper":     "doi · title · journal\nauthor · affiliation\ncountry · field · citedby_count",
         "citation_event": "event_id · citing_year\nprimary_intent · context\nis_influential",
     pos = nx.spring_layout(G, seed=7, k=2.5, iterations=80)
     ex, ey = [], []
     ann = []
     for s, t, lbl in edge_defs:
                    hoverinfo="none", showlegend=False)
     ]
     for nid, label, ntype in node_defs:
         x, y = pos[nid]
         color = NODE_TYPE_COLORS.get(ntype, "#94a3b8")
         props = NODE_PROPS.get(ntype, "")
         traces.append(go.Scatter(
             x=[x], y=[y], mode="markers+text",
             text=[f"<b>{label}</b>"], textposition="top center",
             textfont=dict(size=11, color="#1e293b"),
         ))
         if props:
             prop_html = props.replace("\n", "<br>")
             ann.append(dict(
                 showarrow=False,
                 xanchor="center",
                 yanchor="top",
+                yshift=-22,
                 font=dict(size=8, color="#64748b"),
                 bgcolor="rgba(248,250,252,0.85)",
                 borderpad=2,
     )
     return fig
 def inject_fullscreen(html: str) -> str:
     extra = """
     <button onclick="var el=document.getElementById('mynetwork');
     """
     return html.replace("</body>", extra + "</body>")
 @st.cache_data(show_spinner=False)
 def load_data(data_dir_str: str):
     d = None if HF_REPO_ID else Path(data_dir_str)
     }
     return seed, events, citing, filters, overview
 @st.cache_data(show_spinner=False)
 def load_authors_data(data_dir_str: str) -> pd.DataFrame:
     """Load ``authors.parquet``; used only by the Analytics tab, on tab entry."""
     if HF_REPO_ID:
         # No local directory is passed when an HF repo is configured.
         local_dir = None
     else:
         local_dir = Path(data_dir_str)
     return _read("authors.parquet", local_dir)
 @st.cache_data(show_spinner=False)
 def load_geo_data(data_dir_str: str) -> pd.DataFrame:
     """Load ``affiliation_geo.parquet``; used only by the Geographic Map tab, on tab entry."""
     if HF_REPO_ID:
         # No local directory is passed when an HF repo is configured.
         local_dir = None
     else:
         local_dir = Path(data_dir_str)
     return _read("affiliation_geo.parquet", local_dir)
 @st.cache_data(show_spinner=False)
 def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
     """Load the full ``kg_nodes.parquet`` table (noted as 3.4M rows, ~160MB file)."""
     if HF_REPO_ID:
         # No local directory is passed when an HF repo is configured.
         local_dir = None
     else:
         local_dir = Path(data_dir_str)
     return _read("kg_nodes.parquet", local_dir)
 @st.cache_data(show_spinner=False)
 def get_parquet_path(filename: str, data_dir_str: str) -> str:
     """파일 경로 반환 (HF면 로컬 캐시에 다운로드 후 경로 반환)"""
     if HF_REPO_ID:
         return _hf_download(filename)
+    return str(Path(data_dir_str) / filename).replace("\\", "/")
 @st.cache_data(show_spinner=False)
 def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 80) -> pd.DataFrame:
     """
     return duckdb.execute(q).df()
 @st.cache_data(show_spinner=False)
 def query_enriched_stats(enriched_path: str):
     """DuckDB: enriched 전체 로드 없이 집계 통계만 쿼리"""
     return sem_df, field_df
 @st.cache_data(show_spinner=False)
 def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60) -> pd.DataFrame:
     """DuckDB: KG Explorer용 임의 노드 엣지 쿼리"""
     """
     return duckdb.execute(q).df()
 def filter_seed_papers(seed, q, fields, countries, journals):
     df = seed.copy()
     q = (q or "").strip().lower()
     if journals:  df = df[df["journal"].str.lower().isin({x.lower() for x in journals})]
     return df.reset_index(drop=True)
 def event_subset(events, seed_paper_id, year_min, year_max):
     """Return the events of one seed paper whose citing year is in range.

     Rows are kept when ``seed_paper_id`` matches and ``citing_year`` lies in
     ``[year_min, year_max]``. Missing years are replaced by sentinel values
     (-99999 for the lower bound, 99999 for the upper bound) before
     comparing. The result has a fresh 0-based index.
     """
     selected = events.loc[events["seed_paper_id"] == seed_paper_id].copy()
     years = selected["citing_year"]
     in_range = (years.fillna(-99999) >= year_min) & (years.fillna(99999) <= year_max)
     return selected.loc[in_range].reset_index(drop=True)
 def build_intent_summary(df):
     """Count rows per ``primary_intent`` for every entry of ``ALLOWED_INTENTS``.

     Returns a DataFrame with columns ``intent`` and ``count``, one row per
     allowed intent in ``ALLOWED_INTENTS`` order; intents absent from ``df``
     get a count of 0.
     """
     per_intent = df.groupby("primary_intent").size().to_dict()
     tallies = []
     for intent in ALLOWED_INTENTS:
         tallies.append(int(per_intent.get(intent, 0)))
     return pd.DataFrame({"intent": ALLOWED_INTENTS, "count": tallies})
 def build_context_rows(df, limit=20):
     rows = []
     df = df.sort_values(["context_count","intent_count","citing_year"],
         if len(rows) >= limit: break
     return pd.DataFrame(rows[:limit])
 def build_citing_table(df, limit=30):
     if df.empty:
         return pd.DataFrame(columns=["citing_title","citing_year","primary_intent","context_count"])
             [["citing_paper_id","citing_title","citing_doi","citing_year","primary_intent","context_count"]]
             .drop_duplicates(subset=["citing_paper_id"]).head(limit))
 def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
     """선택된 seed paper를 인용한 논문들이 함께 인용한 다른 seed papers"""
     citing_ids = events[events["seed_paper_id"] == selected_seed_id]["citing_paper_id"].unique()
     return cocited.merge(seed[["seed_paper_id","title","field","journal","citedby_count"]],
                          on="seed_paper_id", how="left")
 def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
     """선택된 seed paper의 KG 1-hop 서브그래프 반환"""
     node_id = f"seed:{seed_doi}"
     nodes = kg_nodes[kg_nodes["node_id"].isin(all_node_ids)]
     return nodes, edges
 def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
     """KG Explorer: 임의 노드 기준 서브그래프"""
     edges = kg_edges[(kg_edges["source"] == search_node_id) |
     nodes = kg_nodes[kg_nodes["node_id"].isin(all_ids)]
     return nodes, edges
 def pyvis_citation_graph(seed_row, events_df):
     net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
     sid = seed_row["seed_paper_id"]
     net.barnes_hut()
     return inject_fullscreen(net.generate_html())
 def pyvis_ontology():
     net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
     for nid, label, typ in [
     net.barnes_hut()
     return inject_fullscreen(net.generate_html())
 def pyvis_from_kg(nodes_df, edges_df, height="780px"):
     """kg_nodes / kg_edges DataFrame으로 pyvis 그래프 생성"""
     net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
     net.barnes_hut()
     return inject_fullscreen(net.generate_html())
 st.title("CitationHub")
 st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")
 with st.sidebar:
     st.subheader("Data source")
     if HF_REPO_ID:
 contexts_df    = build_context_rows(seed_events)
 citing_table   = build_citing_table(seed_events)
 (tab_overview, tab_cnet,
  tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
     "Overview","Citation Network",
     "Knowledge Graph","Geographic Map","Analytics",
 ])
 with tab_overview:
     col1, col2 = st.columns(2)
     with col1:
                 <div>{row['context']}</div></div>""",
                 unsafe_allow_html=True)
 with tab_cnet:
     st.subheader("Citation Network")
     st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
     else:
         components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True)
 with tab_kg_exp:
     st.subheader("Knowledge Graph")
     st.subheader("CitationHub Ontology — Concepts, Instances & Relationships")
     st.caption("🔍 Scroll/pinch: zoom  |  Drag: pan  |  Hover node: details  |  ⛶ (top-right toolbar): fullscreen")
     st.plotly_chart(plotly_ontology_fig(height=820), use_container_width=True)
             kg_nodes_exp  = load_kg_nodes(data_dir_val)
             kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
         import duckdb as _ddb
         nt = kg_nodes_exp["node_type"].value_counts().reset_index()
                                  yaxis_title="Count", xaxis_tickangle=-35)
             st.plotly_chart(et_fig, use_container_width=True)
         st.markdown("---")
         st.subheader("Multi-Node Knowledge Graph")
         st.caption("🖱 Scroll: zoom  |  Drag: pan  |  Click node: info  |  ⛶ button: fullscreen")
         n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
         EDGES_PER_TYPE = 10
         with st.spinner("Querying graph..."):
             if seed_ids:
                 ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
                 hop1 = _ddb.execute(f"""
                     WITH ranked AS (
                         SELECT source, target, edge_type,
                     WHERE rn <= {EDGES_PER_TYPE}
                 """).df()
                 hop1_all_ids = set(hop1["source"].tolist()) | set(hop1["target"].tolist())
                 event_node_ids = (
                     kg_nodes_exp[
                 if event_node_ids:
                     ev_sql = ", ".join(f"'{eid}'" for eid in event_node_ids)
                     hop2 = _ddb.execute(f"""
                         WITH ranked AS (
                             SELECT source, target, edge_type,
     except Exception as e:
         st.error(str(e))
 with tab_geo:
     st.subheader("Geographic Distribution of Seed Papers")
     with st.spinner("Loading geographic data..."):
             .update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
             use_container_width=True)
     st.subheader("Top Affiliations")
     geo_col1, geo_col2 = st.columns(2)
                                legend_title="Country", height=520),
                 use_container_width=True)
 with tab_analytics:
     try:
         with st.spinner("Loading analytics data..."):
                        title="Influential vs Non-influential"),
                 use_container_width=True)
     st.markdown("---")
     st.subheader("CitationHub Intent Evolution over Years")
     st.caption("How citation intents have changed across all papers over time")
             use_container_width=True,
         )
     st.markdown("---")
     col_v1, col_v2 = st.columns(2)
                 use_container_width=True,
             )
     st.markdown("---")
     st.subheader("Citation Trend over Time (selected paper)")
     st.caption("How citations to the selected seed paper have changed year by year")
     else:
         st.info("No citation trend data for the selected paper.")
     st.markdown("---")
     st.subheader("Export Data")
     col_e1, col_e2, col_e3 = st.columns(3)