Spaces:
Sleeping
Sleeping
Upload app.py
Browse files- src/app.py +3 -72
src/app.py
CHANGED
|
@@ -15,7 +15,6 @@ import streamlit.components.v1 as components
|
|
| 15 |
|
| 16 |
HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
|
| 17 |
|
| 18 |
-
|
| 19 |
def csv_download_link(data: bytes, filename: str, label: str) -> None:
|
| 20 |
"""st.download_button λμ base64 HTML λ§ν¬λ‘ λ€μ΄λ‘λ β μλ² μ°κ²° λΆνμ."""
|
| 21 |
b64 = base64.b64encode(data).decode()
|
|
@@ -56,12 +55,10 @@ DEFAULT_DATA_DIR = Path(os.environ.get(
|
|
| 56 |
r"C:\Users\user\OneDrive\λ°ν νλ©΄\Citehub_huggingface\data",
|
| 57 |
))
|
| 58 |
|
| 59 |
-
|
| 60 |
def fmt_num(x):
    """Format a value as an integer with thousands separators.

    Args:
        x: Any value accepted by ``int()`` (int, float, numeric string, ...).

    Returns:
        The integer part of ``x`` rendered like ``"1,234,567"``, or ``"-"``
        when ``x`` cannot be converted (``None``, NaN, infinity, non-numeric
        text).
    """
    try:
        return f"{int(x):,}"
    except (TypeError, ValueError, OverflowError):
        # int() raises TypeError for None and other non-numeric objects,
        # ValueError for NaN and non-numeric strings, and OverflowError for
        # infinity. Catching only these keeps the "-" placeholder without
        # swallowing KeyboardInterrupt/SystemExit as a bare ``except:`` would.
        return "-"
|
| 63 |
|
| 64 |
-
|
| 65 |
def _hf_download(filename: str) -> str:
|
| 66 |
from huggingface_hub import hf_hub_download
|
| 67 |
return hf_hub_download(
|
|
@@ -69,13 +66,11 @@ def _hf_download(filename: str) -> str:
|
|
| 69 |
filename=f"data/{filename}", token=HF_TOKEN or None,
|
| 70 |
)
|
| 71 |
|
| 72 |
-
|
| 73 |
def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
    """Load one parquet file from the Hugging Face repo or a local directory.

    Args:
        filename: Bare parquet file name, e.g. ``"authors.parquet"``.
        data_dir: Local directory that holds the file. Ignored when
            ``HF_REPO_ID`` is set; required otherwise.

    Returns:
        The parquet file's contents as a DataFrame.

    Raises:
        TypeError: If ``HF_REPO_ID`` is empty and ``data_dir`` is ``None``.
    """
    if HF_REPO_ID:
        return pd.read_parquet(_hf_download(filename))
    if data_dir is None:
        # Previously this fell through to ``None / filename`` and failed with
        # an opaque "unsupported operand" TypeError; same exception type,
        # actionable message.
        raise TypeError(
            f"data_dir is required to read {filename!r} when HF_REPO_ID is not set"
        )
    # Path() is a no-op for Path inputs and lets plain strings work too.
    return pd.read_parquet(Path(data_dir) / filename)
|
| 77 |
|
| 78 |
-
|
| 79 |
def plotly_network_fig(
|
| 80 |
nodes_df: pd.DataFrame,
|
| 81 |
edges_df: pd.DataFrame,
|
|
@@ -102,7 +97,6 @@ def plotly_network_fig(
|
|
| 102 |
k = max(1.5, 3.0 / (len(G.nodes) ** 0.4))
|
| 103 |
pos = nx.spring_layout(G, seed=42, k=k, iterations=60)
|
| 104 |
|
| 105 |
-
# ββ edges βββββββββββββββββββββββββββββββββ
|
| 106 |
ex, ey = [], []
|
| 107 |
for src, tgt in G.edges():
|
| 108 |
x0, y0 = pos.get(src, (0, 0))
|
|
@@ -118,7 +112,6 @@ def plotly_network_fig(
|
|
| 118 |
)
|
| 119 |
]
|
| 120 |
|
| 121 |
-
# ββ nodes grouped by type βββββββββββββββββ
|
| 122 |
for ntype, color in NODE_TYPE_COLORS.items():
|
| 123 |
subset = nodes_df[nodes_df["node_type"] == ntype]
|
| 124 |
if subset.empty:
|
|
@@ -170,11 +163,9 @@ def plotly_network_fig(
|
|
| 170 |
)
|
| 171 |
return fig
|
| 172 |
|
| 173 |
-
|
| 174 |
def plotly_ontology_fig(height: int = 820) -> go.Figure:
|
| 175 |
"""CitationHub μ¨ν¨λ‘μ§ κ΅¬μ‘° β Plotly SVG. κ° λ
Έλμ μμ±κ° νμ."""
|
| 176 |
|
| 177 |
-
# κ° λ
Έλ νμ
μ μ£Όμ μμ± (λ
Έλ μλμ μκ² νμ)
|
| 178 |
NODE_PROPS = {
|
| 179 |
"seed_paper": "doi Β· title Β· journal\nauthor Β· affiliation\ncountry Β· field Β· citedby_count",
|
| 180 |
"citation_event": "event_id Β· citing_year\nprimary_intent Β· context\nis_influential",
|
|
@@ -215,7 +206,6 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
|
|
| 215 |
|
| 216 |
pos = nx.spring_layout(G, seed=7, k=2.5, iterations=80)
|
| 217 |
|
| 218 |
-
# ββ μ£μ§ λΌμΈ + μ£μ§ λ μ΄λΈ βββββββββββββββββββββββββββββββ
|
| 219 |
ex, ey = [], []
|
| 220 |
ann = []
|
| 221 |
for s, t, lbl in edge_defs:
|
|
@@ -234,13 +224,11 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
|
|
| 234 |
hoverinfo="none", showlegend=False)
|
| 235 |
]
|
| 236 |
|
| 237 |
-
# ββ λ
Έλ + μμ± annotation ββββββββββββββββββββββββββββββββ
|
| 238 |
for nid, label, ntype in node_defs:
|
| 239 |
x, y = pos[nid]
|
| 240 |
color = NODE_TYPE_COLORS.get(ntype, "#94a3b8")
|
| 241 |
props = NODE_PROPS.get(ntype, "")
|
| 242 |
|
| 243 |
-
# λ
Έλ λ§μ»€ + μλ¨ μ΄λ¦ λ μ΄λΈ
|
| 244 |
traces.append(go.Scatter(
|
| 245 |
x=[x], y=[y], mode="markers+text",
|
| 246 |
text=[f"<b>{label}</b>"], textposition="top center",
|
|
@@ -253,7 +241,6 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
|
|
| 253 |
textfont=dict(size=11, color="#1e293b"),
|
| 254 |
))
|
| 255 |
|
| 256 |
-
# μμ±κ°: λ
Έλ μλμ μμ κΈμ¨ annotation
|
| 257 |
if props:
|
| 258 |
prop_html = props.replace("\n", "<br>")
|
| 259 |
ann.append(dict(
|
|
@@ -262,7 +249,7 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
|
|
| 262 |
showarrow=False,
|
| 263 |
xanchor="center",
|
| 264 |
yanchor="top",
|
| 265 |
-
yshift=-22,
|
| 266 |
font=dict(size=8, color="#64748b"),
|
| 267 |
bgcolor="rgba(248,250,252,0.85)",
|
| 268 |
borderpad=2,
|
|
@@ -279,7 +266,6 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
|
|
| 279 |
)
|
| 280 |
return fig
|
| 281 |
|
| 282 |
-
|
| 283 |
def inject_fullscreen(html: str) -> str:
|
| 284 |
extra = """
|
| 285 |
<button onclick="var el=document.getElementById('mynetwork');
|
|
@@ -311,8 +297,6 @@ def inject_fullscreen(html: str) -> str:
|
|
| 311 |
"""
|
| 312 |
return html.replace("</body>", extra + "</body>")
|
| 313 |
|
| 314 |
-
|
| 315 |
-
# ── 메인 데이터 로드 (핵심 3개 — 빠른 초기 기동) ──────────────
|
| 316 |
@st.cache_data(show_spinner=False)
|
| 317 |
def load_data(data_dir_str: str):
|
| 318 |
d = None if HF_REPO_ID else Path(data_dir_str)
|
|
@@ -389,38 +373,31 @@ def load_data(data_dir_str: str):
|
|
| 389 |
}
|
| 390 |
return seed, events, citing, filters, overview
|
| 391 |
|
| 392 |
-
|
| 393 |
-
# ── 보조 데이터: 해당 탭 접근 시에만 로드 (lazy) ───────────────
|
| 394 |
@st.cache_data(show_spinner=False)
def load_authors_data(data_dir_str: str) -> pd.DataFrame:
    """Read ``authors.parquet`` as a DataFrame.

    Per the original note, this is used only by the Analytics tab and is
    loaded when that tab is entered.

    Args:
        data_dir_str: Local data directory; only consulted when
            ``HF_REPO_ID`` is empty.
    """
    if HF_REPO_ID:
        local_dir = None
    else:
        local_dir = Path(data_dir_str)
    return _read("authors.parquet", local_dir)
|
| 399 |
|
| 400 |
-
|
| 401 |
@st.cache_data(show_spinner=False)
def load_geo_data(data_dir_str: str) -> pd.DataFrame:
    """Read ``affiliation_geo.parquet`` as a DataFrame.

    Per the original note, this is used only by the Geographic Map tab and is
    loaded when that tab is entered.

    Args:
        data_dir_str: Local data directory; only consulted when
            ``HF_REPO_ID`` is empty.
    """
    if HF_REPO_ID:
        local_dir = None
    else:
        local_dir = Path(data_dir_str)
    return _read("affiliation_geo.parquet", local_dir)
|
| 406 |
|
| 407 |
-
|
| 408 |
-
# ββ KG λ°μ΄ν°: DuckDB λ°©μμΌλ‘ λΆλ¦¬ λ‘λ βββββββββββββββββββββ
|
| 409 |
@st.cache_data(show_spinner=False)
def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
    """Read the whole of ``kg_nodes.parquet`` as a DataFrame.

    The original note sizes the file at about 3.4M rows / ~160MB.

    Args:
        data_dir_str: Local data directory; only consulted when
            ``HF_REPO_ID`` is empty.
    """
    if HF_REPO_ID:
        local_dir = None
    else:
        local_dir = Path(data_dir_str)
    return _read("kg_nodes.parquet", local_dir)
|
| 414 |
|
| 415 |
-
|
| 416 |
@st.cache_data(show_spinner=False)
def get_parquet_path(filename: str, data_dir_str: str) -> str:
    """Return a filesystem path for ``filename``.

    When ``HF_REPO_ID`` is set, the path is whatever ``_hf_download`` returns
    for the file. Otherwise it is ``data_dir_str`` joined with ``filename``.

    Args:
        filename: Bare parquet file name.
        data_dir_str: Local data directory; only used when ``HF_REPO_ID`` is
            empty.
    """
    if not HF_REPO_ID:
        local_path = str(Path(data_dir_str) / filename)
        # For DuckDB: convert backslashes to forward slashes.
        return local_path.replace("\\", "/")
    return _hf_download(filename)
|
| 423 |
|
|
|
|
| 424 |
|
| 425 |
@st.cache_data(show_spinner=False)
|
| 426 |
def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 80) -> pd.DataFrame:
|
|
@@ -436,7 +413,6 @@ def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 8
|
|
| 436 |
"""
|
| 437 |
return duckdb.execute(q).df()
|
| 438 |
|
| 439 |
-
|
| 440 |
@st.cache_data(show_spinner=False)
|
| 441 |
def query_enriched_stats(enriched_path: str):
|
| 442 |
"""DuckDB: enriched μ 체 λ‘λ μμ΄ μ§κ³ ν΅κ³λ§ 쿼리"""
|
|
@@ -461,7 +437,6 @@ def query_enriched_stats(enriched_path: str):
|
|
| 461 |
|
| 462 |
return sem_df, field_df
|
| 463 |
|
| 464 |
-
|
| 465 |
@st.cache_data(show_spinner=False)
|
| 466 |
def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60) -> pd.DataFrame:
|
| 467 |
"""DuckDB: KG Explorerμ© μμ λ
Έλ μ£μ§ 쿼리"""
|
|
@@ -476,8 +451,6 @@ def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60)
|
|
| 476 |
"""
|
| 477 |
return duckdb.execute(q).df()
|
| 478 |
|
| 479 |
-
|
| 480 |
-
# ββ ν¬νΌ βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 481 |
def filter_seed_papers(seed, q, fields, countries, journals):
|
| 482 |
df = seed.copy()
|
| 483 |
q = (q or "").strip().lower()
|
|
@@ -488,20 +461,17 @@ def filter_seed_papers(seed, q, fields, countries, journals):
|
|
| 488 |
if journals: df = df[df["journal"].str.lower().isin({x.lower() for x in journals})]
|
| 489 |
return df.reset_index(drop=True)
|
| 490 |
|
| 491 |
-
|
| 492 |
def event_subset(events, seed_paper_id, year_min, year_max):
    """Select one seed paper's citation events within a year range.

    Args:
        events: DataFrame with ``seed_paper_id`` and ``citing_year`` columns.
        seed_paper_id: Value matched against ``events["seed_paper_id"]``.
        year_min: Inclusive lower bound on ``citing_year``.
        year_max: Inclusive upper bound on ``citing_year``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Rows whose ``citing_year``
        is missing are always excluded; ``events`` itself is not modified.
    """
    subset = events[events["seed_paper_id"] == seed_paper_id]
    # Drop missing years explicitly instead of filling them with magic
    # sentinels (-99999 / 99999), which only excluded them while the bounds
    # stayed inside the sentinel range.
    subset = subset[subset["citing_year"].notna()]
    years = subset["citing_year"]
    subset = subset[(years >= year_min) & (years <= year_max)]
    return subset.reset_index(drop=True)
|
| 497 |
|
| 498 |
-
|
| 499 |
def build_intent_summary(df):
    """Count rows per citation intent, in ``ALLOWED_INTENTS`` order.

    Args:
        df: DataFrame with a ``primary_intent`` column.

    Returns:
        A DataFrame with columns ``intent`` and ``count``; every entry of
        ``ALLOWED_INTENTS`` is listed, with 0 for intents absent from ``df``.
    """
    per_intent = df.groupby("primary_intent").size().to_dict()
    tallies = []
    for intent in ALLOWED_INTENTS:
        tallies.append(int(per_intent.get(intent, 0)))
    return pd.DataFrame({"intent": ALLOWED_INTENTS, "count": tallies})
|
| 503 |
|
| 504 |
-
|
| 505 |
def build_context_rows(df, limit=20):
|
| 506 |
rows = []
|
| 507 |
df = df.sort_values(["context_count","intent_count","citing_year"],
|
|
@@ -518,7 +488,6 @@ def build_context_rows(df, limit=20):
|
|
| 518 |
if len(rows) >= limit: break
|
| 519 |
return pd.DataFrame(rows[:limit])
|
| 520 |
|
| 521 |
-
|
| 522 |
def build_citing_table(df, limit=30):
|
| 523 |
if df.empty:
|
| 524 |
return pd.DataFrame(columns=["citing_title","citing_year","primary_intent","context_count"])
|
|
@@ -527,7 +496,6 @@ def build_citing_table(df, limit=30):
|
|
| 527 |
[["citing_paper_id","citing_title","citing_doi","citing_year","primary_intent","context_count"]]
|
| 528 |
.drop_duplicates(subset=["citing_paper_id"]).head(limit))
|
| 529 |
|
| 530 |
-
|
| 531 |
def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
|
| 532 |
"""μ νλ seed paperλ₯Ό μΈμ©ν λ
Όλ¬Έλ€μ΄ ν¨κ» μΈμ©ν λ€λ₯Έ seed papers"""
|
| 533 |
citing_ids = events[events["seed_paper_id"] == selected_seed_id]["citing_paper_id"].unique()
|
|
@@ -540,7 +508,6 @@ def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
|
|
| 540 |
return cocited.merge(seed[["seed_paper_id","title","field","journal","citedby_count"]],
|
| 541 |
on="seed_paper_id", how="left")
|
| 542 |
|
| 543 |
-
|
| 544 |
def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
|
| 545 |
"""μ νλ seed paperμ KG 1-hop μλΈκ·Έλν λ°ν"""
|
| 546 |
node_id = f"seed:{seed_doi}"
|
|
@@ -552,7 +519,6 @@ def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
|
|
| 552 |
nodes = kg_nodes[kg_nodes["node_id"].isin(all_node_ids)]
|
| 553 |
return nodes, edges
|
| 554 |
|
| 555 |
-
|
| 556 |
def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
|
| 557 |
"""KG Explorer: μμ λ
Έλ κΈ°μ€ μλΈκ·Έλν"""
|
| 558 |
edges = kg_edges[(kg_edges["source"] == search_node_id) |
|
|
@@ -563,8 +529,6 @@ def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60)
|
|
| 563 |
nodes = kg_nodes[kg_nodes["node_id"].isin(all_ids)]
|
| 564 |
return nodes, edges
|
| 565 |
|
| 566 |
-
|
| 567 |
-
# ββ pyvis λΉλ οΏ½οΏ½ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 568 |
def pyvis_citation_graph(seed_row, events_df):
|
| 569 |
net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
|
| 570 |
sid = seed_row["seed_paper_id"]
|
|
@@ -583,7 +547,6 @@ def pyvis_citation_graph(seed_row, events_df):
|
|
| 583 |
net.barnes_hut()
|
| 584 |
return inject_fullscreen(net.generate_html())
|
| 585 |
|
| 586 |
-
|
| 587 |
def pyvis_ontology():
|
| 588 |
net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
|
| 589 |
for nid, label, typ in [
|
|
@@ -605,7 +568,6 @@ def pyvis_ontology():
|
|
| 605 |
net.barnes_hut()
|
| 606 |
return inject_fullscreen(net.generate_html())
|
| 607 |
|
| 608 |
-
|
| 609 |
def pyvis_from_kg(nodes_df, edges_df, height="780px"):
|
| 610 |
"""kg_nodes / kg_edges DataFrameμΌλ‘ pyvis κ·Έλν μμ±"""
|
| 611 |
net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
|
|
@@ -624,14 +586,9 @@ def pyvis_from_kg(nodes_df, edges_df, height="780px"):
|
|
| 624 |
net.barnes_hut()
|
| 625 |
return inject_fullscreen(net.generate_html())
|
| 626 |
|
| 627 |
-
|
| 628 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 629 |
-
# λ©μΈ UI
|
| 630 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 631 |
st.title("CitationHub")
|
| 632 |
st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")
|
| 633 |
|
| 634 |
-
# ββ Sidebar ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 635 |
with st.sidebar:
|
| 636 |
st.subheader("Data source")
|
| 637 |
if HF_REPO_ID:
|
|
@@ -688,15 +645,12 @@ intent_summary = build_intent_summary(seed_events)
|
|
| 688 |
contexts_df = build_context_rows(seed_events)
|
| 689 |
citing_table = build_citing_table(seed_events)
|
| 690 |
|
| 691 |
-
# ββ ν βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 692 |
(tab_overview, tab_cnet,
|
| 693 |
tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
|
| 694 |
"Overview","Citation Network",
|
| 695 |
"Knowledge Graph","Geographic Map","Analytics",
|
| 696 |
])
|
| 697 |
|
| 698 |
-
|
| 699 |
-
# βββ 1. OVERVIEW βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 700 |
with tab_overview:
|
| 701 |
col1, col2 = st.columns(2)
|
| 702 |
with col1:
|
|
@@ -768,8 +722,6 @@ with tab_overview:
|
|
| 768 |
<div>{row['context']}</div></div>""",
|
| 769 |
unsafe_allow_html=True)
|
| 770 |
|
| 771 |
-
|
| 772 |
-
# βββ 2. CITATION NETWORK ββββββββββββββββββββββββββββββββββββββββ
|
| 773 |
with tab_cnet:
|
| 774 |
st.subheader("Citation Network")
|
| 775 |
st.caption("π± Scroll: zoom | Drag: pan | Click node: info | βΆ button: fullscreen")
|
|
@@ -778,13 +730,9 @@ with tab_cnet:
|
|
| 778 |
else:
|
| 779 |
components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True)
|
| 780 |
|
| 781 |
-
|
| 782 |
-
# βββ 3. ONTOLOGY ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 783 |
-
# βββ 3. KNOWLEDGE GRAPH (KG Explorer + Ontology) ββββββββββββββββ
|
| 784 |
with tab_kg_exp:
|
| 785 |
st.subheader("Knowledge Graph")
|
| 786 |
|
| 787 |
-
# ββ CitationHub Ontology βββββββββββββββββββββββββββββββββββββ
|
| 788 |
st.subheader("CitationHub Ontology β Concepts, Instances & Relationships")
|
| 789 |
st.caption("π Scroll/pinch: zoom | Drag: pan | Hover node: details | βΆ (top-right toolbar): fullscreen")
|
| 790 |
st.plotly_chart(plotly_ontology_fig(height=820), use_container_width=True)
|
|
@@ -796,7 +744,6 @@ with tab_kg_exp:
|
|
| 796 |
kg_nodes_exp = load_kg_nodes(data_dir_val)
|
| 797 |
kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
|
| 798 |
|
| 799 |
-
# ββ λ
Έλ/μ£μ§ νμ
λΆν¬ ν΅κ³
|
| 800 |
import duckdb as _ddb
|
| 801 |
|
| 802 |
nt = kg_nodes_exp["node_type"].value_counts().reset_index()
|
|
@@ -828,14 +775,12 @@ with tab_kg_exp:
|
|
| 828 |
yaxis_title="Count", xaxis_tickangle=-35)
|
| 829 |
st.plotly_chart(et_fig, use_container_width=True)
|
| 830 |
|
| 831 |
-
# ββ Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
|
| 832 |
st.markdown("---")
|
| 833 |
st.subheader("Multi-Node Knowledge Graph")
|
| 834 |
st.caption("π± Scroll: zoom | Drag: pan | Click node: info | βΆ button: fullscreen")
|
| 835 |
|
| 836 |
n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
|
| 837 |
|
| 838 |
-
# edge typeλΉ κ³ μ μν μ β 10 types Γ 10 = μ΅λ 100 edges
|
| 839 |
EDGES_PER_TYPE = 10
|
| 840 |
|
| 841 |
with st.spinner("Querying graph..."):
|
|
@@ -847,8 +792,6 @@ with tab_kg_exp:
|
|
| 847 |
if seed_ids:
|
| 848 |
ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
|
| 849 |
|
| 850 |
-
# 1-hop: seed paperμ μ°κ²°λ λͺ¨λ edge type
|
| 851 |
-
# β journal / author / affiliation / city / country / field / citation_event
|
| 852 |
hop1 = _ddb.execute(f"""
|
| 853 |
WITH ranked AS (
|
| 854 |
SELECT source, target, edge_type,
|
|
@@ -862,8 +805,6 @@ with tab_kg_exp:
|
|
| 862 |
WHERE rn <= {EDGES_PER_TYPE}
|
| 863 |
""").df()
|
| 864 |
|
| 865 |
-
# 2-hop: kg_nodes_exp νμ
κΈ°λ°μΌλ‘ citation_event λ
Έλ ID μΆμΆ
|
| 866 |
-
# (prefix κ°μ μμ΄ μ€μ node_type 컬λΌμΌλ‘ νμΈ)
|
| 867 |
hop1_all_ids = set(hop1["source"].tolist()) | set(hop1["target"].tolist())
|
| 868 |
event_node_ids = (
|
| 869 |
kg_nodes_exp[
|
|
@@ -874,8 +815,7 @@ with tab_kg_exp:
|
|
| 874 |
|
| 875 |
if event_node_ids:
|
| 876 |
ev_sql = ", ".join(f"'{eid}'" for eid in event_node_ids)
|
| 877 |
-
|
| 878 |
-
# citation_event β HAS_PRIMARY_INTENT β intent
|
| 879 |
hop2 = _ddb.execute(f"""
|
| 880 |
WITH ranked AS (
|
| 881 |
SELECT source, target, edge_type,
|
|
@@ -914,8 +854,6 @@ with tab_kg_exp:
|
|
| 914 |
except Exception as e:
|
| 915 |
st.error(str(e))
|
| 916 |
|
| 917 |
-
|
| 918 |
-
# βββ 6. GEOGRAPHIC MAP ββββββββββββββββββββββββββββββββββββββββββ
|
| 919 |
with tab_geo:
|
| 920 |
st.subheader("Geographic Distribution of Seed Papers")
|
| 921 |
with st.spinner("Loading geographic data..."):
|
|
@@ -948,7 +886,6 @@ with tab_geo:
|
|
| 948 |
.update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
|
| 949 |
use_container_width=True)
|
| 950 |
|
| 951 |
-
# ββ Affiliation μκ°ν ββββββββββββββββββββββββββββββββββββββ
|
| 952 |
st.subheader("Top Affiliations")
|
| 953 |
geo_col1, geo_col2 = st.columns(2)
|
| 954 |
|
|
@@ -989,8 +926,6 @@ with tab_geo:
|
|
| 989 |
legend_title="Country", height=520),
|
| 990 |
use_container_width=True)
|
| 991 |
|
| 992 |
-
|
| 993 |
-
# βββ 7. ANALYTICS βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 994 |
with tab_analytics:
|
| 995 |
try:
|
| 996 |
with st.spinner("Loading analytics data..."):
|
|
@@ -1063,7 +998,6 @@ with tab_analytics:
|
|
| 1063 |
title="Influential vs Non-influential"),
|
| 1064 |
use_container_width=True)
|
| 1065 |
|
| 1066 |
-
# ββ Intent Evolution over Years ββββββββββββββββββββββββββββ
|
| 1067 |
st.markdown("---")
|
| 1068 |
st.subheader("CitationHub Intent Evolution over Years")
|
| 1069 |
st.caption("How citation intents have changed across all papers over time")
|
|
@@ -1088,7 +1022,6 @@ with tab_analytics:
|
|
| 1088 |
use_container_width=True,
|
| 1089 |
)
|
| 1090 |
|
| 1091 |
-
# ββ Top Citing Venues βββββββββββββββββββββββββββββββββββββββ
|
| 1092 |
st.markdown("---")
|
| 1093 |
col_v1, col_v2 = st.columns(2)
|
| 1094 |
|
|
@@ -1136,7 +1069,6 @@ with tab_analytics:
|
|
| 1136 |
use_container_width=True,
|
| 1137 |
)
|
| 1138 |
|
| 1139 |
-
# ββ Citation Trend over Time ββββββββββββββββββββββββββββββββ
|
| 1140 |
st.markdown("---")
|
| 1141 |
st.subheader("Citation Trend over Time (selected paper)")
|
| 1142 |
st.caption("How citations to the selected seed paper have changed year by year")
|
|
@@ -1154,7 +1086,6 @@ with tab_analytics:
|
|
| 1154 |
else:
|
| 1155 |
st.info("No citation trend data for the selected paper.")
|
| 1156 |
|
| 1157 |
-
# ββ Export βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1158 |
st.markdown("---")
|
| 1159 |
st.subheader("Export Data")
|
| 1160 |
col_e1, col_e2, col_e3 = st.columns(3)
|
|
|
|
| 15 |
|
| 16 |
HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
|
| 17 |
|
|
|
|
| 18 |
def csv_download_link(data: bytes, filename: str, label: str) -> None:
|
| 19 |
"""st.download_button λμ base64 HTML λ§ν¬λ‘ λ€μ΄λ‘λ β μλ² μ°κ²° λΆνμ."""
|
| 20 |
b64 = base64.b64encode(data).decode()
|
|
|
|
| 55 |
r"C:\Users\user\OneDrive\λ°ν νλ©΄\Citehub_huggingface\data",
|
| 56 |
))
|
| 57 |
|
|
|
|
| 58 |
def fmt_num(x):
    """Format a value as an integer with thousands separators.

    Args:
        x: Any value accepted by ``int()`` (int, float, numeric string, ...).

    Returns:
        The integer part of ``x`` rendered like ``"1,234,567"``, or ``"-"``
        when ``x`` cannot be converted (``None``, NaN, infinity, non-numeric
        text).
    """
    try:
        return f"{int(x):,}"
    except (TypeError, ValueError, OverflowError):
        # int() raises TypeError for None and other non-numeric objects,
        # ValueError for NaN and non-numeric strings, and OverflowError for
        # infinity. Catching only these keeps the "-" placeholder without
        # swallowing KeyboardInterrupt/SystemExit as a bare ``except:`` would.
        return "-"
|
| 61 |
|
|
|
|
| 62 |
def _hf_download(filename: str) -> str:
|
| 63 |
from huggingface_hub import hf_hub_download
|
| 64 |
return hf_hub_download(
|
|
|
|
| 66 |
filename=f"data/{filename}", token=HF_TOKEN or None,
|
| 67 |
)
|
| 68 |
|
|
|
|
| 69 |
def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
    """Load one parquet file from the Hugging Face repo or a local directory.

    Args:
        filename: Bare parquet file name, e.g. ``"authors.parquet"``.
        data_dir: Local directory that holds the file. Ignored when
            ``HF_REPO_ID`` is set; required otherwise.

    Returns:
        The parquet file's contents as a DataFrame.

    Raises:
        TypeError: If ``HF_REPO_ID`` is empty and ``data_dir`` is ``None``.
    """
    if HF_REPO_ID:
        return pd.read_parquet(_hf_download(filename))
    if data_dir is None:
        # Previously this fell through to ``None / filename`` and failed with
        # an opaque "unsupported operand" TypeError; same exception type,
        # actionable message.
        raise TypeError(
            f"data_dir is required to read {filename!r} when HF_REPO_ID is not set"
        )
    # Path() is a no-op for Path inputs and lets plain strings work too.
    return pd.read_parquet(Path(data_dir) / filename)
|
| 73 |
|
|
|
|
| 74 |
def plotly_network_fig(
|
| 75 |
nodes_df: pd.DataFrame,
|
| 76 |
edges_df: pd.DataFrame,
|
|
|
|
| 97 |
k = max(1.5, 3.0 / (len(G.nodes) ** 0.4))
|
| 98 |
pos = nx.spring_layout(G, seed=42, k=k, iterations=60)
|
| 99 |
|
|
|
|
| 100 |
ex, ey = [], []
|
| 101 |
for src, tgt in G.edges():
|
| 102 |
x0, y0 = pos.get(src, (0, 0))
|
|
|
|
| 112 |
)
|
| 113 |
]
|
| 114 |
|
|
|
|
| 115 |
for ntype, color in NODE_TYPE_COLORS.items():
|
| 116 |
subset = nodes_df[nodes_df["node_type"] == ntype]
|
| 117 |
if subset.empty:
|
|
|
|
| 163 |
)
|
| 164 |
return fig
|
| 165 |
|
|
|
|
| 166 |
def plotly_ontology_fig(height: int = 820) -> go.Figure:
|
| 167 |
"""CitationHub μ¨ν¨λ‘μ§ κ΅¬μ‘° β Plotly SVG. κ° λ
Έλμ μμ±κ° νμ."""
|
| 168 |
|
|
|
|
| 169 |
NODE_PROPS = {
|
| 170 |
"seed_paper": "doi Β· title Β· journal\nauthor Β· affiliation\ncountry Β· field Β· citedby_count",
|
| 171 |
"citation_event": "event_id Β· citing_year\nprimary_intent Β· context\nis_influential",
|
|
|
|
| 206 |
|
| 207 |
pos = nx.spring_layout(G, seed=7, k=2.5, iterations=80)
|
| 208 |
|
|
|
|
| 209 |
ex, ey = [], []
|
| 210 |
ann = []
|
| 211 |
for s, t, lbl in edge_defs:
|
|
|
|
| 224 |
hoverinfo="none", showlegend=False)
|
| 225 |
]
|
| 226 |
|
|
|
|
| 227 |
for nid, label, ntype in node_defs:
|
| 228 |
x, y = pos[nid]
|
| 229 |
color = NODE_TYPE_COLORS.get(ntype, "#94a3b8")
|
| 230 |
props = NODE_PROPS.get(ntype, "")
|
| 231 |
|
|
|
|
| 232 |
traces.append(go.Scatter(
|
| 233 |
x=[x], y=[y], mode="markers+text",
|
| 234 |
text=[f"<b>{label}</b>"], textposition="top center",
|
|
|
|
| 241 |
textfont=dict(size=11, color="#1e293b"),
|
| 242 |
))
|
| 243 |
|
|
|
|
| 244 |
if props:
|
| 245 |
prop_html = props.replace("\n", "<br>")
|
| 246 |
ann.append(dict(
|
|
|
|
| 249 |
showarrow=False,
|
| 250 |
xanchor="center",
|
| 251 |
yanchor="top",
|
| 252 |
+
yshift=-22,
|
| 253 |
font=dict(size=8, color="#64748b"),
|
| 254 |
bgcolor="rgba(248,250,252,0.85)",
|
| 255 |
borderpad=2,
|
|
|
|
| 266 |
)
|
| 267 |
return fig
|
| 268 |
|
|
|
|
| 269 |
def inject_fullscreen(html: str) -> str:
|
| 270 |
extra = """
|
| 271 |
<button onclick="var el=document.getElementById('mynetwork');
|
|
|
|
| 297 |
"""
|
| 298 |
return html.replace("</body>", extra + "</body>")
|
| 299 |
|
|
|
|
|
|
|
| 300 |
@st.cache_data(show_spinner=False)
|
| 301 |
def load_data(data_dir_str: str):
|
| 302 |
d = None if HF_REPO_ID else Path(data_dir_str)
|
|
|
|
| 373 |
}
|
| 374 |
return seed, events, citing, filters, overview
|
| 375 |
|
|
|
|
|
|
|
| 376 |
@st.cache_data(show_spinner=False)
def load_authors_data(data_dir_str: str) -> pd.DataFrame:
    """Read ``authors.parquet`` as a DataFrame.

    Per the original note, this is used only by the Analytics tab and is
    loaded when that tab is entered.

    Args:
        data_dir_str: Local data directory; only consulted when
            ``HF_REPO_ID`` is empty.
    """
    if HF_REPO_ID:
        local_dir = None
    else:
        local_dir = Path(data_dir_str)
    return _read("authors.parquet", local_dir)
|
| 381 |
|
|
|
|
| 382 |
@st.cache_data(show_spinner=False)
def load_geo_data(data_dir_str: str) -> pd.DataFrame:
    """Read ``affiliation_geo.parquet`` as a DataFrame.

    Per the original note, this is used only by the Geographic Map tab and is
    loaded when that tab is entered.

    Args:
        data_dir_str: Local data directory; only consulted when
            ``HF_REPO_ID`` is empty.
    """
    if HF_REPO_ID:
        local_dir = None
    else:
        local_dir = Path(data_dir_str)
    return _read("affiliation_geo.parquet", local_dir)
|
| 387 |
|
|
|
|
|
|
|
| 388 |
@st.cache_data(show_spinner=False)
def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
    """Read the whole of ``kg_nodes.parquet`` as a DataFrame.

    The original note sizes the file at about 3.4M rows / ~160MB.

    Args:
        data_dir_str: Local data directory; only consulted when
            ``HF_REPO_ID`` is empty.
    """
    if HF_REPO_ID:
        local_dir = None
    else:
        local_dir = Path(data_dir_str)
    return _read("kg_nodes.parquet", local_dir)
|
| 393 |
|
|
|
|
| 394 |
@st.cache_data(show_spinner=False)
def get_parquet_path(filename: str, data_dir_str: str) -> str:
    """Return a filesystem path for ``filename``.

    When ``HF_REPO_ID`` is set, the path is whatever ``_hf_download`` returns
    for the file. Otherwise it is ``data_dir_str`` joined with ``filename``.

    Args:
        filename: Bare parquet file name.
        data_dir_str: Local data directory; only used when ``HF_REPO_ID`` is
            empty.
    """
    if not HF_REPO_ID:
        local_path = str(Path(data_dir_str) / filename)
        # Normalize Windows backslashes to forward slashes in the returned path.
        return local_path.replace("\\", "/")
    return _hf_download(filename)
|
| 401 |
|
| 402 |
@st.cache_data(show_spinner=False)
|
| 403 |
def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 80) -> pd.DataFrame:
|
|
|
|
| 413 |
"""
|
| 414 |
return duckdb.execute(q).df()
|
| 415 |
|
|
|
|
| 416 |
@st.cache_data(show_spinner=False)
|
| 417 |
def query_enriched_stats(enriched_path: str):
|
| 418 |
"""DuckDB: enriched μ 체 λ‘λ μμ΄ μ§κ³ ν΅κ³λ§ 쿼리"""
|
|
|
|
| 437 |
|
| 438 |
return sem_df, field_df
|
| 439 |
|
|
|
|
| 440 |
@st.cache_data(show_spinner=False)
|
| 441 |
def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60) -> pd.DataFrame:
|
| 442 |
"""DuckDB: KG Explorerμ© μμ λ
Έλ μ£μ§ 쿼리"""
|
|
|
|
| 451 |
"""
|
| 452 |
return duckdb.execute(q).df()
|
| 453 |
|
|
|
|
|
|
|
| 454 |
def filter_seed_papers(seed, q, fields, countries, journals):
|
| 455 |
df = seed.copy()
|
| 456 |
q = (q or "").strip().lower()
|
|
|
|
| 461 |
if journals: df = df[df["journal"].str.lower().isin({x.lower() for x in journals})]
|
| 462 |
return df.reset_index(drop=True)
|
| 463 |
|
|
|
|
| 464 |
def event_subset(events, seed_paper_id, year_min, year_max):
    """Select one seed paper's citation events within a year range.

    Args:
        events: DataFrame with ``seed_paper_id`` and ``citing_year`` columns.
        seed_paper_id: Value matched against ``events["seed_paper_id"]``.
        year_min: Inclusive lower bound on ``citing_year``.
        year_max: Inclusive upper bound on ``citing_year``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Rows whose ``citing_year``
        is missing are always excluded; ``events`` itself is not modified.
    """
    subset = events[events["seed_paper_id"] == seed_paper_id]
    # Drop missing years explicitly instead of filling them with magic
    # sentinels (-99999 / 99999), which only excluded them while the bounds
    # stayed inside the sentinel range.
    subset = subset[subset["citing_year"].notna()]
    years = subset["citing_year"]
    subset = subset[(years >= year_min) & (years <= year_max)]
    return subset.reset_index(drop=True)
|
| 469 |
|
|
|
|
| 470 |
def build_intent_summary(df):
    """Count rows per citation intent, in ``ALLOWED_INTENTS`` order.

    Args:
        df: DataFrame with a ``primary_intent`` column.

    Returns:
        A DataFrame with columns ``intent`` and ``count``; every entry of
        ``ALLOWED_INTENTS`` is listed, with 0 for intents absent from ``df``.
    """
    per_intent = df.groupby("primary_intent").size().to_dict()
    tallies = []
    for intent in ALLOWED_INTENTS:
        tallies.append(int(per_intent.get(intent, 0)))
    return pd.DataFrame({"intent": ALLOWED_INTENTS, "count": tallies})
|
| 474 |
|
|
|
|
| 475 |
def build_context_rows(df, limit=20):
|
| 476 |
rows = []
|
| 477 |
df = df.sort_values(["context_count","intent_count","citing_year"],
|
|
|
|
| 488 |
if len(rows) >= limit: break
|
| 489 |
return pd.DataFrame(rows[:limit])
|
| 490 |
|
|
|
|
| 491 |
def build_citing_table(df, limit=30):
|
| 492 |
if df.empty:
|
| 493 |
return pd.DataFrame(columns=["citing_title","citing_year","primary_intent","context_count"])
|
|
|
|
| 496 |
[["citing_paper_id","citing_title","citing_doi","citing_year","primary_intent","context_count"]]
|
| 497 |
.drop_duplicates(subset=["citing_paper_id"]).head(limit))
|
| 498 |
|
|
|
|
| 499 |
def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
|
| 500 |
"""μ νλ seed paperλ₯Ό μΈμ©ν λ
Όλ¬Έλ€μ΄ ν¨κ» μΈμ©ν λ€λ₯Έ seed papers"""
|
| 501 |
citing_ids = events[events["seed_paper_id"] == selected_seed_id]["citing_paper_id"].unique()
|
|
|
|
| 508 |
return cocited.merge(seed[["seed_paper_id","title","field","journal","citedby_count"]],
|
| 509 |
on="seed_paper_id", how="left")
|
| 510 |
|
|
|
|
| 511 |
def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
|
| 512 |
"""μ νλ seed paperμ KG 1-hop μλΈκ·Έλν λ°ν"""
|
| 513 |
node_id = f"seed:{seed_doi}"
|
|
|
|
| 519 |
nodes = kg_nodes[kg_nodes["node_id"].isin(all_node_ids)]
|
| 520 |
return nodes, edges
|
| 521 |
|
|
|
|
| 522 |
def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
|
| 523 |
"""KG Explorer: μμ λ
Έλ κΈ°μ€ μλΈκ·Έλν"""
|
| 524 |
edges = kg_edges[(kg_edges["source"] == search_node_id) |
|
|
|
|
| 529 |
nodes = kg_nodes[kg_nodes["node_id"].isin(all_ids)]
|
| 530 |
return nodes, edges
|
| 531 |
|
|
|
|
|
|
|
| 532 |
def pyvis_citation_graph(seed_row, events_df):
|
| 533 |
net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
|
| 534 |
sid = seed_row["seed_paper_id"]
|
|
|
|
| 547 |
net.barnes_hut()
|
| 548 |
return inject_fullscreen(net.generate_html())
|
| 549 |
|
|
|
|
| 550 |
def pyvis_ontology():
|
| 551 |
net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
|
| 552 |
for nid, label, typ in [
|
|
|
|
| 568 |
net.barnes_hut()
|
| 569 |
return inject_fullscreen(net.generate_html())
|
| 570 |
|
|
|
|
| 571 |
def pyvis_from_kg(nodes_df, edges_df, height="780px"):
|
| 572 |
"""kg_nodes / kg_edges DataFrameμΌλ‘ pyvis κ·Έλν μμ±"""
|
| 573 |
net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
|
|
|
|
| 586 |
net.barnes_hut()
|
| 587 |
return inject_fullscreen(net.generate_html())
|
| 588 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 589 |
st.title("CitationHub")
|
| 590 |
st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")
|
| 591 |
|
|
|
|
| 592 |
with st.sidebar:
|
| 593 |
st.subheader("Data source")
|
| 594 |
if HF_REPO_ID:
|
|
|
|
| 645 |
contexts_df = build_context_rows(seed_events)
|
| 646 |
citing_table = build_citing_table(seed_events)
|
| 647 |
|
|
|
|
| 648 |
(tab_overview, tab_cnet,
|
| 649 |
tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
|
| 650 |
"Overview","Citation Network",
|
| 651 |
"Knowledge Graph","Geographic Map","Analytics",
|
| 652 |
])
|
| 653 |
|
|
|
|
|
|
|
| 654 |
with tab_overview:
|
| 655 |
col1, col2 = st.columns(2)
|
| 656 |
with col1:
|
|
|
|
| 722 |
<div>{row['context']}</div></div>""",
|
| 723 |
unsafe_allow_html=True)
|
| 724 |
|
|
|
|
|
|
|
| 725 |
with tab_cnet:
|
| 726 |
st.subheader("Citation Network")
|
| 727 |
st.caption("π± Scroll: zoom | Drag: pan | Click node: info | βΆ button: fullscreen")
|
|
|
|
| 730 |
else:
|
| 731 |
components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True)
|
| 732 |
|
|
|
|
|
|
|
|
|
|
| 733 |
with tab_kg_exp:
|
| 734 |
st.subheader("Knowledge Graph")
|
| 735 |
|
|
|
|
| 736 |
st.subheader("CitationHub Ontology β Concepts, Instances & Relationships")
|
| 737 |
st.caption("π Scroll/pinch: zoom | Drag: pan | Hover node: details | βΆ (top-right toolbar): fullscreen")
|
| 738 |
st.plotly_chart(plotly_ontology_fig(height=820), use_container_width=True)
|
|
|
|
| 744 |
kg_nodes_exp = load_kg_nodes(data_dir_val)
|
| 745 |
kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
|
| 746 |
|
|
|
|
| 747 |
import duckdb as _ddb
|
| 748 |
|
| 749 |
nt = kg_nodes_exp["node_type"].value_counts().reset_index()
|
|
|
|
| 775 |
yaxis_title="Count", xaxis_tickangle=-35)
|
| 776 |
st.plotly_chart(et_fig, use_container_width=True)
|
| 777 |
|
|
|
|
| 778 |
st.markdown("---")
|
| 779 |
st.subheader("Multi-Node Knowledge Graph")
|
| 780 |
st.caption("π± Scroll: zoom | Drag: pan | Click node: info | βΆ button: fullscreen")
|
| 781 |
|
| 782 |
n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
|
| 783 |
|
|
|
|
| 784 |
EDGES_PER_TYPE = 10
|
| 785 |
|
| 786 |
with st.spinner("Querying graph..."):
|
|
|
|
| 792 |
if seed_ids:
|
| 793 |
ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
|
| 794 |
|
|
|
|
|
|
|
| 795 |
hop1 = _ddb.execute(f"""
|
| 796 |
WITH ranked AS (
|
| 797 |
SELECT source, target, edge_type,
|
|
|
|
| 805 |
WHERE rn <= {EDGES_PER_TYPE}
|
| 806 |
""").df()
|
| 807 |
|
|
|
|
|
|
|
| 808 |
hop1_all_ids = set(hop1["source"].tolist()) | set(hop1["target"].tolist())
|
| 809 |
event_node_ids = (
|
| 810 |
kg_nodes_exp[
|
|
|
|
| 815 |
|
| 816 |
if event_node_ids:
|
| 817 |
ev_sql = ", ".join(f"'{eid}'" for eid in event_node_ids)
|
| 818 |
+
|
|
|
|
| 819 |
hop2 = _ddb.execute(f"""
|
| 820 |
WITH ranked AS (
|
| 821 |
SELECT source, target, edge_type,
|
|
|
|
| 854 |
except Exception as e:
|
| 855 |
st.error(str(e))
|
| 856 |
|
|
|
|
|
|
|
| 857 |
with tab_geo:
|
| 858 |
st.subheader("Geographic Distribution of Seed Papers")
|
| 859 |
with st.spinner("Loading geographic data..."):
|
|
|
|
| 886 |
.update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
|
| 887 |
use_container_width=True)
|
| 888 |
|
|
|
|
| 889 |
st.subheader("Top Affiliations")
|
| 890 |
geo_col1, geo_col2 = st.columns(2)
|
| 891 |
|
|
|
|
| 926 |
legend_title="Country", height=520),
|
| 927 |
use_container_width=True)
|
| 928 |
|
|
|
|
|
|
|
| 929 |
with tab_analytics:
|
| 930 |
try:
|
| 931 |
with st.spinner("Loading analytics data..."):
|
|
|
|
| 998 |
title="Influential vs Non-influential"),
|
| 999 |
use_container_width=True)
|
| 1000 |
|
|
|
|
| 1001 |
st.markdown("---")
|
| 1002 |
st.subheader("CitationHub Intent Evolution over Years")
|
| 1003 |
st.caption("How citation intents have changed across all papers over time")
|
|
|
|
| 1022 |
use_container_width=True,
|
| 1023 |
)
|
| 1024 |
|
|
|
|
| 1025 |
st.markdown("---")
|
| 1026 |
col_v1, col_v2 = st.columns(2)
|
| 1027 |
|
|
|
|
| 1069 |
use_container_width=True,
|
| 1070 |
)
|
| 1071 |
|
|
|
|
| 1072 |
st.markdown("---")
|
| 1073 |
st.subheader("Citation Trend over Time (selected paper)")
|
| 1074 |
st.caption("How citations to the selected seed paper have changed year by year")
|
|
|
|
| 1086 |
else:
|
| 1087 |
st.info("No citation trend data for the selected paper.")
|
| 1088 |
|
|
|
|
| 1089 |
st.markdown("---")
|
| 1090 |
st.subheader("Export Data")
|
| 1091 |
col_e1, col_e2, col_e3 = st.columns(3)
|