Daniel0315 committed on
Commit
1c0f18c
·
verified ·
1 Parent(s): c089330

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +3 -72
src/app.py CHANGED
@@ -15,7 +15,6 @@ import streamlit.components.v1 as components
15
 
16
  HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
17
 
18
-
19
  def csv_download_link(data: bytes, filename: str, label: str) -> None:
20
  """st.download_button 대신 base64 HTML 링크로 다운로드 — 서버 연결 불필요."""
21
  b64 = base64.b64encode(data).decode()
@@ -56,12 +55,10 @@ DEFAULT_DATA_DIR = Path(os.environ.get(
56
  r"C:\Users\user\OneDrive\바탕 ν™”λ©΄\Citehub_huggingface\data",
57
  ))
58
 
59
-
60
  def fmt_num(x):
61
  try: return f"{int(x):,}"
62
  except: return "-"
63
 
64
-
65
  def _hf_download(filename: str) -> str:
66
  from huggingface_hub import hf_hub_download
67
  return hf_hub_download(
@@ -69,13 +66,11 @@ def _hf_download(filename: str) -> str:
69
  filename=f"data/{filename}", token=HF_TOKEN or None,
70
  )
71
 
72
-
73
  def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
74
  if HF_REPO_ID:
75
  return pd.read_parquet(_hf_download(filename))
76
  return pd.read_parquet(data_dir / filename)
77
 
78
-
79
  def plotly_network_fig(
80
  nodes_df: pd.DataFrame,
81
  edges_df: pd.DataFrame,
@@ -102,7 +97,6 @@ def plotly_network_fig(
102
  k = max(1.5, 3.0 / (len(G.nodes) ** 0.4))
103
  pos = nx.spring_layout(G, seed=42, k=k, iterations=60)
104
 
105
- # ── edges ─────────────────────────────────
106
  ex, ey = [], []
107
  for src, tgt in G.edges():
108
  x0, y0 = pos.get(src, (0, 0))
@@ -118,7 +112,6 @@ def plotly_network_fig(
118
  )
119
  ]
120
 
121
- # ── nodes grouped by type ─────────────────
122
  for ntype, color in NODE_TYPE_COLORS.items():
123
  subset = nodes_df[nodes_df["node_type"] == ntype]
124
  if subset.empty:
@@ -170,11 +163,9 @@ def plotly_network_fig(
170
  )
171
  return fig
172
 
173
-
174
  def plotly_ontology_fig(height: int = 820) -> go.Figure:
175
  """CitationHub 온톨로지 구조 — Plotly SVG. 각 노드에 속성값 표시."""
176
 
177
- # 각 λ…Έλ“œ νƒ€μž…μ˜ μ£Όμš” 속성 (λ…Έλ“œ μ•„λž˜μ— μž‘κ²Œ ν‘œμ‹œ)
178
  NODE_PROPS = {
179
  "seed_paper": "doi · title · journal\nauthor · affiliation\ncountry · field · citedby_count",
180
  "citation_event": "event_id · citing_year\nprimary_intent · context\nis_influential",
@@ -215,7 +206,6 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
215
 
216
  pos = nx.spring_layout(G, seed=7, k=2.5, iterations=80)
217
 
218
- # ── μ—£μ§€ 라인 + μ—£μ§€ λ ˆμ΄λΈ” ───────────────────────────────
219
  ex, ey = [], []
220
  ann = []
221
  for s, t, lbl in edge_defs:
@@ -234,13 +224,11 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
234
  hoverinfo="none", showlegend=False)
235
  ]
236
 
237
- # ── λ…Έλ“œ + 속성 annotation ────────────────────────────────
238
  for nid, label, ntype in node_defs:
239
  x, y = pos[nid]
240
  color = NODE_TYPE_COLORS.get(ntype, "#94a3b8")
241
  props = NODE_PROPS.get(ntype, "")
242
 
243
- # λ…Έλ“œ 마컀 + 상단 이름 λ ˆμ΄λΈ”
244
  traces.append(go.Scatter(
245
  x=[x], y=[y], mode="markers+text",
246
  text=[f"<b>{label}</b>"], textposition="top center",
@@ -253,7 +241,6 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
253
  textfont=dict(size=11, color="#1e293b"),
254
  ))
255
 
256
- # 속성값: λ…Έλ“œ μ•„λž˜μ— μž‘μ€ 글씨 annotation
257
  if props:
258
  prop_html = props.replace("\n", "<br>")
259
  ann.append(dict(
@@ -262,7 +249,7 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
262
  showarrow=False,
263
  xanchor="center",
264
  yanchor="top",
265
- yshift=-22, # λ…Έλ“œ 마컀 μ•„λž˜λ‘œ μ˜€ν”„μ…‹
266
  font=dict(size=8, color="#64748b"),
267
  bgcolor="rgba(248,250,252,0.85)",
268
  borderpad=2,
@@ -279,7 +266,6 @@ def plotly_ontology_fig(height: int = 820) -> go.Figure:
279
  )
280
  return fig
281
 
282
-
283
  def inject_fullscreen(html: str) -> str:
284
  extra = """
285
  <button onclick="var el=document.getElementById('mynetwork');
@@ -311,8 +297,6 @@ def inject_fullscreen(html: str) -> str:
311
  """
312
  return html.replace("</body>", extra + "</body>")
313
 
314
-
315
- # ── 메인 데이터 λ‘œλ“œ (핡심 3개 β€” λΉ λ₯Έ 초기 기동) ──────────────
316
  @st.cache_data(show_spinner=False)
317
  def load_data(data_dir_str: str):
318
  d = None if HF_REPO_ID else Path(data_dir_str)
@@ -389,38 +373,31 @@ def load_data(data_dir_str: str):
389
  }
390
  return seed, events, citing, filters, overview
391
 
392
-
393
- # ── 보쑰 데이터: ν•΄λ‹Ή νƒ­ μ ‘κ·Ό μ‹œμ—λ§Œ λ‘œλ“œ (lazy) ───────────────
394
  @st.cache_data(show_spinner=False)
395
  def load_authors_data(data_dir_str: str) -> pd.DataFrame:
396
  """Analytics 탭에서만 사용 — 탭 진입 시 로드"""
397
  d = None if HF_REPO_ID else Path(data_dir_str)
398
  return _read("authors.parquet", d)
399
 
400
-
401
  @st.cache_data(show_spinner=False)
402
  def load_geo_data(data_dir_str: str) -> pd.DataFrame:
403
  """Geographic Map 탭에서만 사용 — 탭 진입 시 로드"""
404
  d = None if HF_REPO_ID else Path(data_dir_str)
405
  return _read("affiliation_geo.parquet", d)
406
 
407
-
408
- # ── KG 데이터: DuckDB λ°©μ‹μœΌλ‘œ 뢄리 λ‘œλ“œ ─────────────────────
409
  @st.cache_data(show_spinner=False)
410
  def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
411
  """kg_nodes 전체 로드 (3.4M rows, ~160MB 파일)"""
412
  d = None if HF_REPO_ID else Path(data_dir_str)
413
  return _read("kg_nodes.parquet", d)
414
 
415
-
416
  @st.cache_data(show_spinner=False)
417
  def get_parquet_path(filename: str, data_dir_str: str) -> str:
418
  """파일 경로 반환 (HF면 로컬 캐시에 다운로드 후 경로 반환)"""
419
  if HF_REPO_ID:
420
  return _hf_download(filename)
421
- # DuckDB용: μ—­μŠ¬λž˜μ‹œ β†’ μŠ¬λž˜μ‹œ λ³€ν™˜
422
- return str(Path(data_dir_str) / filename).replace("\\", "/")
423
 
 
424
 
425
  @st.cache_data(show_spinner=False)
426
  def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 80) -> pd.DataFrame:
@@ -436,7 +413,6 @@ def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 8
436
  """
437
  return duckdb.execute(q).df()
438
 
439
-
440
  @st.cache_data(show_spinner=False)
441
  def query_enriched_stats(enriched_path: str):
442
  """DuckDB: enriched 전체 로드 없이 집계 통계만 쿼리"""
@@ -461,7 +437,6 @@ def query_enriched_stats(enriched_path: str):
461
 
462
  return sem_df, field_df
463
 
464
-
465
  @st.cache_data(show_spinner=False)
466
  def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60) -> pd.DataFrame:
467
  """DuckDB: KG Explorer용 임의 노드 엣지 쿼리"""
@@ -476,8 +451,6 @@ def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60)
476
  """
477
  return duckdb.execute(q).df()
478
 
479
-
480
- # ── 헬퍼 ───────────────────────────────────────────────────────
481
  def filter_seed_papers(seed, q, fields, countries, journals):
482
  df = seed.copy()
483
  q = (q or "").strip().lower()
@@ -488,20 +461,17 @@ def filter_seed_papers(seed, q, fields, countries, journals):
488
  if journals: df = df[df["journal"].str.lower().isin({x.lower() for x in journals})]
489
  return df.reset_index(drop=True)
490
 
491
-
492
  def event_subset(events, seed_paper_id, year_min, year_max):
493
  df = events[events["seed_paper_id"] == seed_paper_id].copy()
494
  df = df[df["citing_year"].fillna(-99999) >= year_min]
495
  df = df[df["citing_year"].fillna(99999) <= year_max]
496
  return df.reset_index(drop=True)
497
 
498
-
499
  def build_intent_summary(df):
500
  counts = df.groupby("primary_intent").size().to_dict()
501
  return pd.DataFrame({"intent": ALLOWED_INTENTS,
502
  "count": [int(counts.get(i,0)) for i in ALLOWED_INTENTS]})
503
 
504
-
505
  def build_context_rows(df, limit=20):
506
  rows = []
507
  df = df.sort_values(["context_count","intent_count","citing_year"],
@@ -518,7 +488,6 @@ def build_context_rows(df, limit=20):
518
  if len(rows) >= limit: break
519
  return pd.DataFrame(rows[:limit])
520
 
521
-
522
  def build_citing_table(df, limit=30):
523
  if df.empty:
524
  return pd.DataFrame(columns=["citing_title","citing_year","primary_intent","context_count"])
@@ -527,7 +496,6 @@ def build_citing_table(df, limit=30):
527
  [["citing_paper_id","citing_title","citing_doi","citing_year","primary_intent","context_count"]]
528
  .drop_duplicates(subset=["citing_paper_id"]).head(limit))
529
 
530
-
531
  def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
532
  """선택된 seed paper를 인용한 논문들이 함께 인용한 다른 seed papers"""
533
  citing_ids = events[events["seed_paper_id"] == selected_seed_id]["citing_paper_id"].unique()
@@ -540,7 +508,6 @@ def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
540
  return cocited.merge(seed[["seed_paper_id","title","field","journal","citedby_count"]],
541
  on="seed_paper_id", how="left")
542
 
543
-
544
  def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
545
  """μ„ νƒλœ seed paper의 KG 1-hop μ„œλΈŒκ·Έλž˜ν”„ λ°˜ν™˜"""
546
  node_id = f"seed:{seed_doi}"
@@ -552,7 +519,6 @@ def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
552
  nodes = kg_nodes[kg_nodes["node_id"].isin(all_node_ids)]
553
  return nodes, edges
554
 
555
-
556
  def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
557
  """KG Explorer: μž„μ˜ λ…Έλ“œ κΈ°μ€€ μ„œλΈŒκ·Έλž˜ν”„"""
558
  edges = kg_edges[(kg_edges["source"] == search_node_id) |
@@ -563,8 +529,6 @@ def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60)
563
  nodes = kg_nodes[kg_nodes["node_id"].isin(all_ids)]
564
  return nodes, edges
565
 
566
-
567
- # ── pyvis λΉŒλ” ��────────────────────────────────────────────────
568
  def pyvis_citation_graph(seed_row, events_df):
569
  net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
570
  sid = seed_row["seed_paper_id"]
@@ -583,7 +547,6 @@ def pyvis_citation_graph(seed_row, events_df):
583
  net.barnes_hut()
584
  return inject_fullscreen(net.generate_html())
585
 
586
-
587
  def pyvis_ontology():
588
  net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
589
  for nid, label, typ in [
@@ -605,7 +568,6 @@ def pyvis_ontology():
605
  net.barnes_hut()
606
  return inject_fullscreen(net.generate_html())
607
 
608
-
609
  def pyvis_from_kg(nodes_df, edges_df, height="780px"):
610
  """kg_nodes / kg_edges DataFrame으둜 pyvis κ·Έλž˜ν”„ 생성"""
611
  net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
@@ -624,14 +586,9 @@ def pyvis_from_kg(nodes_df, edges_df, height="780px"):
624
  net.barnes_hut()
625
  return inject_fullscreen(net.generate_html())
626
 
627
-
628
- # ═══════════════════════════════════════════════════════════════
629
- # 메인 UI
630
- # ═══════════════════════════════════════════════════════════════
631
  st.title("CitationHub")
632
  st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")
633
 
634
- # ── Sidebar ────────────────────────────────────────────────────
635
  with st.sidebar:
636
  st.subheader("Data source")
637
  if HF_REPO_ID:
@@ -688,15 +645,12 @@ intent_summary = build_intent_summary(seed_events)
688
  contexts_df = build_context_rows(seed_events)
689
  citing_table = build_citing_table(seed_events)
690
 
691
- # ── νƒ­ ─────────────────────────────────────────────────────────
692
  (tab_overview, tab_cnet,
693
  tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
694
  "Overview","Citation Network",
695
  "Knowledge Graph","Geographic Map","Analytics",
696
  ])
697
 
698
-
699
- # ═══ 1. OVERVIEW ═══════════════════════════════════════════════
700
  with tab_overview:
701
  col1, col2 = st.columns(2)
702
  with col1:
@@ -768,8 +722,6 @@ with tab_overview:
768
  <div>{row['context']}</div></div>""",
769
  unsafe_allow_html=True)
770
 
771
-
772
- # ═══ 2. CITATION NETWORK ════════════════════════════════════════
773
  with tab_cnet:
774
  st.subheader("Citation Network")
775
  st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
@@ -778,13 +730,9 @@ with tab_cnet:
778
  else:
779
  components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True)
780
 
781
-
782
- # ═══ 3. ONTOLOGY ════════════════════════════════════════════════
783
- # ═══ 3. KNOWLEDGE GRAPH (KG Explorer + Ontology) ════════════════
784
  with tab_kg_exp:
785
  st.subheader("Knowledge Graph")
786
 
787
- # ── CitationHub Ontology ─────────────────────────────────────
788
  st.subheader("CitationHub Ontology — Concepts, Instances & Relationships")
789
  st.caption("🔍 Scroll/pinch: zoom | Drag: pan | Hover node: details | ⛶ (top-right toolbar): fullscreen")
790
  st.plotly_chart(plotly_ontology_fig(height=820), use_container_width=True)
@@ -796,7 +744,6 @@ with tab_kg_exp:
796
  kg_nodes_exp = load_kg_nodes(data_dir_val)
797
  kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
798
 
799
- # ── λ…Έλ“œ/μ—£μ§€ νƒ€μž… 뢄포 톡계
800
  import duckdb as _ddb
801
 
802
  nt = kg_nodes_exp["node_type"].value_counts().reset_index()
@@ -828,14 +775,12 @@ with tab_kg_exp:
828
  yaxis_title="Count", xaxis_tickangle=-35)
829
  st.plotly_chart(et_fig, use_container_width=True)
830
 
831
- # ── Multi-Node Knowledge Graph (2-hop: 10 node types + 10 edge types)
832
  st.markdown("---")
833
  st.subheader("Multi-Node Knowledge Graph")
834
  st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
835
 
836
  n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
837
 
838
- # edge typeλ‹Ή κ³ μ • μƒ˜ν”Œ 수 β€” 10 types Γ— 10 = μ΅œλŒ€ 100 edges
839
  EDGES_PER_TYPE = 10
840
 
841
  with st.spinner("Querying graph..."):
@@ -847,8 +792,6 @@ with tab_kg_exp:
847
  if seed_ids:
848
  ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
849
 
850
- # 1-hop: seed paper에 μ—°κ²°λœ λͺ¨λ“  edge type
851
- # β†’ journal / author / affiliation / city / country / field / citation_event
852
  hop1 = _ddb.execute(f"""
853
  WITH ranked AS (
854
  SELECT source, target, edge_type,
@@ -862,8 +805,6 @@ with tab_kg_exp:
862
  WHERE rn <= {EDGES_PER_TYPE}
863
  """).df()
864
 
865
- # 2-hop: kg_nodes_exp νƒ€μž… 기반으둜 citation_event λ…Έλ“œ ID μΆ”μΆœ
866
- # (prefix κ°€μ • 없이 μ‹€μ œ node_type 컬럼으둜 확인)
867
  hop1_all_ids = set(hop1["source"].tolist()) | set(hop1["target"].tolist())
868
  event_node_ids = (
869
  kg_nodes_exp[
@@ -874,8 +815,7 @@ with tab_kg_exp:
874
 
875
  if event_node_ids:
876
  ev_sql = ", ".join(f"'{eid}'" for eid in event_node_ids)
877
- # citation_event β†’ HAS_CITING_PAPER β†’ citing_paper
878
- # citation_event β†’ HAS_PRIMARY_INTENT β†’ intent
879
  hop2 = _ddb.execute(f"""
880
  WITH ranked AS (
881
  SELECT source, target, edge_type,
@@ -914,8 +854,6 @@ with tab_kg_exp:
914
  except Exception as e:
915
  st.error(str(e))
916
 
917
-
918
- # ═══ 6. GEOGRAPHIC MAP ══════════════════════════════════════════
919
  with tab_geo:
920
  st.subheader("Geographic Distribution of Seed Papers")
921
  with st.spinner("Loading geographic data..."):
@@ -948,7 +886,6 @@ with tab_geo:
948
  .update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
949
  use_container_width=True)
950
 
951
- # ── Affiliation μ‹œκ°ν™” ──────────────────────────────────────
952
  st.subheader("Top Affiliations")
953
  geo_col1, geo_col2 = st.columns(2)
954
 
@@ -989,8 +926,6 @@ with tab_geo:
989
  legend_title="Country", height=520),
990
  use_container_width=True)
991
 
992
-
993
- # ═══ 7. ANALYTICS ═══════════════════════════════════════════════
994
  with tab_analytics:
995
  try:
996
  with st.spinner("Loading analytics data..."):
@@ -1063,7 +998,6 @@ with tab_analytics:
1063
  title="Influential vs Non-influential"),
1064
  use_container_width=True)
1065
 
1066
- # ── Intent Evolution over Years ────────────────────────────
1067
  st.markdown("---")
1068
  st.subheader("CitationHub Intent Evolution over Years")
1069
  st.caption("How citation intents have changed across all papers over time")
@@ -1088,7 +1022,6 @@ with tab_analytics:
1088
  use_container_width=True,
1089
  )
1090
 
1091
- # ── Top Citing Venues ───────────────────────────────────────
1092
  st.markdown("---")
1093
  col_v1, col_v2 = st.columns(2)
1094
 
@@ -1136,7 +1069,6 @@ with tab_analytics:
1136
  use_container_width=True,
1137
  )
1138
 
1139
- # ── Citation Trend over Time ────────────────────────────────
1140
  st.markdown("---")
1141
  st.subheader("Citation Trend over Time (selected paper)")
1142
  st.caption("How citations to the selected seed paper have changed year by year")
@@ -1154,7 +1086,6 @@ with tab_analytics:
1154
  else:
1155
  st.info("No citation trend data for the selected paper.")
1156
 
1157
- # ── Export ─────────────────────────────────────────────────
1158
  st.markdown("---")
1159
  st.subheader("Export Data")
1160
  col_e1, col_e2, col_e3 = st.columns(3)
 
15
 
16
  HF_REPO_ID = os.environ.get("HF_REPO_ID", "")
17
 
 
18
  def csv_download_link(data: bytes, filename: str, label: str) -> None:
19
  """st.download_button 대신 base64 HTML 링크로 다운로드 — 서버 연결 불필요."""
20
  b64 = base64.b64encode(data).decode()
 
55
  r"C:\Users\user\OneDrive\바탕 ν™”λ©΄\Citehub_huggingface\data",
56
  ))
57
 
 
58
  def fmt_num(x):
59
  try: return f"{int(x):,}"
60
  except: return "-"
61
 
 
62
  def _hf_download(filename: str) -> str:
63
  from huggingface_hub import hf_hub_download
64
  return hf_hub_download(
 
66
  filename=f"data/{filename}", token=HF_TOKEN or None,
67
  )
68
 
 
69
  def _read(filename: str, data_dir: Path | None = None) -> pd.DataFrame:
70
  if HF_REPO_ID:
71
  return pd.read_parquet(_hf_download(filename))
72
  return pd.read_parquet(data_dir / filename)
73
 
 
74
  def plotly_network_fig(
75
  nodes_df: pd.DataFrame,
76
  edges_df: pd.DataFrame,
 
97
  k = max(1.5, 3.0 / (len(G.nodes) ** 0.4))
98
  pos = nx.spring_layout(G, seed=42, k=k, iterations=60)
99
 
 
100
  ex, ey = [], []
101
  for src, tgt in G.edges():
102
  x0, y0 = pos.get(src, (0, 0))
 
112
  )
113
  ]
114
 
 
115
  for ntype, color in NODE_TYPE_COLORS.items():
116
  subset = nodes_df[nodes_df["node_type"] == ntype]
117
  if subset.empty:
 
163
  )
164
  return fig
165
 
 
166
  def plotly_ontology_fig(height: int = 820) -> go.Figure:
167
  """CitationHub 온톨로지 구조 — Plotly SVG. 각 노드에 속성값 표시."""
168
 
 
169
  NODE_PROPS = {
170
  "seed_paper": "doi · title · journal\nauthor · affiliation\ncountry · field · citedby_count",
171
  "citation_event": "event_id · citing_year\nprimary_intent · context\nis_influential",
 
206
 
207
  pos = nx.spring_layout(G, seed=7, k=2.5, iterations=80)
208
 
 
209
  ex, ey = [], []
210
  ann = []
211
  for s, t, lbl in edge_defs:
 
224
  hoverinfo="none", showlegend=False)
225
  ]
226
 
 
227
  for nid, label, ntype in node_defs:
228
  x, y = pos[nid]
229
  color = NODE_TYPE_COLORS.get(ntype, "#94a3b8")
230
  props = NODE_PROPS.get(ntype, "")
231
 
 
232
  traces.append(go.Scatter(
233
  x=[x], y=[y], mode="markers+text",
234
  text=[f"<b>{label}</b>"], textposition="top center",
 
241
  textfont=dict(size=11, color="#1e293b"),
242
  ))
243
 
 
244
  if props:
245
  prop_html = props.replace("\n", "<br>")
246
  ann.append(dict(
 
249
  showarrow=False,
250
  xanchor="center",
251
  yanchor="top",
252
+ yshift=-22,
253
  font=dict(size=8, color="#64748b"),
254
  bgcolor="rgba(248,250,252,0.85)",
255
  borderpad=2,
 
266
  )
267
  return fig
268
 
 
269
  def inject_fullscreen(html: str) -> str:
270
  extra = """
271
  <button onclick="var el=document.getElementById('mynetwork');
 
297
  """
298
  return html.replace("</body>", extra + "</body>")
299
 
 
 
300
  @st.cache_data(show_spinner=False)
301
  def load_data(data_dir_str: str):
302
  d = None if HF_REPO_ID else Path(data_dir_str)
 
373
  }
374
  return seed, events, citing, filters, overview
375
 
 
 
376
  @st.cache_data(show_spinner=False)
377
  def load_authors_data(data_dir_str: str) -> pd.DataFrame:
378
  """Analytics νƒ­μ—μ„œλ§Œ μ‚¬μš© β€” νƒ­ μ§„μž… μ‹œ λ‘œλ“œ"""
379
  d = None if HF_REPO_ID else Path(data_dir_str)
380
  return _read("authors.parquet", d)
381
 
 
382
  @st.cache_data(show_spinner=False)
383
  def load_geo_data(data_dir_str: str) -> pd.DataFrame:
384
  """Geographic Map νƒ­μ—μ„œλ§Œ μ‚¬μš© β€” νƒ­ μ§„μž… μ‹œ λ‘œλ“œ"""
385
  d = None if HF_REPO_ID else Path(data_dir_str)
386
  return _read("affiliation_geo.parquet", d)
387
 
 
 
388
  @st.cache_data(show_spinner=False)
389
  def load_kg_nodes(data_dir_str: str) -> pd.DataFrame:
390
  """kg_nodes 전체 λ‘œλ“œ (3.4M rows, ~160MB 파일)"""
391
  d = None if HF_REPO_ID else Path(data_dir_str)
392
  return _read("kg_nodes.parquet", d)
393
 
 
394
  @st.cache_data(show_spinner=False)
395
  def get_parquet_path(filename: str, data_dir_str: str) -> str:
396
  """파일 경둜 λ°˜ν™˜ (HFλ©΄ 둜컬 μΊμ‹œμ— λ‹€μš΄λ‘œλ“œ ν›„ 경둜 λ°˜ν™˜)"""
397
  if HF_REPO_ID:
398
  return _hf_download(filename)
 
 
399
 
400
+ return str(Path(data_dir_str) / filename).replace("\\", "/")
401
 
402
  @st.cache_data(show_spinner=False)
403
  def query_kg_edges_for_node(node_id: str, kg_edges_path: str, max_edges: int = 80) -> pd.DataFrame:
 
413
  """
414
  return duckdb.execute(q).df()
415
 
 
416
  @st.cache_data(show_spinner=False)
417
  def query_enriched_stats(enriched_path: str):
418
  """DuckDB: enriched 전체 λ‘œλ“œ 없이 집계 ν†΅κ³„λ§Œ 쿼리"""
 
437
 
438
  return sem_df, field_df
439
 
 
440
  @st.cache_data(show_spinner=False)
441
  def query_explorer_edges(node_id: str, kg_edges_path: str, max_edges: int = 60) -> pd.DataFrame:
442
  """DuckDB: KG Explorer용 μž„μ˜ λ…Έλ“œ μ—£μ§€ 쿼리"""
 
451
  """
452
  return duckdb.execute(q).df()
453
 
 
 
454
  def filter_seed_papers(seed, q, fields, countries, journals):
455
  df = seed.copy()
456
  q = (q or "").strip().lower()
 
461
  if journals: df = df[df["journal"].str.lower().isin({x.lower() for x in journals})]
462
  return df.reset_index(drop=True)
463
 
 
464
  def event_subset(events, seed_paper_id, year_min, year_max):
465
  df = events[events["seed_paper_id"] == seed_paper_id].copy()
466
  df = df[df["citing_year"].fillna(-99999) >= year_min]
467
  df = df[df["citing_year"].fillna(99999) <= year_max]
468
  return df.reset_index(drop=True)
469
 
 
470
  def build_intent_summary(df):
471
  counts = df.groupby("primary_intent").size().to_dict()
472
  return pd.DataFrame({"intent": ALLOWED_INTENTS,
473
  "count": [int(counts.get(i,0)) for i in ALLOWED_INTENTS]})
474
 
 
475
  def build_context_rows(df, limit=20):
476
  rows = []
477
  df = df.sort_values(["context_count","intent_count","citing_year"],
 
488
  if len(rows) >= limit: break
489
  return pd.DataFrame(rows[:limit])
490
 
 
491
  def build_citing_table(df, limit=30):
492
  if df.empty:
493
  return pd.DataFrame(columns=["citing_title","citing_year","primary_intent","context_count"])
 
496
  [["citing_paper_id","citing_title","citing_doi","citing_year","primary_intent","context_count"]]
497
  .drop_duplicates(subset=["citing_paper_id"]).head(limit))
498
 
 
499
  def get_cocited_papers(selected_seed_id, events, seed, top_n=15):
500
  """μ„ νƒλœ seed paperλ₯Ό μΈμš©ν•œ 논문듀이 ν•¨κ»˜ μΈμš©ν•œ λ‹€λ₯Έ seed papers"""
501
  citing_ids = events[events["seed_paper_id"] == selected_seed_id]["citing_paper_id"].unique()
 
508
  return cocited.merge(seed[["seed_paper_id","title","field","journal","citedby_count"]],
509
  on="seed_paper_id", how="left")
510
 
 
511
  def get_kg_subgraph(seed_doi: str, kg_nodes, kg_edges, max_edges=80):
512
  """μ„ νƒλœ seed paper의 KG 1-hop μ„œλΈŒκ·Έλž˜ν”„ λ°˜ν™˜"""
513
  node_id = f"seed:{seed_doi}"
 
519
  nodes = kg_nodes[kg_nodes["node_id"].isin(all_node_ids)]
520
  return nodes, edges
521
 
 
522
  def get_explorer_subgraph(search_node_id: str, kg_nodes, kg_edges, max_edges=60):
523
  """KG Explorer: μž„μ˜ λ…Έλ“œ κΈ°μ€€ μ„œλΈŒκ·Έλž˜ν”„"""
524
  edges = kg_edges[(kg_edges["source"] == search_node_id) |
 
529
  nodes = kg_nodes[kg_nodes["node_id"].isin(all_ids)]
530
  return nodes, edges
531
 
 
 
532
  def pyvis_citation_graph(seed_row, events_df):
533
  net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
534
  sid = seed_row["seed_paper_id"]
 
547
  net.barnes_hut()
548
  return inject_fullscreen(net.generate_html())
549
 
 
550
  def pyvis_ontology():
551
  net = Network(height="780px", width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
552
  for nid, label, typ in [
 
568
  net.barnes_hut()
569
  return inject_fullscreen(net.generate_html())
570
 
 
571
  def pyvis_from_kg(nodes_df, edges_df, height="780px"):
572
  """kg_nodes / kg_edges DataFrame으둜 pyvis κ·Έλž˜ν”„ 생성"""
573
  net = Network(height=height, width="100%", bgcolor="#ffffff", font_color="#111827", directed=True)
 
586
  net.barnes_hut()
587
  return inject_fullscreen(net.generate_html())
588
 
 
 
 
 
589
  st.title("CitationHub")
590
  st.caption("Explore influential papers (top 5% cited), their citation networks, and knowledge graphs.")
591
 
 
592
  with st.sidebar:
593
  st.subheader("Data source")
594
  if HF_REPO_ID:
 
645
  contexts_df = build_context_rows(seed_events)
646
  citing_table = build_citing_table(seed_events)
647
 
 
648
  (tab_overview, tab_cnet,
649
  tab_kg_exp, tab_geo, tab_analytics) = st.tabs([
650
  "Overview","Citation Network",
651
  "Knowledge Graph","Geographic Map","Analytics",
652
  ])
653
 
 
 
654
  with tab_overview:
655
  col1, col2 = st.columns(2)
656
  with col1:
 
722
  <div>{row['context']}</div></div>""",
723
  unsafe_allow_html=True)
724
 
 
 
725
  with tab_cnet:
726
  st.subheader("Citation Network")
727
  st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
 
730
  else:
731
  components.html(pyvis_citation_graph(selected_seed, seed_events), height=820, scrolling=True)
732
 
 
 
 
733
  with tab_kg_exp:
734
  st.subheader("Knowledge Graph")
735
 
 
736
  st.subheader("CitationHub Ontology — Concepts, Instances & Relationships")
737
  st.caption("🔍 Scroll/pinch: zoom | Drag: pan | Hover node: details | ⛶ (top-right toolbar): fullscreen")
738
  st.plotly_chart(plotly_ontology_fig(height=820), use_container_width=True)
 
744
  kg_nodes_exp = load_kg_nodes(data_dir_val)
745
  kg_edges_path = get_parquet_path("kg_edges.parquet", data_dir_val)
746
 
 
747
  import duckdb as _ddb
748
 
749
  nt = kg_nodes_exp["node_type"].value_counts().reset_index()
 
775
  yaxis_title="Count", xaxis_tickangle=-35)
776
  st.plotly_chart(et_fig, use_container_width=True)
777
 
 
778
  st.markdown("---")
779
  st.subheader("Multi-Node Knowledge Graph")
780
  st.caption("🖱 Scroll: zoom | Drag: pan | Click node: info | ⛶ button: fullscreen")
781
 
782
  n_seeds = st.slider("Number of seed papers", 3, 15, 6, key="kg_exp_n_seeds")
783
 
 
784
  EDGES_PER_TYPE = 10
785
 
786
  with st.spinner("Querying graph..."):
 
792
  if seed_ids:
793
  ids_sql = ", ".join(f"'{sid}'" for sid in seed_ids)
794
 
 
 
795
  hop1 = _ddb.execute(f"""
796
  WITH ranked AS (
797
  SELECT source, target, edge_type,
 
805
  WHERE rn <= {EDGES_PER_TYPE}
806
  """).df()
807
 
 
 
808
  hop1_all_ids = set(hop1["source"].tolist()) | set(hop1["target"].tolist())
809
  event_node_ids = (
810
  kg_nodes_exp[
 
815
 
816
  if event_node_ids:
817
  ev_sql = ", ".join(f"'{eid}'" for eid in event_node_ids)
818
+
 
819
  hop2 = _ddb.execute(f"""
820
  WITH ranked AS (
821
  SELECT source, target, edge_type,
 
854
  except Exception as e:
855
  st.error(str(e))
856
 
 
 
857
  with tab_geo:
858
  st.subheader("Geographic Distribution of Seed Papers")
859
  with st.spinner("Loading geographic data..."):
 
886
  .update_layout(xaxis_title="", yaxis_title="# Seed Papers", xaxis_tickangle=-40),
887
  use_container_width=True)
888
 
 
889
  st.subheader("Top Affiliations")
890
  geo_col1, geo_col2 = st.columns(2)
891
 
 
926
  legend_title="Country", height=520),
927
  use_container_width=True)
928
 
 
 
929
  with tab_analytics:
930
  try:
931
  with st.spinner("Loading analytics data..."):
 
998
  title="Influential vs Non-influential"),
999
  use_container_width=True)
1000
 
 
1001
  st.markdown("---")
1002
  st.subheader("CitationHub Intent Evolution over Years")
1003
  st.caption("How citation intents have changed across all papers over time")
 
1022
  use_container_width=True,
1023
  )
1024
 
 
1025
  st.markdown("---")
1026
  col_v1, col_v2 = st.columns(2)
1027
 
 
1069
  use_container_width=True,
1070
  )
1071
 
 
1072
  st.markdown("---")
1073
  st.subheader("Citation Trend over Time (selected paper)")
1074
  st.caption("How citations to the selected seed paper have changed year by year")
 
1086
  else:
1087
  st.info("No citation trend data for the selected paper.")
1088
 
 
1089
  st.markdown("---")
1090
  st.subheader("Export Data")
1091
  col_e1, col_e2, col_e3 = st.columns(3)