rafmacalaba committed
Commit b877288 · 1 Parent(s): 94bf6c6

switch to new data.json

Files changed (2):
  1. app.py +312 -170
  2. consolidated_data_optimized.json +0 -0
app.py CHANGED
@@ -6,7 +6,7 @@ from collections import Counter, defaultdict
 import gradio as gr
 
 # ── Local CONFIG ──────────────────────────────────────────────────────────────
-DATA_FILE = "gradio_ner_data.json"
+DATA_FILE = "consolidated_data_optimized.json"
 
 
 def load_initial_data() -> List[Dict]:
@@ -14,19 +14,10 @@ def load_initial_data() -> List[Dict]:
         raise FileNotFoundError(f"{DATA_FILE} not found in current directory.")
     with open(DATA_FILE, "r", encoding="utf-8") as f:
         data = json.load(f)
-
-    # Calculate mixed types (types that have both True and False LLM assessments)
-    type_assessments = defaultdict(set)
-    for rec in data:
-        if rec.get("type") and rec.get("llm_is_dataset_contextual") is not None:
-            type_assessments[rec["type"]].add(rec["llm_is_dataset_contextual"])
 
-    mixed_types = {t for t, assessments in type_assessments.items() if True in assessments and False in assessments}
+    # Sort to show records with relations first (most informative)
+    data.sort(key=lambda x: len(x.get('ner_text', [])), reverse=True)
 
-    # Flag records
-    for rec in data:
-        rec["is_mixed_type"] = rec.get("type") in mixed_types
-
     return data
 
 
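The loader now orders records by annotation density instead of precomputing mixed-type flags. A standalone sketch of that ordering, using hypothetical toy records rather than the real data file:

```python
# Minimal sketch of the new load-time ordering: records with more `ner_text`
# spans sort first, so the most annotated mentions appear at the top.
records = [
    {"id": "a", "ner_text": []},
    {"id": "b", "ner_text": [(0, 4, "named"), (10, 16, "named")]},
    {"id": "c", "ner_text": [(3, 9, "named")]},
]
records.sort(key=lambda x: len(x.get("ner_text", [])), reverse=True)
print([r["id"] for r in records])  # ['b', 'c', 'a']
```

Python's sort is stable, so records with equal span counts keep their original order.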
@@ -41,47 +32,90 @@ class DynamicDataset:
         return self.data[self.current]
 
 
-class MixedTypeManager:
+class ComparisonManager:
     def __init__(self, data: List[Dict]):
-        self.grouped_data = defaultdict(lambda: {'true': [], 'false': []})
-        self.mixed_types = []
+        self.data = data
+
+        # Group by type
+        self.type_groups = defaultdict(lambda: {'validated': [], 'not_validated': []})
+
+        # Group by term (extract from ner_text)
+        self.term_groups = defaultdict(lambda: {'validated': [], 'not_validated': []})
 
-        # Group data
         for rec in data:
             dtype = rec.get("type")
-            is_ds = rec.get("llm_is_dataset_contextual")
-            if dtype and is_ds is not None:
-                key = 'true' if is_ds else 'false'
-                self.grouped_data[dtype][key].append(rec)
+            is_validated = rec.get("validated", False)
+            tags = rec.get("tags", [])
+
+            # Only include borderline cases
+            if "borderline" not in tags:
+                continue
+
+            # Group by type
+            if dtype:
+                key = 'validated' if is_validated else 'not_validated'
+                self.type_groups[dtype][key].append(rec)
+
+            # Extract term from ner_text
+            if rec.get('ner_text') and len(rec['ner_text']) > 0:
+                start, end, label = rec['ner_text'][0]
+                if label == 'named' and rec.get('text'):
+                    term = rec['text'][start:end]
+                    if term and "confusing_term" in tags:
+                        key = 'validated' if is_validated else 'not_validated'
+                        self.term_groups[term][key].append(rec)
 
-        # Identify mixed types
-        for dtype, groups in self.grouped_data.items():
-            if groups['true'] and groups['false']:
-                self.mixed_types.append(dtype)
+        # Get mixed types (sorted by total count)
+        self.mixed_types = []
+        for dtype, groups in self.type_groups.items():
+            if groups['validated'] and groups['not_validated']:
+                total = len(groups['validated']) + len(groups['not_validated'])
+                self.mixed_types.append((dtype, total))
+        self.mixed_types.sort(key=lambda x: x[1], reverse=True)
+        self.mixed_types = [t[0] for t in self.mixed_types]
 
-        # Sort by total count
-        self.mixed_types.sort(key=lambda t: len(self.grouped_data[t]['true']) + len(self.grouped_data[t]['false']), reverse=True)
+        # Get confusing terms (sorted by total count)
+        self.confusing_terms = []
+        for term, groups in self.term_groups.items():
+            if groups['validated'] and groups['not_validated']:
+                total = len(groups['validated']) + len(groups['not_validated'])
+                self.confusing_terms.append((term, total))
+        self.confusing_terms.sort(key=lambda x: x[1], reverse=True)
+        self.confusing_terms = [t[0] for t in self.confusing_terms]
+
+    def get_example_by_type(self, dtype: str, is_validated: bool, idx: int) -> Dict:
+        if dtype not in self.type_groups:
+            return {}
+        group = self.type_groups[dtype]['validated' if is_validated else 'not_validated']
+        if not group:
+            return {}
+        safe_idx = idx % len(group)
+        return group[safe_idx]
 
-    def get_example(self, dtype: str, is_dataset: bool, idx: int) -> Dict:
-        if dtype not in self.grouped_data:
+    def get_count_by_type(self, dtype: str, is_validated: bool) -> int:
+        if dtype not in self.type_groups:
+            return 0
+        return len(self.type_groups[dtype]['validated' if is_validated else 'not_validated'])
+
+    def get_example_by_term(self, term: str, is_validated: bool, idx: int) -> Dict:
+        if term not in self.term_groups:
             return {}
-        group = self.grouped_data[dtype]['true' if is_dataset else 'false']
+        group = self.term_groups[term]['validated' if is_validated else 'not_validated']
         if not group:
             return {}
-        # Cycle through examples
         safe_idx = idx % len(group)
         return group[safe_idx]
 
-    def get_count(self, dtype: str, is_dataset: bool) -> int:
-        if dtype not in self.grouped_data:
+    def get_count_by_term(self, term: str, is_validated: bool) -> int:
+        if term not in self.term_groups:
             return 0
-        return len(self.grouped_data[dtype]['true' if is_dataset else 'false'])
+        return len(self.term_groups[term]['validated' if is_validated else 'not_validated'])
 
 
 # ── Highlight utils ──────────────────────────────────────────────────────────
 def prepare_for_highlight(rec: Dict) -> List[Tuple[str, Optional[str]]]:
     text = rec.get("text", "") or ""
-    ner_spans = rec.get("ner_annotated", rec.get("ner_text", [])) or []
+    ner_spans = rec.get("ner_text", []) or []
 
     segments = []
     last_idx = 0
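The grouping step above reduces to a small pattern: bucket borderline records per type into validated/not-validated lists, then keep the types that appear on both sides. A minimal sketch with made-up records:

```python
from collections import defaultdict

# Hypothetical records shaped like the new data file: a `validated` bool
# and a `tags` list. Only records tagged "borderline" are bucketed.
data = [
    {"type": "system", "validated": True,  "tags": ["borderline"]},
    {"type": "system", "validated": False, "tags": ["borderline"]},
    {"type": "census", "validated": True,  "tags": []},  # skipped: not borderline
]

type_groups = defaultdict(lambda: {"validated": [], "not_validated": []})
for rec in data:
    if "borderline" not in rec.get("tags", []):
        continue
    key = "validated" if rec.get("validated", False) else "not_validated"
    type_groups[rec["type"]][key].append(rec)

# "Mixed" types are those with examples on both sides.
mixed = [t for t, g in type_groups.items() if g["validated"] and g["not_validated"]]
print(mixed)  # ['system']
```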
@@ -110,21 +144,16 @@ def prepare_for_highlight(rec: Dict) -> List[Tuple[str, Optional[str]]]:
 
 
 # ── Filtering helpers ─────────────────────────────────────────────────────────
-def record_matches_filters(rec: Dict, llm_dataset_filter: str, type_filter: str):
-    # Use LLM assessment instead of is_dataset
-    llm_is_ds = rec.get("llm_is_dataset_contextual")
+def record_matches_filters(rec: Dict, dataset_filter: str, type_filter: str):
+    is_validated = rec.get("validated", False)
+    tags = rec.get("tags", [])
 
-    # If LLM assessment is not available, skip this record
-    if llm_is_ds is None:
-        return False
-
-    if llm_dataset_filter == "LLM: Datasets only" and not llm_is_ds:
+    if dataset_filter == "Datasets only" and not is_validated:
         return False
-    if llm_dataset_filter == "LLM: Non-datasets only" and llm_is_ds:
+    if dataset_filter == "Non-datasets only" and is_validated:
         return False
-    if llm_dataset_filter == "🔥 Show Confusion/Mixed Cases":
-        # Only show records that are part of a mixed type group
-        return rec.get("is_mixed_type", False)
+    if dataset_filter == "Borderline Cases Only":
+        return "borderline" in tags
 
     if type_filter != "All types":
         return rec.get("type") == type_filter
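The rewritten predicate can be exercised in isolation. The sketch below mirrors the hunk above on hypothetical records; note the hunk ends before the function does, so the trailing `return True` is an assumption about the unshown remainder:

```python
def record_matches_filters(rec, dataset_filter, type_filter):
    is_validated = rec.get("validated", False)
    tags = rec.get("tags", [])
    if dataset_filter == "Datasets only" and not is_validated:
        return False
    if dataset_filter == "Non-datasets only" and is_validated:
        return False
    if dataset_filter == "Borderline Cases Only":
        return "borderline" in tags
    if type_filter != "All types":
        return rec.get("type") == type_filter
    return True  # assumed: the hunk cuts off before the function's final line

rec = {"validated": False, "tags": ["borderline"], "type": "system"}
print(record_matches_filters(rec, "Borderline Cases Only", "All types"))  # True
print(record_matches_filters(rec, "Datasets only", "All types"))          # False
```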
@@ -138,7 +167,7 @@ DOCUMENTATION = """
 
 ## What is this tool?
 
-This application helps you **review and explore dataset mentions** extracted documents.
+This application helps you **review and explore dataset mentions** extracted from documents.
 It displays text excerpts where potential datasets have been identified, along with metadata about each mention.
 
 ## What you'll see
@@ -150,6 +179,7 @@ Each record shows:
 - **✅ Dataset Status**: Whether this mention actually refers to a dataset
 - **💡 Context**: The surrounding text that provides context
 - **📝 Explanation**: Why this was classified as a dataset (or not)
+- **🏷️ Tags**: Borderline, mixed type, or confusing term indicators
 
 ## How to use this tool
 
@@ -164,33 +194,34 @@ Each record shows:
    - **All**: Show all records
    - **Datasets only**: Show only records that contain actual dataset references
    - **Non-datasets only**: Show records that were identified but don't actually refer to datasets
+   - **🔥 Borderline Cases Only**: Show only confusing/mixed cases
 
 2. **Data Type Filter**
    - Filter by specific data types (census, survey, database, etc.)
    - Types are sorted by frequency (most common first)
 
-### 💡 Tips
-- Use filters to focus on specific types of data mentions
-- The "Contains Dataset" field tells you if the mention is a true dataset reference
-- Review the "Explanation" to understand the classification reasoning
-- Highlighted text shows exactly where the dataset mention appears in context
-
-## 🚀 Try It Yourself!
+### ⚖️ Comparison Tab
 
-Want to extract datasets from your own text? Try our **Dataset Extraction Tool**:
+The Comparison tab helps you understand **why the same type or term** can be validated differently:
 
-👉 **[Launch Dataset Extraction Tool](https://huggingface.co/spaces/ai4data/datause-extraction)**
+1. **By Type**: Compare examples of the same data type (e.g., "system") with different validation outcomes
+2. **By Term**: Compare the exact same term (e.g., "Project MIS") appearing in different contexts
 
-This interactive tool allows you to:
-- ✨ **Extract datasets** from your own text or documents
-- 📝 **Use predefined samples** to see how it works
-- 🔬 **Explore the extraction process** in real-time
+This helps identify:
+- What contextual signals distinguish valid from invalid datasets
+- Why borderline cases are confusing
+- Patterns in validation decisions
 
-Perfect for testing the extraction capabilities on new documents or experimenting with different types of text!
+### 💡 Tips
+- Use filters to focus on specific types of data mentions
+- The "Validated" field tells you if the mention is a true dataset reference
+- Review the "Explanation" to understand the classification reasoning
+- Highlighted text shows exactly where the dataset mention appears in context
+- Check tags to identify borderline/confusing cases
 
 ## Data Source
 
-This viewer uses data from World Bank project documents.
+This viewer uses data from World Bank project documents with revalidation analysis.
 """
 
 
@@ -198,7 +229,7 @@ This viewer uses data from World Bank project documents.
 def create_demo() -> gr.Blocks:
     data = load_initial_data()
     dynamic_dataset = DynamicDataset(data)
-    mixed_manager = MixedTypeManager(data)
+    comparison_manager = ComparisonManager(data)
 
     # Count types and sort by frequency (most common first)
     type_counter = Counter(rec.get("type") for rec in data if rec.get("type"))
@@ -212,15 +243,18 @@ def create_demo() -> gr.Blocks:
         v_type = rec.get("type", "—")
         empirical_context = rec.get("empirical_context", "—")
         explanation = rec.get("explanation", "—")
-        is_mixed = rec.get("is_mixed_type", False)
-        llm_is_dataset = rec.get("llm_is_dataset_contextual")
+        tags = rec.get("tags", [])
+        is_validated = rec.get("validated", False)
+        contextual_signal = rec.get("contextual_signal", "—")
+        contextual_reason_model = rec.get("contextual_reason_model", "—")
+        contextual_reason_agent = rec.get("contextual_reason_agent", "—")
 
-        # Apply conditional highlighting based on LLM assessment
-        if rec.get("ner_text") and rec.get("text") and llm_is_dataset is not None:
+        # Apply conditional highlighting based on validation
+        if rec.get("ner_text") and rec.get("text") and is_validated is not None:
             try:
                 start, end = rec["ner_text"][0][0], rec["ner_text"][0][1]
                 term = rec["text"][start:end]
-                if llm_is_dataset:
+                if is_validated:
                     highlight_style = 'background-color: #90ee90; color: black; padding: 2px 4px; border-radius: 4px; font-weight: bold; border: 1px solid #5cb85c;'
                 else:
                     highlight_style = 'background-color: #ff7f7f; color: black; padding: 2px 4px; border-radius: 4px; font-weight: bold; border: 1px solid #d9534f;'
@@ -231,8 +265,24 @@
 
         # Build HTML
         type_html = f"<code>{v_type}</code>"
-        if is_mixed:
-            type_html += " ⚠️ <b>Mixed/Confusing Type</b>"
+
+        # Add type stats if available
+        type_stats = rec.get("type_stats")
+        if type_stats:
+            type_html += f" <small>(Type: {type_stats['validated']} ✅ / {type_stats['not_validated']} ❌)</small>"
+
+        tags_html = ""
+        # Add tags
+        if tags:
+            tag_badges = []
+            if "borderline" in tags:
+                tag_badges.append("⚠️ <b>Borderline</b>")
+            if "mixed_type" in tags:
+                tag_badges.append("🔍 <b>Mixed Type</b>")
+            if "confusing_term" in tags:
+                tag_badges.append("🤔 <b>Confusing Term</b>")
+            if tag_badges:
+                tags_html = " ".join(tag_badges)
 
         html = f"""
             <h3>📄 Document Information</h3>
@@ -241,36 +291,43 @@
 
             <h3>🏷️ Type</h3>
             <p>{type_html}</p>
+        """
+
+        if tags_html:
+            html += f"""
+            <h3>🚩 Tags</h3>
+            <p>{tags_html}</p>
+            """
 
+        html += f"""
             <h3>📝 Surrounding Text</h3>
             <p>{empirical_context}</p>
         """
 
-        # Add LLM contextual analysis section if available
-        llm_reasons = rec.get("llm_contextual_reason", [])
-        llm_thinking = rec.get("llm_thinking_contextual", "")
+        # Add validation analysis
+        status_icon = '✅' if is_validated else '❌'
+        status_text = 'Is a dataset' if is_validated else 'Not a dataset'
+        html += f"""
+            <h3>🤖 Validation Analysis</h3>
+            <p><b>Assessment:</b> {status_icon} {status_text}</p>
+            <p><b>Contextual Signal:</b> <code>{contextual_signal}</code></p>
+        """
 
-        if llm_is_dataset is not None:
-            status_icon = '✅' if llm_is_dataset else '❌'
-            status_text = 'Is a dataset' if llm_is_dataset else 'Not a dataset'
-            html += f"""
-            <h3>🤖 Contextual Analysis</h3>
-            <p><b>Assessment:</b> {status_icon} {status_text}</p>
-            """
-
-        if llm_reasons:
-            html += "<p><b>Reasoning:</b></p><ul>"
-            for reason in llm_reasons:
-                html += f"<li>{reason}</li>"
-            html += "</ul>"
-
-        if llm_thinking:
+        if contextual_reason_agent:
             html += f"""
-            <p><b>Detailed Analysis:</b></p>
+            <p><b>Agent Reasoning:</b></p>
             <blockquote style="border-left: 3px solid #ccc; padding-left: 10px; color: #666;">
-            {llm_thinking}
+            {contextual_reason_agent}
             </blockquote>
             """
+
+        if contextual_reason_model:
+            html += f"""
+            <p><b>Model Reasoning:</b></p>
+            <blockquote style="border-left: 3px solid #999; padding-left: 10px; color: #888;">
+            {contextual_reason_model}
+            </blockquote>
+            """
 
         return html
 
@@ -281,10 +338,10 @@
         return segs, idx, make_info(rec)
 
     # When filters change → jump to first matching record
-    def jump_on_filters(llm_dataset_filter, type_filter):
+    def jump_on_filters(dataset_filter, type_filter):
        n = dynamic_dataset.len
        for i in range(n):
-            if record_matches_filters(data[i], llm_dataset_filter, type_filter):
+            if record_matches_filters(data[i], dataset_filter, type_filter):
                dynamic_dataset.current = i
                rec = data[i]
                segs = prepare_for_highlight(rec)
@@ -294,11 +351,11 @@
         return [], 0, "⚠️ No matching records found with the selected filters."
 
     # Navigation respecting filters
-    def nav_next(llm_dataset_filter, type_filter):
+    def nav_next(dataset_filter, type_filter):
         i = dynamic_dataset.current + 1
         n = dynamic_dataset.len
         while i < n:
-            if record_matches_filters(data[i], llm_dataset_filter, type_filter):
+            if record_matches_filters(data[i], dataset_filter, type_filter):
                 break
             i += 1
         if i >= n:
@@ -307,10 +364,10 @@
         rec = data[i]
         return prepare_for_highlight(rec), i, make_info(rec)
 
-    def nav_prev(llm_dataset_filter, type_filter):
+    def nav_prev(dataset_filter, type_filter):
         i = dynamic_dataset.current - 1
         while i >= 0:
-            if record_matches_filters(data[i], llm_dataset_filter, type_filter):
+            if record_matches_filters(data[i], dataset_filter, type_filter):
                 break
             i -= 1
         if i < 0:
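Both navigation helpers are linear scans with a filter predicate and no wrap-around. A generic sketch of the forward scan, with a toy predicate:

```python
# Walk forward from the current index until an item passes the predicate;
# return None instead of wrapping when nothing matches.
def next_matching(data, current, predicate):
    i = current + 1
    while i < len(data) and not predicate(data[i]):
        i += 1
    return i if i < len(data) else None

nums = [1, 2, 3, 4, 5, 6]
print(next_matching(nums, 0, lambda x: x % 3 == 0))  # 2 (value 3)
print(next_matching(nums, 5, lambda x: x % 3 == 0))  # None (no match after index 5)
```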
@@ -319,39 +376,61 @@
         rec = data[i]
         return prepare_for_highlight(rec), i, make_info(rec)
 
-    # Comparison Logic
-    def load_comparison(dtype, pos_idx, neg_idx):
+    # Comparison Logic - By Type
+    def load_type_comparison(dtype, pos_idx, neg_idx):
         if not dtype:
-            return [], "Select a type", [], "Select a type"
+            return [], "Select a type", [], "Select a type", "### ✅ IS Dataset", "### ❌ NOT Dataset"
 
-        pos_rec = mixed_manager.get_example(dtype, True, pos_idx)
-        neg_rec = mixed_manager.get_example(dtype, False, neg_idx)
+        pos_rec = comparison_manager.get_example_by_type(dtype, True, pos_idx)
+        neg_rec = comparison_manager.get_example_by_type(dtype, False, neg_idx)
 
-        pos_hl = prepare_for_highlight(pos_rec)
-        neg_hl = prepare_for_highlight(neg_rec)
+        pos_hl = prepare_for_highlight(pos_rec) if pos_rec else []
+        neg_hl = prepare_for_highlight(neg_rec) if neg_rec else []
 
-        pos_info = make_info(pos_rec)
-        neg_info = make_info(neg_rec)
+        pos_info = make_info(pos_rec) if pos_rec else "No examples"
+        neg_info = make_info(neg_rec) if neg_rec else "No examples"
 
         # Add count info
-        pos_total = mixed_manager.get_count(dtype, True)
-        neg_total = mixed_manager.get_count(dtype, False)
+        pos_total = comparison_manager.get_count_by_type(dtype, True)
+        neg_total = comparison_manager.get_count_by_type(dtype, False)
 
-        pos_header = f"### ✅ IS Dataset ({pos_idx % pos_total + 1}/{pos_total})"
-        neg_header = f"### ❌ NOT Dataset ({neg_idx % neg_total + 1}/{neg_total})"
+        pos_header = f"### ✅ IS Dataset ({(pos_idx % pos_total) + 1 if pos_total > 0 else 0}/{pos_total})"
+        neg_header = f"### ❌ NOT Dataset ({(neg_idx % neg_total) + 1 if neg_total > 0 else 0}/{neg_total})"
 
         return pos_hl, pos_info, neg_hl, neg_info, pos_header, neg_header
 
-    def next_pos(dtype, current_idx):
+    # Comparison Logic - By Term
+    def load_term_comparison(term, pos_idx, neg_idx):
+        if not term:
+            return [], "Select a term", [], "Select a term", "### ✅ IS Dataset", "### ❌ NOT Dataset"
+
+        pos_rec = comparison_manager.get_example_by_term(term, True, pos_idx)
+        neg_rec = comparison_manager.get_example_by_term(term, False, neg_idx)
+
+        pos_hl = prepare_for_highlight(pos_rec) if pos_rec else []
+        neg_hl = prepare_for_highlight(neg_rec) if neg_rec else []
+
+        pos_info = make_info(pos_rec) if pos_rec else "No examples"
+        neg_info = make_info(neg_rec) if neg_rec else "No examples"
+
+        # Add count info
+        pos_total = comparison_manager.get_count_by_term(term, True)
+        neg_total = comparison_manager.get_count_by_term(term, False)
+
+        pos_header = f"### ✅ IS Dataset ({(pos_idx % pos_total) + 1 if pos_total > 0 else 0}/{pos_total})"
+        neg_header = f"### ❌ NOT Dataset ({(neg_idx % neg_total) + 1 if neg_total > 0 else 0}/{neg_total})"
+
+        return pos_hl, pos_info, neg_hl, neg_info, pos_header, neg_header
+
+    def next_pos(current_idx):
         return current_idx + 1
 
-    def next_neg(dtype, current_idx):
+    def next_neg(current_idx):
         return current_idx + 1
 
     # ---- UI ----
     with gr.Blocks(title="Monitoring of Data Use") as demo:
         gr.Markdown("# 📊 Monitoring of Data Use")
-        # gr.Markdown(f"*Exploring {dynamic_dataset.len:,} dataset mentions from World Bank documents*")
 
         with gr.Tabs():
             with gr.Tab("📖 How to Use"):
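The comparison headers rely on modulo arithmetic so "Next Example" cycles within a group and empty groups render 0/0 rather than raising ZeroDivisionError. A small sketch of that arithmetic:

```python
# The running index grows without bound; reducing it modulo the group size
# makes the displayed position wrap, and the total-zero guard avoids division
# by zero for one-sided groups.
def header(idx: int, total: int, label: str) -> str:
    shown = (idx % total) + 1 if total > 0 else 0
    return f"### {label} ({shown}/{total})"

print(header(0, 3, "✅ IS Dataset"))   # ### ✅ IS Dataset (1/3)
print(header(5, 3, "✅ IS Dataset"))   # ### ✅ IS Dataset (3/3)
print(header(2, 0, "❌ NOT Dataset"))  # ### ❌ NOT Dataset (0/0)
```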
@@ -369,10 +448,10 @@
                 )
 
                 with gr.Row():
-                    llm_dataset_filter = gr.Dropdown(
-                        choices=["🔥 Show Confusion/Mixed Cases", "All", "LLM: Datasets only", "LLM: Non-datasets only"],
-                        value="🔥 Show Confusion/Mixed Cases",
-                        label="🤖 Filter by Assessment",
+                    dataset_filter = gr.Dropdown(
+                        choices=["All", "Datasets only", "Non-datasets only", "Borderline Cases Only"],
+                        value="Datasets only",
+                        label="🎯 Filter by Validation Status",
                     )
 
                     type_filter = gr.Dropdown(
@@ -409,94 +488,157 @@
                 )
 
                 # Filters
-                llm_dataset_filter.change(
+                dataset_filter.change(
                     fn=jump_on_filters,
-                    inputs=[llm_dataset_filter, type_filter],
+                    inputs=[dataset_filter, type_filter],
                     outputs=[inp_box, prog, info_md],
                 )
                 type_filter.change(
                     fn=jump_on_filters,
-                    inputs=[llm_dataset_filter, type_filter],
+                    inputs=[dataset_filter, type_filter],
                     outputs=[inp_box, prog, info_md],
                 )
 
                 # Prev / Next navigation respecting filters
                 prev_btn.click(
                     fn=nav_prev,
-                    inputs=[llm_dataset_filter, type_filter],
+                    inputs=[dataset_filter, type_filter],
                     outputs=[inp_box, prog, info_md],
                 )
                 next_btn.click(
                     fn=nav_next,
-                    inputs=[llm_dataset_filter, type_filter],
+                    inputs=[dataset_filter, type_filter],
                     outputs=[inp_box, prog, info_md],
                 )
 
             with gr.Tab("⚖️ Comparison"):
-                gr.Markdown("### Side-by-Side Comparison of Mixed Types")
-                gr.Markdown("Compare examples where the **same type** is classified differently based on context.")
+                gr.Markdown("### Side-by-Side Comparison of Borderline Cases")
+                gr.Markdown("Compare examples to understand **why the same type or term** is validated differently based on context.")
 
-                with gr.Row():
+                comparison_mode = gr.Radio(
+                    choices=["By Type", "By Term"],
+                    value="By Type",
+                    label="Comparison Mode"
+                )
+
+                # Type comparison
+                with gr.Group(visible=True) as type_comparison_group:
+                    gr.Markdown("**Compare by Data Type**: See how the same type (e.g., 'system') can be valid or invalid")
                     comp_type_selector = gr.Dropdown(
-                        choices=mixed_manager.mixed_types,
-                        value=mixed_manager.mixed_types[0] if mixed_manager.mixed_types else None,
+                        choices=comparison_manager.mixed_types,
+                        value=comparison_manager.mixed_types[0] if comparison_manager.mixed_types else None,
                         label="Select Mixed Type to Compare",
                     )
 
-                # State for indices
-                pos_idx_state = gr.State(0)
-                neg_idx_state = gr.State(0)
+                    type_pos_idx_state = gr.State(0)
+                    type_neg_idx_state = gr.State(0)
 
-                with gr.Row():
-                    # Left Column: Positive
-                    with gr.Column():
-                        pos_header = gr.Markdown("### ✅ IS Dataset")
-                        pos_hl_box = gr.HighlightedText(label="Context", interactive=False, show_legend=False, value="")
-                        pos_info_box = gr.HTML()
-                        pos_next_btn = gr.Button("Next Example ➡️")
-
-                    # Right Column: Negative
-                    with gr.Column():
-                        neg_header = gr.Markdown("### ❌ NOT Dataset")
-                        neg_hl_box = gr.HighlightedText(label="Context", interactive=False, show_legend=False, value="")
-                        neg_info_box = gr.HTML()
-                        neg_next_btn = gr.Button("Next Example ➡️")
+                    with gr.Row():
+                        with gr.Column():
+                            type_pos_header = gr.Markdown("### ✅ IS Dataset")
+                            type_pos_hl_box = gr.HighlightedText(label="Context", interactive=False, show_legend=False, value="")
+                            type_pos_info_box = gr.HTML()
+                            type_pos_next_btn = gr.Button("Next Example ➡️")
+
+                        with gr.Column():
+                            type_neg_header = gr.Markdown("### ❌ NOT Dataset")
+                            type_neg_hl_box = gr.HighlightedText(label="Context", interactive=False, show_legend=False, value="")
+                            type_neg_info_box = gr.HTML()
+                            type_neg_next_btn = gr.Button("Next Example ➡️")
+
+                # Term comparison
+                with gr.Group(visible=False) as term_comparison_group:
+                    gr.Markdown("**Compare by Term**: See how the exact same term appears in different validation contexts")
+                    comp_term_selector = gr.Dropdown(
+                        choices=comparison_manager.confusing_terms,
+                        value=comparison_manager.confusing_terms[0] if comparison_manager.confusing_terms else None,
+                        label="Select Confusing Term to Compare",
+                    )
+
+                    term_pos_idx_state = gr.State(0)
+                    term_neg_idx_state = gr.State(0)
+
+                    with gr.Row():
+                        with gr.Column():
+                            term_pos_header = gr.Markdown("### ✅ IS Dataset")
+                            term_pos_hl_box = gr.HighlightedText(label="Context", interactive=False, show_legend=False, value="")
+                            term_pos_info_box = gr.HTML()
+                            term_pos_next_btn = gr.Button("Next Example ➡️")
+
+                        with gr.Column():
+                            term_neg_header = gr.Markdown("### ❌ NOT Dataset")
+                            term_neg_hl_box = gr.HighlightedText(label="Context", interactive=False, show_legend=False, value="")
+                            term_neg_info_box = gr.HTML()
+                            term_neg_next_btn = gr.Button("Next Example ➡️")
+
+                # Toggle visibility based on mode
+                def toggle_comparison_mode(mode):
+                    return gr.update(visible=mode == "By Type"), gr.update(visible=mode == "By Term")
 
-                # Events
+                comparison_mode.change(
+                    fn=toggle_comparison_mode,
+                    inputs=[comparison_mode],
+                    outputs=[type_comparison_group, term_comparison_group]
+                )
+
+                # Type comparison events
                 comp_type_selector.change(
-                    fn=lambda: (0, 0),  # Reset indices
-                    outputs=[pos_idx_state, neg_idx_state]
+                    fn=lambda: (0, 0),
+                    outputs=[type_pos_idx_state, type_neg_idx_state]
                 ).then(
-                    fn=load_comparison,
-                    inputs=[comp_type_selector, pos_idx_state, neg_idx_state],
-                    outputs=[pos_hl_box, pos_info_box, neg_hl_box, neg_info_box, pos_header, neg_header]
+                    fn=load_type_comparison,
+                    inputs=[comp_type_selector, type_pos_idx_state, type_neg_idx_state],
+                    outputs=[type_pos_hl_box, type_pos_info_box, type_neg_hl_box, type_neg_info_box, type_pos_header, type_neg_header]
                 )
 
-                pos_next_btn.click(
+                type_pos_next_btn.click(
                     fn=next_pos,
-                    inputs=[comp_type_selector, pos_idx_state],
-                    outputs=[pos_idx_state]
+                    inputs=[type_pos_idx_state],
+                    outputs=[type_pos_idx_state]
                 ).then(
-                    fn=load_comparison,
-                    inputs=[comp_type_selector, pos_idx_state, neg_idx_state],
-                    outputs=[pos_hl_box, pos_info_box, neg_hl_box, neg_info_box, pos_header, neg_header]
+                    fn=load_type_comparison,
+                    inputs=[comp_type_selector, type_pos_idx_state, type_neg_idx_state],
+                    outputs=[type_pos_hl_box, type_pos_info_box, type_neg_hl_box, type_neg_info_box, type_pos_header, type_neg_header]
                 )
 
-                neg_next_btn.click(
+                type_neg_next_btn.click(
                     fn=next_neg,
-                    inputs=[comp_type_selector, neg_idx_state],
-                    outputs=[neg_idx_state]
+                    inputs=[type_neg_idx_state],
+                    outputs=[type_neg_idx_state]
+                ).then(
+                    fn=load_type_comparison,
+                    inputs=[comp_type_selector, type_pos_idx_state, type_neg_idx_state],
+                    outputs=[type_pos_hl_box, type_pos_info_box, type_neg_hl_box, type_neg_info_box, type_pos_header, type_neg_header]
+                )
+
+                # Term comparison events
+                comp_term_selector.change(
+                    fn=lambda: (0, 0),
+                    outputs=[term_pos_idx_state, term_neg_idx_state]
                 ).then(
-                    fn=load_comparison,
-                    inputs=[comp_type_selector, pos_idx_state, neg_idx_state],
-                    outputs=[pos_hl_box, pos_info_box, neg_hl_box, neg_info_box, pos_header, neg_header]
+                    fn=load_term_comparison,
+                    inputs=[comp_term_selector, term_pos_idx_state, term_neg_idx_state],
+                    outputs=[term_pos_hl_box, term_pos_info_box, term_neg_hl_box, term_neg_info_box, term_pos_header, term_neg_header]
                 )
 
-                # Initial Load
-                demo.load(
-                    fn=load_comparison,
-                    inputs=[comp_type_selector, pos_idx_state, neg_idx_state],
-                    outputs=[pos_hl_box, pos_info_box, neg_hl_box, neg_info_box, pos_header, neg_header]
+                term_pos_next_btn.click(
+                    fn=next_pos,
+                    inputs=[term_pos_idx_state],
+                    outputs=[term_pos_idx_state]
+                ).then(
+                    fn=load_term_comparison,
+                    inputs=[comp_term_selector, term_pos_idx_state, term_neg_idx_state],
+                    outputs=[term_pos_hl_box, term_pos_info_box, term_neg_hl_box, term_neg_info_box, term_pos_header, term_neg_header]
+                )
+
+                term_neg_next_btn.click(
+                    fn=next_neg,
+                    inputs=[term_neg_idx_state],
+                    outputs=[term_neg_idx_state]
+                ).then(
+                    fn=load_term_comparison,
+                    inputs=[comp_term_selector, term_pos_idx_state, term_neg_idx_state],
+                    outputs=[term_pos_hl_box, term_pos_info_box, term_neg_hl_box, term_neg_info_box, term_pos_header, term_neg_header]
                 )
 
     return demo
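The new tab's wiring combines three Gradio patterns: gr.State for per-session indices, .click(...).then(...) chains so the re-render runs after the index update, and gr.update(visible=...) to switch groups. A self-contained toy demo of the same wiring (hypothetical components, not the app above):

```python
import gradio as gr

with gr.Blocks() as sketch:
    mode = gr.Radio(["A", "B"], value="A", label="Mode")
    with gr.Group(visible=True) as group_a:
        gr.Markdown("Panel A")
    with gr.Group(visible=False) as group_b:
        gr.Markdown("Panel B")

    idx = gr.State(0)          # per-session counter, not shared across users
    out = gr.Markdown("index: 0")
    nxt = gr.Button("Next")

    # Step 1 bumps the state; the chained step 2 runs afterwards and sees
    # the updated value.
    nxt.click(fn=lambda i: i + 1, inputs=[idx], outputs=[idx]).then(
        fn=lambda i: f"index: {i}", inputs=[idx], outputs=[out]
    )

    # gr.update(visible=...) toggles which group is rendered.
    mode.change(
        fn=lambda m: (gr.update(visible=m == "A"), gr.update(visible=m == "B")),
        inputs=[mode],
        outputs=[group_a, group_b],
    )

if __name__ == "__main__":
    sketch.launch()
```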
 
consolidated_data_optimized.json ADDED
The diff for this file is too large to render. See raw diff