oliver-aizip kai-aizip committed on
Commit
d9de1e9
·
verified ·
1 Parent(s): c93381b

Rewrote the processor to make it more robust to edge cases (#5)

Browse files

- Rewrote the processor to make it more robust to edge cases (6c7b9049870d1e05651edd6f1a40ee658741fed4)


Co-authored-by: Kai <kai-aizip@users.noreply.huggingface.co>

Files changed (1) hide show
  1. utils/context_processor.py +460 -157
utils/context_processor.py CHANGED
@@ -1,155 +1,460 @@
1
  import re
 
2
  import json
 
3
 
4
- def debug_text(text, label="Text"):
5
- """Helper function to debug text processing issues"""
6
- print(f"\n--- DEBUG {label} ---")
7
- print(f"Length: {len(text)}")
8
- print(f"First 100 chars: {text[:100]}")
9
- print(f"Contains highlight_start: {'[[highlight_start]]' in text}")
10
- print(f"Contains start_highlight: {'[[start_highlight]]' in text}")
11
- print("-------------------------\n")
12
-
13
- def clean_json_text(text):
14
- """
15
- Handle text that came from JSON and might have JSON escaping.
16
- This handles the case of text like: "the sky isn\\'t falling"
17
- """
18
- # First attempt to clean JSON-style escapes
19
- try:
20
- # Try to treat the string as if it were a JSON string
21
- if '\\' in text:
22
- # Create a valid JSON string with the text as content
23
- json_str = json.dumps({"text": text})
24
- # Parse it back to get properly unescaped text
25
- parsed = json.loads(json_str)
26
- return parsed["text"]
27
- except Exception:
28
- # If that fails, continue with the original text
29
- pass
30
 
31
- return text
32
-
33
- def process_highlights(text):
34
- """
35
- Process highlight markers in text to create HTML highlighted text.
36
- Handles both standard format and alternative format.
37
- Also properly handles escaped quotes.
38
- """
39
- # Debug info
40
- # debug_text(text, "Before processing")
41
-
42
- # Clean JSON escaping
43
- text = clean_json_text(text)
44
 
45
- # Process highlight tags
46
- pattern1 = r'\[\[highlight_start\]\](.*?)\[\[highlight_end\]\]'
47
- replacement = r'<span class="highlight">\1</span>'
48
- highlighted_text = re.sub(pattern1, replacement, text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- pattern2 = r'\[\[start_highlight\]\](.*?)\[\[end_highlight\]\]'
51
- highlighted_text = re.sub(pattern2, replacement, highlighted_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
- # Debug info
54
- # debug_text(highlighted_text, "After processing")
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- return highlighted_text
57
-
58
- def process_table_with_highlights(markdown_table):
59
- """
60
- Special function to process markdown tables with highlights.
61
- Ensures the table structure is preserved while applying highlights.
62
- """
63
- # First, split the table into lines
64
- lines = markdown_table.strip().split('\n')
65
- processed_lines = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- for line in lines:
68
- # Process highlights in each line
69
- processed_line = process_highlights(line)
70
- processed_lines.append(processed_line)
 
 
 
 
 
 
 
 
71
 
72
- return convert_markdown_table_to_html('\n'.join(processed_lines))
73
-
74
- def convert_markdown_table_to_html(markdown_text):
75
- """
76
- Converts a markdown table to an HTML table.
77
- """
78
- # Clean JSON escaping
79
- markdown_text = clean_json_text(markdown_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- lines = markdown_text.strip().split('\n')
82
- table_lines = [line for line in lines if line.strip().startswith('|')]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- if len(table_lines) < 2: # Need at least header and separator
85
- return markdown_text # Return original if not a proper table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- html = '<table class="md-table">'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- # Check if we have a header row
90
- if len(table_lines) >= 2 and '---' in table_lines[1]:
91
- # Process header
92
- header_cells = table_lines[0].split('|')[1:-1] if table_lines[0].strip().endswith('|') else table_lines[0].split('|')[1:]
93
- html += '<thead><tr>'
94
- for cell in header_cells:
95
- # Process highlights in the cell
96
- processed_cell = process_highlights(cell.strip())
97
- html += f'<th>{processed_cell}</th>'
98
- html += '</tr></thead>'
99
-
100
- # Process data rows (skip the separator row at index 1)
101
- html += '<tbody>'
102
- for line in table_lines[2:]:
103
- if not line.strip():
104
- continue
105
 
106
- cells = line.split('|')[1:-1] if line.strip().endswith('|') else line.split('|')[1:]
107
- html += '<tr>'
108
- for cell in cells:
109
- # Process highlights in the cell
110
- processed_cell = process_highlights(cell.strip())
111
- html += f'<td>{processed_cell}</td>'
112
- html += '</tr>'
113
- html += '</tbody>'
114
- else:
115
- # No header row, treat all rows as data
116
- html += '<tbody>'
117
- for line in table_lines:
118
- if not line.strip():
119
- continue
120
 
121
- cells = line.split('|')[1:-1] if line.strip().endswith('|') else line.split('|')[1:]
122
- html += '<tr>'
123
- for cell in cells:
124
- # Process highlights in the cell
125
- processed_cell = process_highlights(cell.strip())
126
- html += f'<td>{processed_cell}</td>'
127
- html += '</tr>'
128
- html += '</tbody>'
 
 
129
 
130
- html += '</table>'
131
- return html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  def get_context_html(example, show_full=False):
134
- """
135
- Formats the context chunks into an HTML string for display using specific CSS classes.
136
- Includes an alert for insufficient context and applies highlighting.
137
-
138
- Parameters:
139
- - example: The example data containing contexts
140
- - show_full: Boolean indicating whether to show full context
141
- """
142
- html = ""
143
 
144
- # Add insufficient context warning if needed
145
  if example.get("insufficient", False):
146
  insufficient_reason = example.get("insufficient_reason", "")
147
- reason_html = f"<p>{insufficient_reason}</p>" if insufficient_reason else "<p>The context may not contain enough information to fully answer the question, or the question might be ambiguous. Models should ideally indicate this limitation or refuse to answer.</p>"
 
 
 
 
 
148
 
149
- html += f"""
150
  <div class="insufficient-alert">
151
  <strong>
152
- <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align: middle; margin-right: 5px;">
 
 
153
  <path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"></path>
154
  <line x1="12" y1="9" x2="12" y2="13"></line>
155
  <line x1="12" y1="17" x2="12.01" y2="17"></line>
@@ -160,47 +465,45 @@ def get_context_html(example, show_full=False):
160
  </div>
161
  """
162
 
163
- # Create container div for all context items
164
- html += '<div class="context-items-container">'
165
 
166
- # Determine which context to display based on show_full flag
167
  if show_full and "full_contexts" in example and example["full_contexts"]:
168
- # If showing full context, create individual items for each chunk without headers
169
  for context_item in example["full_contexts"]:
170
- context_text = context_item.get('content', '')
 
171
 
172
- # Check for markdown table format (both standard and newline format)
173
- if '|' in context_text and ('\n|' in context_text or '\n-' in context_text):
174
- # Process as a table
175
- html += f'<div class="context-item">{process_table_with_highlights(context_text)}</div>'
176
- else:
177
- # Regular text content - process highlights
178
- processed_text = process_highlights(context_text)
179
- html += f'<div class="context-item">{processed_text}</div>'
180
  else:
181
- # Show the highlighted context items
182
  if "contexts" in example and example["contexts"]:
183
  for context_item in example["contexts"]:
184
- chunk_num = context_item.get('chunk_num', '')
185
- context_text = context_item.get('content', '')
 
 
 
 
186
  is_primary = context_item.get('is_primary', False)
 
 
 
187
 
188
- # Add appropriate class for primary chunks
 
 
 
 
 
189
  extra_class = " primary-context" if is_primary else ""
190
 
191
- # Check for markdown table format
192
- if '|' in context_text and ('\n|' in context_text or '\n-' in context_text):
193
- # Process as a table
194
- html += f'<div class="context-item{extra_class}">{process_table_with_highlights(context_text)}</div>'
195
- else:
196
- # Regular text with potential highlights
197
- processed_text = process_highlights(context_text)
198
- html += f'<div class="context-item{extra_class}">{processed_text}</div>'
199
  else:
200
- # If no contexts available, show a message
201
- html += '<div class="context-item">No context available. Try toggling to full context view.</div>'
202
 
203
- # Close the container div
204
- html += '</div>'
205
 
206
- return html
 
1
  import re
2
+ import html
3
  import json
4
+ from typing import Dict, List, Tuple, Optional, Any, Union
5
 
6
class ContextProcessor:
    """Processes highlighted contexts for the RAG Summarizer Arena.

    All methods are static. The pipeline is: clean the raw text once
    (``clean_text``), repair unbalanced quotes / highlight markers, split the
    text into highlighted and plain runs, HTML-escape every run, and wrap the
    highlighted ones in ``<span class="highlight">``. Markdown tables are
    rendered as ``<table class="md-table">``.
    """

    # Common HTML entities that might be incomplete (missing the trailing ';')
    INCOMPLETE_ENTITIES = {
        '&#x27': '&#x27;',
        '&quot': '&quot;',
        '&lt': '&lt;',
        '&gt': '&gt;',
        '&amp': '&amp;',
    }

    # (start, end) marker pairs recognised as highlight delimiters.
    _HIGHLIGHT_PAIRS = (
        ('[[start_highlight]]', '[[end_highlight]]'),
        ('[[highlight_start]]', '[[highlight_end]]'),
        ('<span class="highlight">', '</span>'),
    )

    # One alternation over all marker pairs so that matches are found left to
    # right and can never overlap each other.
    _HIGHLIGHT_RE = re.compile(
        "|".join(
            f"{re.escape(start)}(.*?){re.escape(end)}"
            for start, end in _HIGHLIGHT_PAIRS
        ),
        re.DOTALL,
    )

    # An incomplete entity is one of the known prefixes NOT followed by ';'.
    _INCOMPLETE_ENTITY_RE = re.compile(
        "(" + "|".join(re.escape(entity) for entity in INCOMPLETE_ENTITIES) + ")(?!;)"
    )

    # Backslash escapes decoded by clean_text, keyed by the escaped character.
    _ESCAPES = {'"': '"', "'": "'", 'n': '\n', 't': '\t', '\\': '\\'}
    _ESCAPE_RE = re.compile(r'\\(["\'nt\\])')

    # Typographic quotes, backtick and acute accent are normalised to ASCII.
    _QUOTE_TABLE = str.maketrans({
        '\u201c': '"', '\u201d': '"',
        '\u2018': "'", '\u2019': "'",
        '`': "'", '\u00b4': "'",
    })

    # Content that is replaced by its abbreviated form when one is supplied.
    # NOTE(review): presumably a known truncated snippet in the dataset -- confirm.
    _KNOWN_TRUNCATED_SNIPPETS = frozenset({"In Oklahoma,"})

    @staticmethod
    def clean_text(text: str) -> str:
        """Fix HTML entities and decode backslash escapes in *text*.

        Steps, in order: complete known entities that lack ';', unescape HTML
        entities, decode ``\\"``, ``\\'``, ``\\n``, ``\\t`` and ``\\\\`` in a
        single pass, normalise typographic quotes to ASCII, and drop trailing
        backslashes (together with trailing whitespace).

        Falsy or non-string input is returned unchanged.
        """
        if not text or not isinstance(text, str):
            return text

        entities = ContextProcessor.INCOMPLETE_ENTITIES
        text = ContextProcessor._INCOMPLETE_ENTITY_RE.sub(
            lambda m: entities[m.group(1)], text
        )
        text = html.unescape(text)

        # Single pass: sequential str.replace calls would misread an escaped
        # backslash followed by 'n' (e.g. C:\\new) as a newline escape.
        escapes = ContextProcessor._ESCAPES
        text = ContextProcessor._ESCAPE_RE.sub(lambda m: escapes[m.group(1)], text)

        text = text.translate(ContextProcessor._QUOTE_TABLE)

        # A dangling backslash is the remnant of a truncated escape sequence.
        stripped = text.rstrip()
        if stripped.endswith('\\'):
            text = stripped.rstrip('\\')

        return text

    @staticmethod
    def balance_highlight_tags(text: str) -> str:
        """Add missing highlight markers so every pair has equal counts.

        Missing end markers are appended to the text; missing start markers
        are prepended. Falsy or non-string input is returned unchanged.

        NOTE: for the ``<span class="highlight">`` pair, every ``</span>`` in
        the text is counted, including ones closing other spans.
        """
        if not text or not isinstance(text, str):
            return text

        for start_tag, end_tag in ContextProcessor._HIGHLIGHT_PAIRS:
            surplus_starts = text.count(start_tag) - text.count(end_tag)
            if surplus_starts > 0:
                text += end_tag * surplus_starts
            elif surplus_starts < 0:
                text = start_tag * (-surplus_starts) + text

        return text

    @staticmethod
    def balance_quotes(text: str) -> str:
        """Append a closing ``"`` when *text* has an odd number of double quotes.

        Backslash-escaped quotes (``\\"``) are not counted. Falsy or
        non-string input is returned unchanged.
        """
        if not text or not isinstance(text, str):
            return text

        unescaped_quotes = text.replace('\\"', '').count('"')
        if unescaped_quotes % 2 == 1:
            text += '"'

        return text

    @staticmethod
    def extract_highlight_parts(text: str) -> List[Tuple[bool, str]]:
        """Split *text* into ``(is_highlighted, fragment)`` tuples, in order.

        Markers are balanced first. Highlights are located in one
        left-to-right scan, so overlapping markers of different styles never
        cause a fragment to be emitted twice. Text without highlights yields
        ``[(False, text)]``; non-string input yields ``[]``.
        """
        if not isinstance(text, str):
            return []

        text = ContextProcessor.balance_highlight_tags(text)

        parts: List[Tuple[bool, str]] = []
        cursor = 0
        for match in ContextProcessor._HIGHLIGHT_RE.finditer(text):
            if match.start() > cursor:
                parts.append((False, text[cursor:match.start()]))
            # Exactly one alternative matched; lastindex is its capture group.
            parts.append((True, match.group(match.lastindex)))
            cursor = match.end()

        if cursor < len(text) or not parts:
            parts.append((False, text[cursor:]))

        return parts

    @staticmethod
    def is_markdown_table(text: str) -> bool:
        """Return True when at least two lines of *text* start with ``|``."""
        if not text or not isinstance(text, str):
            return False

        if '|' not in text or '\n' not in text:
            return False

        pipe_lines = sum(
            1 for line in text.strip().split('\n') if line.strip().startswith('|')
        )
        return pipe_lines >= 2

    @staticmethod
    def _render_clean(text: str) -> str:
        """Render already-cleaned text: balance quotes, escape, wrap highlights."""
        text = ContextProcessor.balance_quotes(text)
        rendered = []
        for is_highlighted, part in ContextProcessor.extract_highlight_parts(text):
            escaped = html.escape(part)
            if is_highlighted:
                rendered.append(f'<span class="highlight">{escaped}</span>')
            else:
                rendered.append(escaped)
        return "".join(rendered)

    @staticmethod
    def _split_row(line: str) -> List[str]:
        """Split a table line on ``|``, dropping the empty edge cells."""
        cells = [cell.strip() for cell in line.split('|')]
        if cells and not cells[0]:
            cells.pop(0)
        if cells and not cells[-1]:
            cells.pop()
        return cells

    @staticmethod
    def _row_html(line: str, cell_tag: str) -> str:
        """Render one table line as ``<tr>`` with cells of type *cell_tag*."""
        cells_html = "".join(
            f'<{cell_tag}>{ContextProcessor._render_clean(cell)}</{cell_tag}>'
            for cell in ContextProcessor._split_row(line)
        )
        return f'<tr>{cells_html}</tr>'

    @staticmethod
    def _table_html(text: str) -> str:
        """Render already-cleaned markdown table text as an HTML table.

        Only lines starting with ``|`` are used; other lines are dropped.
        With fewer than two such lines the text is rendered as plain text.
        """
        table_lines = [
            line for line in text.strip().split('\n') if line.strip().startswith('|')
        ]
        if len(table_lines) < 2:
            return ContextProcessor._render_clean(text)

        pieces = ['<table class="md-table">']
        body_lines = table_lines
        # A second line containing '---' is the header separator.
        if '---' in table_lines[1]:
            pieces.append('<thead>')
            pieces.append(ContextProcessor._row_html(table_lines[0], 'th'))
            pieces.append('</thead>')
            body_lines = table_lines[2:]

        pieces.append('<tbody>')
        pieces.extend(ContextProcessor._row_html(line, 'td') for line in body_lines)
        pieces.append('</tbody></table>')
        return "".join(pieces)

    @staticmethod
    def process_cell_content(cell_text: str) -> str:
        """Clean a single table cell and render it as escaped HTML.

        Highlight markers inside the cell become highlight spans. Non-string
        input yields ``""``.
        """
        if not isinstance(cell_text, str):
            return ""
        return ContextProcessor._render_clean(ContextProcessor.clean_text(cell_text))

    @staticmethod
    def convert_table_to_html(text: str) -> str:
        """Convert a markdown table to HTML with support for highlights in cells.

        The text is cleaned exactly once; cells are not cleaned a second
        time, so entity-encoded text such as ``&amp;lt;`` is decoded a single
        level only. Non-string input yields ``""``.
        """
        if not isinstance(text, str):
            return ""
        return ContextProcessor._table_html(ContextProcessor.clean_text(text))

    @staticmethod
    def process_text(text: str) -> str:
        """Clean *text* and render it as escaped HTML with highlight spans.

        Non-string input yields ``""``.
        """
        if not isinstance(text, str):
            return ""
        return ContextProcessor._render_clean(ContextProcessor.clean_text(text))

    @staticmethod
    def _needs_abbreviated(content: str) -> bool:
        """Return True when *content* looks truncated or unbalanced."""
        stripped = content.strip()
        if stripped in ContextProcessor._KNOWN_TRUNCATED_SNIPPETS:
            return True
        # A lone opening quote.
        if stripped.startswith('"') and content.count('"') == 1:
            return True
        # Cut off in the middle of an escape sequence.
        if content.rstrip().endswith('\\'):
            return True
        # Odd number of unescaped double quotes.
        if content.replace('\\"', '').count('"') % 2 == 1:
            return True
        # Any highlight marker pair with unequal counts.
        return any(
            content.count(start) != content.count(end)
            for start, end in ContextProcessor._HIGHLIGHT_PAIRS
        )

    @staticmethod
    def _unwrap_abbreviated(abbreviated_content: str) -> str:
        """Return the ``abbreviatedContent`` field if the input is a JSON object."""
        stripped = abbreviated_content.strip()
        if stripped.startswith('{') and stripped.endswith('}'):
            try:
                data = json.loads(abbreviated_content)
            except json.JSONDecodeError:
                return abbreviated_content
            inner = data.get("abbreviatedContent") if isinstance(data, dict) else None
            if isinstance(inner, str):
                return inner
        return abbreviated_content

    @staticmethod
    def process_content(content: str, abbreviated_content: Optional[str] = None) -> str:
        """Main function to process any kind of content into display HTML.

        Args:
            content: Raw context text. Falsy or non-string input yields ``""``.
            abbreviated_content: Optional replacement used when *content*
                looks truncated or unbalanced. May be plain text or a JSON
                object string with an ``abbreviatedContent`` field.

        Returns:
            An HTML table when the cleaned text looks like a markdown table,
            otherwise escaped text with highlight spans.
        """
        if not content or not isinstance(content, str):
            return ""

        if (
            abbreviated_content
            and isinstance(abbreviated_content, str)
            and ContextProcessor._needs_abbreviated(content)
        ):
            content = ContextProcessor._unwrap_abbreviated(abbreviated_content)

        # Clean exactly once, then decide on the cleaned text so escaped
        # newlines are already real newlines when tables are detected.
        content = ContextProcessor.clean_text(content)
        if ContextProcessor.is_markdown_table(content):
            return ContextProcessor._table_html(content)
        return ContextProcessor._render_clean(content)

    @staticmethod
    def parse_json_contexts(context_json: str) -> List[Dict[str, Any]]:
        """Parse JSON-formatted context data with fallback to regex extraction.

        Args:
            context_json: A JSON array string. An already-parsed list is
                returned as is; any other non-string value yields ``[]``.

        Returns:
            The parsed list, or ``[]`` when the JSON is valid but not a list.
            When the JSON is malformed, ``type`` and ``abbreviatedContent``
            fields are extracted with regular expressions and paired in order
            of appearance.
        """
        if isinstance(context_json, list):
            return context_json
        if isinstance(context_json, (bytes, bytearray)):
            context_json = context_json.decode('utf-8', errors='replace')
        if not isinstance(context_json, str):
            return []

        try:
            parsed = json.loads(context_json)
        except json.JSONDecodeError:
            type_pattern = r'"type":\s*"(primary|secondary)"'
            # Allow backslash-escaped characters (including quotes) in the value.
            content_pattern = r'"abbreviatedContent":\s*"((?:\\.|[^"])*?)"'
            types = re.findall(type_pattern, context_json)
            contents = re.findall(content_pattern, context_json)
            return [
                {'type': ctx_type, 'abbreviatedContent': content.replace('\\"', '"')}
                for ctx_type, content in zip(types, contents)
            ]

        return parsed if isinstance(parsed, list) else []

    @staticmethod
    def process_json_contexts(context_json: str) -> List[Dict[str, Any]]:
        """Process JSON-formatted highlighted contexts into display items.

        Returns:
            One dict per dict-typed context with keys ``chunk_num`` (1-based
            position in the parsed list), ``content`` (rendered HTML) and
            ``is_primary``. Errors are reported with ``print`` and whatever
            was processed so far is returned.
        """
        processed_contexts: List[Dict[str, Any]] = []

        try:
            contexts = ContextProcessor.parse_json_contexts(context_json)
            for chunk_num, item in enumerate(contexts, start=1):
                if not isinstance(item, dict):
                    continue
                processed_contexts.append({
                    'chunk_num': chunk_num,
                    'content': ContextProcessor.process_content(
                        item.get('abbreviatedContent', '')
                    ),
                    'is_primary': item.get('type', 'secondary') == 'primary',
                })
        except Exception as e:  # best-effort boundary: never break the page render
            print(f"Error processing JSON contexts: {e}")

        return processed_contexts
404
+
405
+
406
+ # Module-level functions for backward compatibility
407
def clean_text(text):
    """Backward-compatible module-level shim for ``ContextProcessor.clean_text``."""
    cleaned = ContextProcessor.clean_text(text)
    return cleaned
409
+
410
def balance_highlight_tags(text):
    """Backward-compatible module-level shim for ``ContextProcessor.balance_highlight_tags``."""
    balanced = ContextProcessor.balance_highlight_tags(text)
    return balanced
412
+
413
def balance_quotes(text):
    """Backward-compatible module-level shim for ``ContextProcessor.balance_quotes``."""
    balanced = ContextProcessor.balance_quotes(text)
    return balanced
415
+
416
def extract_highlight_parts(text):
    """Backward-compatible module-level shim for ``ContextProcessor.extract_highlight_parts``."""
    parts = ContextProcessor.extract_highlight_parts(text)
    return parts
418
+
419
def is_markdown_table(text):
    """Backward-compatible module-level shim for ``ContextProcessor.is_markdown_table``."""
    looks_like_table = ContextProcessor.is_markdown_table(text)
    return looks_like_table
421
+
422
def process_cell_content(cell_text):
    """Backward-compatible module-level shim for ``ContextProcessor.process_cell_content``."""
    cell_html = ContextProcessor.process_cell_content(cell_text)
    return cell_html
424
+
425
def convert_table_to_html(text):
    """Backward-compatible module-level shim for ``ContextProcessor.convert_table_to_html``."""
    table_html = ContextProcessor.convert_table_to_html(text)
    return table_html
427
+
428
def process_text(text):
    """Backward-compatible module-level shim for ``ContextProcessor.process_text``."""
    rendered = ContextProcessor.process_text(text)
    return rendered
430
+
431
def process_content(content, abbreviated_content=None):
    """Backward-compatible module-level shim for ``ContextProcessor.process_content``."""
    rendered = ContextProcessor.process_content(content, abbreviated_content)
    return rendered
433
+
434
def process_highlights(text):
    """Main entry point called from data_loader.py.

    Delegates to ``ContextProcessor.process_content`` without abbreviated
    content.
    """
    rendered = ContextProcessor.process_content(text)
    return rendered
437
 
438
  def get_context_html(example, show_full=False):
439
+ """Format context chunks into HTML for display"""
440
+ html_output = ""
 
 
 
 
 
 
 
441
 
442
+ # Process insufficient context warning if needed
443
  if example.get("insufficient", False):
444
  insufficient_reason = example.get("insufficient_reason", "")
445
+ reason_html = (
446
+ f"<p>{insufficient_reason}</p>" if insufficient_reason else
447
+ "<p>The context may not contain enough information to fully answer the question, "
448
+ "or the question might be ambiguous. Models should ideally indicate this limitation "
449
+ "or refuse to answer.</p>"
450
+ )
451
 
452
+ html_output += f"""
453
  <div class="insufficient-alert">
454
  <strong>
455
+ <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none"
456
+ stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"
457
+ style="vertical-align: middle; margin-right: 5px;">
458
  <path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"></path>
459
  <line x1="12" y1="9" x2="12" y2="13"></line>
460
  <line x1="12" y1="17" x2="12.01" y2="17"></line>
 
465
  </div>
466
  """
467
 
468
+ html_output += '<div class="context-items-container">'
 
469
 
470
+ # Display full contexts if requested
471
  if show_full and "full_contexts" in example and example["full_contexts"]:
 
472
  for context_item in example["full_contexts"]:
473
+ content = context_item.get('content', '')
474
+ abbreviated = context_item.get('abbreviatedContent', None)
475
 
476
+ # Process the content
477
+ processed = ContextProcessor.process_content(content, abbreviated)
478
+
479
+ html_output += f'<div class="context-item">{processed}</div>'
 
 
 
 
480
  else:
481
+ # Display regular contexts if available
482
  if "contexts" in example and example["contexts"]:
483
  for context_item in example["contexts"]:
484
+ content = context_item.get('content', '')
485
+ abbreviated = context_item.get('abbreviatedContent', None)
486
+
487
+ # Process the content
488
+ processed = ContextProcessor.process_content(content, abbreviated)
489
+
490
  is_primary = context_item.get('is_primary', False)
491
+ extra_class = " primary-context" if is_primary else ""
492
+
493
+ html_output += f'<div class="context-item{extra_class}">{processed}</div>'
494
 
495
+ # Or process JSON-structured highlighted contexts
496
+ elif "contexts_highlighted" in example and example["contexts_highlighted"]:
497
+ processed_contexts = ContextProcessor.process_json_contexts(example["contexts_highlighted"])
498
+
499
+ for context_item in processed_contexts:
500
+ is_primary = context_item.get('is_primary', False)
501
  extra_class = " primary-context" if is_primary else ""
502
 
503
+ html_output += f'<div class="context-item{extra_class}">{context_item["content"]}</div>'
 
 
 
 
 
 
 
504
  else:
505
+ html_output += '<div class="context-item">No context available. Try toggling to full context view.</div>'
 
506
 
507
+ html_output += '</div>'
 
508
 
509
+ return html_output