SLM-RAG-Arena

Running on Zero

App Files Files Community

oliver-aizip

kai-aizip committed 5 days ago

Commit

d9de1e9

verified ·

1 Parent(s): c93381b

Rewrote the processor to make it more robust to edge cases (#5)

Browse files

- Rewrote the processor to make it more robust to edge cases (6c7b9049870d1e05651edd6f1a40ee658741fed4)

Co-authored-by: Kai <kai-aizip@users.noreply.huggingface.co>

Files changed (1) hide show

utils/context_processor.py +460 -157

utils/context_processor.py CHANGED Viewed

@@ -1,155 +1,460 @@
 import re
 import json
-def debug_text(text, label="Text"):
-    """Helper function to debug text processing issues"""
-    print(f"\n--- DEBUG {label} ---")
-    print(f"Length: {len(text)}")
-    print(f"First 100 chars: {text[:100]}")
-    print(f"Contains highlight_start: {'[[highlight_start]]' in text}")
-    print(f"Contains start_highlight: {'[[start_highlight]]' in text}")
-    print("-------------------------\n")
-def clean_json_text(text):
-    """
-    Handle text that came from JSON and might have JSON escaping.
-    This handles the case of text like: "the sky isn\\'t falling"
-    """
-    # First attempt to clean JSON-style escapes
-    try:
-        # Try to treat the string as if it were a JSON string
-        if '\\' in text:
-            # Create a valid JSON string with the text as content
-            json_str = json.dumps({"text": text})
-            # Parse it back to get properly unescaped text
-            parsed = json.loads(json_str)
-            return parsed["text"]
-    except Exception:
-        # If that fails, continue with the original text
-        pass
-    return text
-def process_highlights(text):
-    """
-    Process highlight markers in text to create HTML highlighted text.
-    Handles both standard format and alternative format.
-    Also properly handles escaped quotes.
-    """
-    # Debug info
-    # debug_text(text, "Before processing")
-    # Clean JSON escaping
-    text = clean_json_text(text)
-    # Process highlight tags
-    pattern1 = r'\[\[highlight_start\]\](.*?)\[\[highlight_end\]\]'
-    replacement = r'<span class="highlight">\1</span>'
-    highlighted_text = re.sub(pattern1, replacement, text)
-    pattern2 = r'\[\[start_highlight\]\](.*?)\[\[end_highlight\]\]'
-    highlighted_text = re.sub(pattern2, replacement, highlighted_text)
-    # Debug info
-    # debug_text(highlighted_text, "After processing")
-    return highlighted_text
-def process_table_with_highlights(markdown_table):
-    """
-    Special function to process markdown tables with highlights.
-    Ensures the table structure is preserved while applying highlights.
-    """
-    # First, split the table into lines
-    lines = markdown_table.strip().split('\n')
-    processed_lines = []
-    for line in lines:
-        # Process highlights in each line
-        processed_line = process_highlights(line)
-        processed_lines.append(processed_line)
-    return convert_markdown_table_to_html('\n'.join(processed_lines))
-def convert_markdown_table_to_html(markdown_text):
-    """
-    Converts a markdown table to an HTML table.
-    """
-    # Clean JSON escaping
-    markdown_text = clean_json_text(markdown_text)
-    lines = markdown_text.strip().split('\n')
-    table_lines = [line for line in lines if line.strip().startswith('|')]
-    if len(table_lines) < 2:  # Need at least header and separator
-        return markdown_text  # Return original if not a proper table
-    html = '<table class="md-table">'
-    # Check if we have a header row
-    if len(table_lines) >= 2 and '---' in table_lines[1]:
-        # Process header
-        header_cells = table_lines[0].split('|')[1:-1] if table_lines[0].strip().endswith('|') else table_lines[0].split('|')[1:]
-        html += '<thead><tr>'
-        for cell in header_cells:
-            # Process highlights in the cell
-            processed_cell = process_highlights(cell.strip())
-            html += f'<th>{processed_cell}</th>'
-        html += '</tr></thead>'
-        # Process data rows (skip the separator row at index 1)
-        html += '<tbody>'
-        for line in table_lines[2:]:
-            if not line.strip():
-                continue
-            cells = line.split('|')[1:-1] if line.strip().endswith('|') else line.split('|')[1:]
-            html += '<tr>'
-            for cell in cells:
-                # Process highlights in the cell
-                processed_cell = process_highlights(cell.strip())
-                html += f'<td>{processed_cell}</td>'
-            html += '</tr>'
-        html += '</tbody>'
-    else:
-        # No header row, treat all rows as data
-        html += '<tbody>'
-        for line in table_lines:
-            if not line.strip():
-                continue
-            cells = line.split('|')[1:-1] if line.strip().endswith('|') else line.split('|')[1:]
-            html += '<tr>'
-            for cell in cells:
-                # Process highlights in the cell
-                processed_cell = process_highlights(cell.strip())
-                html += f'<td>{processed_cell}</td>'
-            html += '</tr>'
-        html += '</tbody>'
-    html += '</table>'
-    return html
 def get_context_html(example, show_full=False):
-    """
-    Formats the context chunks into an HTML string for display using specific CSS classes.
-    Includes an alert for insufficient context and applies highlighting.
-    Parameters:
-    - example: The example data containing contexts
-    - show_full: Boolean indicating whether to show full context
-    """
-    html = ""
-    # Add insufficient context warning if needed
     if example.get("insufficient", False):
         insufficient_reason = example.get("insufficient_reason", "")
-        reason_html = f"<p>{insufficient_reason}</p>" if insufficient_reason else "<p>The context may not contain enough information to fully answer the question, or the question might be ambiguous. Models should ideally indicate this limitation or refuse to answer.</p>"
-        html += f"""
         <div class="insufficient-alert">
             <strong>
-                <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align: middle; margin-right: 5px;">
                     <path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"></path>
                     <line x1="12" y1="9" x2="12" y2="13"></line>
                     <line x1="12" y1="17" x2="12.01" y2="17"></line>
@@ -160,47 +465,45 @@ def get_context_html(example, show_full=False):
         </div>
         """
-    # Create container div for all context items
-    html += '<div class="context-items-container">'
-    # Determine which context to display based on show_full flag
     if show_full and "full_contexts" in example and example["full_contexts"]:
-        # If showing full context, create individual items for each chunk without headers
         for context_item in example["full_contexts"]:
-            context_text = context_item.get('content', '')
-            # Check for markdown table format (both standard and newline format)
-            if '|' in context_text and ('\n|' in context_text or '\n-' in context_text):
-                # Process as a table
-                html += f'<div class="context-item">{process_table_with_highlights(context_text)}</div>'
-            else:
-                # Regular text content - process highlights
-                processed_text = process_highlights(context_text)
-                html += f'<div class="context-item">{processed_text}</div>'
     else:
-        # Show the highlighted context items
         if "contexts" in example and example["contexts"]:
             for context_item in example["contexts"]:
-                chunk_num = context_item.get('chunk_num', '')
-                context_text = context_item.get('content', '')
                 is_primary = context_item.get('is_primary', False)
-                # Add appropriate class for primary chunks
                 extra_class = " primary-context" if is_primary else ""
-                # Check for markdown table format
-                if '|' in context_text and ('\n|' in context_text or '\n-' in context_text):
-                    # Process as a table
-                    html += f'<div class="context-item{extra_class}">{process_table_with_highlights(context_text)}</div>'
-                else:
-                    # Regular text with potential highlights
-                    processed_text = process_highlights(context_text)
-                    html += f'<div class="context-item{extra_class}">{processed_text}</div>'
         else:
-            # If no contexts available, show a message
-            html += '<div class="context-item">No context available. Try toggling to full context view.</div>'
-    # Close the container div
-    html += '</div>'
-    return html

 import re
+import html
 import json
+from typing import Dict, List, Tuple, Optional, Any, Union
class ContextProcessor:
    """Processes highlighted contexts for the RAG Summarizer Arena.

    Turns raw context strings into display-safe HTML. Inputs may contain
    HTML entities, backslash escapes, typographic quotes, unbalanced quotes,
    unbalanced highlight markers and markdown tables. Every method is a
    static method; the class is used purely as a namespace.
    """

    # Entities that sometimes arrive without their terminating ';',
    # mapped to the complete form.
    INCOMPLETE_ENTITIES = {
        '&#x27': '&#x27;',
        '&quot': '&quot;',
        '&lt': '&lt;',
        '&gt': '&gt;',
        '&amp': '&amp;'
    }

    # (opening marker, closing marker) pairs recognised as highlights.
    HIGHLIGHT_PAIRS = (
        ('[[start_highlight]]', '[[end_highlight]]'),
        ('[[highlight_start]]', '[[highlight_end]]'),
        ('<span class="highlight">', '</span>'),
    )

    # Backslash escapes decoded by clean_text, keyed by the character that
    # follows the backslash. They are decoded in ONE regex pass so that the
    # output of one replacement is never re-read as the start of another
    # escape (an escaped backslash followed by "n" must not become a newline).
    _ESCAPE_MAP = {'"': '"', "'": "'", 'n': '\n', 't': '\t', '\\': '\\'}
    _ESCAPE_RE = re.compile(r'\\(["\'nt\\])')

    # Typographic quotes, backtick and acute accent normalised to ASCII
    # quotes. Written as escapes so the table survives re-encoding.
    _QUOTE_TABLE = str.maketrans({
        '\u201c': '"',   # left double quotation mark
        '\u201d': '"',   # right double quotation mark
        '\u2018': "'",   # left single quotation mark
        '\u2019': "'",   # right single quotation mark
        '`': "'",
        '\u00b4': "'",   # acute accent
    })

    @staticmethod
    def clean_text(text: str) -> str:
        """Decode entities and escapes and normalise quotes in ``text``.

        Steps, in order: complete entities missing their ';', decode HTML
        entities once, decode backslash escapes in a single pass, map
        typographic quotes to ASCII, and drop trailing backslashes (together
        with trailing whitespace) when the text ends in a backslash.

        Falsy or non-string input is returned unchanged.
        """
        if not text or not isinstance(text, str):
            return text
        for incomplete, complete in ContextProcessor.INCOMPLETE_ENTITIES.items():
            text = re.sub(f"{re.escape(incomplete)}(?!;)", complete, text)
        text = html.unescape(text)
        text = ContextProcessor._ESCAPE_RE.sub(
            lambda match: ContextProcessor._ESCAPE_MAP[match.group(1)], text
        )
        text = text.translate(ContextProcessor._QUOTE_TABLE)
        stripped = text.rstrip()
        if stripped.endswith('\\'):
            text = stripped.rstrip('\\')
        return text

    @staticmethod
    def balance_highlight_tags(text: str) -> str:
        """Make opening and closing highlight markers equal in number.

        For each pair in ``HIGHLIGHT_PAIRS`` the occurrences are counted;
        missing closing markers are appended to the end of the text and
        missing opening markers are prepended to the start. Balancing is by
        count only, not by position. Falsy or non-string input is returned
        unchanged.
        """
        if not text or not isinstance(text, str):
            return text
        for start_tag, end_tag in ContextProcessor.HIGHLIGHT_PAIRS:
            surplus = text.count(start_tag) - text.count(end_tag)
            if surplus > 0:
                text += end_tag * surplus
            elif surplus < 0:
                text = start_tag * (-surplus) + text
        return text

    @staticmethod
    def balance_quotes(text: str) -> str:
        """Append a closing double quote when the text has an odd number.

        Backslash-escaped quotes are not counted. Falsy or non-string input
        is returned unchanged.
        """
        if not text or not isinstance(text, str):
            return text
        unescaped_quotes = text.replace('\\"', '').count('"')
        if unescaped_quotes % 2 == 1:
            text += '"'
        return text

    @staticmethod
    def extract_highlight_parts(text: str) -> List[Tuple[bool, str]]:
        """Split ``text`` into ordered ``(is_highlighted, fragment)`` tuples.

        Markers are balanced first. Highlighted fragments carry the text
        between a marker pair (markers removed); other fragments are the text
        in between. A highlight that starts inside an earlier one (nested or
        overlapping markers) is ignored, so no part of the input is emitted
        twice; its markers stay as literal text inside the enclosing fragment.

        Returns ``[(False, text)]`` when there is no highlight and an empty
        list for non-string input.
        """
        if not isinstance(text, str):
            return []
        text = ContextProcessor.balance_highlight_tags(text)

        # Collect (start, end, inner_text) for every marker pair.
        found = []
        for start_tag, end_tag in ContextProcessor.HIGHLIGHT_PAIRS:
            pattern = f"{re.escape(start_tag)}(.*?){re.escape(end_tag)}"
            found.extend(
                (match.start(), match.end(), match.group(1))
                for match in re.finditer(pattern, text, re.DOTALL)
            )
        if not found:
            return [(False, text)]
        found.sort(key=lambda item: item[0])

        parts = []
        current_pos = 0
        for start, end, inner in found:
            if start < current_pos:
                # Starts inside the previous highlight: skipping it keeps
                # current_pos monotonic and avoids duplicated output.
                continue
            if start > current_pos:
                parts.append((False, text[current_pos:start]))
            parts.append((True, inner))
            current_pos = end
        if current_pos < len(text):
            parts.append((False, text[current_pos:]))
        return parts

    @staticmethod
    def is_markdown_table(text: str) -> bool:
        """Return True when at least two lines of ``text`` start with '|'."""
        if not text or not isinstance(text, str):
            return False
        if '|' not in text or '\n' not in text:
            return False
        pipe_lines = sum(
            1 for line in text.strip().split('\n') if line.strip().startswith('|')
        )
        return pipe_lines >= 2

    @staticmethod
    def _render_text(text: str) -> str:
        """Render ALREADY-CLEANED text to HTML.

        Balances quotes, extracts highlight fragments, HTML-escapes every
        fragment and wraps highlighted ones in ``<span class="highlight">``.
        Does not call ``clean_text``, so callers that cleaned the text once
        do not decode entities or escapes a second time.
        """
        text = ContextProcessor.balance_quotes(text)
        rendered = []
        for is_highlighted, part in ContextProcessor.extract_highlight_parts(text):
            escaped = html.escape(part)
            if is_highlighted:
                rendered.append(f'<span class="highlight">{escaped}</span>')
            else:
                rendered.append(escaped)
        return "".join(rendered)

    @staticmethod
    def _split_row(line: str) -> List[str]:
        """Split one markdown table row into stripped cell strings.

        The empty strings produced by a leading and a trailing '|' are
        dropped; empty cells in the middle of the row are kept.
        """
        cells = [cell.strip() for cell in line.split('|')]
        if cells and not cells[0]:
            cells.pop(0)
        if cells and not cells[-1]:
            cells.pop()
        return cells

    @staticmethod
    def _table_to_html(text: str) -> str:
        """Render an ALREADY-CLEANED markdown table as an HTML table.

        Only lines starting with '|' are treated as rows. If the second row
        contains '---' the first row becomes the header and the separator is
        dropped. With fewer than two rows the text is rendered as plain text.
        """
        rows = [line for line in text.strip().split('\n') if line.strip().startswith('|')]
        if len(rows) < 2:
            return ContextProcessor._render_text(text)

        out = ['<table class="md-table">']
        body_rows = rows
        if '---' in rows[1]:
            out.append('<thead><tr>')
            out.extend(
                f'<th>{ContextProcessor._render_text(cell)}</th>'
                for cell in ContextProcessor._split_row(rows[0])
            )
            out.append('</tr></thead>')
            body_rows = rows[2:]  # skip header and separator
        out.append('<tbody>')
        for row in body_rows:
            out.append('<tr>')
            out.extend(
                f'<td>{ContextProcessor._render_text(cell)}</td>'
                for cell in ContextProcessor._split_row(row)
            )
            out.append('</tr>')
        out.append('</tbody>')
        out.append('</table>')
        return "".join(out)

    @staticmethod
    def process_cell_content(cell_text: str) -> str:
        """Clean one raw table cell and render it to HTML with highlights."""
        return ContextProcessor.process_text(cell_text)

    @staticmethod
    def convert_table_to_html(text: str) -> str:
        """Convert a raw markdown table to an HTML ``<table>``.

        The text is cleaned exactly once before it is split into rows and
        cells; cells are not cleaned again, so an entity such as
        ``&amp;lt;`` is shown as the literal text ``&lt;`` rather than being
        decoded twice into ``<``.
        """
        return ContextProcessor._table_to_html(ContextProcessor.clean_text(text))

    @staticmethod
    def process_text(text: str) -> str:
        """Clean raw text and render it to HTML with highlight spans.

        Non-string input yields an empty string.
        """
        return ContextProcessor._render_text(ContextProcessor.clean_text(text))

    @staticmethod
    def _needs_abbreviated(content: str) -> bool:
        """Return True when ``content`` looks truncated or malformed.

        Triggers: a hard-coded known-bad snippet, a lone opening quote, a
        trailing backslash, an odd number of unescaped double quotes, or any
        highlight marker pair whose counts differ.
        """
        stripped = content.strip()
        if stripped == "In Oklahoma,":  # data-specific case kept from the original
            return True
        if stripped.startswith('"') and content.count('"') == 1:
            return True
        if content.rstrip().endswith('\\'):
            return True
        if content.replace('\\"', '').count('"') % 2 == 1:
            return True
        return any(
            content.count(start_tag) != content.count(end_tag)
            for start_tag, end_tag in ContextProcessor.HIGHLIGHT_PAIRS
        )

    @staticmethod
    def process_content(content: str, abbreviated_content: Optional[str] = None) -> str:
        """Render any context content (plain text or markdown table) to HTML.

        Args:
            content: Raw content. Falsy or non-string values yield "".
            abbreviated_content: Optional replacement used when ``content``
                looks malformed (see ``_needs_abbreviated``). It may be a
                JSON object string with an "abbreviatedContent" key, in which
                case that string value is used. Non-string values are ignored.

        Returns:
            HTML-escaped markup with highlights wrapped in
            ``<span class="highlight">``; a ``<table class="md-table">`` when
            the content is a markdown table.
        """
        if not content or not isinstance(content, str):
            return ""

        already_clean = False
        if (
            abbreviated_content
            and isinstance(abbreviated_content, str)
            and ContextProcessor._needs_abbreviated(content)
        ):
            candidate = abbreviated_content
            wrapper = candidate.strip()
            if wrapper.startswith('{') and wrapper.endswith('}'):
                try:
                    data = json.loads(candidate)
                except json.JSONDecodeError:
                    pass
                else:
                    inner = data.get("abbreviatedContent") if isinstance(data, dict) else None
                    if isinstance(inner, str):
                        candidate = inner
            # Clean once here so escaped newlines become real ones before
            # table detection; quote/marker balancing happens per fragment
            # during rendering, never across a whole table.
            content = ContextProcessor.clean_text(candidate)
            already_clean = True

        is_table = ContextProcessor.is_markdown_table(content)
        if not already_clean:
            content = ContextProcessor.clean_text(content)
        if is_table:
            return ContextProcessor._table_to_html(content)
        return ContextProcessor._render_text(content)

    @staticmethod
    def parse_json_contexts(context_json: str) -> List[Dict[str, Any]]:
        """Parse JSON-formatted context data, with a regex fallback.

        Args:
            context_json: JSON text expected to decode to a list. An
                already-decoded list is returned as a shallow copy.

        Returns:
            The decoded list; an empty list when the JSON decodes to a
            non-list or the input is neither text nor a list. When the text
            is not valid JSON, "type" and "abbreviatedContent" fields are
            extracted with regular expressions and paired in order.
        """
        if isinstance(context_json, list):
            return list(context_json)
        try:
            parsed = json.loads(context_json)
        except (TypeError, ValueError):
            # ValueError covers JSONDecodeError; TypeError covers None etc.
            if not isinstance(context_json, str):
                return []
        else:
            return parsed if isinstance(parsed, list) else []

        types = re.findall(r'"type":\s*"(primary|secondary)"', context_json)
        contents = re.findall(r'"abbreviatedContent":\s*"((?:\\.|[^"])*?)"', context_json)
        return [
            {'type': ctx_type, 'abbreviatedContent': content.replace('\\"', '"')}
            for ctx_type, content in zip(types, contents)
        ]

    @staticmethod
    def process_json_contexts(context_json: str) -> List[Dict[str, Any]]:
        """Parse and render JSON-formatted highlighted contexts.

        Returns one dict per parsed dict item with keys ``chunk_num``
        (1-based position in the parsed list), ``content`` (rendered HTML)
        and ``is_primary`` (True when the item's "type" is "primary").
        Non-dict items are skipped. Errors are printed and the items
        processed so far are returned.
        """
        processed_contexts = []
        try:
            contexts = ContextProcessor.parse_json_contexts(context_json)
            for position, item in enumerate(contexts, start=1):
                if not isinstance(item, dict):
                    continue
                processed_contexts.append({
                    'chunk_num': position,
                    'content': ContextProcessor.process_content(
                        item.get('abbreviatedContent', '')
                    ),
                    'is_primary': item.get('type', 'secondary') == 'primary'
                })
        except Exception as e:
            print(f"Error processing JSON contexts: {e}")
        return processed_contexts
+# Module-level functions for backward compatibility
def clean_text(text):
    """Module-level alias that forwards to ``ContextProcessor.clean_text``."""
    cleaned = ContextProcessor.clean_text(text)
    return cleaned
def balance_highlight_tags(text):
    """Module-level alias that forwards to ``ContextProcessor.balance_highlight_tags``."""
    balanced = ContextProcessor.balance_highlight_tags(text)
    return balanced
def balance_quotes(text):
    """Module-level alias that forwards to ``ContextProcessor.balance_quotes``."""
    balanced = ContextProcessor.balance_quotes(text)
    return balanced
def extract_highlight_parts(text):
    """Module-level alias that forwards to ``ContextProcessor.extract_highlight_parts``."""
    parts = ContextProcessor.extract_highlight_parts(text)
    return parts
def is_markdown_table(text):
    """Module-level alias that forwards to ``ContextProcessor.is_markdown_table``."""
    looks_like_table = ContextProcessor.is_markdown_table(text)
    return looks_like_table
def process_cell_content(cell_text):
    """Module-level alias that forwards to ``ContextProcessor.process_cell_content``."""
    cell_html = ContextProcessor.process_cell_content(cell_text)
    return cell_html
def convert_table_to_html(text):
    """Module-level alias that forwards to ``ContextProcessor.convert_table_to_html``."""
    table_html = ContextProcessor.convert_table_to_html(text)
    return table_html
def process_text(text):
    """Module-level alias that forwards to ``ContextProcessor.process_text``."""
    rendered = ContextProcessor.process_text(text)
    return rendered
def process_content(content, abbreviated_content=None):
    """Module-level alias that forwards both arguments to ``ContextProcessor.process_content``."""
    rendered = ContextProcessor.process_content(content, abbreviated_content)
    return rendered
def process_highlights(text):
    """Main entry point called from data_loader.py.

    Forwards ``text`` to ``ContextProcessor.process_content`` with no
    abbreviated fallback and returns its result.
    """
    rendered = ContextProcessor.process_content(text)
    return rendered
 def get_context_html(example, show_full=False):
+    """Format context chunks into HTML for display"""
+    html_output = ""
+    # Process insufficient context warning if needed
     if example.get("insufficient", False):
         insufficient_reason = example.get("insufficient_reason", "")
+        reason_html = (
+            f"<p>{insufficient_reason}</p>" if insufficient_reason else
+            "<p>The context may not contain enough information to fully answer the question, "
+            "or the question might be ambiguous. Models should ideally indicate this limitation "
+            "or refuse to answer.</p>"
+        )
+        html_output += f"""
         <div class="insufficient-alert">
             <strong>
+                <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none"
+                     stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"
+                     style="vertical-align: middle; margin-right: 5px;">
                     <path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"></path>
                     <line x1="12" y1="9" x2="12" y2="13"></line>
                     <line x1="12" y1="17" x2="12.01" y2="17"></line>
         </div>
         """
+    html_output += '<div class="context-items-container">'
+    # Display full contexts if requested
     if show_full and "full_contexts" in example and example["full_contexts"]:
         for context_item in example["full_contexts"]:
+            content = context_item.get('content', '')
+            abbreviated = context_item.get('abbreviatedContent', None)
+            # Process the content
+            processed = ContextProcessor.process_content(content, abbreviated)
+            html_output += f'<div class="context-item">{processed}</div>'
     else:
+        # Display regular contexts if available
         if "contexts" in example and example["contexts"]:
             for context_item in example["contexts"]:
+                content = context_item.get('content', '')
+                abbreviated = context_item.get('abbreviatedContent', None)
+                # Process the content
+                processed = ContextProcessor.process_content(content, abbreviated)
                 is_primary = context_item.get('is_primary', False)
+                extra_class = " primary-context" if is_primary else ""
+                html_output += f'<div class="context-item{extra_class}">{processed}</div>'
+        # Or process JSON-structured highlighted contexts
+        elif "contexts_highlighted" in example and example["contexts_highlighted"]:
+            processed_contexts = ContextProcessor.process_json_contexts(example["contexts_highlighted"])
+            for context_item in processed_contexts:
+                is_primary = context_item.get('is_primary', False)
                 extra_class = " primary-context" if is_primary else ""
+                html_output += f'<div class="context-item{extra_class}">{context_item["content"]}</div>'
         else:
+            html_output += '<div class="context-item">No context available. Try toggling to full context view.</div>'
+    html_output += '</div>'
+    return html_output