File size: 5,708 Bytes
e672262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import difflib
import html
import re
from typing import List, Tuple


# --- Helper Function for Markdown Highlighting ---
def generate_highlighted_markdown(text: str, spans_with_info) -> str:
    """Applies highlighting spans with hover info to text for Markdown output.

    Args:
        text: The raw text to annotate. Text outside highlighted spans is
            emitted verbatim (NO HTML escaping); text inside spans is escaped.
        spans_with_info: Iterable of ``(start, end, hover_text)`` tuples.
            ``start``/``end`` are character indices into ``text``; invalid
            entries (non-int, negative, reversed, or out of range) are
            silently dropped.

    Returns:
        ``text`` with each valid span wrapped in a ``<span>`` element whose
        ``title`` attribute carries the hover text.
    """
    # Keep only well-formed spans and process them in start order.
    valid_spans = sorted(
        [
            (s, e, info)
            for s, e, info in spans_with_info  # Unpack the tuple
            if isinstance(s, int) and isinstance(e, int) and 0 <= s <= e <= len(text)
        ],
        key=lambda x: x[0],
    )

    highlighted_parts = []
    current_pos = 0
    for start, end, hover_text in valid_spans:
        # BUGFIX: clip overlapping spans so text already emitted by a
        # previous span is not duplicated in the output.
        start = max(start, current_pos)
        # Add text before the current span (NO HTML escaping)
        if start > current_pos:
            highlighted_parts.append(text[current_pos:start])
        # Add the highlighted span with a title attribute for hover display.
        if start < end:
            # Escape hover text for the title attribute (quote=True so the
            # single-quoted attribute cannot be broken out of).
            escaped_hover_text = html.escape(hover_text, quote=True)
            # Escape span content for display
            escaped_content = html.escape(text[start:end])
            highlighted_parts.append(
                f"<span style='background-color: lightgreen;' title='{escaped_hover_text}'>{escaped_content}</span>"
            )
        # Update current position, ensuring it doesn't go backward in case of overlap
        current_pos = max(current_pos, end)

    # Add any remaining text after the last span (NO HTML escaping)
    if current_pos < len(text):
        highlighted_parts.append(text[current_pos:])

    return "".join(highlighted_parts)


# --- Citation Span Matching Function ---
def find_citation_spans(document: str, citation: str) -> List[Tuple[int, int]]:
    """
    Locate the character ranges of *document* that the *citation* text most
    plausibly originated from, tolerating fragments, reordering, and minor
    typos. Works on lowercase alphanumeric word tokens via SequenceMatcher,
    greedily taking the longest remaining word match each round, then maps
    the matched word blocks back to character offsets.

    Args:
        document: The source document string.
        citation: The citation string, potentially with fragments/typos.

    Returns:
        A list of (start, end) character tuples from the document,
        representing the most likely origins of the citation fragments.
    """
    word_re = re.compile(r"[a-zA-Z0-9]+")
    # Tokenize both strings into (word, start_char, end_char) triples.
    doc_toks = [(m.group(0), m.start(), m.end()) for m in word_re.finditer(document)]
    cite_toks = [(m.group(0), m.start(), m.end()) for m in word_re.finditer(citation)]
    if not doc_toks or not cite_toks:
        return []

    doc_words = [tok[0].lower() for tok in doc_toks]
    cite_words = [tok[0].lower() for tok in cite_toks]

    matcher = difflib.SequenceMatcher(None, doc_words, cite_words, autojunk=False)

    def carve(regions, lo, hi):
        """Remove the half-open word range [lo, hi) from a list of regions,
        splitting any region it overlaps and keeping the rest untouched."""
        result = []
        for a, b in regions:
            if lo < b and hi > a:  # overlaps this region
                if lo > a:
                    result.append((a, lo))  # piece before the match
                if hi < b:
                    result.append((hi, b))  # piece after the match
            else:
                result.append((a, b))
        return result

    blocks = []
    words_left = len(cite_words)
    doc_regions = [(0, len(doc_words))]
    cite_regions = [(0, len(cite_words))]

    # Greedily peel off the longest word-level match among all still-unmatched
    # (document region x citation region) pairs until the citation is consumed
    # or no further match exists.
    while words_left > 0:
        candidates = [
            m
            for da, db in doc_regions
            for ca, cb in cite_regions
            for m in (matcher.find_longest_match(da, db, ca, cb),)
            if m.size > 0
        ]
        if not candidates:
            break
        best = max(candidates, key=lambda m: m.size)
        blocks.append(best)
        words_left -= best.size
        doc_regions = carve(doc_regions, best.a, best.a + best.size)
        cite_regions = carve(cite_regions, best.b, best.b + best.size)

    # Map word blocks back to character spans, merging blocks that are
    # adjacent (or separated by at most one character) in the document.
    spans: List[Tuple[int, int]] = []
    for block in sorted(blocks, key=lambda m: m.a):
        if block.size == 0:
            continue
        lo = doc_toks[block.a][1]
        hi = doc_toks[block.a + block.size - 1][2]
        if spans and spans[-1][1] >= lo - 1:
            prev_lo, prev_hi = spans[-1]
            spans[-1] = (prev_lo, max(prev_hi, hi))
        else:
            spans.append((lo, hi))

    return spans