File size: 5,708 Bytes
e672262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import difflib
import html
import re
from typing import List, Tuple


# --- Helper Function for Markdown Highlighting ---
def generate_highlighted_markdown(text: str, spans_with_info) -> str:
    """Applies highlighting spans with hover info to text for Markdown output.

    Args:
        text: The raw text to annotate. Text outside highlighted spans is
            emitted verbatim (NO HTML escaping); text inside spans is escaped.
        spans_with_info: Iterable of ``(start, end, hover_text)`` tuples.
            ``start``/``end`` are character indices into ``text``; invalid
            entries (non-int, negative, reversed, or out of range) are
            silently dropped.

    Returns:
        ``text`` with each valid span wrapped in a ``<span>`` element whose
        ``title`` attribute carries the hover text.
    """
    # Keep only well-formed spans and process them in start order.
    valid_spans = sorted(
        [
            (s, e, info)
            for s, e, info in spans_with_info  # Unpack the tuple
            if isinstance(s, int) and isinstance(e, int) and 0 <= s <= e <= len(text)
        ],
        key=lambda x: x[0],
    )

    highlighted_parts = []
    current_pos = 0
    for start, end, hover_text in valid_spans:
        # BUGFIX: clip overlapping spans so text already emitted by a
        # previous span is not duplicated in the output.
        start = max(start, current_pos)
        # Add text before the current span (NO HTML escaping)
        if start > current_pos:
            highlighted_parts.append(text[current_pos:start])
        # Add the highlighted span with a title attribute for hover display.
        if start < end:
            # Escape hover text for the title attribute (quote=True so the
            # single-quoted attribute cannot be broken out of).
            escaped_hover_text = html.escape(hover_text, quote=True)
            # Escape span content for display
            escaped_content = html.escape(text[start:end])
            highlighted_parts.append(
                f"<span style='background-color: lightgreen;' title='{escaped_hover_text}'>{escaped_content}</span>"
            )
        # Update current position, ensuring it doesn't go backward in case of overlap
        current_pos = max(current_pos, end)

    # Add any remaining text after the last span (NO HTML escaping)
    if current_pos < len(text):
        highlighted_parts.append(text[current_pos:])

    return "".join(highlighted_parts)


# --- Citation Span Matching Function ---
def find_citation_spans(document: str, citation: str) -> List[Tuple[int, int]]:
    """
    Locate the character ranges of *document* that the *citation* text most
    plausibly originated from, tolerating fragments, reordering, and minor
    typos. Works on lowercase alphanumeric word tokens via SequenceMatcher,
    greedily taking the longest remaining word match each round, then maps
    the matched word blocks back to character offsets.

    Args:
        document: The source document string.
        citation: The citation string, potentially with fragments/typos.

    Returns:
        A list of (start, end) character tuples from the document,
        representing the most likely origins of the citation fragments.
    """
    word_re = re.compile(r"[a-zA-Z0-9]+")
    # Tokenize both strings into (word, start_char, end_char) triples.
    doc_toks = [(m.group(0), m.start(), m.end()) for m in word_re.finditer(document)]
    cite_toks = [(m.group(0), m.start(), m.end()) for m in word_re.finditer(citation)]
    if not doc_toks or not cite_toks:
        return []

    doc_words = [tok[0].lower() for tok in doc_toks]
    cite_words = [tok[0].lower() for tok in cite_toks]

    matcher = difflib.SequenceMatcher(None, doc_words, cite_words, autojunk=False)

    def carve(regions, lo, hi):
        """Remove the half-open word range [lo, hi) from a list of regions,
        splitting any region it overlaps and keeping the rest untouched."""
        result = []
        for a, b in regions:
            if lo < b and hi > a:  # overlaps this region
                if lo > a:
                    result.append((a, lo))  # piece before the match
                if hi < b:
                    result.append((hi, b))  # piece after the match
            else:
                result.append((a, b))
        return result

    blocks = []
    words_left = len(cite_words)
    doc_regions = [(0, len(doc_words))]
    cite_regions = [(0, len(cite_words))]

    # Greedily peel off the longest word-level match among all still-unmatched
    # (document region x citation region) pairs until the citation is consumed
    # or no further match exists.
    while words_left > 0:
        candidates = [
            m
            for da, db in doc_regions
            for ca, cb in cite_regions
            for m in (matcher.find_longest_match(da, db, ca, cb),)
            if m.size > 0
        ]
        if not candidates:
            break
        best = max(candidates, key=lambda m: m.size)
        blocks.append(best)
        words_left -= best.size
        doc_regions = carve(doc_regions, best.a, best.a + best.size)
        cite_regions = carve(cite_regions, best.b, best.b + best.size)

    # Map word blocks back to character spans, merging blocks that are
    # adjacent (or separated by at most one character) in the document.
    spans: List[Tuple[int, int]] = []
    for block in sorted(blocks, key=lambda m: m.a):
        if block.size == 0:
            continue
        lo = doc_toks[block.a][1]
        hi = doc_toks[block.a + block.size - 1][2]
        if spans and spans[-1][1] >= lo - 1:
            prev_lo, prev_hi = spans[-1]
            spans[-1] = (prev_lo, max(prev_hi, hi))
        else:
            spans.append((lo, hi))

    return spans