import difflib
import html
import re
from typing import List, Tuple


# --- Helper Function for Markdown Highlighting ---
def generate_highlighted_markdown(
    text: str, spans_with_info: List[Tuple[int, int, str]]
) -> str:
    """Applies highlighting spans with hover info to text for Markdown output."""
    # Ensure spans are sorted by start index and valid.
    # Expects spans_with_info to be a list of (start, end, hover_text_string).
    valid_spans = sorted(
        [
            (s, e, info)
            for s, e, info in spans_with_info  # Unpack the tuple
            if isinstance(s, int) and isinstance(e, int) and 0 <= s <= e <= len(text)
        ],
        key=lambda x: x[0],
    )

    highlighted_parts = []
    current_pos = 0
    # Iterate through sorted spans with info
    for start, end, hover_text in valid_spans:
        # Add text before the current span (NO HTML escaping)
        if start > current_pos:
            highlighted_parts.append(text[current_pos:start])
        # Add the highlighted span with title attribute
        if start < end:
            # Escape hover text for the title attribute
            escaped_hover_text = html.escape(hover_text, quote=True)
            # Escape span content for display
            escaped_content = html.escape(text[start:end])
            highlighted_parts.append(
                f"<span style='background-color: lightgreen;' title='{escaped_hover_text}'>{escaped_content}</span>"
            )
        # Update current position, ensuring it doesn't go backward in case of overlap
        current_pos = max(current_pos, end)

    # Add any remaining text after the last span (NO HTML escaping)
    if current_pos < len(text):
        highlighted_parts.append(text[current_pos:])

    return "".join(highlighted_parts)


# --- Citation Span Matching Function ---
def find_citation_spans(document: str, citation: str) -> List[Tuple[int, int]]:
    """
    Finds character spans in the document that likely form the citation,
    allowing for fragments and minor differences. Uses SequenceMatcher
    on alphanumeric words and maps the matches back to character indices.

    The search is greedy and iterative: it repeatedly takes the longest
    remaining word-level match, which accounts for cases where citation
    fragments are reordered relative to the document.

    Args:
        document: The source document string.
        citation: The citation string, potentially with fragments/typos.

    Returns:
        A list of (start, end) character tuples from the document,
        representing the most likely origins of the citation fragments.
    """
    # 1. Tokenize document and citation into ALPHANUMERIC words with char spans
    doc_tokens = [
        (m.group(0), m.start(), m.end()) for m in re.finditer(r"[a-zA-Z0-9]+", document)
    ]
    cite_tokens = [
        (m.group(0), m.start(), m.end()) for m in re.finditer(r"[a-zA-Z0-9]+", citation)
    ]
    if not doc_tokens or not cite_tokens:
        return []

    doc_words = [t[0].lower() for t in doc_tokens]
    cite_words = [t[0].lower() for t in cite_tokens]

    # 2. Find longest common blocks of words using SequenceMatcher
    matcher = difflib.SequenceMatcher(None, doc_words, cite_words, autojunk=False)
    matching_blocks = []
    matched_tokens = 0
    unmatched_doc_words = [(0, len(doc_words))]
    unmatched_cite_words = [(0, len(cite_words))]

    while matched_tokens < len(cite_words):
        next_match_candidates = []
        for da, db in unmatched_doc_words:
            for ca, cb in unmatched_cite_words:
                match = matcher.find_longest_match(da, db, ca, cb)
                if match.size > 0:
                    next_match_candidates.append(match)
        if len(next_match_candidates) == 0:
            break
        next_match = max(next_match_candidates, key=lambda x: x.size)
        matching_blocks.append(next_match)
        matched_tokens += next_match.size
        # Update the unmatched regions: remove the matched block from both the
        # document-side and citation-side segments, splitting any segment the
        # match falls inside.
        new_unmatched_docs = []
        for da, db in unmatched_doc_words:
            # Check if this doc segment overlaps with the match
            if next_match.a < db and next_match.a + next_match.size > da:
                # Add segment before the match
                if next_match.a > da:
                    new_unmatched_docs.append((da, next_match.a))
                # Add segment after the match
                if next_match.a + next_match.size < db:
                    new_unmatched_docs.append((next_match.a + next_match.size, db))
            else:
                new_unmatched_docs.append((da, db))  # Keep non-overlapping segment
        unmatched_doc_words = new_unmatched_docs

        new_unmatched_cites = []
        for ca, cb in unmatched_cite_words:
            if next_match.b < cb and next_match.b + next_match.size > ca:
                if next_match.b > ca:
                    new_unmatched_cites.append((ca, next_match.b))
                if next_match.b + next_match.size < cb:
                    new_unmatched_cites.append((next_match.b + next_match.size, cb))
            else:
                new_unmatched_cites.append((ca, cb))
        unmatched_cite_words = new_unmatched_cites
    # 3. Convert matching word blocks back to character spans
    char_spans = []
    for i, _j, n in sorted(matching_blocks, key=lambda x: x.a):
        if n == 0:
            continue
        start_char = doc_tokens[i][1]
        end_char = doc_tokens[i + n - 1][2]
        # Merge spans that are adjacent or overlapping in the document.
        if char_spans and char_spans[-1][1] >= start_char - 1:
            char_spans[-1] = (char_spans[-1][0], max(char_spans[-1][1], end_char))
        else:
            char_spans.append((start_char, end_char))
    return char_spans
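

# Minimal usage sketch (the document and citation below are made-up examples,
# not drawn from any real dataset; they simply exercise the two helpers above).
if __name__ == "__main__":
    sample_document = (
        "The quick brown fox jumps over the lazy dog. "
        "It was last seen near the old mill by the river."
    )
    # A citation that reorders two fragments of the document.
    sample_citation = "seen near the old mill, the quick brown fox"

    spans = find_citation_spans(sample_document, sample_citation)
    print("Matched character spans:", spans)

    # Attach the citation as hover text and render the highlighted Markdown.
    spans_with_info = [(s, e, sample_citation) for s, e in spans]
    print(generate_highlighted_markdown(sample_document, spans_with_info))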