# policy-docs-qa/utils/interface_utils.py
# Provenance: uploaded by yjernite (HF Staff), "Upload 2 files",
# commit e672262 (verified), 5.71 kB.
import difflib
import html
import re
from typing import List, Tuple
# --- Helper Function for Markdown Highlighting ---
def generate_highlighted_markdown(text: str, spans_with_info: List[Tuple[int, int, str]]) -> str:
    """Wrap character spans of *text* in highlighted HTML ``<span>`` tags.

    Args:
        text: The raw document text. Text outside highlights is deliberately
            NOT HTML-escaped so any existing Markdown in it still renders.
        spans_with_info: Iterable of ``(start, end, hover_text)`` tuples with
            character offsets into *text* and a tooltip string. Spans with
            non-int bounds or out-of-range offsets are silently dropped.

    Returns:
        A Markdown/HTML string where each valid span is wrapped in a
        light-green ``<span>`` carrying ``hover_text`` in its ``title``
        attribute (escaped for the attribute context).
    """
    # Keep only well-formed spans and process them in order of appearance.
    valid_spans = sorted(
        [
            (s, e, info)
            for s, e, info in spans_with_info
            if isinstance(s, int) and isinstance(e, int) and 0 <= s <= e <= len(text)
        ],
        key=lambda x: x[0],
    )
    highlighted_parts = []
    current_pos = 0
    for start, end, hover_text in valid_spans:
        # BUG FIX: clip spans that begin inside an already-emitted region.
        # Previously an overlapping span re-emitted text[start:end] in full,
        # duplicating the characters between start and current_pos.
        start = max(start, current_pos)
        # Plain text before the span passes through un-escaped on purpose.
        if start > current_pos:
            highlighted_parts.append(text[current_pos:start])
        if start < end:
            # Escape the tooltip (attribute context) and the span body.
            escaped_hover_text = html.escape(hover_text, quote=True)
            escaped_content = html.escape(text[start:end])
            highlighted_parts.append(
                f"<span style='background-color: lightgreen;' title='{escaped_hover_text}'>{escaped_content}</span>"
            )
        # Never move backward, even when spans overlap.
        current_pos = max(current_pos, end)
    # Trailing text after the last span (again, not escaped).
    if current_pos < len(text):
        highlighted_parts.append(text[current_pos:])
    return "".join(highlighted_parts)
# --- Citation Span Matching Function ---
def find_citation_spans(document: str, citation: str) -> List[Tuple[int, int]]:
    """
    Finds character spans in the document that likely form the citation,
    allowing for fragments and minor differences. Uses SequenceMatcher
    on alphanumeric words and maps back to character indices.
    This follows a greedy iterative strategy to find the longest match to account for cases where fragments are reordered.
    Args:
        document: The source document string.
        citation: The citation string, potentially with fragments/typos.
    Returns:
        A list of (start, end) character tuples from the document,
        representing the most likely origins of the citation fragments.
    """
    # 1. Tokenize document and citation into ALPHANUMERIC words with char spans
    # Each token is (word, start_char, end_char); punctuation/whitespace are
    # ignored so minor formatting differences don't break the match.
    doc_tokens = [
        (m.group(0), m.start(), m.end()) for m in re.finditer(r"[a-zA-Z0-9]+", document)
    ]
    cite_tokens = [
        (m.group(0), m.start(), m.end()) for m in re.finditer(r"[a-zA-Z0-9]+", citation)
    ]
    if not doc_tokens or not cite_tokens:
        return []
    # Case-insensitive matching: compare lowercased words, keep original offsets.
    doc_words = [t[0].lower() for t in doc_tokens]
    cite_words = [t[0].lower() for t in cite_tokens]
    # 2. Find longest common blocks of words using SequenceMatcher
    # autojunk=False so frequent words ("the", "of") are not treated as junk.
    matcher = difflib.SequenceMatcher(None, doc_words, cite_words, autojunk=False)
    matching_blocks = []  # accepted difflib.Match namedtuples (a, b, size)
    matched_tokens = 0  # how many citation words have been accounted for
    # Unmatched regions are half-open word-index intervals [start, end).
    unmatched_doc_words = [(0, len(doc_words))]
    unmatched_cite_words = [(0, len(cite_words))]
    # Greedy loop: repeatedly take the single longest match across every
    # (doc-region x cite-region) pair, then carve that match out of both
    # sides. Handles citation fragments that appear reordered in the doc.
    while matched_tokens < len(cite_words):
        next_match_candidates = []
        for da, db in unmatched_doc_words:
            for ca, cb in unmatched_cite_words:
                # find_longest_match searches a[da:db] vs b[ca:cb].
                match = matcher.find_longest_match(da, db, ca, cb)
                if match.size > 0:
                    next_match_candidates.append(match)
        if len(next_match_candidates) == 0:
            break  # nothing left to match; remaining cite words are unexplained
        # NOTE: max() keeps the first maximal candidate, so ties resolve in
        # region-iteration order — results depend on this ordering.
        next_match = max(next_match_candidates, key=lambda x: x.size)
        matching_blocks.append(next_match)
        matched_tokens += next_match.size
        # Update unmatched regions (this part needs careful implementation)
        # Simplified logic: remove fully contained regions and split overlapping ones
        new_unmatched_docs = []
        for da, db in unmatched_doc_words:
            # Check if this doc segment overlaps with the match
            if next_match.a < db and next_match.a + next_match.size > da:
                # Add segment before the match
                if next_match.a > da:
                    new_unmatched_docs.append((da, next_match.a))
                # Add segment after the match
                if next_match.a + next_match.size < db:
                    new_unmatched_docs.append((next_match.a + next_match.size, db))
            else:
                new_unmatched_docs.append((da, db))  # Keep non-overlapping segment
        unmatched_doc_words = new_unmatched_docs
        # Same carve-out for the citation side, using the match's b offset.
        new_unmatched_cites = []
        for ca, cb in unmatched_cite_words:
            if next_match.b < cb and next_match.b + next_match.size > ca:
                if next_match.b > ca:
                    new_unmatched_cites.append((ca, next_match.b))
                if next_match.b + next_match.size < cb:
                    new_unmatched_cites.append((next_match.b + next_match.size, cb))
            else:
                new_unmatched_cites.append((ca, cb))
        unmatched_cite_words = new_unmatched_cites
    # 3. Convert matching word blocks back to character spans
    # Sort by document position; merge spans that touch or overlap (gap <= 1
    # char, i.e. adjacent words separated by a single space/punct character).
    char_spans = []
    for i, j, n in sorted(matching_blocks, key=lambda x: x.a):
        if n == 0:
            continue
        start_char = doc_tokens[i][1]  # start of first word in the block
        end_char = doc_tokens[i + n - 1][2]  # end of last word in the block
        if char_spans and char_spans[-1][1] >= start_char - 1:
            char_spans[-1] = (char_spans[-1][0], max(char_spans[-1][1], end_char))
        else:
            char_spans.append((start_char, end_char))
    return char_spans