# policy-docs-qa/utils/interface_utils.py
# Provenance: uploaded by yjernite (HF Staff), "Upload 2 files",
# commit e672262 (verified), 5.71 kB.
import difflib
import html
import re
from typing import List, Tuple
# --- Helper Function for Markdown Highlighting ---
def generate_highlighted_markdown(text: str, spans_with_info: List[Tuple[int, int, str]]) -> str:
    """Wrap character spans of *text* in highlighted HTML ``<span>`` tags.

    Args:
        text: The raw document text. Text outside highlights is deliberately
            NOT HTML-escaped so any existing Markdown in it still renders.
        spans_with_info: Iterable of ``(start, end, hover_text)`` tuples with
            character offsets into *text* and a tooltip string. Spans with
            non-int bounds or out-of-range offsets are silently dropped.

    Returns:
        A Markdown/HTML string where each valid span is wrapped in a
        light-green ``<span>`` carrying ``hover_text`` in its ``title``
        attribute (escaped for the attribute context).
    """
    # Keep only well-formed spans and process them in order of appearance.
    valid_spans = sorted(
        [
            (s, e, info)
            for s, e, info in spans_with_info
            if isinstance(s, int) and isinstance(e, int) and 0 <= s <= e <= len(text)
        ],
        key=lambda x: x[0],
    )
    highlighted_parts = []
    current_pos = 0
    for start, end, hover_text in valid_spans:
        # BUG FIX: clip spans that begin inside an already-emitted region.
        # Previously an overlapping span re-emitted text[start:end] in full,
        # duplicating the characters between start and current_pos.
        start = max(start, current_pos)
        # Plain text before the span passes through un-escaped on purpose.
        if start > current_pos:
            highlighted_parts.append(text[current_pos:start])
        if start < end:
            # Escape the tooltip (attribute context) and the span body.
            escaped_hover_text = html.escape(hover_text, quote=True)
            escaped_content = html.escape(text[start:end])
            highlighted_parts.append(
                f"<span style='background-color: lightgreen;' title='{escaped_hover_text}'>{escaped_content}</span>"
            )
        # Never move backward, even when spans overlap.
        current_pos = max(current_pos, end)
    # Trailing text after the last span (again, not escaped).
    if current_pos < len(text):
        highlighted_parts.append(text[current_pos:])
    return "".join(highlighted_parts)
# --- Citation Span Matching Function ---
def find_citation_spans(document: str, citation: str) -> List[Tuple[int, int]]:
    """
    Finds character spans in the document that likely form the citation,
    allowing for fragments and minor differences. Uses SequenceMatcher
    on alphanumeric words and maps back to character indices.
    This follows a greedy iterative strategy to find the longest match to account for cases where fragments are reordered.
    Args:
        document: The source document string.
        citation: The citation string, potentially with fragments/typos.
    Returns:
        A list of (start, end) character tuples from the document,
        representing the most likely origins of the citation fragments.
    """
    # 1. Tokenize document and citation into ALPHANUMERIC words with char spans
    # Each token is (word, start_char, end_char); punctuation/whitespace are
    # ignored so minor formatting differences don't break the match.
    doc_tokens = [
        (m.group(0), m.start(), m.end()) for m in re.finditer(r"[a-zA-Z0-9]+", document)
    ]
    cite_tokens = [
        (m.group(0), m.start(), m.end()) for m in re.finditer(r"[a-zA-Z0-9]+", citation)
    ]
    if not doc_tokens or not cite_tokens:
        return []
    # Case-insensitive matching: compare lowercased words, keep original offsets.
    doc_words = [t[0].lower() for t in doc_tokens]
    cite_words = [t[0].lower() for t in cite_tokens]
    # 2. Find longest common blocks of words using SequenceMatcher
    # autojunk=False so frequent words ("the", "of") are not treated as junk.
    matcher = difflib.SequenceMatcher(None, doc_words, cite_words, autojunk=False)
    matching_blocks = []  # accepted difflib.Match namedtuples (a, b, size)
    matched_tokens = 0  # how many citation words have been accounted for
    # Unmatched regions are half-open word-index intervals [start, end).
    unmatched_doc_words = [(0, len(doc_words))]
    unmatched_cite_words = [(0, len(cite_words))]
    # Greedy loop: repeatedly take the single longest match across every
    # (doc-region x cite-region) pair, then carve that match out of both
    # sides. Handles citation fragments that appear reordered in the doc.
    while matched_tokens < len(cite_words):
        next_match_candidates = []
        for da, db in unmatched_doc_words:
            for ca, cb in unmatched_cite_words:
                # find_longest_match searches a[da:db] vs b[ca:cb].
                match = matcher.find_longest_match(da, db, ca, cb)
                if match.size > 0:
                    next_match_candidates.append(match)
        if len(next_match_candidates) == 0:
            break  # nothing left to match; remaining cite words are unexplained
        # NOTE: max() keeps the first maximal candidate, so ties resolve in
        # region-iteration order — results depend on this ordering.
        next_match = max(next_match_candidates, key=lambda x: x.size)
        matching_blocks.append(next_match)
        matched_tokens += next_match.size
        # Update unmatched regions (this part needs careful implementation)
        # Simplified logic: remove fully contained regions and split overlapping ones
        new_unmatched_docs = []
        for da, db in unmatched_doc_words:
            # Check if this doc segment overlaps with the match
            if next_match.a < db and next_match.a + next_match.size > da:
                # Add segment before the match
                if next_match.a > da:
                    new_unmatched_docs.append((da, next_match.a))
                # Add segment after the match
                if next_match.a + next_match.size < db:
                    new_unmatched_docs.append((next_match.a + next_match.size, db))
            else:
                new_unmatched_docs.append((da, db))  # Keep non-overlapping segment
        unmatched_doc_words = new_unmatched_docs
        # Same carve-out for the citation side, using the match's b offset.
        new_unmatched_cites = []
        for ca, cb in unmatched_cite_words:
            if next_match.b < cb and next_match.b + next_match.size > ca:
                if next_match.b > ca:
                    new_unmatched_cites.append((ca, next_match.b))
                if next_match.b + next_match.size < cb:
                    new_unmatched_cites.append((next_match.b + next_match.size, cb))
            else:
                new_unmatched_cites.append((ca, cb))
        unmatched_cite_words = new_unmatched_cites
    # 3. Convert matching word blocks back to character spans
    # Sort by document position; merge spans that touch or overlap (gap <= 1
    # char, i.e. adjacent words separated by a single space/punct character).
    char_spans = []
    for i, j, n in sorted(matching_blocks, key=lambda x: x.a):
        if n == 0:
            continue
        start_char = doc_tokens[i][1]  # start of first word in the block
        end_char = doc_tokens[i + n - 1][2]  # end of last word in the block
        if char_spans and char_spans[-1][1] >= start_char - 1:
            char_spans[-1] = (char_spans[-1][0], max(char_spans[-1][1], end_char))
        else:
            char_spans.append((start_char, end_char))
    return char_spans