import difflib
import html
import re
from typing import List, Tuple


# --- Helper Function for Markdown Highlighting ---
def generate_highlighted_markdown(
    text: str, spans_with_info: List[Tuple[int, int, str]]
) -> str:
    """Applies highlighting spans with hover info to text for Markdown output."""
    # Ensure spans are sorted by start index and valid
    # Expects spans_with_info to be list of (start, end, hover_text_string)
    valid_spans = sorted(
        [
            (s, e, info)
            for s, e, info in spans_with_info  # Unpack the tuple
            if isinstance(s, int) and isinstance(e, int) and 0 <= s <= e <= len(text)
        ],
        key=lambda x: x[0],
    )
    highlighted_parts = []
    current_pos = 0
    # Iterate through sorted spans with info
    for start, end, hover_text in valid_spans:
        # Add text before the current span (NO HTML escaping)
        if start > current_pos:
            highlighted_parts.append(text[current_pos:start])
        # Add the highlighted span with title attribute
        if start < end:
            # Escape hover text for the title attribute
            escaped_hover_text = html.escape(hover_text, quote=True)
            # Escape span content for display
            escaped_content = html.escape(text[start:end])
            # Inline style keeps the highlight visible when the Markdown is
            # rendered as HTML; the title attribute carries the hover text.
            highlighted_parts.append(
                f'<span style="background-color: yellow" '
                f'title="{escaped_hover_text}">{escaped_content}</span>'
            )
        # Update current position, ensuring it doesn't go backward in case of overlap
        current_pos = max(current_pos, end)
    # Add any remaining text after the last span (NO HTML escaping)
    if current_pos < len(text):
        highlighted_parts.append(text[current_pos:])
    return "".join(highlighted_parts)
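
# Illustrative usage (the example strings are invented, and the exact output
# depends on the span template above):
#
#     >>> generate_highlighted_markdown(
#     ...     "The quick brown fox.", [(10, 19, 'source: "doc 1"')]
#     ... )
#     'The quick <span style="background-color: yellow" title="source: &quot;doc 1&quot;">brown fox</span>.'
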
# --- Citation Span Matching Function ---
def find_citation_spans(document: str, citation: str) -> List[Tuple[int, int]]:
    """
    Finds character spans in the document that likely form the citation,
    allowing for fragments and minor differences.

    Uses SequenceMatcher on alphanumeric words and maps back to character
    indices. Follows a greedy iterative strategy that repeatedly takes the
    longest remaining match, to account for cases where fragments are
    reordered.

    Args:
        document: The source document string.
        citation: The citation string, potentially with fragments/typos.

    Returns:
        A list of (start, end) character tuples from the document,
        representing the most likely origins of the citation fragments.
    """
    # 1. Tokenize document and citation into ALPHANUMERIC words with char spans
    doc_tokens = [
        (m.group(0), m.start(), m.end())
        for m in re.finditer(r"[a-zA-Z0-9]+", document)
    ]
    cite_tokens = [
        (m.group(0), m.start(), m.end())
        for m in re.finditer(r"[a-zA-Z0-9]+", citation)
    ]
    if not doc_tokens or not cite_tokens:
        return []
    doc_words = [t[0].lower() for t in doc_tokens]
    cite_words = [t[0].lower() for t in cite_tokens]

    # 2. Find longest common blocks of words using SequenceMatcher
    matcher = difflib.SequenceMatcher(None, doc_words, cite_words, autojunk=False)
    matching_blocks = []
    matched_tokens = 0
    unmatched_doc_words = [(0, len(doc_words))]
    unmatched_cite_words = [(0, len(cite_words))]
    while matched_tokens < len(cite_words):
        # Find the longest match across all remaining unmatched windows
        next_match_candidates = []
        for da, db in unmatched_doc_words:
            for ca, cb in unmatched_cite_words:
                match = matcher.find_longest_match(da, db, ca, cb)
                if match.size > 0:
                    next_match_candidates.append(match)
        if not next_match_candidates:
            break
        next_match = max(next_match_candidates, key=lambda x: x.size)
        matching_blocks.append(next_match)
        matched_tokens += next_match.size

        # Update unmatched regions (this part needs careful implementation)
        # Simplified logic: remove fully contained regions and split overlapping ones
        new_unmatched_docs = []
        for da, db in unmatched_doc_words:
            # Check if this doc segment overlaps with the match
            if next_match.a < db and next_match.a + next_match.size > da:
                # Add segment before the match
                if next_match.a > da:
                    new_unmatched_docs.append((da, next_match.a))
                # Add segment after the match
                if next_match.a + next_match.size < db:
                    new_unmatched_docs.append((next_match.a + next_match.size, db))
            else:
                # Keep non-overlapping segment
                new_unmatched_docs.append((da, db))
        unmatched_doc_words = new_unmatched_docs

        new_unmatched_cites = []
        for ca, cb in unmatched_cite_words:
            if next_match.b < cb and next_match.b + next_match.size > ca:
                if next_match.b > ca:
                    new_unmatched_cites.append((ca, next_match.b))
                if next_match.b + next_match.size < cb:
                    new_unmatched_cites.append((next_match.b + next_match.size, cb))
            else:
                new_unmatched_cites.append((ca, cb))
        unmatched_cite_words = new_unmatched_cites

    # 3. Convert matching word blocks back to character spans
    char_spans = []
    for i, _j, n in sorted(matching_blocks, key=lambda x: x.a):
        if n == 0:
            continue
        start_char = doc_tokens[i][1]
        end_char = doc_tokens[i + n - 1][2]
        # Merge spans that touch or overlap in the document
        if char_spans and char_spans[-1][1] >= start_char - 1:
            char_spans[-1] = (char_spans[-1][0], max(char_spans[-1][1], end_char))
        else:
            char_spans.append((start_char, end_char))
    return char_spans
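

# Minimal smoke test (a sketch; the document and citation strings below are
# invented for illustration and are not part of the module's test suite):
if __name__ == "__main__":
    doc = (
        "The committee found that emissions fell by 12 percent in 2020, "
        "largely due to reduced industrial activity."
    )
    cite = "emissions fell by 12 percent ... reduced industrial activity"
    spans = find_citation_spans(doc, cite)
    print(spans)
    # Render the matched spans as highlighted Markdown with hover text
    print(
        generate_highlighted_markdown(
            doc, [(s, e, "matched citation fragment") for s, e in spans]
        )
    )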