import difflib
import html
import re
from typing import List, Tuple


# --- Helper Function for Markdown Highlighting ---
def generate_highlighted_markdown(text, spans_with_info):
    """Applies highlighting spans with hover info to text for Markdown output."""
    # Ensure spans are sorted by start index and valid.
    # Expects spans_with_info to be a list of (start, end, hover_text_string).
    valid_spans = sorted(
        [
            (s, e, info)
            for s, e, info in spans_with_info  # Unpack the tuple
            if isinstance(s, int) and isinstance(e, int) and 0 <= s <= e <= len(text)
        ],
        key=lambda x: x[0],
    )
    highlighted_parts = []
    current_pos = 0
    # Iterate through sorted spans with info
    for start, end, hover_text in valid_spans:
        # Add text before the current span (NO HTML escaping)
        if start > current_pos:
            highlighted_parts.append(text[current_pos:start])
        # Add the highlighted span with a title attribute
        if start < end:
            # Escape hover text for the title attribute
            escaped_hover_text = html.escape(hover_text, quote=True)
            # Escape span content for display
            escaped_content = html.escape(text[start:end])
            highlighted_parts.append(
                f"<span style='background-color: lightgreen;' title='{escaped_hover_text}'>{escaped_content}</span>"
            )
        # Update current position, ensuring it doesn't go backward in case of overlap
        current_pos = max(current_pos, end)
    # Add any remaining text after the last span (NO HTML escaping)
    if current_pos < len(text):
        highlighted_parts.append(text[current_pos:])
    return "".join(highlighted_parts)
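

# A minimal usage sketch (illustrative only, not part of the original app):
# the sample text, span indices, and hover strings below are invented to show
# how generate_highlighted_markdown renders its output.
def _demo_highlighting() -> str:
    text = "The quick brown fox jumps over the lazy dog."
    spans_with_info = [
        (4, 9, "fragment 1"),    # highlights "quick"
        (35, 39, "fragment 2"),  # highlights "lazy"
    ]
    # Returns the text with each span wrapped in a <span title=...> element
    return generate_highlighted_markdown(text, spans_with_info)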


# --- Citation Span Matching Function ---
def find_citation_spans(document: str, citation: str) -> List[Tuple[int, int]]:
    """
    Finds character spans in the document that likely form the citation,
    allowing for fragments and minor differences. Uses SequenceMatcher
    on alphanumeric words and maps the matches back to character indices.
    Follows a greedy iterative strategy, repeatedly taking the longest
    remaining match, to handle cases where citation fragments are reordered
    relative to the document.

    Args:
        document: The source document string.
        citation: The citation string, potentially with fragments/typos.

    Returns:
        A list of (start, end) character tuples from the document,
        representing the most likely origins of the citation fragments.
    """
    # 1. Tokenize document and citation into ALPHANUMERIC words with char spans
    doc_tokens = [
        (m.group(0), m.start(), m.end()) for m in re.finditer(r"[a-zA-Z0-9]+", document)
    ]
    cite_tokens = [
        (m.group(0), m.start(), m.end()) for m in re.finditer(r"[a-zA-Z0-9]+", citation)
    ]
    if not doc_tokens or not cite_tokens:
        return []
    doc_words = [t[0].lower() for t in doc_tokens]
    cite_words = [t[0].lower() for t in cite_tokens]

    # 2. Find longest common blocks of words using SequenceMatcher
    matcher = difflib.SequenceMatcher(None, doc_words, cite_words, autojunk=False)
    matching_blocks = []
    matched_tokens = 0
    unmatched_doc_words = [(0, len(doc_words))]
    unmatched_cite_words = [(0, len(cite_words))]
    while matched_tokens < len(cite_words):
        next_match_candidates = []
        for da, db in unmatched_doc_words:
            for ca, cb in unmatched_cite_words:
                match = matcher.find_longest_match(da, db, ca, cb)
                if match.size > 0:
                    next_match_candidates.append(match)
        if not next_match_candidates:
            break
        next_match = max(next_match_candidates, key=lambda x: x.size)
        matching_blocks.append(next_match)
        matched_tokens += next_match.size

        # Remove the matched region from the unmatched document segments,
        # splitting any segment that overlaps the match
        new_unmatched_docs = []
        for da, db in unmatched_doc_words:
            # Check if this doc segment overlaps with the match
            if next_match.a < db and next_match.a + next_match.size > da:
                # Keep the part of the segment before the match
                if next_match.a > da:
                    new_unmatched_docs.append((da, next_match.a))
                # Keep the part of the segment after the match
                if next_match.a + next_match.size < db:
                    new_unmatched_docs.append((next_match.a + next_match.size, db))
            else:
                new_unmatched_docs.append((da, db))  # Keep non-overlapping segment
        unmatched_doc_words = new_unmatched_docs

        # Do the same for the unmatched citation segments
        new_unmatched_cites = []
        for ca, cb in unmatched_cite_words:
            if next_match.b < cb and next_match.b + next_match.size > ca:
                if next_match.b > ca:
                    new_unmatched_cites.append((ca, next_match.b))
                if next_match.b + next_match.size < cb:
                    new_unmatched_cites.append((next_match.b + next_match.size, cb))
            else:
                new_unmatched_cites.append((ca, cb))
        unmatched_cite_words = new_unmatched_cites

    # 3. Convert matching word blocks back to character spans
    char_spans = []
    for i, _j, n in sorted(matching_blocks, key=lambda x: x.a):
        if n == 0:
            continue
        start_char = doc_tokens[i][1]
        end_char = doc_tokens[i + n - 1][2]
        # Merge with the previous span if they are adjacent in the document
        # (i.e. separated by at most one character)
        if char_spans and char_spans[-1][1] >= start_char - 1:
            char_spans[-1] = (char_spans[-1][0], max(char_spans[-1][1], end_char))
        else:
            char_spans.append((start_char, end_char))
    return char_spans
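

# A minimal end-to-end sketch (assumed usage, not from the original app): the
# document and citation strings below are invented; the citation reorders two
# fragments and misspells "attend" to exercise the greedy matcher.
if __name__ == "__main__":
    document = (
        "Transformers process tokens in parallel. "
        "Attention lets each token attend to every other token."
    )
    citation = "each token attnd to every other token, Transformers process tokens"
    spans = find_citation_spans(document, citation)
    # Expect spans covering "Transformers process tokens", "each token",
    # and "to every other token"; the misspelled word stays unmatched.
    print(spans)
    # Attach a hover label to each span and render it as highlighted Markdown
    hover_spans = [(s, e, f"chars {s}-{e}") for s, e in spans]
    print(generate_highlighted_markdown(document, hover_spans))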