| """ |
| Text normalization utilities for comparing bibliography entries. |
| """ |
| import re |
| import unicodedata |
| from unidecode import unidecode |
|
|
|
|
class TextNormalizer:
    """Utility class for normalizing text for comparison.

    All methods are classmethods; the class carries no instance state.
    The three ``LATEX_*`` tables drive :meth:`normalize_latex` and are
    applied in the order commands -> accents -> characters.
    """

    # (regex, replacement) pairs that unwrap single-argument formatting
    # commands, keeping only the argument text. ``\href{url}{text}`` keeps
    # the link text and drops the URL.
    LATEX_COMMANDS = [
        (r'\\textbf\{([^}]*)\}', r'\1'),
        (r'\\textit\{([^}]*)\}', r'\1'),
        (r'\\emph\{([^}]*)\}', r'\1'),
        (r'\\textrm\{([^}]*)\}', r'\1'),
        (r'\\texttt\{([^}]*)\}', r'\1'),
        (r'\\textsf\{([^}]*)\}', r'\1'),
        (r'\\textsc\{([^}]*)\}', r'\1'),
        (r'\\text\{([^}]*)\}', r'\1'),
        (r'\\mathrm\{([^}]*)\}', r'\1'),
        (r'\\mathbf\{([^}]*)\}', r'\1'),
        (r'\\mathit\{([^}]*)\}', r'\1'),
        (r'\\url\{([^}]*)\}', r'\1'),
        (r'\\href\{[^}]*\}\{([^}]*)\}', r'\1'),
    ]

    # Literal LaTeX sequence -> plain character. Entries are applied with
    # str.replace in insertion order, so a longer sequence MUST precede any
    # shorter sequence it contains ('---' before '--', '``' before '`').
    LATEX_CHARS = {
        r'\&': '&',
        r'\%': '%',
        r'\$': '$',
        r'\#': '#',
        r'\_': '_',
        r'\{': '{',
        r'\}': '}',
        r'\~': '~',
        r'\^': '^',
        r'``': '"',
        r"''": '"',
        r'`': "'",
        r"'": "'",  # identity mapping; kept so the table's keys are unchanged
        r'---': '—',
        r'--': '–',
    }

    # (regex, replacement) pairs that drop an accent command and keep the
    # bare base letter, in both the unbraced (\'e) and braced (\'{e}) forms.
    # The caret and the braces are escaped so they match literally.
    LATEX_ACCENTS = [
        (r"\\'([aeiouAEIOU])", r'\1'),
        (r'\\`([aeiouAEIOU])', r'\1'),
        (r'\\\^([aeiouAEIOU])', r'\1'),
        (r'\\"([aeiouAEIOU])', r'\1'),
        (r'\\~([nNaAoO])', r'\1'),
        (r'\\c\{([cC])\}', r'\1'),
        (r"\\'\{([aeiouAEIOU])\}", r'\1'),
        (r'\\`\{([aeiouAEIOU])\}', r'\1'),
        (r'\\\^\{([aeiouAEIOU])\}', r'\1'),
        (r'\\"\{([aeiouAEIOU])\}', r'\1'),
        (r'\\~\{([nNaAoO])\}', r'\1'),
    ]

    @classmethod
    def normalize_latex(cls, text: str) -> str:
        """Remove LaTeX formatting commands.

        Applies, in order: the command-unwrapping patterns, the accent
        patterns, the literal character replacements, and finally strips
        every remaining ``{`` and ``}``.

        Args:
            text: Raw text that may contain LaTeX markup.

        Returns:
            The text with the recognised markup removed, or ``""`` when
            ``text`` is empty or ``None``.
        """
        if not text:
            return ""

        result = text

        # Unwrap \textbf{...}, \emph{...}, \href{...}{...}, etc.
        for pattern, replacement in cls.LATEX_COMMANDS:
            result = re.sub(pattern, replacement, result)

        # Accents run before LATEX_CHARS so that "\^o" / "\~n" are consumed
        # as accents rather than being turned into a stray "^" / "~".
        for pattern, replacement in cls.LATEX_ACCENTS:
            result = re.sub(pattern, replacement, result)

        # Literal replacements, in table order (longest sequences first).
        for latex_char, normal_char in cls.LATEX_CHARS.items():
            result = result.replace(latex_char, normal_char)

        # Drop any grouping braces that are left over.
        result = re.sub(r'[{}]', '', result)

        return result

    @classmethod
    def normalize_unicode(cls, text: str) -> str:
        """Normalize Unicode characters to ASCII.

        Args:
            text: Text that may contain non-ASCII characters.

        Returns:
            The NFKD-normalised text passed through ``unidecode``, or ``""``
            when ``text`` is empty or ``None``.
        """
        if not text:
            return ""

        # Compatibility decomposition first, then transliteration.
        text = unicodedata.normalize('NFKD', text)
        text = unidecode(text)
        return text

    @classmethod
    def normalize_whitespace(cls, text: str) -> str:
        """Normalize whitespace.

        Args:
            text: Text with arbitrary whitespace.

        Returns:
            The text with every whitespace run collapsed to a single space
            and leading/trailing whitespace stripped; ``""`` for empty input.
        """
        if not text:
            return ""

        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        return text

    @classmethod
    def remove_punctuation(cls, text: str) -> str:
        """Remove punctuation for comparison.

        Deletes every character that is neither a ``\\w`` character nor
        whitespace. Underscores are kept because ``\\w`` includes them.

        Args:
            text: Text to strip.

        Returns:
            The text without punctuation; ``""`` for empty input.
        """
        if not text:
            return ""

        return re.sub(r'[^\w\s]', '', text)

    @classmethod
    def normalize_for_comparison(cls, text: str) -> str:
        """
        Full normalization pipeline for text comparison.

        Steps:
        1. Remove LaTeX formatting
        2. Normalize Unicode to ASCII
        3. Convert to lowercase
        4. Remove punctuation
        5. Normalize whitespace

        Whitespace is normalised last so that removing a free-standing
        punctuation mark (e.g. the dash in ``"a - b"``) cannot leave a
        double space or a leading/trailing space in the result.

        Args:
            text: Raw text to normalise.

        Returns:
            The normalised text, or ``""`` when ``text`` is empty or ``None``.
        """
        if not text:
            return ""

        text = cls.normalize_latex(text)
        text = cls.normalize_unicode(text)
        text = text.lower()
        text = cls.remove_punctuation(text)
        text = cls.normalize_whitespace(text)
        return text

    @classmethod
    def normalize_author_name(cls, name: str) -> str:
        """
        Normalize author name format.
        Handles: "Last, First" and "First Last" formats.
        Returns: normalized "first last" format.

        Only the first comma is treated as the Last/First separator;
        anything after it is moved in front of the part before it.

        Args:
            name: A single author name.

        Returns:
            Lower-cased, punctuation-free name with single spaces, or ``""``
            when ``name`` is empty or ``None``.
        """
        if not name:
            return ""

        name = cls.normalize_latex(name)
        name = cls.normalize_unicode(name)
        name = cls.normalize_whitespace(name)

        # "Last, First" -> "First Last"
        if ',' in name:
            last, first = name.split(',', 1)
            name = f"{first.strip()} {last.strip()}"

        name = name.lower()
        name = cls.remove_punctuation(name)
        # Re-normalise: the swap above (e.g. "Smith,") or punctuation removal
        # (e.g. "a - b") can introduce leading or doubled spaces.
        name = cls.normalize_whitespace(name)
        return name

    @classmethod
    def normalize_author_list(cls, authors: str) -> list[str]:
        """
        Parse and normalize a list of authors.
        Handles "and" as separator and "Last, First" format.

        Args:
            authors: Author string with names separated by the word "and"
                (case-insensitive, surrounded by whitespace).

        Returns:
            The normalised names in their original order; names that
            normalise to an empty string are dropped. ``[]`` for empty input.
        """
        if not authors:
            return []

        author_list = re.split(r'\s+and\s+', authors, flags=re.IGNORECASE)

        candidates = (
            cls.normalize_author_name(author.strip()) for author in author_list
        )
        return [candidate for candidate in candidates if candidate]

    @classmethod
    def similarity_ratio(cls, text1: str, text2: str) -> float:
        """Calculate word-based Jaccard similarity ratio between two strings.

        Words are the whitespace-separated tokens of each string; duplicates
        within a string count once.

        Args:
            text1: First string.
            text2: Second string.

        Returns:
            ``len(intersection) / len(union)`` of the two word sets.
            ``0.0`` if either argument is empty/``None`` or exactly one has
            no words; ``1.0`` if both are non-empty but contain no words
            (whitespace only).
        """
        if not text1 or not text2:
            return 0.0

        words1 = set(text1.split())
        words2 = set(text2.split())

        if not words1 and not words2:
            return 1.0
        if not words1 or not words2:
            return 0.0

        intersection = words1 & words2
        union = words1 | words2

        return len(intersection) / len(union)

    @classmethod
    def levenshtein_similarity(cls, s1: str, s2: str) -> float:
        """Calculate normalized Levenshtein similarity.

        Args:
            s1: First string.
            s2: Second string.

        Returns:
            ``1.0 - distance / max(len(s1), len(s2))`` where ``distance`` is
            the edit distance (insert, delete, substitute, each cost 1).
            ``1.0`` when both are empty, ``0.0`` when exactly one is empty.
        """
        if not s1 and not s2:
            return 1.0
        if not s1 or not s2:
            return 0.0

        m, n = len(s1), len(s2)

        # Only the previous row of the DP table is ever read, so keep two
        # rows instead of the full (m + 1) x (n + 1) matrix.
        previous = list(range(n + 1))
        for i, char1 in enumerate(s1, start=1):
            current = [i]
            for j, char2 in enumerate(s2, start=1):
                if char1 == char2:
                    current.append(previous[j - 1])
                else:
                    current.append(
                        min(previous[j], current[j - 1], previous[j - 1]) + 1
                    )
            previous = current

        max_len = max(m, n)
        distance = previous[n]
        return 1.0 - (distance / max_len)
|
|