| """ |
| Text preprocessing for PubGuard. |
| |
| Designed for text *already extracted from PDFs* (e.g. via pdfplumber, |
| PyMuPDF, or GROBID in the PubVerse pipeline). Focuses on cleaning |
| OCR / layout artefacts and producing a compact representation that |
| captures enough signal for the three classification heads. |
| """ |
|
|
| import re |
| from typing import Optional |
|
|
| |
|
|
| _WHITESPACE = re.compile(r"\s+") |
| _HEADER_JUNK = re.compile( |
| r"(doi:\s*\S+|https?://\S+|Β©\s*\d{4}|all rights reserved)", |
| re.IGNORECASE, |
| ) |
| _PAGE_NUMBER = re.compile(r"\n\s*\d{1,4}\s*\n") |
| _LIGATURE = re.compile(r"[ο¬ο¬ο¬ο¬ο¬]") |
|
|
| |
# Case-insensitive vocabulary of section headings typical of scholarly papers.
# NOTE: the whole alternation sits in ONE capturing group, so findall()
# returns the heading text itself (group 1), not full-match tuples.
SECTION_HEADINGS = re.compile(
    r"\b(abstract|introduction|methods?|methodology|results|discussion|"
    r"conclusions?|references|bibliography|acknowledgments?|funding|"
    r"supplementary|materials?\s+and\s+methods?|related\s+work|"
    r"background|literature\s+review|experimental|data\s+availability)\b",
    re.IGNORECASE,
)


# In-text citation markers: numeric "[12]", author-year "(Smith et al., 2020)"
# and "(Smith, 2020)". No IGNORECASE flag here, so "ET AL." variants and
# upper-cased names in the \w+ part still match, but "Et Al." spacing/case in
# the literal "et al." portion must be lowercase to be counted.
CITATION_PATTERN = re.compile(
    r"\[\d+\]|\(\w+\s+et\s+al\.\s*,?\s*\d{4}\)|\(\w+,\s*\d{4}\)",
)
|
|
|
|
def clean_text(text: Optional[str], max_chars: int = 4000) -> str:
    """
    Normalise raw PDF-extracted text for embedding.

    Steps:
        1. Replace ligatures (U+FB00..U+FB04) with ASCII equivalents.
        2. Strip DOIs, URLs, copyright lines.
        3. Remove isolated page numbers.
        4. Collapse whitespace.
        5. Truncate to `max_chars`.

    Parameters
    ----------
    text : Optional[str]
        Raw extracted text. ``None`` / empty yields ``""``; non-str input
        is coerced with ``str()``.
    max_chars : int
        Maximum length of the returned string (truncation may cut mid-word).
    """
    if not text:
        return ""

    if not isinstance(text, str):
        text = str(text)

    # Map ligature code points directly with str.translate: one C-level pass,
    # and explicit escapes avoid mojibake — literal ligature characters in
    # source code are easily corrupted by encoding round-trips (the original
    # lambda's dict keys were corrupted into duplicates for that reason).
    text = text.translate({
        0xFB00: "ff",
        0xFB01: "fi",
        0xFB02: "fl",
        0xFB03: "ffi",
        0xFB04: "ffl",
    })

    text = _HEADER_JUNK.sub(" ", text)
    text = _PAGE_NUMBER.sub("\n", text)
    text = _WHITESPACE.sub(" ", text).strip()

    return text[:max_chars]
|
|
|
|
def extract_structural_features(text: str) -> dict:
    """
    Cheap heuristic features that augment the embedding signal.

    Returns a dict of float features (0-1 range) that the linear
    head can concatenate with the embedding vector. The key set and
    order must stay in sync with `_empty_features()`.
    """
    if not text:
        return _empty_features()

    n_chars = len(text)
    n_words = len(text.split())

    # Normalise matched headings: lowercase and collapse internal whitespace,
    # so multi-word matches like "Materials  and\nMethods" compare equal to
    # their canonical single-spaced form.
    headings = SECTION_HEADINGS.findall(text)
    unique_headings = {_WHITESPACE.sub(" ", h.lower()) for h in headings}

    citations = CITATION_PATTERN.findall(text)

    # Character-class composition of the document.
    alpha = sum(c.isalpha() for c in text)
    digit = sum(c.isdigit() for c in text)
    upper = sum(c.isupper() for c in text)

    return {
        # Saturating linear length scalings. The "log_" names are kept for
        # backward compatibility even though the scaling is not logarithmic.
        "log_chars": min(1.0, n_chars / 4000),
        "log_words": min(1.0, n_words / 800),

        # Section-structure signals.
        "n_unique_sections": min(1.0, len(unique_headings) / 8),
        "has_abstract": float("abstract" in unique_headings),
        # The regex also matches singular "method", "methodology", and the
        # "materials and methods" variants; a substring check counts them all
        # (an exact-name set silently missed those matched variants).
        "has_methods": float(any("method" in h for h in unique_headings)),
        "has_references": float(bool(unique_headings & {"references", "bibliography"})),
        "has_introduction": float("introduction" in unique_headings),
        "has_results": float("results" in unique_headings),
        "has_discussion": float("discussion" in unique_headings),

        # Citations per 100 words, capped at 1.0.
        "citation_density": min(1.0, len(citations) / max(n_words, 1) * 100),

        # Character-class ratios; max(..., 1) guards division by zero.
        "alpha_ratio": alpha / max(n_chars, 1),
        "digit_ratio": digit / max(n_chars, 1),
        "upper_ratio": upper / max(alpha, 1),

        # Mean words per sentence, scaled so ~50-word sentences saturate.
        "mean_sentence_len": min(1.0, _mean_sentence_length(text) / 50),
    }
|
|
|
|
| def _mean_sentence_length(text: str) -> float: |
| """Average words per sentence (rough split on .!?).""" |
| sentences = re.split(r"[.!?]+", text) |
| sentences = [s.strip() for s in sentences if s.strip()] |
| if not sentences: |
| return 0.0 |
| return sum(len(s.split()) for s in sentences) / len(sentences) |
|
|
|
|
| def _empty_features() -> dict: |
| return { |
| "log_chars": 0.0, "log_words": 0.0, |
| "n_unique_sections": 0.0, |
| "has_abstract": 0.0, "has_methods": 0.0, |
| "has_references": 0.0, "has_introduction": 0.0, |
| "has_results": 0.0, "has_discussion": 0.0, |
| "citation_density": 0.0, |
| "alpha_ratio": 0.0, "digit_ratio": 0.0, "upper_ratio": 0.0, |
| "mean_sentence_len": 0.0, |
| } |
|
|
|
|
# Canonical ordering of the structural feature vector, derived from the
# zero-feature dict so the name list can never drift out of sync with it.
STRUCTURAL_FEATURE_NAMES = list(_empty_features().keys())
N_STRUCTURAL_FEATURES = len(STRUCTURAL_FEATURE_NAMES)
|
|