| """ |
| Text preprocessing for PubGuard. |
| |
| Designed for text *already extracted from PDFs* (e.g. via pdfplumber, |
| PyMuPDF, or GROBID in the PubVerse pipeline). Focuses on cleaning |
| OCR / layout artefacts and producing a compact representation that |
| captures enough signal for the three classification heads. |
| """ |
|
|
| import re |
| from typing import Optional |
|
|
| |
|
|
| _WHITESPACE = re.compile(r"\s+") |
| _HEADER_JUNK = re.compile( |
| r"(doi:\s*\S+|https?://\S+|Β©\s*\d{4}|all rights reserved)", |
| re.IGNORECASE, |
| ) |
| _PAGE_NUMBER = re.compile(r"\n\s*\d{1,4}\s*\n") |
| _LIGATURE = re.compile(r"[ο¬ο¬ο¬ο¬ο¬]") |
|
|
| |
# Case-insensitive vocabulary of section headings typical of scholarly papers.
# NOTE: the whole alternation sits in ONE capturing group, so findall()
# returns the heading text itself (group 1), not full-match tuples.
SECTION_HEADINGS = re.compile(
    r"\b(abstract|introduction|methods?|methodology|results|discussion|"
    r"conclusions?|references|bibliography|acknowledgments?|funding|"
    r"supplementary|materials?\s+and\s+methods?|related\s+work|"
    r"background|literature\s+review|experimental|data\s+availability)\b",
    re.IGNORECASE,
)


# In-text citation markers: numeric "[12]", author-year "(Smith et al., 2020)"
# and "(Smith, 2020)". No IGNORECASE flag here, so "ET AL." variants and
# upper-cased names in the \w+ part still match, but "Et Al." spacing/case in
# the literal "et al." portion must be lowercase to be counted.
CITATION_PATTERN = re.compile(
    r"\[\d+\]|\(\w+\s+et\s+al\.\s*,?\s*\d{4}\)|\(\w+,\s*\d{4}\)",
)
|
|
|
|
def clean_text(text: Optional[str], max_chars: int = 4000) -> str:
    """
    Normalise raw PDF-extracted text for embedding.

    Steps:
        1. Replace ligatures (U+FB00..U+FB04) with ASCII equivalents.
        2. Strip DOIs, URLs, copyright lines.
        3. Remove isolated page numbers.
        4. Collapse whitespace.
        5. Truncate to `max_chars`.

    Parameters
    ----------
    text : Optional[str]
        Raw extracted text. ``None`` / empty yields ``""``; non-str input
        is coerced with ``str()``.
    max_chars : int
        Maximum length of the returned string (truncation may cut mid-word).
    """
    if not text:
        return ""

    if not isinstance(text, str):
        text = str(text)

    # Map ligature code points directly with str.translate: one C-level pass,
    # and explicit escapes avoid mojibake — literal ligature characters in
    # source code are easily corrupted by encoding round-trips (the original
    # lambda's dict keys were corrupted into duplicates for that reason).
    text = text.translate({
        0xFB00: "ff",
        0xFB01: "fi",
        0xFB02: "fl",
        0xFB03: "ffi",
        0xFB04: "ffl",
    })

    text = _HEADER_JUNK.sub(" ", text)
    text = _PAGE_NUMBER.sub("\n", text)
    text = _WHITESPACE.sub(" ", text).strip()

    return text[:max_chars]
|
|
|
|
def extract_structural_features(text: str) -> dict:
    """
    Cheap heuristic features that augment the embedding signal.

    Returns a dict of float features (0-1 range) that the linear
    head can concatenate with the embedding vector. The key set and
    order must stay in sync with `_empty_features()`.
    """
    if not text:
        return _empty_features()

    n_chars = len(text)
    n_words = len(text.split())

    # Normalise matched headings: lowercase and collapse internal whitespace,
    # so multi-word matches like "Materials  and\nMethods" compare equal to
    # their canonical single-spaced form.
    headings = SECTION_HEADINGS.findall(text)
    unique_headings = {_WHITESPACE.sub(" ", h.lower()) for h in headings}

    citations = CITATION_PATTERN.findall(text)

    # Character-class composition of the document.
    alpha = sum(c.isalpha() for c in text)
    digit = sum(c.isdigit() for c in text)
    upper = sum(c.isupper() for c in text)

    return {
        # Saturating linear length scalings. The "log_" names are kept for
        # backward compatibility even though the scaling is not logarithmic.
        "log_chars": min(1.0, n_chars / 4000),
        "log_words": min(1.0, n_words / 800),

        # Section-structure signals.
        "n_unique_sections": min(1.0, len(unique_headings) / 8),
        "has_abstract": float("abstract" in unique_headings),
        # The regex also matches singular "method", "methodology", and the
        # "materials and methods" variants; a substring check counts them all
        # (an exact-name set silently missed those matched variants).
        "has_methods": float(any("method" in h for h in unique_headings)),
        "has_references": float(bool(unique_headings & {"references", "bibliography"})),
        "has_introduction": float("introduction" in unique_headings),
        "has_results": float("results" in unique_headings),
        "has_discussion": float("discussion" in unique_headings),

        # Citations per 100 words, capped at 1.0.
        "citation_density": min(1.0, len(citations) / max(n_words, 1) * 100),

        # Character-class ratios; max(..., 1) guards division by zero.
        "alpha_ratio": alpha / max(n_chars, 1),
        "digit_ratio": digit / max(n_chars, 1),
        "upper_ratio": upper / max(alpha, 1),

        # Mean words per sentence, scaled so ~50-word sentences saturate.
        "mean_sentence_len": min(1.0, _mean_sentence_length(text) / 50),
    }
|
|
|
|
| def _mean_sentence_length(text: str) -> float: |
| """Average words per sentence (rough split on .!?).""" |
| sentences = re.split(r"[.!?]+", text) |
| sentences = [s.strip() for s in sentences if s.strip()] |
| if not sentences: |
| return 0.0 |
| return sum(len(s.split()) for s in sentences) / len(sentences) |
|
|
|
|
| def _empty_features() -> dict: |
| return { |
| "log_chars": 0.0, "log_words": 0.0, |
| "n_unique_sections": 0.0, |
| "has_abstract": 0.0, "has_methods": 0.0, |
| "has_references": 0.0, "has_introduction": 0.0, |
| "has_results": 0.0, "has_discussion": 0.0, |
| "citation_density": 0.0, |
| "alpha_ratio": 0.0, "digit_ratio": 0.0, "upper_ratio": 0.0, |
| "mean_sentence_len": 0.0, |
| } |
|
|
|
|
# Canonical ordering of the structural feature vector, derived from the
# zero-feature dict so the name list can never drift out of sync with it.
STRUCTURAL_FEATURE_NAMES = list(_empty_features().keys())
N_STRUCTURAL_FEATURES = len(STRUCTURAL_FEATURE_NAMES)
|
|