Spaces:

Nomio4640
/

NLP-intelligence

Sleeping

App Files Files Community

NLP-intelligence / nlp_core /ner_engine.py

Nomio4640

bert chunk problem

d4ff564 28 days ago

raw

history blame contribute delete

8.54 kB

	"""
	NER Engine — Named Entity Recognition using HuggingFace Transformers.
	Wraps the Nomio4640/ner-mongolian fine-tuned model.

	Long-text handling:
	BERT has a 512-token hard limit. Long social-media posts (especially
	Google reviews, long Facebook posts) are silently truncated, causing
	entities in the second half to be completely missed.

	Fix: texts longer than MAX_CHUNK_CHARS are split, at sentence boundaries
	where possible, into consecutive non-overlapping chunks. Each chunk is
	processed independently and the character offsets from each chunk are
	corrected before merging. Duplicate entities at chunk boundaries are
	deduplicated by (word, start) key.
	"""

	from typing import List, Tuple
	from .models import EntityResult


	HF_MODEL_ID = "Nomio4640/ner-mongolian"

	# ~400-450 Mongolian Cyrillic tokens ≈ 1 200-1 500 characters.
	# Keeping well below 512 BERT tokens leaves room for tokenizer overhead.
	MAX_CHUNK_CHARS = 1_300


	class NEREngine:
	    """Named Entity Recognition service using HuggingFace pipeline.

	    The pipeline is created lazily on first use. Texts longer than
	    MAX_CHUNK_CHARS are split into chunks, processed chunk by chunk, and
	    the entity offsets are mapped back to positions in the original text.
	    """

	    def __init__(self, model_name: str = None):
	        """
	        Decide which model the pipeline will load (nothing is loaded yet).

	        Args:
	            model_name: Explicit model path or Hub id. When falsy, the
	                ``adapters/ner_mongolian`` directory located one level
	                above this file's directory is used if it contains
	                ``model.safetensors``; otherwise HF_MODEL_ID is used.
	        """
	        import os
	        # Use local model if it exists, otherwise fall back to HuggingFace Hub
	        local_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "adapters", "ner_mongolian")
	        if model_name:
	            self.model_name = model_name
	        elif os.path.exists(os.path.join(local_path, "model.safetensors")):
	            self.model_name = local_path
	        else:
	            self.model_name = HF_MODEL_ID
	        self._pipeline = None

	    def _load_pipeline(self):
	        """Lazy-load the NER pipeline (heavy model, load only when needed)."""
	        if self._pipeline is None:
	            import torch
	            from transformers import pipeline
	            device = 0 if torch.cuda.is_available() else -1
	            self._pipeline = pipeline(
	                "ner",
	                model=self.model_name,
	                aggregation_strategy="simple",
	                device=device,
	            )
	            print(f"[NEREngine] Loaded on {'GPU' if device == 0 else 'CPU'}")
	        return self._pipeline

	    def _clean_entities(self, raw_entities: List[dict]) -> List[dict]:
	        """
	        Merge subword tokens (## prefixed) back together.

	        A "##" piece is appended to the previous entity only when it starts
	        exactly where that entity ends, or when either offset is missing
	        (in which case adjacency cannot be checked). The merged entity's
	        ``end`` is extended to cover the appended piece. A "##" piece with
	        no adjacent predecessor is kept as its own entity, unchanged.

	        Input dicts are copied; the caller's list and dicts are not mutated.
	        """
	        cleaned: List[dict] = []
	        for ent in raw_entities:
	            word = ent.get("word", "")
	            if word.startswith("##") and cleaned:
	                prev = cleaned[-1]
	                prev_end = prev.get("end")
	                cur_start = ent.get("start")
	                if prev_end is None or cur_start is None or prev_end == cur_start:
	                    prev["word"] = prev.get("word", "") + word.replace("##", "")
	                    # Keep the span in sync with the merged surface form.
	                    if ent.get("end") is not None:
	                        prev["end"] = ent["end"]
	                    continue
	            cleaned.append(dict(ent))
	        return cleaned

	    @staticmethod
	    def _to_result(ent: dict, offset: int = 0) -> "EntityResult":
	        """
	        Build an EntityResult from one pipeline entity dict.

	        ``offset`` is added to ``start`` and ``end`` so chunk-relative
	        positions can be shifted into the original text. Missing or None
	        ``start``/``end``/``score`` values are treated as 0.
	        """
	        return EntityResult(
	            word=ent.get("word", ""),
	            entity_group=ent.get("entity_group", "MISC"),
	            score=float(ent.get("score") or 0.0),
	            start=offset + int(ent.get("start") or 0),
	            end=offset + int(ent.get("end") or 0),
	        )

	    # ------------------------------------------------------------------
	    # Long-text chunking
	    # ------------------------------------------------------------------

	    def _chunk_text(self, text: str, max_chars: int = MAX_CHUNK_CHARS) -> List[Tuple[str, int]]:
	        """
	        Split text into chunks of at most max_chars characters, breaking
	        at sentence boundaries where possible.

	        Returns a list of (chunk_text, start_char_offset_in_original)
	        tuples. Chunks are whitespace-stripped, and the offset points at
	        the first character of the stripped chunk, so
	        ``text[offset:offset + len(chunk)] == chunk`` holds for every
	        non-empty chunk. If no non-empty chunk is produced, ``[(text, 0)]``
	        is returned.

	        Raises:
	            ValueError: if max_chars is smaller than 1 (the window would
	                never advance).
	        """
	        if max_chars < 1:
	            raise ValueError("max_chars must be a positive integer")

	        chunks: List[Tuple[str, int]] = []
	        start = 0
	        n = len(text)
	        while start < n:
	            end = min(start + max_chars, n)
	            if end < n:
	                # Try to break at a sentence boundary in the second half
	                # of the window; fall back to newline, then plain space.
	                search_from = start + max_chars // 2
	                for sep in (". ", "! ", "? ", "\n", " "):
	                    pos = text.rfind(sep, search_from, end)
	                    if pos != -1:
	                        end = pos + len(sep)
	                        break
	            piece = text[start:end]
	            chunk = piece.strip()
	            if chunk:
	                # Leading whitespace removed by strip() must be added to
	                # the offset, otherwise entity positions are shifted.
	                leading = len(piece) - len(piece.lstrip())
	                chunks.append((chunk, start + leading))
	            start = end
	        return chunks or [(text, 0)]

	    def _recognize_chunked(self, text: str) -> "List[EntityResult]":
	        """
	        Run NER on text by splitting it into chunks, correcting entity
	        character offsets back to the original text's coordinate space,
	        and deduplicating entities by (lower-cased word, absolute start).

	        A chunk whose pipeline call raises is reported on stdout and
	        skipped; the remaining chunks are still processed.
	        """
	        pipe = self._load_pipeline()
	        results = []
	        seen = set()  # (word_lower, abs_start) dedup key

	        for chunk_text, chunk_offset in self._chunk_text(text):
	            if not chunk_text.strip():
	                continue
	            try:
	                raw = pipe(chunk_text)
	            except Exception as e:
	                # Best effort: keep going with the other chunks.
	                print(f"[NEREngine] Chunk processing error at offset {chunk_offset}: {e}")
	                continue
	            for ent in self._clean_entities(raw):
	                word = ent.get("word", "")
	                abs_start = chunk_offset + int(ent.get("start") or 0)
	                key = (word.lower(), abs_start)
	                if key in seen:
	                    continue
	                seen.add(key)
	                results.append(self._to_result(ent, chunk_offset))

	        return results

	    # ------------------------------------------------------------------
	    # Public API
	    # ------------------------------------------------------------------

	    def recognize(self, text: str) -> "List[EntityResult]":
	        """
	        Run NER on a single text and return cleaned entities.

	        Empty or whitespace-only text yields an empty list without loading
	        the model. Texts longer than MAX_CHUNK_CHARS are processed with
	        chunk-and-merge so that the model does not see one over-long input.
	        If the pipeline call raises, the error is reported on stdout and
	        an empty list is returned.
	        """
	        if not text or not text.strip():
	            return []

	        # Long text → chunk-and-merge instead of letting BERT truncate
	        if len(text) > MAX_CHUNK_CHARS:
	            return self._recognize_chunked(text)

	        pipe = self._load_pipeline()
	        try:
	            raw = pipe(text)
	        except Exception as e:
	            print(f"[NEREngine] Processing error: {e}")
	            return []

	        return [self._to_result(ent) for ent in self._clean_entities(raw)]

	    def recognize_batch(self, texts: List[str], batch_size: int = 16) -> "List[List[EntityResult]]":
	        """
	        Run NER on a batch of texts.

	        Short texts (≤ MAX_CHUNK_CHARS) are passed to the pipeline together
	        with the given batch_size. Long texts are handled individually with
	        chunk-and-merge. Empty or whitespace-only texts get an empty list.

	        Returns one list of entities per input text, in input order. If
	        the batched call fails, the error is reported on stdout and the
	        short texts are retried one by one through recognize().
	        """
	        if not texts:
	            return []

	        out = [[] for _ in texts]

	        # Separate short and long texts
	        short_texts: List[str] = []
	        short_indices: List[int] = []
	        long_indices: List[int] = []

	        for i, text in enumerate(texts):
	            if not text or not text.strip():
	                continue
	            if len(text) > MAX_CHUNK_CHARS:
	                long_indices.append(i)
	            else:
	                short_texts.append(text)
	                short_indices.append(i)

	        # --- Batch-process short texts ---
	        if short_texts:
	            pipe = self._load_pipeline()
	            try:
	                raw_results = pipe(short_texts, batch_size=batch_size)
	                for idx, raw in zip(short_indices, raw_results):
	                    out[idx] = [self._to_result(ent) for ent in self._clean_entities(raw)]
	            except Exception as e:
	                print(f"[NEREngine] Batch processing error: {e}")
	                # Fallback to per-text processing
	                for idx, text in zip(short_indices, short_texts):
	                    out[idx] = self.recognize(text)

	        # --- Chunk-and-merge long texts (sequential, no truncation) ---
	        for idx in long_indices:
	            out[idx] = self._recognize_chunked(texts[idx])

	        return out