Spaces:

Nomio4640
/

NLP-intelligence

Running

App Files Files Community

NLP-intelligence / adapters /api /services.py

Nomio4640

google colab

4b08593 7 days ago

raw

history blame contribute delete

3.35 kB

	"""
	services.py — shared singleton NLP services.

	Changes from previous version:
	1. Seeds ALL hardcoded MONGOLIAN_STOPWORDS into the DB on startup so
	the admin can see and edit the full list. Uses INSERT OR IGNORE so
	server restarts never create duplicates.
	2. Preprocessor now reads stopwords FROM the DB (so admin additions/
	deletions are always reflected after reload_preprocessor()).
	3. MIN_TOPICS_DOCS lowered to match topic_modeler.py's new threshold of 3.
	"""

	import os
	from typing import Optional

	from nlp_core.knowledge_base import KnowledgeBase
	from nlp_core.preprocessing import Preprocessor, MONGOLIAN_STOPWORDS
	from nlp_core.ner_engine import NEREngine
	from nlp_core.sentiment import SentimentAnalyzer
	from nlp_core.topic_modeler import TopicModeler
	from nlp_core.network_analyzer import NetworkAnalyzer

	# ---------------------------------------------------------------------------
	# DB path — resolves to webapp/knowledge.db regardless of cwd
	# ---------------------------------------------------------------------------
	_HERE = os.path.dirname(os.path.abspath(__file__))
	# Check if an external DB_PATH is provided via environment variables (e.g., Colab Google Drive).
	# Otherwise, default to the local knowledge.db inside the project folder.
	DB_PATH = os.environ.get(
	"DB_PATH",
	os.path.normpath(os.path.join(_HERE, "..", "..", "knowledge.db"))
	)

	# ---------------------------------------------------------------------------
	# Singleton instances
	# ---------------------------------------------------------------------------
	kb = KnowledgeBase(db_path=DB_PATH)

	# Seed all hardcoded stopwords into the DB on first run.
	# INSERT OR IGNORE makes this safe to call on every restart.
	_seeded = kb.seed_stopwords(list(MONGOLIAN_STOPWORDS))
	if _seeded > 0:
	print(f"[services] Seeded {_seeded} default stopwords into DB.")

	# Preprocessor reads stopwords from DB — now includes all defaults
	# plus anything the admin has added via the UI.
	preprocessor = Preprocessor(extra_stopwords=kb.get_stopwords())

	# Heavy ML models — lazy-loaded inside the classes on first actual use.
	# Keeping them as module-level objects means HuggingFace pipelines are
	# only constructed ONCE per server lifetime, not once per request.
	ner = NEREngine()
	sentiment = SentimentAnalyzer()
	topic = TopicModeler()
	network = NetworkAnalyzer()

	# ---------------------------------------------------------------------------
	# In-memory cache of the last analysis (for /network and /insights)
	# ---------------------------------------------------------------------------
	_last_analysis = None


	def get_last_analysis():
	return _last_analysis


	def set_last_analysis(result) -> None:
	global _last_analysis
	_last_analysis = result


	# ---------------------------------------------------------------------------
	# Reload — called by POST /admin/reload after admin changes
	# ---------------------------------------------------------------------------

	def reload_preprocessor() -> None:
	"""
	Rebuild the Preprocessor with the latest stopword list from the DB.
	Call this after the admin adds or removes stopwords so the change
	takes effect on the next analysis request without restarting.
	"""
	global preprocessor
	preprocessor = Preprocessor(extra_stopwords=kb.get_stopwords())