Spaces:

nuseAI
/

FastAPI

Sleeping

FastAPI / nuse_modules /keyword_extracter.py

raghavNCI

revamping the nude modules

9c1bffa about 2 months ago

2.4 kB

	# nuse_modules/keyword_extractor.py

	from __future__ import annotations
	import json, re, logging, itertools
	from collections import Counter
	from pathlib import Path

	from models_initialization.mistral_registry import mistral_generate

	# Stopword set, built once at import time: reads "stopwords_en.txt" from the
	# directory containing this module and splits its contents on whitespace.
	# Presumably an English, lowercase list (per the file name) -- TODO confirm.
	# NOTE(review): read_text() is called without an explicit encoding, so the
	# platform default is used.
	STOPWORDS = set(Path(__file__).with_name("stopwords_en.txt").read_text().split())

	# Matches one "[...]" span that contains no nested brackets. Used with
	# .search(), so the first such span in the text is the one returned.
	# NOTE(review): re.S has no effect here because the pattern contains no ".".
	_JSON_RE = re.compile(r"\[[^\[\]]+\]", re.S) # first [...] block

	def _dedupe_keep_order(seq):
	seen = set()
	for x in seq:
	if x.lower() not in seen:
	seen.add(x.lower())
	yield x

	def _extract_with_llm(question: str, k: int) -> list[str]:
	    """Ask the LLM for up to *k* keywords describing *question*.

	    The prompt requests a JSON list of lowercase keywords. The first
	    ``[...]`` span found in the model output is parsed as JSON, then each
	    string entry is lowercased, stripped of surrounding spaces and
	    ``. , " '`` characters, filtered against ``STOPWORDS`` and de-duplicated
	    while preserving order.

	    Args:
	        question: The user question to extract keywords from.
	        k: Maximum number of keywords to return.

	    Returns:
	        At most *k* cleaned keywords; may be empty.

	    Raises:
	        ValueError: If the output contains no ``[...]`` span or that span
	            is not a valid JSON list.
	    """
	    prompt = (
	        "Extract the most important keywords (nouns or noun-phrases) from the question below.\n"
	        f"Return a JSON list of {k} or fewer lowercase keywords, no commentary.\n\n"
	        f"QUESTION:\n{question}"
	    )
	    raw = mistral_generate(prompt, max_new_tokens=48, temperature=0.3)
	    logging.debug("LLM raw output: %s", raw)

	    # find the first [...] JSON chunk; `raw or ""` tolerates a None/empty reply
	    match = _JSON_RE.search(raw or "")
	    if not match:
	        raise ValueError("No JSON list detected in LLM output")

	    try:
	        keywords = json.loads(match.group())
	    except json.JSONDecodeError as e:
	        raise ValueError("Invalid JSON list") from e
	    if not isinstance(keywords, list):
	        raise ValueError("Invalid JSON list")

	    cleaned = []
	    for kw in keywords:
	        # The model may emit numbers, nulls or objects inside the list;
	        # skip them instead of failing the whole extraction.
	        if not isinstance(kw, str):
	            continue
	        term = kw.lower().strip(" .,\"'")
	        # Filter AFTER stripping so that entries such as "the." or "." are
	        # recognised as stopwords / empty and dropped.
	        if term and term not in STOPWORDS:
	            cleaned.append(term)

	    # max(k, 0) keeps a negative k from slicing off trailing items.
	    return list(_dedupe_keep_order(cleaned))[: max(k, 0)]


	# A token starts with an ASCII letter and continues with one or more word
	# characters (\w) or hyphens, so every match is at least two characters long.
	_WORD_RE = re.compile(r"[A-Za-z][\w\-]+")

	def _fallback_keywords(text: str, k: int) -> list[str]:
	    """Pick up to *k* keywords from *text* by plain term frequency.

	    Tokens are the ``_WORD_RE`` matches in *text*, lowercased; tokens that
	    are in ``STOPWORDS`` or shorter than three characters are discarded.
	    The remaining tokens are ranked by how often they occur, with ties
	    kept in order of first appearance.

	    Args:
	        text: Text to extract keywords from.
	        k: Maximum number of keywords to return.

	    Returns:
	        At most *k* keywords, most frequent first. Empty when *k* is not
	        positive or when no token survives filtering (e.g. an empty or
	        all-stopword question).
	    """
	    if k <= 0:
	        return []

	    tokens = [
	        token
	        for token in (m.lower() for m in _WORD_RE.findall(text or ""))
	        if token not in STOPWORDS and len(token) > 2
	    ]

	    # most_common(k) already yields the top-k entries, and iterating the
	    # pairs directly (rather than zip(*pairs)) is safe when there are none.
	    return [word for word, _ in Counter(tokens).most_common(k)]

	def keywords_extractor(question: str, max_keywords: int = 6) -> list[str]:
	    """
	    Return ≤ `max_keywords` keywords for the given question.

	    The LLM-based extractor is tried first. If it raises or returns no
	    keywords, a frequency-based heuristic over the question text is used
	    instead.

	    Args:
	        question: The user question to extract keywords from.
	        max_keywords: Upper bound on the number of keywords returned.

	    Returns:
	        A list of keywords. Empty when `question` is empty or only
	        whitespace, or when `max_keywords` is not positive.
	    """
	    # Nothing can be extracted in these cases, so skip the LLM call entirely.
	    if max_keywords <= 0 or not question or not question.strip():
	        return []

	    try:
	        kw = _extract_with_llm(question, max_keywords)
	    except Exception as exc:
	        # Deliberate best-effort boundary: any LLM/parsing failure is logged
	        # and the heuristic below takes over.
	        logging.warning("LLM keyword extraction failed: %s. Falling back.", exc)
	    else:
	        if kw:
	            return kw

	    # fallback heuristic
	    return _fallback_keywords(question, max_keywords)