FastAPI / nuse_modules /keyword_extracter.py
raghavNCI
revamping the nuse modules
9c1bffa
raw
history blame
2.4 kB
# nuse_modules/keyword_extractor.py
from __future__ import annotations
import json, re, logging, itertools
from collections import Counter
from pathlib import Path
from models_initialization.mistral_registry import mistral_generate
# Stop-word set, built once at import time from "stopwords_en.txt" located in
# the same directory as this module. The file content is split on whitespace,
# so every whitespace-separated token in it becomes one entry.
STOPWORDS = set(Path(__file__).with_name("stopwords_en.txt").read_text().split())
# Matches one bracketed span that contains at least one character and no
# nested "[" or "]". Used with .search(), so the first such span is taken.
# NOTE(review): re.S only changes how "." matches; this pattern has no ".",
# so the flag has no effect here (the negated class already spans newlines).
_JSON_RE = re.compile(r"\[[^\[\]]+\]", re.S) # first [...] block
def _dedupe_keep_order(seq):
seen = set()
for x in seq:
if x.lower() not in seen:
seen.add(x.lower())
yield x
def _extract_with_llm(question: str, k: int) -> list[str]:
    """Ask the LLM for keywords and parse the JSON list found in its reply.

    The prompt requests a JSON list of at most ``k`` lowercase keywords. The
    first ``[...]`` span in the reply is decoded with ``json.loads`` and each
    string entry is lower-cased and stripped of surrounding spaces, periods,
    commas and quotes. Entries that are not strings, that are empty after
    stripping, or that are stop words are dropped; case-insensitive
    duplicates are removed while keeping first-seen order.

    Args:
        question: Text to extract keywords from.
        k: Maximum number of keywords to return.

    Returns:
        Up to ``k`` cleaned keywords; may be empty.

    Raises:
        ValueError: If the reply contains no ``[...]`` span, or the span does
            not decode to a JSON list.
    """
    prompt = (
        "Extract the **most important keywords** (nouns or noun-phrases) from the question below.\n"
        f"Return a **JSON list** of {k} or fewer lowercase keywords, no commentary.\n\n"
        f"QUESTION:\n{question}"
    )
    raw = mistral_generate(prompt, max_new_tokens=48, temperature=0.3)
    logging.debug("LLM raw output: %s", raw)

    # `raw or ""` tolerates a falsy reply (e.g. None) from the generator.
    match = _JSON_RE.search(raw or "")
    if not match:
        raise ValueError("No JSON list detected in LLM output")

    try:
        keywords = json.loads(match.group())
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        raise ValueError("Invalid JSON list") from e
    if not isinstance(keywords, list):
        raise ValueError("Invalid JSON list")

    # Normalise first, filter second: a raw entry such as "." or " the "
    # only turns out to be empty / a stop word once it has been stripped.
    # Non-string entries (numbers, nested objects) are skipped rather than
    # letting an AttributeError discard the whole LLM answer.
    normalized = (
        kw.lower().strip(" .,\"'") for kw in keywords if isinstance(kw, str)
    )
    kept = (kw for kw in normalized if kw and kw not in STOPWORDS)
    return list(_dedupe_keep_order(kept))[:k]
# A word starts with an ASCII letter and continues with one or more word
# characters or hyphens, so single-character tokens never match.
_WORD_RE = re.compile(r"[A-Za-z][\w\-]+")


def _fallback_keywords(text: str, k: int) -> list[str]:
    """Return up to ``k`` keywords from ``text`` by plain term frequency.

    Words are lower-cased; stop words and words of two characters or fewer
    are discarded. The remaining words are ranked by occurrence count, with
    ties kept in first-seen order.

    Args:
        text: Text to extract keywords from.
        k: Maximum number of keywords to return.

    Returns:
        The ``k`` most frequent surviving words, or an empty list when no
        word survives filtering or ``k`` is not positive.
    """
    if k <= 0:
        return []
    tokens = [
        tok
        for tok in (word.lower() for word in _WORD_RE.findall(text or ""))
        if tok not in STOPWORDS and len(tok) > 2
    ]
    # Without this guard an empty / all-stop-word input has nothing to rank;
    # the previous zip(*...) unpacking raised ValueError in that case.
    if not tokens:
        return []
    # most_common(k) already yields the top-k entries in rank order; fetching
    # extra entries and slicing them off again gave the same result.
    return [word for word, _ in Counter(tokens).most_common(k)]
def keywords_extractor(question: str, max_keywords: int = 6) -> list[str]:
    """Return at most ``max_keywords`` keywords for the given question.

    The LLM-based extractor is tried first. If it raises or returns nothing,
    a warning is logged (on failure) and the frequency-based fallback is
    used instead.

    Args:
        question: Text to extract keywords from.
        max_keywords: Upper bound on the number of keywords returned.

    Returns:
        A list of keywords. An empty list is returned immediately, without
        calling the LLM, when ``question`` is empty or whitespace-only or
        when ``max_keywords`` is not positive.
    """
    # Nothing can be extracted from a blank question, and a non-positive
    # limit can only ever yield an empty result; skip the LLM call.
    if max_keywords <= 0 or not question or not question.strip():
        return []

    try:
        kw = _extract_with_llm(question, max_keywords)
        if kw:
            return kw
    except Exception as exc:
        # Deliberate best-effort boundary: any LLM/parsing failure falls
        # through to the heuristic below.
        logging.warning("LLM keyword extraction failed: %s. Falling back.", exc)

    # fallback heuristic
    return _fallback_keywords(question, max_keywords)