Spaces:

PlotweaverAI
/

Voice-AI-Agent-Clean

Running

App Files Files Community

Toadoum committed 20 days ago

Commit

d8a53e2

verified ·

1 Parent(s): ddbabb4

Update nlu.py

Browse files

Files changed (1) hide show

nlu.py +185 -146

nlu.py CHANGED Viewed

@@ -1,14 +1,24 @@
 """
-NLU — Hybrid Hausa intent + entity extraction.
-Three-tier architecture:
-  1. Rule-based keyword matcher (fast path, ~80% of demo utterances)
-  2. Qwen2.5-1.5B-Instruct zero-shot JSON extractor (paraphrases, novel phrasings)
-  3. Rule-based fallback (if LLM fails or returns unparseable output)
-The LLM is lazy-loaded on first non-matched utterance so the Space boots fast.
-In production this would be replaced with a fine-tuned classifier on
-PlotWeaver's Hausa intent corpus.
 """
 from __future__ import annotations
 import re
@@ -18,24 +28,10 @@ from typing import Optional
 logger = logging.getLogger("plotweaver.nlu")
 # ---------------------------------------------------------------------------
-# Layer 1: rule-based fast path (covers common demo phrases)
 # ---------------------------------------------------------------------------
-INTENT_KEYWORDS = {
-    "check_balance": ["duba", "ma'auni", "balance", "kudi", "asusu"],
-    "block_card": ["toshe", "kati", "block"],
-    "transfer_money": ["tura", "canji", "canjin", "aika", "transfer"],
-    "buy_airtime": ["airtime", "caji"],
-    "buy_bundle": ["bundle", "data", "intanet"],
-    "complaint": ["korafi", "matsala", "complain"],
-    "check_order": ["bincika", "order", "oda"],
-    "reschedule": ["sake tsara", "reschedule", "canja lokaci"],
-    "return_item": ["mayar", "mayarwa", "return"],
-    "human_agent": ["mutum", "wakili", "agent", "human"],
-    "yes": ["i ", " i", "eh", "haka ne", "yes", "ok", "okay"],
-    "no": ["a'a", "a'aa", "ba haka", " no", "no "],
-}
 WORD_DIGITS = {
     "sifili": "0", "daya": "1", "ɗaya": "1", "biyu": "2", "uku": "3",
     "hudu": "4", "huɗu": "4", "biyar": "5", "shida": "6", "bakwai": "7",
@@ -48,18 +44,12 @@ WORD_AMOUNTS = {
     "ɗari": 100, "dari": 100,
 }
-def _norm(t: str) -> str:
-    return " " + t.lower().strip() + " "
-def _match_intent_kw(text: str) -> Optional[str]:
-    t = _norm(text)
-    for intent, kws in INTENT_KEYWORDS.items():
-        for kw in kws:
-            if kw in t:
-                return intent
-    return None
 def _extract_digits(text: str) -> Optional[str]:
@@ -82,71 +72,93 @@ def _extract_amount(text: str) -> Optional[int]:
     return None
-def _rule_based_parse(text: str, expected: Optional[str]) -> tuple[str, dict]:
-    """Layer 1 + 3: deterministic keyword + slot matcher."""
-    entities: dict = {}
-    if not text or not text.strip():
-        return "unknown", entities
-    # Universal escape
-    if _match_intent_kw(text) == "human_agent":
-        return "human_agent", entities
-    if expected == "digits":
-        d = _extract_digits(text)
-        if d:
-            entities["digits"] = d
-            return "provide_digits", entities
-    if expected == "amount":
-        a = _extract_amount(text)
-        if a is not None:
-            entities["amount"] = a
-            return "provide_amount", entities
-    if expected == "name":
-        name = text.strip().split()[-1] if text.strip() else ""
-        if name:
-            entities["name"] = name
-            return "provide_name", entities
-    if expected == "date":
-        entities["date"] = text.strip()
-        return "provide_date", entities
-    if expected == "bundle":
-        t = text.lower()
-        for b in ("rana", "mako", "wata"):
-            if b in t:
-                entities["bundle"] = b
-                return "provide_bundle", entities
-    if expected == "text":
-        entities["text"] = text.strip()
-        return "provide_text", entities
-    if expected == "yesno":
-        i = _match_intent_kw(text)
-        if i in ("yes", "no"):
-            return i, entities
-    i = _match_intent_kw(text)
-    if i:
-        return i, entities
-    return "unknown", entities
 # ---------------------------------------------------------------------------
-# Layer 2: Qwen2.5-1.5B-Instruct zero-shot NLU
 # ---------------------------------------------------------------------------
 _llm_model = None
 _llm_tokenizer = None
-_llm_failed = False  # set to True after any load failure, to prevent retries
 def _load_llm():
-    """Lazy-load Qwen2.5-1.5B-Instruct. Called only when rule-based misses."""
     global _llm_model, _llm_tokenizer, _llm_failed
     if _llm_failed:
         return None, None
@@ -155,25 +167,23 @@ def _load_llm():
     try:
         import torch
         from transformers import AutoModelForCausalLM, AutoTokenizer
-        logger.info("Loading Qwen2.5-1.5B-Instruct for NLU…")
         model_id = "Qwen/Qwen2.5-1.5B-Instruct"
         _llm_tokenizer = AutoTokenizer.from_pretrained(model_id)
         _llm_model = AutoModelForCausalLM.from_pretrained(
             model_id,
-            torch_dtype=torch.float32,  # CPU — bfloat16 not broadly supported
             low_cpu_mem_usage=True,
         )
         _llm_model.eval()
-        logger.info("Qwen2.5-1.5B-Instruct ready.")
         return _llm_model, _llm_tokenizer
     except Exception as e:
-        logger.warning(f"LLM load failed: {e}")
         _llm_failed = True
         return None, None
-# Candidate intents per expected-slot context. Keeps the LLM prompt small
-# and constrains output to valid options only.
 CANDIDATE_INTENTS = {
     None: ["check_balance", "block_card", "transfer_money",
            "buy_airtime", "buy_bundle", "complaint",
@@ -184,8 +194,6 @@ CANDIDATE_INTENTS = {
                "check_order", "reschedule", "return_item",
                "human_agent", "unknown"],
     "yesno": ["yes", "no", "human_agent", "unknown"],
-    "digits": ["provide_digits", "human_agent", "unknown"],
-    "amount": ["provide_amount", "human_agent", "unknown"],
     "name": ["provide_name", "human_agent", "unknown"],
     "date": ["provide_date", "human_agent", "unknown"],
     "bundle": ["provide_bundle", "human_agent", "unknown"],
@@ -193,42 +201,39 @@ CANDIDATE_INTENTS = {
 }
-SYSTEM_PROMPT = """You are an intent classifier for a Hausa-language customer service voice agent.
-Analyze the user's Hausa utterance and return a JSON object with:
-- "intent": one of the candidate intents provided
-- "entities": a dict of extracted values (may be empty)
 Intent meanings:
-- check_balance: user wants to check their account balance
-- block_card: user wants to block or freeze their bank card
-- transfer_money: user wants to transfer or send money
-- buy_airtime: user wants to buy phone airtime
-- buy_bundle: user wants to buy a data bundle
-- complaint: user wants to file a complaint
-- check_order: user wants to check an order status
 - reschedule: user wants to reschedule a delivery
 - return_item: user wants to return an item
-- human_agent: user wants to speak to a human
-- yes / no: affirmative or negative response
-- provide_digits / provide_amount / provide_name / provide_date / provide_bundle / provide_text: user is providing specific information
-- unknown: cannot determine the intent
-Return ONLY a valid JSON object, no explanation. Example: {"intent": "check_balance", "entities": {}}"""
-def _llm_parse(text: str, expected: Optional[str]) -> Optional[tuple[str, dict]]:
-    """Layer 2: zero-shot LLM classification. Returns None on any failure."""
     model, tokenizer = _load_llm()
     if model is None:
         return None
     candidates = CANDIDATE_INTENTS.get(expected, CANDIDATE_INTENTS[None])
     user_prompt = (
-        f'Hausa utterance: "{text}"\n'
-        f'Expected slot type: {expected or "any"}\n'
         f'Candidate intents: {", ".join(candidates)}\n\n'
-        'Respond with JSON only.'
     )
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},
@@ -241,14 +246,13 @@ def _llm_parse(text: str, expected: Optional[str]) -> Optional[tuple[str, dict]]
         with torch.no_grad():
             out = model.generate(
                 **inputs,
-                max_new_tokens=80,
                 do_sample=False,
                 pad_token_id=tokenizer.eos_token_id,
             )
         generated = tokenizer.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()
-        logger.info(f"LLM raw output: {generated}")
-        # Extract JSON (model sometimes wraps it in markdown fences or prose)
         m = re.search(r"\{.*?\}", generated, re.DOTALL)
         if not m:
             return None
@@ -257,13 +261,12 @@ def _llm_parse(text: str, expected: Optional[str]) -> Optional[tuple[str, dict]]
         entities = parsed.get("entities", {}) or {}
         if not isinstance(entities, dict):
             entities = {}
-        # Validate intent is in candidate list
         if intent not in candidates:
-            logger.info(f"LLM returned out-of-candidate intent: {intent}")
             return None
         return intent, entities
     except Exception as e:
-        logger.warning(f"LLM inference failed: {e}")
         return None
@@ -273,38 +276,74 @@ def _llm_parse(text: str, expected: Optional[str]) -> Optional[tuple[str, dict]]
 def parse(text: str, expected: Optional[str] = None,
           use_llm: bool = True) -> tuple[str, dict, str]:
     """
-    Hybrid NLU. Returns (intent, entities, source) where source is one of
-    'rule', 'llm', or 'rule_fallback'.
-    Flow:
-      1. Try rule-based keyword/slot matcher (fast, deterministic)
-      2. If result is 'unknown' AND use_llm=True: try Qwen2.5 zero-shot
-      3. If LLM fails or returns invalid output: return rule-based 'unknown'
     """
-    intent, entities = _rule_based_parse(text, expected)
-    if intent != "unknown":
-        return intent, entities, "rule"
-    if not use_llm:
-        return intent, entities, "rule"
-    # Rule-based missed — try LLM
-    llm_result = _llm_parse(text, expected)
-    if llm_result is None:
-        return intent, entities, "rule_fallback"
-    llm_intent, llm_entities = llm_result
-    # Sanity-check entities for slot-typed expected (LLM might hallucinate
-    # digits; re-run our deterministic extractors for strict-format slots)
     if expected == "digits":
         d = _extract_digits(text)
         if d:
-            llm_entities["digits"] = d
-    elif expected == "amount":
         a = _extract_amount(text)
         if a is not None:
-            llm_entities["amount"] = a
-    return llm_intent, llm_entities, "llm"

 """
+NLU — NLLB + Qwen pivot-through-English architecture.
+Flow:
+  1. Deterministic structural extractors run FIRST on the original Hausa
+     text (digits, amounts, yes/no keywords). These MUST be deterministic
+     because "1234" → "provide_digits" with digits="1234" is non-negotiable
+     for banks, and regex is faster + more reliable than any model for
+     this sub-task.
+  2. If structural extractors don't match the expected slot type, the text
+     is translated Hausa → English via NLLB-200, then classified by
+     Qwen2.5-1.5B in English (where it is strong) into one of a small
+     fixed set of intent labels.
+  3. If NLLB or Qwen fails, we return "unknown" cleanly — the dialogue
+     manager will re-prompt.
+All models are lazy-loaded on first use. Cold-start downloads:
+  - NLLB-200-distilled-600M: ~2.4 GB
+  - Qwen2.5-1.5B-Instruct: ~3 GB
 """
 from __future__ import annotations
 import re
 logger = logging.getLogger("plotweaver.nlu")
 # ---------------------------------------------------------------------------
+# Deterministic structural extractors (run on raw Hausa text)
 # ---------------------------------------------------------------------------
 WORD_DIGITS = {
     "sifili": "0", "daya": "1", "ɗaya": "1", "biyu": "2", "uku": "3",
     "hudu": "4", "huɗu": "4", "biyar": "5", "shida": "6", "bakwai": "7",
     "ɗari": 100, "dari": 100,
 }
+# Hausa yes/no keywords for the sole case where we short-circuit Qwen
+HAUSA_YES = {"i", "eh", "haka ne", "haka", "ok", "okay", "yes"}
+HAUSA_NO = {"a'a", "a'aa", "ba haka", "ba", "no"}
+# Human-agent escape hatch
+HUMAN_KEYWORDS = {"mutum", "wakili", "agent", "human"}
 def _extract_digits(text: str) -> Optional[str]:
     return None
def _match_yesno(text: str) -> Optional[str]:
    """Classify a short reply as ``"yes"`` or ``"no"`` by keyword lookup.

    Keywords from ``HAUSA_YES`` and ``HAUSA_NO`` are matched as whole words
    (or whole phrases) against the lower-cased utterance. When several
    keywords occur, the one that starts earliest wins; at the same start
    position the longer phrase wins. This prevents a negative phrase such as
    "ba haka" from being reported as "yes" merely because it contains the
    affirmative keyword "haka".

    Returns ``None`` when no keyword is present.
    """
    # Turn sentence punctuation into spaces (apostrophes are kept because
    # keywords such as "a'a" contain them), then collapse whitespace runs so
    # multi-word keywords still match.
    cleaned = text.lower().translate(str.maketrans(dict.fromkeys(".,!?;:", " ")))
    padded = " " + " ".join(cleaned.split()) + " "

    best = None  # (start index, -keyword length, label) of the best hit so far
    for label, keywords in (("no", HAUSA_NO), ("yes", HAUSA_YES)):
        for kw in keywords:
            start = padded.find(f" {kw} ")
            if start == -1:
                continue
            # Tuples order by earliest start first, then by longest keyword.
            hit = (start, -len(kw), label)
            if best is None or hit < best:
                best = hit
    return None if best is None else best[2]
def _contains_human_keyword(text: str) -> bool:
    """Return True when any ``HUMAN_KEYWORDS`` entry occurs in *text*.

    The check is a case-insensitive substring test, so a keyword embedded in
    a longer word also counts as a hit.
    """
    lowered = text.lower()
    for keyword in HUMAN_KEYWORDS:
        if keyword in lowered:
            return True
    return False
+# ---------------------------------------------------------------------------
+# NLLB-200 Ha → En translation (lazy-loaded)
+# ---------------------------------------------------------------------------
+_nllb_model = None
+_nllb_tokenizer = None
+_nllb_failed = False
def _load_nllb():
    """Return the cached NLLB-200 ``(model, tokenizer)`` pair, loading it on first use.

    The checkpoint ``facebook/nllb-200-distilled-600M`` is loaded in float32
    and switched to eval mode. If anything raises during loading, a warning
    is logged, the module-level failure flag is set, and ``(None, None)`` is
    returned; once that flag is set every later call returns ``(None, None)``
    immediately without retrying.
    """
    global _nllb_model, _nllb_tokenizer, _nllb_failed

    # A previous load attempt failed: do not retry.
    if _nllb_failed:
        return None, None

    if _nllb_model is None:
        try:
            # Heavy imports are deferred until the model is actually needed.
            import torch
            from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

            logger.info("Loading NLLB-200-distilled-600M…")
            checkpoint = "facebook/nllb-200-distilled-600M"
            load_options = {
                "torch_dtype": torch.float32,
                "low_cpu_mem_usage": True,
            }
            _nllb_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            _nllb_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, **load_options)
            _nllb_model.eval()
            logger.info("NLLB-200 ready.")
        except Exception as exc:
            logger.warning(f"NLLB load failed: {exc}")
            _nllb_failed = True
            return None, None

    return _nllb_model, _nllb_tokenizer
def translate_ha_to_en(text: str) -> Optional[str]:
    """Translate Hausa *text* to English with NLLB-200.

    Returns the stripped English translation, or ``None`` when the input is
    empty/blank, the model cannot be loaded, generation raises, or the model
    produces an empty string.
    """
    # Reject blank input before touching the model: _load_nllb() performs the
    # (lazy) model load on first use, which is wasted work for empty text.
    if not text or not text.strip():
        return None

    model, tokenizer = _load_nllb()
    if model is None:
        return None

    try:
        import torch

        # NLLB requires the source language to be set on the tokenizer.
        tokenizer.src_lang = "hau_Latn"
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
        # Force English output by making the target-language tag the first
        # decoded token.
        forced_bos_id = tokenizer.convert_tokens_to_ids("eng_Latn")
        with torch.no_grad():
            out = model.generate(
                **inputs,
                forced_bos_token_id=forced_bos_id,
                max_new_tokens=128,
                num_beams=2,
            )
        translated = tokenizer.batch_decode(out, skip_special_tokens=True)[0].strip()
        logger.info(f"NLLB Ha→En: {text!r} → {translated!r}")
        # An empty translation carries no information for the downstream
        # classifier, so report it as a failure rather than as "".
        return translated or None
    except Exception as e:
        logger.warning(f"NLLB translate failed: {e}")
        return None
 # ---------------------------------------------------------------------------
+# Qwen2.5-1.5B intent classifier (operates on English text)
 # ---------------------------------------------------------------------------
 _llm_model = None
 _llm_tokenizer = None
+_llm_failed = False
 def _load_llm():
     global _llm_model, _llm_tokenizer, _llm_failed
     if _llm_failed:
         return None, None
     try:
         import torch
         from transformers import AutoModelForCausalLM, AutoTokenizer
+        logger.info("Loading Qwen2.5-1.5B-Instruct…")
         model_id = "Qwen/Qwen2.5-1.5B-Instruct"
         _llm_tokenizer = AutoTokenizer.from_pretrained(model_id)
         _llm_model = AutoModelForCausalLM.from_pretrained(
             model_id,
+            torch_dtype=torch.float32,
             low_cpu_mem_usage=True,
         )
         _llm_model.eval()
+        logger.info("Qwen2.5-1.5B ready.")
         return _llm_model, _llm_tokenizer
     except Exception as e:
+        logger.warning(f"Qwen load failed: {e}")
         _llm_failed = True
         return None, None
 CANDIDATE_INTENTS = {
     None: ["check_balance", "block_card", "transfer_money",
            "buy_airtime", "buy_bundle", "complaint",
                "check_order", "reschedule", "return_item",
                "human_agent", "unknown"],
     "yesno": ["yes", "no", "human_agent", "unknown"],
     "name": ["provide_name", "human_agent", "unknown"],
     "date": ["provide_date", "human_agent", "unknown"],
     "bundle": ["provide_bundle", "human_agent", "unknown"],
 }
+SYSTEM_PROMPT = """You are an intent classifier for a customer-service voice bot.
+You will be given an English-language utterance (translated from Hausa) and a list of candidate intents. Return JSON with the single best-matching intent and any entities you can extract.
 Intent meanings:
+- check_balance: user wants to check an account balance
+- block_card: user wants to block, freeze, or cancel a bank card
+- transfer_money: user wants to send or transfer money
+- buy_airtime: user wants to buy phone airtime / top-up
+- buy_bundle: user wants to buy a data bundle / internet package
+- complaint: user wants to file a complaint or report a problem
+- check_order: user wants to check the status of an order
 - reschedule: user wants to reschedule a delivery
 - return_item: user wants to return an item
+- human_agent: user wants to speak to a human person
+- yes / no: affirmative or negative reply
+- provide_name / provide_date / provide_bundle / provide_text: user is supplying information
+- unknown: cannot determine intent
+Return ONLY valid JSON. No explanation, no markdown. Example: {"intent": "check_balance", "entities": {}}"""
+def _qwen_classify(english_text: str, expected: Optional[str]) -> Optional[tuple[str, dict]]:
+    """Classify an English utterance into an intent. Returns None on failure."""
     model, tokenizer = _load_llm()
     if model is None:
         return None
     candidates = CANDIDATE_INTENTS.get(expected, CANDIDATE_INTENTS[None])
     user_prompt = (
+        f'Utterance: "{english_text}"\n'
         f'Candidate intents: {", ".join(candidates)}\n\n'
+        'Return JSON only.'
     )
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},
         with torch.no_grad():
             out = model.generate(
                 **inputs,
+                max_new_tokens=60,
                 do_sample=False,
                 pad_token_id=tokenizer.eos_token_id,
             )
         generated = tokenizer.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()
+        logger.info(f"Qwen raw: {generated}")
         m = re.search(r"\{.*?\}", generated, re.DOTALL)
         if not m:
             return None
         entities = parsed.get("entities", {}) or {}
         if not isinstance(entities, dict):
             entities = {}
         if intent not in candidates:
+            logger.info(f"Qwen returned out-of-candidate intent: {intent}")
             return None
         return intent, entities
     except Exception as e:
+        logger.warning(f"Qwen inference failed: {e}")
         return None
 def parse(text: str, expected: Optional[str] = None,
           use_llm: bool = True) -> tuple[str, dict, str]:
     """
     Run NLU on one utterance and return ``(intent, entities, source)``.

     ``source`` tells which stage produced the result:
       - 'structural': a deterministic extractor handled the expected slot
         (digits, amount, yes/no, name, date)
       - 'nllb+qwen': the text was translated via NLLB and classified via Qwen
       - 'human_keyword': a human-agent keyword was found in the text
       - 'unknown': nothing matched (blank input, ``use_llm=False`` with no
         structural match, or translation/classification failed)
     """
     cleaned = text.strip() if text else ""
     if not cleaned:
         return "unknown", {}, "unknown"
     # The human-agent escape hatch is checked before anything else.
     if _contains_human_keyword(text):
         return "human_agent", {}, "human_keyword"
     # Layer 1: deterministic handling of the slot the dialogue expects.
     # A failed extraction falls through to the model layer below.
     if expected == "digits":
         digits = _extract_digits(text)
         if digits:
             return "provide_digits", {"digits": digits}, "structural"
     elif expected == "amount":
         amount = _extract_amount(text)
         if amount is not None:
             return "provide_amount", {"amount": amount}, "structural"
     elif expected == "yesno":
         answer = _match_yesno(text)
         if answer:
             return answer, {}, "structural"
     elif expected == "name":
         # Names are free-form; the last whitespace-separated token is used
         # as a quick heuristic instead of asking the model.
         return "provide_name", {"name": cleaned.split()[-1]}, "structural"
     elif expected == "date":
         return "provide_date", {"date": cleaned}, "structural"
     # Layer 2: NLLB Ha → En, then Qwen classification.
     if not use_llm:
         return "unknown", {}, "unknown"
     english_text = translate_ha_to_en(text)
     qwen_result = None
     if english_text is not None:
         qwen_result = _qwen_classify(english_text, expected)
     if qwen_result is None:
         return "unknown", {}, "unknown"
     intent, llm_entities = qwen_result
     # Slot values are taken from the original Hausa text, not from the
     # English translation.
     if expected == "bundle":
         lowered = text.lower()
         bundle = next((b for b in ("rana", "mako", "wata") if b in lowered), None)
         if bundle is not None:
             llm_entities["bundle"] = bundle
     elif expected == "text":
         llm_entities["text"] = cleaned
     return intent, llm_entities, "nllb+qwen"