Spaces:

Zhe-Zhang
/

Language_Classifier

Sleeping

App Files Files Community

Zhe-Zhang committed on Sep 30, 2025

Commit

0b58da8

verified ·

1 Parent(s): fea6dd3

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -31

app.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import numpy as np
 import torch
 import torch.nn as nn
@@ -6,7 +8,7 @@ import joblib
 from collections import Counter
 import gradio as gr
-# --- utils (from the notebook) ---
 def ngrams(sentence, n=1, lc=True):
     ngram_l = []
     sentence = sentence.lower()
@@ -30,16 +32,16 @@ def reproducible_hash(string):
     h = hashlib.md5(string.encode("utf-8"), usedforsecurity=False)
     return int.from_bytes(h.digest()[0:8], 'big', signed=True)
-def hash_ngrams(ngrams, modulos):
     hash_codes = []
-    for ngram_list, modulo in zip(ngrams, modulos):
         codes = [(reproducible_hash(x) % modulo) for x in ngram_list]
         hash_codes.append(codes)
     return hash_codes
 def calc_rel_freq(codes):
     cnt = Counter(codes)
-    total = sum(cnt.values())
     for k in cnt:
         cnt[k] /= total
     return cnt
@@ -57,52 +59,131 @@ def shift_keys(dicts, MAX_SHIFT):
 def build_freq_dict(sentence, MAXES=MAXES, MAX_SHIFT=MAX_SHIFT):
     hngrams = hash_ngrams(all_ngrams(sentence), MAXES)
-    fhcodes = map(calc_rel_freq, hngrams)
     return shift_keys(fhcodes, MAX_SHIFT)
-# --- load models ---
-vectorizer = joblib.load("nld_vectorizer.joblib")
-idx2lang = joblib.load("nld_lang_codes.joblib")
 input_dim = len(vectorizer.vocabulary_)
 nbr_classes = len(idx2lang)
 model = nn.Sequential(
     nn.Linear(input_dim, 50),
     nn.ReLU(),
     nn.Linear(50, nbr_classes)
 )
-model.load_state_dict(torch.load("nld.pth", map_location="cpu"))
 model.eval()
-# --- prediction function ---
-def detect_lang(src_sentence: str):
     feat_dict = build_freq_dict(src_sentence)
-    X_test = vectorizer.transform([feat_dict])  # ✅ 用 list 包起来
     if hasattr(X_test, "toarray"):
-        X_test = X_test.toarray()
-    X_test = torch.from_numpy(X_test.astype("float32"))
     with torch.no_grad():
-        logits = model(X_test)
-        pred_idx = torch.argmax(logits, dim=-1).item()
-    return idx2lang[pred_idx]
 # --- Gradio UI ---
-with gr.Blocks(title="Antons language detector") as demo:
-    gr.Markdown("# Antons language detector")
     with gr.Row():
-        with gr.Column():
-            src_sentence = gr.Textbox(
-                label="Text", placeholder="Write your text..."
-            )
-        with gr.Column():
-            tgt_sentence = gr.Textbox(
-                label="Language",
-                placeholder="Language will show here...",
-                interactive=False  # ✅ 输出框不可编辑
-            )
-    btn = gr.Button("Guess the language!")
-    btn.click(fn=detect_lang, inputs=[src_sentence], outputs=[tgt_sentence])
 demo.launch()

+# debug_app.py — drop this into the HF Space to replace the original app.py
+import os, hashlib, json
 import numpy as np
 import torch
 import torch.nn as nn
 from collections import Counter
 import gradio as gr
+# --- utils (same as the training code) ---
 def ngrams(sentence, n=1, lc=True):
     ngram_l = []
     sentence = sentence.lower()
     h = hashlib.md5(string.encode("utf-8"), usedforsecurity=False)
     return int.from_bytes(h.digest()[0:8], 'big', signed=True)
def hash_ngrams(ngrams_list, modulos):
    """Hash every n-gram into a bucket index, one bucket space per n-gram list.

    Args:
        ngrams_list: iterable of n-gram lists (one list per n-gram order).
        modulos: iterable of bucket counts, paired positionally with
            ``ngrams_list``.

    Returns:
        A list of lists of ints; entry ``i`` holds
        ``reproducible_hash(x) % modulos[i]`` for each ``x`` in
        ``ngrams_list[i]``.

    The two inputs are paired with ``zip``, so surplus entries on the longer
    side are silently ignored.
    """
    return [
        [reproducible_hash(gram) % bucket_count for gram in grams]
        for grams, bucket_count in zip(ngrams_list, modulos)
    ]
 def calc_rel_freq(codes):
     cnt = Counter(codes)
+    total = sum(cnt.values()) if cnt else 1
     for k in cnt:
         cnt[k] /= total
     return cnt
 def build_freq_dict(sentence, MAXES=MAXES, MAX_SHIFT=MAX_SHIFT):
     hngrams = hash_ngrams(all_ngrams(sentence), MAXES)
+    fhcodes = list(map(calc_rel_freq, hngrams))
     return shift_keys(fhcodes, MAX_SHIFT)
+# --- helper diagnostics ---
def file_md5(path, chunk_size=1 << 20):
    """Return the hex MD5 digest of the file at ``path``.

    The digest is a content fingerprint for diagnostics, not a security
    measure, hence ``usedforsecurity=False`` (matching ``reproducible_hash``).

    Args:
        path: path of the file to fingerprint.
        chunk_size: number of bytes read per iteration; the file is streamed
            so large model files are never held in memory in full.

    Returns:
        The 32-character hexadecimal digest, or ``None`` if the file does not
        exist.

    Raises:
        OSError: for failures other than a missing file (e.g. ``path`` is a
            directory or is not readable).
    """
    digest = hashlib.md5(usedforsecurity=False)
    try:
        # Open first and handle the failure instead of checking existence
        # beforehand: the file could vanish between the check and the open.
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
    except FileNotFoundError:
        return None
    return digest.hexdigest()
def model_param_stats(m):
    """Summarise all parameters of a model as global min, max and mean.

    Args:
        m: object exposing ``parameters()`` that yields tensors (e.g. a
            ``torch.nn.Module``).

    Returns:
        ``{"min": float, "max": float, "mean": float}`` computed over every
        parameter element, or the same keys mapped to ``None`` when the model
        has no (non-empty) parameters.

    The mean is weighted by element count: it is the sum of all parameter
    values divided by the number of values, not an average of per-tensor
    means, so a small bias vector cannot dominate a large weight matrix.
    """
    lowest = None
    highest = None
    value_sum = 0.0
    value_count = 0
    for p in m.parameters():
        arr = p.detach().cpu().numpy().ravel()
        if arr.size == 0:
            # Empty tensors have no min/max and contribute nothing.
            continue
        arr_min = float(arr.min())
        arr_max = float(arr.max())
        lowest = arr_min if lowest is None else min(lowest, arr_min)
        highest = arr_max if highest is None else max(highest, arr_max)
        # Accumulate in float64 so float32 weights do not lose precision.
        value_sum += float(arr.sum(dtype=np.float64))
        value_count += int(arr.size)
    if value_count == 0:
        return {"min": None, "max": None, "mean": None}
    return {"min": lowest, "max": highest, "mean": value_sum / value_count}
# --- load artifacts (ensure these files exist in your repo) ---
# Artifact file names, given as relative paths (resolved against the current
# working directory at startup).
VEC_FN = "nld_vectorizer.joblib"
LANG_FN = "nld_lang_codes.joblib"
MODEL_FN = "nld.pth"
# NOTE(review): joblib.load and torch.load both deserialize pickled data; only
# load artifacts you trust.
vectorizer = joblib.load(VEC_FN)
idx2lang = joblib.load(LANG_FN)
# Network dimensions are derived from the loaded artifacts: one input per
# vectorizer vocabulary entry, one output per entry of idx2lang.
input_dim = len(vectorizer.vocabulary_)
nbr_classes = len(idx2lang)
# build model skeleton same as training
# (load_state_dict below expects this layout to match the saved weights)
model = nn.Sequential(
    nn.Linear(input_dim, 50),
    nn.ReLU(),
    nn.Linear(50, nbr_classes)
)
# Weights are mapped to CPU regardless of the device they were saved from.
model.load_state_dict(torch.load(MODEL_FN, map_location="cpu"))
# Switch to inference mode.
model.eval()
# quick sanity info (will also print to logs)
# The MD5 fingerprints make it possible to compare the deployed artifacts
# against the ones produced by training.
print(">>> artifact md5:", MODEL_FN, file_md5(MODEL_FN))
print(">>> artifact md5:", VEC_FN, file_md5(VEC_FN))
print(">>> artifact md5:", LANG_FN, file_md5(LANG_FN))
print("vocab size:", len(vectorizer.vocabulary_))
print("sample vocab items:", list(vectorizer.vocabulary_.items())[:10])
# idx2lang is used as a mapping here (.items()).
print("idx2lang sample:", list(idx2lang.items())[:10])
print("model param stats:", model_param_stats(model))
+# --- prediction + debug function ---
def detect_lang_debug(src_sentence: str):
    """Predict the language of ``src_sentence`` and report diagnostics.

    Args:
        src_sentence: text to classify.

    Returns:
        A ``(pred_lang, debug_text)`` tuple. ``pred_lang`` is the
        ``idx2lang`` entry of the highest-probability class. ``debug_text`` is
        a pretty-printed JSON string with the keys ``pred_lang``,
        ``pred_idx``, ``topk`` (up to five ``(index, language, probability)``
        entries, most probable first) and ``debug`` (artifact fingerprints,
        vocabulary and label sizes, parameter statistics, and the feature
        vector's shape and non-zero count).

    Side effects:
        Prints the JSON report to stdout.

    NOTE(review): the three artifact MD5s and the parameter statistics are
    recomputed on every call, which re-reads the artifact files per request.
    """
    # Static diagnostics first; key order is the order shown in the JSON.
    debug = {
        'md5_model': file_md5(MODEL_FN),
        'md5_vectorizer': file_md5(VEC_FN),
        'md5_idx2lang': file_md5(LANG_FN),
        'vocab_size': len(vectorizer.vocabulary_),
        'idx2lang_len': len(idx2lang),
        'idx2lang_sample': dict(list(idx2lang.items())[:10]),
        'model_param_stats': model_param_stats(model),
    }

    # The vectorizer expects a sequence of feature dicts, hence the
    # one-element list.
    features = vectorizer.transform([build_freq_dict(src_sentence)])
    # Sparse results are densified; anything else is coerced to an ndarray.
    dense = features.toarray() if hasattr(features, "toarray") else np.array(features)
    debug['nonzero_features'] = int(np.count_nonzero(dense))
    debug['X_shape'] = dense.shape

    batch = torch.from_numpy(dense.astype("float32"))
    with torch.no_grad():
        probs = torch.softmax(model(batch), dim=-1).cpu().numpy().ravel()

    # Class indices sorted by descending probability, truncated to five.
    ranked = [int(i) for i in np.argsort(probs)[::-1][:5]]
    topk_info = [(i, idx2lang[i], float(probs[i])) for i in ranked]
    pred_idx = ranked[0]
    pred_lang = idx2lang[pred_idx]

    report = {
        "pred_lang": pred_lang,
        "pred_idx": pred_idx,
        "topk": topk_info,
        "debug": debug
    }
    debug_text = json.dumps(report, ensure_ascii=False, indent=2)
    print("DEBUG:", debug_text)  # Visible in Spaces logs
    return pred_lang, debug_text
# --- self-test example set ---
# Maps an expected label to a sample sentence in that language.
# NOTE(review): the keys are compared verbatim against the classifier output,
# so they presumably must match the label strings stored in idx2lang — confirm.
SELF_TESTS = {
    "eng": "Hello, how are you?",
    "fra": "Bonjour, comment allez-vous?",
    "cmn": "你好，你在做什么？",
    "jpn": "こんにちは、お元气ですか？".replace("气", "気"),
    "kor": "안녕하세요. 잘 지내세요?",
    "ara": "مرحبا كيف حالك",
    "swe": "Hej, hur mår du?",
    "dan": "Godmorgen, hvordan har du det?"
}
def run_self_test():
    """Classify every sentence in ``SELF_TESTS`` and report the outcome.

    Returns:
        A newline-joined string with one line per sample in the form
        ``"<expected> | sent: <sentence> | pred: <predicted> | ok: <bool>"``.

    Side effects:
        Prints the same report to stdout (and, through
        ``detect_lang_debug``, one debug dump per sample).
    """
    results = []
    for lang, sent in SELF_TESTS.items():
        # Only the predicted label is needed; the JSON debug text is ignored.
        pred, _ = detect_lang_debug(sent)
        # Exact string match between expected and predicted label.
        ok = pred == lang
        results.append(f"{lang} | sent: {sent} | pred: {pred} | ok: {ok}")
    out = "\n".join(results)
    print("SELF-TEST RESULTS:\n", out)
    return out
# --- Gradio UI ---
# Two-column layout: input and actions on the left (scale 3), read-only
# results on the right (scale 2).
with gr.Blocks(title="Antons language detector (debug)") as demo:
    gr.Markdown("# Antons language detector — debug build")
    with gr.Row():
        with gr.Column(scale=3):
            src = gr.Textbox(label="Text", placeholder="Write your text...")
            btn = gr.Button("Guess the language!")
            selftest_btn = gr.Button("Run self-test")
        with gr.Column(scale=2):
            # Output boxes are not editable by the user.
            out_lang = gr.Textbox(label="Language", interactive=False)
            out_debug = gr.Textbox(label="Debug info (JSON)", interactive=False, lines=20)
    # detect_lang_debug returns (language, debug JSON), matching the two outputs.
    btn.click(fn=detect_lang_debug, inputs=[src], outputs=[out_lang, out_debug])
    # The self-test takes no input; its plain-text report (not JSON) is shown
    # in the debug box.
    selftest_btn.click(fn=run_self_test, inputs=[], outputs=[out_debug])
# Runs at import time: starts the Gradio server.
demo.launch()