Spaces:
Sleeping
Sleeping
| import json | |
| import re | |
| import unicodedata | |
| from bs4 import BeautifulSoup | |
| import numpy as np | |
| import gradio as gr | |
def clean_html(raw_html: str) -> str:
    """Strip HTML markup from *raw_html*, returning plain text.

    <img> and <math> elements are removed entirely (tag and contents),
    since images and formulas carry no usable tokens for the classifier.
    """
    soup = BeautifulSoup(raw_html, "html.parser")
    # Drop non-textual elements before extracting the text.
    for tag_name in ("img", "math"):
        for node in soup.find_all(tag_name):
            node.decompose()
    return soup.get_text(separator=" ", strip=True)
def normalize_text(text: str) -> str:
    """Lowercase *text*, keep only letters, digits and whitespace, and
    collapse every whitespace run into a single space.

    Any other character (punctuation, symbols) is replaced by a space so
    adjacent tokens do not fuse together.
    """
    lowered = text.lower()
    kept = [
        ch
        if unicodedata.category(ch).startswith("L") or ch.isdigit() or ch.isspace()
        else " "
        for ch in lowered
    ]
    return re.sub(r"\s+", " ", "".join(kept)).strip()
def preprocess(content_html: str) -> str:
    """Full preprocessing pipeline: strip HTML markup, then normalize the text."""
    plain = clean_html(content_html)
    return normalize_text(plain)
# Load the vocabulary exported from a fitted vectorizer.
# Expected JSON schema: {"vocabulary": {token: column_index, ...}} — TODO confirm
# against the export script that produced vectorizer.json.
with open("vectorizer.json", encoding="utf-8") as f:
    vect_data = json.load(f)
vocab = vect_data["vocabulary"]  # token -> feature column index, used by transform_count
# CountVectorizer-like transform, re-implemented without scikit-learn.
def transform_count(docs):
    """Build a document-term count matrix from preprocessed documents.

    docs: list of preprocessed strings
    return: 2D numpy array (n_docs x n_features)

    Tokens absent from the loaded ``vocab`` are silently ignored.
    """
    X = np.zeros((len(docs), len(vocab)), dtype=np.float32)
    for row, document in enumerate(docs):
        for word in document.split():
            col = vocab.get(word)
            if col is not None:
                X[row, col] += 1.0
    return X
# Load the exported Naive Bayes parameters.
# Shapes implied by predict_nb_count's matrix product:
#   classes:          (n_classes,)  label for each class index
#   class_log_prior:  (n_classes,)
#   feature_log_prob: (n_classes, n_features)
with open("nbc_model.json", encoding="utf-8") as f:
    clf_data = json.load(f)
classes = np.array(clf_data["classes"])
class_log_prior = np.array(clf_data["class_log_prior"])
feature_log_prob = np.array(clf_data["feature_log_prob"])
def predict_nb_count(docs):
    """Predict one label per document with the loaded multinomial NB model.

    docs: list of preprocessed strings
    return: array of predicted class labels
    """
    counts = transform_count(docs)  # (n_docs, n_feats)
    # Log posterior up to a constant: log prior + sum of per-token log likelihoods.
    scores = counts.dot(feature_log_prob.T) + class_log_prior
    best = scores.argmax(axis=1)
    return classes[best]
def predict_kc(content_html: str):
    """Gradio handler: predict the KC label for a raw HTML question body.

    Returns a Vietnamese message when the input is empty or when nothing
    remains after preprocessing; otherwise returns the predicted label.
    """
    if not content_html:
        return "Chưa nhập nội dung câu hỏi."
    normalized = preprocess(content_html)
    if not normalized:
        return "Nội dung rỗng sau khi xử lý."
    return predict_nb_count([normalized])[0]
# Inline CSS to enlarge the input and output text in the Gradio UI.
css = """
textarea { font-size: 18px !important; }
.gradio-container .output-text { font-size: 18px !important; }
"""

# Single-input / single-output Gradio app wrapping predict_kc.
interface = gr.Interface(
    fn = predict_kc,
    inputs = gr.Textbox(lines=6,
        placeholder="Dán nội dung câu hỏi dạng HTML",
        label="Nội dung câu hỏi ",
        elem_id="input-box"),
    outputs = gr.Textbox(label="KC dự đoán",
        elem_id="output-box"),
    title = "Demo dự đoán KC",
    description="Dự đoán nhãn KC từ câu hỏi dựa trên Naive Bayes.",
    css=css,
    # NOTE(review): allow_flagging was deprecated in favor of flagging_mode in
    # Gradio 4.x — confirm this keyword still works on the deployed Gradio version.
    allow_flagging="never"
)

if __name__ == "__main__":
    interface.launch()