Spaces:
Sleeping
Sleeping
| import json | |
| import re | |
| import unicodedata | |
| from bs4 import BeautifulSoup | |
| import numpy as np | |
| import gradio as gr | |
def clean_html(raw_html: str) -> str:
    """Strip HTML markup from *raw_html*, returning plain text.

    <img> and <math> elements are removed entirely (tag and contents),
    since images and formulas carry no usable tokens for the classifier.
    """
    soup = BeautifulSoup(raw_html, "html.parser")
    # Drop non-textual elements before extracting the text.
    for tag_name in ("img", "math"):
        for node in soup.find_all(tag_name):
            node.decompose()
    return soup.get_text(separator=" ", strip=True)
def normalize_text(text: str) -> str:
    """Lowercase *text*, keep only letters, digits and whitespace, and
    collapse every whitespace run into a single space.

    Any other character (punctuation, symbols) is replaced by a space so
    adjacent tokens do not fuse together.
    """
    lowered = text.lower()
    kept = [
        ch
        if unicodedata.category(ch).startswith("L") or ch.isdigit() or ch.isspace()
        else " "
        for ch in lowered
    ]
    return re.sub(r"\s+", " ", "".join(kept)).strip()
def preprocess(content_html: str) -> str:
    """Full preprocessing pipeline: strip HTML markup, then normalize the text."""
    plain = clean_html(content_html)
    return normalize_text(plain)
# Load the vocabulary exported from a fitted vectorizer.
# Expected JSON schema: {"vocabulary": {token: column_index, ...}} — TODO confirm
# against the export script that produced vectorizer.json.
with open("vectorizer.json", encoding="utf-8") as f:
    vect_data = json.load(f)
vocab = vect_data["vocabulary"]  # token -> feature column index, used by transform_count
# CountVectorizer-like transform, re-implemented without scikit-learn.
def transform_count(docs):
    """Build a document-term count matrix from preprocessed documents.

    docs: list of preprocessed strings
    return: 2D numpy array (n_docs x n_features)

    Tokens absent from the loaded ``vocab`` are silently ignored.
    """
    X = np.zeros((len(docs), len(vocab)), dtype=np.float32)
    for row, document in enumerate(docs):
        for word in document.split():
            col = vocab.get(word)
            if col is not None:
                X[row, col] += 1.0
    return X
# Load the exported Naive Bayes parameters.
# Shapes implied by predict_nb_count's matrix product:
#   classes:          (n_classes,)  label for each class index
#   class_log_prior:  (n_classes,)
#   feature_log_prob: (n_classes, n_features)
with open("nbc_model.json", encoding="utf-8") as f:
    clf_data = json.load(f)
classes = np.array(clf_data["classes"])
class_log_prior = np.array(clf_data["class_log_prior"])
feature_log_prob = np.array(clf_data["feature_log_prob"])
def predict_nb_count(docs):
    """Predict one label per document with the loaded multinomial NB model.

    docs: list of preprocessed strings
    return: array of predicted class labels
    """
    counts = transform_count(docs)  # (n_docs, n_feats)
    # Log posterior up to a constant: log prior + sum of per-token log likelihoods.
    scores = counts.dot(feature_log_prob.T) + class_log_prior
    best = scores.argmax(axis=1)
    return classes[best]
def predict_kc(content_html: str):
    """Gradio handler: predict the KC label for a raw HTML question body.

    Returns a Vietnamese message when the input is empty or when nothing
    remains after preprocessing; otherwise returns the predicted label.
    """
    if not content_html:
        return "Chưa nhập nội dung câu hỏi."
    normalized = preprocess(content_html)
    if not normalized:
        return "Nội dung rỗng sau khi xử lý."
    return predict_nb_count([normalized])[0]
# Inline CSS to enlarge the input and output text in the Gradio UI.
css = """
textarea { font-size: 18px !important; }
.gradio-container .output-text { font-size: 18px !important; }
"""

# Single-input / single-output Gradio app wrapping predict_kc.
interface = gr.Interface(
    fn = predict_kc,
    inputs = gr.Textbox(lines=6,
        placeholder="Dán nội dung câu hỏi dạng HTML",
        label="Nội dung câu hỏi ",
        elem_id="input-box"),
    outputs = gr.Textbox(label="KC dự đoán",
        elem_id="output-box"),
    title = "Demo dự đoán KC",
    description="Dự đoán nhãn KC từ câu hỏi dựa trên Naive Bayes.",
    css=css,
    # NOTE(review): allow_flagging was deprecated in favor of flagging_mode in
    # Gradio 4.x — confirm this keyword still works on the deployed Gradio version.
    allow_flagging="never"
)

if __name__ == "__main__":
    interface.launch()