Spaces:

dongtruong1910
/

viet-toxic-classifier

Sleeping

App Files Files Community

dongtruong1910 committed 24 days ago

Commit

f817340

1 Parent(s): ad5bfe2

Deploy

Browse files

Files changed (8) hide show

Dockerfile +25 -0
api.py +34 -0
requirements.txt +6 -0
saved_models/best_model.pth +3 -0
src/__init__.py +0 -0
src/configs.py +34 -0
src/model.py +33 -0
src/predict.py +119 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,25 @@

+# 1. Base image: python:3.9
+FROM python:3.9
+# 2. Set the working directory inside the image
+WORKDIR /code
+# 3. Copy requirements.txt first, then install the dependencies it lists
+COPY ./requirements.txt /code/requirements.txt
+# NOTE: no CPU-only PyTorch index is configured on this command; pip installs
+# whichever torch build it resolves for the plain `torch` requirement.
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# 4. Copy the application code and the saved model weights into the image
+COPY ./src /code/src
+COPY ./saved_models /code/saved_models
+COPY ./api.py /code/api.py
+# 5. Create a non-root user with UID 1000 and switch to it (required by Hugging Face)
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+	PATH=/home/user/.local/bin:$PATH
+# 6. Serve on port 7860 (the port required by Hugging Face)
+# Note: api.py uses port 8000 when run directly; the command below starts uvicorn on 7860 instead
+CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]

api.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from fastapi import FastAPI
+from pydantic import BaseModel
+import uvicorn
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from src.predict import HateSpeechPredictor
+# Create the FastAPI application object that uvicorn serves as `api:app`.
+app = FastAPI()
+print("--> Đang khởi động Server...")
+# Build the predictor once at import time; the /predict handler reuses this instance.
+predictor = HateSpeechPredictor()
+# Request body of POST /predict.
+class Item(BaseModel):
+    # Raw text to classify; it is passed unchanged to predictor.predict().
+    text: str
+@app.post("/predict")
+def predict(item: Item):
+    # Gọi hàm predict thông minh (đã xử lý đoạn văn)
+    result = predictor.predict(item.text)
+    return {
+        "text": item.text,
+        "prediction": result['label'],
+        "confidence": f"{result['confidence']:.2%}",
+        "is_toxic": result['is_toxic'],
+        "flagged_sentence": result['flagged_sentence']
+    }
+# Local entry point: serve the app on port 8000 when this file is run directly.
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch
+transformers
+pyvi
+fastapi
+uvicorn
+pydantic

saved_models/best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09517912757395f534419b53f4f8cdaaa75d3ed6d16cc68da5d4c26cd43dc261
+size 540084487

src/__init__.py ADDED Viewed

File without changes

src/configs.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import torch
+import os
+# Project root: the parent of the directory that contains this file.
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+class Config:
+    """Central constants: data paths, model name, text limits and training hyperparameters."""
+    # --- DATA PATHS ---
+    TRAIN_PATH = os.path.join(BASE_DIR, 'data', 'raw', 'train.csv')
+    DEV_PATH = os.path.join(BASE_DIR, 'data', 'raw', 'dev.csv')
+    TEST_PATH = os.path.join(BASE_DIR, 'data', 'raw', 'test.csv')
+    # Location of the saved model weights
+    MODEL_SAVE_PATH = os.path.join(BASE_DIR, 'saved_models', 'best_model.pth')
+    # --- PHOBERT CONFIGURATION ---
+    MODEL_NAME = "vinai/phobert-base"
+    # Text-processing parameters
+    MAX_LEN = 100  # Maximum sequence length
+    N_CLASSES = 3  # Number of output classes (0: Clean, 1: Offensive, 2: Hate)
+    # --- TRAINING HYPERPARAMETERS (fine-tuning) ---
+    BATCH_SIZE = 16 # PhoBERT is heavy, so keep the batch size small (16 or 8)
+    EPOCHS = 10
+    LEARNING_RATE = 2e-5  # Very small learning rate for a Transformer
+    # Use the GPU automatically when CUDA is available, otherwise the CPU
+    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# When run as a script, print the device that Config selected.
+if __name__ == '__main__':
+    print(f"Device: {Config.DEVICE}")

src/model.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import torch.nn as nn
+from transformers import AutoModel
+from .configs import Config
+class HateSpeechModel(nn.Module):
+    def __init__(self, n_classes):
+        super(HateSpeechModel, self).__init__()
+        # Load khung xương PhoBERT
+        self.bert = AutoModel.from_pretrained(Config.MODEL_NAME, weights_only=False)
+        # Khóa bớt các tầng đầu để train nhanh hơn (Optional - Tùy chọn)
+        for param in self.bert.parameters():
+            param.requires_grad = True
+        # Thêm đầu ra phân loại
+        self.drop = nn.Dropout(p=0.3)
+        self.fc = nn.Linear(768, n_classes)  # 768 là kích thước vector của PhoBERT Base
+    def forward(self, input_ids, attention_mask):
+        # Cho dữ liệu chạy qua PhoBERT
+        # output[0] là hidden states, output[1] là pooled output (vector đại diện câu)
+        outputs = self.bert(
+            input_ids=input_ids,
+            attention_mask=attention_mask
+        )
+        # Lấy vector đại diện của token [CLS] (token đầu tiên)
+        # Nó chứa ý nghĩa của toàn bộ câu
+        pooled_output = outputs[1]
+        output = self.drop(pooled_output)
+        return self.fc(output)

src/predict.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer
+from pyvi import ViTokenizer
+from src.configs import Config
+from src.model import HateSpeechModel
+import os
+import re
+class HateSpeechPredictor:
+    def __init__(self, model_path=None):
+        self.device = Config.DEVICE
+        print(f"--> Đang khởi tạo Predictor trên: {self.device}")
+        # 1. Load Tokenizer & Model
+        self.tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
+        self.model = HateSpeechModel(n_classes=Config.N_CLASSES)
+        # 2. Load Weights
+        if model_path is None:
+            model_path = Config.MODEL_SAVE_PATH
+        if os.path.exists(model_path):
+            self.model.load_state_dict(torch.load(model_path, map_location=self.device))
+            self.model.to(self.device)
+            self.model.eval()
+            print("--> Đã load model thành công!")
+        else:
+            raise FileNotFoundError(f"Chưa có file model tại {model_path}")
+        # Map nhãn
+        self.labels_map = {0: "CLEAN", 1: "OFFENSIVE", 2: "HATE"}
+        # Map mức độ nghiêm trọng (để so sánh)
+        self.severity_map = {"CLEAN": 0, "OFFENSIVE": 1, "HATE": 2}
+    def _split_sentences(self, text):
+        """Hàm tách đoạn văn thành các câu nhỏ"""
+        # Tách dựa trên dấu chấm, chấm than, chấm hỏi, hoặc xuống dòng
+        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!|\n)\s', text)
+        return [s.strip() for s in sentences if len(s.strip()) > 1]
+    def _predict_single(self, text):
+        """Dự đoán cho 1 câu đơn"""
+        text_segmented = ViTokenizer.tokenize(text)
+        encoding = self.tokenizer.encode_plus(
+            text_segmented,
+            max_length=Config.MAX_LEN,
+            truncation=True,
+            padding='max_length',
+            add_special_tokens=True,
+            return_attention_mask=True,
+            return_tensors='pt'
+        )
+        input_ids = encoding['input_ids'].to(self.device)
+        attention_mask = encoding['attention_mask'].to(self.device)
+        with torch.no_grad():
+            outputs = self.model(input_ids, attention_mask)
+            probs = F.softmax(outputs, dim=1)
+        max_prob, pred_idx = torch.max(probs, dim=1)
+        return self.labels_map[pred_idx.item()], max_prob.item()
+    def predict(self, text):
+        """
+        Hàm chính: Xử lý cả đoạn văn.
+        Logic: Tách câu -> Dự đoán từng câu -> Lấy nhãn NẶNG NHẤT.
+        """
+        sentences = self._split_sentences(text)
+        final_label = "CLEAN"
+        final_conf = 0.0
+        max_severity = 0
+        flagged_sentence = ""  # Lưu lại câu bị vi phạm
+        # Nếu đoạn văn quá ngắn hoặc không tách được, coi là 1 câu
+        if len(sentences) == 0:
+            sentences = [text]
+        for sent in sentences:
+            label, conf = self._predict_single(sent)
+            severity = self.severity_map[label]
+            # Cập nhật nếu tìm thấy câu nặng hơn (HATE > OFFENSIVE > CLEAN)
+            # Hoặc cùng mức độ nhưng độ tin cậy cao hơn
+            if severity > max_severity:
+                max_severity = severity
+                final_label = label
+                final_conf = conf
+                flagged_sentence = sent
+            elif severity == max_severity and conf > final_conf:
+                final_conf = conf
+                flagged_sentence = sent
+        # Nếu là CLEAN thì không cần flagged_sentence
+        if final_label == "CLEAN":
+            flagged_sentence = None
+        return {
+            "label": final_label,
+            "confidence": final_conf,
+            "is_toxic": final_label != "CLEAN",
+            "flagged_sentence": flagged_sentence  # Câu "tội đồ" làm bài bị chặn
+        }
+# Manual smoke test, executed only when this file is run as a script
+if __name__ == "__main__":
+    p = HateSpeechPredictor()
+    # Classify a paragraph made of several sentences
+    paragraph = "Hôm nay trời đẹp. Nhưng mày là đồ ngu. Đi chơi thôi."
+    result = p.predict(paragraph)
+    print(f"Input: {paragraph}")
+    print(f"Kết quả: {result['label']} ({result['confidence']:.2%})")
+    print(f"Câu vi phạm: {result['flagged_sentence']}")