Upload 9 files

Files changed (9) hide show

bert_toxicity_final_model/config.json ADDED Viewed

+{
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "id2label": {
+    "0": "Safe",
+    "1": "Unsafe"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "Safe": 0,
+    "Unsafe": 1
+  },
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.52.4",
+  "vocab_size": 30522
+}

bert_toxicity_final_model/inference_example.py ADDED Viewed

+# Example code to load and use the trained model
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import json
+import numpy as np
+# load the saved model
+model_path = "./bert_toxicity_final_model"
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model = AutoModelForSequenceClassification.from_pretrained(model_path)
+# load configuration
+with open(f"{model_path}/model_config.json", 'r') as f:
+    config = json.load(f)
+MAX_LENGTH = config['max_length']
+THRESHOLD = config['best_threshold']
+def predict_toxicity(text):
+    """
+    Predict toxicity for a single text input
+    Returns: (is_toxic: bool, toxicity_score: float)
+    """
+    # tokenize
+    inputs = tokenizer(
+        text,
+        truncation=True,
+        padding=True,
+        max_length=MAX_LENGTH,
+        return_tensors="pt"
+    )
+    # predict
+    with torch.no_grad():
+        outputs = model(**inputs)
+        probabilities = torch.softmax(outputs.logits, dim=1)
+        toxicity_score = probabilities[0][1].item()  # probability of toxic class
+        is_toxic = toxicity_score >= THRESHOLD
+    return is_toxic, toxicity_score
+# example usage
+# is_toxic, score = predict_toxicity("Your text here")
+# print(f"Toxic: {is_toxic}, Score: {score:.3f}")

bert_toxicity_final_model/model.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:c12dc14dd33bce1d8cfc4de5ac4c6e1db5780aa7aa0e977753ea70ccc25ffef8
+size 267832560

bert_toxicity_final_model/model_config.json ADDED Viewed

+{
+  "model_name": "distilbert-base-uncased",
+  "max_length": 385,
+  "num_epochs": 10,
+  "batch_size": 16,
+  "learning_rate": 2e-05,
+  "best_threshold": 0.1,
+  "best_f1_score": 0.7503015681544029,
+  "training_samples": 4448,
+  "validation_samples": 1245,
+  "test_samples": 717,
+  "final_test_metrics": {
+    "accuracy": 0.7350069735006973,
+    "precision": 0.7175792507204611,
+    "recall": 0.7302052785923754,
+    "f1": 0.7238372093023255,
+    "auc_roc": 0.8193907156673115
+  },
+  "optimized_metrics": {
+    "threshold": 0.1,
+    "f1": 0.7503015681544029,
+    "precision": 0.6372950819672131,
+    "recall": 0.9120234604105572,
+    "accuracy": 0.7112970711297071,
+    "tp": 311,
+    "fp": 177,
+    "tn": 199,
+    "fn": 30
+  }
+}

bert_toxicity_final_model/special_tokens_map.json ADDED Viewed

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

bert_toxicity_final_model/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

bert_toxicity_final_model/tokenizer_config.json ADDED Viewed

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "DistilBertTokenizer",
+  "unk_token": "[UNK]"
+}

bert_toxicity_final_model/training_args.bin ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:11c7db08cc04b34f748962dc11fae11e7ce243734a70a42364be827325b24f7a
+size 5304

bert_toxicity_final_model/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff