maurodore committed 5 days ago

Commit

c17aad4

verified ·

1 Parent(s): 3a6e1da

Publish code_detection model and artifacts

Browse files

Files changed (19) hide show

README.md +74 -0
model_card.json +11 -0
onnx/fp32/config.json +27 -0
onnx/fp32/merges.txt +0 -0
onnx/fp32/model.onnx +3 -0
onnx/fp32/special_tokens_map.json +51 -0
onnx/fp32/tokenizer_config.json +57 -0
onnx/fp32/vocab.json +0 -0
onnx/int8/config.json +27 -0
onnx/int8/merges.txt +0 -0
onnx/int8/model_quantized.onnx +3 -0
onnx/int8/ort_config.json +33 -0
onnx/int8/special_tokens_map.json +51 -0
onnx/int8/tokenizer.json +0 -0
onnx/int8/tokenizer_config.json +57 -0
onnx/int8/vocab.json +0 -0
threshold.json +7 -0
tokenizer/special_tokens_map.json +51 -0
tokenizer/tokenizer_config.json +57 -0

README.md ADDED Viewed

	@@ -0,0 +1,74 @@

+---
+language:
+- code
+library_name: optimum
+pipeline_tag: text-classification
+tags:
+- code-detection
+- safety
+- onnx
+- hikmaai
+license: apache-2.0
+---
+# hikmaai-codebert-base-code-detection
+A binary classifier that detects whether the input contains source code,
+fine-tuned from
+[microsoft/codebert-base](https://huggingface.co/microsoft/codebert-base)
+by [HikmaAI](https://huggingface.co/HikmaAI).
+## Model Description
+- **Task**: Binary classification (safe=0, threat=1, where "threat" = code detected)
+- **Base model**: `microsoft/codebert-base`
+- **Export formats**: ONNX FP32 + INT8 dynamic quantization
+## Performance
+See `threshold.json` for validation metrics (recall, precision, F1, accuracy); the `metrics` field in `model_card.json` is currently empty.
+Optimized threshold: **0.9950** (val recall: 0.9984)
+## Usage (ONNX)
+```python
+from optimum.onnxruntime import ORTModelForSequenceClassification
+from transformers import AutoTokenizer
+model = ORTModelForSequenceClassification.from_pretrained(
+    "HikmaAI/hikmaai-codebert-base-code-detection",
+    subfolder="onnx/int8",
+)
+tokenizer = AutoTokenizer.from_pretrained(
+    "HikmaAI/hikmaai-codebert-base-code-detection",
+    subfolder="tokenizer",
+)
+inputs = tokenizer("def hello():\n    print('hi')", return_tensors="pt")
+outputs = model(**inputs)
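+# outputs.logits holds raw (pre-softmax) logits ordered [safe, threat].
+# To use the optimized threshold (0.995, see `threshold.json`), apply softmax
+# and compare the threat probability against it.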
+```
+## Training
+- Epochs: 5
+- Learning rate: 2e-05
+- Batch size: 16
+- Class weights: [1.0, 2.0]
+## License
+Apache-2.0
+## Citation
+```bibtex
+@misc{hikmaai-code_detection-2026,
+  title={hikmaai-codebert-base-code-detection},
+  author={HikmaAI},
+  year={2026},
+  publisher={HuggingFace},
+  url={https://huggingface.co/HikmaAI/hikmaai-codebert-base-code-detection}
+}
+```

model_card.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "classifier_name": "code_detection",
+  "version": "1.0.0",
+  "task_type": "sequence_classification",
+  "base_model": "microsoft/codebert-base",
+  "metrics": {},
+  "dataset_hash": "",
+  "created_at": "2026-05-06T14:01:09.044946Z",
+  "export_format": "onnx",
+  "quantization": "int8_dynamic"
+}

onnx/fp32/config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "architectures": [
+    "RobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.57.6",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

onnx/fp32/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

onnx/fp32/model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:976bf6e56ecbb1e0583cdf5a2b3ef0995a2da8a16d8613c681573f014bbcefd9
+size 498811644

onnx/fp32/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

onnx/fp32/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50264": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "unk_token": "<unk>"
+}

onnx/fp32/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

onnx/int8/config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "architectures": [
+    "RobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.57.6",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

onnx/int8/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

onnx/int8/model_quantized.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:828f1479e946cc2c3a08a5fdf10d33e4c5b2c21d0b4bad011079038193c5d741
+size 125465749

onnx/int8/ort_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "one_external_file": true,
+  "opset": null,
+  "optimization": {},
+  "quantization": {
+    "activations_dtype": "QUInt8",
+    "activations_symmetric": false,
+    "format": "QOperator",
+    "is_static": false,
+    "mode": "IntegerOps",
+    "nodes_to_exclude": [],
+    "nodes_to_quantize": [],
+    "operators_to_quantize": [
+      "Conv",
+      "MatMul",
+      "Attention",
+      "LSTM",
+      "Gather",
+      "Transpose",
+      "EmbedLayerNormalization"
+    ],
+    "per_channel": false,
+    "qdq_add_pair_to_weight": false,
+    "qdq_dedicated_pair": false,
+    "qdq_op_type_per_channel_support_to_axis": {
+      "MatMul": 1
+    },
+    "reduce_range": false,
+    "weights_dtype": "QInt8",
+    "weights_symmetric": true
+  },
+  "use_external_data_format": false
+}

onnx/int8/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

onnx/int8/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

onnx/int8/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50264": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "unk_token": "<unk>"
+}

onnx/int8/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

threshold.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "threshold": 0.995,
+  "val_recall": 0.9984453944811504,
+  "val_precision": 0.9988335925349923,
+  "val_f1": 0.998639455782313,
+  "val_accuracy": 0.998943715104874
+}

tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50264": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "unk_token": "<unk>"
+}