yonad2008 committed on
Commit
2c69dbe
·
verified ·
1 Parent(s): 69ba471

Upload GNN turn-level model artifacts

Browse files
Files changed (3) hide show
  1. README.md +13 -13
  2. gnn_homo_payload.pt +2 -2
  3. metadata.json +8 -8
README.md CHANGED
@@ -13,19 +13,19 @@ model-index:
13
  metrics:
14
  - name: F1
15
  type: f1
16
- value: 0.8586
17
  - name: PR-AUC
18
  type: pr_auc
19
- value: 0.9720
20
  - name: ROC-AUC
21
  type: roc_auc
22
- value: 0.9772
23
  - name: Precision
24
  type: precision
25
- value: 0.8589
26
  - name: Recall
27
  type: recall
28
- value: 0.9158
29
  ---
30
  # GNN Jailbreak Prediction Model (phi4:14b)
31
 
@@ -35,17 +35,17 @@ Homogeneous GNN classifier for unsafe/jailbreak likelihood in multi-turn convers
35
 
36
  | Metric | Value |
37
  |----------------|--------|
38
- | F1 | 0.8586 |
39
- | PR-AUC | 0.9720 |
40
- | ROC-AUC | 0.9772 |
41
- | Precision | 0.8589 |
42
- | Recall | 0.9158 |
43
- | Best Threshold | 0.390 |
44
 
45
  ## Training Details
46
 
47
  - **Target model**: `phi4:14b`
48
- - **Datasets**: harmbench
49
  - **Split column**: `goal`
50
  - **Seed**: `42`
51
  - **Sentence model**: `sentence-transformers/all-MiniLM-L6-v2`
@@ -55,4 +55,4 @@ Homogeneous GNN classifier for unsafe/jailbreak likelihood in multi-turn convers
55
 
56
  ## Dataset Size (training samples)
57
 
58
- Prepared turn-level samples: 395
 
13
  metrics:
14
  - name: F1
15
  type: f1
16
+ value: 0.9411
17
  - name: PR-AUC
18
  type: pr_auc
19
+ value: 0.9782
20
  - name: ROC-AUC
21
  type: roc_auc
22
+ value: 0.9593
23
  - name: Precision
24
  type: precision
25
+ value: 0.9682
26
  - name: Recall
27
  type: recall
28
+ value: 0.9163
29
  ---
30
  # GNN Jailbreak Prediction Model (phi4:14b)
31
 
 
35
 
36
  | Metric | Value |
37
  |----------------|--------|
38
+ | F1 | 0.9411 |
39
+ | PR-AUC | 0.9782 |
40
+ | ROC-AUC | 0.9593 |
41
+ | Precision | 0.9682 |
42
+ | Recall | 0.9163 |
43
+ | Best Threshold | 0.270 |
44
 
45
  ## Training Details
46
 
47
  - **Target model**: `phi4:14b`
48
+ - **Datasets**: harmbench, harmful_behaviors_1
49
  - **Split column**: `goal`
50
  - **Seed**: `42`
51
  - **Sentence model**: `sentence-transformers/all-MiniLM-L6-v2`
 
55
 
56
  ## Dataset Size (training samples)
57
 
58
+ Prepared turn-level samples: 707
gnn_homo_payload.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edb1c160195c31c644ebb39468cbeed6ecfb41393cebc20f32c681f6161be870
3
- size 971461
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa9ce79e481cf9181e12aaa43775f560a0d317ff65ee43a5c82a7f9172d55228
3
+ size 1454533
metadata.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "csv": "/home/digayona/multi_turn_jailbreak_RL/GNN/turns_table_llama3_8b_harmbench.csv",
3
  "target_model": "phi4:14b",
4
- "threshold": 0.39,
5
  "sentence_model_name": "sentence-transformers/all-MiniLM-L6-v2",
6
- "n_rows": 395,
7
  "n_models": 1,
8
  "split_col": "goal",
9
  "seed": 42,
10
  "turn_norm_mode": "dataset_max",
11
  "turn_norm_denom": 22.0,
12
  "session_len_norm_mode": "dataset_max",
13
- "session_len_norm_denom": 20.0,
14
  "model_kwargs": {
15
  "hidden_channels": 128,
16
  "num_layers": 2,
@@ -20,10 +20,10 @@
20
  },
21
  "use_turn_bucket_features": false,
22
  "test_metrics": {
23
- "roc_auc": 0.9772275091195899,
24
- "pr_auc": 0.9720258299076259,
25
- "f1": 0.8585849597195537,
26
- "precision": 0.8589285714285715,
27
- "recall": 0.9158333333333333
28
  }
29
  }
 
1
  {
2
  "csv": "/home/digayona/multi_turn_jailbreak_RL/GNN/turns_table_llama3_8b_harmbench.csv",
3
  "target_model": "phi4:14b",
4
+ "threshold": 0.27,
5
  "sentence_model_name": "sentence-transformers/all-MiniLM-L6-v2",
6
+ "n_rows": 707,
7
  "n_models": 1,
8
  "split_col": "goal",
9
  "seed": 42,
10
  "turn_norm_mode": "dataset_max",
11
  "turn_norm_denom": 22.0,
12
  "session_len_norm_mode": "dataset_max",
13
+ "session_len_norm_denom": 12.0,
14
  "model_kwargs": {
15
  "hidden_channels": 128,
16
  "num_layers": 2,
 
20
  },
21
  "use_turn_bucket_features": false,
22
  "test_metrics": {
23
+ "roc_auc": 0.9592919442561716,
24
+ "pr_auc": 0.9782228704301538,
25
+ "f1": 0.941144695354281,
26
+ "precision": 0.9681518630856448,
27
+ "recall": 0.9162547510892741
28
  }
29
  }