README.md CHANGED
@@ -66,7 +66,7 @@ predicted_label = torch.argmax(probabilities, dim=-1)
66
  REPO_NAME = "daxa-ai/pebblo-classifier"
67
 
68
  # Path to the label encoder file in the repository
69
- LABEL_ENCODER_FILE = "label encoder.joblib"
70
 
71
  # Construct the URL to the label encoder file
72
  url = hf_hub_url(REPO_NAME, filename=LABEL_ENCODER_FILE)
@@ -96,9 +96,9 @@ Here are the labels along with their respective counts in the dataset:
96
  | BOARD_MEETING_AGREEMENT | 4,225 |
97
  | CONSULTING_AGREEMENT | 2,965 |
98
  | CUSTOMER_LIST_AGREEMENT | 9,000 |
99
- | DISTRIBUTION_PARTNER_AGREEMENT | 8,339 |
100
  | EMPLOYEE_AGREEMENT | 3,921 |
101
- | ENTERPRISE_AGREEMENT | 3,820 |
102
  | ENTERPRISE_LICENSE_AGREEMENT | 9,000 |
103
  | EXECUTIVE_SEVERANCE_AGREEMENT | 9,000 |
104
  | FINANCIAL_REPORT_AGREEMENT | 8,381 |
@@ -107,11 +107,11 @@ Here are the labels along with their respective counts in the dataset:
107
  | LOAN_AND_SECURITY_AGREEMENT | 9,000 |
108
  | MEDICAL_ADVICE | 2,359 |
109
  | MERGER_AGREEMENT | 7,706 |
110
- | NDA_AGREEMENT | 2,966 |
111
- | NORMAL_TEXT | 6,742 |
112
  | PATENT_APPLICATION_FILLINGS_AGREEMENT | 9,000 |
113
  | PRICE_LIST_AGREEMENT | 9,000 |
114
- | SETTLEMENT_AGREEMENT | 9,000 |
115
  | SEXUAL_HARRASSMENT | 8,321 |
116
 
117
 
@@ -141,7 +141,7 @@ Here are the labels along with their respective counts in the dataset:
141
  | MEDICAL_ADVICE | 289 |
142
  | MERGER_AGREEMENT | 7,079 |
143
  | NDA_AGREEMENT | 1,452 |
144
- | NORMAL_TEXT | 1,808 |
145
  | PATENT_APPLICATION_FILLINGS_AGREEMENT | 6,177 |
146
  | PRICE_LIST_AGREEMENT | 5,453 |
147
  | SETTLEMENT_AGREEMENT | 5,806 |
@@ -153,33 +153,37 @@ Here are the labels along with their respective counts in the dataset:
153
 
154
  | Agreement Type | precision | recall | f1-score | support |
155
  | ------------------------------------------- | --------- | ------ | -------- | ------- |
156
- | BOARD_MEETING_AGREEMENT | 0.93 | 0.95 | 0.94 | 4335 |
157
- | CONSULTING_AGREEMENT | 0.72 | 0.98 | 0.84 | 1593 |
158
- | CUSTOMER_LIST_AGREEMENT | 0.64 | 0.82 | 0.72 | 4335 |
159
- | DISTRIBUTION_PARTNER_AGREEMENT | 0.83 | 0.47 | 0.61 | 7231 |
160
- | EMPLOYEE_AGREEMENT | 0.78 | 0.92 | 0.85 | 1333 |
161
- | ENTERPRISE_AGREEMENT | 0.29 | 0.40 | 0.34 | 1616 |
162
- | ENTERPRISE_LICENSE_AGREEMENT | 0.88 | 0.79 | 0.83 | 5574 |
163
- | EXECUTIVE_SERVICE_AGREEMENT | 0.92 | 0.85 | 0.89 | 8177 |
164
- | FINANCIAL_REPORT_AGREEMENT | 0.89 | 0.98 | 0.93 | 4264 |
165
- | HARMFUL_ADVICE | 0.79 | 0.95 | 0.86 | 474 |
166
- | INTERNAL_PRODUCT_ROADMAP_AGREEMENT | 0.91 | 0.98 | 0.94 | 4116 |
167
- | LOAN_AND_SECURITY_AGREEMENT | 0.77 | 0.98 | 0.86 | 6354 |
168
- | MEDICAL_ADVICE | 0.81 | 0.99 | 0.89 | 289 |
169
- | MERGER_AGREEMENT | 0.89 | 0.77 | 0.83 | 7279 |
170
- | NDA_AGREEMENT | 0.70 | 0.57 | 0.62 | 1452 |
171
- | NORMAL_TEXT | 0.79 | 0.97 | 0.87 | 1888 |
172
  | PATENT_APPLICATION_FILLINGS_AGREEMENT | 0.95 | 0.99 | 0.97 | 6177 |
173
- | PRICE_LIST_AGREEMENT | 0.60 | 0.75 | 0.67 | 5565 |
174
- | SETTLEMENT_AGREEMENT | 0.82 | 0.54 | 0.65 | 5843 |
175
- | SEXUAL_HARASSMENT | 0.97 | 0.94 | 0.95 | 440 |
176
  | | | | | |
177
- | accuracy | | | 0.79 | 82916 |
178
- | macro avg | 0.79 | 0.83 | 0.80 | 82916 |
179
- | weighted avg | 0.83 | 0.81 | 0.81 | 82916 |
180
 
181
 
182
  #### Results
183
 
184
- The model's performance is summarized by precision, recall, and f1-score metrics, which are detailed across all 20 labels in the dataset. The accuracy stands at 0.79 for the entire test set, with a macro average and weighted average of precision, recall, and f1-score around 0.80 and 0.81, respectively.
 
 
 
 
185
 
 
66
  REPO_NAME = "daxa-ai/pebblo-classifier"
67
 
68
  # Path to the label encoder file in the repository
69
+ LABEL_ENCODER_FILE = "label_encoder.joblib"
70
 
71
  # Construct the URL to the label encoder file
72
  url = hf_hub_url(REPO_NAME, filename=LABEL_ENCODER_FILE)
 
96
  | BOARD_MEETING_AGREEMENT | 4,225 |
97
  | CONSULTING_AGREEMENT | 2,965 |
98
  | CUSTOMER_LIST_AGREEMENT | 9,000 |
99
+ | DISTRIBUTION_PARTNER_AGREEMENT | 5,162 |
100
  | EMPLOYEE_AGREEMENT | 3,921 |
101
+ | ENTERPRISE_AGREEMENT | 4,217 |
102
  | ENTERPRISE_LICENSE_AGREEMENT | 9,000 |
103
  | EXECUTIVE_SEVERANCE_AGREEMENT | 9,000 |
104
  | FINANCIAL_REPORT_AGREEMENT | 8,381 |
 
107
  | LOAN_AND_SECURITY_AGREEMENT | 9,000 |
108
  | MEDICAL_ADVICE | 2,359 |
109
  | MERGER_AGREEMENT | 7,706 |
110
+ | NDA_AGREEMENT | 5,229 |
111
+ | NORMAL_TEXT | 9,000 |
112
  | PATENT_APPLICATION_FILLINGS_AGREEMENT | 9,000 |
113
  | PRICE_LIST_AGREEMENT | 9,000 |
114
+ | SETTLEMENT_AGREEMENT | 3,754 |
115
  | SEXUAL_HARRASSMENT | 8,321 |
116
 
117
 
 
141
  | MEDICAL_ADVICE | 289 |
142
  | MERGER_AGREEMENT | 7,079 |
143
  | NDA_AGREEMENT | 1,452 |
144
+ | NORMAL_TEXT | 8,335 |
145
  | PATENT_APPLICATION_FILLINGS_AGREEMENT | 6,177 |
146
  | PRICE_LIST_AGREEMENT | 5,453 |
147
  | SETTLEMENT_AGREEMENT | 5,806 |
 
153
 
154
  | Agreement Type | precision | recall | f1-score | support |
155
  | ------------------------------------------- | --------- | ------ | -------- | ------- |
156
+ | BOARD_MEETING_AGREEMENT | 0.96 | 0.94 | 0.95 | 4335 |
157
+ | CONSULTING_AGREEMENT | 0.77 | 0.89 | 0.83 | 1533 |
158
+ | CUSTOMER_LIST_AGREEMENT | 0.84 | 0.87 | 0.85 | 4995 |
159
+ | DISTRIBUTION_PARTNER_AGREEMENT | 0.71 | 0.64 | 0.67 | 7231 |
160
+ | EMPLOYEE_AGREEMENT | 0.78 | 0.90 | 0.83 | 1433 |
161
+ | ENTERPRISE_AGREEMENT | 0.19 | 0.72 | 0.30 | 1616 |
162
+ | ENTERPRISE_LICENSE_AGREEMENT | 0.92 | 0.78 | 0.84 | 8574 |
163
+ | EXECUTIVE_SEVERANCE_AGREEMENT | 0.96 | 0.85 | 0.90 | 5177 |
164
+ | FINANCIAL_REPORT_AGREEMENT | 0.92 | 0.98 | 0.95 | 4264 |
165
+ | HARMFUL_ADVICE | 0.82 | 0.92 | 0.87 | 474 |
166
+ | INTERNAL_PRODUCT_ROADMAP_AGREEMENT | 0.94 | 0.97 | 0.96 | 4116 |
167
+ | LOAN_AND_SECURITY_AGREEMENT | 0.92 | 0.96 | 0.94 | 6354 |
168
+ | MEDICAL_ADVICE | 0.76 | 1.00 | 0.86 | 289 |
169
+ | MERGER_AGREEMENT | 0.90 | 0.55 | 0.68 | 7079 |
170
+ | NDA_AGREEMENT | 0.62 | 0.89 | 0.74 | 1452 |
171
+ | NORMAL_TEXT | 0.99 | 0.99 | 0.99 | 6049 |
172
  | PATENT_APPLICATION_FILLINGS_AGREEMENT | 0.95 | 0.99 | 0.97 | 6177 |
173
+ | PRICE_LIST_AGREEMENT | 0.81 | 0.75 | 0.78 | 5453 |
174
+ | SETTLEMENT_AGREEMENT | 0.83 | 0.73 | 0.78 | 5806 |
175
+ | SEXUAL_HARRASSMENT | 0.98 | 0.93 | 0.96 | 4750 |
176
  | | | | | |
177
+ | accuracy | | | 0.84 | 87157 |
178
+ | macro avg | 0.83 | 0.86 | 0.83 | 87157 |
179
+ | weighted avg | 0.87 | 0.84 | 0.85 | 87157 |
180
 
181
 
182
  #### Results
183
 
184
+ The model's performance is summarized by precision, recall, and f1-score metrics, which are detailed across all 20 labels in the dataset. Based on the test data evaluation results, the model achieved an accuracy of 0.8376, a weighted-average precision of 0.8744, and a weighted-average recall of 0.8376. The weighted-average F1-score (the support-weighted mean of the per-label F1-scores, each of which is the harmonic mean of that label's precision and recall) stands at 0.8478.
185
+
186
+ The evaluation loss, which measures the discrepancy between the model’s predictions and the actual values, is 0.5616. Lower loss values indicate better model performance.
187
+
188
+ During the evaluation, the model processed approximately 101.886 samples per second and approximately 0.796 evaluation steps per second, for a total runtime of 855.4327 seconds.
189
 
config.json CHANGED
@@ -9,54 +9,51 @@
9
  "dropout": 0.1,
10
  "hidden_dim": 3072,
11
  "id2label": {
12
- "0": "BOARD_MEETING_AGREEMENT",
13
- "1": "CONSULTING_AGREEMENT",
14
- "2": "CUSTOMER_LIST_AGREEMENT",
15
- "3": "DISTRIBUTION_PARTNER_AGREEMENT",
16
- "4": "ENTERPRISE_LICENSE_AGREEMENT",
17
- "5": "EXECUTIVE_SEVERANCE_AGREEMENT",
18
- "6": "FINANCIAL_REPORT_AGREEMENT",
19
- "7": "HARMFUL_ADVICE",
20
- "8": "INTERNAL_USE_ONLY_AGREEMENT",
21
- "9": "LOAN_AND_SECURITY_AGREEMENT",
22
- "10": "MEDICAL_ADVICE",
23
- "11": "MERGER_AGREEMENT",
24
- "12": "NDA_AGREEMENT",
25
- "13": "NORMAL_TEXT",
26
- "14": "PATENT_APPLICATION_FILLINGS_AGREEMENT",
27
- "15": "PRICE_LIST_AGREEMENT",
28
- "16": "SECRET_SAUCE_AGREEMENT",
29
- "17": "SECURITY_BREACH_AGREEMENT",
30
- "18": "SETTLEMENT_AGREEMENT",
31
- "19": "SEXUAL_HARRASSMENT_AGREEMENT",
32
- "20": "EMPLOYEE_AGREEMENT",
33
- "21": "ENTERPRISE_AGREEMENT"
34
- },
35
  "initializer_range": 0.02,
36
  "label2id": {
37
- "BOARD_MEETING_AGREEMENT": 0,
38
- "CONSULTING_AGREEMENT": 1,
39
- "MEDICAL_ADVICE": 10,
40
- "MERGER_AGREEMENT": 11,
41
- "NDA_AGREEMENT": 12,
42
- "NORMAL_TEXT": 13,
43
- "PATENT_APPLICATION_FILLINGS_AGREEMENT": 14,
44
- "PRICE_LIST_AGREEMENT": 15,
45
- "SECRET_SAUCE_AGREEMENT": 16,
46
- "SECURITY_BREACH_AGREEMENT": 17,
47
- "SETTLEMENT_AGREEMENT": 18,
48
- "SEXUAL_HARRASSMENT_AGREEMENT": 19,
49
- "CUSTOMER_LIST_AGREEMENT": 2,
50
- "EMPLOYEE_AGREEMENT": 20,
51
- "ENTERPRISE_AGREEMENT": 21,
52
- "DISTRIBUTION_PARTNER_AGREEMENT": 3,
53
- "ENTERPRISE_LICENSE_AGREEMENT": 4,
54
- "EXECUTIVE_SEVERANCE_AGREEMENT": 5,
55
- "FINANCIAL_REPORT_AGREEMENT": 6,
56
- "HARMFUL_ADVICE": 7,
57
- "INTERNAL_USE_ONLY_AGREEMENT": 8,
58
- "LOAN_AND_SECURITY_AGREEMENT": 9
59
- },
60
  "max_position_embeddings": 512,
61
  "model_type": "distilbert",
62
  "n_heads": 12,
@@ -70,4 +67,3 @@
70
  "transformers_version": "4.36.2",
71
  "vocab_size": 30522
72
  }
73
-
 
9
  "dropout": 0.1,
10
  "hidden_dim": 3072,
11
  "id2label": {
12
+
13
+ "0": "BOARD_MEETING_AGREEMENT",
14
+ "1": "CONSULTING_AGREEMENT",
15
+ "2": "CUSTOMER_LIST_AGREEMENT",
16
+ "3": "DISTRIBUTION_PARTNER_AGREEMENT",
17
+ "4": "EMPLOYEE_AGREEMENT",
18
+ "5": "ENTERPRISE_AGREEMENT",
19
+ "6": "ENTERPRISE_LICENSE_AGREEMENT",
20
+ "7": "EXECUTIVE_SEVERANCE_AGREEMENT",
21
+ "8": "FINANCIAL_REPORT_AGREEMENT",
22
+ "9": "HARMFUL_ADVICE",
23
+ "10": "INTERNAL_PRODUCT_ROADMAP_AGREEMENT",
24
+ "11": "LOAN_AND_SECURITY_AGREEMENT",
25
+ "12": "MEDICAL_ADVICE",
26
+ "13": "MERGER_AGREEMENT",
27
+ "14": "NDA_AGREEMENT",
28
+ "15": "NORMAL_TEXT",
29
+ "16": "PATENT_APPLICATION_FILLINGS_AGREEMENT",
30
+ "17": "PRICE_LIST_AGREEMENT",
31
+ "18": "SETTLEMENT_AGREEMENT",
32
+ "19": "SEXUAL_HARRASSMENT"
33
+ },
 
34
  "initializer_range": 0.02,
35
  "label2id": {
36
+ "BOARD_MEETING_AGREEMENT": 0,
37
+ "CONSULTING_AGREEMENT": 1,
38
+ "INTERNAL_PRODUCT_ROADMAP_AGREEMENT": 10,
39
+ "LOAN_AND_SECURITY_AGREEMENT": 11,
40
+ "MEDICAL_ADVICE": 12,
41
+ "MERGER_AGREEMENT": 13,
42
+ "NDA_AGREEMENT": 14,
43
+ "NORMAL_TEXT": 15,
44
+ "PATENT_APPLICATION_FILLINGS_AGREEMENT": 16,
45
+ "PRICE_LIST_AGREEMENT": 17,
46
+ "SETTLEMENT_AGREEMENT": 18,
47
+ "SEXUAL_HARRASSMENT": 19,
48
+ "CUSTOMER_LIST_AGREEMENT": 2,
49
+ "DISTRIBUTION_PARTNER_AGREEMENT": 3,
50
+ "EMPLOYEE_AGREEMENT": 4,
51
+ "ENTERPRISE_AGREEMENT": 5,
52
+ "ENTERPRISE_LICENSE_AGREEMENT": 6,
53
+ "EXECUTIVE_SEVERANCE_AGREEMENT": 7,
54
+ "FINANCIAL_REPORT_AGREEMENT": 8,
55
+ "HARMFUL_ADVICE": 9
56
+ },
 
 
57
  "max_position_embeddings": 512,
58
  "model_type": "distilbert",
59
  "n_heads": 12,
 
67
  "transformers_version": "4.36.2",
68
  "vocab_size": 30522
69
  }
 
label encoder.joblib → label_encoder.joblib RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aed62c86044052d34301175575b4ec585383ad9cf7c1177a8372e0161c1f8fb4
3
- size 1057
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:679d6eddc6f1fc6f4f4f58df6d284bf455024e8273567b22557de19dfc8753bb
3
+ size 1099
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ce2513da6968cd9aa84adb3a032d6e3cbdd00db82da188bc31a155b32485eb1
3
- size 268216125
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c110ff2add5c7adf6aadaa01e0e14ce8e140ede610307633fb7172e066fa42fc
3
+ size 268209725