ibaucells committed on
Commit
fd5e291
·
1 Parent(s): 70a22b8

Roberta-base-ca-v2 trained with the new version of the TeCla dataset (v2).

Files changed (5) hide show
  1. README.md +11 -11
  2. config.json +110 -40
  3. pytorch_model.bin +2 -2
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +1 -1
README.md CHANGED
@@ -34,7 +34,7 @@ model-index:
34
  metrics:
35
  - name: Accuracy
36
  type: accuracy
37
- value: 0.7426
38
 
39
  widget:
40
 
@@ -107,7 +107,7 @@ At the time of submission, no measures have been taken to estimate the bias embe
107
  ## Training
108
 
109
  ### Training data
110
- We used the TC dataset in Catalan called [TeCla](https://huggingface.co/datasets/projecte-aina/tecla) for training and evaluation.
111
 
112
  ### Training procedure
113
  The model was trained with a batch size of 16 and a learning rate of 5e-5 for 5 epochs. We then selected the best checkpoint using the downstream task metric in the corresponding development set and then evaluated it on the test set.
@@ -116,17 +116,17 @@ The model was trained with a batch size of 16 and a learning rate of 5e-5 for 5
116
 
117
  ### Variable and metrics
118
 
119
- This model was finetuned maximizing accuracy.
120
 
121
  ## Evaluation results
122
- We evaluated the _roberta-base-ca-v2-cased-tc_ on the TeCla test set against standard multilingual and monolingual baselines:
123
-
124
- | Model | TeCla (Accuracy) |
125
- | ------------|:-------------|
126
- | roberta-base-ca-v2-cased-tc | **74.26** |
127
- | roberta-base-ca-cased-tc | 73.65 |
128
- | mBERT | 69.90 |
129
- | XLM-RoBERTa | 70.14 |
130
 
131
  For more details, check the fine-tuning and evaluation scripts in the official [GitHub repository](https://github.com/projecte-aina/club).
132
 
 
34
  metrics:
35
  - name: Accuracy
36
  type: accuracy
37
+ value: 0.8034
38
 
39
  widget:
40
 
 
107
  ## Training
108
 
109
  ### Training data
110
+ We used the Catalan TC dataset [TeCla](https://huggingface.co/datasets/projecte-aina/tecla) for training and evaluation. Although TeCla includes both a coarse-grained ('label1') and a fine-grained ('label2') categorization, only the latter, with 53 classes, was used for training.
111
 
112
  ### Training procedure
113
  The model was trained with a batch size of 16 and a learning rate of 5e-5 for 5 epochs. We then selected the best checkpoint using the downstream task metric in the corresponding development set and then evaluated it on the test set.
 
116
 
117
  ### Variable and metrics
118
 
119
+ This model was fine-tuned to maximize the weighted F1 score.
120
 
121
  ## Evaluation results
122
+ We evaluated _roberta-base-ca-v2-cased-tc_ on the TeCla test set against standard multilingual and monolingual baselines. The results for the 'label1' categories were obtained by mapping each fine-grained category ('label2') to its corresponding coarse-grained category ('label1').
123
+
124
+ | Model | TeCla - label1 (Accuracy) | TeCla - label2 (Accuracy) |
125
+ | ------------|:-------------|:-------------|
126
+ | roberta-base-ca-v2 | 96.31 | 80.34 |
127
+ | roberta-large-ca-v2 | **96.51** | **80.68** |
128
+ | mBERT | 95.72 | 78.47 |
129
+ | XLM-RoBERTa | 95.66 | 78.01 |
130
 
131
  For more details, check the fine-tuning and evaluation scripts in the official [GitHub repository](https://github.com/projecte-aina/club).
132
 
config.json CHANGED
@@ -1,10 +1,11 @@
1
  {
2
- "_name_or_path": "projecte-aina/roberta-base-ca-v2-cased-tc",
3
  "architectures": [
4
  "RobertaForSequenceClassification"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
  "bos_token_id": 0,
 
8
  "eos_token_id": 2,
9
  "finetuning_task": "tecla",
10
  "gradient_checkpointing": false,
@@ -12,48 +13,116 @@
12
  "hidden_dropout_prob": 0.1,
13
  "hidden_size": 768,
14
  "id2label": {
15
- "0": "Medi ambient",
16
- "1": "Societat",
17
- "2": "Policial",
18
- "3": "Judicial",
19
- "4": "Empresa",
20
- "5": "Partits",
21
- "6": "Política",
22
- "7": "Successos",
23
- "8": "Salut",
24
- "9": "Infraestructures",
25
- "10": "Parlament",
26
- "11": "Música",
27
- "12": "Govern",
28
- "13": "Unió Europea",
29
- "14": "Economia",
30
- "15": "Mobilitat",
31
- "16": "Treball",
32
- "17": "Cultura",
33
- "18": "Educació"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  },
35
  "initializer_range": 0.02,
36
  "intermediate_size": 3072,
37
  "label2id": {
38
- "Medi ambient": 0,
39
- "Societat": 1,
40
- "Policial": 2,
41
- "Judicial": 3,
42
- "Empresa": 4,
43
- "Partits": 5,
44
- "Política": 6,
45
- "Successos": 7,
46
- "Salut": 8,
47
- "Infraestructures": 9,
48
- "Parlament": 10,
49
- "Música": 11,
50
- "Govern": 12,
51
- "Unió Europea": 13,
52
- "Economia": 14,
53
- "Mobilitat": 15,
54
- "Treball": 16,
55
- "Cultura": 17,
56
- "Educació": 18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  },
58
  "layer_norm_eps": 1e-05,
59
  "max_position_embeddings": 514,
@@ -63,7 +132,8 @@
63
  "pad_token_id": 1,
64
  "position_embedding_type": "absolute",
65
  "problem_type": "single_label_classification",
66
- "transformers_version": "4.6.1",
 
67
  "type_vocab_size": 1,
68
  "use_cache": true,
69
  "vocab_size": 50262
 
1
  {
2
+ "_name_or_path": "/gpfs/projects/bsc88/projects/catalan_evaluation/models/roberta-base-ca-v2",
3
  "architectures": [
4
  "RobertaForSequenceClassification"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
  "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
  "eos_token_id": 2,
10
  "finetuning_task": "tecla",
11
  "gradient_checkpointing": false,
 
13
  "hidden_dropout_prob": 0.1,
14
  "hidden_size": 768,
15
  "id2label": {
16
+ "0": "Llengua",
17
+ "1": "Infraestructures",
18
+ "2": "Arts",
19
+ "3": "Parlament",
20
+ "4": "Noves tecnologies",
21
+ "5": "Castells",
22
+ "6": "Successos",
23
+ "7": "Empresa",
24
+ "8": "Mobilitat",
25
+ "9": "Teatre",
26
+ "10": "Treball",
27
+ "11": "Log\u00edstica",
28
+ "12": "Urbanisme",
29
+ "13": "Govern",
30
+ "14": "Entitats",
31
+ "15": "Finances",
32
+ "16": "Govern espanyol",
33
+ "17": "Tr\u00e0nsit",
34
+ "18": "Ind\u00fastria",
35
+ "19": "Esports",
36
+ "20": "Exteriors",
37
+ "21": "Medi ambient",
38
+ "22": "Habitatge",
39
+ "23": "Salut",
40
+ "24": "Equipaments i patrimoni",
41
+ "25": "Recerca",
42
+ "26": "Cooperaci\u00f3",
43
+ "27": "Innovaci\u00f3",
44
+ "28": "Agroalimentaci\u00f3",
45
+ "29": "Policial",
46
+ "30": "Serveis Socials",
47
+ "31": "Cinema",
48
+ "32": "Mem\u00f2ria hist\u00f2rica",
49
+ "33": "Turisme",
50
+ "34": "Pol\u00edtica municipal",
51
+ "35": "Comer\u00e7",
52
+ "36": "Universitats",
53
+ "37": "Hisenda",
54
+ "38": "Judicial",
55
+ "39": "Partits",
56
+ "40": "M\u00fasica",
57
+ "41": "Lletres",
58
+ "42": "Religi\u00f3",
59
+ "43": "Festa i cultura popular",
60
+ "44": "Uni\u00f3 Europea",
61
+ "45": "Moda",
62
+ "46": "Moviments socials",
63
+ "47": "Comptes p\u00fablics",
64
+ "48": "Immigraci\u00f3",
65
+ "49": "Educaci\u00f3",
66
+ "50": "Gastronomia",
67
+ "51": "Meteorologia",
68
+ "52": "Energia"
69
  },
70
  "initializer_range": 0.02,
71
  "intermediate_size": 3072,
72
  "label2id": {
73
+ "Agroalimentaci\u00f3": 28,
74
+ "Arts": 2,
75
+ "Castells": 5,
76
+ "Cinema": 31,
77
+ "Comer\u00e7": 35,
78
+ "Comptes p\u00fablics": 47,
79
+ "Cooperaci\u00f3": 26,
80
+ "Educaci\u00f3": 49,
81
+ "Empresa": 7,
82
+ "Energia": 52,
83
+ "Entitats": 14,
84
+ "Equipaments i patrimoni": 24,
85
+ "Esports": 19,
86
+ "Exteriors": 20,
87
+ "Festa i cultura popular": 43,
88
+ "Finances": 15,
89
+ "Gastronomia": 50,
90
+ "Govern": 13,
91
+ "Govern espanyol": 16,
92
+ "Habitatge": 22,
93
+ "Hisenda": 37,
94
+ "Immigraci\u00f3": 48,
95
+ "Ind\u00fastria": 18,
96
+ "Infraestructures": 1,
97
+ "Innovaci\u00f3": 27,
98
+ "Judicial": 38,
99
+ "Llengua": 0,
100
+ "Lletres": 41,
101
+ "Log\u00edstica": 11,
102
+ "Medi ambient": 21,
103
+ "Mem\u00f2ria hist\u00f2rica": 32,
104
+ "Meteorologia": 51,
105
+ "Mobilitat": 8,
106
+ "Moda": 45,
107
+ "Moviments socials": 46,
108
+ "M\u00fasica": 40,
109
+ "Noves tecnologies": 4,
110
+ "Parlament": 3,
111
+ "Partits": 39,
112
+ "Policial": 29,
113
+ "Pol\u00edtica municipal": 34,
114
+ "Recerca": 25,
115
+ "Religi\u00f3": 42,
116
+ "Salut": 23,
117
+ "Serveis Socials": 30,
118
+ "Successos": 6,
119
+ "Teatre": 9,
120
+ "Treball": 10,
121
+ "Tr\u00e0nsit": 17,
122
+ "Turisme": 33,
123
+ "Universitats": 36,
124
+ "Uni\u00f3 Europea": 44,
125
+ "Urbanisme": 12
126
  },
127
  "layer_norm_eps": 1e-05,
128
  "max_position_embeddings": 514,
 
132
  "pad_token_id": 1,
133
  "position_embedding_type": "absolute",
134
  "problem_type": "single_label_classification",
135
+ "torch_dtype": "float32",
136
+ "transformers_version": "4.17.0",
137
  "type_vocab_size": 1,
138
  "use_cache": true,
139
  "vocab_size": 50262
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5da19d73d770844b657a2e4489d742af5e843a480c8b5fc46cbb61d1a6698e2
3
- size 498717165
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4e13e309d6f6b36be7736992f9164db10e421c8abadf61128f44759237fd686
3
+ size 498822701
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": true, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "max_len": 512, "special_tokens_map_file": null, "name_or_path": "/gpfs/projects/bsc88/BERTs/models/roberta_base_ca_jsc/transformed_lr0.0005"}
 
1
+ {"errors": "replace", "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": true, "trim_offsets": true, "max_len": 512, "special_tokens_map_file": null, "name_or_path": "/gpfs/projects/bsc88/projects/catalan_evaluation/models/roberta-base-ca-v2", "tokenizer_class": "RobertaTokenizer"}