sdocio commited on
Commit
480bb57
1 Parent(s): e2f3078

Initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,74 @@
1
  ---
 
2
  license: gpl-3.0
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language: es
3
  license: gpl-3.0
4
+ tags:
5
+ - PyTorch
6
+ - Transformers
7
+ - Token Classification
8
+ - xlm-roberta
9
+ - xlm-roberta-large
10
+ widget:
11
+ - text: "Fue antes de llegar a Sigüeiro, en el Camino de Santiago."
12
+ - text: "Si te metes en el Franco desde la Alameda, vas hacia la Catedral."
13
+ - text: "Y allí precisamente es Santiago el patrón del pueblo."
14
+ model-index:
15
+ - name: es_trf_ner_cds_xlm-large
16
+ results: []
17
  ---
18
+
19
+ # Introduction
20
+
21
+ This model is a fine-tuned version of [xlm-roberta-large](https://huggingface.co/xlm-roberta-large) for Named-Entity Recognition, in the domain of tourism related to the Way of Saint James (Camino de Santiago). It recognizes four types of entities: locations (LOC), organizations (ORG), persons (PER) and miscellaneous (MISC).
22
+
23
+ ## Usage
24
+
25
+ You can use this model with Transformers *pipeline* for NER.
26
+
27
+ ```python
28
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
29
+
30
+ tokenizer = AutoTokenizer.from_pretrained("es_trf_ner_cds_xlm-large")
31
+ model = AutoModelForTokenClassification.from_pretrained("es_trf_ner_cds_xlm-large")
32
+
33
+ example = "Fue antes de llegar a Sigüeiro, en el Camino de Santiago. Si te metes en el Franco desde la Alameda, vas hacia la Catedral. Y allí precisamente es Santiago el patrón del pueblo."
34
+ ner_pipe = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
35
+
36
+ for ent in ner_pipe(example):
37
+ print(ent)
38
+ ```
39
+
40
+ ## Dataset
41
+
42
+ ToDo
43
+
44
+ ## Model performance
45
+
46
+ entity|precision|recall|f1
47
+ -|-|-|-
48
+ LOC|0.973|0.983|0.978
49
+ MISC|0.760|0.788|0.773
50
+ ORG|0.885|0.701|0.783
51
+ PER|0.937|0.878|0.906
52
+ micro avg|0.953|0.958|0.955
53
+ macro avg|0.889|0.838|0.860
54
+ weighted avg|0.953|0.958|0.955
55
+
56
+ ## Training procedure
57
+
58
+ ### Training hyperparameters
59
+
60
+ The following hyperparameters were used during training:
61
+ - learning_rate: 5e-05
62
+ - train_batch_size: 32
63
+ - eval_batch_size: 8
64
+ - seed: 42
65
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
66
+ - lr_scheduler_type: linear
67
+ - num_epochs: 3.0
68
+
69
+ ### Framework versions
70
+
71
+ - Transformers 4.28.1
72
+ - Pytorch 2.0.1+cu117
73
+ - Datasets 2.12.0
74
+ - Tokenizers 0.13.3
all_results.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.9979369961144651,
4
+ "eval_f1": 0.9566217926590725,
5
+ "eval_loss": 0.009228814393281937,
6
+ "eval_precision": 0.9547542489664677,
7
+ "eval_recall": 0.9584966566751211,
8
+ "eval_runtime": 38.1835,
9
+ "eval_samples": 15178,
10
+ "eval_samples_per_second": 397.502,
11
+ "eval_steps_per_second": 49.707,
12
+ "train_loss": 0.08099352212526335,
13
+ "train_runtime": 1003.7611,
14
+ "train_samples": 45533,
15
+ "train_samples_per_second": 136.087,
16
+ "train_steps_per_second": 4.253
17
+ }
config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "xlm-roberta-large",
3
+ "architectures": [
4
+ "XLMRobertaForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "finetuning_task": "ner",
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 1024,
14
+ "id2label": {
15
+ "0": "B-LOC",
16
+ "1": "B-MISC",
17
+ "2": "B-ORG",
18
+ "3": "B-PER",
19
+ "4": "I-LOC",
20
+ "5": "I-MISC",
21
+ "6": "I-ORG",
22
+ "7": "I-PER",
23
+ "8": "O"
24
+ },
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 4096,
27
+ "label2id": {
28
+ "B-LOC": 0,
29
+ "B-MISC": 1,
30
+ "B-ORG": 2,
31
+ "B-PER": 3,
32
+ "I-LOC": 4,
33
+ "I-MISC": 5,
34
+ "I-ORG": 6,
35
+ "I-PER": 7,
36
+ "O": 8
37
+ },
38
+ "layer_norm_eps": 1e-05,
39
+ "max_position_embeddings": 514,
40
+ "model_type": "xlm-roberta",
41
+ "num_attention_heads": 16,
42
+ "num_hidden_layers": 24,
43
+ "output_past": true,
44
+ "pad_token_id": 1,
45
+ "position_embedding_type": "absolute",
46
+ "torch_dtype": "float32",
47
+ "transformers_version": "4.28.1",
48
+ "type_vocab_size": 1,
49
+ "use_cache": true,
50
+ "vocab_size": 250002
51
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c2b10e42f3d811754eebae3495d6a247824a93bbce5707c5ab0b6f198c99725
3
+ size 2235539565
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1edb0658cb47689db5cf78194ebe041bba3b6b775d1f1069fc9501b372d4acb0
3
+ size 17082758
tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "mask_token": {
7
+ "__type": "AddedToken",
8
+ "content": "<mask>",
9
+ "lstrip": true,
10
+ "normalized": true,
11
+ "rstrip": false,
12
+ "single_word": false
13
+ },
14
+ "model_max_length": 512,
15
+ "pad_token": "<pad>",
16
+ "sep_token": "</s>",
17
+ "tokenizer_class": "XLMRobertaTokenizer",
18
+ "unk_token": "<unk>"
19
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81ff6af0468d14857cf0bc6096131bad61a715d6c50507564e63a69aa2380138
3
+ size 3579