add model and config files

Browse files

Files changed (6) hide show

README.md +82 -0
classes.txt +10 -0
config.json +75 -0
pytorch_model.bin +3 -0
special_tokens_map.json +8 -0
tokenizer_config.json +15 -0

README.md CHANGED Viewed

@@ -1,3 +1,85 @@
 ---
 license: mit
 ---

 ---
 license: mit
+language:
+- pt
 ---
+# bertimbau-large-ner-selective
+This model card aims to simplify the use of [Portuguese BERT, a.k.a. BERTimbau](https://github.com/neuralmind-ai/portuguese-bert) for the Named Entity Recognition (NER) task.
+For this model card, we used the <mark style="background-color: #d3d3d3"> **BERT-CRF (total scenario, 10 classes)** </mark> model available in the [ner_evaluation](https://github.com/neuralmind-ai/portuguese-bert/tree/master/ner_evaluation) folder of the original BERTimbau repository.
+Available classes are:
++ PESSOA
++ ORGANIZACAO
++ LOCAL
++ TEMPO
++ VALOR
++ ABSTRACCAO
++ ACONTECIMENTO
++ COISA
++ OBRA
++ OUTRO
+## Usage
+```
+# Load model directly
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+tokenizer = AutoTokenizer.from_pretrained("marquesafonso/bertimbau-large-ner-selective")
+model = AutoModelForTokenClassification.from_pretrained("marquesafonso/bertimbau-large-ner-selective")
+```
+## Example
+```
+from transformers import pipeline
+pipe = pipeline("ner", model="marquesafonso/bertimbau-large-ner-selective", aggregation_strategy='simple')
+sentence = "Acima de Ederson, abaixo de Rúben Dias. É entre os dois jogadores do Manchester City que se vai colocar Gonçalo Ramos no ranking de vendas mais avultadas do Benfica."
+result = pipe([sentence])
+print(f"{sentence}\n{result}")
+# Acima de Ederson, abaixo de Rúben Dias. É entre os dois jogadores do Manchester City que se vai colocar Gonçalo Ramos no ranking de vendas mais avultadas do Benfica.
+# [[
+#     {'entity_group': 'PESSOA', 'score': 0.99694395, 'word': 'Ederson', 'start': 9, 'end': 16},
+#     {'entity_group': 'PESSOA', 'score': 0.9918462, 'word': 'Rúben Dias', 'start': 28, 'end': 38},
+#     {'entity_group': 'ORGANIZACAO', 'score': 0.96376556, 'word': 'Manchester City', 'start': 69, 'end': 84},
+#     {'entity_group': 'PESSOA', 'score': 0.9993823, 'word': 'Gonçalo Ramos', 'start': 104, 'end': 117},
+#     {'entity_group': 'ORGANIZACAO', 'score': 0.9033079, 'word': 'Benfica', 'start': 157, 'end': 164}
+# ]]
+```
+## Acknowledgements
+This work is an adaptation of [Portuguese BERT, a.k.a. BERTimbau](https://github.com/neuralmind-ai/portuguese-bert). You may check and/or cite their [work](http://arxiv.org/abs/1909.10649):
+```
+@InProceedings{souza2020bertimbau,
+    author="Souza, F{\'a}bio and Nogueira, Rodrigo and Lotufo, Roberto",
+    editor="Cerri, Ricardo and Prati, Ronaldo C.",
+    title="BERTimbau: Pretrained BERT Models for Brazilian Portuguese",
+    booktitle="Intelligent Systems",
+    year="2020",
+    publisher="Springer International Publishing",
+    address="Cham",
+    pages="403--417",
+    isbn="978-3-030-61377-8"
+}
+@article{souza2019portuguese,
+    title={Portuguese Named Entity Recognition using BERT-CRF},
+    author={Souza, F{\'a}bio and Nogueira, Rodrigo and Lotufo, Roberto},
+    journal={arXiv preprint arXiv:1909.10649},
+    url={http://arxiv.org/abs/1909.10649},
+    year={2019}
+}
+```
+Note that the authors - Fabio Capuano de Souza, Rodrigo Nogueira, Roberto de Alencar Lotufo - released their work under the MIT License.

classes.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+PESSOA
+ORGANIZACAO
+LOCAL
+TEMPO
+VALOR
+ABSTRACCAO
+ACONTECIMENTO
+COISA
+OBRA
+OUTRO

config.json ADDED Viewed

	@@ -0,0 +1,75 @@

+{
+  "_name_or_path": "neuralmind/bert-large-portuguese-cased",
+  "architectures": [
+    "BertForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "O",
+    "1": "B-PESSOA",
+    "2": "I-PESSOA",
+    "3": "B-ORGANIZACAO",
+    "4": "I-ORGANIZACAO",
+    "5": "B-LOCAL",
+    "6": "I-LOCAL",
+    "7": "B-TEMPO",
+    "8": "I-TEMPO",
+    "9": "B-VALOR",
+    "10": "I-VALOR",
+    "11": "B-ABSTRACCAO",
+    "12": "I-ABSTRACCAO",
+    "13": "B-ACONTECIMENTO",
+    "14": "I-ACONTECIMENTO",
+    "15": "B-COISA",
+    "16": "I-COISA",
+    "17": "B-OBRA",
+    "18": "I-OBRA",
+    "19": "B-OUTRO",
+    "20": "I-OUTRO"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "B-LOCAL": 5,
+    "B-ORGANIZACAO": 3,
+    "B-PESSOA": 1,
+    "B-TEMPO": 7,
+    "B-VALOR": 9,
+    "B-ABSTRACCAO": 11,
+    "B-ACONTECIMENTO": 13,
+    "B-COISA": 15,
+    "B-OBRA": 17,
+    "B-OUTRO": 19,
+    "I-LOCAL": 6,
+    "I-ORGANIZACAO": 4,
+    "I-PESSOA": 2,
+    "I-TEMPO": 8,
+    "I-VALOR": 10,
+    "I-ABSTRACCAO": 12,
+    "I-ACONTECIMENTO": 14,
+    "I-COISA": 16,
+    "I-OBRA": 18,
+    "I-OUTRO": 20,
+    "O": 0
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "num_labels": 21,
+  "output_attentions": false,
+  "output_hidden_states": true,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "torchscript": false,
+  "type_vocab_size": 2,
+  "vocab_size": 29794
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84e7ada7c449c648d6f92b8efb9f03064dd55a33b146993fb126324e6797bcb8
+size 435807397

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "cls_token": "[CLS]",
+    "mask_token": "[MASK]",
+    "pad_token": "[PAD]",
+    "sep_token": "[SEP]",
+    "unk_token": "[UNK]"
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "clean_up_tokenization_spaces": true,
+    "cls_token": "[CLS]",
+    "do_basic_tokenize": true,
+    "do_lower_case": false,
+    "mask_token": "[MASK]",
+    "model_max_length": 1000000000000000019884624838656,
+    "never_split": null,
+    "pad_token": "[PAD]",
+    "sep_token": "[SEP]",
+    "strip_accents": null,
+    "tokenize_chinese_chars": true,
+    "tokenizer_class": "BertTokenizer",
+    "unk_token": "[UNK]"
+}