baseok

Browse files

Files changed (9) hide show

README.md +62 -1
added_tokens.json +1 -0
config.json +90 -0
model.safetensors +3 -0
pytorch_model.bin +3 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +15 -0
vocab.txt +0 -0

README.md CHANGED Viewed

@@ -1,3 +1,64 @@
 ---
-license: c-uda
 ---

 ---
+language: "pt"
+widget:
+- text: "Tinha uma pedra no meio do caminho."
+- text: "Vamos tomar um café quentinho?"
+- text: "Como você se chama?"
+datasets:
+- MacMorpho
 ---
+# POS-Tagger Portuguese
+We fine-tuned the [BERTimbau](https://github.com/neuralmind-ai/portuguese-bert/) model on the [MacMorpho](http://nilc.icmc.usp.br/macmorpho/) corpus for the POS-tagging task, with 10 epochs, achieving an overall F1-score of 0.9826.
+Metrics:
+```
+              Precision  Recall  F1    Support
+accuracy                         0.98  33729
+macro avg     0.96       0.95    0.95  33729
+weighted avg  0.98       0.98    0.98  33729
+F1:  0.9826 Accuracy:  0.9826
+```
+Parameters:
+```
+nclasses = 27
+nepochs = 30
+batch_size = 32
+batch_status = 32
+learning_rate = 1e-5
+early_stop = 3
+max_length = 200
+```
+Tags:
+| Tag |  Meaning |
+| ------------------- | ------------------- |
+|  ADJ |  Adjetivo |
+|  ADV |  Advérbio |
+|  ADV-KS |  Advérbio conjuntivo subordinado  |
+|  ADV-KS-REL |   Advérbio relativo subordinado |
+|  ART |  Artigo  |
+|  CUR |  Moeda  |
+|  IN |  Interjeição |
+|  KC |  Conjunção coordenativa |
+|  KS |  Conjunção subordinativa |
+|  N |  Substantivo |
+|  NPROP | Substantivo próprio |
+|  NUM |  Número |
+|  PCP |  Particípio |
+|  PDEN |  Palavra denotativa |
+|  PREP |  Preposição |
+|  PROADJ |  Pronome Adjetivo |
+|  PRO-KS |  Pronome conjuntivo subordinado |
+|  PRO-KS-REL |  Pronome relativo conectivo subordinado |
+|  PROPESS |  Pronome pessoal |
+|  PROSUB |  Pronome nominal |
+|  V | Verbo |
+|  VAUX  | Verbo auxiliar |
+## Questions?
+Please post a GitHub issue on the [NLP Portuguese POS-Tagger](https://github.com/lisaterumi/nlp-portuguese-postagger) repository.

added_tokens.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {}

config.json ADDED Viewed

	@@ -0,0 +1,90 @@

+{
+  "_name_or_path": "neuralmind/bert-base-portuguese-cased",
+  "architectures": [
+    "BertForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+     "0": "NUM",
+	 "1": "KS",
+	 "2": "PREP+PROADJ",
+	 "3": "ADV-KS",
+	 "4": "NPROP",
+	 "5": "PDEN",
+	 "6": "PROADJ",
+	 "7": "PCP",
+	 "8": "KC",
+	 "9": "PU",
+	 "10": "PREP",
+	 "11": "ADV",
+	 "12": "PRO-KS",
+	 "13": "ART",
+	 "14": "N",
+	 "15": "PROPESS",
+	 "16": "PREP+PROPESS",
+	 "17": "CUR",
+	 "18": "ADJ",
+	 "19": "IN",
+	 "20": "PREP+ART",
+	 "21": "PROSUB",
+	 "22": "PREP+PRO-KS",
+	 "23": "PREP+PROSUB",
+	 "24": "V",
+	 "25": "PREP+ADV",
+	 "26": "<pad>"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "<pad>": 26,
+	 "ADJ": 18,
+	 "ADV": 11,
+	 "ADV-KS": 3,
+	 "ART": 13,
+	 "CUR": 17,
+	 "IN": 19,
+	 "KC": 8,
+	 "KS": 1,
+	 "N": 14,
+	 "NPROP": 4,
+	 "NUM": 0,
+	 "PCP": 7,
+	 "PDEN": 5,
+	 "PREP": 10,
+	 "PREP+ADV": 25,
+	 "PREP+ART": 20,
+	 "PREP+PRO-KS": 22,
+	 "PREP+PROADJ": 2,
+	 "PREP+PROPESS": 16,
+	 "PREP+PROSUB": 23,
+	 "PRO-KS": 12,
+	 "PROADJ": 6,
+	 "PROPESS": 15,
+	 "PROSUB": 21,
+	 "PU": 9,
+	 "V": 24
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.20.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 29794
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3f2e1ccd99a2a28417be61caacc92297b97f08e7ff0591d0adc35b2564b984b
+size 433440760

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:136fe2d35d147f464be9f1a08c2e225aab419019a65d5267fd246d6dde79d9b2
+size 433483633

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": false,
+  "mask_token": "[MASK]",
+  "name_or_path": "neuralmind/bert-base-portuguese-cased",
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": "/root/.cache/huggingface/transformers/eecc45187d085a1169eed91017d358cc0e9cbdd5dc236bcd710059dbf0a2f816.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff