Initialize

Browse files

Files changed (8) hide show

README.md +91 -0
config.json +78 -0
pytorch_model.bin +3 -0
special_tokens_map.json +9 -0
spiece.model +3 -0
tf_model.h5 +3 -0
tokenizer.json +0 -0
tokenizer_config.json +21 -0

README.md ADDED Viewed

	@@ -0,0 +1,91 @@

+---
+language: fa
+---
+# AlbertNER
+This model is fine-tuned for the Named Entity Recognition (NER) task on a mixed NER dataset collected from [ARMAN](https://github.com/HaniehP/PersianNER), [PEYMA](http://nsurl.org/2019-2/tasks/task-7-named-entity-recognition-ner-for-farsi/), and [WikiANN](https://elisa-ie.github.io/wikiann/) that covers ten types of entities:
+- Date (DAT)
+- Event (EVE)
+- Facility (FAC)
+- Location (LOC)
+- Money (MON)
+- Organization (ORG)
+- Percent (PCT)
+- Person (PER)
+- Product (PRO)
+- Time (TIM)
+## Dataset Information
+|       |   Records |   B-DAT |   B-EVE |   B-FAC |   B-LOC |   B-MON |   B-ORG |   B-PCT |   B-PER |   B-PRO |   B-TIM |   I-DAT |   I-EVE |   I-FAC |   I-LOC |   I-MON |   I-ORG |   I-PCT |   I-PER |   I-PRO |   I-TIM |
+|:------|----------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|
+| Train |     29133 |    1423 |    1487 |    1400 |   13919 |     417 |   15926 |     355 |   12347 |    1855 |     150 |    1947 |    5018 |    2421 |    4118 |    1059 |   19579 |     573 |    7699 |    1914 |     332 |
+| Valid |      5142 |     267 |     253 |     250 |    2362 |     100 |    2651 |      64 |    2173 |     317 |      19 |     373 |     799 |     387 |     717 |     270 |    3260 |     101 |    1382 |     303 |      35 |
+| Test  |      6049 |     407 |     256 |     248 |    2886 |      98 |    3216 |      94 |    2646 |     318 |      43 |     568 |     888 |     408 |     858 |     263 |    3967 |     141 |    1707 |     296 |      78 |
+## Evaluation
+The following tables summarize the scores obtained by the model, both overall and per class.
+**Overall**
+|    Model   | accuracy | precision |  recall  |    f1    |
+|:----------:|:--------:|:---------:|:--------:|:--------:|
+|   Albert   | 0.993405 |  0.938907 | 0.943966 | 0.941429 |
+**Per entity**
+|     	| number 	| precision 	|  recall  	|    f1    	|
+|:---:	|:------:	|:---------:	|:--------:	|:--------:	|
+| DAT 	|   407  	|  0.820639 	| 0.820639 	| 0.820639 	|
+| EVE 	|   256  	|  0.936803 	| 0.984375 	| 0.960000 	|
+| FAC 	|   248  	|  0.925373 	| 1.000000 	| 0.961240 	|
+| LOC 	|  2884  	|  0.960818 	| 0.960818 	| 0.960818 	|
+| MON 	|   98   	|  0.913978 	| 0.867347 	| 0.890052 	|
+| ORG 	|  3216  	|  0.920892 	| 0.937500 	| 0.929122 	|
+| PCT 	|   94   	|  0.946809 	| 0.946809 	| 0.946809 	|
+| PER 	|  2644  	|  0.960000 	| 0.944024 	| 0.951945 	|
+| PRO 	|   318  	|  0.942943 	| 0.987421 	| 0.964670 	|
+| TIM 	|   43   	|  0.780488 	| 0.744186 	| 0.761905 	|
+## How To Use
+You can use this model with the Transformers pipeline for NER.
+### Installing requirements
+```bash
+pip install sentencepiece
+pip install transformers
+```
+### How to predict using pipeline
+```python
+from transformers import AutoTokenizer
+from transformers import AutoModelForTokenClassification  # for pytorch
+from transformers import TFAutoModelForTokenClassification  # for tensorflow
+from transformers import pipeline
+model_name_or_path = "HooshvareLab/albert-fa-zwnj-base-v2-ner"  # Albert
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)  # Pytorch
+# model = TFAutoModelForTokenClassification.from_pretrained(model_name_or_path)  # Tensorflow
+nlp = pipeline("ner", model=model, tokenizer=tokenizer)
+example = "در سال ۲۰۱۳ درگذشت و آندرتیکر و کین برای او مراسم یادبود گرفتند."
+ner_results = nlp(example)
+print(ner_results)
+```
+## Questions?
+Post a GitHub issue on the [ParsNER Issues](https://github.com/hooshvare/parsner/issues) repo.

config.json ADDED Viewed

	@@ -0,0 +1,78 @@

+{
+  "architectures": [
+    "AlbertForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0,
+  "bos_token_id": 2,
+  "classifier_dropout_prob": 0.1,
+  "down_scale_factor": 1,
+  "embedding_size": 128,
+  "eos_token_id": 3,
+  "finetuning_task": "ner",
+  "gap_size": 0,
+  "hidden_act": "gelu_new",
+  "hidden_dropout_prob": 0,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "O",
+    "1": "B-DAT",
+    "2": "B-EVE",
+    "3": "B-FAC",
+    "4": "B-LOC",
+    "5": "B-MON",
+    "6": "B-ORG",
+    "7": "B-PCT",
+    "8": "B-PER",
+    "9": "B-PRO",
+    "10": "B-TIM",
+    "11": "I-DAT",
+    "12": "I-EVE",
+    "13": "I-FAC",
+    "14": "I-LOC",
+    "15": "I-MON",
+    "16": "I-ORG",
+    "17": "I-PCT",
+    "18": "I-PER",
+    "19": "I-PRO",
+    "20": "I-TIM"
+  },
+  "initializer_range": 0.02,
+  "inner_group_num": 1,
+  "intermediate_size": 3072,
+  "label2id": {
+    "B-DAT": 1,
+    "B-EVE": 2,
+    "B-FAC": 3,
+    "B-LOC": 4,
+    "B-MON": 5,
+    "B-ORG": 6,
+    "B-PCT": 7,
+    "B-PER": 8,
+    "B-PRO": 9,
+    "B-TIM": 10,
+    "I-DAT": 11,
+    "I-EVE": 12,
+    "I-FAC": 13,
+    "I-LOC": 14,
+    "I-MON": 15,
+    "I-ORG": 16,
+    "I-PCT": 17,
+    "I-PER": 18,
+    "I-PRO": 19,
+    "I-TIM": 20,
+    "O": 0
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "albert",
+  "net_structure_type": 0,
+  "num_attention_heads": 12,
+  "num_hidden_groups": 1,
+  "num_hidden_layers": 12,
+  "num_memory_blocks": 0,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.5.0.dev0",
+  "type_vocab_size": 2,
+  "vocab_size": 30000
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0edc50c4a65288408d25530df7a63bfca9efe75fdd94c40641c4e4ea9bef7caf
+size 44451929

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "bos_token": "[CLS]",
+    "eos_token": "[SEP]",
+    "unk_token": "<unk>",
+    "sep_token": "[SEP]",
+    "pad_token": "<pad>",
+    "cls_token": "[CLS]",
+    "mask_token": "[MASK]"
+}

spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:903319b1a4a7e58e49383764d33897a7f49784510247d68438e4f3bff25b01f1
+size 857476

tf_model.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8a71db8fd9dfb34333ca5d1f7084d19e27815012e4e1bf5fb57fc7386809ccae
+size 44478676

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+    "do_lower_case": false,
+    "remove_space": true,
+    "keep_accents": false,
+    "bos_token": "[CLS]",
+    "eos_token": "[SEP]",
+    "unk_token": "<unk>",
+    "sep_token": "[SEP]",
+    "pad_token": "<pad>",
+    "cls_token": "[CLS]",
+    "mask_token": {
+        "content": "[MASK]",
+        "single_word": false,
+        "lstrip": true,
+        "rstrip": false,
+        "normalized": true,
+        "__type": "AddedToken"
+    },
+    "model_max_length": 512,
+    "special_tokens_map_file": null
+}