First model version

Browse files

Files changed (4) hide show

README.md +59 -0
config.json +43 -0
pytorch_model.bin +3 -0
vocab.txt +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,59 @@

+---
+language: id
+tags:
+- indobert
+- indobenchmark
+---
+## How to use
+### Load model and tokenizer
+```python
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+tokenizer = AutoTokenizer.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-ner")
+model = AutoModelForTokenClassification.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-ner")
+```
+### Extract NER Tag
+```python
+import torch
+def predict(model, tokenizer, sentence):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    inputs = tokenizer(sentence.split(),
+                    is_split_into_words = True,
+                    return_offsets_mapping=True,
+                    return_tensors="pt",
+                    padding='max_length',
+                    truncation=True,
+                    max_length=512)
+    model.to(device)
+    # move to gpu
+    ids = inputs["input_ids"].to(device)
+    mask = inputs["attention_mask"].to(device)
+    # forward pass
+    outputs = model(ids, attention_mask=mask)
+    logits = outputs[0]
+    active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
+    flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level
+    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
+    token_predictions = [model.config.id2label[i] for i in flattened_predictions.cpu().numpy()]
+    wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)
+    prediction = []
+    for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
+        #only predictions on first word pieces are important
+        if mapping[0] == 0 and mapping[1] != 0:
+            prediction.append(token_pred[1])
+        else:
+            continue
+    return sentence.split(), prediction
+# Example: tag a sample Indonesian sentence.
+# NOTE(review): "tanggl" looks like a typo for "tanggal" - confirm before changing the sample input.
+sentence = "BJ Habibie adalah Presiden Indonesia ke-3 yang lahir pada tanggl 25 Juni 1936"
+# predict returns the whitespace-split words and the list of predicted labels.
+words, labels = predict(model, tokenizer, sentence)
+```

config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "_name_or_path": "indobenchmark/indobert-large-p2",
+  "_num_labels": 5,
+  "architectures": [
+    "BertForTokenClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "B-TIMEX",
+    "1": "I-TIMEX",
+    "2": "O"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "label2id": {
+    "B-TIMEX": 0,
+    "I-TIMEX": 1,
+    "O": 2
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.27.4",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5b9dd1146b641d2e76875c14cb51d4a023cb6dabaef39fee97a591f0b0dd2ee
+size 1336519661

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff