metadata
language: id
tags:
- indobert
- indobenchmark
How to use
Load the model and tokenizer
from transformers import AutoTokenizer, AutoModelForTokenClassification
# Hub checkpoint shared by the tokenizer and the token-classification model.
checkpoint = "ageng-anugrah/indobert-large-p2-finetuned-ner"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)
Extract NER tags
import torch
def predict(model, tokenizer, sentence):
    """Tag every whitespace-separated word of ``sentence`` with an NER label.

    Args:
        model: Token-classification model. It is called as
            ``model(input_ids, attention_mask=...)``, its first output is
            used as the logits, and ``model.config.id2label`` maps class
            indices to label strings. The model is moved to the GPU when one
            is available (otherwise the CPU) and switched to eval mode.
        tokenizer: Tokenizer matching ``model``. It must support
            ``is_split_into_words`` and ``return_offsets_mapping``.
        sentence: Raw text; it is split into words with ``str.split()``.

    Returns:
        A ``(words, labels)`` tuple, where ``labels`` holds one label per
        word, taken from the prediction for that word's first word piece.
        Input is truncated to 512 tokens, so for very long sentences
        ``labels`` can be shorter than ``words``.
    """
    words = sentence.split()
    if not words:
        # Nothing to tag; skip the tokenizer and the forward pass entirely.
        return words, []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # A single sentence needs no padding: padded positions would only be
    # discarded below, while still costing a full 512-token forward pass.
    inputs = tokenizer(
        words,
        is_split_into_words=True,
        return_offsets_mapping=True,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )

    model.to(device)
    model.eval()  # make sure dropout is disabled for inference
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # Inference only: do not build an autograd graph.
    with torch.no_grad():
        outputs = model(ids, attention_mask=mask)
    logits = outputs[0]  # (batch=1, seq_len, num_labels)

    # Index the single batch element explicitly rather than squeezing.
    label_ids = torch.argmax(logits[0], dim=-1).tolist()
    offsets = inputs["offset_mapping"][0].tolist()

    # With pre-split input the offsets are relative to each word, so the
    # first word piece of a word starts at 0 and has a non-zero end, while
    # special tokens are reported as (0, 0). Only first pieces are kept.
    prediction = [
        model.config.id2label[label_id]
        for label_id, (start, end) in zip(label_ids, offsets)
        if start == 0 and end != 0
    ]
    return words, prediction
# Example: tag an Indonesian sentence; `labels` holds the predicted tags for `words`.
sentence = (
    "BJ Habibie adalah Presiden Indonesia ke-3 "
    "yang lahir pada tanggl 25 Juni 1936"
)
words, labels = predict(model, tokenizer, sentence)