---
language: id
tags:
- indobert
- indobenchmark
---

## How to use

### Load model and tokenizer

```python
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-ner")
model = AutoModelForTokenClassification.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-ner")
```

### Extract NER Tag

```python
import torch


def predict(model, tokenizer, sentence):
    """Tag every whitespace-separated word of ``sentence`` with a NER label.

    Args:
        model: Token-classification model (as loaded above). It is moved to
            the GPU when one is available and switched to eval mode.
        tokenizer: Tokenizer matching the model; it must support
            ``return_offsets_mapping``.
        sentence: Raw input text; it is split on whitespace into words.

    Returns:
        A ``(words, labels)`` tuple. ``labels`` holds one label per word,
        taken from the prediction on the word's first word piece. Inputs
        longer than 512 word pieces are truncated, in which case ``labels``
        is shorter than ``words``.
    """
    words = sentence.split()
    if not words:
        # Nothing to tag; skip the forward pass entirely.
        return words, []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Only one sentence is encoded per call, so no padding is requested:
    # there is no second sequence to align its length with.
    inputs = tokenizer(
        words,
        is_split_into_words=True,
        return_offsets_mapping=True,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )

    model.to(device)  # move to gpu
    model.eval()  # make sure dropout is disabled for prediction
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # forward pass; gradients are not needed for inference
    with torch.no_grad():
        outputs = model(ids, attention_mask=mask)
    logits = outputs[0]  # shape (batch_size, seq_len, num_labels)

    # Batch size is 1: take the only row and pick the best label per token.
    label_ids = torch.argmax(logits[0], dim=-1).tolist()  # shape (seq_len,)
    token_labels = [model.config.id2label[label_id] for label_id in label_ids]

    # With is_split_into_words=True the offsets are relative to each word,
    # so a word's first piece starts at 0 with a non-zero end; special
    # tokens have the empty span (0, 0) and later pieces start after 0.
    offsets = inputs["offset_mapping"][0].tolist()
    labels = [
        label
        for label, (start, end) in zip(token_labels, offsets)
        if start == 0 and end != 0
    ]

    return words, labels


sentence = "BJ Habibie adalah Presiden Indonesia ke-3"
words, labels = predict(model, tokenizer, sentence)
```