---
language: id
tags:
- indobert
- indobenchmark
---

## How to use

### Load model and tokenizer

```python
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-ner")
# The token-classification head is required here: plain `AutoModel` loads only
# the encoder, which returns hidden states instead of per-token label logits.
model = AutoModelForTokenClassification.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-ner")
```

### Extract NER Tag

```python
import torch

def predict(model, tokenizer, sentence):
    """Tag every whitespace-separated word of `sentence` with an NER label.

    Args:
        model: Token-classification model loaded as shown above.
        tokenizer: Matching tokenizer. Offset mappings are requested, which
            Transformers only supports on fast tokenizers.
        sentence: Raw text; it is split on whitespace into words.

    Returns:
        A `(words, labels)` tuple: the words of the sentence and the label
        predicted for the first word piece of each word.
    """
    # will be moved to config later
    ids_to_labels = {
        0: 'B-ORGANISATION',
        1: 'B-PERSON',
        2: 'B-PLACE',
        3: 'I-ORGANISATION',
        4: 'I-PERSON',
        5: 'I-PLACE',
        6: 'O',
    }

    words = sentence.split()
    if not words:
        # Nothing to tag: skip tokenization and the forward pass entirely.
        return words, []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(
        words,
        is_split_into_words=True,
        return_offsets_mapping=True,
        return_tensors="pt",
    )

    model.to(device)
    # Guard against a model that was left in training mode (dropout active).
    model.eval()

    # move to gpu
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # forward pass; gradients are not needed for inference
    with torch.no_grad():
        outputs = model(ids, attention_mask=mask)
    logits = outputs[0]  # shape (batch_size, seq_len, num_labels)

    # predictions at the word-piece level, flattened to (batch_size * seq_len,)
    flattened_predictions = torch.argmax(logits, dim=-1).view(-1)
    token_predictions = [ids_to_labels[i] for i in flattened_predictions.tolist()]

    # With is_split_into_words=True the offsets are relative to each word, so
    # the first word piece of a word starts at 0, while special tokens carry
    # the empty span (0, 0).
    offsets = inputs["offset_mapping"].squeeze(0).tolist()

    # only predictions on first word pieces are important
    prediction = [
        label
        for label, (start, end) in zip(token_predictions, offsets)
        if start == 0 and end != 0
    ]

    return words, prediction

sentence = "BJ Habibie adalah Presiden Indonesia ke-3"
words, labels = predict(model, tokenizer, sentence)
```