---
inference: false
language: pt
datasets:
- lener_br
license: mit
pipeline_tag: token-classification
---

# DeBERTinha XSmall for NER

DeBERTinha XSmall fine-tuned for named-entity recognition on LeNER-Br, a Brazilian Portuguese legal-domain NER dataset.

## Full Token Classification Example

```python
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

model_name = "sagui-nlp/debertinha-ptbr-xsmall-lenerbr"
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=13)
tokenizer = AutoTokenizer.from_pretrained(model_name)

input_text = "Acrescento que não há de se falar em violação do artigo 114, § 3º, da Constituição Federal, posto que referido dispositivo revela-se impertinente, tratando da possibilidade de ajuizamento de dissídio coletivo pelo Ministério Público do Trabalho nos casos de greve em atividade essencial."
inputs = tokenizer(input_text, max_length=512, truncation=True, return_tensors="pt")
tokens = inputs.tokens()

with torch.no_grad():
    logits = model(**inputs).logits
predictions = torch.argmax(logits, dim=2)

# Group SentencePiece sub-tokens back into words: a new word starts at every
# token prefixed with "▁", and each word keeps the label of its first sub-token.
# The special tokens at both ends ([CLS]/[SEP]) are skipped.
entities = []
current_entity = []
current_label = None
for token, prediction in zip(tokens[1:-1], predictions[0].numpy()[1:-1]):
    if not current_entity:
        current_entity.append(token)
        current_label = model.config.id2label[prediction]
    elif token.startswith("▁"):
        entities.append(("".join(current_entity), current_label))
        current_entity = [token]
        current_label = model.config.id2label[prediction]
    else:
        current_entity.append(token)
entities.append(("".join(current_entity), current_label))

# Keep only the words tagged as entities.
print(list(filter(lambda x: x[1] != "O", entities)))
```

## Training notes

Training used the label of only the first sub-token of each word; all remaining sub-tokens are set to -100 so they are ignored by the loss function.

```python
from datasets import load_dataset

dataset = load_dataset("lener_br")

label_all_tokens = False
task = "ner"

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, max_length=512
    )
    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100
            # so they are automatically ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the
            # current label or -100, depending on the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

dataset = dataset.map(tokenize_and_align_labels, batched=True)
```

## Citation

```
@misc{campiotti2023debertinha,
      title={DeBERTinha: A Multistep Approach to Adapt DebertaV3 XSmall for Brazilian Portuguese Natural Language Processing Task},
      author={Israel Campiotti and Matheus Rodrigues and Yuri Albuquerque and Rafael Azevedo and Alyson Andrade},
      year={2023},
      eprint={2309.16844},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
```
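
## Inference with the `pipeline` API

As an alternative to the manual sub-token grouping in the full example above, the `transformers` `pipeline` API can aggregate sub-tokens itself. This is a minimal sketch, assuming the checkpoint's `id2label` mapping is stored in its config; `aggregation_strategy="simple"` is an assumption, so verify that its grouping matches the first-token labeling scheme used in training.

```python
from transformers import pipeline

# Sketch only: aggregation_strategy="simple" merges sub-tokens into
# word-level entities; it is an assumption, not a setting documented
# for this checkpoint.
ner = pipeline(
    "token-classification",
    model="sagui-nlp/debertinha-ptbr-xsmall-lenerbr",
    aggregation_strategy="simple",
)
print(ner("Acrescento que não há de se falar em violação do artigo 114, § 3º, da Constituição Federal."))
```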
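
## Fine-tuning Sketch

The tokenized dataset from the training notes plugs into the standard `Trainer` loop. This is a minimal sketch reusing `tokenizer`, `model`, and `dataset` from the sections above; the hyperparameters are illustrative assumptions, not the values used to train this checkpoint, and a real run would start from the base DeBERTinha checkpoint rather than this already fine-tuned one.

```python
from transformers import (
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# Pads both the inputs and the label sequences (with -100) per batch.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Assumed hyperparameters, for illustration only.
args = TrainingArguments(
    output_dir="debertinha-lenerbr-ner",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
```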