Fine-Tuning mDeBERTa for Named Entity Recognition (NER)

📌 Model Overview

This repository contains MoritzLaurer/mDeBERTa-v3-base-mnli-xnli fine-tuned for Named Entity Recognition (NER) on the multilingual mnaguib/WikiNER dataset.

🚀 Features

  • Built on mDeBERTa: a powerful multilingual encoder for text understanding.
  • Fine-tuned for NER: detects persons (PER), locations (LOC), organizations (ORG), and miscellaneous entities (MISC).

📖 Training Details

  • Base model: MoritzLaurer/mDeBERTa-v3-base-mnli-xnli
  • Dataset: mnaguib/WikiNER
  • Languages: English (en), Spanish (es), ...
  • Epochs: 2
  • Optimizer: AdamW
  • Loss function: CrossEntropyLoss
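
The exact training script is not part of this repository; the following is a minimal sketch of the setup above using the Hugging Face Trainer (which uses AdamW by default, and token-classification models compute CrossEntropyLoss internally when labels are provided). The learning rate, batch size, dataset config name, and the "tokens"/"ner_tags" column names are illustrative assumptions, not values from the actual run.

from datasets import load_dataset
from transformers import (AutoModelForTokenClassification, AutoTokenizer,
                          DataCollatorForTokenClassification, Trainer,
                          TrainingArguments)

base = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
tokenizer = AutoTokenizer.from_pretrained(base)

# Assumption: a per-language config and "tokens"/"ner_tags" columns
raw = load_dataset("mnaguib/WikiNER", "en")

def tokenize_and_align(batch):
    enc = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, word_labels in enumerate(batch["ner_tags"]):
        word_ids = enc.word_ids(batch_index=i)
        # Propagate each word's label to its subtokens; mask special tokens with -100
        labels.append([word_labels[w] if w is not None else -100 for w in word_ids])
    enc["labels"] = labels
    return enc

tokenized = raw.map(tokenize_and_align, batched=True)

# Five labels (O, LOC, PER, MISC, ORG); the MNLI classification head
# is replaced, so the new classifier weights are freshly initialized.
model = AutoModelForTokenClassification.from_pretrained(
    base, num_labels=5, ignore_mismatched_sizes=True)

args = TrainingArguments(
    output_dir="mdeberta-ner",
    num_train_epochs=2,              # as listed above
    learning_rate=2e-5,              # assumption
    per_device_train_batch_size=16,  # assumption
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    data_collator=DataCollatorForTokenClassification(tokenizer),
)
trainer.train()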

Inference

To use the model for inference:

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Load the fine-tuned model and its tokenizer
model_path = "jordigonzm/mdeberta-v3-base-multilingual-ner"
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.eval()  # inference mode

# NER prediction function: returns (subtoken, label-id) pairs
def predict_ner(text):
    tokens = tokenizer(text, truncation=True, return_tensors="pt")
    with torch.no_grad():  # no gradients needed for inference
        outputs = model(**tokens)
    predictions = outputs.logits.argmax(dim=-1).squeeze().tolist()
    tokens_decoded = tokenizer.convert_ids_to_tokens(tokens["input_ids"].squeeze().tolist())
    return list(zip(tokens_decoded, predictions))

# Example
text = "The Mona Lisa is located in the Louvre Museum, in Paris."
result = predict_ner(text)
print(result)
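
The raw output pairs SentencePiece subtokens (prefixed with "▁") with integer label IDs, so multi-token words appear split. The script below merges subtokens back into whole words and maps the label IDs to readable tags.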

Post-Processing and Display

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
from rich.console import Console
from rich.table import Table

# Load the fine-tuned model and its tokenizer
model_path = "jordigonzm/mdeberta-v3-base-multilingual-ner"
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.eval()  # inference mode

# NER label mapping ("O" = outside any entity)
id_to_label = {0: "O", 1: "LOC", 2: "PER", 3: "MISC", 4: "ORG"}

# Post-processing function: merge SentencePiece subtokens back into words
def postprocess_ner(decoded_tokens, predictions):
    ner_results = []
    current_word = ""
    current_label = None

    for token, label in zip(decoded_tokens, predictions):
        if token in ["[CLS]", "[SEP]", "[PAD]"]:
            continue  # ignore special tokens

        if token.startswith("▁"):  # "▁" marks the start of a new word
            if current_word:
                ner_results.append((current_word, id_to_label.get(current_label, "O")))
            current_word = token[1:]  # strip the "▁" prefix
            current_label = label  # each word keeps its first subtoken's label
        else:  # continuation subtoken: append to the current word
            current_word += token

    if current_word:  # flush the last word
        ner_results.append((current_word, id_to_label.get(current_label, "O")))

    return ner_results
  
# NER prediction function: returns merged (word, label) pairs
def predict_ner(text):
    tokens = tokenizer(text, truncation=True, return_tensors="pt")
    with torch.no_grad():  # no gradients needed for inference
        outputs = model(**tokens)
    predictions = outputs.logits.argmax(dim=-1).squeeze().tolist()
    decoded_tokens = tokenizer.convert_ids_to_tokens(tokens["input_ids"].squeeze().tolist())
    return postprocess_ner(decoded_tokens, predictions)

# Display the results in a table, keeping only entity tokens
def display_ner_results(results):
    console = Console()
    table = Table(title="Entity Classification", show_lines=True)

    table.add_column("Token", justify="left", style="cyan")
    table.add_column("Entity", justify="center", style="magenta")

    for token, entity in results:
        if entity != "O":  # skip tokens outside any entity
            table.add_row(token, entity)

    console.print(table)

# Example
text = "The Mona Lisa is located in the Louvre Museum, in Paris."
result = predict_ner(text)
display_ner_results(result)

Model Usage

You can load the model directly from Hugging Face:

from transformers import AutoModelForTokenClassification, AutoTokenizer

model = AutoModelForTokenClassification.from_pretrained("jordigonzm/mdeberta-v3-base-multilingual-ner")
tokenizer = AutoTokenizer.from_pretrained("jordigonzm/mdeberta-v3-base-multilingual-ner")
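
If the checkpoint's config carries the id2label names, the simplest route is the transformers pipeline, which handles tokenization and subtoken grouping for you; otherwise the pipeline falls back to generic LABEL_0-style names. A minimal sketch:

from transformers import pipeline

ner = pipeline(
    "token-classification",
    model="jordigonzm/mdeberta-v3-base-multilingual-ner",
    aggregation_strategy="simple",  # merge subtokens into word-level entities
)
print(ner("The Mona Lisa is located in the Louvre Museum, in Paris."))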
