"""Gradio demo for GLiLem: Estonian lemmatization that combines Vabamorf
candidate generation with GLiNER-based lemma-rule disambiguation."""

from gliner import GLiNER
import gradio as gr
import nltk

from rule_processor import RuleProcessor
from vabamorf_lemmatizer import Lemmatizer
from utils import sentence_to_spans

nltk.download("punkt_tab")

examples = [
    "Kui leiate andmete põhjal sobiva korteri, tasub kõigepealt välja uurida täpne aadress.",
    "Autor esitab oma põhjendatud seisukohad, hinnangut saab anda ainult siis, kui eelnevalt on probleeme analüüsitud.",
    "Nüüd on need unistused viimaks täitumas.",
    "Isegi autod liiguvad varsti Internetiga.",
    "4. koha tõenäsus on täpselt 0, seda sõltumata lisakoha tulekust või mittetulekust.",
    "WordPressi puhul tasub see sokutada oma kujundusteema kataloogi ning kui lisada functions.php-sse järgmised kaks rida peakski kõik toimima:",
]

rule_processor = RuleProcessor()
model = GLiNER.from_pretrained("tartuNLP/glilem-vabamorf-disambiguator")
lemmatizer = Lemmatizer(
    disambiguate=False, use_context=False, proper_name=True, separate_punctuation=True
)


def process_text(text):
    lemmas, tokens = lemmatizer(text, return_tokens=True)
    # deduplicate the lemma candidates of each token
    lemmas = [list(set(el)) for el in lemmas]
    # unwrap tokens (each is returned as a single-element list)
    tokens = [el[0] for el in tokens]
    # serves as input for GLiNER to remain consistent with Vabamorf tokenization
    processed_text = " ".join(tokens)
    labels = []
    # contains the token id for each span
    span_to_token_id = sentence_to_spans(tokens)
    # produce a transformation rule for each lemma candidate
    for token, lemma_list in zip(tokens, lemmas):
        for lemma in lemma_list:
            labels.append(
                rule_processor.gen_lemma_rule(form=token, lemma=lemma, allow_copy=True)
            )
    # we only consider unique rules
    labels = list(set(labels))
    predicted_entities = model.predict_entities(
        text=processed_text, labels=labels, flat_ner=True, threshold=0.5
    )
    predictions = tokens.copy()
    for entity in predicted_entities:
        cur_start = entity["start"]
        cur_end = entity["end"]
        token = processed_text[cur_start:cur_end]
        if f"{cur_start}-{cur_end}" in span_to_token_id:
            token_id = span_to_token_id[f"{cur_start}-{cur_end}"]
            token = tokens[token_id]
            # if there are multiple lemma candidates, apply the highest-scoring rule
            if len(lemmas[token_id]) > 1:
                result = rule_processor.apply_lemma_rule(token, entity["label"])
            # otherwise, we trust the Vabamorf lemma
            else:
                result = lemmas[token_id][0]
            predictions[token_id] = result
    # store labels to highlight changed word forms
    lemma_labels = []
    for pred, token in zip(predictions, tokens):
        lemma_labels.append(pred != token)
    # expected input format for the HighlightedText component
    processed_entities = {
        "text": processed_text,
        "entities": [
            {
                "entity": entity["label"],
                "word": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "score": entity["score"],
            }
            for entity in predicted_entities
        ],
    }
    processed_lemmas = [(pred, label) for pred, label in zip(predictions, lemma_labels)]
    return processed_entities, processed_lemmas


if __name__ == "__main__":
    with gr.Blocks() as demo:
        title = gr.HTML(
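            # NOTE: the original source is truncated from this point on; the
            # title text and the UI wiring below are an assumed, minimal
            # reconstruction that matches the two outputs of process_text,
            # not the original layout.
            "<h2>GLiLem: Vabamorf Lemmatization with GLiNER Disambiguation</h2>"
        )
        input_text = gr.Textbox(label="Input text", value=examples[0])
        submit = gr.Button("Lemmatize")
        entity_output = gr.HighlightedText(label="Predicted lemma rules")
        lemma_output = gr.HighlightedText(label="Lemmas (changed forms highlighted)")
        submit.click(
            fn=process_text, inputs=input_text, outputs=[entity_output, lemma_output]
        )
        gr.Examples(examples=examples, inputs=input_text)
    demo.launch()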