"""Gradio demo for GLiLem: Estonian lemmatization that combines Vabamorf
candidate generation with GLiNER-based lemma-rule disambiguation."""

from gliner import GLiNER
import gradio as gr
import nltk

from rule_processor import RuleProcessor
from vabamorf_lemmatizer import Lemmatizer
from utils import sentence_to_spans

nltk.download("punkt_tab")

examples = [
    "Kui leiate andmete põhjal sobiva korteri, tasub kõigepealt välja uurida täpne aadress.",
    "Autor esitab oma põhjendatud seisukohad, hinnangut saab anda ainult siis, kui eelnevalt on probleeme analüüsitud.",
    "Nüüd on need unistused viimaks täitumas.",
    "Isegi autod liiguvad varsti Internetiga.",
    "4. koha tõenäsus on täpselt 0, seda sõltumata lisakoha tulekust või mittetulekust.",
    "WordPressi puhul tasub see sokutada oma kujundusteema kataloogi ning kui lisada functions.php-sse järgmised kaks rida peakski kõik toimima:",
]

rule_processor = RuleProcessor()
model = GLiNER.from_pretrained("tartuNLP/glilem-vabamorf-disambiguator")
lemmatizer = Lemmatizer(
    disambiguate=False, use_context=False, proper_name=True, separate_punctuation=True
)


def process_text(text):
    lemmas, tokens = lemmatizer(text, return_tokens=True)
    # deduplicate the lemma candidates of each token
    lemmas = [list(set(el)) for el in lemmas]
    # unwrap tokens (each is returned as a single-element list)
    tokens = [el[0] for el in tokens]
    # serves as input for GLiNER to remain consistent with Vabamorf tokenization
    processed_text = " ".join(tokens)
    labels = []
    # contains the token id for each span
    span_to_token_id = sentence_to_spans(tokens)
    # produce a transformation rule for each lemma candidate
    for token, lemma_list in zip(tokens, lemmas):
        for lemma in lemma_list:
            labels.append(
                rule_processor.gen_lemma_rule(form=token, lemma=lemma, allow_copy=True)
            )
    # we only consider unique rules
    labels = list(set(labels))
    predicted_entities = model.predict_entities(
        text=processed_text, labels=labels, flat_ner=True, threshold=0.5
    )
    predictions = tokens.copy()
    for entity in predicted_entities:
        cur_start = entity["start"]
        cur_end = entity["end"]
        token = processed_text[cur_start:cur_end]
        if f"{cur_start}-{cur_end}" in span_to_token_id:
            token_id = span_to_token_id[f"{cur_start}-{cur_end}"]
            token = tokens[token_id]
            # if there are multiple lemma candidates, apply the highest-scoring rule
            if len(lemmas[token_id]) > 1:
                result = rule_processor.apply_lemma_rule(token, entity["label"])
            # otherwise, we trust the Vabamorf lemma
            else:
                result = lemmas[token_id][0]
            predictions[token_id] = result
    # store labels to highlight changed word forms
    lemma_labels = []
    for pred, token in zip(predictions, tokens):
        lemma_labels.append(pred != token)
    # expected input format for the HighlightedText component
    processed_entities = {
        "text": processed_text,
        "entities": [
            {
                "entity": entity["label"],
                "word": entity["text"],
                "start": entity["start"],
                "end": entity["end"],
                "score": entity["score"],
            }
            for entity in predicted_entities
        ],
    }
    processed_lemmas = [(pred, label) for pred, label in zip(predictions, lemma_labels)]
    return processed_entities, processed_lemmas


if __name__ == "__main__":
    with gr.Blocks() as demo:
        title = gr.HTML(
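            # NOTE: the original source is truncated from this point on; the
            # title text and the UI wiring below are an assumed, minimal
            # reconstruction that matches the two outputs of process_text,
            # not the original layout.
            "<h2>GLiLem: Vabamorf Lemmatization with GLiNER Disambiguation</h2>"
        )
        input_text = gr.Textbox(label="Input text", value=examples[0])
        submit = gr.Button("Lemmatize")
        entity_output = gr.HighlightedText(label="Predicted lemma rules")
        lemma_output = gr.HighlightedText(label="Lemmas (changed forms highlighted)")
        submit.click(
            fn=process_text, inputs=input_text, outputs=[entity_output, lemma_output]
        )
        gr.Examples(examples=examples, inputs=input_text)
    demo.launch()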