# GliLem / demo.py
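"""Gradio demo for GliLem: Vabamorf proposes lemma candidates for Estonian
text, and a GliNER model scores form-to-lemma transformation rules in
context to pick the best lemma for each token."""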
from gliner import GLiNER
import gradio as gr
import nltk
from rule_processor import RuleProcessor
from vabamorf_lemmatizer import Lemmatizer
from utils import sentence_to_spans
nltk.download("punkt_tab")  # Punkt tokenizer data, presumably needed by utils.sentence_to_spans
examples = [
"Kui leiate andmete põhjal sobiva korteri, tasub kõigepealt välja uurida täpne aadress.",
"Autor esitab oma põhjendatud seisukohad, hinnangut saab anda ainult siis, kui eelnevalt on probleeme analüüsitud.",
"Nüüd on need unistused viimaks täitumas.",
"Isegi autod liiguvad varsti Internetiga.",
"4. koha tõenäsus on täpselt 0, seda sõltumata lisakoha tulekust või mittetulekust.",
"WordPressi puhul tasub see sokutada oma kujundusteema kataloogi ning kui lisada functions.php-sse järgmised kaks rida peakski kõik toimima:",
]
rule_processor = RuleProcessor()
model = GLiNER.from_pretrained("tartuNLP/glilem-vabamorf-disambiguator")
lemmatizer = Lemmatizer(
disambiguate=False, use_context=False, proper_name=True, separate_punctuation=True
)
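# Vabamorf's own disambiguation is turned off on purpose: we keep every
# candidate lemma per token and let the GliNER model choose among them in
# context inside process_text below.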
def process_text(text):
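    """Run the GliLem pipeline on `text` and return the inputs for the two
    HighlightedText components: predicted transformation rules over the
    re-tokenized text, and per-token (lemma, changed) pairs."""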
lemmas, tokens = lemmatizer(text, return_tokens=True)
    # collapse duplicate lemma candidates for each token
    lemmas = [list(set(el)) for el in lemmas]
    # Vabamorf returns each token as a list of variants; keep the first surface form
    tokens = [el[0] for el in tokens]
# serves as input for GliNER to remain consistent with Vabamorf tokenization
processed_text = " ".join(tokens)
labels = []
# contains the token id for each span
span_to_token_id = sentence_to_spans(tokens)
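    # e.g. {"0-3": 0, "4-10": 1, ...}: character spans of the space-joined text
    # keyed as "start-end" strings (assumed format of utils.sentence_to_spans,
    # matching the lookups in the prediction loop below)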
    # produce a transformation rule for each lemma candidate
for token, lemma_list in zip(tokens, lemmas):
for lemma in lemma_list:
labels.append(
rule_processor.gen_lemma_rule(form=token, lemma=lemma, allow_copy=True)
)
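    # Each rule is an edit-script-style recipe from form to lemma (casing and
    # affix edits; allow_copy permits reusing characters of the form), so the
    # same label can be shared by many different tokens.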
# we only consider unique rules
labels = list(set(labels))
predicted_entities = model.predict_entities(
text=processed_text, labels=labels, flat_ner=True, threshold=0.5
)
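    # GliNER is repurposed here: the transformation rules serve as the "entity
    # label" inventory, and each predicted span selects a rule for the token it
    # covers; flat_ner keeps spans non-overlapping, and the 0.5 threshold
    # drops low-confidence rule matches.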
predictions = tokens.copy()
for entity in predicted_entities:
cur_start = entity["start"]
cur_end = entity["end"]
token = processed_text[cur_start:cur_end]
if f"{cur_start}-{cur_end}" in span_to_token_id:
token_id = span_to_token_id[f"{cur_start}-{cur_end}"]
token = tokens[token_id]
# if there are multiple lemma candidates, apply the highest scoring rule
if len(lemmas[token_id]) > 1:
result = rule_processor.apply_lemma_rule(token, entity["label"])
# otherwise, we trust the Vabamorf lemma
else:
result = lemmas[token_id][0]
predictions[token_id] = result
# store labels to highlight changed word forms
lemma_labels = []
for pred, token in zip(predictions, tokens):
lemma_labels.append(pred != token)
# expected input format for HighlightedText component
processed_entities = {
"text": processed_text,
"entities": [
{
"entity": entity["label"],
"word": entity["text"],
"start": entity["start"],
"end": entity["end"],
"score": entity["score"],
}
for entity in predicted_entities
],
}
processed_lemmas = [(pred, label) for pred, label in zip(predictions, lemma_labels)]
return processed_entities, processed_lemmas
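# Illustrative shapes of the two return values (actual rules and lemmas
# depend on the model, so treat this as a sketch):
#   entities, lemma_pairs = process_text("Nüüd on need unistused viimaks täitumas.")
#   entities    == {"text": ..., "entities": [{"entity": <rule>, "word": ..., ...}]}
#   lemma_pairs == [(<lemma>, <changed?>), ...]  # one pair per Vabamorf token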
if __name__ == "__main__":
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        title = gr.HTML("<h1>GliLem: Leveraging GliNER for Contextualized Lemmatization in Estonian</h1>")
input_text = gr.Textbox(
label="Text input", placeholder="Enter your text in Estonian here"
)
label_output = gr.HighlightedText(label="Predicted Transformation Rules")
lemma_output = gr.HighlightedText(label="Predicted Lemmas")
submit_btn = gr.Button("Submit")
input_text.submit(
fn=process_text, inputs=input_text, outputs=[label_output, lemma_output]
)
submit_btn.click(
fn=process_text, inputs=input_text, outputs=[label_output, lemma_output]
)
examples = gr.Examples(
examples,
fn=process_text,
inputs=input_text,
outputs=[label_output, lemma_output],
cache_examples=False,
)
demo.launch()