# Hugging Face Space: crabz / sk-ner
# Space status: Runtime error
# File size: 4,674 Bytes

# -*- coding: utf-8 -*-
import os

import subprocess
import sys

# Install the pinned CPU-only PyTorch wheels at start-up, before the
# third-party imports that follow.  pip is invoked as
# "<running interpreter> -m pip" with an argument list (no shell), so the
# packages land in the interpreter that is executing this script and nothing
# is subject to shell parsing.
_TORCH_INSTALL_CMD = [
    sys.executable, "-m", "pip", "install",
    "torch==1.10.1+cpu",
    "torchvision==0.11.2+cpu",
    "torchaudio==0.10.1+cpu",
    "-f", "https://download.pytorch.org/whl/cpu/torch_stable.html",
]
_torch_install = subprocess.run(_TORCH_INSTALL_CMD, check=False)
if _torch_install.returncode != 0:
    # A failed install is reported but deliberately not fatal: start-up
    # continues and uses whatever torch build is already available, if any.
    print("warning: pip install of CPU PyTorch wheels exited with status "
          f"{_torch_install.returncode}", file=sys.stderr)

import gradio as gr
from transformers import pipeline

import spacy
from spacy import displacy

# Lookup from integer class id to tag string.  Apart from id 0, every tag has
# the form "<B|I>-<LABEL>"; postprocess() looks at the first character and at
# the part after the dash.
ner_map = dict(enumerate((
    '0',
    'B-OSOBA',
    'I-OSOBA',
    'B-ORGANIZÁCIA',
    'I-ORGANIZÁCIA',
    'B-LOKALITA',
    'I-LOKALITA',
)))

# Rendering options handed to displacy.render() in apply_ner(): the entity
# labels to display and the highlight colour used for each of them.
options = {
    "ents": ["OSOBA", "ORGANIZÁCIA", "LOKALITA"],
    "colors": {
        "OSOBA": "lightblue",
        "ORGANIZÁCIA": "lightcoral",
        "LOKALITA": "lightgreen",
    },
}

# transformers NER pipeline backed by the "crabz/slovakbert-ner" model.
ner_pipeline = pipeline(model="crabz/slovakbert-ner", task='ner')
# Blank spaCy pipeline for language code "sk"; set_entities() uses it to turn
# raw text into a Doc.
nlp = spacy.blank("sk")


def _first_touching_pair(spans):
    """Return the first two distinct same-label spans that touch, else None.

    Pairs are examined in list order (earlier element first).  Two spans
    "touch" when the end offset of one equals the start offset of the other.
    """
    for pos, left in enumerate(spans):
        for right in spans[pos + 1:]:
            if left == right or left[0] != right[0]:
                continue
            if left[2] == right[1] or left[1] == right[2]:
                return left, right
    return None


def postprocess(classifications):
    """Collapse per-token predictions into ``(label, start, end)`` entities.

    Args:
        classifications: Sequence of dicts, each with an ``'entity'`` key
            (a key of ``ner_map``) and ``'start'`` / ``'end'`` offsets.

    Returns:
        A list of ``(label, start, end)`` tuples.  Every token whose tag
        begins with ``'B'`` opens an entity that runs through the directly
        following tokens whose tag begins with ``'I'``.  Afterwards, entities
        with the same label whose offsets touch are merged repeatedly until
        no such pair is left; each merged entity is appended at the end of
        the list.
    """
    total = len(classifications)
    spans = []
    for idx, token in enumerate(classifications):
        tag_id = token['entity']
        if tag_id == 0:
            continue
        tag = ner_map[tag_id]
        if tag[0] != 'B':
            continue
        # Walk forward over the run of 'I' tokens that follows this 'B' token.
        stop = idx + 1
        while stop < total and ner_map[classifications[stop]['entity']][0] == 'I':
            stop += 1
        spans.append((tag.split('-')[1], token['start'], classifications[stop - 1]['end']))

    # Fuse one touching pair per pass; every pass shortens the list by one,
    # so the loop ends once no pair qualifies.
    pair = _first_touching_pair(spans)
    while pair is not None:
        left, right = pair
        spans.remove(left)
        spans.remove(right)
        spans.append((left[0], min(left[1], right[1]), max(left[2], right[2])))
        pair = _first_touching_pair(spans)
    return spans


def set_entities(sentence, entities):
    """Tokenize *sentence* and attach *entities* to the resulting spaCy Doc.

    Args:
        sentence: Raw input text.
        entities: Iterable of ``(label, start_char, end_char)`` tuples whose
            offsets are character positions in *sentence*.

    Returns:
        The Doc produced by the module-level ``nlp`` pipeline, with
        ``doc.ents`` set to the spans that could be built from *entities*.
    """
    doc = nlp(sentence)
    spans = []
    for ent in entities:
        label, start_char, end_char = ent[0], ent[1], ent[2]
        # The offsets need not coincide with spaCy's token boundaries.  With
        # the default strict alignment char_span() returns None in that case;
        # "expand" snaps the span outward to the enclosing tokens instead.
        span = doc.char_span(start_char, end_char, label=label, alignment_mode="expand")
        # None is still possible (e.g. offsets outside the text) and must not
        # reach doc.ents, which only accepts Span objects.
        if span is not None:
            spans.append(span)
    # doc.ents rejects overlapping spans; filter_spans() drops duplicates and
    # overlaps, preferring the longest span.
    doc.ents = spacy.util.filter_spans(spans)
    return doc


def apply_ner(sentence: str):
    """Run NER on *sentence* and return the displaCy entity markup.

    The sentence goes through ``ner_pipeline``; its output is condensed by
    ``postprocess`` and attached to a Doc by ``set_entities``.  That Doc is
    rendered with ``displacy.render`` in ``"ent"`` style using the
    module-level ``options``.
    """
    token_predictions = ner_pipeline(sentence)
    annotated_doc = set_entities(sentence, postprocess(token_predictions))
    return displacy.render(annotated_doc, style="ent", options=options)


# Example inputs passed to the interface; each inner list holds the single
# text value for one example.
_EXAMPLE_SENTENCES = [
    ["Laboratóriá Úradu verejného zdravotníctva sekvenovaním potvrdili výskyt ďalších "
     "štyroch prípadov variantu omikron na Slovensku."],
    ["Čaputová opakovane tvrdí, že \"spravodlivosť na Slovensku neplatí vždy pre všetkých "
     "rovnako\"."],
    ["Informácie o týchto veľkolepých plánoch prišli týždeň po tom, ako sa japonský "
     "miliardár Jusaku Maezawa vrátil z 12-dňového pobytu na Medzinárodnej vesmírnej stanici "
     "(ISS), čím sa stal prvým vesmírnym turistom, ktorý cestoval na ISS za viac ako desať "
     "rokov."],
    ["Minister financií a líder mandátovo najsilnejšieho hnutia OĽaNO Igor Matovič "
     "upozorňuje, že následky tretej vlny budú na Slovensku veľmi veľké."],
    ["Začiatkom roka 2021 sa objavili nezhody medzi Richardom Sulíkom a šéfom hnutia OĽANO "
     "Igorom Matovičom, ktoré v istej miere pretrvávajú aj dodnes."],
]

# Description text passed to the interface.
_DESCRIPTION = (
    "Named-entity recognition (NER) labels named-entities in unstructured text. This "
    "implementation supports three labels: person (OSOBA), organization (ORGANIZÁCIA) and "
    "location (LOKALITA). You can try out one of the examples below or type your own "
    "sentence. Don't forget to use double quotes (\" \") instead of curved quotes („ “)"
)

# Text in, HTML out: apply_ner() produces the markup that the interface shows.
intf = gr.Interface(
    fn=apply_ner,
    inputs="text",
    outputs="html",
    title='Slovak Named Entity Recognition',
    description=_DESCRIPTION,
    article="",
    examples=_EXAMPLE_SENTENCES,
    allow_flagging=False,
)
intf.launch()