File size: 4,674 Bytes
0fd6af9
 
 
dad5c8f
 
0fd6af9
44562bb
939362b
44562bb
f1d4807
 
44562bb
f1d4807
 
 
 
 
 
 
 
 
 
 
 
 
 
 
939362b
 
f1d4807
 
58ee780
f1d4807
 
 
 
 
 
 
 
 
6cef16b
 
 
 
 
 
 
 
 
 
58ee780
6cef16b
 
 
 
 
 
 
 
 
 
 
 
58ee780
f1d4807
58ee780
 
 
f1d4807
 
 
 
58ee780
 
f1d4807
58ee780
 
 
 
f1d4807
 
 
 
 
939362b
 
 
 
58ee780
6cef16b
 
 
 
58ee780
 
6cef16b
58ee780
 
 
 
 
939362b
f1d4807
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# -*- coding: utf-8 -*-
import os

# Install the pinned "+cpu" builds of torch/torchvision/torchaudio at start-up.
# This runs before `transformers` is imported further down, so the order of
# these statements matters. The exit status of the command is not checked.
os.system("pip3 install torch==1.10.1+cpu torchvision==0.11.2+cpu torchaudio==0.10.1+cpu -f "
          "https://download.pytorch.org/whl/cpu/torch_stable.html")

import gradio as gr
from transformers import pipeline

import spacy
from spacy import displacy

# Lookup from the integer class id found in each pipeline result's 'entity'
# field to its tag string. Tags starting with 'B' open an entity, tags
# starting with 'I' continue one; id 0 is the "no entity" class.
ner_map = {
    0: '0',
    1: 'B-OSOBA',
    2: 'I-OSOBA',
    3: 'B-ORGANIZÁCIA',
    4: 'I-ORGANIZÁCIA',
    5: 'B-LOKALITA',
    6: 'I-LOKALITA',
}

# Options handed to displacy.render(): which entity labels to show and the
# highlight colour used for each of them.
options = {
    "ents": [
        "OSOBA",
        "ORGANIZÁCIA",
        "LOKALITA",
    ],
    "colors": {
        "OSOBA": "lightblue",
        "ORGANIZÁCIA": "lightcoral",
        "LOKALITA": "lightgreen",
    },
}

# 'ner' pipeline backed by the "crabz/slovakbert-ner" model; apply_ner() feeds
# its output to postprocess().
ner_pipeline = pipeline(task='ner', model="crabz/slovakbert-ner")
# Blank spaCy pipeline for language code "sk"; set_entities() uses it to turn
# the input sentence into the Doc that displaCy renders.
nlp = spacy.blank("sk")


def postprocess(classifications):
    """Collapse per-token NER predictions into labelled character spans.

    Args:
        classifications: Sequence of dicts, each providing an ``'entity'``
            class id (a key of ``ner_map``) plus ``'start'`` and ``'end'``
            character offsets.

    Returns:
        A list of ``(label, start, end)`` tuples. Every ``B-`` token opens a
        span that runs through the ``I-`` tokens directly following it.
        Afterwards, spans with the same label whose boundaries touch are
        fused, one pair at a time, until no such pair is left; each fused
        span is appended to the end of the list.
    """
    total = len(classifications)
    entities = []

    # Pass 1: one span per B- token, extended over the trailing I- tokens.
    for position, token in enumerate(classifications):
        class_id = token['entity']
        if class_id == 0:
            continue
        tag = ner_map[class_id]
        if tag[0] != 'B':
            continue
        stop = position + 1
        while stop < total and ner_map[classifications[stop]['entity']][0] == 'I':
            stop += 1
        entities.append((tag.split('-')[1], token['start'], classifications[stop - 1]['end']))

    def first_touching_pair():
        # Scan pairs in list order and report the first two distinct spans
        # that share a label and meet at a boundary.
        for offset, left in enumerate(entities):
            for right in entities[offset + 1:]:
                if left != right and left[0] == right[0] and \
                        (left[2] == right[1] or left[1] == right[2]):
                    return left, right
        return None

    # Pass 2: fuse touching spans until the list is stable.
    pair = first_touching_pair()
    while pair is not None:
        left, right = pair
        entities.remove(left)
        entities.remove(right)
        entities.append((left[0], min(left[1], right[1]), max(left[2], right[2])))
        pair = first_touching_pair()
    return entities


def set_entities(sentence, entities):
    """Build a spaCy ``Doc`` for *sentence* with *entities* set as ``doc.ents``.

    Args:
        sentence: The raw text; the character offsets in *entities* index
            into this string.
        entities: Iterable of ``(label, start_char, end_char)`` tuples, as
            returned by ``postprocess``.

    Returns:
        The ``Doc`` created by the module-level ``nlp`` object, with
        ``doc.ents`` holding one span per entity that could be mapped onto
        the document's tokens.
    """
    doc = nlp(sentence)
    spans = []
    for ee in entities:
        span = doc.char_span(ee[1], ee[2], ee[0])
        # char_span() yields None when the offsets do not fall on token
        # boundaries; a None inside doc.ents makes the assignment below
        # raise, so such entities are left out instead of failing the
        # whole request.
        if span is not None:
            spans.append(span)
    doc.ents = spans
    return doc


def apply_ner(sentence: str):
    """Run the NER model on *sentence* and return displaCy HTML markup.

    The raw pipeline output is condensed by ``postprocess``, attached to a
    spaCy doc by ``set_entities`` and rendered in displaCy's "ent" style
    using the module-level ``options``.
    """
    doc = set_entities(sentence, postprocess(ner_pipeline(sentence)))
    return displacy.render(doc, style="ent", options=options)


# Gradio UI: a single text input whose value is passed to apply_ner(); the
# returned displaCy markup is shown in an HTML output component. Each entry in
# `examples` is a one-element list holding a sample sentence for the text input.
intf = gr.Interface(fn=apply_ner, inputs="text", outputs="html", title='Slovak Named Entity Recognition',
                    allow_flagging=False,
                    examples=[["Laboratóriá Úradu verejného zdravotníctva sekvenovaním potvrdili výskyt ďalších "
                               "štyroch prípadov variantu omikron na Slovensku."],
                              ["Čaputová opakovane tvrdí, že \"spravodlivosť na Slovensku neplatí vždy pre všetkých "
                               "rovnako\"."],
                              ["Informácie o týchto veľkolepých plánoch prišli týždeň po tom, ako sa japonský "
                               "miliardár Jusaku Maezawa vrátil z 12-dňového pobytu na Medzinárodnej vesmírnej stanici "
                               "(ISS), čím sa stal prvým vesmírnym turistom, ktorý cestoval na ISS za viac ako desať "
                               "rokov."],
                              ["Minister financií a líder mandátovo najsilnejšieho hnutia OĽaNO Igor Matovič "
                               "upozorňuje, že následky tretej vlny budú na Slovensku veľmi veľké."],
                              ["Začiatkom roka 2021 sa objavili nezhody medzi Richardom Sulíkom a šéfom hnutia OĽANO "
                               "Igorom Matovičom, ktoré v istej miere pretrvávajú aj dodnes."]],
                    description="Named-entity recognition (NER) labels named-entities in unstructured text. This "
                                "implementation supports three labels: person (OSOBA), organization (ORGANIZÁCIA) and "
                                "location (LOKALITA). You can try out one of the examples below or type your own "
                                "sentence. Don't forget to use double quotes (\" \") instead of curved quotes („ “)",
                    article="")
# Start the web app as soon as the module is executed (no __main__ guard).
intf.launch()