crabz committed on
Commit
58ee780
1 Parent(s): cb255ed

better postprocessing

Browse files
Files changed (1) hide show
  1. app.py +37 -10
app.py CHANGED
@@ -1,8 +1,8 @@
1
  # -*- coding: utf-8 -*-
2
  import os
3
 
4
- os.system("pip3 install torch==1.10.1+cpu torchvision==0.11.2+cpu torchaudio==0.10.1+cpu -f "
5
- "https://download.pytorch.org/whl/cpu/torch_stable.html")
6
 
7
  import gradio as gr
8
  from transformers import pipeline
@@ -29,9 +29,7 @@ ner_pipeline = pipeline(task='ner', model="crabz/slovakbert-ner")
29
  nlp = spacy.blank("sk")
30
 
31
 
32
- def apply_ner(sentence: str):
33
- classifications = ner_pipeline(sentence)
34
-
35
  entities = []
36
  for i in range(len(classifications)):
37
  if classifications[i]['entity'] != 0:
@@ -41,13 +39,37 @@ def apply_ner(sentence: str):
41
  j += 1
42
  entities.append((ner_map[classifications[i]['entity']].split('-')[1], classifications[i]['start'],
43
  classifications[j - 1]['end']))
44
- doc = nlp(sentence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
 
 
 
46
  ents = []
47
  for ee in entities:
48
  ents.append(doc.char_span(ee[1], ee[2], ee[0]))
49
  doc.ents = ents
 
 
50
 
 
 
 
 
51
  displacy_html = displacy.render(doc, style="ent", options=options)
52
  return displacy_html
53
 
@@ -57,9 +79,14 @@ intf = gr.Interface(fn=apply_ner, inputs="text", outputs="html", title='Slovak N
57
  examples=[["Laboratóriá Úradu verejného zdravotníctva sekvenovaním potvrdili výskyt ďalších "
58
  "štyroch prípadov variantu omikron na Slovensku."],
59
  ["Čaputová opakovane tvrdí, že \"spravodlivosť na Slovensku neplatí vždy pre všetkých "
60
- "rovnako\"."]],
61
- description="Named Entity Recognition (NER) labels persons, organizations and locations in the "
62
- "given sentence. You can try out one of the examples below or type your own sentence. "
63
- "Don't forget to use double quotes (\") instead of curved quotes („“)",
 
 
 
 
 
64
  article="")
65
  intf.launch()
 
1
  # -*- coding: utf-8 -*-
2
  import os
3
 
4
+ # os.system("pip3 install torch==1.10.1+cpu torchvision==0.11.2+cpu torchaudio==0.10.1+cpu -f "
5
+ # "https://download.pytorch.org/whl/cpu/torch_stable.html")
6
 
7
  import gradio as gr
8
  from transformers import pipeline
 
29
  nlp = spacy.blank("sk")
30
 
31
 
32
+ def postprocess(classifications):
 
 
33
  entities = []
34
  for i in range(len(classifications)):
35
  if classifications[i]['entity'] != 0:
 
39
  j += 1
40
  entities.append((ner_map[classifications[i]['entity']].split('-')[1], classifications[i]['start'],
41
  classifications[j - 1]['end']))
42
+ to_remove = []
43
+ merged_entities = []
44
+ for i in range(len(entities)):
45
+ for j in range(i + 1, len(entities)):
46
+ if entities[i] != entities[j] and entities[i][0] == entities[j][0] and (entities[i][2] == entities[j][1] or
47
+ entities[i][1] == entities[j][2]):
48
+ to_remove.append(entities[i])
49
+ to_remove.append(entities[j])
50
+
51
+ new_start = min(entities[i][1], entities[j][1])
52
+ new_end = max(entities[i][2], entities[j][2])
53
+ merged_entities.append((entities[i][0], new_start, new_end))
54
+ for ent in to_remove:
55
+ entities.remove(ent)
56
+ entities += merged_entities
57
+ return entities
58
 
59
+
60
def set_entities(sentence, entities):
    """Tokenize *sentence* and attach *entities* to the doc as entity spans.

    Args:
        sentence: Input text; it is passed to the module-level ``nlp``
            pipeline to build the doc.
        entities: Iterable of ``(label, start_char, end_char)`` tuples, the
            shape produced by ``postprocess``.

    Returns:
        The doc created from *sentence* with ``doc.ents`` assigned.
    """
    doc = nlp(sentence)
    spans = []
    for label, start_char, end_char in entities:
        span = doc.char_span(start_char, end_char, label)
        # char_span yields None when the character offsets do not line up
        # with token boundaries; putting None into doc.ents would raise, so
        # such entities are dropped instead of failing the whole request.
        if span is not None:
            spans.append(span)
    doc.ents = spans
    return doc
67
+
68
 
69
def apply_ner(sentence: str):
    """Run named-entity recognition on *sentence* and return its rendering.

    The sentence is classified by ``ner_pipeline``, the raw classifications
    are turned into entity tuples by ``postprocess``, those are attached to a
    doc by ``set_entities``, and the doc is rendered with ``displacy.render``
    in ``"ent"`` style using the module-level ``options``.

    Returns:
        The value produced by ``displacy.render`` (the Gradio interface in
        this file consumes it as its ``"html"`` output).
    """
    raw_predictions = ner_pipeline(sentence)
    annotated_doc = set_entities(sentence, postprocess(raw_predictions))
    return displacy.render(annotated_doc, style="ent", options=options)
75
 
 
79
  examples=[["Laboratóriá Úradu verejného zdravotníctva sekvenovaním potvrdili výskyt ďalších "
80
  "štyroch prípadov variantu omikron na Slovensku."],
81
  ["Čaputová opakovane tvrdí, že \"spravodlivosť na Slovensku neplatí vždy pre všetkých "
82
+ "rovnako\"."],
83
+ ["Minister financií a líder mandátovo najsilnejšieho hnutia OĽaNO Igor Matovič "
84
+ "upozorňuje, že následky tretej vlny budú na Slovensku veľmi veľké."],
85
+ ["Začiatkom roka sa objavili nezhody medzi Richardom Sulíkom a šéfom hnutia OĽANO "
86
+ "Igorom Matovičom, ktoré v istej miere pretrvávajú aj dodnes."]],
87
+ description="Named-entity recognition (NER) labels named-entities in unstructured text. This "
88
+ "implementation supports three labels: person (OSOBA), organization (ORGANIZÁCIA) and "
89
+ "location (LOKALITA). You can try out one of the examples below or type your own "
90
+ "sentence. Don't forget to use double quotes (\" \") instead of curved quotes („ “)",
91
  article="")
92
  intf.launch()