# ner_app / app.py
# finiteautomata's picture
# First commit
# 8739181
# raw
# history blame
# No virus
# 2.67 kB
# Streamlit app to highlight NER entities
import random
import streamlit as st
from datasets import load_dataset
from annotated_text import annotated_text
# Load the two datasets compared by this app. Further down, rows of `ds` are
# rendered under the "robertuito" heading and rows of `ds_2` under
# "roberta-large".
ds, ds_2 = (
    load_dataset(dataset_id)
    for dataset_id in ("hs-knowledge/hateval_ner", "hs-knowledge/hateval_ner_2")
)
def _entity_type(label):
    """Return the entity type encoded by a tag, or None for the outside tag "O".

    Tags of the form "B-X" / "I-X" yield "X"; any other tag is taken to be the
    entity type itself.
    """
    if label == "O":
        return None
    if label.startswith(("B-", "I-")):
        return label[2:]
    return label


def _group_tokens(tokens, labels):
    """Merge consecutive tokens that share an entity type into chunks.

    Args:
        tokens: sequence of token strings.
        labels: sequence of tags parallel to ``tokens``. A tag may be "O",
            "B-X", "I-X", a bare type name, or None.

    Returns:
        A list of ``(text, entity_type)`` pairs in reading order, where
        ``entity_type`` is None for plain (non-entity) text. Tokens inside a
        chunk are joined with single spaces and the result is stripped.
    """
    chunks = []
    words = []
    current_type = None

    for token, label in zip(tokens, labels):
        if label is None:
            # Tokens without a prediction are dropped (the original author
            # suspected these come from over-long inputs -- unconfirmed).
            continue

        entity_type = _entity_type(label)
        # A "B-" tag always opens a new chunk, even right after a chunk of the
        # same type. Any other tag opens one only when the type changes, so an
        # "I-X" that follows plain text no longer turns that text into an
        # entity.
        opens_chunk = label.startswith("B-") or entity_type != current_type
        if opens_chunk and words:
            chunks.append((" ".join(words).strip(), current_type))
            words = []

        current_type = entity_type
        words.append(token)

    # Flush whatever is still pending; nothing is emitted for empty input.
    if words:
        chunks.append((" ".join(words).strip(), current_type))
    return chunks


def display_text(example):
    """Show the NER entities of one tweet highlighted with ``annotated_text``.

    Args:
        example: mapping with a "ner_output" entry that holds the parallel
            sequences "tokens" and "labels".

    Entity chunks are passed to ``annotated_text`` as ``(text, type)`` tuples
    and plain-text chunks as bare strings.
    """
    ner_output = example["ner_output"]
    chunks = _group_tokens(ner_output["tokens"], ner_output["labels"])
    annotated_text(
        *(text if kind is None else (text, kind) for text, kind in chunks)
    )
# Pick a random subset of examples to display. The same row indices are
# applied to both datasets, which assumes the two are row-aligned
# (TODO confirm against how the datasets were built).
SAMPLE_SIZE = 300
# Cap at the smaller split so every drawn index is valid for both datasets.
available = min(len(ds["train"]), len(ds_2["train"]))
# random.sample draws without replacement, so no example is shown twice, and
# limiting k avoids a ValueError when fewer than SAMPLE_SIZE rows exist.
elements = random.sample(range(available), k=min(SAMPLE_SIZE, available))
ds["train"] = ds["train"].select(elements)
ds_2["train"] = ds_2["train"].select(elements)

# Render each sampled example twice, once per model, followed by its raw text.
for ex1, ex2 in zip(ds["train"], ds_2["train"]):
    st.write("====================================")
    st.write("NER model: robertuito", "\n")
    display_text(ex1)
    st.write("NER model: roberta-large", "\n")
    display_text(ex2)
    st.write("\n")
    st.write(f"Original text: {ex1['text']}")