# ner_app/app.py — Streamlit NER highlighting demo
# (Hugging Face Space by finiteautomata; commit 43e2fd9 "Add other datasets")
# Streamlit app to highlight NER entities
import random
import streamlit as st
from datasets import load_dataset
from annotated_text import annotated_text
# Show highlighted ner entities in a tweet
def display_ner(example):
    """Render a tweet with its NER output highlighted via annotated_text.

    Groups consecutive tokens into chunks using BIO-style labels:
      - "O"             -> plain (unannotated) text
      - "B-X" / "I-X"   -> entity of type X
      - any other label -> bare per-token type; runs of equal labels merge
    Tokens whose label is None (e.g. input truncated by the model) are
    skipped entirely.
    """
    ner_output = example["ner_output"]
    chunks = []          # list of (text, type-or-None) in document order
    current_chunk = ""   # tokens accumulated for the chunk being built
    current_type = None  # entity type of the open chunk (None = plain text)

    for token, label in zip(ner_output["tokens"], ner_output["labels"]):
        if label is None:
            # Token was not labeled (perhaps the input was too long); skip.
            continue
        if label == "O":
            if current_type is not None:
                # Close the open entity chunk before starting plain text.
                chunks.append((current_chunk.strip(), current_type))
                current_chunk = token + " "
            else:
                current_chunk += token + " "
            current_type = None
        elif label.startswith("B-"):
            # Beginning of a new entity: flush whatever was accumulated.
            if current_chunk:
                chunks.append((current_chunk.strip(), current_type))
            current_chunk = token + " "
            current_type = label[2:]
        elif label.startswith("I-"):
            # Continuation of the current entity.
            current_chunk += token + " "
            current_type = label[2:]
        else:
            # Bare label (no B-/I- prefix): each run of equal labels is one chunk.
            if label != current_type:
                if current_chunk:
                    # Guard: at the very start current_chunk is "" — the old
                    # code appended a spurious empty ("", None) chunk here.
                    chunks.append((current_chunk.strip(), current_type))
                current_chunk = token + " "
            else:
                current_chunk += token + " "
            current_type = label
    if current_chunk:
        chunks.append((current_chunk.strip(), current_type))

    # annotated_text expects plain strings for unannotated spans.
    chunks = [(c, t) if t is not None else c for c, t in chunks]
    annotated_text(*chunks)
def display_text(example, text_column):
    """Render example[text_column] with its linked entities highlighted.

    The character offsets stored in the dataset may not match this text
    field, so each entity is re-located with str.find on its surface form.
    Entities whose text cannot be found are skipped (the old code kept the
    -1 returned by find and produced bogus slices), and entities are sorted
    by the *recomputed* start so chunks come out in document order.
    NOTE: entity dicts are mutated in place (start/end updated), matching
    the original behavior.
    """
    text = example[text_column]

    # Recompute each entity's span in this text; drop entities not present.
    located = []
    for entity in example["entities"]:
        start = text.find(entity["text"])
        if start == -1:
            # Surface form not present verbatim (tokenization artifacts,
            # casing, etc.); skip rather than emit a broken slice.
            continue
        entity["start"] = start
        entity["end"] = start + len(entity["text"])
        located.append(entity)

    if not located:
        annotated_text(text)
        return

    # Sort by the recomputed offsets (the old code sorted by stale ones).
    located.sort(key=lambda e: e["start"])

    chunks = []
    last_index = 0
    for entity in located:
        start, end = entity["start"], entity["end"]
        if start < last_index:
            # Overlaps the previous entity (duplicate surface forms both
            # find the same first occurrence); skip the duplicate.
            continue
        if last_index < start:
            chunks.append((text[last_index:start], None))
        chunks.append((entity["text"], entity["type"]))
        last_index = end
    if last_index < len(text):
        chunks.append((text[last_index:], None))

    # annotated_text expects plain strings for unannotated spans.
    chunks = [(c, t) if t is not None else c for c, t in chunks]
    annotated_text(*chunks)
# Sidebar selectbox to choose which enriched dataset to browse.
selected_dataset = st.sidebar.selectbox(
    "Select dataset", ["hateval_enriched", "sbf-enriched", "hatecheck-enriched"]
)

# Load the selected dataset from the hs-knowledge hub namespace.
ds = load_dataset(f"hs-knowledge/{selected_dataset}")

# Each dataset stores its text under a different column name.
text_column = {
    "hateval_enriched": "text",
    "sbf-enriched": "post",
    "hatecheck-enriched": "test_case",
}

# Show a random sample of at most 50 *distinct* examples. random.sample
# (without replacement) fixes the duplicate rows that random.choices
# (with replacement) could produce, and min() handles small datasets.
n_train = len(ds["train"])
elements = random.sample(range(n_train), k=min(50, n_train))
ds["train"] = ds["train"].select(elements)

for ex in ds["train"]:
    # display_text(ex, text_column[selected_dataset])
    st.markdown("---")
    display_ner(ex)
    with st.expander("Show entities"):
        for ent in ex["entities"]:
            entity_name = ent["text"]
            entity_type = ent["type"]
            entity_description = ent["kg_result"]["detailedDescription"]["articleBody"]
            st.write(f"{entity_name} ({entity_type}): {entity_description}")