# Captured from a Hugging Face Space (status at capture time: Sleeping)
# Streamlit app to highlight NER entities
import random

import streamlit as st
from annotated_text import annotated_text
from datasets import load_dataset
# Load data: two copies of the dataset, each carrying the NER output of a
# different model. Further down, rows of `ds` are shown under the caption
# "NER model: robertuito" and rows of `ds_2` under "NER model: roberta-large".
ds = load_dataset("hs-knowledge/hateval_ner")
ds_2 = load_dataset("hs-knowledge/hateval_ner_2")
# Show highlighted ner entities in a tweet
def _split_label(label):
    """Return ``(entity_type, starts_new_span)`` for a single NER label.

    ``"O"`` maps to ``(None, False)`` (plain text), ``"B-X"`` to
    ``("X", True)``, ``"I-X"`` to ``("X", False)``. Any other label is used
    verbatim as the entity type and does not force a new span.
    """
    if label == "O":
        return None, False
    if label.startswith("B-"):
        return label[2:], True
    if label.startswith("I-"):
        return label[2:], False
    # Label without a B-/I- prefix: the label itself is the entity type.
    return label, False


def _group_tokens(tokens, labels):
    """Merge consecutive tokens into ``(text, entity_type)`` spans.

    A new span starts whenever the entity type changes or a ``B-`` label is
    seen. ``entity_type`` is ``None`` for plain (non-entity) text. Tokens
    whose label is ``None`` are skipped.
    """
    spans = []
    words = []
    span_type = None

    for token, label in zip(tokens, labels):
        if label is None:
            # No label for this token (original author's guess: the input
            # was too long for the model) -- leave the token out.
            continue

        entity_type, starts_new = _split_label(label)

        # Close the running span on a type change or an explicit B- tag.
        # Checking `words` first avoids emitting an empty leading span, and
        # comparing types keeps an I- tag that follows plain text (or a
        # different entity) from being glued onto the previous span.
        if words and (starts_new or entity_type != span_type):
            spans.append((" ".join(words).strip(), span_type))
            words = []

        span_type = entity_type
        words.append(token)

    if words:
        spans.append((" ".join(words).strip(), span_type))

    return spans


def display_text(example):
    """Render one example with its NER entities highlighted.

    Args:
        example: mapping with a ``"ner_output"`` entry that itself holds
            parallel ``"tokens"`` and ``"labels"`` sequences.

    The tokens are grouped into spans and passed to ``annotated_text``:
    entity spans as ``(text, entity_type)`` tuples, plain text as bare
    strings. Nothing is returned.
    """
    ner_output = example["ner_output"]
    spans = _group_tokens(ner_output["tokens"], ner_output["labels"])

    # annotated_text takes plain strings for unlabelled text and
    # (text, label) tuples for highlighted entities.
    chunks = [
        text if entity_type is None else (text, entity_type)
        for text, entity_type in spans
    ]
    annotated_text(*chunks)
# Show a random sample of examples (not the first rows of the dataset).
# random.sample draws without replacement, so no tweet is shown twice; the
# sample size is capped at the number of available rows.
SAMPLE_SIZE = 300
train_size = len(ds["train"])
elements = random.sample(range(train_size), k=min(SAMPLE_SIZE, train_size))

# Apply the same row indices to both datasets so the examples stay paired.
# NOTE(review): this assumes both datasets have the same rows in the same
# order -- confirm against how they were built.
ds["train"] = ds["train"].select(elements)
ds_2["train"] = ds_2["train"].select(elements)

# For every sampled tweet, show both models' annotations and then the raw text.
for ex1, ex2 in zip(ds["train"], ds_2["train"]):
    st.write("====================================")
    st.write("NER model: robertuito", "\n")
    display_text(ex1)
    st.write("NER model: roberta-large", "\n")
    display_text(ex2)
    st.write("\n")
    st.write(f"Original text: {ex1['text']}")