# Streamlit app to highlight NER entities
import random

import streamlit as st
from datasets import load_dataset
from annotated_text import annotated_text

# Load data: the same tweets tagged by two different NER models.
ds = load_dataset("hs-knowledge/hateval_ner")
ds_2 = load_dataset("hs-knowledge/hateval_ner_2")

# Number of randomly drawn training examples shown on the page.
SAMPLE_SIZE = 300


def _merge_tokens(tokens, labels):
    """Group consecutive tokens that share a label into text chunks.

    Args:
        tokens: Sequence of token strings.
        labels: Sequence of labels aligned with ``tokens``. A label may be
            ``"O"`` (no entity), ``"B-X"`` / ``"I-X"`` (begin / inside an
            entity of type ``X``), a bare type name ``"X"``, or ``None``.

    Returns:
        A list of ``(text, entity_type)`` tuples in reading order, where
        ``entity_type`` is ``None`` for plain (non-entity) text.
    """
    chunks = []
    words = []
    current_type = None

    def flush():
        # Never emit an empty chunk (e.g. when the very first token
        # already starts an entity).
        if words:
            chunks.append((" ".join(words).strip(), current_type))

    for token, label in zip(tokens, labels):
        if label is None:
            # Token carries no prediction (perhaps the text was too long
            # for the model); it is left out of the rendering.
            continue

        if label == "O":
            entity = None
            continues = current_type is None
        elif label.startswith("B-"):
            # An explicit "begin" tag always opens a new chunk.
            entity = label[2:]
            continues = False
        elif label.startswith("I-"):
            # An "inside" tag extends the current chunk only when it is an
            # entity of the same type; an orphan I- tag must not swallow
            # preceding plain text or re-label a different entity.
            entity = label[2:]
            continues = entity == current_type
        else:
            # Bare label without a B-/I- prefix: adjacent tokens with the
            # same label are merged into one entity.
            entity = label
            continues = entity == current_type

        if not continues:
            flush()
            words = []
        words.append(token)
        current_type = entity

    flush()
    return chunks


# Show highlighted ner entities in a tweet
def display_text(example):
    """Render one example with its NER entities highlighted.

    Args:
        example: Mapping with a ``"ner_output"`` entry that holds the
            parallel sequences ``"tokens"`` and ``"labels"``.
    """
    ner_output = example["ner_output"]
    chunks = _merge_tokens(ner_output["tokens"], ner_output["labels"])
    # annotated_text takes plain strings for ordinary text and
    # (text, label) tuples for highlighted spans.
    annotated_text(
        *[(text, kind) if kind is not None else text for text, kind in chunks]
    )


# Draw a random sample of distinct training examples. The same indices are
# selected from both datasets so the two model outputs can be shown side by
# side (assumes both datasets list the tweets in the same order).
train_size = len(ds["train"])
elements = random.sample(range(train_size), k=min(SAMPLE_SIZE, train_size))
ds["train"] = ds["train"].select(elements)
ds_2["train"] = ds_2["train"].select(elements)

for ex1, ex2 in zip(ds["train"], ds_2["train"]):
    st.write("====================================")
    st.write("NER model: robertuito", "\n")
    display_text(ex1)
    st.write("NER model: roberta-large", "\n")
    display_text(ex2)
    st.write("\n")
    st.write(f"Original text: {ex1['text']}")