# Streamlit app to highlight NER entities in tweets from enriched
# hate-speech datasets (hs-knowledge/* on the Hugging Face hub).
import random

import streamlit as st
from annotated_text import annotated_text
from datasets import load_dataset


def display_ner(example):
    """Render a tweet with its NER entities highlighted.

    Args:
        example: dataset row with an "ner_output" dict holding parallel
            "tokens" and "labels" lists. Labels look BIO-style ("B-X",
            "I-X", "O"), may be a bare type name, or None — TODO confirm
            against the enrichment pipeline.
    """
    ner_output = example["ner_output"]
    chunks = []  # list of (text, entity_type or None) spans
    current_chunk = ""
    current_type = None

    for token, label in zip(ner_output["tokens"], ner_output["labels"]):
        if label is None:
            # Presumably the token fell past the model's max length; skip.
            continue
        if label == "O":
            if current_type is not None:
                # Close the open entity before accumulating plain text.
                chunks.append((current_chunk.strip(), current_type))
                current_chunk = token + " "
            else:
                current_chunk += token + " "
            current_type = None
        elif label.startswith("B-"):
            # A new entity begins: flush whatever was accumulated so far.
            if current_chunk:
                chunks.append((current_chunk.strip(), current_type))
            current_chunk = token + " "
            current_type = label[2:]
        elif label.startswith("I-"):
            # Continuation of the current entity.
            current_chunk += token + " "
            current_type = label[2:]
        else:
            # Bare label without a B-/I- prefix: group consecutive tokens
            # sharing the same label into a single span.
            if label != current_type:
                # Guard against appending an empty leading span
                # (fix: the original could emit ("", None) here).
                if current_chunk:
                    chunks.append((current_chunk.strip(), current_type))
                current_chunk = token + " "
            else:
                current_chunk += token + " "
            current_type = label
    if current_chunk:
        chunks.append((current_chunk.strip(), current_type))

    # annotated_text expects plain strings for unannotated spans and
    # (text, label) tuples for annotated ones.
    chunks = [(c, t) if t is not None else c for c, t in chunks]
    annotated_text(*chunks)


def display_text(example, text_column):
    """Render a text with its knowledge-graph entities highlighted.

    Args:
        example: dataset row with the raw text under *text_column* and an
            "entities" list of dicts with "text", "type", "start", "end".
        text_column: name of the column holding the raw text.
    """
    text = example[text_column]
    # Re-anchor each entity by searching for its surface form: the stored
    # offsets may not match this exact text. Entities whose surface form
    # cannot be found are dropped (fix: the original kept start=-1 from
    # str.find, which corrupted the chunking below).
    entities = []
    for entity in sorted(example["entities"], key=lambda x: x["start"]):
        start = text.find(entity["text"])
        if start == -1:
            continue
        entity["start"] = start
        entity["end"] = start + len(entity["text"])
        entities.append(entity)

    if not entities:
        annotated_text(text)
        return

    chunks = []
    last_index = 0
    for entity in entities:
        start, end = entity["start"], entity["end"]
        if last_index < start:
            # Plain text between the previous entity and this one.
            chunks.append((text[last_index:start], None))
        chunks.append((entity["text"], entity["type"]))
        last_index = end
    if last_index < len(text):
        chunks.append((text[last_index:], None))

    chunks = [(c, t) if t is not None else c for c, t in chunks]
    annotated_text(*chunks)


# Sidebar control to choose which enriched dataset to browse.
selected_dataset = st.sidebar.selectbox(
    "Select dataset", ["hateval_enriched", "sbf-enriched", "hatecheck-enriched"]
)

# Load data
ds = load_dataset(f"hs-knowledge/{selected_dataset}")

# Name of the raw-text column in each dataset.
text_column = {
    "hateval_enriched": "text",
    "sbf-enriched": "post",
    "hatecheck-enriched": "test_case",
}

# Show a random sample of up to 50 *distinct* training examples
# (fix: random.choices samples with replacement, so the same example
# could appear several times; random.sample does not).
k = min(50, len(ds["train"]))
elements = random.sample(range(len(ds["train"])), k=k)
ds["train"] = ds["train"].select(elements)

for ex in ds["train"]:
    # Alternative view: display_text(ex, text_column[selected_dataset])
    st.markdown("---")
    display_ner(ex)
    with st.expander("Show entities"):
        for ent in ex["entities"]:
            entity_name = ent["text"]
            entity_type = ent["type"]
            # NOTE(review): assumes every entity has a kg_result with a
            # detailedDescription — a KeyError here means incomplete
            # enrichment; confirm upstream guarantees.
            entity_description = ent["kg_result"]["detailedDescription"]["articleBody"]
            st.write(f"{entity_name} ({entity_type}): {entity_description}")