Spaces:
Sleeping
Sleeping
File size: 2,668 Bytes
8739181 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
# Streamlit app to highlight NER entities
import random
import streamlit as st
from datasets import load_dataset
from annotated_text import annotated_text
# Load the two prediction sets that are compared below (one per NER model)
ds, ds_2 = (
    load_dataset(repo_id)
    for repo_id in ("hs-knowledge/hateval_ner", "hs-knowledge/hateval_ner_2")
)
# Show highlighted ner entities in a tweet
def display_text(example):
    """Render one example with its NER entities highlighted.

    Consecutive tokens are merged into chunks and passed to
    ``annotated_text``: a plain-text chunk is passed as a string, an
    entity chunk as a ``(text, entity_type)`` tuple.

    Each label decides whether its token extends the current chunk:

    * ``None``  -- the token is skipped (presumably it fell outside the
      model's maximum length -- TODO confirm).
    * ``"O"``   -- plain text; extends the current chunk only if that
      chunk is plain text too.
    * ``"B-X"`` -- always opens a new entity chunk of type ``X``.
    * ``"I-X"`` -- extends the current chunk only if it already has type
      ``X``; otherwise it opens a new ``X`` chunk, so an ``I-`` tag that
      follows plain text never relabels that text as an entity.
    * anything else -- treated as a bare entity type; extends the current
      chunk only if the type matches.

    Args:
        example: Mapping with an ``"ner_output"`` entry that holds the
            parallel sequences ``"tokens"`` and ``"labels"``.
    """
    ner_output = example["ner_output"]
    chunks = []          # finished chunks, in reading order
    words = []           # tokens of the chunk currently being built
    current_type = None  # entity type of that chunk; None means plain text

    def flush():
        # Emit the chunk in progress, if any; never emit an empty chunk.
        if words:
            text = " ".join(words).strip()
            chunks.append(text if current_type is None else (text, current_type))
            words.clear()

    for token, label in zip(ner_output["tokens"], ner_output["labels"]):
        if label is None:
            continue

        if label == "O":
            entity_type = None
            extends_chunk = current_type is None
        elif label.startswith("B-"):
            entity_type = label[2:]
            extends_chunk = False
        elif label.startswith("I-"):
            entity_type = label[2:]
            extends_chunk = entity_type == current_type
        else:
            # Label without a B-/I- prefix: the label itself is the type.
            entity_type = label
            extends_chunk = entity_type == current_type

        if not extends_chunk:
            flush()
        words.append(token)
        current_type = entity_type

    flush()
    annotated_text(*chunks)
# Show a random sample of examples, rendering both models' entities for each
SAMPLE_SIZE = 300
# The same indices are applied to both datasets, so stay within the shorter one.
# NOTE(review): assumes row i of both datasets is the same tweet -- confirm.
n_rows = min(len(ds["train"]), len(ds_2["train"]))
# random.sample draws without replacement, so no example is shown twice;
# capping k keeps it from raising when fewer than SAMPLE_SIZE rows exist.
elements = random.sample(range(n_rows), k=min(SAMPLE_SIZE, n_rows))
ds["train"] = ds["train"].select(elements)
ds_2["train"] = ds_2["train"].select(elements)
for ex1, ex2 in zip(ds["train"], ds_2["train"]):
    st.write("====================================")
    st.write("NER model: robertuito", "\n")
    display_text(ex1)
    st.write("NER model: roberta-large", "\n")
    display_text(ex2)
    st.write("\n")
    st.write(f"Original text: {ex1['text']}")
|