Spaces:

hs-knowledge
/

ner_app

Sleeping

App Files Files Community

ner_app / app.py

finiteautomata

First commit

8739181 over 1 year ago

raw

history blame

No virus

2.67 kB

	# Streamlit app to highlight NER entities
	import random
	import streamlit as st
	from datasets import load_dataset
	from annotated_text import annotated_text

	# Fetch the two datasets by repository id, in order: the first is bound to
	# ``ds`` and the second to ``ds_2``.
	ds, ds_2 = (
	    load_dataset(repo_id)
	    for repo_id in ("hs-knowledge/hateval_ner", "hs-knowledge/hateval_ner_2")
	)

	# Show highlighted NER entities in a tweet


	def display_text(example):
	    """Render one example with its NER entities highlighted.

	    Reads the parallel ``tokens`` and ``labels`` lists stored under
	    ``example["ner_output"]``, merges the tokens into chunks and passes the
	    chunks to ``annotated_text``.

	    Args:
	        example: Mapping with a ``"ner_output"`` entry that itself holds the
	            ``"tokens"`` and ``"labels"`` sequences.

	    Returns:
	        None. The chunks are only handed to ``annotated_text``.
	    """
	    ner_output = example["ner_output"]
	    annotated_text(*_group_tokens(ner_output["tokens"], ner_output["labels"]))


	def _group_tokens(tokens, labels):
	    """Merge consecutive tokens that share an entity type into chunks.

	    Label handling:
	      * ``None``  -- the token is skipped (presumably the input was too long
	        for the tagger to label it; verify against the data).
	      * ``"O"``   -- the token is plain text.
	      * ``"B-X"`` -- always opens a new chunk of type ``X``.
	      * ``"I-X"`` -- extends the current chunk only if it already has type
	        ``X``; otherwise it opens a new chunk, so neighbouring plain text is
	        never relabelled as an entity.
	      * anything else -- the label itself is used as the entity type.

	    Args:
	        tokens: Sequence of token strings.
	        labels: Sequence of labels, parallel to ``tokens``.

	    Returns:
	        A list in reading order whose items are plain ``str`` for unlabelled
	        text and ``(text, entity_type)`` tuples for entities. Tokens inside a
	        chunk are joined with single spaces.
	    """
	    groups = []  # [(entity_type, [token, ...]), ...] in reading order
	    for token, label in zip(tokens, labels):
	        if label is None:
	            continue

	        starts_entity = label.startswith("B-")
	        if label == "O":
	            entity_type = None
	        elif starts_entity or label.startswith("I-"):
	            entity_type = label[2:]
	        else:
	            entity_type = label

	        # Open a new chunk on an explicit "B-" tag, at the very beginning, or
	        # whenever the type changes; otherwise keep extending the last chunk.
	        if starts_entity or not groups or groups[-1][0] != entity_type:
	            groups.append((entity_type, [token]))
	        else:
	            groups[-1][1].append(token)

	    chunks = []
	    for entity_type, words in groups:
	        text = " ".join(words).strip()
	        chunks.append(text if entity_type is None else (text, entity_type))
	    return chunks

	# Show a random sample of examples. Indices are drawn without replacement so
	# that no example is rendered twice, and the sample is capped at the dataset
	# size so that a small dataset does not raise.
	SAMPLE_SIZE = 300

	train_size = len(ds["train"])
	elements = random.sample(range(train_size), k=min(SAMPLE_SIZE, train_size))

	# The same indices are applied to both datasets.
	# NOTE(review): assumes both hold the same examples in the same order -- confirm.
	ds["train"] = ds["train"].select(elements)
	ds_2["train"] = ds_2["train"].select(elements)

	# For each sampled example, render the output of both NER models followed by
	# the original text taken from the first dataset.
	for ex1, ex2 in zip(ds["train"], ds_2["train"]):
	    st.write("====================================")
	    st.write("NER model: robertuito", "\n")
	    display_text(ex1)
	    st.write("NER model: roberta-large", "\n")
	    display_text(ex2)
	    st.write("\n")
	    st.write(f"Original text: {ex1['text']}")