# Hugging Face Space: highlight NER entities in enriched hate-speech datasets.
# Streamlit app to highlight NER entities
import random

import streamlit as st
from annotated_text import annotated_text
from datasets import load_dataset
def display_ner(example):
    """Show a tweet with its NER entities highlighted via annotated_text.

    Parameters
    ----------
    example : dict
        Dataset row with an ``"ner_output"`` entry holding parallel
        ``"tokens"`` and ``"labels"`` lists. Labels are BIO-style
        (``"B-TYPE"`` / ``"I-TYPE"`` / ``"O"``), a bare type name,
        or ``None`` for tokens the tagger skipped.
    """
    ner_output = example["ner_output"]
    annotated_text(*_ner_chunks(ner_output["tokens"], ner_output["labels"]))


def _ner_chunks(tokens, labels):
    """Group parallel (token, label) lists into annotated_text chunks.

    Returns a list whose items are plain strings (unlabelled text) or
    ``(text, entity_type)`` tuples for entity spans.
    """
    chunks = []
    current_chunk = ""
    current_type = None

    def flush():
        # Emit the accumulated chunk (if non-empty) and reset state.
        # Guarding on current_chunk avoids appending an empty ("", None)
        # chunk when the very first token carries a bare type label.
        nonlocal current_chunk, current_type
        if current_chunk:
            chunks.append((current_chunk.strip(), current_type))
        current_chunk = ""
        current_type = None

    for token, label in zip(tokens, labels):
        if label is None:
            # Token was not labelled (perhaps the input was too long); skip.
            continue
        if label == "O":
            if current_type is not None:
                # Close the open entity before accumulating plain text.
                flush()
            current_chunk += token + " "
            current_type = None
        elif label.startswith("B-"):
            # A B- tag always starts a new entity span.
            flush()
            current_chunk = token + " "
            current_type = label[2:]
        elif label.startswith("I-"):
            # Continuation of the current entity; the type follows the tag.
            current_chunk += token + " "
            current_type = label[2:]
        else:
            # Bare type label (no B-/I- prefix): extend a same-typed run,
            # otherwise flush and start a new single-type chunk.
            if label != current_type:
                flush()
            current_chunk += token + " "
            current_type = label
    flush()
    # annotated_text expects bare strings for unlabelled spans.
    return [(c, t) if t is not None else c for c, t in chunks]
def display_text(example, text_column):
    """Show an example's text with its entity spans highlighted.

    Entity offsets are recomputed by searching for each entity's surface
    text in ``example[text_column]``, because any stored offsets may not
    match this column. Entities whose text cannot be found are skipped
    (``str.find`` would return -1 and corrupt the slicing), and entities
    overlapping an earlier one are dropped. The input example is NOT
    mutated.

    Parameters
    ----------
    example : dict
        Dataset row with the text column and an ``"entities"`` list of
        dicts carrying at least ``"text"`` and ``"type"`` keys.
    text_column : str
        Name of the column holding the text to render.
    """
    text = example[text_column]

    # Locate each entity in the text; keep (start, end, text, type) tuples.
    located = []
    for entity in example["entities"]:
        start = text.find(entity["text"])
        if start == -1:
            # Entity surface form not present in this column; skip it
            # rather than emit spans with negative offsets.
            continue
        located.append((start, start + len(entity["text"]), entity["text"], entity["type"]))

    if not located:
        annotated_text(text)
        return

    # Tuples sort by start (then end), giving left-to-right order.
    located.sort()

    chunks = []
    last_index = 0
    for start, end, ent_text, ent_type in located:
        if start < last_index:
            # Overlaps a previously emitted entity (e.g. two entities with
            # the same surface text map to the same offset); keep the first.
            continue
        if last_index < start:
            chunks.append(text[last_index:start])
        chunks.append((ent_text, ent_type))
        last_index = end
    if last_index < len(text):
        chunks.append(text[last_index:])
    annotated_text(*chunks)
# --- Streamlit page ---------------------------------------------------------

# Sidebar selector for which enriched dataset to browse.
selected_dataset = st.sidebar.selectbox(
    "Select dataset", ["hateval_enriched", "sbf-enriched", "hatecheck-enriched"]
)

# Load the chosen dataset from the hs-knowledge hub namespace.
ds = load_dataset(f"hs-knowledge/{selected_dataset}")

# Name of the text column in each dataset (used when calling display_text).
text_column = {
    "hateval_enriched": "text",
    "sbf-enriched": "post",
    "hatecheck-enriched": "test_case",
}

# Show a random sample of up to 50 *distinct* examples.
# random.sample avoids the duplicate rows random.choices could pick, and
# min() keeps this working for datasets with fewer than 50 rows.
n_train = len(ds["train"])
elements = random.sample(range(n_train), k=min(50, n_train))
ds["train"] = ds["train"].select(elements)

for ex in ds["train"]:
    # display_text(ex, text_column[selected_dataset])
    st.markdown("---")
    display_ner(ex)
    with st.expander("Show entities"):
        for ent in ex["entities"]:
            entity_name = ent["text"]
            entity_type = ent["type"]
            entity_description = ent["kg_result"]["detailedDescription"]["articleBody"]
            st.write(f"{entity_name} ({entity_type}): {entity_description}")