File size: 2,668 Bytes
8739181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Streamlit app to highlight NER entities
import random
import streamlit as st
from datasets import load_dataset
from annotated_text import annotated_text

# Load two HatEval NER datasets from the Hugging Face Hub — one per NER
# model (robertuito and roberta-large, per the labels written in the
# render loop below); their examples are displayed side by side.
ds = load_dataset("hs-knowledge/hateval_ner")
ds_2 = load_dataset("hs-knowledge/hateval_ner_2")

# Show highlighted ner entities in a tweet

def build_chunks(ner_output):
    """Group NER tokens into chunks suitable for annotated_text().

    Walks ``ner_output["tokens"]`` and ``ner_output["labels"]`` in
    lockstep, merging consecutive tokens that belong to the same entity.
    Handles the BIO scheme (``B-``/``I-`` prefixes) plus bare labels
    without a prefix, where equal consecutive labels form one entity.

    Returns a list of plain strings for non-entity spans and
    ``(text, entity_type)`` tuples for entities.
    """
    chunks = []
    current_chunk = ""
    current_type = None

    for token, label in zip(ner_output["tokens"], ner_output["labels"]):
        if label is None:
            # Token fell outside the model's output (perhaps too long); skip.
            continue
        if label == "O":
            if current_type is not None:
                # Close the open entity and start a plain-text run.
                chunks.append((current_chunk.strip(), current_type))
                current_chunk = token + " "
            else:
                current_chunk += token + " "
            current_type = None
        elif label.startswith("B-"):
            # A new entity begins; flush whatever was accumulating.
            if current_chunk:
                chunks.append((current_chunk.strip(), current_type))
            current_chunk = token + " "
            current_type = label[2:]
        elif label.startswith("I-"):
            # Continuation of the current entity.
            current_chunk += token + " "
            current_type = label[2:]
        else:
            # Bare label (no B-/I- prefix): equal consecutive labels
            # are merged into a single entity span.
            if label != current_type:
                # Guard against emitting an empty leading chunk when the
                # very first token carries a bare label (bug in original).
                if current_chunk:
                    chunks.append((current_chunk.strip(), current_type))
                current_chunk = token + " "
            else:
                current_chunk += token + " "
            current_type = label

    if current_chunk:
        chunks.append((current_chunk.strip(), current_type))

    # Plain (non-entity) spans are passed to annotated_text as bare strings.
    return [(c, t) if t is not None else c for c, t in chunks]


def display_text(example):
    """Render one example's NER entities inline with annotated_text."""
    annotated_text(*build_chunks(example["ner_output"]))

# Sample 300 random indices from the train split (random.choices samples
# with replacement, so duplicates are possible) and keep only those rows
# in both datasets so the two models are compared on identical examples.


sample_indices = random.choices(range(len(ds["train"])), k=300)
ds["train"] = ds["train"].select(sample_indices)
ds_2["train"] = ds_2["train"].select(sample_indices)

# Render each sampled example twice, once per NER model's output.
for example_a, example_b in zip(ds["train"], ds_2["train"]):
    st.write("====================================")
    st.write("NER model: robertuito", "\n")
    display_text(example_a)
    st.write("NER model: roberta-large", "\n")
    display_text(example_b)
    st.write("\n")
    st.write(f"Original text: {example_a['text']}")