finiteautomata committed on
Commit
3f556fb
1 Parent(s): 8739181
Files changed (1) hide show
  1. app.py +39 -61
app.py CHANGED
@@ -5,78 +5,56 @@ from datasets import load_dataset
5
  from annotated_text import annotated_text
6
 
# Load data
# Two datasets holding NER output for the tweets. The display loop in this
# file pairs them row by row, labelling the first "robertuito" and the second
# "roberta-large" (presumably the same tweets in the same order; verify).
ds = load_dataset("hs-knowledge/hateval_ner")
ds_2 = load_dataset("hs-knowledge/hateval_ner_2")
10
 
11
  # Show highlighted ner entities in a tweet
12
 
13
 
def display_text(example):
    """Render a tokenised tweet with its NER entities highlighted.

    Reads ``example["ner_output"]["tokens"]`` and
    ``example["ner_output"]["labels"]`` in lockstep, groups consecutive tokens
    into chunks and passes them to ``annotated_text``: plain strings for
    unlabelled text, ``(text, type)`` tuples for entities.

    Label handling:
      * ``None``  -> token is skipped.
      * ``"O"``   -> plain text.
      * ``"B-X"`` -> always starts a new entity chunk of type ``X``.
      * ``"I-X"`` -> continues the current chunk only if it already has type
        ``X``; otherwise it starts a new chunk, so a stray ``I-`` tag can no
        longer relabel the preceding text.
      * any other label -> used verbatim as the entity type.
    """
    ner_output = example["ner_output"]

    chunks = []
    current_tokens = []
    current_type = None

    for token, label in zip(ner_output["tokens"], ner_output["labels"]):
        if label is None:
            # No label for this token (the original note: "perhaps it is too long").
            continue

        if label == "O":
            entity_type, starts_new = None, False
        elif label.startswith("B-"):
            entity_type, starts_new = label[2:], True
        elif label.startswith("I-"):
            entity_type, starts_new = label[2:], False
        else:
            # Label without a B-/I- prefix: the label itself is the type.
            entity_type, starts_new = label, False

        # Close the running chunk on an explicit B- tag or on any type change.
        # The emptiness guard avoids emitting a blank chunk for the first token.
        if current_tokens and (starts_new or entity_type != current_type):
            chunks.append((" ".join(current_tokens).strip(), current_type))
            current_tokens = []

        current_tokens.append(token)
        current_type = entity_type

    if current_tokens:
        chunks.append((" ".join(current_tokens).strip(), current_type))

    # annotated_text expects bare strings for unannotated text.
    chunks = [(c, t) if t is not None else c for c, t in chunks]
    annotated_text(*chunks)
67
 
# Show a random sample of (at most) 300 tweets.
# random.sample draws without replacement, so no tweet is shown twice, and the
# min() keeps it valid when the split has fewer than 300 rows.
elements = random.sample(range(len(ds["train"])), k=min(300, len(ds["train"])))
# The same indices are applied to both datasets so the rows stay paired.
ds["train"] = ds["train"].select(elements)
ds_2["train"] = ds_2["train"].select(elements)

for ex1, ex2 in zip(ds["train"], ds_2["train"]):
    st.write("====================================")
    st.write("NER model: robertuito", "\n")
    display_text(ex1)
    st.write("NER model: roberta-large", "\n")
    display_text(ex2)
    st.write("\n")
    st.write(f"Original text: {ex1['text']}")
 
 
 
 
5
  from annotated_text import annotated_text
6
 
# Load data
# Rows of this dataset are read elsewhere in this file through their "text"
# and "entities" fields.
ds = load_dataset("hs-knowledge/hateval_enriched")
 
9
 
10
  # Show highlighted ner entities in a tweet
11
 
12
 
def display_text(example):
    """Render a tweet with its entities highlighted via ``annotated_text``.

    ``example`` must provide ``"text"`` and ``"entities"``. Each entity is read
    through its ``"start"``, ``"end"``, ``"text"`` and ``"type"`` keys;
    ``start``/``end`` are used as slice offsets into ``example["text"]``.

    The text is emitted in order: plain strings for the stretches between
    entities (including whatever follows the last entity) and
    ``(entity_text, entity_type)`` tuples for the entities themselves.
    """
    text = example["text"]

    # Process entities left to right.
    entities = sorted(example["entities"], key=lambda ent: ent["start"])

    if not entities:
        annotated_text(text)
        return

    chunks = []
    last_index = 0
    for entity in entities:
        start, end = entity["start"], entity["end"]

        if last_index < start:
            # Plain text between the previous entity and this one.
            chunks.append(text[last_index:start])
        chunks.append((entity["text"], entity["type"]))

        # max() keeps the cursor from moving backwards when entities overlap
        # or nest, which would otherwise repeat already-emitted text.
        last_index = max(last_index, end)

    if last_index < len(text):
        # Tail of the tweet after the final entity.
        chunks.append(text[last_index:])

    annotated_text(*chunks)
42
 
 
43
 
# Show a random sample of (at most) 50 tweets.
# random.sample draws without replacement, so no tweet is listed twice, and the
# min() keeps it valid when the split has fewer than 50 rows.
elements = random.sample(range(len(ds["train"])), k=min(50, len(ds["train"])))
ds["train"] = ds["train"].select(elements)

for ex in ds["train"]:
    st.write("=" * 80)
    # display_text(ex) is not called here; the raw text is written instead.
    st.write(ex["text"])

    for ent in ex["entities"]:
        entity_name = ent["text"]
        entity_type = ent["type"]
        # Assumes an entity may lack a knowledge-graph match (missing or None
        # at any level of the lookup); show an empty description in that case
        # instead of crashing. TODO confirm against the dataset schema.
        kg_result = ent.get("kg_result") or {}
        detailed_description = kg_result.get("detailedDescription") or {}
        entity_description = detailed_description.get("articleBody") or ""
        annotated_text(
            (entity_name, "entity"), (f"({entity_type})", "type"), entity_description
        )