Spaces:

wadood
/

ner_evaluation_metrics

Sleeping

ner_evaluation_metrics / span_dataclass_converters.py

added eval image and explanation

cd80277 about 2 months ago

No virus

1.58 kB

	"""
	There are 4 data formats for spans
	1. annotations - this is what we obtain from the text_annotator, the format can be seen in the predefined_examples, gt_labels
	2. highlight_spans - this is the format used by the highlighter to return the highlighted html text. This is a list of strings/tuples ("string", "label", color)
	3. ner_spans - this is the standard format used for representing ner_spans, it is a dict of {"start":int, "end":int, "label":str, "span_text":str}
	4. Token level output - this is dealt with in the token_level_output file; it is either a list of tuples [(token, label)] or just a list of labels [label, label]
	"""


	def get_ner_spans_from_annotations(annotated_labels):
	    """Flatten annotator output into a list of ner_span dicts.

	    `annotated_labels` maps an entity type to a list of span dicts. Each
	    span dict is copied; its "label" value (the annotated text) is moved
	    to "span_text" and "label" is overwritten with the entity type.
	    Spans are returned in the mapping's iteration order, i.e. grouped by
	    entity type, not ordered by position in the text.
	    """
	    return [
	        {
	            **annotation,
	            "label": entity_name,
	            "span_text": annotation["label"],
	        }
	        for entity_name, annotations in annotated_labels.items()
	        for annotation in annotations
	    ]


	def get_highlight_spans_from_ner_spans(ner_spans, parent_text):
	    """Convert ner_spans into the highlighter's list of strings and tuples.

	    The result alternates plain text taken from `parent_text` with
	    `(span_text, label)` tuples, one tuple per span, in text order.

	    Args:
	        ner_spans: list of dicts with "start", "end", "label" and
	            "span_text" keys. May be in any order; the list is not mutated.
	        parent_text: the full text the span offsets refer to.

	    Returns:
	        `[parent_text]` when there are no spans; otherwise a list of
	        strings and `(span_text, label)` tuples. The text before each span
	        is always emitted, so it is "" when a span starts exactly where the
	        previous one ended (or at offset 0).
	    """
	    if not ner_spans:
	        return [parent_text]
	    # Spans built from annotations arrive grouped by entity type rather
	    # than by position, so order them by start offset before slicing the
	    # gaps. sorted() is stable and leaves the caller's list untouched.
	    ordered_spans = sorted(ner_spans, key=lambda span: span["start"])
	    output_list = []
	    prev_span_end = 0
	    for span in ordered_spans:
	        # NOTE(review): overlapping spans are not merged; the gap slice is
	        # simply empty when a span starts before the previous one ended.
	        output_list.append(parent_text[prev_span_end : span["start"]])
	        output_list.append((span["span_text"], span["label"]))
	        prev_span_end = span["end"]
	    # Emit whatever text follows the last span, if any.
	    if prev_span_end != len(parent_text):
	        output_list.append(parent_text[prev_span_end:])
	    return output_list