|
import streamlit |
|
import spacy_streamlit |
|
import spacy |
|
from lxml import etree |
|
import pandas as pd |
|
from spacy import Language |
|
from spacy.tokens import Doc |
|
|
|
streamlit.set_page_config(layout="wide")

# Sample finding aids shipped with the app (label -> path).
samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}

# --- Page header and sidebar (project description + logos) ---
streamlit.title("NER4Archives visualizer")
streamlit.sidebar.title("NER4Archives visualizer")
streamlit.sidebar.write("## Motivation")
# NOTE: fixed an unclosed paragraph tag ("<p>" -> "</p>") after
# "test it on new data." so the sidebar HTML renders correctly.
streamlit.sidebar.markdown("""<div style="text-align: justify;">
<p>This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on
XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluate NER predictions.</p>

<p>In context of <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset
extracted from XML EAD finding aids and test it on new data.</p>

<p>Most of the models available here are trained with the NLP <a href="https://spacy.io/" target="_blank">spaCy</a>
framework and its available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
Other models may be added in the future.</p>

<p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">SpaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>

NER4Archives - 2022</div>
""", unsafe_allow_html=True)

scol1, scol2 = streamlit.sidebar.columns(2)
# Partner logos: Archives nationales / ALMAnaCH-Inria.
scol1.image("./assets/an.png", width=170)
scol2.image("./assets/almanach_rouge-inria.png", width=100)

# Set to True once a non-empty XML file has been uploaded.
flag_file = False
|
|
|
|
|
# --- Input section: upload an XML EAD finding aid ---
streamlit.write("## π Input XML EAD:")
filename = streamlit.file_uploader("Load an XML EAD", type="xml")
streamlit.markdown("or use a XML EAD provided in [`samples/`](./samples) directory")
data = ""

# Set to True once a NER model has been selected further down.
flag_model = False
if filename is not None:
    # Decode/re-encode round-trip validates that the upload is UTF-8
    # (raises UnicodeDecodeError otherwise) while keeping `data` as bytes
    # for the lxml parser below.
    data = filename.getvalue().decode("utf-8").encode("utf-8")
    if len(data) > 0:
        flag_file = True
|
|
|
|
|
|
|
|
|
import re |
|
def ead_strategy(tree):
    """Extract descriptive units and their plain text from an EAD tree.

    Walks every ``<dsc>`` (description of subordinate components) in the
    tree and, for each ``<did>`` it contains, gathers the text of the
    ``<did>`` itself plus the text of its immediately following sibling
    element (typically ``<scopecontent>``).

    :param tree: parsed XML element exposing the lxml API
        (``xpath``, ``itertext``, ``getnext``).
    :return: tuple ``(container_dids, sentences)`` where
        ``container_dids`` is the list of ``<did>`` elements and
        ``sentences`` the matching list of normalized text strings,
        each padded with a single leading/trailing space.
    """
    sentences = []
    container_dids = []

    for children_dsc in tree.xpath('.//dsc'):
        for did in children_dsc.xpath('.//did'):
            container_dids.append(did)
            # Text fragments of the <did> itself.
            parts = [chunk.strip() for chunk in did.itertext() if len(chunk) > 0]
            # The element right after the <did> (e.g. <scopecontent>)
            # complements the unit's description.
            sibling = did.getnext()
            if sibling is not None:
                parts.extend(
                    " ".join(chunk.strip().split())
                    for chunk in sibling.itertext()
                    if len(chunk) > 0
                )
            # BUG FIX: the original concatenated the sibling text directly
            # onto the <did> text with no separator, fusing words together;
            # joining all fragments with a space keeps them apart.
            text = " ".join(parts)
            # Collapse runs of whitespace and pad with single spaces.
            sentences.append(" " + re.sub(r"\s{2,}", " ", text.strip()) + " ")

    return container_dids, sentences
|
|
|
model = ""
linking = True
# Set to True once the XML has been parsed and displayed.
flag_view = False
if flag_file:
    # --- Side-by-side display: raw XML tree vs extracted plain text ---
    col1, col2 = streamlit.columns(2)
    col1.write("## ποΈ XML tree view:")
    col2.write("## ποΈ Plain text view:")
    # recover=True lets lxml tolerate minor well-formedness defects
    # in uploaded finding aids instead of failing outright.
    parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
    tree = etree.fromstring(data, parser=parser)
    xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    col1.text_area("", value=xml, height=500, disabled=True)
    # Extract <did> units and their text; `sentences` feeds the NER step.
    dids, sentences = ead_strategy(tree)
    plain = "\n".join(sentences)
    col2.text_area("", value=plain, height=500, disabled=True)
    flag_view = True
|
|
|
if flag_view:
    # --- NER model selection and entity-linking toggle ---
    streamlit.write("## βοΈ Configure NER model and options:")
    models = []
    # spacy.info()["pipelines"] lists the pipeline packages installed
    # in the current environment.
    for pipe in spacy.info()["pipelines"]:
        models.append(pipe)
    option = streamlit.selectbox(
        'Choose a NER model you want to apply in the list: ',
        models)
    model = option
    if model != "":
        flag_model = True
    linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)', value=True)
    linkingicon = "βοΈ"
    if linking is False:
        linkingicon = "β"
    # Echo the chosen configuration back to the user.
    streamlit.write("#### Actual Parameters:")
    streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')
|
@Language.factory("custom_ner", default_config={
    "model_name": "",
    "sentences_to_process": []
})
class CustomNer:
    """spaCy pipeline component (factory name ``"custom_ner"``) that runs a
    separately loaded NER model over a list of pre-extracted sentences and
    projects the predicted entities back onto the full document.

    Side effects: displays the loaded model's evaluation metrics and a
    progress bar in the Streamlit UI.
    """

    def __init__(self,
                 nlp: Language,
                 name: str,
                 model_name: str,
                 sentences_to_process: list):
        """Load the user-selected NER model and show its metrics.

        :param nlp: host pipeline this component is added to.
        :param name: component instance name (required by the factory API).
        :param model_name: installed spaCy pipeline package to load.
        :param sentences_to_process: sentences whose concatenation (joined
            by "\\n") is the document the component will be called on.
        """
        self.nlp = nlp
        self.pipeline_ner = spacy.load(model_name)
        # Surface the model's own evaluation scores in the UI.
        performance = self.pipeline_ner.meta['performance']
        f_score = performance['ents_f']
        recall = performance['ents_r']
        precision = performance['ents_p']
        mcol1, mcol2, mcol3 = streamlit.columns(3)
        mcol1.metric("F-Score", f'{f_score:.2f}')
        mcol2.metric("Precision", f'{precision:.2f}')
        mcol3.metric("Recall", f'{recall:.2f}')
        self.sentences = sentences_to_process

    def __call__(self, doc: Doc):
        """Run NER sentence by sentence and set the entities on ``doc``.

        ``doc`` must be the "\\n"-joined concatenation of
        ``self.sentences`` so that per-sentence character offsets can be
        shifted into document coordinates.
        """
        start_sentence = 0
        spans = []
        count = 0
        total = len(self.sentences)
        bar = streamlit.progress(count)
        for sent in self.pipeline_ner.pipe(self.sentences):
            # +1 accounts for the "\n" separator between joined sentences.
            end_sentence = start_sentence + len(sent.text) + 1
            for ent in sent.ents:
                start = start_sentence + ent.start_char
                end = start + len(ent.text)
                span = doc.char_span(start, end, label=ent.label_)
                # char_span() returns None when the offsets do not align
                # with token boundaries; skip those instead of letting
                # doc.set_ents() fail on a None entry.
                if span is not None:
                    spans.append(span)
            start_sentence = end_sentence
            count += 1
            # BUG FIX: previously divided by the module-level global
            # `sentences`; use this component's own sentence list.
            bar.progress(count / total)
        doc.set_ents(spans)
        return doc
|
|
|
entities = []
flag_vizualize = False

if flag_model:
    if streamlit.button('Launch'):
        with streamlit.spinner('Initialize NER...'):
            # Blank French pipeline used only as a host for the custom NER
            # component (and optionally the entity-fishing linker).
            huge_pipeline_linking = spacy.blank("fr")
            # Raise the default 1M-char limit to fit large finding aids.
            huge_pipeline_linking.max_length = 5000000
            huge_pipeline_linking.add_pipe('custom_ner', config={"model_name": model, "sentences_to_process": sentences})
            if linking:
                huge_pipeline_linking.add_pipe('entityfishing', config={"language": "fr"})
        with streamlit.spinner('NER processing... (please, wait depends on data size)'):
            # Runs CustomNer (and entity-fishing) over the whole plain text.
            doc = huge_pipeline_linking(plain)

            # One row per entity: offsets, surface form, label and
            # linking results (extension attributes set by entity-fishing).
            entities = [
                (ent.start_char,
                 ent.end_char,
                 ent.text,
                 ent.label_,
                 ent._.url_wikidata,
                 ent._.nerd_score
                 ) for ent in doc.ents
            ]
            streamlit.success('π NER applied with success!')

            df = pd.DataFrame(entities, columns=['START',
                                                 'END',
                                                 'MENTION',
                                                 'NER LABEL',
                                                 'WIKIDATA RESSOURCE (wikidata disambiguation)',
                                                 'LINKING SCORE'
                                                 ])

            # Tabular view of all recognized entities.
            streamlit.write("## π Explore named entities in table: ")
            streamlit.write(df)

            # Inline displaCy-style view; manual=True because the spans
            # are supplied as pre-computed dicts rather than a Doc.
            streamlit.write("## π Explore named entities in text: ")
            spacy_streamlit.visualize_ner(
                [{"text": doc.text,
                  "ents": [
                      {"start": ent.start_char,
                       "end": ent.end_char,
                       "label": ent.label_,
                       "kb_id": ent._.kb_qid,
                       "kb_url": ent._.url_wikidata
                       } for ent in doc.ents
                  ]}],
                # Both label schemes are listed because the selectable
                # models do not all share the same tag set.
                labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
                show_table=False,
                manual=True,
                title="",
                displacy_options={
                    "colors": {
                        "EVENT": "#ec7063",
                        "LOCATION": "#45b39d",
                        "ORGANISATION": "#f39c12",
                        "PERSON": "#3498db",
                        "TITLE": "#a569bd ",
                        "LOC": "#45b39d",
                        "MISC": "#ec7063",
                        "ORG": "#f39c12",
                        "PER": "#3498db"
                    }
                })
|
|
|
|
|
|