# NOTE: the six lines originally here ("lterriel's picture", "test",
# commit "af23812", "raw / history blame", "8.63 kB") were web-viewer
# residue accidentally captured with the source file; they are kept as
# this comment so the module parses.
import streamlit
import spacy_streamlit
import spacy
from lxml import etree
import pandas as pd
from spacy import Language
from spacy.tokens import Doc
# --- Page layout, sidebar and file upload ----------------------------------
streamlit.set_page_config(layout="wide")

# Sample documents bundled with the app (label -> path).
# NOTE(review): not referenced in this file; presumably consumed elsewhere
# or kept for manual testing — confirm before removing.
samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}

# TITLE APP
streamlit.title("NER4Archives visualizer")
streamlit.sidebar.title("NER4Archives visualizer")
streamlit.sidebar.write("## Motivation")
streamlit.sidebar.markdown("""<div style="text-align: justify;">
<p>This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on
XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluate NER predictions.</p>
<p>In context of <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset
extracted from XML EAD finding aids and test it on new data.<p>
<p>Most of the models available here are trained with the NLP <a href="https://spacy.io/" target="_blank">spaCy</a>
framework and its available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
Other models may be added in the future.</p>
<p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">SpaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>
NER4Archives - 2022</div>
""", unsafe_allow_html=True)

# Partner logos side by side in the sidebar.
scol1, scol2 = streamlit.sidebar.columns(2)
scol1.image("./assets/an.png", width=170)
scol2.image("./assets/almanach_rouge-inria.png", width=100)

flag_file = False  # becomes True once a non-empty XML payload is loaded

# 1. User provides a XML EAD
streamlit.write("## 📄 Input XML EAD:")
filename = streamlit.file_uploader("Load an XML EAD", type="xml")
streamlit.markdown("or use a XML EAD provided in [`samples/`](./samples) directory")

data = ""
flag_model = False
if filename is not None:
    # FIX: getvalue() already returns the uploaded bytes; the original
    # decode("utf-8").encode("utf-8") round-trip was a no-op and has been
    # dropped (the lxml parser below consumes the raw bytes directly).
    data = filename.getvalue()
    if len(data) > 0:
        flag_file = True
import re

# Compiled once; collapses runs of whitespace left over from concatenating
# element text fragments (hoisted out of the per-<did> loop).
_MULTI_SPACE = re.compile(r"\s{2,}")


def ead_strategy(tree):
    """Extract NER-ready "sentences" from a parsed XML EAD tree.

    Walks every ``<dsc>`` element and, for each descendant ``<did>``,
    builds one sentence: the ``<did>``'s own text content followed by the
    text of the element immediately after it (typically a
    ``<scopecontent>``) when one exists. Each sentence is whitespace-
    normalized and padded with one leading and one trailing space.

    :param tree: root element of the parsed EAD document (lxml).
    :return: tuple ``(container_dids, sentences)`` where
        ``container_dids[i]`` is the ``<did>`` element that produced
        ``sentences[i]``.
    """
    sentences = []
    container_dids = []
    # get the <dsc> level, then the <did> levels beneath it
    for dsc in tree.xpath('.//dsc'):
        for did in dsc.xpath('.//did'):
            container_dids.append(did)
            text = " ".join(
                [did_content.strip() for did_content in did.itertext() if len(did_content) > 0])
            # Concatenate the following sibling (scopecontent) if it exists.
            # NOTE(review): no separator is inserted here, so the last <did>
            # word and the first sibling word are fused; kept as-is because
            # entity character offsets downstream depend on the exact text.
            if did.getnext() is not None:
                text += " ".join(
                    [" ".join(scopecontent.strip().split()) for scopecontent in did.getnext().itertext() if
                     len(scopecontent) > 0])
            sentences.append(" " + _MULTI_SPACE.sub(" ", text.strip()) + " ")
    # one sentence per collected <did>
    return container_dids, sentences
# Defaults for the downstream configuration step; overwritten below once
# the user has uploaded a file and picked a model.
model = ""
linking = True
flag_view = False

if flag_file:
    # Side-by-side preview: pretty-printed XML tree and extracted plain text.
    col1, col2 = streamlit.columns(2)
    col1.write("## 👁️ XML tree view:")
    col2.write("## 👁️ Plain text view:")
    # recover=True lets lxml parse imperfect real-world EAD files instead
    # of failing on the first structural error.
    parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
    tree = etree.fromstring(data, parser=parser)
    xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    col1.text_area("", value=xml, height=500, disabled=True)
    # One "sentence" per <did>; dids[i] is the element behind sentences[i].
    # `sentences` and `plain` are consumed by the NER pipeline further down.
    dids, sentences = ead_strategy(tree)
    plain = "\n".join(sentences)
    col2.text_area("", value=plain, height=500, disabled=True)
    flag_view = True
if flag_view:
    # --- 2. Model selection and linking options ----------------------------
    streamlit.write("## ⚙️ Configure NER model and options:")
    # spacy.info()["pipelines"] maps installed pipeline name -> version;
    # iterating it yields the names.
    # FIX: replaced a manual append loop with list().
    models = list(spacy.info()["pipelines"])
    option = streamlit.selectbox(
        'Choose a NER model you want to apply in the list: ',
        models)
    model = option
    if model != "":
        flag_model = True
    # Entity linking (entity-fishing) is opt-out; checked by default.
    linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)', value=True)
    linkingicon = "✅️"
    if linking is False:
        linkingicon = "❌"
    streamlit.write("#### Actual Parameters:")
    streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')
@Language.factory("custom_ner", default_config={
    "model_name": "",
    "sentences_to_process": []
})
class CustomNer:
    """Pipeline component running a pretrained spaCy NER model sentence by
    sentence, then projecting the predicted entities back onto the whole
    document.

    Registered as the ``custom_ner`` factory. Configured with the name of
    the model to load and the list of sentences (in document order) whose
    ``"\\n".join`` produced the document text.
    """

    def __init__(self,
                 nlp: Language,
                 name: str,
                 model_name: str,
                 sentences_to_process: list):
        self.nlp = nlp
        # Load the user-selected NER pipeline and surface its benchmark
        # metrics in the Streamlit UI.
        self.pipeline_ner = spacy.load(model_name)
        performance = self.pipeline_ner.meta['performance']
        mcol1, mcol2, mcol3 = streamlit.columns(3)
        mcol1.metric("F-Score", f"{performance['ents_f']:.2f}")
        mcol2.metric("Precision", f"{performance['ents_p']:.2f}")
        mcol3.metric("Recall", f"{performance['ents_r']:.2f}")
        self.sentences = sentences_to_process

    def __call__(self, doc: Doc):
        """Annotate *doc* in place with entities predicted per sentence.

        Assumes ``doc.text`` was assembled by joining ``self.sentences``
        with a single separator character, so sentence-local character
        offsets can be re-based to document-global ones.
        """
        start_sentence = 0
        spans = []
        count = 0
        bar = streamlit.progress(count)
        # FIX: was len(sentences) — the script-level global — instead of
        # the sentences this component was actually configured with.
        total = len(self.sentences)
        for sent in self.pipeline_ner.pipe(self.sentences):
            # +1 accounts for the single separator character inserted
            # between sentences when the document text was built
            # ("\n".join upstream; the original comment said " ".join).
            end_sentence = start_sentence + len(sent.text) + 1
            # Re-base entity offsets from sentence-local to doc-global.
            for ent in sent.ents:
                start = start_sentence + ent.start_char
                end = start + len(ent.text)
                span = doc.char_span(start, end, label=ent.label_)
                # FIX: char_span() returns None when the offsets don't
                # align with token boundaries; the original appended the
                # None and doc.set_ents() crashed later. Skip such spans.
                if span is not None:
                    spans.append(span)
            start_sentence = end_sentence
            count += 1
            if total:  # guard against division by zero on empty input
                bar.progress((count / total) * 1.0)
        doc.set_ents(spans)
        return doc
# --- 3. Run the pipeline and visualize the results -------------------------
entities = []
flag_vizualize = False
if flag_model:
    if streamlit.button('Launch'):
        with streamlit.spinner('Initialize NER...'):
            # Wrap the selected NER model (via the custom_ner factory) in a
            # blank French pipeline so the optional entity-fishing linker
            # can be chained after it.
            huge_pipeline_linking = spacy.blank("fr")
            # Raise spaCy's length cap so large finding aids fit in one Doc.
            huge_pipeline_linking.max_length = 5000000
            huge_pipeline_linking.add_pipe('custom_ner', config={"model_name": model, "sentences_to_process": sentences})
            if linking:
                huge_pipeline_linking.add_pipe('entityfishing', config={"language": "fr"})
        with streamlit.spinner('NER processing... (please, wait depends on data size)'):
            # `plain` is the concatenated text produced in the preview step.
            doc = huge_pipeline_linking(plain)
            # (start, end, mention, NER label, Wikidata URL, linking score);
            # the ._.url_wikidata / ._.nerd_score extensions are populated
            # by the entity-fishing component (empty when linking is off —
            # NOTE(review): confirm extensions exist without that pipe).
            entities = [
                (ent.start_char,
                 ent.end_char,
                 ent.text,
                 ent.label_,
                 ent._.url_wikidata,
                 ent._.nerd_score
                 ) for ent in doc.ents
            ]
        streamlit.success('😃 NER applied with success!')
        # Tabular view of every predicted (and possibly linked) entity.
        df = pd.DataFrame(entities, columns=['START',
                                             'END',
                                             'MENTION',
                                             'NER LABEL',
                                             'WIKIDATA RESSOURCE (wikidata disambiguation)',
                                             'LINKING SCORE'
                                             ])
        streamlit.write("## 🔎 Explore named entities in table: ")
        streamlit.write(df)
        streamlit.write("## 🔎 Explore named entities in text: ")
        # Manual displacy-style rendering: entities are passed as explicit
        # dicts so Wikidata ids/urls can be attached to each span.
        spacy_streamlit.visualize_ner(
            [{"text": doc.text,
              "ents": [
                  {"start": ent.start_char,
                   "end": ent.end_char,
                   "label": ent.label_,
                   "kb_id": ent._.kb_qid,
                   "kb_url": ent._.url_wikidata
                   } for ent in doc.ents
              ]}],
            labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
            show_table=False,
            manual=True,
            title="",
            # One color per label; duplicated for the two label schemes
            # (long-form project labels and short spaCy-style ones).
            displacy_options={
                "colors": {
                    "EVENT": "#ec7063",
                    "LOCATION": "#45b39d",
                    "ORGANISATION": "#f39c12",
                    "PERSON": "#3498db",
                    "TITLE": "#a569bd ",
                    "LOC": "#45b39d",
                    "MISC": "#ec7063",
                    "ORG": "#f39c12",
                    "PER": "#3498db"
                }
            })