|
import streamlit |
|
import spacy_streamlit |
|
import spacy |
|
from lxml import etree |
|
import pandas as pd |
|
from spacy import Language |
|
from spacy.tokens import Doc |
|
|
|
streamlit.set_page_config(layout="wide")

# Sample finding aids shipped with the app (label -> path).
samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}

# --- Page header and sidebar (project description + logos) ---
streamlit.title("NER4Archives visualizer")
streamlit.sidebar.title("NER4Archives visualizer")
streamlit.sidebar.write("## Motivation")
# NOTE: fixed an unclosed paragraph tag ("<p>" -> "</p>") after
# "test it on new data." so the sidebar HTML renders correctly.
streamlit.sidebar.markdown("""<div style="text-align: justify;">
<p>This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on
XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluate NER predictions.</p>

<p>In context of <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset
extracted from XML EAD finding aids and test it on new data.</p>

<p>Most of the models available here are trained with the NLP <a href="https://spacy.io/" target="_blank">spaCy</a>
framework and its available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
Other models may be added in the future.</p>

<p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">SpaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>

NER4Archives - 2022</div>
""", unsafe_allow_html=True)

scol1, scol2 = streamlit.sidebar.columns(2)
# Partner logos: Archives nationales / ALMAnaCH-Inria.
scol1.image("./assets/an.png", width=170)
scol2.image("./assets/almanach_rouge-inria.png", width=100)

# Set to True once a non-empty XML file has been uploaded.
flag_file = False
|
|
|
|
|
# --- Input section: upload an XML EAD finding aid ---
streamlit.write("## π Input XML EAD:")
filename = streamlit.file_uploader("Load an XML EAD", type="xml")
streamlit.markdown("or use a XML EAD provided in [`samples/`](./samples) directory")
data = ""

# Set to True once a NER model has been selected further down.
flag_model = False
if filename is not None:
    # Decode/re-encode round-trip validates that the upload is UTF-8
    # (raises UnicodeDecodeError otherwise) while keeping `data` as bytes
    # for the lxml parser below.
    data = filename.getvalue().decode("utf-8").encode("utf-8")
    if len(data) > 0:
        flag_file = True
|
|
|
|
|
|
|
|
|
import re |
|
def ead_strategy(tree):
    """Extract descriptive units and their plain text from an EAD tree.

    Walks every ``<dsc>`` (description of subordinate components) in the
    tree and, for each ``<did>`` it contains, gathers the text of the
    ``<did>`` itself plus the text of its immediately following sibling
    element (typically ``<scopecontent>``).

    :param tree: parsed XML element exposing the lxml API
        (``xpath``, ``itertext``, ``getnext``).
    :return: tuple ``(container_dids, sentences)`` where
        ``container_dids`` is the list of ``<did>`` elements and
        ``sentences`` the matching list of normalized text strings,
        each padded with a single leading/trailing space.
    """
    sentences = []
    container_dids = []

    for children_dsc in tree.xpath('.//dsc'):
        for did in children_dsc.xpath('.//did'):
            container_dids.append(did)
            # Text fragments of the <did> itself.
            parts = [chunk.strip() for chunk in did.itertext() if len(chunk) > 0]
            # The element right after the <did> (e.g. <scopecontent>)
            # complements the unit's description.
            sibling = did.getnext()
            if sibling is not None:
                parts.extend(
                    " ".join(chunk.strip().split())
                    for chunk in sibling.itertext()
                    if len(chunk) > 0
                )
            # BUG FIX: the original concatenated the sibling text directly
            # onto the <did> text with no separator, fusing words together;
            # joining all fragments with a space keeps them apart.
            text = " ".join(parts)
            # Collapse runs of whitespace and pad with single spaces.
            sentences.append(" " + re.sub(r"\s{2,}", " ", text.strip()) + " ")

    return container_dids, sentences
|
|
|
model = ""
linking = True
# Set to True once the XML has been parsed and displayed.
flag_view = False
if flag_file:
    # --- Side-by-side display: raw XML tree vs extracted plain text ---
    col1, col2 = streamlit.columns(2)
    col1.write("## ποΈ XML tree view:")
    col2.write("## ποΈ Plain text view:")
    # recover=True lets lxml tolerate minor well-formedness defects
    # in uploaded finding aids instead of failing outright.
    parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
    tree = etree.fromstring(data, parser=parser)
    xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    col1.text_area("", value=xml, height=500, disabled=True)
    # Extract <did> units and their text; `sentences` feeds the NER step.
    dids, sentences = ead_strategy(tree)
    plain = "\n".join(sentences)
    col2.text_area("", value=plain, height=500, disabled=True)
    flag_view = True
|
|
|
if flag_view:
    # --- NER model selection and entity-linking toggle ---
    streamlit.write("## βοΈ Configure NER model and options:")
    models = []
    # spacy.info()["pipelines"] lists the pipeline packages installed
    # in the current environment.
    for pipe in spacy.info()["pipelines"]:
        models.append(pipe)
    option = streamlit.selectbox(
        'Choose a NER model you want to apply in the list: ',
        models)
    model = option
    if model != "":
        flag_model = True
    linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)', value=True)
    linkingicon = "βοΈ"
    if linking is False:
        linkingicon = "β"
    # Echo the chosen configuration back to the user.
    streamlit.write("#### Actual Parameters:")
    streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')
|
@Language.factory("custom_ner", default_config={
    "model_name": "",
    "sentences_to_process": []
})
class CustomNer:
    """spaCy pipeline component (factory name ``"custom_ner"``) that runs a
    separately loaded NER model over a list of pre-extracted sentences and
    projects the predicted entities back onto the full document.

    Side effects: displays the loaded model's evaluation metrics and a
    progress bar in the Streamlit UI.
    """

    def __init__(self,
                 nlp: Language,
                 name: str,
                 model_name: str,
                 sentences_to_process: list):
        """Load the user-selected NER model and show its metrics.

        :param nlp: host pipeline this component is added to.
        :param name: component instance name (required by the factory API).
        :param model_name: installed spaCy pipeline package to load.
        :param sentences_to_process: sentences whose concatenation (joined
            by "\\n") is the document the component will be called on.
        """
        self.nlp = nlp
        self.pipeline_ner = spacy.load(model_name)
        # Surface the model's own evaluation scores in the UI.
        performance = self.pipeline_ner.meta['performance']
        f_score = performance['ents_f']
        recall = performance['ents_r']
        precision = performance['ents_p']
        mcol1, mcol2, mcol3 = streamlit.columns(3)
        mcol1.metric("F-Score", f'{f_score:.2f}')
        mcol2.metric("Precision", f'{precision:.2f}')
        mcol3.metric("Recall", f'{recall:.2f}')
        self.sentences = sentences_to_process

    def __call__(self, doc: Doc):
        """Run NER sentence by sentence and set the entities on ``doc``.

        ``doc`` must be the "\\n"-joined concatenation of
        ``self.sentences`` so that per-sentence character offsets can be
        shifted into document coordinates.
        """
        start_sentence = 0
        spans = []
        count = 0
        total = len(self.sentences)
        bar = streamlit.progress(count)
        for sent in self.pipeline_ner.pipe(self.sentences):
            # +1 accounts for the "\n" separator between joined sentences.
            end_sentence = start_sentence + len(sent.text) + 1
            for ent in sent.ents:
                start = start_sentence + ent.start_char
                end = start + len(ent.text)
                span = doc.char_span(start, end, label=ent.label_)
                # char_span() returns None when the offsets do not align
                # with token boundaries; skip those instead of letting
                # doc.set_ents() fail on a None entry.
                if span is not None:
                    spans.append(span)
            start_sentence = end_sentence
            count += 1
            # BUG FIX: previously divided by the module-level global
            # `sentences`; use this component's own sentence list.
            bar.progress(count / total)
        doc.set_ents(spans)
        return doc
|
|
|
entities = []
flag_vizualize = False

if flag_model:
    if streamlit.button('Launch'):
        with streamlit.spinner('Initialize NER...'):
            # Blank French pipeline used only as a host for the custom NER
            # component (and optionally the entity-fishing linker).
            huge_pipeline_linking = spacy.blank("fr")
            # Raise the default 1M-char limit to fit large finding aids.
            huge_pipeline_linking.max_length = 5000000
            huge_pipeline_linking.add_pipe('custom_ner', config={"model_name": model, "sentences_to_process": sentences})
            if linking:
                huge_pipeline_linking.add_pipe('entityfishing', config={"language": "fr"})
        with streamlit.spinner('NER processing... (please, wait depends on data size)'):
            # Runs CustomNer (and entity-fishing) over the whole plain text.
            doc = huge_pipeline_linking(plain)

            # One row per entity: offsets, surface form, label and
            # linking results (extension attributes set by entity-fishing).
            entities = [
                (ent.start_char,
                 ent.end_char,
                 ent.text,
                 ent.label_,
                 ent._.url_wikidata,
                 ent._.nerd_score
                 ) for ent in doc.ents
            ]
            streamlit.success('π NER applied with success!')

            df = pd.DataFrame(entities, columns=['START',
                                                 'END',
                                                 'MENTION',
                                                 'NER LABEL',
                                                 'WIKIDATA RESSOURCE (wikidata disambiguation)',
                                                 'LINKING SCORE'
                                                 ])

            # Tabular view of all recognized entities.
            streamlit.write("## π Explore named entities in table: ")
            streamlit.write(df)

            # Inline displaCy-style view; manual=True because the spans
            # are supplied as pre-computed dicts rather than a Doc.
            streamlit.write("## π Explore named entities in text: ")
            spacy_streamlit.visualize_ner(
                [{"text": doc.text,
                  "ents": [
                      {"start": ent.start_char,
                       "end": ent.end_char,
                       "label": ent.label_,
                       "kb_id": ent._.kb_qid,
                       "kb_url": ent._.url_wikidata
                       } for ent in doc.ents
                  ]}],
                # Both label schemes are listed because the selectable
                # models do not all share the same tag set.
                labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
                show_table=False,
                manual=True,
                title="",
                displacy_options={
                    "colors": {
                        "EVENT": "#ec7063",
                        "LOCATION": "#45b39d",
                        "ORGANISATION": "#f39c12",
                        "PERSON": "#3498db",
                        "TITLE": "#a569bd ",
                        "LOC": "#45b39d",
                        "MISC": "#ec7063",
                        "ORG": "#f39c12",
                        "PER": "#3498db"
                    }
                })
|
|
|
|
|
|