import re

import streamlit
import spacy_streamlit
import spacy
from lxml import etree
import pandas as pd
from spacy import Language
from spacy.tokens import Doc

streamlit.set_page_config(layout="wide")

samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}

# TITLE APP
streamlit.title("NER4Archives visualizer")
streamlit.sidebar.title("NER4Archives visualizer")
streamlit.sidebar.write("## Motivation")
streamlit.sidebar.markdown("""

This application is a proof of concept for applying named-entity recognition (NER) to XML EAD finding aids and evaluating the NER predictions.

In the context of the NER4Archives project (INRIA-ALMAnaCH / Archives nationales), the goal is to train NER models on an annotated dataset extracted from XML EAD finding aids and to test them on new data.

Most of the models available here are trained with the spaCy NLP framework and are available on the Hugging Face (HF) organisation hub. Other models may be added in the future.

The project also includes a downstream entity linking task. The spaCy fishing extension (based on entity-fishing) is used here for this purpose.

NER4Archives - 2022
""", unsafe_allow_html=True)

scol1, scol2 = streamlit.sidebar.columns(2)
scol1.image("./assets/an.png", width=170)
scol2.image("./assets/almanach_rouge-inria.png", width=100)

flag_file = False

# 1. User provides an XML EAD
streamlit.write("## 📄 Input XML EAD:")
filename = streamlit.file_uploader("Load an XML EAD", type="xml")
streamlit.markdown("or use an XML EAD provided in the [`samples/`](./samples) directory")

data = ""
flag_model = False

if filename is not None:
    data = filename.getvalue().decode("utf-8").encode("utf-8")
    if len(data) > 0:
        flag_file = True


def ead_strategy(tree):
    """Walk the <dsc> levels of an EAD tree and build, for each <did>, a
    whitespace-normalized sentence made of the <did> text followed by the
    text of its next sibling (e.g. <scopecontent>). Returns the list of
    <did> elements and the list of sentences."""
    # containers for the <did> elements and the sentences built from them
    sentences = []
    container_dids = []
    # get the <dsc> level
    dsc = tree.xpath('.//dsc')
    for children_dsc in dsc:
        # get the <did> levels
        for did in children_dsc.xpath('.//did'):
            container_dids.append(did)
            text = ""
            if did is not None:
                text += " ".join(
                    [did_content.strip() for did_content in did.itertext() if len(did_content) > 0])
            # get the following sibling (e.g. <scopecontent>) if it exists
            # and concatenate it with the rest
            if did.getnext() is not None:
                text += " " + " ".join(
                    [" ".join(scopecontent.strip().split()) for scopecontent in did.getnext().itertext()
                     if len(scopecontent) > 0])
            sentences.append(" " + re.sub(r"\s{2,}", " ", text.strip()) + " ")
    # assert len(sentences) == len(container_dids)
    return container_dids, sentences
model = ""
linking = True
flag_view = False

if flag_file:
    col1, col2 = streamlit.columns(2)
    col1.write("## 👁️ XML tree view:")
    col2.write("## 👁️ Plain text view:")
    parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
    tree = etree.fromstring(data, parser=parser)
    xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    col1.text_area("", value=xml, height=500, disabled=True)
    dids, sentences = ead_strategy(tree)
    plain = "\n".join(sentences)
    col2.text_area("", value=plain, height=500, disabled=True)
    flag_view = True

if flag_view:
    streamlit.write("## ⚙️ Configure NER model and options:")
    # list the spaCy pipelines installed in the current environment
    models = list(spacy.info()["pipelines"])
    option = streamlit.selectbox(
        'Choose a NER model to apply from the list: ',
        models)
    model = option
    if model != "":
        flag_model = True
    linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)', value=True)
    linkingicon = "✅️"
    if linking is False:
        linkingicon = "❌"
    streamlit.write("#### Current parameters:")
    streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')


@Language.factory("custom_ner", default_config={
    "model_name": "",
    "sentences_to_process": []
})
class CustomNer:
    def __init__(self, nlp: Language, name: str, model_name: str, sentences_to_process: list):
        self.nlp = nlp
        self.pipeline_ner = spacy.load(model_name)
        # display the evaluation metrics stored in the model's meta
        f_score = self.pipeline_ner.meta['performance']['ents_f']
        recall = self.pipeline_ner.meta['performance']['ents_r']
        precision = self.pipeline_ner.meta['performance']['ents_p']
        mcol1, mcol2, mcol3 = streamlit.columns(3)
        mcol1.metric("F-Score", f'{f_score:.2f}')
        mcol2.metric("Precision", f'{precision:.2f}')
        mcol3.metric("Recall", f'{recall:.2f}')
        self.sentences = sentences_to_process

    def __call__(self, doc: Doc):
        start_sentence = 0
        spans = []
        count = 0
        bar = streamlit.progress(count)
        for sent in self.pipeline_ner.pipe(self.sentences):
            # add 1 char for the separator inserted between sentences
            # when they are concatenated ("\n".join())
            end_sentence = start_sentence + len(sent.text) + 1
            # recompute the named entities' character offsets relative
            # to the whole plain-text document
            for ent in sent.ents:
                start = start_sentence + ent.start_char
                end = start + len(ent.text)
                span = doc.char_span(start, end, label=ent.label_)
                # char_span() returns None when the offsets do not align with
                # token boundaries in the target doc; skip those entities
                if span is not None:
                    spans.append(span)
            start_sentence = end_sentence
            count += 1
            bar.progress(count / len(self.sentences))
        doc.set_ents(spans)
        return doc


entities = []
flag_visualize = False

if flag_model:
    if streamlit.button('Launch'):
        with streamlit.spinner('Initializing NER...'):
            huge_pipeline_linking = spacy.blank("fr")
            huge_pipeline_linking.max_length = 5000000
            huge_pipeline_linking.add_pipe('custom_ner', config={"model_name": model,
                                                                 "sentences_to_process": sentences})
            if linking:
                huge_pipeline_linking.add_pipe('entityfishing', config={"language": "fr"})
        with streamlit.spinner('NER processing... (please wait, duration depends on data size)'):
            doc = huge_pipeline_linking(plain)
            entities = [
                (ent.start_char,
                 ent.end_char,
                 ent.text,
                 ent.label_,
                 # the entity-fishing extensions are only registered when linking is enabled
                 ent._.url_wikidata if linking else "",
                 ent._.nerd_score if linking else "")
                for ent in doc.ents
            ]
        streamlit.success('😃 NER applied successfully!')
        df = pd.DataFrame(entities, columns=['START',
                                             'END',
                                             'MENTION',
                                             'NER LABEL',
                                             'WIKIDATA RESOURCE (Wikidata disambiguation)',
                                             'LINKING SCORE'])
        streamlit.write("## 🔎 Explore named entities in table: ")
        streamlit.write(df)
        streamlit.write("## 🔎 Explore named entities in text: ")
        spacy_streamlit.visualize_ner(
            [{"text": doc.text,
              "ents": [{"start": ent.start_char,
                        "end": ent.end_char,
                        "label": ent.label_,
                        "kb_id": ent._.kb_qid if linking else "",
                        "kb_url": ent._.url_wikidata if linking else ""}
                       for ent in doc.ents]}],
            labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE",
                    'LOC', 'MISC', 'ORG', 'PER'],
            show_table=False,
            manual=True,
            title="",
            displacy_options={
                "colors": {
                    "EVENT": "#ec7063",
                    "LOCATION": "#45b39d",
                    "ORGANISATION": "#f39c12",
                    "PERSON": "#3498db",
                    "TITLE": "#a569bd",
                    "LOC": "#45b39d",
                    "MISC": "#ec7063",
                    "ORG": "#f39c12",
                    "PER": "#3498db"
                }
            })
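# To run the app locally (assuming this script is saved as app.py and that
# streamlit, spacy, spacy_streamlit, lxml, pandas, at least one installed spaCy
# NER pipeline, and the spaCy fishing extension providing the "entityfishing"
# component are available in the environment):
#
#   streamlit run app.py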