# NER4Archives visualizer: Streamlit app that extracts text from an uploaded
# XML EAD finding aid, runs a spaCy NER model on it and displays the entities.

import json
import re
import subprocess

import pandas as pd
import requests
import spacy
import streamlit
from lxml import etree

# Constants
CONFIG_FILE = "config.json"
ASSETS_DIR = "assets"
# NOTE(review): uploaded files are untrusted input; consider adding
# 'resolve_entities': False here if the EAD files do not rely on DTD entities.
XML_PARSER_CONFIG = {'ns_clean': True, 'recover': True, 'encoding': 'utf-8'}
ENTITY_COLORS = {
    "EVENT": "#ec7063",
    "LOCATION": "#45b39d",
    "ORGANISATION": "#f39c12",
    "PERSON": "#3498db",
    "TITLE": "#a569bd ",
    "LOC": "#45b39d",
    "MISC": "#ec7063",
    "ORG": "#f39c12",
    "PER": "#3498db"
}
# Maps the name shown in the model selector to the name given to spacy.load().
# The empty entry is the "nothing selected" choice.
MAP_MODELS = {
    "":"",
    #"fr_ner4archives_V3_camembert_base": "fr-ner4archives-V3-camembert-base",
    "fr_ner4archives_v3_default": "fr_ner4archives_v3_default",
    "fr_ner4archives_v3_with_vectors":"fr_ner4archives_v3_with_vectors"
}
# Seconds to wait for the entity-fishing service before reporting it offline.
EF_PING_TIMEOUT = 10

# Read configuration
with open(CONFIG_FILE, mode="r", encoding="utf-8") as json_file:
    CONFIGURATION = json.load(json_file)

# Set up Streamlit page
streamlit.set_page_config(layout="wide")
streamlit.title("NER4Archives visualizer")


def ead_strategy(tree):
    """Extract one text "sentence" per <did> element found under any <dsc>.

    For every ``did`` the text of the element itself is concatenated with the
    text of its next sibling element (when there is one), runs of two or more
    whitespace characters are collapsed to a single space, and the result is
    wrapped in one leading and one trailing space.

    Args:
        tree: parsed lxml element to search.

    Returns:
        A ``(container_dids, sentences)`` tuple of two parallel lists: the
        ``did`` elements and the text extracted for each of them.
    """
    sentences = []
    container_dids = []
    for dsc in tree.xpath('.//dsc'):
        for did in dsc.xpath('.//did'):
            container_dids.append(did)
            segments = [
                " ".join(chunk.strip() for chunk in did.itertext() if chunk)
            ]
            # The next sibling is expected to be the scopecontent, if any.
            sibling = did.getnext()
            if sibling is not None:
                segments.append(" ".join(
                    " ".join(chunk.split())
                    for chunk in sibling.itertext() if chunk
                ))
            # Join with a space so the last word of the did and the first word
            # of the sibling are never fused; surplus spaces are collapsed below.
            text = " ".join(segments)
            sentences.append(" " + re.sub(r"\s{2,}", " ", text.strip()) + " ")
    return container_dids, sentences


def process_xml(data):
    """Parse raw XML and return its pretty-printed form plus extracted text.

    Args:
        data: XML document content handed to ``etree.fromstring``.

    Returns:
        A ``(xml, dids, sentences)`` tuple: the pretty-printed document as a
        string, and the two lists produced by ``ead_strategy``.
    """
    parser = etree.XMLParser(**XML_PARSER_CONFIG)
    tree = etree.fromstring(data, parser=parser)
    xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    dids, sentences = ead_strategy(tree)
    return xml, dids, sentences


def is_entity_fishing_online():
    """Return True when the entity-fishing service answers with HTTP 200.

    The URL queried is the configured ``ef_endpoint`` with its last path
    segment removed. Any request failure (including a timeout) or a missing
    or malformed configuration value is reported as ``False``.
    """
    try:
        base_url = "/".join(CONFIGURATION["ef_endpoint"].split("/")[:-1])
        response = requests.get(base_url, timeout=EF_PING_TIMEOUT)
    except (requests.RequestException, KeyError, AttributeError):
        return False
    return response.status_code == 200


def setup_sidebar():
    """Render the sidebar: title, motivation text and the two logos."""
    streamlit.sidebar.title("NER4Archives visualizer")
    streamlit.sidebar.write("## Motivation")
    streamlit.sidebar.markdown("""

This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on XML EAD finding aids and evaluate NER predictions.

In the context of the NER4Archives project (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset extracted from XML EAD finding aids and test it on new data.

Most of the models available here are trained with the NLP spaCy framework and are available on the HF organisation hub. Other models may be added in the future.

The project also includes a downstream entity linking task. The SpaCy fishing extension (based on entity-fishing) is used here to support this purpose.

NER4Archives - 2022/2023
""", unsafe_allow_html=True)
    scol1, scol2 = streamlit.sidebar.columns(2)
    scol1.image(f"{ASSETS_DIR}/an.png", width=170)
    scol2.image(f"{ASSETS_DIR}/almanach_rouge-inria.png", width=100)


def _collect_entities(nlp, sentences, linking):
    """Run ``nlp`` over ``sentences`` and return entity tuples.

    Offsets are expressed relative to ``"\\n".join(sentences)``: each sentence
    advances the running offset by its length plus one for the newline.

    Returns:
        A list of ``(start, end, mention, label, qid, wikidata_url, score)``
        tuples; the last three fields are empty strings when ``linking`` is
        false.
    """
    entities = []
    offset = 0
    for doc in nlp.pipe(sentences):
        for ent in doc.ents:
            start = offset + ent.start_char
            end = start + len(ent.text)
            if linking:
                link = (ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score)
            else:
                link = ("", "", "")
            entities.append((start, end, ent.text, ent.label_, *link))
        offset += len(doc.text) + 1
    return entities


def main():
    """Drive the app: upload, preview, model selection, NER and display.

    Each step returns early when its prerequisite (an uploaded non-empty
    file, a selected model, a click on "Launch") is not met, so later steps
    never run with missing data.
    """
    setup_sidebar()

    # 1. User provides a XML EAD
    streamlit.write("## 📄 Input XML EAD:")
    filename = streamlit.file_uploader("Upload an XML EAD (format .xml)", type="xml")
    streamlit.markdown(
        "or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/tree/main/samples) directory")

    data = b""
    if filename is not None:
        # The decode/encode round trip raises on content that is not UTF-8.
        data = filename.getvalue().decode("utf-8").encode("utf-8")
    if not data:
        return

    # 2. Show the document as XML and as extracted plain text
    col1, col2 = streamlit.columns(2)
    col1.write("## 👁️ XML tree view:")
    col2.write("## 👁️ Plain text view:")
    xml, _, sentences = process_xml(data)
    col1.text_area("XML Tree View (read-only)", value=xml, height=500, disabled=True)
    plain = "\n".join(sentences)
    col2.text_area("Plain Text View (read-only)", value=plain, height=500, disabled=True)

    # 3. Model selection
    streamlit.write("## ⚙️ Configure NER pipeline and options:")
    streamlit.write(
        "⚠️ Using Bert based model and/or linking may increase considerably the processing time.")
    # Normaly: Load from PIP or directory (install issues with HF spaces)
    models = list(MAP_MODELS)
    model = streamlit.selectbox(
        'Choose a NER model you want to apply in the list: ',
        models,
        index=0)
    if not model:
        return
    print(MAP_MODELS[model])
    nlp = spacy.load(MAP_MODELS[model])
    streamlit.write(f"{model} is available locally.")
    # Entity linking is switched off once a model is loaded; the linking code
    # paths below are kept so it can be re-enabled by changing this flag.
    linking = False

    # 4. Launch NER process
    if not streamlit.button('Launch'):
        return
    with streamlit.spinner('Initialize NER...'):
        nlp.max_length = 5000000
        if linking:
            nlp.add_pipe('entityfishing',
                         config={"language": "fr",
                                 "api_ef_base": CONFIGURATION['ef_endpoint']})
    with streamlit.spinner('NER processing...'):
        entities = _collect_entities(nlp, sentences, linking)
    streamlit.success('😃 NER applied with success!')

    # 5. Display the results as a table and as highlighted text
    df = pd.DataFrame(entities,
                      columns=['START',
                               'END',
                               'MENTION',
                               'NER LABEL',
                               'QID',
                               'WIKIDATA RESSOURCE (wikidata disambiguation)',
                               'LINKING SCORE'
                               ])
    df[['START', 'END']] = df[['START', 'END']].astype(int)
    streamlit.write("## 🔎 Explore named entities in table: ")
    streamlit.write(df)

    streamlit.write("## 🔎 Explore named entities in text: ")
    ents_html = spacy.displacy.render(
        [{"text": plain,
          "ents": [{"start": ent[0],
                    "end": ent[1],
                    "label": ent[3],
                    "kb_id": ent[4] if linking else "",
                    "kb_url": ent[5] if linking else ""
                    } for ent in entities]}],
        style="ent",
        manual=True,
        options={
            "ents": ["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
            "colors": ENTITY_COLORS
        })
    streamlit.markdown(ents_html, unsafe_allow_html=True)


if __name__ == "__main__":
    main()