# Source: Hugging Face Spaces - ner4archives / ner4archives-NEL-vizualizer-app
# Space status: Running
# File size: 12,315 Bytes

import importlib
import json
import re
import subprocess
import sys

import pandas as pd
import requests
import spacy
import streamlit
from lxml import etree

# Constants
# Path of the JSON configuration file read at start-up.
CONFIG_FILE = "config.json"
# Directory holding the images displayed in the sidebar.
ASSETS_DIR = "assets"
# Keyword arguments forwarded to lxml's XMLParser.
XML_PARSER_CONFIG = {'ns_clean': True, 'recover': True, 'encoding': 'utf-8'}
# Highlight colour (CSS hex value) per entity label; both the long labels and
# their short counterparts (LOC, MISC, ORG, PER) are listed.
ENTITY_COLORS = {
    "EVENT": "#ec7063",
    "LOCATION": "#45b39d",
    "ORGANISATION": "#f39c12",
    "PERSON": "#3498db",
    "TITLE": "#a569bd",
    "LOC": "#45b39d",
    "MISC": "#ec7063",
    "ORG": "#f39c12",
    "PER": "#3498db"
}

# Model name -> URL of the wheel installed on demand with pip.
# The empty entry is the "nothing selected" choice of the model select box.
MAP_MODELS = {
    "":"",
    # "fr_ner4archives_V3_camembert_base": "https://huggingface.co/ner4archives/fr_ner4archives_V3_camembert_base/resolve/main/fr_ner4archives_V3_camembert_base-any-py3-none-any.whl", # Use this only locally (not in HF Spaces)
    "fr_ner4archives_v3_default": "https://huggingface.co/ner4archives/fr_ner4archives_v3_default/resolve/main/fr_ner4archives_v3_default-any-py3-none-any.whl",
    "fr_ner4archives_v3_with_vectors":"https://huggingface.co/ner4archives/fr_ner4archives_v3_with_vectors/resolve/main/fr_ner4archives_v3_with_vectors-any-py3-none-any.whl"
}


# Read configuration. The encoding is given explicitly so that the result does
# not depend on the platform's default text encoding.
with open(CONFIG_FILE, mode="r", encoding="utf-8") as json_file:
    CONFIGURATION = json.load(json_file)

# Set up Streamlit page
streamlit.set_page_config(layout="wide")
streamlit.title("NER4Archives visualizer")


def ead_strategy(tree):
    """Build one plain-text "sentence" per <did> element found under <dsc>.

    For every ``did`` descendant of every ``dsc`` element of *tree*, the text
    of the ``did`` is followed by the text of its next sibling element (when
    there is one), whitespace runs are collapsed and the result is padded
    with one space on each side.

    Args:
        tree: Parsed XML element exposing ``xpath``; the matched elements
            must expose ``itertext`` and ``getnext`` (lxml elements do).

    Returns:
        A ``(container_dids, sentences)`` tuple of two lists of equal length:
        the ``did`` elements and the sentence built for each of them.
    """
    sentences = []
    container_dids = []
    for dsc in tree.xpath('.//dsc'):
        for did in dsc.xpath('.//did'):
            container_dids.append(did)
            parts = [" ".join(fragment.strip() for fragment in did.itertext())]
            # NOTE(review): the next sibling is used whatever its tag is; the
            # original intent was to pick up <scopecontent> - confirm that
            # other sibling elements should be included as well.
            sibling = did.getnext()
            if sibling is not None:
                parts.append(" ".join(
                    " ".join(fragment.split()) for fragment in sibling.itertext()))
            # Join with a space so the last word of the <did> is never fused
            # with the first word of the sibling text.
            text = " ".join(parts)
            sentences.append(" " + re.sub(r"\s{2,}", " ", text.strip()) + " ")
    return container_dids, sentences


def process_xml(data):
    """Parse raw XML EAD *data* and prepare the views shown in the app.

    Args:
        data: XML document content handed to ``etree.fromstring``.

    Returns:
        A ``(xml, dids, sentences)`` tuple: the pretty-printed document as a
        string, followed by the two lists returned by ``ead_strategy``.
    """
    root = etree.fromstring(data, parser=etree.XMLParser(**XML_PARSER_CONFIG))
    pretty_xml = etree.tostring(
        root,
        pretty_print=True,
        encoding="utf-8",
    ).decode("utf-8")
    container_dids, plain_sentences = ead_strategy(root)
    return pretty_xml, container_dids, plain_sentences


def is_entity_fishing_online(timeout=10):
    """Tell whether the configured entity-fishing server answers.

    The URL probed is the configured ``ef_endpoint`` with its last path
    segment removed.

    Args:
        timeout: Seconds to wait for the server before giving up, so that an
            unreachable server cannot block the page indefinitely.

    Returns:
        True when the server replies with HTTP 200, False for any other
        status code or when the probe fails for any reason.
    """
    try:
        base_url = CONFIGURATION["ef_endpoint"].rpartition("/")[0]
        response = requests.get(base_url, timeout=timeout)
    except Exception:
        # Best-effort probe: any failure (missing configuration key, invalid
        # URL, network error, timeout) simply means "not usable".
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        return False
    return response.status_code == 200


def setup_sidebar():
    """Fill the Streamlit sidebar: title, project description and logos."""
    streamlit.sidebar.title("NER4Archives visualizer")
    streamlit.sidebar.write("## Motivation")
    # The description is raw HTML, hence unsafe_allow_html=True.
    streamlit.sidebar.markdown("""<div style="text-align: justify;">
    <p>This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on
    XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluate NER predictions.</p>
    <p>In the context of the <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset 
    extracted from XML EAD finding aids and test it on new data.</p>
    <p>Most of the models available here are trained with the NLP <a href="https://spacy.io/" target="_blank">spaCy</a> 
    framework and are available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>. 
    Other models may be added in the future.</p>
    <p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">SpaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>
    NER4Archives - 2022/2023</div>
    """, unsafe_allow_html=True)
    # Two logos displayed side by side under the description.
    scol1, scol2 = streamlit.sidebar.columns(2)
    scol1.image(f"{ASSETS_DIR}/an.png", width=170)
    scol2.image(f"{ASSETS_DIR}/almanach_rouge-inria.png", width=100)


def _collect_entities(nlp, sentences, linking):
    """Run *nlp* over *sentences* and return one row per detected entity.

    Each row is ``(start, end, mention, label, qid, wikidata_url, score)``.
    ``start`` and ``end`` are character offsets into the text obtained by
    joining the sentences with a single newline. The last three fields are
    read from the entity-fishing extension attributes when *linking* is true
    and are empty strings otherwise.
    """
    rows = []
    offset = 0
    for doc in nlp.pipe(sentences):
        for ent in doc.ents:
            start = offset + ent.start_char
            if linking:
                qid = ent._.kb_qid
                url = ent._.url_wikidata
                score = ent._.nerd_score
            else:
                qid = url = score = ""
            rows.append(
                (start, start + len(ent.text), ent.text, ent.label_, qid, url, score))
        # +1 for the newline that separates two sentences in the joined text.
        offset += len(doc.text) + 1
    return rows


def main():
    """Render the page: XML upload, pipeline configuration, NER results.

    The function returns early as soon as a step cannot be completed (no
    file uploaded, no model selected, model not installed, "Launch" not
    pressed), so the steps below a missing prerequisite are not displayed.
    """
    setup_sidebar()

    # 1. User provides a XML EAD
    streamlit.write("## 📄 Input XML EAD:")
    filename = streamlit.file_uploader("Upload an XML EAD (format .xml)", type="xml")
    streamlit.markdown(
        "or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/tree/main/samples) directory")
    if filename is None:
        return
    # The decode/encode round trip rejects uploads that are not valid UTF-8.
    data = filename.getvalue().decode("utf-8").encode("utf-8")
    if not data:
        return

    # 2. Show the document as XML and as extracted plain text
    col1, col2 = streamlit.columns(2)
    col1.write("## 👁️ XML tree view:")
    col2.write("## 👁️ Plain text view:")
    xml, _, sentences = process_xml(data)
    col1.text_area("XML Tree View (read-only)", value=xml, height=500, disabled=True)
    plain = "\n".join(sentences)
    col2.text_area("Plain Text View (read-only)", value=plain, height=500, disabled=True)

    # 3. Choose the model and the options
    streamlit.write("## ⚙️ Configure NER pipeline and options:")
    streamlit.write(
        "⚠️ Using Bert based model and/or linking may increase considerably the processing time.")
    # Models are installed on demand from their wheel URL (MAP_MODELS).
    model = streamlit.selectbox(
        'Choose a NER model you want to apply in the list: ',
        list(MAP_MODELS),
        index=0)
    if model == "":
        return

    model_ready = False
    try:
        spacy.load(model)
    except Exception:
        # The model cannot be loaded: offer to install it. The progress
        # messages live in a placeholder that is cleared afterwards.
        placeholder = streamlit.empty()
        download_requested = streamlit.button(f"Download model: {model}")
        install_status = None
        with placeholder.container():
            if download_requested:
                streamlit.write(f"Download model: {model} in progress...")
                # Use the running interpreter's pip so that the model lands
                # in the environment this application imports from.
                install_status = subprocess.run(
                    [sys.executable, "-m", "pip", "install", MAP_MODELS[model]],
                    check=False,
                ).returncode
                if install_status == 0:
                    # Make the freshly installed package importable from
                    # this already running process.
                    importlib.invalidate_caches()
                    streamlit.write(f"Download model: {model} done.")
                    model_ready = True
                    streamlit.write(f"{model} is available locally.")
        placeholder.empty()
        if install_status not in (None, 0):
            streamlit.error(
                f"Download model: {model} failed (pip exit code {install_status}).")
    else:
        model_ready = True
        streamlit.write(f"{model} is available locally.")
    if not model_ready:
        return

    gpu = streamlit.checkbox('Check to use GPU (if available)', value=False)
    if gpu:
        spacy.prefer_gpu()
        gpu_icon = "✅️"
    else:
        spacy.require_cpu()
        gpu_icon = "❌"

    if is_entity_fishing_online():
        streamlit.write("Entity-fishing server status: 🟢 (you can use linking feature)")
        linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)',
                                     value=False)
    else:
        streamlit.write("Entity-fishing server status: 🔴 (you can't use linking feature)")
        linking = False
    linkingicon = "✅️" if linking else "❌"

    streamlit.write("#### Actual Parameters:")
    # One markdown bullet per parameter.
    streamlit.write(
        f'- NER model selected: {model}\n'
        f'- Linking activated: {linkingicon}\n'
        f'- GPU activated: {gpu_icon}')

    # 4. Launch NER process
    if not streamlit.button('Launch'):
        return

    with streamlit.spinner('Initialize NER...'):
        nlp = spacy.load(model)
        nlp.max_length = 5000000
        if linking:
            nlp.add_pipe('entityfishing',
                         config={"language": "fr", "api_ef_base": CONFIGURATION['ef_endpoint']})

    with streamlit.spinner('NER processing...'):
        entities = _collect_entities(nlp, sentences, linking)

    streamlit.success('😃 NER applied with success!')

    # 5. Results as a table
    df = pd.DataFrame(entities, columns=['START',
                                         'END',
                                         'MENTION',
                                         'NER LABEL',
                                         'QID',
                                         'WIKIDATA RESSOURCE  (wikidata disambiguation)',
                                         'LINKING SCORE'
                                         ])
    df[['START', 'END']] = df[['START', 'END']].astype(int)
    streamlit.write("## 🔎 Explore named entities in table: ")
    streamlit.write(df)

    # 6. Results highlighted in the plain text; the offsets stored in
    # `entities` index into `plain`. The QID and URL fields are already empty
    # strings when linking is disabled.
    streamlit.write("## 🔎 Explore named entities in text: ")
    ents_html = spacy.displacy.render(
        [{"text": plain,
          "ents": [{"start": ent[0],
                    "end": ent[1],
                    "label": ent[3],
                    "kb_id": ent[4],
                    "kb_url": ent[5]
                    } for ent in entities]}],
        style="ent",
        manual=True,
        options={
            "ents": ["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
            "colors": ENTITY_COLORS
        })
    streamlit.markdown(ents_html, unsafe_allow_html=True)


# Script entry point: the page is built only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()