import re

import streamlit
import spacy_streamlit
import spacy
from lxml import etree
import pandas as pd
from spacy import Language
from spacy.tokens import Doc

streamlit.set_page_config(layout="wide")

samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}

# TITLE APP
streamlit.title("NER4Archives visualizer")
streamlit.sidebar.title("NER4Archives visualizer")
streamlit.sidebar.write("## Motivation")
streamlit.sidebar.markdown("""

This application is a proof of concept for applying named-entity recognition (NER) to XML EAD finding aids and evaluating the NER predictions.

In the context of the NER4Archives project (INRIA-ALMAnaCH / Archives nationales), the goal is to train NER models on an annotated dataset extracted from XML EAD finding aids and to test them on new data.

Most of the models available here are trained with the spaCy NLP framework and are available on the Hugging Face (HF) organisation hub. Other models may be added in the future.

The project also includes a downstream entity linking task. The spaCy fishing extension (based on entity-fishing) is used here for this purpose.

NER4Archives - 2022
""", unsafe_allow_html=True)

scol1, scol2 = streamlit.sidebar.columns(2)
scol1.image("./assets/an.png", width=170)
scol2.image("./assets/almanach_rouge-inria.png", width=100)

flag_file = False

# 1. User provides an XML EAD
streamlit.write("## 📄 Input XML EAD:")
filename = streamlit.file_uploader("Load an XML EAD", type="xml")
streamlit.markdown("or use an XML EAD provided in the [`samples/`](./samples) directory")

data = ""
flag_model = False

if filename is not None:
    data = filename.getvalue().decode("utf-8").encode("utf-8")
    if len(data) > 0:
        flag_file = True


def ead_strategy(tree):
    """Walk the <dsc> levels of an EAD tree and build, for each <did>, a
    whitespace-normalized sentence made of the <did> text followed by the
    text of its next sibling (e.g. <scopecontent>). Returns the list of
    <did> elements and the list of sentences."""
    # containers for the <did> elements and the sentences built from them
    sentences = []
    container_dids = []
    # get the <dsc> level
    dsc = tree.xpath('.//dsc')
    for children_dsc in dsc:
        # get the <did> levels
        for did in children_dsc.xpath('.//did'):
            container_dids.append(did)
            text = ""
            if did is not None:
                text += " ".join(
                    [did_content.strip() for did_content in did.itertext() if len(did_content) > 0])
            # get the following sibling (e.g. <scopecontent>) if it exists
            # and concatenate it with the rest
            if did.getnext() is not None:
                text += " " + " ".join(
                    [" ".join(scopecontent.strip().split()) for scopecontent in did.getnext().itertext()
                     if len(scopecontent) > 0])
            sentences.append(" " + re.sub(r"\s{2,}", " ", text.strip()) + " ")
    # assert len(sentences) == len(container_dids)
    return container_dids, sentences
model = ""
linking = True
flag_view = False

if flag_file:
    col1, col2 = streamlit.columns(2)
    col1.write("## 👁️ XML tree view:")
    col2.write("## 👁️ Plain text view:")
    parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
    tree = etree.fromstring(data, parser=parser)
    xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    col1.text_area("", value=xml, height=500, disabled=True)
    dids, sentences = ead_strategy(tree)
    plain = "\n".join(sentences)
    col2.text_area("", value=plain, height=500, disabled=True)
    flag_view = True

if flag_view:
    streamlit.write("## ⚙️ Configure NER model and options:")
    # list the spaCy pipelines installed in the current environment
    models = list(spacy.info()["pipelines"])
    option = streamlit.selectbox(
        'Choose a NER model to apply from the list: ',
        models)
    model = option
    if model != "":
        flag_model = True
    linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)', value=True)
    linkingicon = "✅️"
    if linking is False:
        linkingicon = "❌"
    streamlit.write("#### Current parameters:")
    streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')


@Language.factory("custom_ner", default_config={
    "model_name": "",
    "sentences_to_process": []
})
class CustomNer:
    def __init__(self, nlp: Language, name: str, model_name: str, sentences_to_process: list):
        self.nlp = nlp
        self.pipeline_ner = spacy.load(model_name)
        # display the evaluation metrics stored in the model's meta
        f_score = self.pipeline_ner.meta['performance']['ents_f']
        recall = self.pipeline_ner.meta['performance']['ents_r']
        precision = self.pipeline_ner.meta['performance']['ents_p']
        mcol1, mcol2, mcol3 = streamlit.columns(3)
        mcol1.metric("F-Score", f'{f_score:.2f}')
        mcol2.metric("Precision", f'{precision:.2f}')
        mcol3.metric("Recall", f'{recall:.2f}')
        self.sentences = sentences_to_process

    def __call__(self, doc: Doc):
        start_sentence = 0
        spans = []
        count = 0
        bar = streamlit.progress(count)
        for sent in self.pipeline_ner.pipe(self.sentences):
            # add 1 char for the separator inserted between sentences
            # when they are concatenated ("\n".join())
            end_sentence = start_sentence + len(sent.text) + 1
            # recompute the named entities' character offsets relative
            # to the whole plain-text document
            for ent in sent.ents:
                start = start_sentence + ent.start_char
                end = start + len(ent.text)
                span = doc.char_span(start, end, label=ent.label_)
                # char_span() returns None when the offsets do not align with
                # token boundaries in the target doc; skip those entities
                if span is not None:
                    spans.append(span)
            start_sentence = end_sentence
            count += 1
            bar.progress(count / len(self.sentences))
        doc.set_ents(spans)
        return doc


entities = []
flag_visualize = False

if flag_model:
    if streamlit.button('Launch'):
        with streamlit.spinner('Initializing NER...'):
            huge_pipeline_linking = spacy.blank("fr")
            huge_pipeline_linking.max_length = 5000000
            huge_pipeline_linking.add_pipe('custom_ner', config={"model_name": model,
                                                                 "sentences_to_process": sentences})
            if linking:
                huge_pipeline_linking.add_pipe('entityfishing', config={"language": "fr"})
        with streamlit.spinner('NER processing... (please wait, duration depends on data size)'):
            doc = huge_pipeline_linking(plain)
            entities = [
                (ent.start_char,
                 ent.end_char,
                 ent.text,
                 ent.label_,
                 # the entity-fishing extensions are only registered when linking is enabled
                 ent._.url_wikidata if linking else "",
                 ent._.nerd_score if linking else "")
                for ent in doc.ents
            ]
        streamlit.success('😃 NER applied successfully!')
        df = pd.DataFrame(entities, columns=['START',
                                             'END',
                                             'MENTION',
                                             'NER LABEL',
                                             'WIKIDATA RESOURCE (Wikidata disambiguation)',
                                             'LINKING SCORE'])
        streamlit.write("## 🔎 Explore named entities in table: ")
        streamlit.write(df)
        streamlit.write("## 🔎 Explore named entities in text: ")
        spacy_streamlit.visualize_ner(
            [{"text": doc.text,
              "ents": [{"start": ent.start_char,
                        "end": ent.end_char,
                        "label": ent.label_,
                        "kb_id": ent._.kb_qid if linking else "",
                        "kb_url": ent._.url_wikidata if linking else ""}
                       for ent in doc.ents]}],
            labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE",
                    'LOC', 'MISC', 'ORG', 'PER'],
            show_table=False,
            manual=True,
            title="",
            displacy_options={
                "colors": {
                    "EVENT": "#ec7063",
                    "LOCATION": "#45b39d",
                    "ORGANISATION": "#f39c12",
                    "PERSON": "#3498db",
                    "TITLE": "#a569bd",
                    "LOC": "#45b39d",
                    "MISC": "#ec7063",
                    "ORG": "#f39c12",
                    "PER": "#3498db"
                }
            })
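# To run the app locally (assuming this script is saved as app.py and that
# streamlit, spacy, spacy_streamlit, lxml, pandas, at least one installed spaCy
# NER pipeline, and the spaCy fishing extension providing the "entityfishing"
# component are available in the environment):
#
#   streamlit run app.py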