# Hugging Face file-view residue, kept as a comment so the module stays valid Python:
# lterriel's picture | Update app.py | af032b9 | raw | history blame | 12.3 kB
import json
import re
import subprocess
import sys

import pandas as pd
import requests
import spacy
import streamlit
from lxml import etree
# Constants
# Path of the JSON settings file opened when the module is loaded.
CONFIG_FILE = "config.json"
# Directory holding the logo images displayed in the sidebar.
ASSETS_DIR = "assets"
# Keyword arguments handed to ``etree.XMLParser`` when parsing an upload.
XML_PARSER_CONFIG = {"ns_clean": True, "recover": True, "encoding": "utf-8"}
# Highlight colour (hex) per entity label, passed to displaCy as "colors".
# Long and short label spellings share a colour (e.g. PERSON / PER).
ENTITY_COLORS = {
    "EVENT": "#ec7063",
    "LOCATION": "#45b39d",
    "ORGANISATION": "#f39c12",
    "PERSON": "#3498db",
    "TITLE": "#a569bd",  # stray trailing space removed from the colour code
    "LOC": "#45b39d",
    "MISC": "#ec7063",
    "ORG": "#f39c12",
    "PER": "#3498db",
}
# Selectable NER models: model name -> URL of the wheel given to pip.
# The empty first entry is the "nothing selected" choice of the select box.
MAP_MODELS = {
    "": "",
    # "fr_ner4archives_V3_camembert_base": "https://huggingface.co/ner4archives/fr_ner4archives_V3_camembert_base/resolve/main/fr_ner4archives_V3_camembert_base-any-py3-none-any.whl", # Use this only locally (not in HF Spaces)
    "fr_ner4archives_v3_default": "https://huggingface.co/ner4archives/fr_ner4archives_v3_default/resolve/main/fr_ner4archives_v3_default-any-py3-none-any.whl",
    "fr_ner4archives_v3_with_vectors": "https://huggingface.co/ner4archives/fr_ner4archives_v3_with_vectors/resolve/main/fr_ner4archives_v3_with_vectors-any-py3-none-any.whl",
}
# Read configuration
# The encoding is spelled out so decoding does not depend on the host locale.
# In this file only the "ef_endpoint" entry of the mapping is read.
with open(CONFIG_FILE, mode="r", encoding="utf-8") as json_file:
    CONFIGURATION = json.load(json_file)
# Set up Streamlit page
streamlit.set_page_config(layout="wide")
streamlit.title("NER4Archives visualizer")
def ead_strategy(tree):
    """Build one plain-text sentence per ``<did>`` found below a ``<dsc>``.

    For every ``<did>`` the text of its subtree is joined with the text of
    the element that immediately follows it (when there is one).  The two
    parts are separated by a space so that the last word of the ``<did>``
    and the first word of its sibling can no longer fuse together.

    Args:
        tree: parsed element exposing ``xpath``; the matched ``<did>``
            elements must expose ``itertext`` and ``getnext``.

    Returns:
        A ``(container_dids, sentences)`` tuple of two parallel lists: the
        ``<did>`` elements and, for each, its text with runs of whitespace
        collapsed and one padding space added on both sides.
    """
    sentences = []
    container_dids = []
    # get the <dsc> level
    for dsc in tree.xpath('.//dsc'):
        # get <did> levels
        for did in dsc.xpath('.//did'):
            container_dids.append(did)
            pieces = [" ".join(chunk.strip() for chunk in did.itertext() if chunk)]
            # get the scopecontent if exists and concatenate with the rest
            # NOTE(review): the sibling's tag is not checked, so whatever
            # element follows the <did> is used -- confirm this is intended.
            sibling = did.getnext()
            if sibling is not None:
                pieces.append(" ".join(
                    " ".join(chunk.split()) for chunk in sibling.itertext() if chunk))
            text = " ".join(pieces)
            sentences.append(" " + re.sub(r"\s{2,}", " ", text.strip()) + " ")
    return container_dids, sentences
def process_xml(data):
    """Parse an uploaded EAD document and derive what the app displays.

    Args:
        data: raw XML content passed to ``etree.fromstring``.

    Returns:
        A ``(xml, dids, sentences)`` tuple: the pretty-printed document as a
        string, then the two lists produced by ``ead_strategy``.

    Raises:
        ValueError: if parsing yields no root element.
    """
    parser = etree.XMLParser(**XML_PARSER_CONFIG)
    tree = etree.fromstring(data, parser=parser)
    if tree is None:
        # The parser is configured with recover=True; when nothing can be
        # salvaged there is no root to serialise, so fail with a clear message.
        raise ValueError("No XML element could be parsed from the provided data.")
    xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    dids, sentences = ead_strategy(tree)
    return xml, dids, sentences
def is_entity_fishing_online(timeout=5):
    """Report whether the entity-fishing service answers with HTTP 200.

    The URL probed is ``CONFIGURATION["ef_endpoint"]`` with its last
    ``/``-separated segment removed.

    Args:
        timeout: seconds to wait for the server before giving up, so an
            unresponsive host cannot block the page indefinitely.

    Returns:
        ``True`` only for a 200 response.  ``False`` for any other status,
        for a failed or timed-out request, and for a missing or non-string
        endpoint setting.
    """
    try:
        base_url = "/".join(CONFIGURATION["ef_endpoint"].split("/")[:-1])
        response = requests.get(base_url, timeout=timeout)
    except (KeyError, TypeError, AttributeError, requests.RequestException):
        # Best-effort probe: an unusable setting or an unreachable server
        # simply means "offline"; unrelated errors are no longer hidden.
        return False
    return response.status_code == 200
def setup_sidebar():
    """Fill the Streamlit sidebar: title, project description and two logos.

    The description is raw HTML, hence ``unsafe_allow_html=True``.  The logo
    files are read from ``ASSETS_DIR``.
    """
    streamlit.sidebar.title("NER4Archives visualizer")
    streamlit.sidebar.write("## Motivation")
    # The second paragraph used to end with "<p>" instead of "</p>".
    about_html = """<div style="text-align: justify;">
<p>This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on
XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluate NER predictions.</p>
<p>In the context of the <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset
extracted from XML EAD finding aids and test it on new data.</p>
<p>Most of the models available here are trained with the NLP <a href="https://spacy.io/" target="_blank">spaCy</a>
framework and are available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
Other models may be added in the future.</p>
<p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">SpaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>
NER4Archives - 2022/2023</div>
"""
    streamlit.sidebar.markdown(about_html, unsafe_allow_html=True)
    scol1, scol2 = streamlit.sidebar.columns(2)
    scol1.image(f"{ASSETS_DIR}/an.png", width=170)
    scol2.image(f"{ASSETS_DIR}/almanach_rouge-inria.png", width=100)
def _offer_model_download(model):
    """Show a download button for ``model``; return True once it is installed.

    When the button is clicked, the wheel listed in ``MAP_MODELS`` is
    installed with pip for the interpreter running this script (rather than
    whichever ``pip`` happens to be first on PATH).  A failed installation
    is reported instead of being ignored.
    """
    placeholder = streamlit.empty()
    clicked = streamlit.button(f"Download model: {model}")
    with placeholder.container():
        if not clicked:
            return False
        streamlit.write(f"Download model: {model} in progress...")
        install = subprocess.run(
            [sys.executable, "-m", "pip", "install", MAP_MODELS[model]],
            check=False)
        if install.returncode != 0:
            streamlit.error(
                f"Download model: {model} failed (pip exit code {install.returncode}).")
            return False
        streamlit.write(f"Download model: {model} done.")
        streamlit.write(f"{model} is available locally.")
        placeholder.empty()
    return True


def _select_model():
    """Render the model picker and return ``(model_name, is_available)``.

    Availability is checked by loading the model; ``OSError`` from
    ``spacy.load`` is treated as "not installed" and leads to the download
    button.  Other exceptions are left to propagate.
    """
    # Normally: Load from PIP or directory (install issues with HF spaces)
    model = streamlit.selectbox(
        'Choose a NER model you want to apply in the list: ',
        list(MAP_MODELS),
        index=0)
    if model == "":
        return model, False
    try:
        spacy.load(model)
    except OSError:
        return model, _offer_model_download(model)
    streamlit.write(f"{model} is available locally.")
    return model, True


def _configure_options(model):
    """Render the GPU / linking options and return whether linking is on.

    Linking can only be ticked when ``is_entity_fishing_online()`` is true.
    """
    use_gpu = streamlit.checkbox('Check to use GPU (if available)', value=False)
    if use_gpu:
        spacy.prefer_gpu()
    else:
        spacy.require_cpu()
    if is_entity_fishing_online():
        streamlit.write("Entity-fishing server status: 🟢 (you can use linking feature)")
        linking = streamlit.checkbox(
            'Check to apply named entity linking (entity-fishing component)',
            value=False)
    else:
        streamlit.write("Entity-fishing server status: 🔴 (you can't use linking feature)")
        linking = False
    gpu_icon = "✅️" if use_gpu else "❌"
    linkingicon = "✅️" if linking else "❌"
    streamlit.write("#### Actual Parameters:")
    streamlit.write(f'- NER model selected: {model}\n - Linking activated: {linkingicon} - GPU activated: {gpu_icon}')
    return linking


def _collect_entities(nlp, sentences, linking):
    """Run ``nlp`` over ``sentences`` and return one tuple per entity.

    Each tuple is ``(start, end, mention, label, qid, wikidata_url, score)``.
    ``start`` / ``end`` are character offsets into ``"\\n".join(sentences)``;
    the last three fields are empty strings when ``linking`` is false.
    """
    rows = []
    offset = 0
    for doc in nlp.pipe(sentences):
        for ent in doc.ents:
            start = offset + ent.start_char
            if linking:
                link = (ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score)
            else:
                link = ("", "", "")
            rows.append((start, start + len(ent.text), ent.text, ent.label_, *link))
        # +1 accounts for the "\n" that joins consecutive sentences.
        offset += len(doc.text) + 1
    return rows


def _show_results(plain, entities, linking):
    """Display the entities as a table and as highlighted text."""
    df = pd.DataFrame(entities, columns=['START',
                                         'END',
                                         'MENTION',
                                         'NER LABEL',
                                         'QID',
                                         'WIKIDATA RESSOURCE (wikidata disambiguation)',
                                         'LINKING SCORE'
                                         ])
    df[['START', 'END']] = df[['START', 'END']].astype(int)
    streamlit.write("## 🔎 Explore named entities in table: ")
    streamlit.write(df)
    streamlit.write("## 🔎 Explore named entities in text: ")
    ents_html = spacy.displacy.render(
        [{"text": plain,
          "ents": [{"start": ent[0],
                    "end": ent[1],
                    "label": ent[3],
                    "kb_id": ent[4] if linking else "",
                    "kb_url": ent[5] if linking else ""
                    } for ent in entities]}],
        style="ent",
        manual=True,
        options={
            # Same labels, in the same order, as the colour table.
            "ents": list(ENTITY_COLORS),
            "colors": ENTITY_COLORS
        })
    streamlit.markdown(ents_html, unsafe_allow_html=True)


def main():
    """Drive the page: upload, preview, model choice, options, NER, results.

    Each step is only rendered once the previous one has succeeded; the
    function returns early otherwise.  Uploads that are not valid UTF-8 or
    that cannot be parsed produce an error message instead of a traceback.
    """
    setup_sidebar()

    # 1. User provides a XML EAD
    streamlit.write("## 📄 Input XML EAD:")
    filename = streamlit.file_uploader("Upload an XML EAD (format .xml)", type="xml")
    streamlit.markdown(
        "or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/tree/main/samples) directory")
    if filename is None:
        return
    try:
        # The decode/encode round trip only validates that the bytes are UTF-8.
        data = filename.getvalue().decode("utf-8").encode("utf-8")
    except UnicodeDecodeError:
        streamlit.error("The uploaded file is not valid UTF-8.")
        return
    if not data:
        return

    try:
        xml, _, sentences = process_xml(data)
    except (etree.LxmlError, ValueError, TypeError) as err:
        # NOTE(review): ValueError/TypeError are assumed to be how an
        # unrecoverable document surfaces from process_xml -- confirm.
        streamlit.error(f"Unable to parse the uploaded XML: {err}")
        return
    plain = "\n".join(sentences)

    col1, col2 = streamlit.columns(2)
    col1.write("## 👁️ XML tree view:")
    col2.write("## 👁️ Plain text view:")
    col1.text_area("XML Tree View (read-only)", value=xml, height=500, disabled=True)
    col2.text_area("Plain Text View (read-only)", value=plain, height=500, disabled=True)

    streamlit.write("## ⚙️ Configure NER pipeline and options:")
    streamlit.write(
        "⚠️ Using Bert based model and/or linking may increase considerably the processing time.")
    model, model_ready = _select_model()
    if not model_ready:
        return
    linking = _configure_options(model)

    # Launch NER process:
    if not streamlit.button('Launch'):
        return
    with streamlit.spinner('Initialize NER...'):
        # Loaded again here, after the GPU/CPU preference has been applied.
        nlp = spacy.load(model)
        nlp.max_length = 5000000
        if linking:
            nlp.add_pipe('entityfishing',
                         config={"language": "fr", "api_ef_base": CONFIGURATION['ef_endpoint']})
    with streamlit.spinner('NER processing...'):
        entities = _collect_entities(nlp, sentences, linking)
    streamlit.success('😃 NER applied with success!')
    _show_results(plain, entities, linking)
# Script entry point: build the page only when the file is run as a script.
if __name__ == "__main__":
    main()