import re
import json
import subprocess
import requests
import streamlit
import spacy
from lxml import etree
import pandas as pd
# Constants
CONFIG_FILE = "config.json"
ASSETS_DIR = "assets"
XML_PARSER_CONFIG = {'ns_clean': True, 'recover': True, 'encoding': 'utf-8'}
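# Colors shared between the NER4Archives label set (EVENT, LOCATION, ORGANISATION,
# PERSON, TITLE) and the CoNLL-style label set (LOC, MISC, ORG, PER); equivalent
# labels are mapped to the same color.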
ENTITY_COLORS = {
    "EVENT": "#ec7063",
    "LOCATION": "#45b39d",
    "ORGANISATION": "#f39c12",
    "PERSON": "#3498db",
    "TITLE": "#a569bd",
    "LOC": "#45b39d",
    "MISC": "#ec7063",
    "ORG": "#f39c12",
    "PER": "#3498db"
}
MAP_MODELS = {
    "": "",
    # "fr_ner4archives_V3_camembert_base": "https://huggingface.co/ner4archives/fr_ner4archives_V3_camembert_base/resolve/main/fr_ner4archives_V3_camembert_base-any-py3-none-any.whl",  # Use this only locally (not in HF Spaces)
    "fr_ner4archives_v3_default": "https://huggingface.co/ner4archives/fr_ner4archives_v3_default/resolve/main/fr_ner4archives_v3_default-any-py3-none-any.whl",
    "fr_ner4archives_v3_with_vectors": "https://huggingface.co/ner4archives/fr_ner4archives_v3_with_vectors/resolve/main/fr_ner4archives_v3_with_vectors-any-py3-none-any.whl"
}
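# A model selected in the UI is pip-installed from its wheel URL at runtime and
# then loaded by its package name; a rough sketch of the flow (model name taken
# from the map above):
#   pip install https://huggingface.co/ner4archives/fr_ner4archives_v3_default/resolve/main/fr_ner4archives_v3_default-any-py3-none-any.whl
#   nlp = spacy.load("fr_ner4archives_v3_default")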
# Read configuration
with open(CONFIG_FILE, mode="r") as json_file:
    CONFIGURATION = json.load(json_file)
# Set up Streamlit page
streamlit.set_page_config(layout="wide")
streamlit.title("NER4Archives visualizer")
def ead_strategy(tree):
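    """Collect <did> elements and their plain text from the <dsc> of an EAD tree.

    For each <did>, its own text is concatenated with the text of its following
    sibling (typically a <scopecontent>) into one "sentence". Returns a
    (container_dids, sentences) pair; both lists have one entry per <did>.
    """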
    sentences = []
    container_dids = []
    # get the <dsc> level
    dsc = tree.xpath('.//dsc')
    for children_dsc in dsc:
        # get <did> levels
        for did in children_dsc.xpath('.//did'):
            container_dids.append(did)
            text = ""
            if did is not None:
                text += " ".join(
                    [did_content.strip() for did_content in did.itertext() if len(did_content) > 0])
            # get the scopecontent if it exists and concatenate it with the rest
            if did.getnext() is not None:
                text += " ".join(
                    [" ".join(scopecontent.strip().split()) for scopecontent in did.getnext().itertext()
                     if len(scopecontent) > 0])
            sentences.append(" " + re.sub(r"\s{2,}", " ", text.strip()) + " ")
    # assert len(sentences) == len(container_dids)
    return container_dids, sentences
def process_xml(data):
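    """Parse raw XML bytes into a tree; return the pretty-printed XML string,
    the <did> elements, and the extracted sentences."""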
    parser = etree.XMLParser(**XML_PARSER_CONFIG)
    tree = etree.fromstring(data, parser=parser)
    xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    dids, sentences = ead_strategy(tree)
    return xml, dids, sentences
def is_entity_fishing_online():
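    """Return True if the entity-fishing service answers the health check with HTTP 200."""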
    try:
        # Ping the parent path of the configured entity-fishing endpoint
        response = requests.get("/".join(CONFIGURATION["ef_endpoint"].split("/")[:-1]))
        return response.status_code == 200
    except requests.exceptions.RequestException:
        return False
def setup_sidebar():
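    """Render the static sidebar: project motivation and partner logos."""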
    streamlit.sidebar.title("NER4Archives visualizer")
    streamlit.sidebar.write("## Motivation")
    streamlit.sidebar.markdown("""<div style="text-align: justify;">
    <p>This application is a proof-of-concept for applying Named-Entity Recognition (NER) to
    XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluating the NER predictions.</p>
    <p>In the context of the <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on an annotated dataset
    extracted from XML EAD finding aids and to test them on new data.</p>
    <p>Most of the models available here are trained with the <a href="https://spacy.io/" target="_blank">spaCy</a> NLP
    framework and are available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
    Other models may be added in the future.</p>
    <p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">spaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>
    NER4Archives - 2022/2023</div>
    """, unsafe_allow_html=True)
    scol1, scol2 = streamlit.sidebar.columns(2)
    scol1.image(f"{ASSETS_DIR}/an.png", width=170)
    scol2.image(f"{ASSETS_DIR}/almanach_rouge-inria.png", width=100)
def main():
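    """Run the Streamlit app: upload an XML EAD, preview it, select and (if
    needed) install a NER model, optionally enable entity linking, then apply
    NER and display the entities as a table and as annotated text."""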
    setup_sidebar()
    flag_file = False
    flag_view = False
    flag_model = False
    data = ""
    model = ""
    linking = True
    entities = []
    # 1. User provides an XML EAD
    streamlit.write("## 📄 Input XML EAD:")
    filename = streamlit.file_uploader("Upload an XML EAD (format .xml)", type="xml")
    streamlit.markdown(
        "or use an XML EAD provided in the [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/tree/main/samples) directory")
    if filename is not None:
        # The decode/encode round-trip validates that the upload is UTF-8
        data = filename.getvalue().decode("utf-8").encode("utf-8")
        if len(data) > 0:
            flag_file = True
    if flag_file:
        col1, col2 = streamlit.columns(2)
        col1.write("## 🌲 XML tree view:")
        col2.write("## 📑 Plain text view:")
        xml, _, sentences = process_xml(data)
        col1.text_area("XML Tree View (read-only)", value=xml, height=500, disabled=True)
        plain = "\n".join(sentences)
        col2.text_area("Plain Text View (read-only)", value=plain, height=500, disabled=True)
        flag_view = True
        flag_model = False
    if flag_view:
        streamlit.write("## ⚙️ Configure NER pipeline and options:")
        streamlit.write(
            "⚠️ Using a BERT-based model and/or linking may considerably increase the processing time.")
        # Normally: load from pip or a directory (install issues with HF Spaces)
        models = [str(key) for key in MAP_MODELS.keys()]
        option = streamlit.selectbox(
            'Choose a NER model you want to apply in the list: ',
            models,
            index=0)
        model = option
        model_loaded = None
if model != "":
try:
spacy.load(model)
flag_model = True
streamlit.write(f"{model} is available locally.")
except:
placeholder = streamlit.empty()
button = streamlit.button(f"Download model: {model}")
with placeholder.container():
if button:
streamlit.write(f"Download model: {model} in progress...")
p1 = subprocess.Popen(["pip", "install", MAP_MODELS[model]])
o = p1.wait()
if o == 0:
streamlit.write(f"Download model: {model} done.")
flag_model = True
streamlit.write(f"{model} is available locally.")
placeholder.empty()
    if flag_model:
        gpu = streamlit.checkbox('Check to use GPU (if available)', value=False)
        gpu_icon = "❌"
        if gpu:
            spacy.prefer_gpu()
            gpu_icon = "✅"
        else:
            spacy.require_cpu()
        if is_entity_fishing_online():
            streamlit.write("Entity-fishing server status: 🟢 (you can use the linking feature)")
            linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)',
                                         value=False)
            linkingicon = "✅"
            if linking is False:
                linkingicon = "❌"
        else:
            streamlit.write("Entity-fishing server status: 🔴 (you can't use the linking feature)")
            linking = False
            linkingicon = "❌"
        streamlit.write("#### Current parameters:")
        streamlit.write(f'- NER model selected: {option}\n- Linking activated: {linkingicon}\n- GPU activated: {gpu_icon}')
    # Launch NER process:
    if flag_model:
        if streamlit.button('Launch'):
            plain = "\n".join(sentences)
            with streamlit.spinner('Initialize NER...'):
                nlp = spacy.load(model)
                nlp.max_length = 5000000
                if linking:
                    nlp.add_pipe('entityfishing',
                                 config={"language": "fr", "api_ef_base": CONFIGURATION['ef_endpoint']})
            with streamlit.spinner('NER processing...'):
                # Sentences are joined with "\n" in the plain view, so each doc
                # advances the running character offset by len(doc.text) + 1.
                start_sentence = 0
                for doc in nlp.pipe(sentences):
                    end_sentence = start_sentence + len(doc.text) + 1
                    for ent in doc.ents:
                        # Character span of the entity relative to the full plain text
                        start_tok = start_sentence + ent.start_char
                        end_tok = start_tok + len(ent.text)
                        if linking:
                            entities.append((
                                start_tok,
                                end_tok,
                                ent.text,
                                ent.label_,
                                ent._.kb_qid,
                                ent._.url_wikidata,
                                ent._.nerd_score
                            ))
                        else:
                            entities.append((
                                start_tok,
                                end_tok,
                                ent.text,
                                ent.label_,
                                "",
                                "",
                                ""
                            ))
                    start_sentence = end_sentence
            streamlit.success('🎉 NER applied with success!')
            df = pd.DataFrame(entities, columns=['START',
                                                 'END',
                                                 'MENTION',
                                                 'NER LABEL',
                                                 'QID',
                                                 'WIKIDATA RESOURCE (wikidata disambiguation)',
                                                 'LINKING SCORE'
                                                 ])
            df[['START', 'END']] = df[['START', 'END']].astype(int)
            streamlit.write("## 📊 Explore named entities in table: ")
            streamlit.write(df)
streamlit.write("## π Explore named entities in text: ")
ents_html = spacy.displacy.render(
[{"text": plain,
"ents": [{"start": ent[0],
"end": ent[1],
"label": ent[3],
"kb_id": ent[4] if linking else "",
"kb_url": ent[5] if linking else ""
} for ent in entities]}],
style="ent",
manual=True,
options={
"ents":["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
"colors": ENTITY_COLORS
})
streamlit.markdown(ents_html, unsafe_allow_html=True)
if __name__ == "__main__":
    main()