lterriel committed on
Commit
ac73442
·
1 Parent(s): 4c6d441

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -168
app.py CHANGED
@@ -1,62 +1,40 @@
1
  import re
2
  import json
3
 
 
 
4
  import streamlit
5
  import spacy_streamlit
6
  import spacy
7
  from lxml import etree
8
  import pandas as pd
9
 
10
-
11
- streamlit.set_page_config(layout="wide")
12
-
13
- samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}
14
-
15
- with open('config.json', mode="r") as json_file:
 
 
 
 
 
 
 
 
 
 
 
 
16
  CONFIGURATION = json.loads(json_file.read())
17
 
18
-
19
-
20
- # TITLE APP
21
  streamlit.title("NER4Archives visualizer")
22
- streamlit.sidebar.title("NER4Archives visualizer")
23
- streamlit.sidebar.write("## Motivation")
24
- streamlit.sidebar.markdown("""<div style="text-align: justify;">
25
- <p>This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on
26
- XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluate NER predictions.</p>
27
 
28
- <p>In the context of the <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset
29
- extracted from XML EAD finding aids and test it on new data.<p>
30
 
31
- <p>Most of the models available here are trained with the NLP <a href="https://spacy.io/" target="_blank">spaCy</a>
32
- framework and are available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
33
- Other models may be added in the future.</p>
34
-
35
- <p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">SpaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>
36
-
37
- NER4Archives - 2022</div>
38
- """, unsafe_allow_html=True)
39
-
40
- scol1, scol2 = streamlit.sidebar.columns(2)
41
- scol1.image("./assets/an.png", width=170)
42
- scol2.image("./assets/almanach_rouge-inria.png", width=100)
43
-
44
- flag_file = False
45
-
46
- # 1. User provides a XML EAD
47
- streamlit.write("## 📄 Input XML EAD:")
48
- filename = streamlit.file_uploader("Upload an XML EAD", type="xml")
49
- streamlit.markdown("or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/blob/main/samples/) directory")
50
- data = ""
51
- flag_model = False
52
-
53
- if filename is not None:
54
- data = filename.getvalue().decode("utf-8").encode("utf-8")
55
- if len(data) > 0:
56
- flag_file = True
57
  def ead_strategy(tree):
58
- # create a container for sentences and dids
59
- # elements
60
  sentences = []
61
  container_dids = []
62
  # get the <dsc> level
@@ -78,132 +56,186 @@ def ead_strategy(tree):
78
  # assert len(sentences) == len(container_dids)
79
  return container_dids, sentences
80
 
81
- model = ""
82
- linking = True
83
- flag_view = False
84
- if flag_file:
85
- col1, col2 = streamlit.columns(2)
86
- col1.write("## 👁️ XML tree view:")
87
- col2.write("## 👁️ Plain text view:")
88
- parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
89
  tree = etree.fromstring(data, parser=parser)
90
  xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
91
- col1.text_area("", value=xml, height=500, disabled=True)
92
  dids, sentences = ead_strategy(tree)
93
- plain = "\n".join(sentences)
94
- col2.text_area("", value=plain, height=500, disabled=True)
95
- flag_view = True
96
-
97
- if flag_view:
98
- streamlit.write("## ⚙️ Configure NER model and options:")
99
- models = []
100
- for pipe in spacy.info()["pipelines"]:
101
- models.append(pipe)
102
- option = streamlit.selectbox(
103
- 'Choose a NER model you want to apply in the list: ',
104
- models)
105
- model = option
106
- if model != "":
107
- flag_model = True
108
- #linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)', value=True)
109
- #linkingicon = "✅️"
110
- #if linking is False:
111
- # linkingicon = "❌"
112
- linking = False
113
- streamlit.write("#### Actual Parameters:")
114
- #streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')
115
- streamlit.write(f'- NER model selected: {option}\n')
116
-
117
- entities = []
118
- docs = []
119
- ents = []
120
- flag_vizualize = False
121
-
122
- # Launch NER process:
123
- if flag_model:
124
- if streamlit.button('Launch'):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  plain = "\n".join(sentences)
126
- with streamlit.spinner('Initialize NER...'):
127
- nlp = spacy.load(model)
128
- nlp.max_length = 5000000
129
- if linking:
130
- nlp.add_pipe('entityfishing', config={"language": "fr", "api_ef_base": CONFIGURATION['ef_endpoint']})
131
-
132
- with streamlit.spinner('NER processing...'):
133
- if linking:
134
- start_sentence = 0
135
- for doc in nlp.pipe(sentences, batch_size=250):
136
- end_sentence = start_sentence + len(doc.text) + 1
137
- for ent in doc.ents:
138
- start_tok = start_sentence + ent.start_char
139
- end_tok = start_tok + len(ent.text)
140
- entities.append((
141
- start_tok,
142
- end_tok,
143
- ent.text,
144
- ent.label_,
145
- ent._.kb_qid,
146
- ent._.url_wikidata,
147
- ent._.nerd_score
148
- ))
149
- start_sentence = end_sentence
 
 
 
 
 
 
 
150
  else:
151
- start_sentence = 0
152
- for doc in nlp.pipe(sentences):
153
- end_sentence = start_sentence + len(doc.text) + 1
154
- for ent in doc.ents:
155
- start_tok = start_sentence + ent.start_char
156
- end_tok = start_tok + len(ent.text)
157
- entities.append((start_tok,
158
- end_tok,
159
- ent.text,
160
- ent.label_,
161
- "",
162
- "",
163
- ""
164
- ))
165
- start_sentence = end_sentence
166
-
167
-
168
- streamlit.success('😃 NER applied with success!')
169
-
170
-
171
- df = pd.DataFrame(entities, columns=['START',
172
- 'END',
173
- 'MENTION',
174
- 'NER LABEL',
175
- 'QID',
176
- 'WIKIDATA RESSOURCE (wikidata disambiguation)',
177
- 'LINKING SCORE'
178
- ])
179
-
180
- streamlit.write("## 🔎 Explore named entities in table: ")
181
- streamlit.write(df)
182
-
183
-
184
- streamlit.write("## 🔎 Explore named entities in text: ")
185
- spacy_streamlit.visualize_ner(
186
- {"text": plain,
187
- "ents": [{"start": ent[0],
188
- "end": ent[1],
189
- "label": ent[3],
190
- "kb_id": ent[4] if linking else "",
191
- "kb_url": ent[5] if linking else ""
192
- } for ent in entities]},
193
- labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
194
- show_table=False,
195
- manual=True,
196
- title="",
197
- displacy_options={
198
- "colors": {
199
- "EVENT": "#ec7063",
200
- "LOCATION": "#45b39d",
201
- "ORGANISATION": "#f39c12",
202
- "PERSON": "#3498db",
203
- "TITLE": "#a569bd ",
204
- "LOC": "#45b39d",
205
- "MISC": "#ec7063",
206
- "ORG": "#f39c12",
207
- "PER": "#3498db"
208
- }
209
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
  import json
3
 
4
+ import requests
5
+
6
  import streamlit
7
  import spacy_streamlit
8
  import spacy
9
  from lxml import etree
10
  import pandas as pd
11
 
12
+ # Constants
13
+ CONFIG_FILE = "config.json"
14
+ ASSETS_DIR = "./assets"
15
+ XML_PARSER_CONFIG = {'ns_clean': True, 'recover': True, 'encoding': 'utf-8'}
16
+ ENTITY_COLORS = {
17
+ "EVENT": "#ec7063",
18
+ "LOCATION": "#45b39d",
19
+ "ORGANISATION": "#f39c12",
20
+ "PERSON": "#3498db",
21
+ "TITLE": "#a569bd ",
22
+ "LOC": "#45b39d",
23
+ "MISC": "#ec7063",
24
+ "ORG": "#f39c12",
25
+ "PER": "#3498db"
26
+ }
27
+
28
+ # Read configuration
29
+ with open(CONFIG_FILE, mode="r") as json_file:
30
  CONFIGURATION = json.loads(json_file.read())
31
 
32
+ # Set up Streamlit page
33
+ streamlit.set_page_config(layout="wide")
 
34
  streamlit.title("NER4Archives visualizer")
 
 
 
 
 
35
 
 
 
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def ead_strategy(tree):
 
 
38
  sentences = []
39
  container_dids = []
40
  # get the <dsc> level
 
56
  # assert len(sentences) == len(container_dids)
57
  return container_dids, sentences
58
 
59
+
60
+ def process_xml(data):
61
+ parser = etree.XMLParser(**XML_PARSER_CONFIG)
 
 
 
 
 
62
  tree = etree.fromstring(data, parser=parser)
63
  xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
 
64
  dids, sentences = ead_strategy(tree)
65
+ return xml, dids, sentences
66
+
67
+
68
+ def is_entity_fishing_online():
69
+ try:
70
+ response = requests.get("/".join(CONFIGURATION["ef_endpoint"].split("/")[:-1]))
71
+ if response.status_code == 200:
72
+ return True
73
+ else:
74
+ return False
75
+ except:
76
+ return False
77
+
78
+
79
+ def setup_sidebar():
80
+ streamlit.sidebar.title("NER4Archives visualizer")
81
+ streamlit.sidebar.write("## Motivation")
82
+ streamlit.sidebar.markdown("""<div style="text-align: justify;">
83
+ <p>This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on
84
+ XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluate NER predictions.</p>
85
+ <p>In the context of the <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset
86
+ extracted from XML EAD finding aids and test it on new data.<p>
87
+ <p>Most of the models available here are trained with the NLP <a href="https://spacy.io/" target="_blank">spaCy</a>
88
+ framework and are available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
89
+ Other models may be added in the future.</p>
90
+ <p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">SpaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>
91
+ NER4Archives - 2022/2023</div>
92
+ """, unsafe_allow_html=True)
93
+ scol1, scol2 = streamlit.sidebar.columns(2)
94
+ scol1.image(f"{ASSETS_DIR}/an.png", width=170)
95
+ scol2.image(f"{ASSETS_DIR}/almanach_rouge-inria.png", width=100)
96
+
97
+
98
+ def main():
99
+ setup_sidebar()
100
+ flag_file = False
101
+ flag_model = False
102
+ data = ""
103
+ model = ""
104
+ linking = True
105
+ entities = []
106
+ # 1. User provides a XML EAD
107
+ streamlit.write("## 📄 Input XML EAD:")
108
+ filename = streamlit.file_uploader("Upload an XML EAD (format .xml)", type="xml", label_visibility="collapsed")
109
+ streamlit.markdown(
110
+ "or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/blob/main/samples/) directory")
111
+
112
+ if filename is not None:
113
+ data = filename.getvalue().decode("utf-8").encode("utf-8")
114
+ if len(data) > 0:
115
+ flag_file = True
116
+
117
+ if flag_file:
118
+ col1, col2 = streamlit.columns(2)
119
+ col1.write("## 👁️ XML tree view:")
120
+ col2.write("## 👁️ Plain text view:")
121
+ xml, _, sentences = process_xml(data)
122
+ col1.text_area("XML Tree View (read-only)", value=xml, height=500, disabled=True)
123
  plain = "\n".join(sentences)
124
+ col2.text_area("Plain Text View (read-only)", value=plain, height=500, disabled=True)
125
+ flag_view = True
126
+
127
+ if flag_view:
128
+ streamlit.write("## ⚙️ Configure NER pipeline and options:")
129
+ streamlit.write("⚠️ Using Bert based model and/or linking may increase considerably the processing time.")
130
+ models = []
131
+ for pipe in spacy.info()["pipelines"]:
132
+ models.append(pipe)
133
+ option = streamlit.selectbox(
134
+ 'Choose a NER model you want to apply in the list: ',
135
+ models)
136
+ model = option
137
+ if model != "":
138
+ flag_model = True
139
+
140
+ gpu = streamlit.checkbox('Check to use GPU (if available)', value=False)
141
+ gpu_icon = "❌"
142
+ if gpu:
143
+ spacy.prefer_gpu()
144
+ gpu_icon = "✅️"
145
+ else:
146
+ spacy.require_cpu()
147
+
148
+ if is_entity_fishing_online():
149
+ streamlit.write("Entity-fishing server status: 🟢 (you can use linking feature)")
150
+ linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)',
151
+ value=False)
152
+ linkingicon = "✅️"
153
+ if linking is False:
154
+ linkingicon = "❌"
155
  else:
156
+ streamlit.write("Entity-fishing server status: 🔴 (you can't use linking feature)")
157
+ linking = False
158
+ linkingicon = "❌"
159
+ streamlit.write("#### Actual Parameters:")
160
+ streamlit.write(f'- NER model selected: {option}\n - Linking activated: {linkingicon} - GPU activated: {gpu_icon}')
161
+
162
+ # Launch NER process:
163
+ if flag_model:
164
+ if streamlit.button('Launch'):
165
+ plain = "\n".join(sentences)
166
+ with streamlit.spinner('Initialize NER...'):
167
+ nlp = spacy.load(model)
168
+ nlp.max_length = 5000000
169
+ if linking:
170
+ nlp.add_pipe('entityfishing',
171
+ config={"language": "fr", "api_ef_base": CONFIGURATION['ef_endpoint']})
172
+
173
+ with streamlit.spinner('NER processing...'):
174
+ if linking:
175
+ start_sentence = 0
176
+ for doc in nlp.pipe(sentences):
177
+ end_sentence = start_sentence + len(doc.text) + 1
178
+ for ent in doc.ents:
179
+ start_tok = start_sentence + ent.start_char
180
+ end_tok = start_tok + len(ent.text)
181
+ entities.append((
182
+ start_tok,
183
+ end_tok,
184
+ ent.text,
185
+ ent.label_,
186
+ ent._.kb_qid,
187
+ ent._.url_wikidata,
188
+ ent._.nerd_score
189
+ ))
190
+ start_sentence = end_sentence
191
+ else:
192
+ start_sentence = 0
193
+ for doc in nlp.pipe(sentences):
194
+ end_sentence = start_sentence + len(doc.text) + 1
195
+ for ent in doc.ents:
196
+ start_tok = start_sentence + ent.start_char
197
+ end_tok = start_tok + len(ent.text)
198
+ entities.append((start_tok,
199
+ end_tok,
200
+ ent.text,
201
+ ent.label_,
202
+ "",
203
+ "",
204
+ ""
205
+ ))
206
+ start_sentence = end_sentence
207
+
208
+ streamlit.success('😃 NER applied with success!')
209
+
210
+ df = pd.DataFrame(entities, columns=['START',
211
+ 'END',
212
+ 'MENTION',
213
+ 'NER LABEL',
214
+ 'QID',
215
+ 'WIKIDATA RESSOURCE (wikidata disambiguation)',
216
+ 'LINKING SCORE'
217
+ ])
218
+ df[['START', 'END']] = df[['START', 'END']].astype(int)
219
+ streamlit.write("## 🔎 Explore named entities in table: ")
220
+ streamlit.write(df)
221
+
222
+ streamlit.write("## 🔎 Explore named entities in text: ")
223
+ spacy_streamlit.visualize_ner(
224
+ {"text": plain,
225
+ "ents": [{"start": ent[0],
226
+ "end": ent[1],
227
+ "label": ent[3],
228
+ "kb_id": ent[4] if linking else "",
229
+ "kb_url": ent[5] if linking else ""
230
+ } for ent in entities]},
231
+ labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
232
+ show_table=False,
233
+ manual=True,
234
+ title="",
235
+ displacy_options={
236
+ "colors": ENTITY_COLORS
237
+ })
238
+
239
+
240
+ if __name__ == "__main__":
241
+ main()