update app structure + add new models to requirements
Files changed:
- .gitignore (+2 -1)
- app.py (+62 -92)
- models.md (+10 -0)
- requirements.txt (+6 -7)
.gitignore CHANGED

```diff
@@ -1,3 +1,4 @@
 Legacy.py
 .idea
-standoffconverter
+standoffconverter
+venv/
```
app.py CHANGED

```diff
@@ -1,10 +1,10 @@
+import re
+
 import streamlit
 import spacy_streamlit
 import spacy
 from lxml import etree
 import pandas as pd
-from spacy import Language
-from spacy.tokens import Doc
 
 streamlit.set_page_config(layout="wide")
 
@@ -41,20 +41,12 @@ streamlit.write("## π Input XML EAD:")
 filename = streamlit.file_uploader("Upload an XML EAD", type="xml")
 streamlit.markdown("or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/blob/main/samples/) directory")
 data = ""
-
-
-
-
 flag_model = False
+
 if filename is not None:
     data = filename.getvalue().decode("utf-8").encode("utf-8")
     if len(data) > 0:
         flag_file = True
-
-
-
-
-import re
 def ead_strategy(tree):
     # create a container for sentences and dids
     # elements
@@ -112,48 +104,7 @@ if flag_view:
     linkingicon = "β"
     streamlit.write("#### Actual Parameters:")
     streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')
-
-    "model_name": "",
-    "sentences_to_process": []
-})
-class CustomNer:
-    def __init__(self,
-                 nlp: Language,
-                 name: str,
-                 model_name: str,
-                 sentences_to_process: list):
-        self.nlp = nlp
-        self.pipeline_ner = spacy.load(model_name)
-        f_score = self.pipeline_ner.meta['performance']['ents_f']
-        recall = self.pipeline_ner.meta['performance']['ents_r']
-        precision = self.pipeline_ner.meta['performance']['ents_p']
-        mcol1, mcol2, mcol3 = streamlit.columns(3)
-        mcol1.metric("F-Score", f'{f_score:.2f}')
-        mcol2.metric("Precision", f'{precision:.2f}')
-        mcol3.metric("Recall", f'{recall:.2f}')
-        self.sentences = sentences_to_process
-
-    def __call__(self, doc: Doc):
-        start_sentence = 0
-        spans = []
-        count = 0
-        bar = streamlit.progress(count)
-        for sent in self.pipeline_ner.pipe(self.sentences):
-            # add 1 char that correspond to space added in
-            # sentences concatenation (" ".join())
-            end_sentence = start_sentence + len(sent.text) + 1
-            # recompute named entities characters offsets
-            for ent in sent.ents:
-                start = start_sentence + ent.start_char
-                end = start + len(ent.text)
-                spans.append(doc.char_span(start, end, label=ent.label_))
-            start_sentence = end_sentence
-            count += 1
-            bar.progress((count/len(sentences))*1.0)
-
-        doc.set_ents(spans)
-
-        return doc
+
 
 entities = []
 docs = []
```
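Both the removed `CustomNer.__call__` above and the inline loop introduced below rely on the same offset arithmetic: NER runs sentence by sentence, so each entity's character offsets are local to its sentence and must be shifted by the cumulative length of all preceding sentences, plus one character per join separator. A minimal sketch of that bookkeeping, with made-up sentences and entity offsets (not taken from the app):

```python
# Illustrative only: map sentence-local entity offsets back into the
# concatenated text. Each sentence shifts later offsets by len(sent) + 1,
# the +1 accounting for the separator inserted by the join.
sentences = ["Paris est une ville.", "Victor Hugo y vécut."]
plain = "\n".join(sentences)

# (start, end, label) offsets local to each sentence -- hypothetical values
local_ents = [[(0, 5, "LOC")], [(0, 11, "PER")]]

start_sentence = 0
for sent, ents in zip(sentences, local_ents):
    for start_local, end_local, label in ents:
        start = start_sentence + start_local
        end = start_sentence + end_local
        assert plain[start:end] == sent[start_local:end_local]
    start_sentence += len(sent) + 1
```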
```diff
@@ -163,27 +114,44 @@ flag_vizualize = False
 # Launch NER process:
 if flag_model:
     if streamlit.button('Launch'):
+        plain = "\n".join(sentences)
         with streamlit.spinner('Initialize NER...'):
-
-
-            huge_pipeline_linking.max_length = 5000000
-            huge_pipeline_linking.add_pipe('custom_ner', config={"model_name": model, "sentences_to_process": sentences})
+            nlp = spacy.load(model)
+            nlp.max_length = 5000000
         if linking:
-
-
+            nlp.add_pipe('entityfishing', config={"language": "fr"})
+
         with streamlit.spinner('NER processing...'):
-
-
-
-
-
-
-
-
-
-
-
-
+            if linking:
+                start_sentence = 0
+                docs = nlp.pipe(sentences, batch_size=100)
+                for doc in docs:
+                    end_sentence = start_sentence + len(doc.text) + 1
+                    for ent in doc.ents:
+                        start_tok = start_sentence + ent.start_char
+                        end_tok = start_tok + len(ent.text)
+                        entities.append((
+                            start_tok,
+                            end_tok,
+                            ent.text,
+                            ent.label_,
+                            ent._.kb_qid,
+                            ent._.url_wikidata,
+                            ent._.nerd_score
+                        ))
+                    start_sentence = end_sentence
+            else:
+                entities = [
+                    (ent.start_char,
+                     ent.end_char,
+                     ent.text,
+                     ent.label_,
+                     "",
+                     "",
+                     ""
+                    ) for ent in nlp(plain).ents
+                ]
+
 
         streamlit.success('π NER applied with success!')
 
@@ -192,6 +160,7 @@ if flag_model:
         'END',
         'MENTION',
         'NER LABEL',
+        'QID',
         'WIKIDATA RESSOURCE (wikidata disambiguation)',
         'LINKING SCORE'
     ])
```
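The rewrite drops the custom pipeline factory in favour of spacyfishing's off-the-shelf `entityfishing` component, which annotates each entity span with Wikidata linking results. A minimal sketch of the wiring, assuming a French model such as `fr_core_news_sm` is installed and an entity-fishing web service is reachable (spacyfishing queries it over HTTP):

```python
import spacy

# Stand-in model; the app loads whichever NER model the user selected.
nlp = spacy.load("fr_core_news_sm")
nlp.add_pipe("entityfishing", config={"language": "fr"})

doc = nlp("Victor Hugo est né à Besançon.")
for ent in doc.ents:
    # spacyfishing exposes its linking output as span extension attributes,
    # the same ones the app collects into its `entities` tuples.
    print(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score)
```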
```diff
@@ -199,32 +168,33 @@ if flag_model:
     streamlit.write("## π Explore named entities in table: ")
     streamlit.write(df)
 
+
     streamlit.write("## π Explore named entities in text: ")
     spacy_streamlit.visualize_ner(
-        {"text":
-
-        "
-        "
-        "
-
+        {"text": plain,
+         "ents": [{"start": ent[0],
+                   "end": ent[1],
+                   "label": ent[3],
+                   "kb_id": ent[4] if linking else "",
+                   "kb_url": ent[5] if linking else ""
+                   } for ent in entities]},
         labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
         show_table=False,
         manual=True,
         title="",
-
-
-
-
-
-
-
-
-
-
-
-
-        }
-
-
+        displacy_options={
+            "colors": {
+                "EVENT": "#ec7063",
+                "LOCATION": "#45b39d",
+                "ORGANISATION": "#f39c12",
+                "PERSON": "#3498db",
+                "TITLE": "#a569bd ",
+                "LOC": "#45b39d",
+                "MISC": "#ec7063",
+                "ORG": "#f39c12",
+                "PER": "#3498db"
+            }
+        })
+
 
 
```
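Because the entities were produced outside a single `Doc`, the app renders them with `spacy_streamlit.visualize_ner` in manual mode, which accepts a plain dict of text plus entity offsets instead of a `Doc` object. A sketch with a hypothetical payload in the same shape (to be run inside a Streamlit script):

```python
import spacy_streamlit

# Hypothetical payload shaped like the one the app builds from `entities`.
payload = {
    "text": "Victor Hugo est né à Besançon.",
    "ents": [
        {"start": 0, "end": 11, "label": "PERSON",
         "kb_id": "Q535", "kb_url": "https://www.wikidata.org/wiki/Q535"},
    ],
}

spacy_streamlit.visualize_ner(
    payload,
    labels=["PERSON"],
    show_table=False,
    manual=True,
    title="",
)
```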
models.md ADDED

```diff
@@ -0,0 +1,10 @@
+### Models available for Ner4Archives:
+
+* Corpus V2:
+    - fr_ner4archives_default_test @ https://huggingface.co/ner4archives/fr_ner4archives_default_test/resolve/main/fr_ner4archives_default_test-any-py3-none-any.whl
+    - fr_ner4archives_camembert_base @ https://huggingface.co/ner4archives/fr_ner4archives_camembert_base/resolve/main/fr_ner4archives_camembert_base-any-py3-none-any.whl
+
+* Corpus V3:
+    - fr_core_ner4archives_v3_default @ https://huggingface.co/ner4archives/fr_core_ner4archives_v3_default/resolve/main/fr_core_ner4archives_v3_default-any-py3-none-any.whl
+    - fr_core_ner4archives_v3_with_vectors @ https://huggingface.co/ner4archives/fr_core_ner4archives_v3_with_vectors/resolve/main/fr_core_ner4archives_v3_with_vectors-any-py3-none-any.whl
+    - fr_core_ner4archives_V3_camembert_base @ https://huggingface.co/ner4archives/fr_core_ner4archives_V3_camembert_base/resolve/main/fr_core_ner4archives_V3_camembert_base-any-py3-none-any.whl
```
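Each entry uses pip's direct-reference syntax (`name @ URL`), so a model installs straight from its wheel and then loads by package name. A sketch with one of the listed models:

```python
# Install first, e.g.:
#   pip install "https://huggingface.co/ner4archives/fr_core_ner4archives_v3_default/resolve/main/fr_core_ner4archives_v3_default-any-py3-none-any.whl"
import spacy

nlp = spacy.load("fr_core_ner4archives_v3_default")
doc = nlp("Fonds des Archives nationales, Paris.")  # hypothetical input
print([(ent.text, ent.label_) for ent in doc.ents])
```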
requirements.txt CHANGED

```diff
@@ -23,10 +23,6 @@ defusedxml==0.7.1
 entrypoints==0.4
 executing==0.9.1
 fastjsonschema==2.16.1
-fr-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.3.0/fr_core_news_sm-3.3.0-py3-none-any.whl
-fr-ner4archives-default-test @ https://huggingface.co/ner4archives/fr_ner4archives_default_test/resolve/main/fr_ner4archives_default_test-any-py3-none-any.whl
-fr_ner4archives_default_vectors_lg @ https://huggingface.co/ner4archives/fr_ner4archives_default_vectors_lg/resolve/main/fr_ner4archives_default_vectors_lg-any-py3-none-any.whl
-fr_ner4archives_camembert_base @ https://huggingface.co/ner4archives/fr_ner4archives_camembert_base/resolve/main/fr_ner4archives_camembert_base-any-py3-none-any.whl
 gitdb==4.0.9
 GitPython==3.1.27
 idna==3.3
@@ -90,16 +86,16 @@ six==1.16.0
 smart-open==5.2.1
 smmap==5.0.0
 soupsieve==2.3.2.post1
-spacy==3.
+spacy==3.4.0
 spacy-legacy==3.0.9
 spacy-loggers==1.0.3
 spacy-streamlit==1.0.4
-spacyfishing==0.1.
+spacyfishing==0.1.8
 srsly==2.4.4
 stack-data==0.3.0
 streamlit==1.11.1
 terminado==0.15.0
-thinc==8.
+thinc==8.1.2
 tinycss2==1.1.1
 toml==0.10.2
 toolz==0.12.0
@@ -118,3 +114,6 @@ wcwidth==0.2.5
 webencodings==0.5.1
 widgetsnbextension==3.6.1
 zipp==3.8.1
+fr_core_ner4archives_v3_default @ https://huggingface.co/ner4archives/fr_core_ner4archives_v3_default/resolve/main/fr_core_ner4archives_v3_default-any-py3-none-any.whl
+fr_core_ner4archives_v3_with_vectors @ https://huggingface.co/ner4archives/fr_core_ner4archives_v3_with_vectors/resolve/main/fr_core_ner4archives_v3_with_vectors-any-py3-none-any.whl
+fr_core_ner4archives_V3_camembert_base @ https://huggingface.co/ner4archives/fr_core_ner4archives_V3_camembert_base/resolve/main/fr_core_ner4archives_V3_camembert_base-any-py3-none-any.whl
```
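A quick way to confirm a rebuilt environment picked up the new pins (a sketch; run after `pip install -r requirements.txt`):

```python
import spacy
import thinc

# The pins this commit sets; spaCy 3.4.x expects thinc 8.1.x.
assert spacy.__version__ == "3.4.0"
assert thinc.__version__ == "8.1.2"
print(f"spaCy {spacy.__version__} / thinc {thinc.__version__} OK")
```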