lterriel committed on
Commit
c1ac802
·
1 Parent(s): 778fde6

update app structure + add new models to requirements

Browse files
Files changed (4) hide show
  1. .gitignore +2 -1
  2. app.py +62 -92
  3. models.md +10 -0
  4. requirements.txt +6 -7
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  Legacy.py
2
  .idea
3
- standoffconverter
 
 
1
  Legacy.py
2
  .idea
3
+ standoffconverter
4
+ venv/
app.py CHANGED
@@ -1,10 +1,10 @@
 
 
1
  import streamlit
2
  import spacy_streamlit
3
  import spacy
4
  from lxml import etree
5
  import pandas as pd
6
- from spacy import Language
7
- from spacy.tokens import Doc
8
 
9
  streamlit.set_page_config(layout="wide")
10
 
@@ -41,20 +41,12 @@ streamlit.write("## 📄 Input XML EAD:")
41
  filename = streamlit.file_uploader("Upload an XML EAD", type="xml")
42
  streamlit.markdown("or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/blob/main/samples/) directory")
43
  data = ""
44
-
45
-
46
-
47
-
48
  flag_model = False
 
49
  if filename is not None:
50
  data = filename.getvalue().decode("utf-8").encode("utf-8")
51
  if len(data) > 0:
52
  flag_file = True
53
-
54
-
55
-
56
-
57
- import re
58
  def ead_strategy(tree):
59
  # create a container for sentences and dids
60
  # elements
@@ -112,48 +104,7 @@ if flag_view:
112
  linkingicon = "❌"
113
  streamlit.write("#### Actual Parameters:")
114
  streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')
115
- @Language.factory("custom_ner", default_config={
116
- "model_name": "",
117
- "sentences_to_process": []
118
- })
119
- class CustomNer:
120
- def __init__(self,
121
- nlp: Language,
122
- name: str,
123
- model_name: str,
124
- sentences_to_process: list):
125
- self.nlp = nlp
126
- self.pipeline_ner = spacy.load(model_name)
127
- f_score = self.pipeline_ner.meta['performance']['ents_f']
128
- recall = self.pipeline_ner.meta['performance']['ents_r']
129
- precision = self.pipeline_ner.meta['performance']['ents_p']
130
- mcol1, mcol2, mcol3 = streamlit.columns(3)
131
- mcol1.metric("F-Score", f'{f_score:.2f}')
132
- mcol2.metric("Precision", f'{precision:.2f}')
133
- mcol3.metric("Recall", f'{recall:.2f}')
134
- self.sentences = sentences_to_process
135
-
136
- def __call__(self, doc: Doc):
137
- start_sentence = 0
138
- spans = []
139
- count = 0
140
- bar = streamlit.progress(count)
141
- for sent in self.pipeline_ner.pipe(self.sentences):
142
- # add 1 char that correspond to space added in
143
- # sentences concatenation (" ".join())
144
- end_sentence = start_sentence + len(sent.text) + 1
145
- # recompute named entities characters offsets
146
- for ent in sent.ents:
147
- start = start_sentence + ent.start_char
148
- end = start + len(ent.text)
149
- spans.append(doc.char_span(start, end, label=ent.label_))
150
- start_sentence = end_sentence
151
- count += 1
152
- bar.progress((count/len(sentences))*1.0)
153
-
154
- doc.set_ents(spans)
155
-
156
- return doc
157
 
158
  entities = []
159
  docs = []
@@ -163,27 +114,44 @@ flag_vizualize = False
163
  # Launch NER process:
164
  if flag_model:
165
  if streamlit.button('Launch'):
 
166
  with streamlit.spinner('Initialize NER...'):
167
-
168
- huge_pipeline_linking = spacy.blank("fr")
169
- huge_pipeline_linking.max_length = 5000000
170
- huge_pipeline_linking.add_pipe('custom_ner', config={"model_name": model, "sentences_to_process": sentences})
171
  if linking:
172
- huge_pipeline_linking.add_pipe('entityfishing', config={"language": "fr"})
173
-
174
  with streamlit.spinner('NER processing...'):
175
- doc = huge_pipeline_linking(plain)
176
- #doc = huge_pipeline_linking(plain)
177
-
178
- entities = [
179
- (ent.start_char,
180
- ent.end_char,
181
- ent.text,
182
- ent.label_,
183
- ent._.url_wikidata if linking else None,
184
- ent._.nerd_score if linking else None
185
- ) for ent in doc.ents
186
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
  streamlit.success('😃 NER applied with success!')
189
 
@@ -192,6 +160,7 @@ if flag_model:
192
  'END',
193
  'MENTION',
194
  'NER LABEL',
 
195
  'WIKIDATA RESSOURCE (wikidata disambiguation)',
196
  'LINKING SCORE'
197
  ])
@@ -199,32 +168,33 @@ if flag_model:
199
  streamlit.write("## 🔎 Explore named entities in table: ")
200
  streamlit.write(df)
201
 
 
202
  streamlit.write("## 🔎 Explore named entities in text: ")
203
  spacy_streamlit.visualize_ner(
204
- {"text": doc.text, "ents": [{"start": ent.start_char,
205
- "end": ent.end_char,
206
- "label": ent.label_,
207
- "kb_id": ent._.kb_qid if linking else None,
208
- "kb_url": ent._.url_wikidata if linking else None
209
- } for ent in doc.ents]},
 
210
  labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
211
  show_table=False,
212
  manual=True,
213
  title="",
214
- displacy_options={
215
- "colors": {
216
- "EVENT": "#ec7063",
217
- "LOCATION": "#45b39d",
218
- "ORGANISATION": "#f39c12",
219
- "PERSON": "#3498db",
220
- "TITLE": "#a569bd ",
221
- "LOC": "#45b39d",
222
- "MISC": "#ec7063",
223
- "ORG": "#f39c12",
224
- "PER": "#3498db"
225
-
226
- }
227
- })
228
-
229
 
230
 
 
1
+ import re
2
+
3
  import streamlit
4
  import spacy_streamlit
5
  import spacy
6
  from lxml import etree
7
  import pandas as pd
 
 
8
 
9
  streamlit.set_page_config(layout="wide")
10
 
 
41
  filename = streamlit.file_uploader("Upload an XML EAD", type="xml")
42
  streamlit.markdown("or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/blob/main/samples/) directory")
43
  data = ""
 
 
 
 
44
  flag_model = False
45
+
46
  if filename is not None:
47
  data = filename.getvalue().decode("utf-8").encode("utf-8")
48
  if len(data) > 0:
49
  flag_file = True
 
 
 
 
 
50
  def ead_strategy(tree):
51
  # create a container for sentences and dids
52
  # elements
 
104
  linkingicon = "❌"
105
  streamlit.write("#### Actual Parameters:")
106
  streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')
107
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  entities = []
110
  docs = []
 
114
  # Launch NER process:
115
  if flag_model:
116
  if streamlit.button('Launch'):
117
+ plain = "\n".join(sentences)
118
  with streamlit.spinner('Initialize NER...'):
119
+ nlp = spacy.load(model)
120
+ nlp.max_length = 5000000
 
 
121
  if linking:
122
+ nlp.add_pipe('entityfishing', config={"language": "fr"})
123
+
124
  with streamlit.spinner('NER processing...'):
125
+ if linking:
126
+ start_sentence = 0
127
+ docs = nlp.pipe(sentences, batch_size=100)
128
+ for doc in docs:
129
+ end_sentence = start_sentence + len(doc.text) + 1
130
+ for ent in doc.ents:
131
+ start_tok = start_sentence + ent.start_char
132
+ end_tok = start_tok + len(ent.text)
133
+ entities.append((
134
+ start_tok,
135
+ end_tok,
136
+ ent.text,
137
+ ent.label_,
138
+ ent._.kb_qid,
139
+ ent._.url_wikidata,
140
+ ent._.nerd_score
141
+ ))
142
+ start_sentence = end_sentence
143
+ else:
144
+ entities = [
145
+ (ent.start_char,
146
+ ent.end_char,
147
+ ent.text,
148
+ ent.label_,
149
+ "",
150
+ "",
151
+ ""
152
+ ) for ent in nlp(plain).ents
153
+ ]
154
+
155
 
156
  streamlit.success('😃 NER applied with success!')
157
 
 
160
  'END',
161
  'MENTION',
162
  'NER LABEL',
163
+ 'QID',
164
  'WIKIDATA RESSOURCE (wikidata disambiguation)',
165
  'LINKING SCORE'
166
  ])
 
168
  streamlit.write("## 🔎 Explore named entities in table: ")
169
  streamlit.write(df)
170
 
171
+
172
  streamlit.write("## 🔎 Explore named entities in text: ")
173
  spacy_streamlit.visualize_ner(
174
+ {"text": plain,
175
+ "ents": [{"start": ent[0],
176
+ "end": ent[1],
177
+ "label": ent[3],
178
+ "kb_id": ent[4] if linking else "",
179
+ "kb_url": ent[5] if linking else ""
180
+ } for ent in entities]},
181
  labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
182
  show_table=False,
183
  manual=True,
184
  title="",
185
+ displacy_options={
186
+ "colors": {
187
+ "EVENT": "#ec7063",
188
+ "LOCATION": "#45b39d",
189
+ "ORGANISATION": "#f39c12",
190
+ "PERSON": "#3498db",
191
+ "TITLE": "#a569bd ",
192
+ "LOC": "#45b39d",
193
+ "MISC": "#ec7063",
194
+ "ORG": "#f39c12",
195
+ "PER": "#3498db"
196
+ }
197
+ })
198
+
 
199
 
200
 
models.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Models available for Ner4Archives:
2
+
3
+ * Corpus V2:
4
+ - fr_ner4archives_default_test @ https://huggingface.co/ner4archives/fr_ner4archives_default_test/resolve/main/fr_ner4archives_default_test-any-py3-none-any.whl
5
+ - fr_ner4archives_camembert_base @ https://huggingface.co/ner4archives/fr_ner4archives_camembert_base/resolve/main/fr_ner4archives_camembert_base-any-py3-none-any.whl
6
+
7
+ * Corpus V3:
8
+ - fr_core_ner4archives_v3_default @ https://huggingface.co/ner4archives/fr_core_ner4archives_v3_default/resolve/main/fr_core_ner4archives_v3_default-any-py3-none-any.whl
9
+ - fr_core_ner4archives_v3_with_vectors @ https://huggingface.co/ner4archives/fr_core_ner4archives_v3_with_vectors/resolve/main/fr_core_ner4archives_v3_with_vectors-any-py3-none-any.whl
10
+ - fr_core_ner4archives_V3_camembert_base @ https://huggingface.co/ner4archives/fr_core_ner4archives_V3_camembert_base/resolve/main/fr_core_ner4archives_V3_camembert_base-any-py3-none-any.whl
requirements.txt CHANGED
@@ -23,10 +23,6 @@ defusedxml==0.7.1
23
  entrypoints==0.4
24
  executing==0.9.1
25
  fastjsonschema==2.16.1
26
- fr-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.3.0/fr_core_news_sm-3.3.0-py3-none-any.whl
27
- fr-ner4archives-default-test @ https://huggingface.co/ner4archives/fr_ner4archives_default_test/resolve/main/fr_ner4archives_default_test-any-py3-none-any.whl
28
- fr_ner4archives_default_vectors_lg @ https://huggingface.co/ner4archives/fr_ner4archives_default_vectors_lg/resolve/main/fr_ner4archives_default_vectors_lg-any-py3-none-any.whl
29
- fr_ner4archives_camembert_base @ https://huggingface.co/ner4archives/fr_ner4archives_camembert_base/resolve/main/fr_ner4archives_camembert_base-any-py3-none-any.whl
30
  gitdb==4.0.9
31
  GitPython==3.1.27
32
  idna==3.3
@@ -90,16 +86,16 @@ six==1.16.0
90
  smart-open==5.2.1
91
  smmap==5.0.0
92
  soupsieve==2.3.2.post1
93
- spacy==3.3.1
94
  spacy-legacy==3.0.9
95
  spacy-loggers==1.0.3
96
  spacy-streamlit==1.0.4
97
- spacyfishing==0.1.7
98
  srsly==2.4.4
99
  stack-data==0.3.0
100
  streamlit==1.11.1
101
  terminado==0.15.0
102
- thinc==8.0.17
103
  tinycss2==1.1.1
104
  toml==0.10.2
105
  toolz==0.12.0
@@ -118,3 +114,6 @@ wcwidth==0.2.5
118
  webencodings==0.5.1
119
  widgetsnbextension==3.6.1
120
  zipp==3.8.1
 
 
 
 
23
  entrypoints==0.4
24
  executing==0.9.1
25
  fastjsonschema==2.16.1
 
 
 
 
26
  gitdb==4.0.9
27
  GitPython==3.1.27
28
  idna==3.3
 
86
  smart-open==5.2.1
87
  smmap==5.0.0
88
  soupsieve==2.3.2.post1
89
+ spacy==3.4.0
90
  spacy-legacy==3.0.9
91
  spacy-loggers==1.0.3
92
  spacy-streamlit==1.0.4
93
+ spacyfishing==0.1.8
94
  srsly==2.4.4
95
  stack-data==0.3.0
96
  streamlit==1.11.1
97
  terminado==0.15.0
98
+ thinc==8.1.2
99
  tinycss2==1.1.1
100
  toml==0.10.2
101
  toolz==0.12.0
 
114
  webencodings==0.5.1
115
  widgetsnbextension==3.6.1
116
  zipp==3.8.1
117
+ fr_core_ner4archives_v3_default @ https://huggingface.co/ner4archives/fr_core_ner4archives_v3_default/resolve/main/fr_core_ner4archives_v3_default-any-py3-none-any.whl
118
+ fr_core_ner4archives_v3_with_vectors @ https://huggingface.co/ner4archives/fr_core_ner4archives_v3_with_vectors/resolve/main/fr_core_ner4archives_v3_with_vectors-any-py3-none-any.whl
119
+ fr_core_ner4archives_V3_camembert_base @ https://huggingface.co/ner4archives/fr_core_ner4archives_V3_camembert_base/resolve/main/fr_core_ner4archives_V3_camembert_base-any-py3-none-any.whl