The model is very sensitive to the number of spaces between references. The issue is mitigated by stripping each line and removing extra spaces between lines.
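A minimal, self-contained sketch of the normalization this commit applies (the sample string is illustrative; the join/strip expression mirrors the line added in app.py below):

# The model was trained on single-spaced references without '\n', so the
# raw textarea input is stripped per line and re-joined with one space.
raw = "Knuth, D. 1984, The TeXbook\n\n  Lamport, L. 1994, LaTeX  \n"
normalized = " ".join(line.strip() for line in raw.splitlines() if line.strip())
print(normalized)  # Knuth, D. 1984, The TeXbook Lamport, L. 1994, LaTeX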
app.py CHANGED
@@ -1,10 +1,13 @@
 import io
+
 import gradio as gr
+import numpy as np
 import spacy
 from spacy import displacy
+from spacy.training import Example

 from bib_tokenizers import create_references_tokenizer
-from schema import tags_ent
+from schema import spankey_sentence_start, tags_ent


 nlp = spacy.load("en_bib_references_trf")
@@ -18,24 +21,18 @@ nlp.get_pipe("spancat").cfg["threshold"] = 0.0  # see )
 print(nlp.get_pipe("spancat").cfg)


-def create_bib_item_start_scorer_for_doc(doc
+def create_bib_item_start_scorer_for_doc(doc):

-    span_group = doc.spans[
+    span_group = doc.spans[spankey_sentence_start]
     assert not span_group.has_overlap
     assert len(span_group) == len(
         doc
     ), "Check suggester config and the spancat threshold to make sure that spangroup contains single token span for each token"

-    spans_idx = {
-        offset: span.start
-        for span in span_group
-        for offset in range(span.start_char, span.end_char + 1)
-    }
-
-    def scorer(char_offset, fuzzy_in_tokens=(0, 0)):
-        i = spans_idx[char_offset]
+    def scorer(token_index_in_doc, fuzzy_in_tokens=(0, 0)):
+        i = token_index_in_doc

-        span = span_group[i]
+        span = span_group[i]  # our spans are one token length
         assert i == span.start

         # fuzzines might improve fault tolerance if the model made a small mistake,
@@ -56,7 +53,7 @@ nlp_blank.tokenizer = create_references_tokenizer()(nlp_blank)


 def split_up_references(
-    references: str, is_eol_mode=False, nlp=nlp, nlp_blank=nlp_blank
+    references: str, is_eol_mode=False, ner=True, nlp=nlp, nlp_blank=nlp_blank
 ):
     """
     Args:
@@ -65,63 +62,82 @@ def split_up_references(
         nlp_blank - a blank nlp with the same tokenizer/language
     """

-    normalized_references = references.replace("\n", " ")
-
-    # the model trained on 'normalized' references - the ones without '\n'
-    doc = nlp(normalized_references)
-
-    # 'transfer' annotations from doc without '\n' (normalized references) to the target doc created from the original input string
-    # the problem here is that docs differ in a number of tokens
-    # however, it should be easy to align on characters level because both '\n' and ' ' are whitespace, so spans have the same boundaries
-
     target_doc = nlp_blank(references)
     target_tokens_idx = {
         offset: t.i for t in target_doc for offset in range(t.idx, t.idx + len(t))
     }
+    f = io.StringIO(references)
+    lines = [line for line in f]

-    # init senter annotations
-    for i, t in enumerate(target_doc):
-        t.is_sent_start = i == 0
+    # disable unused components to speedup inference && parse normalized referenences
+    disable = []
     if is_eol_mode:
+        disable.append("senter")
+    else:
+        disable.append("spancat")
+    if not ner:
+        disable.append("ner")
+    with nlp.select_pipes(disable=disable):
+        # normalization applied: strip lines and remove any extra space between lines
+        norm_doc = nlp(" ".join([line.strip() for line in lines if line.strip()]))
+
+    # extremely useful spacy API for alignment normalized and target(created from non-modified input) docs
+    example = Example(target_doc, norm_doc)
+
+    if is_eol_mode:
+        alignment_data = example.alignment.y2x.data
+
         # use SpanCat scores to set sentence boundaries on the target doc
+        # init senter annotations
+        for i, t in enumerate(target_doc):
+            t.is_sent_start = i == 0
+
         char_offset = 0
-        f = io.StringIO(references)
-
-        threshold = 0.2
-        lines = [line for line in f]
-        lines_len_in_tokens = [
-            _len for _len in map(lambda line: len(nlp_blank.tokenizer(line)), lines)
-        ]
+        token_scorer = create_bib_item_start_scorer_for_doc(norm_doc)
+        threshold = 0.5
         for line_num, line in enumerate(lines):
-
-
-
-
-
-
-            if
-
+            if not line.strip():
+                # ignore empty line
+                char_offset += len(line)
+                continue
+
+            token_index_in_target_doc = target_tokens_idx[char_offset]
+            # scroll to the first non-space (if the line starts from space):
+            while (
+                token_index_in_target_doc < len(target_doc)
+                and target_doc[token_index_in_target_doc].is_space
+            ):
+                token_index_in_target_doc += 1
+
+            index_in_norm_doc = np.where(alignment_data == token_index_in_target_doc)
+            if type(index_in_norm_doc) == tuple:
+                index_in_norm_doc = index_in_norm_doc[0]  # depends on numpy version...
+
+            if index_in_norm_doc.size > 0:
+                index_in_norm_doc = index_in_norm_doc[0].item()
+                span, score = token_scorer(index_in_norm_doc)
+                print(span, score, index_in_norm_doc)
+                if score > threshold:
+                    target_doc[target_tokens_idx[char_offset]].is_sent_start = True
+
             char_offset += len(line)
     else:
         # copy SentenceRecognizer annotations from doc without '\n' to the target doc
-
-
-
+        sent_start = example.get_aligned("SENT_START")
+        for i, t in enumerate(target_doc):
+            target_doc[i].is_sent_start = sent_start[i] == 1

     # copy ner annotations:
-
-    target_doc.
-
-        # remove entities crossing sentence boundaries
-        if not any([t.is_sent_start for t in ent if t.i != ent.start])
-    ]
+    for label in tags_ent:
+        target_doc.vocab[label]
+    target_doc.ents = example.get_aligned_spans_y2x(norm_doc.ents)

     return target_doc


 def text_analysis(text, is_eol_mode):

-    if not text:
+    if not text or not text.strip():
         return "<div style='max-width:100%; overflow:auto; color:grey'><p>Unparsed Bibliography Section is empty</p></div>"

     doc_with_linebreaks = split_up_references(
@@ -173,18 +189,19 @@ def text_analysis(text, is_eol_mode):
     return html


+gr.close_all()
 demo = gr.Blocks()
 with demo:

     textbox = gr.components.Textbox(
         label="Unparsed Bibliography Section",
-        placeholder="Enter bibliography here...
+        placeholder="Enter bibliography here...",
         lines=20,
     )
     is_eol_mode = gr.components.Checkbox(
-        label="
+        label="a line does not contain more than one bibitem (Multiline bibitems are supported regardless of this choice)"
     )
-    html = gr.components.HTML(label="Parsed
+    html = gr.components.HTML(label="Parsed Bib Items")
     textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
     is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])

@@ -219,7 +236,8 @@ CFR
 [Knu] Donald Knuth. Knuth: Computers and typesetting."""
         ],
         [
-            """
+            """References.
+Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
 Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
 Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
 Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100
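The pivotal piece of the new code path is spacy.training.Example, used here not for training but for its token-level alignment between the normalized doc and the target doc built from the unmodified input. A self-contained sketch of that API on a blank pipeline (toy strings; the exact alignment internals such as .data vary a little across spaCy versions, as the diff itself notes):

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
target_doc = nlp("Knuth 1984\nLamport 1994")  # original input, keeps '\n'
norm_doc = nlp("Knuth 1984 Lamport 1994")     # normalized, single-spaced

# Example(predicted, reference) computes the alignment lazily; y2x maps
# reference-side (norm_doc) token indices onto target_doc tokens, which is
# the array app.py scans with np.where.
example = Example(target_doc, norm_doc)
print(example.alignment.y2x.data)

# get_aligned / get_aligned_spans_y2x project annotations across the
# alignment, e.g. the SENT_START values and entity spans that the
# else-branch of split_up_references copies over.
print(example.get_aligned("SENT_START"))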
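The commit also wraps inference in nlp.select_pipes so that only the components the current mode needs are run; a sketch of that context manager on a throwaway pipeline:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

text = "One sentence. Another sentence."
with nlp.select_pipes(disable=["sentencizer"]):
    doc = nlp(text)  # the sentencizer is skipped inside the block
    print(doc.has_annotation("SENT_START"))  # False

doc = nlp(text)  # disabled components are restored on exit
print(doc.has_annotation("SENT_START"))  # True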
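Finally, the target_tokens_idx dict in the diff maps every character offset covered by a token to that token's index, so a line's starting offset can be converted into a token position. The same comprehension on a toy doc:

import spacy

nlp = spacy.blank("en")
doc = nlp("Knuth 1984\nLamport 1994")

# Offsets inside a token map to that token's index; a lone space between
# tokens is not itself a token, so its offset is simply absent from the
# dict (the '\n' does become a whitespace token, so offset 10 maps to it).
tokens_idx = {
    offset: t.i for t in doc for offset in range(t.idx, t.idx + len(t))
}
print(tokens_idx[0])   # 0 -> "Knuth"
print(tokens_idx[11])  # 3 -> "Lamport", first character after the newline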