vitaly committed
Commit: d4bb227
Parent: ff7710f

The model is very sensitive to the number of spaces between references. The issue is mitigated by stripping each line and removing extra spaces between lines.
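
For context, the normalization this commit introduces in split_up_references boils down to stripping every line and re-joining the non-empty lines with a single space before running the model. A minimal sketch of that step follows; the helper name normalize_references is hypothetical and not part of the commit:

def normalize_references(references: str) -> str:
    # Hypothetical helper mirroring the normalization added in split_up_references:
    # strip each line, drop blank lines, and join with a single space so the model
    # never sees a variable number of spaces between references.
    lines = references.splitlines()
    return " ".join(line.strip() for line in lines if line.strip())


raw = "Ref one, 2005\n\n  Ref two, 2016  \nRef three, 2018"
print(normalize_references(raw))
# -> "Ref one, 2005 Ref two, 2016 Ref three, 2018"

The diff then aligns annotations from the doc built on this normalized text back onto a doc built from the unmodified input via spacy.training.Example, instead of relying on raw character offsets as before.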

Files changed (1)
  1. app.py +72 -54
app.py CHANGED
@@ -1,10 +1,13 @@
 import io
+
 import gradio as gr
+import numpy as np
 import spacy
 from spacy import displacy
+from spacy.training import Example
 
 from bib_tokenizers import create_references_tokenizer
-from schema import tags_ent
+from schema import spankey_sentence_start, tags_ent
 
 
 nlp = spacy.load("en_bib_references_trf")
@@ -18,24 +21,18 @@ nlp.get_pipe("spancat").cfg["threshold"] = 0.0 # see )
 print(nlp.get_pipe("spancat").cfg)
 
 
-def create_bib_item_start_scorer_for_doc(doc, spanskey="sc"):
+def create_bib_item_start_scorer_for_doc(doc):
 
-    span_group = doc.spans[spanskey]
+    span_group = doc.spans[spankey_sentence_start]
     assert not span_group.has_overlap
     assert len(span_group) == len(
         doc
     ), "Check suggester config and the spancat threshold to make sure that spangroup contains single token span for each token"
 
-    spans_idx = {
-        offset: span.start
-        for span in span_group
-        for offset in range(span.start_char, span.end_char + 1)
-    }
-
-    def scorer(char_offset, fuzzy_in_tokens=(0, 0)):
-        i = spans_idx[char_offset]
+    def scorer(token_index_in_doc, fuzzy_in_tokens=(0, 0)):
+        i = token_index_in_doc
 
-        span = span_group[i]
+        span = span_group[i]  # our spans are one token length
         assert i == span.start
 
         # fuzzines might improve fault tolerance if the model made a small mistake,
@@ -56,7 +53,7 @@ nlp_blank.tokenizer = create_references_tokenizer()(nlp_blank)
 
 
 def split_up_references(
-    references: str, is_eol_mode=False, nlp=nlp, nlp_blank=nlp_blank
+    references: str, is_eol_mode=False, ner=True, nlp=nlp, nlp_blank=nlp_blank
 ):
     """
     Args:
@@ -65,63 +62,82 @@ def split_up_references(
         nlp_blank - a blank nlp with the same tokenizer/language
     """
 
-    normalized_references = references.replace("\n", " ")
-
-    # the model trained on 'normalized' references - the ones without '\n'
-    doc = nlp(normalized_references)
-
-    # 'transfer' annotations from doc without '\n' (normalized references) to the target doc created from the original input string
-    # the problem here is that docs differ in a number of tokens
-    # however, it should be easy to align on characters level because both '\n' and ' ' are whitespace, so spans have the same boundaries
-
     target_doc = nlp_blank(references)
     target_tokens_idx = {
         offset: t.i for t in target_doc for offset in range(t.idx, t.idx + len(t))
     }
+    f = io.StringIO(references)
+    lines = [line for line in f]
 
-    # senter annotations
-    for i, t in enumerate(target_doc):
-        t.is_sent_start = i == 0
+    # disable unused components to speedup inference && parse normalized referenences
+    disable = []
     if is_eol_mode:
+        disable.append("senter")
+    else:
+        disable.append("spancat")
+    if not ner:
+        disable.append("ner")
+    with nlp.select_pipes(disable=disable):
+        # normalization applied: strip lines and remove any extra space between lines
+        norm_doc = nlp(" ".join([line.strip() for line in lines if line.strip()]))
+
+    # extremely useful spacy API for alignment normalized and target(created from non-modified input) docs
+    example = Example(target_doc, norm_doc)
+
+    if is_eol_mode:
+        alignment_data = example.alignment.y2x.data
+
         # use SpanCat scores to set sentence boundaries on the target doc
+        # init senter annotations
+        for i, t in enumerate(target_doc):
+            t.is_sent_start = i == 0
+
         char_offset = 0
-        f = io.StringIO(references)
-        token_scorer = create_bib_item_start_scorer_for_doc(doc)
-        threshold = 0.2
-        lines = [line for line in f]
-        lines_len_in_tokens = [
-            _len for _len in map(lambda line: len(nlp_blank.tokenizer(line)), lines)
-        ]
+        token_scorer = create_bib_item_start_scorer_for_doc(norm_doc)
+        threshold = 0.5
         for line_num, line in enumerate(lines):
-            fuzzy = (
-                0 if line_num == 0 else lines_len_in_tokens[line_num - 1] // 4,
-                lines_len_in_tokens[line_num] // 4,
-            )
-            span, score = token_scorer(char_offset, fuzzy_in_tokens=fuzzy)
-            print(span, score)
-            if score > threshold:
-                target_doc[target_tokens_idx[char_offset]].is_sent_start = True
+            if not line.strip():
+                # ignore empty line
+                char_offset += len(line)
+                continue
+
+            token_index_in_target_doc = target_tokens_idx[char_offset]
+            # scroll to the first non-space (if the line starts from space):
+            while (
+                token_index_in_target_doc < len(target_doc)
+                and target_doc[token_index_in_target_doc].is_space
+            ):
+                token_index_in_target_doc += 1
+
+            index_in_norm_doc = np.where(alignment_data == token_index_in_target_doc)
+            if type(index_in_norm_doc) == tuple:
+                index_in_norm_doc = index_in_norm_doc[0]  # depends on numpy version...
+
+            if index_in_norm_doc.size > 0:
+                index_in_norm_doc = index_in_norm_doc[0].item()
+                span, score = token_scorer(index_in_norm_doc)
+                print(span, score, index_in_norm_doc)
+                if score > threshold:
+                    target_doc[target_tokens_idx[char_offset]].is_sent_start = True
+
             char_offset += len(line)
     else:
         # copy SentenceRecognizer annotations from doc without '\n' to the target doc
-        for t in doc:
-            if t.is_sent_start:
-                target_doc[target_tokens_idx[t.idx]].is_sent_start = True
+        sent_start = example.get_aligned("SENT_START")
+        for i, t in enumerate(target_doc):
+            target_doc[i].is_sent_start = sent_start[i] == 1
 
     # copy ner annotations:
-    target_doc.ents = [
-        target_doc.char_span(ent.start_char, ent.end_char, ent.label_)
-        for ent in doc.ents
-        # remove entities crossing sentence boundaries
-        if not any([t.is_sent_start for t in ent if t.i != ent.start])
-    ]
+    for label in tags_ent:
+        target_doc.vocab[label]
+    target_doc.ents = example.get_aligned_spans_y2x(norm_doc.ents)
 
     return target_doc
 
 
 def text_analysis(text, is_eol_mode):
 
-    if not text:
+    if not text or not text.strip():
        return "<div style='max-width:100%; overflow:auto; color:grey'><p>Unparsed Bibliography Section is empty</p></div>"
 
     doc_with_linebreaks = split_up_references(
@@ -173,18 +189,19 @@ def text_analysis(text, is_eol_mode):
     return html
 
 
+gr.close_all()
 demo = gr.Blocks()
 with demo:
 
     textbox = gr.components.Textbox(
         label="Unparsed Bibliography Section",
-        placeholder="Enter bibliography here... It will be splitted up into separate references.",
+        placeholder="Enter bibliography here...",
         lines=20,
     )
     is_eol_mode = gr.components.Checkbox(
-        label="My Unparsed Bibliography does not contain more than one reference per line (Multiline references are supported regardless of this choice)"
+        label="a line does not contain more than one bibitem (Multiline bibitems are supported regardless of this choice)"
     )
-    html = gr.components.HTML(label="Parsed References")
+    html = gr.components.HTML(label="Parsed Bib Items")
     textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
     is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
 
@@ -219,7 +236,8 @@ CFR
 [Knu] Donald Knuth. Knuth: Computers and typesetting."""
     ],
     [
-        """Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
+        """References.
+Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
 Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
 Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
 Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100