vitaly committed on
Commit
0f8d97e
1 Parent(s): bdbeeec

Upload gradio_app.py

Browse files
Files changed (1) hide show
  1. gradio_app.py +223 -0
gradio_app.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import gradio as gr
3
+ import spacy
4
+ from spacy import displacy
5
+
6
+ from bib_tokenizers import create_references_tokenizer
7
+
8
+
9
# Trained pipeline that annotates a (normalized) references section.
nlp = spacy.load("spacy-pipelines/model-best")

# Return a score for each token:
# with the threshold set to zero every suggested span is returned, and span == token,
# because the suggester is configured to suggest spans with len(span) == 1:
#   [components.spancat.suggester]
#   @misc = "spacy.ngram_suggester.v1"
#   sizes = [1]
_spancat = nlp.get_pipe("spancat")
_spancat.cfg["threshold"] = 0.0
# Echo the effective spancat configuration at start-up.
print(_spancat.cfg)
19
+
20
+
21
def create_bib_item_start_scorer_for_doc(doc, spanskey="sc"):
    """Build a scorer that maps a character offset to its token span and score.

    Args:
        doc: a processed doc whose ``doc.spans[spanskey]`` group contains exactly
            one non-overlapping span per token and keeps the per-span scores in
            ``attrs["scores"]``.
        spanskey: key of the span group inside ``doc.spans``.

    Returns:
        A function ``scorer(char_offset, fuzzy_in_tokens=(0, 0))`` returning
        ``(span, score)``, where ``span`` is the span covering ``char_offset`` and
        ``score`` is the highest score found in the window of
        ``fuzzy_in_tokens[0]`` spans before and ``fuzzy_in_tokens[1]`` spans after
        that span (the window is clamped to the existing spans).

    Raises:
        AssertionError: if the span group overlaps or does not hold one span per
            token.
        KeyError: (from ``scorer``) if ``char_offset`` is not covered by any span.
    """
    span_group = doc.spans[spanskey]
    assert not span_group.has_overlap
    assert len(span_group) == len(
        doc
    ), "Check suggester config and the spancat threshold to make sure that spangroup contains single token span for each token"

    # Character offset -> index of the covering span. The range is inclusive of
    # ``end_char`` so the character right after a token resolves to that token too.
    spans_idx = {
        offset: span.start
        for span in span_group
        for offset in range(span.start_char, span.end_char + 1)
    }
    last_index = len(span_group) - 1

    def scorer(char_offset, fuzzy_in_tokens=(0, 0)):
        i = spans_idx[char_offset]

        span = span_group[i]
        assert i == span.start

        # Fuzziness might improve fault tolerance if the model made a small mistake,
        # e.g., if a number from the previous line is classified as "citation number",
        # see example at https://www.deeplearningbook.org/contents/bib.html
        # If fuzzy == (0, 0), the score of the selected span only is returned.
        # The window is clamped to valid span indices; bounding it by the text
        # length in characters would let it run past the last span.
        first = max(i - fuzzy_in_tokens[0], 0)
        last = min(i + fuzzy_in_tokens[1], last_index)
        scores = span_group.attrs["scores"]
        return span, max(scores[j] for j in range(first, last + 1))

    return scorer
52
+
53
+
54
# Blank English pipeline that only tokenizes; it gets the references tokenizer
# produced by the factory returned from create_references_tokenizer().
nlp_blank = spacy.blank("en")
_tokenizer_factory = create_references_tokenizer()
nlp_blank.tokenizer = _tokenizer_factory(nlp_blank)
56
+
57
+
58
def split_up_references(
    references: str, is_eol_mode=False, nlp=nlp, nlp_blank=nlp_blank
):
    """Annotate a references section with bib item boundaries and entities.

    The text is processed by ``nlp`` with line breaks replaced by spaces, and the
    resulting annotations are transferred onto a doc created by ``nlp_blank``
    from the original, unmodified text.

    Args:
        references: a references section, ideally without a header.
        is_eol_mode: if True, a bib item may only start at the beginning of a
            line and the decision is taken from the span scores; otherwise the
            sentence starts found by ``nlp`` are copied.
        nlp: a model that splits up references into separate sentences.
        nlp_blank: a blank nlp with the same tokenizer/language.

    Returns:
        The doc built from the original ``references`` with ``is_sent_start``
        set on every token and the entities that fit inside a single bib item.
    """
    # The model was trained on 'normalized' references - the ones without '\n'.
    doc = nlp(references.replace("\n", " "))

    # 'Transfer' annotations from the doc without '\n' (normalized references) to
    # the target doc created from the original input string. The docs may differ
    # in the number of tokens, but they align on the character level because both
    # '\n' and ' ' are whitespace, so spans have the same boundaries.
    target_doc = nlp_blank(references)
    target_tokens_idx = {
        offset: t.i for t in target_doc for offset in range(t.idx, t.idx + len(t))
    }

    # Sentence annotations: start with a single sentence covering everything.
    for i, t in enumerate(target_doc):
        t.is_sent_start = i == 0

    if is_eol_mode:
        # Use SpanCat scores to set sentence boundaries on the target doc.
        token_scorer = create_bib_item_start_scorer_for_doc(doc)
        threshold = 0.2
        lines = list(io.StringIO(references))
        lines_len_in_tokens = [len(nlp_blank.tokenizer(line)) for line in lines]
        char_offset = 0
        for line_num, line in enumerate(lines):
            # Look a quarter of the previous line back and a quarter of the
            # current line ahead when scoring the first character of the line.
            fuzzy = (
                0 if line_num == 0 else lines_len_in_tokens[line_num - 1] // 4,
                lines_len_in_tokens[line_num] // 4,
            )
            span, score = token_scorer(char_offset, fuzzy_in_tokens=fuzzy)
            print(span, score)
            if score > threshold:
                target_doc[target_tokens_idx[char_offset]].is_sent_start = True
            char_offset += len(line)
    else:
        # Copy SentenceRecognizer annotations from the doc without '\n' to the
        # target doc.
        for t in doc:
            if t.is_sent_start:
                target_doc[target_tokens_idx[t.idx]].is_sent_start = True

    # Copy NER annotations.
    target_ents = []
    for ent in doc.ents:
        target_ent = target_doc.char_span(ent.start_char, ent.end_char, ent.label_)
        if target_ent is None:
            # The character range does not align with the target tokenization.
            continue
        # Remove entities crossing sentence boundaries. The check runs on the
        # target doc because in EOL mode the boundaries exist only there.
        if any(t.is_sent_start for t in target_ent if t.i != target_ent.start):
            continue
        target_ents.append(target_ent)
    target_doc.ents = target_ents

    return target_doc
120
+
121
+
122
def text_analysis(text, is_eol_mode):
    """Render every bib item found in ``text`` as displaCy entity HTML.

    Args:
        text: the unparsed bibliography section.
        is_eol_mode: forwarded to ``split_up_references``.

    Returns:
        A single HTML string: the rendered bib items, in order, wrapped in a
        scrollable ``<div>``.
    """
    parsed = split_up_references(
        text, is_eol_mode=is_eol_mode, nlp=nlp, nlp_blank=nlp_blank
    )

    rendered_items = []
    for item_number, sentence in enumerate(parsed.sents, start=1):
        item_doc = sentence.as_doc()
        # displaCy shows the "title" entry of user_data above the rendered text.
        item_doc.user_data = {"title": f"***** Bib Item {item_number}: *****"}
        rendered_items.append(displacy.render(item_doc, style="ent"))

    opening = "<div style='max-width:100%; max-height:360px; overflow:auto'>"
    closing = "</div>"
    return opening + "".join(rendered_items) + closing
142
+
143
+
144
# Gradio UI: a text box and a mode checkbox feed text_analysis, whose HTML output
# is shown below them; a set of sample bibliographies can be loaded into the box.
with gr.Blocks() as demo:

    textbox = gr.components.Textbox(
        lines=20,
        label="Unparsed Bibliography Section",
        placeholder="Enter bibliography here...",
    )
    is_eol_mode = gr.components.Checkbox(
        label="a line does not contain more than one bibitem (Multiline bibitems are supported regardless of this choice)"
    )
    html = gr.components.HTML(label="Parsed Bib Items")

    # Re-run the analysis whenever the text or the mode changes.
    for _trigger in (textbox, is_eol_mode):
        _trigger.change(
            fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html]
        )

    gr.Examples(
        inputs=textbox,
        examples=[
            [
                """[1] B. Foxman, R. Barlow, H. D'Arcy, B. Gillespie, and J. D. Sobel, "Urinary tract infection: self-reported incidence and associated costs," Ann Epidemiol, vol. 10, pp. 509-515, 2000. [2] B. Foxman, "Epidemiology of urinary tract infections: incidence, morbidity, and economic costs," Am J Med, vol. 113, pp. 5-13, 2002. [3] L. Nicolle, "Urinary tract infections in the elderly," Clin Geriatr Med, vol. 25, pp. 423-436, 2009."""
            ],
            [
                """Barth, Fredrik, ed.
1969 Ethnic groups and boundaries: The social organization of culture difference. Oslo: Scandinavian University Press.
Bondokji, Neven
2016 The Expectation Gap in Humanitarian Operations: Field Perspectives from Jordan. Asian Journal of Peace Building 4(1):1-28.
Bourdieu, Pierre
The forms of capital In Handbook of Theory and Research for the Sociology of Education. J. Richardson, ed. Pp. 241-258. New York: Greenwood Publishesrs.
Carrion, Doris
2015 Are Syrian Refguees a Security Threat to the MIddle East Vol. 2016. London Reuters.
CFR
2016 The Global Humanitarian Regime: Priorities and Prospects for Reform. Council on Foerign Relations, International Institutues and Global Governance Program"""
            ],
            [
                """(2) Hofmann, M.H. et al. Aberrant splicing caused by single nucleotide polymorphism c.516G>T [Q172H], a marker of CYP2B6*6, is responsible for decreased expression and activity of CYP2B6 in liver. J Pharmacol Exp Ther 325, 284-92 (2008).
(3) Zanger, U.M. & Klein, K. Pharmacogenetics of cytochrome P450 2B6 (CYP2B6): advances on polymorphisms, mechanisms, and clinical relevance. Front Genet 4, 24 (2013).
(4) Holzinger, E.R. et al. Genome-wide association study of plasma efavirenz pharmacokinetics in AIDS Clinical Trials Group protocols implicates several CYP2B6 variants. Pharmacogenet Genomics 22, 858-67 (2012).
"""
            ],
            [
                """[Ein05] Albert Einstein. Zur Elektrodynamik bewegter K ̈orper. (German)
[On the electrodynamics of moving bodies]. Annalen der Physik,
322(10):891–921, 1905.
[GMS93] Michel Goossens, Frank Mittelbach, and Alexander Samarin. The LATEX Companion. Addison-Wesley, Reading, Massachusetts, 1993.
[Knu] Donald Knuth. Knuth: Computers and typesetting."""
            ],
            [
                """References
Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100
Beuther, H., Walsh, A. J., Thorwirth, S., et al. 2007, A&A, 466, 989
Brogan, C. L., Hunter, T. R., Cyganowski, C. J., et al. 2011, ApJ, 739, L16
Brown, A. T., Little, L. T., MacDonald, G. H., Riley, P. W., & Matheson, D. N.
1981, MNRAS, 195, 607
Brown, R. D. & Cragg, D. M. 1991, ApJ, 378, 445
Carrasco-González, C., Sanna, A., Rodríguez-Kamenetzky, A., et al. 2021, ApJ,
914, L1
Cesaroni, R., Walmsley, C. M., & Churchwell, E. 1992, A&A, 256, 618
Cheung, A. C., Rank, D. M., Townes, C. H., Thornton, D. D., & Welch, W. J.
1968, Phys. Rev. Lett., 21, 1701
Churchwell, E., Babler, B. L., Meade, M. R., et al. 2009, PASP, 121, 213
Cohen, R. J. & Brebner, G. C. 1985, MNRAS, 216, 51P
Comito, C., Schilke, P., Endesfelder, U., Jiménez-Serra, I., & Martín-Pintado, J.
2007, A&A, 469, 207
Curiel, S., Ho, P. T. P., Patel, N. A., et al. 2006, ApJ, 638, 878
Danby, G., Flower, D. R., Valiron, P., Schilke, P., & Walmsley, C. M. 1988,
MNRAS, 235, 229
De Buizer, J. M., Liu, M., Tan, J. C., et al. 2017, ApJ, 843, 33
De Buizer, J. M., Radomski, J. T., Telesco, C. M., & Piña, R. K. 2003, ApJ, 598,
1127
Dzib, S., Loinard, L., Rodríguez, L. F., Mioduszewski, A. J., & Torres, R. M.
2011, ApJ, 733, 71
Flower, D. R., Offer, A., & Schilke, P. 1990, MNRAS, 244, 4P
Galván-Madrid, R., Keto, E., Zhang, Q., et al. 2009, ApJ, 706, 1036"""
            ],
        ],
    )

# Serve on all interfaces at a fixed port, without a public share link.
demo.launch(server_name="0.0.0.0", server_port=7080, share=False)