# Note: the model is very sensitive to the number of spaces between references.
# The issue is mitigated by removing extra spaces between lines before inference
# (see the normalization step in split_up_references below).
import io

import gradio as gr
import numpy as np
import spacy
from spacy import displacy
from spacy.training import Example

from bib_tokenizers import create_references_tokenizer
from schema import spankey_sentence_start, tags_ent

nlp = spacy.load("en_bib_references_trf")

# Return a score for each token:
# with the threshold set to zero, every suggested span is returned, and span == token,
# because the suggester is configured to suggest spans with len(span) == 1:
#
#   [components.spancat.suggester]
#   @misc = "spacy.ngram_suggester.v1"
#   sizes = [1]
nlp.get_pipe("spancat").cfg["threshold"] = 0.0
print(nlp.get_pipe("spancat").cfg)
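
# A minimal sketch of what this configuration yields (illustrative doc text, not part
# of the original app): every token gets exactly one candidate span, and the span's
# probability of starting a bib item is stored in the span group's "scores" attribute.
#
#   doc = nlp("[1] A. Author. A title. 2001. [2] B. Author ...")
#   span_group = doc.spans[spankey_sentence_start]
#   for span, score in zip(span_group, span_group.attrs["scores"]):
#       print(span.text, float(score))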
def create_bib_item_start_scorer_for_doc(doc):
    span_group = doc.spans[spankey_sentence_start]
    assert not span_group.has_overlap
    assert len(span_group) == len(
        doc
    ), "Check the suggester config and the spancat threshold to make sure the span group contains a single-token span for each token"

    def scorer(token_index_in_doc, fuzzy_in_tokens=(0, 0)):
        i = token_index_in_doc
        span = span_group[i]  # our spans are one token long
        assert i == span.start
        # Fuzziness might improve fault tolerance if the model makes a small mistake,
        # e.g., if a number from the previous line is classified as a "citation number";
        # see the example at https://www.deeplearningbook.org/contents/bib.html
        # If fuzzy == (0, 0), the score for the selected span alone is returned.
        return span, max(
            span_group.attrs["scores"][j]
            for j in range(i - fuzzy_in_tokens[0], i + fuzzy_in_tokens[1] + 1)
            if 0 <= j < len(doc)
        )

    return scorer
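
# Usage sketch for the scorer (hypothetical doc text, for illustration only):
#
#   norm_doc = nlp("[1] A. Author. A title. 2001.")
#   scorer = create_bib_item_start_scorer_for_doc(norm_doc)
#   span, score = scorer(0, fuzzy_in_tokens=(1, 1))  # score for token 0, tolerating +/- 1 token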
nlp_blank = spacy.blank("en")
nlp_blank.tokenizer = create_references_tokenizer()(nlp_blank)
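# The blank pipeline shares the references tokenizer, so a doc built from the raw,
# unmodified input can later be aligned token-by-token with the normalized doc.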
def split_up_references(
    references: str, is_eol_mode=False, ner=True, nlp=nlp, nlp_blank=nlp_blank
):
    """
    Args:
        references: a references section, ideally without a header
        nlp: a model that splits references into separate sentences
        nlp_blank: a blank pipeline with the same tokenizer/language
    """
    target_doc = nlp_blank(references)
    # map every character offset inside a token to that token's index, so that a
    # line's starting char offset can be resolved to a token in target_doc below
    target_tokens_idx = {
        offset: t.i for t in target_doc for offset in range(t.idx, t.idx + len(t))
    }
    f = io.StringIO(references)
    lines = f.readlines()

    # disable unused components to speed up inference, then parse the normalized references
    disable = []
    if is_eol_mode:
        disable.append("senter")
    else:
        disable.append("spancat")
    if not ner:
        disable.append("ner")
    with nlp.select_pipes(disable=disable):
        # normalization applied: strip lines and remove any extra space between lines
        norm_doc = nlp(" ".join([line.strip() for line in lines if line.strip()]))

    # extremely useful spaCy API for aligning the normalized doc with the target doc
    # (created from the unmodified input)
    example = Example(target_doc, norm_doc)
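    # example.alignment.y2x maps token indices in norm_doc (y) to token indices in
    # target_doc (x); get_aligned() and get_aligned_spans_y2x() below rely on the same
    # alignment to transfer annotations between the two tokenizations.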
    if is_eol_mode:
        # use SpanCat scores to set sentence boundaries on the target doc
        alignment_data = example.alignment.y2x.data

        # init senter annotations: only the very first token starts a sentence
        for i, t in enumerate(target_doc):
            t.is_sent_start = i == 0

        char_offset = 0
        token_scorer = create_bib_item_start_scorer_for_doc(norm_doc)
        threshold = 0.5
        for line in lines:
            if not line.strip():
                # ignore empty lines
                char_offset += len(line)
                continue
            token_index_in_target_doc = target_tokens_idx[char_offset]
            # scroll to the first non-space token (if the line starts with spaces):
            while (
                token_index_in_target_doc < len(target_doc)
                and target_doc[token_index_in_target_doc].is_space
            ):
                token_index_in_target_doc += 1
            index_in_norm_doc = np.where(alignment_data == token_index_in_target_doc)
            if isinstance(index_in_norm_doc, tuple):
                index_in_norm_doc = index_in_norm_doc[0]  # depends on the numpy version...
            if index_in_norm_doc.size > 0:
                index_in_norm_doc = index_in_norm_doc[0].item()
                span, score = token_scorer(index_in_norm_doc)
                print(span, score, index_in_norm_doc)
                if score > threshold:
                    # mark the first non-space token of the line as a bib item start
                    target_doc[token_index_in_target_doc].is_sent_start = True
            char_offset += len(line)
    else:
        # copy SentenceRecognizer annotations from the doc without '\n' to the target doc
        sent_start = example.get_aligned("SENT_START")
        for i, t in enumerate(target_doc):
            target_doc[i].is_sent_start = sent_start[i] == 1

    # copy NER annotations:
    for label in tags_ent:
        target_doc.vocab[label]  # make sure each label string is interned in the vocab
    target_doc.ents = example.get_aligned_spans_y2x(norm_doc.ents)
    return target_doc
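
# A minimal usage sketch (hypothetical input, for illustration only):
#
#   doc = split_up_references("[1] A. Author, Title, 2001.\n[2] B. Author, ...", is_eol_mode=True)
#   for i, sent in enumerate(doc.sents):
#       print(i, sent.text)
#   print([(ent.text, ent.label_) for ent in doc.ents])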
def text_analysis(text, is_eol_mode):
    if not text or not text.strip():
        return "<div style='max-width:100%; overflow:auto; color:grey'><p>Unparsed Bibliography Section is empty</p></div>"

    doc_with_linebreaks = split_up_references(
        text, is_eol_mode=is_eol_mode, nlp=nlp, nlp_blank=nlp_blank
    )
html = ""
options = {
"ents": tags_ent,
"colors": {
"citation-number": "yellow",
"citation-label": "yellow",
"family": "DeepSkyBlue",
"given": "LightSkyBlue",
"title": "PeachPuff",
"container-title": "Moccasin",
"publisher": "PaleTurquoise",
"issued": "Gold",
},
}
for i, sent in enumerate(doc_with_linebreaks.sents):
bib_item_doc = sent.as_doc()
ref = displacy.render(bib_item_doc, style="ent", options=options)
html += f"<tr><td>{i}</td><td>{ref}</td></tr>"
    html = (
        """<div style='max-width:100%; max-height:720px; overflow:auto'>
        <style>table {
          font-family: arial, sans-serif;
          border-collapse: collapse;
          width: 100%;
        }
        td, th {
          border: 1px solid #b0b0b0;
          text-align: left;
          padding: 8px;
        }
        tr:nth-child(even) {
          background-color: #f2f2f2;
        }</style>"""
        + "<table><tr><th>Index</th><th>Parsed Reference</th></tr>"
        + html
        + "</table>"
        + "</div>"
    )
    return html
gr.close_all()

demo = gr.Blocks()
with demo:
    textbox = gr.components.Textbox(
        label="Unparsed Bibliography Section",
        placeholder="Enter bibliography here...",
        lines=20,
    )
    is_eol_mode = gr.components.Checkbox(
        label="A line contains at most one bib item (multiline bib items are supported regardless of this choice)"
    )
    html = gr.components.HTML(label="Parsed Bib Items")
    textbox.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
    is_eol_mode.change(fn=text_analysis, inputs=[textbox, is_eol_mode], outputs=[html])
    gr.Examples(
        examples=[
            [
                """[1] B. Foxman, R. Barlow, H. D'Arcy, B. Gillespie, and J. D. Sobel, "Urinary tract infection: self-reported incidence and associated costs," Ann Epidemiol, vol. 10, pp. 509-515, 2000. [2] B. Foxman, "Epidemiology of urinary tract infections: incidence, morbidity, and economic costs," Am J Med, vol. 113, pp. 5-13, 2002. [3] L. Nicolle, "Urinary tract infections in the elderly," Clin Geriatr Med, vol. 25, pp. 423-436, 2009."""
            ],
            [
                """Barth, Fredrik, ed.
1969 Ethnic groups and boundaries: The social organization of culture difference. Oslo: Scandinavian University Press.
Bondokji, Neven
2016 The Expectation Gap in Humanitarian Operations: Field Perspectives from Jordan. Asian Journal of Peace Building 4(1):1-28.
Bourdieu, Pierre
The forms of capital In Handbook of Theory and Research for the Sociology of Education. J. Richardson, ed. Pp. 241-258. New York: Greenwood Publishesrs.
Carrion, Doris
2015 Are Syrian Refguees a Security Threat to the MIddle East Vol. 2016. London Reuters.
CFR
2016 The Global Humanitarian Regime: Priorities and Prospects for Reform. Council on Foerign Relations, International Institutues and Global Governance Program"""
            ],
            [
                """(2) Hofmann, M.H. et al. Aberrant splicing caused by single nucleotide polymorphism c.516G>T [Q172H], a marker of CYP2B6*6, is responsible for decreased expression and activity of CYP2B6 in liver. J Pharmacol Exp Ther 325, 284-92 (2008).
(3) Zanger, U.M. & Klein, K. Pharmacogenetics of cytochrome P450 2B6 (CYP2B6): advances on polymorphisms, mechanisms, and clinical relevance. Front Genet 4, 24 (2013).
(4) Holzinger, E.R. et al. Genome-wide association study of plasma efavirenz pharmacokinetics in AIDS Clinical Trials Group protocols implicates several CYP2B6 variants. Pharmacogenet Genomics 22, 858-67 (2012).
"""
            ],
            [
                """[Ein05] Albert Einstein. Zur Elektrodynamik bewegter Körper. (German)
[On the electrodynamics of moving bodies]. Annalen der Physik,
322(10):891–921, 1905.
[GMS93] Michel Goossens, Frank Mittelbach, and Alexander Samarin. The LATEX Companion. Addison-Wesley, Reading, Massachusetts, 1993.
[Knu] Donald Knuth. Knuth: Computers and typesetting."""
            ],
            [
                """References.
Bartkiewicz, A., Szymczak, M., Cohen, R. J., & Richards, A. M. S. 2005, MN- RAS, 361, 623
Bartkiewicz, A., Szymczak, M., & van Langevelde, H. J. 2016, A&A, 587, A104
Benjamin, R. A., Churchwell, E., Babler, B. L., et al. 2003, PASP, 115, 953
Beuther, H., Mottram, J. C., Ahmadi, A., et al. 2018, A&A, 617, A100
Beuther, H., Walsh, A. J., Thorwirth, S., et al. 2007, A&A, 466, 989
Brogan, C. L., Hunter, T. R., Cyganowski, C. J., et al. 2011, ApJ, 739, L16
Brown, A. T., Little, L. T., MacDonald, G. H., Riley, P. W., & Matheson, D. N.
1981, MNRAS, 195, 607
Brown, R. D. & Cragg, D. M. 1991, ApJ, 378, 445
Carrasco-González, C., Sanna, A., Rodríguez-Kamenetzky, A., et al. 2021, ApJ,
914, L1
Cesaroni, R., Walmsley, C. M., & Churchwell, E. 1992, A&A, 256, 618
Cheung, A. C., Rank, D. M., Townes, C. H., Thornton, D. D., & Welch, W. J.
1968, Phys. Rev. Lett., 21, 1701
Churchwell, E., Babler, B. L., Meade, M. R., et al. 2009, PASP, 121, 213
Cohen, R. J. & Brebner, G. C. 1985, MNRAS, 216, 51P
Comito, C., Schilke, P., Endesfelder, U., Jiménez-Serra, I., & Martín-Pintado, J.
2007, A&A, 469, 207
Curiel, S., Ho, P. T. P., Patel, N. A., et al. 2006, ApJ, 638, 878
Danby, G., Flower, D. R., Valiron, P., Schilke, P., & Walmsley, C. M. 1988,
MNRAS, 235, 229
De Buizer, J. M., Liu, M., Tan, J. C., et al. 2017, ApJ, 843, 33
De Buizer, J. M., Radomski, J. T., Telesco, C. M., & Piña, R. K. 2003, ApJ, 598,
1127
Dzib, S., Loinard, L., Rodríguez, L. F., Mioduszewski, A. J., & Torres, R. M.
2011, ApJ, 733, 71
Flower, D. R., Offer, A., & Schilke, P. 1990, MNRAS, 244, 4P
Galván-Madrid, R., Keto, E., Zhang, Q., et al. 2009, ApJ, 706, 1036"""
            ],
        ],
        inputs=textbox,
    )
demo.launch()