Spaces:

nianlong
/

memsum-arxiv-summarizer

Build error

memsum-arxiv-summarizer / s2orc-doc2json /doc2json /tex2json /xml_to_json.py

nianlonggu

init

02ae0bf about 1 year ago

48.5 kB

	import os
	import re
	import itertools
	import bs4
	from bs4 import BeautifulSoup, NavigableString
	from typing import List, Dict, Tuple, Optional
	import copy
	import latex2mathml.converter

	from doc2json.grobid2json.grobid.grobid_client import GrobidClient
	from doc2json.utils.grobid_util import parse_bib_entry, get_author_data_from_grobid_xml
	from doc2json.s2orc import Paper, Paragraph


	# Tag names for which process_div returns no paragraphs at all.
	SKIP_TAGS = {
	'clearpage',
	'colorpool',
	'newpage',
	'tableofcontents'
	}

	# Tag names that process_div passes to process_paragraph as body text.
	TEXT_TAGS = {
	'p',
	'proof',
	'caption'
	}


	def normalize_latex_id(latex_id: str):
	    """
	    Normalize a LaTeX-derived id into the reference-token namespace
	    :param latex_id: raw id; it is upper-cased and underscores are removed
	    :return: the id with a leading BID/CID/FORMULA swapped for BIBREF/SECREF/EQREF,
	        or the upper-cased, underscore-free id when no known prefix matches
	    """
	    str_norm = latex_id.upper().replace('_', '')
	    prefix_to_token = (
	        ('BID', 'BIBREF'),
	        ('CID', 'SECREF'),
	        ('FORMULA', 'EQREF'),
	    )
	    for prefix, token in prefix_to_token:
	        if str_norm.startswith(prefix):
	            # only the leading prefix decides the id type; later occurrences
	            # of the same letters in the remainder must stay untouched
	            return token + str_norm[len(prefix):]
	    return str_norm


	def process_author(
	        author_text: str,
	        grobid_client: GrobidClient,
	        logfile: str
	) -> List[Dict]:
	    """
	    Parse an author string into author entries
	    :param author_text: raw author string
	    :param grobid_client: client whose process_header_names is called on the string
	    :param logfile: log file path forwarded to the client
	    :return: result of get_author_data_from_grobid_xml when the client returns XML;
	        otherwise a single entry carrying the whole string as the last name
	    """
	    unparsed = [{
	        "first": "",
	        "middle": [],
	        "last": author_text,
	        "suffix": "",
	        "affiliation": {},
	        "email": ""
	    }]

	    if not author_text:
	        return unparsed

	    header_xml = grobid_client.process_header_names(author_text, logfile)
	    if not header_xml:
	        return unparsed

	    return get_author_data_from_grobid_xml(BeautifulSoup(header_xml, 'xml'))


	def process_bibentry(bib_text: str, grobid_client: GrobidClient, logfile: str):
	    """
	    Parse the text of one bibliography entry
	    :param bib_text: raw entry text, possibly spanning several lines
	    :param grobid_client: client whose process_citation is called on the flattened text
	    :param logfile: log file path forwarded to the client
	    :return: entry from parse_bib_entry (raw_text filled in when empty), or None when
	        the text is empty or the client returns nothing
	    """
	    if not bib_text:
	        return None

	    # squeeze whitespace runs inside every line, trim it, then join into one line
	    flattened = ' '.join(
	        re.sub(r'\s+', ' ', raw_line).strip()
	        for raw_line in bib_text.split('\n')
	    )

	    citation_xml = grobid_client.process_citation(flattened, logfile)
	    if not citation_xml:
	        return None

	    entry = parse_bib_entry(BeautifulSoup(citation_xml, 'lxml'))
	    if not entry['raw_text']:
	        entry['raw_text'] = flattened
	    return entry


	def replace_ref_tokens(sp: BeautifulSoup, el: bs4.element.Tag, ref_map: Dict):
	    """
	    Swap citation and reference tags inside an element for text tokens
	    :param sp: soup used to create the replacement strings
	    :param el: element whose <cit> and <ref> descendants are replaced in place
	    :param ref_map: known reference ids, used to pick the token type for uid targets
	    :return: the same element
	    """
	    # citations: <cit><ref target="bidN"/></cit> -> " BIBREFN "
	    for cit_tag in el.find_all('cit'):
	        try:
	            bib_token = cit_tag.ref.get('target').replace('bid', 'BIBREF')
	            cit_tag.replace_with(sp.new_string(f" {bib_token} "))
	        except AttributeError:
	            print('Attribute error: ', cit_tag)
	            continue

	    # candidate token types for uid targets, tried in this order
	    uid_labels = ('FIGREF', 'TABREF', 'EQREF', 'FOOTREF', 'SECREFU')

	    # every other reference whose target is not a bibliography id
	    for ref_tag in el.find_all('ref'):
	        try:
	            raw_target = ref_tag.get('target')
	            if not raw_target or raw_target.startswith('bid'):
	                continue
	            if raw_target.startswith('cid'):
	                token = raw_target.replace('cid', 'SECREF')
	            elif raw_target.startswith('uid'):
	                candidates = (raw_target.replace('uid', label) for label in uid_labels)
	                # first candidate present in ref_map wins; otherwise keep the upper-cased id
	                token = next(
	                    (cand for cand in candidates if cand in ref_map),
	                    raw_target.upper()
	                )
	            else:
	                print('Weird ID!')
	                token = raw_target.upper()
	            ref_tag.replace_with(sp.new_string(f" {token} "))
	        except AttributeError:
	            print('Attribute error: ', ref_tag)
	            continue

	    return el


	def process_list_el(sp: BeautifulSoup, list_el: bs4.element.Tag, section_info: List, bib_map: Dict, ref_map: Dict):
	    """
	    Process a list element as a sequence of paragraphs
	    :param sp: soup forwarded to process_paragraph
	    :param list_el: list tag whose <item> descendants are processed
	    :param section_info: section list attached to every produced paragraph
	    :param bib_map: bibliography map forwarded to process_paragraph
	    :param ref_map: reference map forwarded to process_paragraph
	    :return: list of Paragraph objects, one per kept item
	    """
	    # TODO: currently parsing list as a list of paragraphs (append numbers to start of each entry in ordered lists)
	    list_items = []
	    for item in list_el.find_all('item'):
	        # skip itemize settings
	        item_text = item.text.strip()
	        if item_text.startswith('[') and item_text.endswith(']'):
	            continue
	        # try processing as paragraph
	        list_num = item.get('id-text', None)
	        item_as_para = process_paragraph(sp, item, section_info, bib_map, ref_map)
	        if not list_num:
	            list_items.append(item_as_para)
	            continue
	        # ordered item: prefix the number and move every span by the prefix length
	        list_num_str = f'{list_num}. '
	        offset = len(list_num_str)
	        list_items.append(Paragraph(
	            text=list_num_str + item_as_para.text,
	            cite_spans=_shift_spans(item_as_para.cite_spans, offset),
	            ref_spans=_shift_spans(item_as_para.ref_spans, offset),
	            eq_spans=_shift_spans(item_as_para.eq_spans, offset),
	            section=item_as_para.section
	        ))
	    return list_items


	def _shift_spans(spans: List[Dict], offset: int) -> List[Dict]:
	    """Return copies of the spans with start/end moved by offset; all other keys are kept."""
	    # copying the whole span keeps keys such as ref_id and mathml that a
	    # hand-written key list would drop
	    return [
	        {**span, "start": span['start'] + offset, "end": span['end'] + offset}
	        for span in spans
	    ]


	def process_navstring(str_el: NavigableString, section_info: List):
	    """
	    Turn one NavigableString into a Paragraph
	    :param str_el: string whose whitespace is normalized and scanned for tokens
	    :param section_info: section list attached to the paragraph
	    :return: Paragraph with cite and ref spans located by token pattern and no equation spans
	    """
	    # every whitespace run becomes a single space
	    text = re.sub(r'\s+', ' ', str_el)

	    # citation tokens
	    all_cite_spans = [
	        {"start": hit.start(), "end": hit.end(), "ref_id": hit.group()}
	        for hit in re.finditer(r'(BIBREF\d+)', text)
	    ]

	    # reference tokens, grouped by type in this fixed order
	    ref_patterns = (
	        r'(FIGREF\d+)',
	        r'(TABREF\d+)',
	        r'(EQREF\d+)',
	        r'(FOOTREF\d+)',
	        r'(SECREF\d+)',
	        r'(SECREFU\d+)',
	    )
	    all_ref_spans = []
	    for pattern in ref_patterns:
	        for hit in re.finditer(pattern, text):
	            all_ref_spans.append({
	                "start": hit.start(),
	                "end": hit.end(),
	                "ref_id": hit.group()
	            })

	    # sanity check: each span must slice back to its own token
	    for span in all_cite_spans:
	        assert text[span['start']:span['end']] == span['ref_id']
	    for span in all_ref_spans:
	        assert text[span['start']:span['end']] == span['ref_id']

	    return Paragraph(
	        text=text,
	        cite_spans=all_cite_spans,
	        ref_spans=all_ref_spans,
	        eq_spans=[],
	        section=section_info
	    )


	def process_paragraph(sp: BeautifulSoup, para_el: bs4.element.Tag, section_info: List, bib_map: Dict, ref_map: Dict):
	    """
	    Convert one paragraph element into a Paragraph with located spans
	    :param sp: soup used to create replacement strings
	    :param para_el: paragraph element; it is modified in place
	    :param section_info: section list attached to the paragraph
	    :param bib_map: bibliography map, supplies the "text" of cite spans
	    :param ref_map: reference map, supplies the "text" of ref spans
	    :return: Paragraph holding the normalized text plus cite, ref and equation spans
	    """
	    # references and citations become text tokens first
	    para_el = replace_ref_tokens(sp, para_el, ref_map)

	    # formulas become INLINEFORMn / DISPLAYFORMn tokens; their content is kept on the side
	    formula_dict = {}
	    next_index = {'INLINEFORM': 0, 'DISPLAYFORM': 0}
	    for ftag in para_el.find_all('formula'):
	        try:
	            formula_id = ftag.get('id')
	            if formula_id:
	                # a formula carrying an id is treated as a display formula
	                kind = 'DISPLAYFORM'
	                ref_id = formula_id.replace('uid', 'EQREF')
	            else:
	                kind = 'INLINEFORM'
	                ref_id = None
	            formula_key = f'{kind}{next_index[kind]}'
	            next_index[kind] += 1
	            try:
	                formula_mathml = latex2mathml.converter.convert(ftag.texmath.text)
	            except Exception:
	                formula_mathml = ""
	            formula_dict[formula_key] = (ftag.math.text, ftag.texmath.text, formula_mathml, ref_id)
	            ftag.replace_with(sp.new_string(f" {formula_key} "))
	        except AttributeError:
	            continue

	    # leftover floats and notes are dropped with a warning
	    for leftover_float in para_el.find_all('float'):
	        print('Warning: still has <float/>!')
	        leftover_float.decompose()
	    for leftover_note in para_el.find_all('note'):
	        print('Warning: still has <note/>!')
	        leftover_note.decompose()

	    # every whitespace run becomes a single space
	    text = re.sub(r'\s+', ' ', para_el.text)

	    # citation tokens
	    all_cite_spans = []
	    for hit in re.finditer(r'(BIBREF\d+)', text):
	        token = hit.group()
	        all_cite_spans.append({
	            "start": hit.start(),
	            "end": hit.end(),
	            "text": bib_map[token]['num'] if token in bib_map else None,
	            "ref_id": token
	        })

	    # reference tokens, grouped by type in this fixed order
	    ref_patterns = (
	        r'(FIGREF\d+)',
	        r'(TABREF\d+)',
	        r'(EQREF\d+)',
	        r'(FOOTREF\d+)',
	        r'(SECREF\d+)',
	        r'(SECREFU\d+)',
	    )
	    all_ref_spans = []
	    for pattern in ref_patterns:
	        for hit in re.finditer(pattern, text):
	            token = hit.group()
	            all_ref_spans.append({
	                "start": hit.start(),
	                "end": hit.end(),
	                "text": ref_map[token]['num'] if token in ref_map else None,
	                "ref_id": token
	            })

	    # formula tokens: inline first, then display; tokens without stored content are skipped
	    all_eq_spans = []
	    for pattern in (r'(INLINEFORM\d+)', r'(DISPLAYFORM\d+)'):
	        for hit in re.finditer(pattern, text):
	            token = hit.group()
	            if token not in formula_dict:
	                continue
	            plain, latex, mathml, _ = formula_dict[token]
	            all_eq_spans.append({
	                "start": hit.start(),
	                "end": hit.end(),
	                "text": plain,
	                "latex": latex,
	                "mathml": mathml,
	                "ref_id": token
	            })

	    # sanity check: each span must slice back to its own token
	    for span in all_cite_spans:
	        assert text[span['start']:span['end']] == span['ref_id']
	    for span in all_ref_spans:
	        assert text[span['start']:span['end']] == span['ref_id']

	    return Paragraph(
	        text=text,
	        cite_spans=all_cite_spans,
	        ref_spans=all_ref_spans,
	        eq_spans=all_eq_spans,
	        section=section_info
	    )


	def decompose_tags_before_title(sp: BeautifulSoup):
	    """
	    Decompose all tags that precede the title inside the document's inner tag
	    :param sp: soup whose <std> or <unknown> element is cleaned in place
	    :return: None
	    """
	    inner_name = sp.body.next.name
	    if inner_name == 'std':
	        _decompose_children_before_title(sp.std)
	    elif inner_name == 'unknown':
	        # clean the <unknown> element itself, not <std>
	        _decompose_children_before_title(sp.unknown)
	    else:
	        print(f"Unknown inner tag: {sp.body.next.name}")
	    return


	def _decompose_children_before_title(container: bs4.element.Tag):
	    """Decompose the Tag children of container that come before its first <maketitle>/<title> child."""
	    title_names = ('maketitle', 'title')
	    direct_tags = container.find_all(recursive=False)
	    if not any(tag.name in title_names for tag in direct_tags):
	        return
	    # iterate over a snapshot: decomposing a child while walking the live
	    # child list makes the iteration skip the following sibling
	    for child in list(container.children):
	        if not isinstance(child, bs4.element.Tag):
	            continue
	        if child.name in title_names:
	            break
	        child.decompose()


	def process_metadata(sp: BeautifulSoup, grobid_client: GrobidClient, log_file: str) -> Tuple[str, List]:
	    """
	    Extract title and authors from the metadata part of the soup
	    :param sp: soup; its <maketitle> or <metadata> element is decomposed once read
	    :param grobid_client: client forwarded to process_author
	    :param log_file: log file path forwarded to process_author
	    :return: (title, authors); whatever was parsed before a missing tag is still returned
	    """
	    title = ""
	    authors = []

	    if not sp.maketitle and not sp.metadata:
	        # no metadata block at all: fall back to a bare <title>
	        if sp.title:
	            title = sp.title.text
	        return title, authors
	    elif sp.maketitle:
	        try:
	            # process title
	            title = sp.maketitle.title.text
	            for formula in sp.author.find_all('formula'):
	                formula.decompose()
	            # process authors
	            author_parts = []
	            for tag in sp.author:
	                if type(tag) == NavigableString:
	                    author_parts.append(tag.strip())
	                else:
	                    author_parts.append(tag.text.strip())
	            author_parts = [re.sub(r'\s+', ' ', part).strip() for part in author_parts]
	            author_parts = [part for part in author_parts if part]
	            author_string = ', '.join(author_parts)
	            authors = process_author(author_string, grobid_client, log_file)
	            sp.maketitle.decompose()
	        except AttributeError:
	            sp.maketitle.decompose()
	            return title, authors
	    elif sp.metadata:
	        try:
	            # process title and authors from metadata
	            title = sp.metadata.title.text
	            # get authors
	            for author in sp.authors:
	                if not isinstance(author, bs4.element.Tag):
	                    # text between author tags carries no author
	                    continue
	                # drop nested tags but keep the author's own text; iterate a
	                # snapshot because decompose() mutates the child list
	                for subtag in list(author.children):
	                    if isinstance(subtag, bs4.element.Tag):
	                        subtag.decompose()
	                name_parts = author.text.strip().split()
	                if name_parts:
	                    authors.append(_author_from_name_parts(name_parts))
	            sp.metadata.decompose()
	        except AttributeError:
	            sp.metadata.decompose()
	            return title, authors

	    return title, authors


	def _author_from_name_parts(name_parts: List[str]) -> Dict:
	    """Build an author entry from whitespace-split name tokens, separating a trailing suffix."""
	    suffix = ""
	    if len(name_parts) > 1 and name_parts[-1].lower() in {"jr", "jr.", "iii", "iv", "v"}:
	        # take the suffix off first so it is neither the last name nor part of
	        # the middle names, and the real last name is not repeated in "middle"
	        suffix = name_parts[-1]
	        name_parts = name_parts[:-1]
	    return {
	        "first": name_parts[0] if len(name_parts) > 1 else "",
	        "last": name_parts[-1],
	        "middle": name_parts[1:-1],
	        "suffix": suffix,
	        "affiliation": {},
	        "email": ""
	    }


	def process_bibliography_from_tex(sp: BeautifulSoup, client, log_file) -> Dict:
	    """
	    Parse bibliography from latex
	    :param sp: soup; every <bibliography> element is decomposed before returning
	    :param client: client forwarded to process_bibentry
	    :param log_file: log file path forwarded to process_bibentry
	    :return: map from bibliography key to parsed entry (with "urls" and "num" added)
	    """
	    # "[key] entry text": group 1 is the key, group 2 the whole remainder of the line
	    keyed_entry = re.compile(r'\[(.*?)\](.*)')

	    bibkey_map = dict()
	    # replace Bibliography with bibliography if needed
	    for bibl in sp.find_all("Bibliography"):
	        bibl.name = 'bibliography'
	    # construct bib map
	    for bibliography in sp.find_all('bibliography'):
	        bib_items = bibliography.find_all('bibitem')
	        # map all bib entries
	        if bib_items:
	            for bi_num, bi in enumerate(bib_items):
	                try:
	                    if not bi.get('id'):
	                        continue
	                    # get bib entry text and process it
	                    bib_par = bi.find_parent('p')
	                    if bib_par.text:
	                        bib_entry = process_bibentry(bib_par.text, client, log_file)
	                    else:
	                        # empty paragraph: the text may sit in the next <p>
	                        next_tag = bib_par.findNext('p')
	                        if not next_tag.find('bibitem') and next_tag.text:
	                            bib_entry = process_bibentry(next_tag.text, client, log_file)
	                        else:
	                            bib_entry = None
	                    # if processed successfully, add to map
	                    if bib_entry:
	                        # get URLs from bib entry
	                        bib_entry['urls'] = [xref.get('url') for xref in bib_par.find_all('xref')]
	                        # map to ref id
	                        ref_id = normalize_latex_id(bi.get('id'))
	                        bib_entry['ref_id'] = ref_id
	                        bib_entry['num'] = bi_num
	                        bibkey_map[ref_id] = bib_entry
	                except AttributeError:
	                    print('Attribute error in bib item!', bi)
	                    continue
	                except TypeError:
	                    print('Type error in bib item!', bi)
	                    continue
	        else:
	            # no <bibitem> tags: read the paragraphs of this bibliography element
	            for bi_num, p in enumerate(bibliography.find_all('p')):
	                try:
	                    bib_key, bib_entry = None, None
	                    bib_text = p.text
	                    if keyed_entry.match(bib_text):
	                        # '.' stops at newlines, so flatten whitespace and match again
	                        bib_text = re.sub(r'\s', ' ', bib_text)
	                        bib_name = keyed_entry.match(bib_text)
	                        if bib_name:
	                            bib_key = bib_name.group(1)
	                            bib_entry = process_bibentry(bib_name.group(2), client, log_file)
	                    else:
	                        # first line is taken as the key, the remaining lines as the entry text
	                        bib_lines = bib_text.split('\n')
	                        bib_key = re.sub(r'\s', ' ', bib_lines[0])
	                        bib_text = re.sub(r'\s', ' ', ' '.join(bib_lines[1:]))
	                        bib_entry = process_bibentry(bib_text, client, log_file)
	                    if bib_key and bib_entry:
	                        # get URLs from bib entry
	                        bib_entry['urls'] = [xref.get('url') for xref in p.find_all('xref')]
	                        bib_entry['num'] = bi_num
	                        # map to bib id
	                        bibkey_map[bib_key] = bib_entry
	                except AttributeError:
	                    print('Attribute error in bib item!', p)
	                    continue
	                except TypeError:
	                    print('Type error in bib item!', p)
	                    continue
	    for bibliography in sp.find_all('bibliography'):
	        bibliography.decompose()
	    return bibkey_map


	def get_section_name(sec):
	    """
	    Derive a section name from a div tag
	    :param sec: div tag
	    :return: text of its <head> when present; otherwise the leading short pieces
	        (each under 50 characters once stripped) joined by spaces
	    """
	    if sec.head:
	        return sec.head.text

	    pieces = []
	    for child in sec:
	        if type(child) == NavigableString:
	            candidate = child.strip()
	        elif child.name != 'p':
	            candidate = child.text.strip()
	        else:
	            # the first paragraph ends the heading area
	            break
	        if len(candidate) >= 50:
	            # too long to be part of a heading
	            break
	        pieces.append(candidate)
	    return ' '.join(pieces).strip()


	def get_sections_from_div(el: bs4.element.Tag, sp: BeautifulSoup, parent: Optional[str], faux_max: int) -> Dict:
	    """
	    Process section headers for one div, recursing into nested divs
	    :param el: div tag; it and its identified children receive an "s2orc_id" attribute
	    :param sp: soup, passed through to the recursive calls
	    :param parent: ref id of the enclosing section, or None
	    :param faux_max: number used to make up an id for a div marked rend="nonumber"
	    :return: map from section ref id to {"num", "text", "ref_id", "parent"}
	    :raises NotImplementedError: when an id contains neither 'cid' nor 'uid'
	    """
	    sec_map_dict = dict()
	    el_ref_id = None

	    # process divs with ids
	    if el.get('id', None):
	        el_ref_id = _section_ref_id(el.get('id'))
	        el['s2orc_id'] = el_ref_id
	        sec_map_dict[el_ref_id] = {
	            "num": el.get('id-text', None),
	            "text": get_section_name(el),
	            "ref_id": el_ref_id,
	            "parent": parent
	        }
	    # process divs without section numbers
	    elif el.get('rend') == "nonumber":
	        el_ref_id = f'SECREF{faux_max}'
	        el['s2orc_id'] = el_ref_id
	        sec_map_dict[el_ref_id] = {
	            "num": None,
	            "text": get_section_name(el),
	            "ref_id": el_ref_id,
	            "parent": parent
	        }

	    child_parent = el_ref_id if el_ref_id else parent

	    # process sub elements
	    for sub_el in el.find_all(recursive=False):
	        if sub_el.name.startswith('div'):
	            # add any unspecified keys
	            sec_keys = [int(k.strip('SECREF')) for k in sec_map_dict.keys() if k and k.strip('SECREF').isdigit()]
	            faux_max = max(sec_keys + [faux_max]) + 1
	            sec_map_dict.update(
	                get_sections_from_div(sub_el, sp, child_parent, faux_max)
	            )
	        elif sub_el.name in ('p', 'proof') and sub_el.get('id', None):
	            sec_num = sub_el.get('id-text', None)
	            if sec_num is None and sub_el.hi:
	                # look at <hi> only when needed, so a missing <hi> cannot raise
	                sec_num = sub_el.hi.get('id-text', None)
	            sub_el_ref_id = _section_ref_id(sub_el.get('id'))
	            sub_el['s2orc_id'] = sub_el_ref_id
	            # store under the sub-element's own id; using the enclosing div's id
	            # would overwrite the div's entry
	            sec_map_dict[sub_el_ref_id] = {
	                "num": sec_num,
	                "text": sub_el.head.text if sub_el.head else sub_el.hi.text if sub_el.hi else "",
	                "ref_id": sub_el_ref_id,
	                "parent": child_parent
	            }
	    return sec_map_dict


	def _section_ref_id(raw_id: str) -> str:
	    """Map a 'cid'/'uid' element id to its SECREF/SECREFU token; reject any other id."""
	    if 'cid' in raw_id:
	        return raw_id.replace('cid', 'SECREF')
	    if 'uid' in raw_id:
	        return raw_id.replace('uid', 'SECREFU')
	    print('Unknown ID type!', raw_id)
	    raise NotImplementedError


	def process_sections_from_text(sp: BeautifulSoup) -> Dict:
	    """
	    Build the section map from every <div0> in the soup
	    :param sp: soup; divs are tagged in place by get_sections_from_div
	    :return: map from section ref id to section entry
	    """
	    section_map = {}
	    # made-up ids for unnumbered sections start at 1000
	    highest_id = 999

	    for top_div in sp.find_all('div0'):
	        section_map.update(get_sections_from_div(top_div, sp, None, highest_id + 1))
	        # move past every purely numeric SECREF id seen so far
	        numeric_ids = [
	            int(key.strip('SECREF'))
	            for key in section_map
	            if key and key.strip('SECREF').isdigit()
	        ]
	        highest_id = max([*numeric_ids, highest_id]) + 1

	    return section_map


	def process_equations_from_tex(sp: BeautifulSoup) -> Dict:
	    """
	    Collect identified display formulas and re-wrap them as inline formulas
	    :param sp: soup; each collected formula is replaced by a <p> holding an inline copy
	    :return: map from EQREF id to {"num", "text", "mathml", "latex", "ref_id"}
	    """
	    equation_map = {}

	    for formula in sp.find_all('formula'):
	        try:
	            if formula.get('type', None) != 'display':
	                continue
	            formula_id = formula.get('id', None)
	            if not formula_id:
	                continue

	            ref_id = formula_id.replace('uid', 'EQREF')
	            try:
	                mathml = latex2mathml.converter.convert(formula.texmath.text.strip())
	            except Exception:
	                # conversion is best-effort; an empty string marks failure
	                mathml = ""
	            equation_map[ref_id] = {
	                "num": formula.get('id-text', None),
	                "text": formula.math.text.strip(),
	                "mathml": mathml,
	                "latex": formula.texmath.text.strip(),
	                "ref_id": ref_id
	            }

	            # NOTE(review): the replacement is assumed to apply only to formulas
	            # that carry an id; confirm against the original indentation
	            inline_copy = copy.copy(formula)
	            inline_copy['type'] = 'inline'
	            wrapper = sp.new_tag('p')
	            wrapper.insert(0, inline_copy)

	            # replace with <p> containing equation as inline
	            formula.replace_with(wrapper)
	        except AttributeError:
	            continue

	    return equation_map


	def process_footnotes_from_text(sp: BeautifulSoup) -> Dict:
	    """
	    Collect footnotes and leave a reference token in their place
	    :param sp: soup; every <note> with an id is replaced by " FOOTREFn "
	    :return: map from FOOTREF id to {"num", "text", "ref_id"}
	    """
	    footnote_map = {}

	    for note in sp.find_all('note'):
	        try:
	            if not (note.name and note.get('id')):
	                continue
	            # normalize footnote id
	            ref_id = note.get('id').replace('uid', 'FOOTREF')
	            # remove equation tex
	            for texmath in note.find_all('texmath'):
	                texmath.decompose()
	            # links are kept as their url text
	            for xref in note.find_all('xref'):
	                xref.replace_with(sp.new_string(f" {xref.get('url')} "))
	            # trimmed text with whitespace runs collapsed; None when the note is empty
	            raw_text = note.text
	            footnote_text = re.sub(r'\s+', ' ', raw_text.strip()) if raw_text else None
	            # form footnote entry
	            footnote_map[ref_id] = {
	                "num": note.get('id-text', None),
	                "text": footnote_text,
	                "ref_id": ref_id
	            }
	            note.replace_with(sp.new_string(f" {ref_id} "))
	        except AttributeError:
	            continue

	    return footnote_map


	def get_figure_map_from_tex(sp: BeautifulSoup) -> Dict:
	    """
	    Build the figure map, without captions
	    :param sp: soup; it is only read
	    :return: map from FIGREF id to {"num", "text" (None placeholder), "uris", "ref_id"}
	    """
	    figure_map = {}

	    # floats go first because they wrap the figures
	    for flt in sp.find_all('float'):
	        try:
	            if not (flt.name and flt.get('name') == 'figure'):
	                continue
	            # files of every figure inside the float
	            fig_files = []
	            for inner_fig in flt.find_all('figure'):
	                fig_files.extend(_figure_uris(inner_fig))
	            if flt.get('id'):
	                ref_id = flt.get('id').replace('uid', 'FIGREF')
	                # form figmap entry
	                figure_map[ref_id] = {
	                    "num": flt.get('id-text', None),
	                    "text": None,  # placeholder
	                    "uris": fig_files,
	                    "ref_id": ref_id
	                }
	        except AttributeError:
	            print('Attribute error with figure float: ', flt.name)
	            continue

	    for fig in sp.find_all('figure'):
	        try:
	            if not (fig.name and fig.get('id')):
	                continue
	            # normalize figure id
	            ref_id = fig.get('id').replace('uid', 'FIGREF')
	            # form figmap entry
	            figure_map[ref_id] = {
	                "num": fig.get('id-text', None),
	                "text": None,  # placeholder
	                "uris": _figure_uris(fig),
	                "ref_id": ref_id
	            }
	        except AttributeError:
	            print('Attribute error with figure: ', fig.name)
	            continue

	    return figure_map


	def _figure_file_name(el):
	    """Return 'file.extension', or just 'file' without an extension, or None without a file."""
	    stem = el.get('file')
	    if not stem:
	        return None
	    extension = el.get('extension')
	    return stem + '.' + extension if extension else stem


	def _figure_uris(fig):
	    """Return the figure's own file name, or else the file names of its subfigures."""
	    own_name = _figure_file_name(fig)
	    if own_name is not None:
	        return [own_name]
	    sub_names = (_figure_file_name(subfig) for subfig in fig.find_all('subfigure'))
	    return [name for name in sub_names if name is not None]


	def process_figures_from_tex(sp: BeautifulSoup, ref_map: Dict) -> Dict:
	    """
	    Fill in figure captions in the reference map and remove figures from the soup
	    :param sp: soup; figure floats and figures are decomposed
	    :param ref_map: reference map; the "text" of each matching FIGREF entry is set
	    :return: the same reference map
	    """
	    # floats go first because they wrap the figures
	    for flt in sp.find_all('float'):
	        try:
	            if not (flt.name and flt.get('name') == 'figure'):
	                continue
	            if flt.get('id'):
	                ref_id = flt.get('id').replace('uid', 'FIGREF')
	                # remove equation tex
	                for texmath in flt.find_all('texmath'):
	                    texmath.decompose()
	                # caption with references tokenized and whitespace collapsed
	                caption_text = None
	                if flt.caption:
	                    flt = replace_ref_tokens(sp, flt, ref_map)
	                    caption_text = re.sub(r'\s+', ' ', flt.caption.text.strip())
	                # form figmap entry
	                ref_map[ref_id]['text'] = caption_text
	            # NOTE(review): assumed to remove every figure float, with or without
	            # an id; confirm against the original indentation
	            flt.decompose()
	        except AttributeError:
	            print('Attribute error with figure float: ', flt.name)
	            continue

	    for fig in sp.find_all('figure'):
	        try:
	            if fig.name and fig.get('id'):
	                # normalize figure id
	                ref_id = fig.get('id').replace('uid', 'FIGREF')
	                # remove equation tex
	                for texmath in fig.find_all('texmath'):
	                    texmath.decompose()
	                # the figure's whole text serves as caption
	                caption_text = None
	                if fig.text:
	                    fig = replace_ref_tokens(sp, fig, ref_map)
	                    caption_text = re.sub(r'\s+', ' ', fig.text.strip())
	                # add text to figmap entry
	                ref_map[ref_id]["text"] = caption_text
	        except AttributeError:
	            print('Attribute error with figure: ', fig.name)
	            continue
	        fig.decompose()

	    return ref_map


	def convert_table_to_html(table_lst: List) -> str:
	    """
	    Render extracted table rows as an HTML table
	    :param table_lst: list of rows; each row has "cells" (dicts with "text") and may
	        have "bottom-border"
	    :return: HTML string, or '' for an empty table; the first row and every row with a
	        bottom border use <th> cells, all other rows use <td>
	    """
	    import html  # local import so the block does not depend on the file's import section

	    if not table_lst:
	        return ''

	    parts = ['<table>']
	    for i, row in enumerate(table_lst):
	        cell_tag = 'th' if i == 0 or row.get('bottom-border') else 'td'
	        parts.append('<tr>')
	        for cell in row['cells']:
	            # escape &, < and > so cell text cannot break the markup
	            cell_text = html.escape(str(cell['text']), quote=False)
	            parts.append(f"<{cell_tag}>{cell_text}</{cell_tag}>")
	        parts.append('</tr>')
	    parts.append('</table>')
	    return ''.join(parts)


	def extract_table(table: BeautifulSoup) -> List:
	    """
	    Read rows and cells out of a table element
	    :param table: element whose <row>/<cell> descendants are read
	    :return: list of rows, each {"top-border", "bottom-border", "cells"}; each cell is
	        {"alignment", "right-border", "left-border", "text", "latex"}
	    """
	    table_rep = []
	    for row in table.find_all('row'):
	        cells = []
	        for cell in row.find_all('cell'):
	            plain_parts = []
	            latex_parts = []
	            for child in cell:
	                if type(child) == NavigableString:
	                    plain, latex = str(child), str(child)
	                elif child.name == 'formula':
	                    # a formula has separate plain and latex renderings
	                    plain, latex = child.math.text, child.texmath.text
	                else:
	                    plain, latex = child.text, child.text
	                plain_parts.append(plain)
	                latex_parts.append(latex)

	            cell_text = re.sub(r'\s+', ' ', ' '.join(plain_parts))
	            cell_latex = re.sub(r'\s+', ' ', ' '.join(latex_parts))

	            cells.append({
	                "alignment": cell.get('halign'),
	                "right-border": cell.get('right-border') == 'true',
	                "left-border": cell.get('left-border') == 'true',
	                "text": cell_text.strip(),
	                "latex": cell_latex.strip()
	            })
	        table_rep.append({
	            "top-border": row.get('top-border') == "true",
	            "bottom-border": row.get('bottom-border') == "true",
	            "cells": cells
	        })
	    return table_rep


	def get_table_map_from_text(sp: BeautifulSoup, keep_table_contents=True) -> Dict:
	    """
	    Build the table map, without captions
	    :param sp: soup; the rows of every recorded table are decomposed
	    :param keep_table_contents: when false, "content" and "html" are stored as None
	    :return: map from TABREF id to {"num", "text" (None placeholder), "content", "html", "ref_id"}
	    """
	    table_map = {}

	    def record(table_el):
	        # store one entry for the element, then strip its rows
	        ref_id = table_el.get('id').replace('uid', 'TABREF')
	        content = extract_table(table_el) if keep_table_contents else None
	        html_str = convert_table_to_html(content) if keep_table_contents else None
	        table_map[ref_id] = {
	            "num": table_el.get('id-text', None),
	            "text": None,  # placeholder
	            "content": content,
	            "html": html_str,
	            "ref_id": ref_id
	        }
	        for row in table_el.find_all('row'):
	            row.decompose()

	    for flt in sp.find_all('float'):
	        try:
	            if flt.name and flt.get('name') == 'table' and flt.get('id'):
	                record(flt)
	        except AttributeError:
	            print('Attribute error with table float: ', flt.name)
	            continue

	    for tab in sp.find_all('table'):
	        try:
	            # skip inline tables
	            if tab.get('rend') == 'inline':
	                continue
	            # process them
	            if tab.name and tab.get('id'):
	                record(tab)
	        except AttributeError:
	            print('Attribute error with table: ', tab.name)
	            continue

	    return table_map


	def process_tables_from_tex(sp: BeautifulSoup, ref_map: Dict) -> Dict:
	    """
	    Fill in table captions in the reference map and remove tables from the soup
	    :param sp: soup; table floats and non-inline tables are decomposed
	    :param ref_map: reference map; the "text" of each matching TABREF entry is set
	    :return: the same reference map
	    """
	    # floats go first because they wrap the tables
	    for flt in sp.find_all('float'):
	        try:
	            if not (flt.name and flt.get('name') == 'table'):
	                continue
	            if flt.get('id'):
	                # normalize table id
	                ref_id = flt.get('id').replace('uid', 'TABREF')
	                # form tabmap entry
	                ref_map[ref_id]['text'] = _table_caption_text(sp, flt, ref_map)
	            # NOTE(review): assumed to remove every table float, with or without
	            # an id; confirm against the original indentation
	            flt.decompose()
	        except AttributeError:
	            print('Attribute error with table float: ', flt.name)
	            continue

	    for tab in sp.find_all('table'):
	        try:
	            # skip inline tables
	            if tab.get('rend') == 'inline':
	                continue
	            # process them
	            if tab.name and tab.get('id'):
	                # normalize table id
	                ref_id = tab.get('id').replace('uid', 'TABREF')
	                # form tabmap entry
	                ref_map[ref_id]['text'] = _table_caption_text(sp, tab, ref_map)
	        except AttributeError:
	            print('Attribute error with table: ', tab.name)
	            continue
	        tab.decompose()

	    return ref_map


	def _table_caption_text(sp, table_el, ref_map):
	    """Return the caption of a table element, taken from <caption>, <head>, its <p> tags, or all its text."""
	    single_source = table_el.caption or table_el.head
	    if single_source:
	        source_el = replace_ref_tokens(sp, single_source, ref_map)
	        for texmath in source_el.find_all('texmath'):
	            texmath.decompose()
	        caption_text = source_el.text.strip()
	    elif table_el.p:
	        paragraph_texts = []
	        for para in table_el.find_all('p'):
	            para_el = replace_ref_tokens(sp, para, ref_map)
	            for texmath in para_el.find_all('texmath'):
	                texmath.decompose()
	            paragraph_texts.append(para_el.text.strip())
	        caption_text = ' '.join(paragraph_texts)
	    else:
	        # nothing more specific: use the whole element's text
	        caption_text = replace_ref_tokens(sp, table_el, ref_map).text.strip()
	    if caption_text:
	        caption_text = re.sub(r'\s+', ' ', caption_text)
	    return caption_text


	def combine_ref_maps(eq_map: Dict, fig_map: Dict, tab_map: Dict, foot_map: Dict, sec_map: Dict):
	    """
	    Merge all reference maps into one, tagging each entry with its type
	    :param eq_map: equation entries, tagged 'equation'
	    :param fig_map: figure entries, tagged 'figure'
	    :param tab_map: table entries, tagged 'table'
	    :param foot_map: footnote entries, tagged 'footnote'
	    :param sec_map: section entries, tagged 'section'
	    :return: one map over all ids; the entry dicts are shared with the inputs and gain
	        a "type" key, and a later map wins when ids collide
	    """
	    typed_sources = (
	        ('equation', eq_map),
	        ('figure', fig_map),
	        ('table', tab_map),
	        ('footnote', foot_map),
	        ('section', sec_map),
	    )
	    ref_map = {}
	    for type_name, source in typed_sources:
	        for ref_id, entry in source.items():
	            entry['type'] = type_name
	            ref_map[ref_id] = entry
	    return ref_map


	def collapse_formatting_tags(sp: BeautifulSoup):
	    """
	    Replace every <hi> formatting tag with its stripped text padded by one space on each side
	    :param sp: soup modified in place
	    :return: None
	    """
	    for hi_tag in sp.find_all('hi'):
	        inner_text = sp.new_string(hi_tag.text.strip())
	        padded = ' ' + str(inner_text) + ' '
	        hi_tag.replace_with(padded)


	def process_abstract_from_tex(sp: BeautifulSoup, bib_map: Dict, ref_map: Dict) -> List[Dict]:
	    """
	    Parse the abstract out of the soup
	    :param sp: soup; the paragraphs used for the abstract are decomposed
	    :param bib_map: bibliography map forwarded to process_paragraph
	    :param ref_map: reference map forwarded to process_paragraph
	    :return: list of paragraph dicts, each under the section [(None, "Abstract")]
	    """
	    def as_abstract_paragraph(p_tag):
	        return process_paragraph(sp, p_tag, [(None, "Abstract")], bib_map, ref_map)

	    abstract_text = []
	    if sp.abstract:
	        abstract_text.extend(as_abstract_paragraph(p) for p in sp.abstract.find_all('p'))
	        sp.abstract.decompose()
	    else:
	        # no <abstract>: use the direct <p> children of the inner tag that carry no section id
	        container = sp.std if sp.std else sp.unknown if sp.unknown else None
	        if container is not None:
	            loose_paragraphs = [
	                tag for tag in container
	                if tag.name == 'p' and not tag.get('s2orc_id', None)
	            ]
	            for p in loose_paragraphs:
	                abstract_text.append(as_abstract_paragraph(p))
	                p.decompose()
	    return [para.__dict__ for para in abstract_text]


	def build_section_list(sec_id: str, ref_map: Dict) -> List[Tuple]:
	    """
	    Build the chain of sections leading to sec_id by following "parent" links
	    :param sec_id: id of the innermost section; may be None
	    :param ref_map: reference map whose entries have "num", "text" and "parent"
	    :return: list of (num, text) tuples ordered outermost first; empty when sec_id is
	        falsy or not in ref_map
	    """
	    chain = []
	    visited = set()
	    current = sec_id
	    # stop at a missing id, an unknown id, or an id already visited; the last
	    # case covers a section that is its own parent and any longer parent cycle
	    while current and current in ref_map and current not in visited:
	        visited.add(current)
	        entry = ref_map[current]
	        chain.append((entry['num'], entry['text']))
	        current = entry['parent']
	    chain.reverse()
	    return chain


	def get_seclist_for_el(el: bs4.element.Tag, ref_map: Dict, default_seclist: List) -> List[Tuple]:
	    """
	    Pick the section list for an element
	    :param el: tag or NavigableString
	    :param ref_map: reference map used to resolve the element's "s2orc_id"
	    :param default_seclist: returned when el is a string or has no "s2orc_id"
	    :return: section list built from the element's id, or default_seclist
	    """
	    if type(el) != NavigableString:
	        sec_id = el.get('s2orc_id', None)
	        if sec_id:
	            return build_section_list(sec_id, ref_map)
	    return default_seclist


	def process_div(tag: bs4.element.Tag, secs: List, sp: BeautifulSoup, bib_map: Dict, ref_map: Dict) -> List[Dict]:
	    """
	    Process a tag recursively into paragraphs
	    :param tag: element to process; strings and tags in SKIP_TAGS give no paragraphs
	    :param secs: section list for paragraphs that carry no section id of their own
	    :param sp: soup used to create replacement tags and strings
	    :param bib_map: bibliography map forwarded to process_paragraph
	    :param ref_map: reference map, also used to resolve child section ids
	    :return: list of Paragraph objects found in this tag
	    """
	    body_text = []

	    # navigable strings
	    if type(tag) == NavigableString:
	        return []
	    # skip these tags
	    elif tag.name in SKIP_TAGS:
	        return []
	    # process normal tags
	    elif tag.name in TEXT_TAGS:
	        if tag.text:
	            body_text.append(process_paragraph(sp, tag, secs, bib_map, ref_map))
	    # process lists
	    elif tag.name == 'list':
	        if tag.text:
	            body_text += process_list_el(sp, tag, secs, bib_map, ref_map)
	    # process formula
	    elif tag.name == 'formula':
	        wrapper = sp.new_tag('p')
	        tag_copy = copy.copy(tag)
	        tag_copy['type'] = 'inline'
	        wrapper.insert(0, tag_copy)
	        tag.replace_with(wrapper)
	        # process the wrapper, not the detached original: process_paragraph only
	        # tokenizes formulas that are descendants of the element it receives
	        if wrapper.text:
	            body_text.append(process_paragraph(sp, wrapper, secs, bib_map, ref_map))
	    # process divs
	    elif tag.name.startswith('div'):
	        # snapshot the children: the formula branch replaces nodes in this list
	        for el in list(tag.children):
	            # process tags
	            if type(el) == bs4.element.Tag:
	                el_sec_list = get_seclist_for_el(el, ref_map, secs)
	                body_text += process_div(el, el_sec_list, sp, bib_map, ref_map)
	    # unknown tag type, skip for now
	    else:
	        print(f'Unknown tag type: {tag.name}')
	        return []

	    return body_text


	def process_body_text_from_tex(sp: BeautifulSoup, bib_map: Dict, ref_map: Dict) -> List[Dict]:
	    """
	    Parse the body text of the document and then remove the body from the soup.

	    Each element directly under the body is treated as a container: its element
	    children are handed to process_div one by one. The section list is carried
	    forward from child to child, so a child without its own 's2orc_id' inherits the
	    sections of the most recent sibling that had one.

	    :param sp: parsed document; its body element is decomposed by this call
	    :param bib_map: bibliography map passed through to process_div
	    :param ref_map: reference map used for section lookup and passed to process_div
	    :return: list of paragraph dicts (the __dict__ of each processed paragraph)
	    """
	    body = sp.body
	    # a document without a body element has no text to extract and nothing to decompose
	    if body is None:
	        return []

	    body_text = []
	    for tag in body:
	        # skip string nodes (including subclasses such as Comment)
	        if isinstance(tag, NavigableString):
	            continue
	        sec_list = get_seclist_for_el(tag, ref_map, [])
	        for cld in tag:
	            # The original guard here re-tested `tag` instead of `cld`, so it never
	            # fired; string children neither change the section list nor get processed.
	            if not isinstance(cld, bs4.element.Tag):
	                continue
	            sec_list = get_seclist_for_el(cld, ref_map, sec_list)
	            body_text += process_div(cld, sec_list, sp, bib_map, ref_map)

	    # decompose everything
	    body.decompose()

	    return [para.__dict__ for para in body_text]


	def convert_xml_to_s2orc(
	        sp: BeautifulSoup, file_id: str, year_str: str, log_file: str, grobid_config: Optional[Dict]=None
	) -> Paper:
	    """
	    Assemble a Paper from a parsed LaTeX-derived XML document.

	    The steps run in a fixed order because each one reads and rewrites the same soup:
	    title/author metadata, bibliography, the individual reference maps (sections,
	    equations, footnotes, figures, tables), figure and table replacement, formatting
	    collapse, abstract, and finally the body text. A missing bibliography or an
	    empty body is not an error; each is recorded as a warning line in log_file.

	    :param sp: parsed document; it is modified by the processing steps
	    :param file_id: identifier used as paper id, arxiv id and log line prefix
	    :param year_str: year stored in the metadata
	    :param log_file: path of the log file that warnings are appended to
	    :param grobid_config: optional configuration handed to GrobidClient
	    :return: the assembled Paper
	    """
	    grobid = GrobidClient(grobid_config)

	    # TODO: not sure why but the cleanup before the title has to run twice
	    for _ in range(2):
	        decompose_tags_before_title(sp)

	    # title and authors from the maketitle info
	    title, authors = process_metadata(sp, grobid, log_file)

	    # bibliography entries
	    # TODO: look into why authors aren't processing
	    bib_entries = process_bibliography_from_tex(sp, grobid, log_file)
	    if not bib_entries:
	        # record that nothing was found, then carry on
	        with open(log_file, 'a+') as bib_f:
	            bib_f.write(f'{file_id},warn_no_bibs\n')

	    # one map per kind of referenceable item, gathered in this order
	    sections = process_sections_from_text(sp)
	    equations = process_equations_from_tex(sp)
	    footnotes = process_footnotes_from_text(sp)
	    figures = get_figure_map_from_tex(sp)
	    tables = get_table_map_from_text(sp)

	    # merge into a single reference map, then replace figures and tables in the soup
	    ref_entries = combine_ref_maps(equations, figures, tables, footnotes, sections)
	    ref_entries = process_figures_from_tex(sp, ref_entries)
	    ref_entries = process_tables_from_tex(sp, ref_entries)

	    # collapse all hi tags
	    collapse_formatting_tags(sp)

	    # abstract first, then the body text
	    abstract = process_abstract_from_tex(sp, bib_entries, ref_entries)
	    body_text = process_body_text_from_tex(sp, bib_entries, ref_entries)
	    if not body_text:
	        # record the empty body, the Paper is still returned
	        with open(log_file, 'a+') as body_f:
	            body_f.write(f'{file_id},warn_no_body\n')

	    return Paper(
	        paper_id=file_id,
	        pdf_hash="",
	        metadata={
	            "title": title,
	            "authors": authors,
	            "year": year_str,
	            "venue": "",
	            "identifiers": {
	                "arxiv_id": file_id
	            }
	        },
	        abstract=abstract,
	        body_text=body_text,
	        back_matter=[],
	        bib_entries=bib_entries,
	        ref_entries=ref_entries
	    )


	def convert_latex_xml_to_s2orc_json(xml_fpath: str, log_dir: str, grobid_config: Optional[Dict]=None) -> Paper:
	    """
	    Read a LaTeX-derived XML file and convert it to a Paper.

	    The file id is the file name without its extension. If the first two characters
	    of the id (up to the first '.') are digits they are read as a two-digit year:
	    values below 40 map to 20xx, all others to 19xx. Otherwise the year is left empty.

	    :param xml_fpath: path of the XML file; must exist
	    :param log_dir: directory that holds (or will hold) 'failed.log'
	    :param grobid_config: optional configuration handed on to convert_xml_to_s2orc
	    :return: the converted Paper
	    :raises UnicodeDecodeError: if decoding fails; a line is appended to the log
	        first and the original exception is re-raised unchanged
	    """
	    assert os.path.exists(xml_fpath)

	    # get file id; basename handles every separator the platform accepts
	    file_id = os.path.splitext(os.path.basename(xml_fpath))[0]

	    # try to get year from file name
	    year = file_id.split('.')[0][:2]
	    if year.isdigit():
	        two_digit = int(year)
	        year = str(two_digit + 2000 if two_digit < 40 else two_digit + 1900)
	    else:
	        year = ""

	    # log file
	    log_file = os.path.join(log_dir, 'failed.log')

	    try:
	        # read everything up front so the input file is closed before conversion starts
	        with open(xml_fpath, 'r') as f:
	            xml = f.read()
	        soup = BeautifulSoup(xml, "lxml")
	        return convert_xml_to_s2orc(soup, file_id, year, log_file, grobid_config=grobid_config)
	    except UnicodeDecodeError:
	        with open(log_file, 'a+') as log_f:
	            log_f.write(f'{file_id},err_unicode_decode\n')
	        # Bare raise keeps the original exception; `raise UnicodeDecodeError` would
	        # try to build one without its required arguments and fail with TypeError.
	        raise