tags
+ title: str = parse_title_tag(front_tag=front_tag)
+
+ try:
+ authors: List[Dict] = parse_authors(front_tag=front_tag)
+ except NoAuthorNamesError:
+ authors: List[Dict] = []
+ affiliations: Dict = parse_affiliations(front_tag=front_tag)
+
+ dates: Dict = parse_date_tag(front_tag=front_tag)
+
+ pubmed_id: str = parse_pubmed_id_tag(front_tag=front_tag)
+ pmc_id: str = parse_pmc_id_tag(front_tag=front_tag)
+ doi: str = parse_doi_tag(front_tag=front_tag)
+
+ abstract: List[Dict] = parse_abstract_tag(front_tag=front_tag, soup=soup)
+
+ # categories: str = parse_category_tag(front_tag=front_tag)
+
+ funding_groups: List[str] = parse_funding_groups(front_tag=front_tag)
+
+ return {
+ 'title': title,
+ 'abstract': abstract,
+ 'authors': authors,
+ 'affiliations': affiliations,
+ 'journal_id': journal_id,
+ 'journal_name': journal_name,
+ 'pubmed_id': pubmed_id,
+ 'pmc_id': pmc_id,
+ 'doi': doi,
+ 'year': dates,
+ 'funding_groups': funding_groups
+ }
+
+
def process_body_tag(body_tag, soup) -> Dict:
    """Turn the article body into a flat list of paragraph blobs.

    The xref and sup/sub tags under ``body_tag`` are first rewritten in place
    into string placeholders. Paragraphs are then collected section by
    section; when no ``sec`` tags can be found, every paragraph under
    ``body_tag`` is parsed directly.

    :param body_tag: the soup tag holding the article body (modified in place)
    :param soup: the soup object used to create the placeholder strings
    :return: ``{'body_text': [paragraph blob, ...]}``
    """
    # replace all xref tags with string placeholders
    replace_xref_with_string_placeholders(soup_tag=body_tag, soup=soup)

    # replace all sup/sub tags with string placeholders
    replace_sup_sub_tags_with_string_placeholders(soup_tag=body_tag, soup=soup)

    # some articles (like PMC2844102) have no sections
    sec_tags = body_tag.find_all('sec', recursive=False)

    # try looking in article tag; `body_tag.article` is None when there is no
    # nested article, so check for that explicitly instead of swallowing
    # every exception
    if not sec_tags:
        article_tag = body_tag.article
        if article_tag is not None:
            sec_tags = article_tag.find_all('sec', recursive=False)

    if not sec_tags:
        return {
            'body_text': parse_all_paragraphs_in_section(body_tag),
        }

    all_par_blobs = []
    for sec_tag in sec_tags:
        # note; most sections dont have this 'sec-type' attribute
        if sec_tag.get('sec-type') == 'supplementary-material':
            # hopefully all the important supplementary content already extracted above in previous step
            continue
        all_par_blobs.extend(recurse_parse_section(sec_tag=sec_tag))

    return {
        'body_text': all_par_blobs,
    }
+
+
def process_back_tag(back_tag) -> Dict:
    """Collect acknowledgement paragraphs and bibliography entries.

    One blob is produced per ``p`` tag found under each ``ack`` tag. Each blob
    carries the text of the ack's ``title`` tag (``None`` when there is no
    title), the paragraph text, the texts of its ``funding-source`` tags and
    the texts of its ``ext-link`` tags.

    :param back_tag: the soup tag holding the article back matter
    :return: ``{'acknowledgements': [...], 'bib_entries': {...}}``
    """
    # TODO: glossary extraction (def-item term -> def text) is not implemented
    # TODO: author contrib and COIs (the `notes` tags) are not processed

    # TODO: PMC2778891 has back tag that looks like: Acknowledgements Supported by the Austrian Science Fund (P-20670 and W11).
    #       that is, it doesn't have 'ack' section.
    acknowledgements: List[Dict] = []
    for ack_tag in back_tag.find_all('ack'):
        title_tag = ack_tag.find('title')
        section_name = None if title_tag is None else title_tag.text
        acknowledgements.extend(
            {
                'section': section_name,
                'text': par_tag.text,
                'funding_sources': [
                    fund_tag.text for fund_tag in par_tag.find_all('funding-source')
                ],
                'urls': [url_tag.text for url_tag in par_tag.find_all('ext-link')],
            }
            for par_tag in ack_tag.find_all('p')
        )

    return {
        'acknowledgements': acknowledgements,
        'bib_entries': parse_bib_entries(back_tag),
    }
+
+
def postprocess_front_tags_for_s2orc(init_front_dict: Dict):
    """
    Fix authors and year for S2ORC format

    The dict is modified in place and also returned:

    - every author gets an ``affiliation`` dict (empty unless the author's
      first affiliation id matches an entry in ``affiliations``) and loses the
      ``affiliation_ids``, ``corresponding`` and ``orcid`` keys;
    - the top-level ``affiliations`` entry is removed;
    - ``year`` (a mapping of date type to date value on input) is replaced by
      the first non-empty value among ``epub``, ``accepted``, ``collection``
      and ``received``, or ``None``. The value is stored as found; it is not
      converted to an int here.
    """
    # Make authors in front tags look like S2ORC
    for author in init_front_dict['authors']:
        author['affiliation'] = {}
        # get affiliation if available
        affil_ids = author['affiliation_ids']
        if affil_ids:
            first_id = affil_ids[0]
            matching_texts = [
                entry['text']
                for entry in init_front_dict['affiliations']
                if entry['id'] == first_id
            ]
            if matching_texts:
                author['affiliation'] = {
                    'laboratory': "",
                    'institution': matching_texts[0],
                    'location': {}
                }
        for obsolete_key in ('affiliation_ids', 'corresponding', 'orcid'):
            del author[obsolete_key]
    del init_front_dict['affiliations']

    # Pick the best available date, in order of preference
    dates = init_front_dict['year']
    year = None
    for date_type in ('epub', 'accepted', 'collection', 'received'):
        if dates.get(date_type):
            year = dates.get(date_type)
            break
    init_front_dict['year'] = year

    return init_front_dict
+
+
def convert_acks_to_s2orc(paragraphs: List) -> List[Dict]:
    """
    Convert acks to S2ORC paragraphs

    Each blob is edited in place: empty ``cite_spans`` and ``ref_spans`` lists
    are added and the ``funding_sources`` and ``urls`` keys are removed. The
    same list object is returned.
    """
    for ack_blob in paragraphs:
        ack_blob.update(cite_spans=[], ref_spans=[])
        for dropped_key in ('funding_sources', 'urls'):
            del ack_blob[dropped_key]
    return paragraphs
+
+
def convert_paragraphs_to_s2orc(paragraphs: List, old_to_new: Dict) -> List[Dict]:
    """
    Convert paragraphs into S2ORC format

    Each paragraph blob is edited in place:

    - ``sup_spans`` and ``sub_spans`` are dropped;
    - ``fig_spans`` and ``table_spans`` are merged (figures first) into a new
      ``ref_spans`` list and then dropped;
    - the ``ref_id`` of every ref span and cite span is replaced by its value
      in ``old_to_new``; ids that are empty or not in the mapping become
      ``None``.

    :param paragraphs: paragraph blobs carrying the span lists named above
    :param old_to_new: mapping from the ids used in the XML to the new ids
    :return: the same ``paragraphs`` list
    """
    def _remap_ref_id(span: Dict) -> None:
        old_ref_id = span['ref_id']
        # An empty ref_id (None, '' or an empty list) has nothing to look up.
        # It used to trip a debugging assert and abort the whole conversion;
        # it now defaults to None like any other unknown id.
        span['ref_id'] = old_to_new.get(old_ref_id) if old_ref_id else None

    # TODO: temp code to process body text into S2ORC format. this includes getting rid of sub/superscript spans.
    #       also combining fig & table spans into ref spans.
    #       also remapping the reference / bib labels to the new ones in `old_to_new`.
    for paragraph_blob in paragraphs:
        del paragraph_blob['sup_spans']
        del paragraph_blob['sub_spans']

        ref_spans = []
        for fig_tab_span in paragraph_blob['fig_spans'] + paragraph_blob['table_spans']:
            _remap_ref_id(fig_tab_span)
            ref_spans.append(fig_tab_span)
        paragraph_blob['ref_spans'] = ref_spans
        del paragraph_blob['fig_spans']
        del paragraph_blob['table_spans']

        for cite_span in paragraph_blob['cite_spans']:
            _remap_ref_id(cite_span)
    return paragraphs
+
+
def convert_jats_xml_to_s2orc_json(jats_file: str, log_dir: str):
    """
    Convert JATS XML to S2ORC JSON

    The file is parsed with BeautifulSoup. Tables, figures and supplementary
    material are extracted from the tree first, then the front, back and body
    tags are processed. The ids used inside the XML are replaced by
    ``TABREF<n>`` / ``FIGREF<n>`` / ``BIBREF<n>`` keys.

    :param jats_file: path to the JATS XML file. The paper id is the last
        ``/``-separated component of this path, cut at its first ``.``
    :param log_dir: not used by this function
    :return: a ``Paper`` built from the converted pieces
    """
    # get file id (PMC id usually)
    file_id = jats_file.split('/')[-1].split('.')[0]

    # read JATS XML
    with open(jats_file, 'r') as f_in:
        soup = BeautifulSoup(f_in, 'lxml')
        destroy_unimportant_tags_inplace(soup, tags_to_remove=['bold', 'italic', 'graphic'])

    # all the XML files have their own wonky reference IDs. we want to standardize them, but need to remember the old->new mapping
    old_key_to_new_key = {}

    # REFERENCES
    table_blobs = extract_table_blobs(soup)
    figure_blobs = extract_fig_blobs(soup)
    # TODO: not current represented in S2ORC, keep for later
    #       (the call still matters: it removes the supplementary tags from the tree)
    suppl_blobs = extract_suppl_blobs(soup)
    # TODO: for S2ORC, need to process them into a single ref dict. need to construct new IDs to match ID conventions. and update all cite spans.
    #       also, S2ORC table captions are free text without detected reference/citation mentions
    # TODO: may want to keep table representations around
    ref_entries = {}
    for i, (old_table_key, table_blob) in enumerate(sorted(table_blobs.items())):
        # TODO: PMC2557072 table `tbl5` has no label. skip.
        # TODO: PMC3137981 table `tab1` has no caption text. skip.
        if not table_blob['label'] or not table_blob['caption']:
            continue
        table_text = table_blob['label'] + ': ' + ' '.join(
            [c['text'] for c in table_blob['caption']]
        ) + '\n' + ' '.join([f['text'] for f in table_blob['footnote']])
        new_table_key = f'TABREF{i}'
        old_key_to_new_key[old_table_key] = new_table_key
        # TODO: skipping over any citations or references in the table for now
        # Always bind the content for THIS table: without the fallback a table
        # with no parsed XML would reuse the previous table's content, or hit
        # an unbound name when it is the first table.
        table_content = table_blob['xml'][0]['text'] if table_blob['xml'] else ''
        ref_entries[new_table_key] = {'text': table_text, 'content': table_content, 'type': 'table'}
    for i, (old_figure_key, figure_blob) in enumerate(sorted(figure_blobs.items())):
        # TODO: double-check, but it seems like figure blobs dont have footnotes parsed out? might be bug
        # TODO: PMC1326260 first figure has no ['label']. just skip these for now (because no inline references)
        # TODO: PMC2403743 has null-valued caption in `fig1`. also skip here. fix later.
        if not figure_blob['label'] or not figure_blob['caption']:
            continue
        figure_text = figure_blob['label'] + ': ' + ' '.join([c['text'] for c in figure_blob['caption']])
        new_figure_key = f'FIGREF{i}'
        old_key_to_new_key[old_figure_key] = new_figure_key
        ref_entries[new_figure_key] = {'text': figure_text, 'type': 'figure'}

    # FRONT TAGS
    front_tag = soup.find('front').extract()
    front_dict = process_front_tag(front_tag=front_tag, soup=soup)
    front_dict = postprocess_front_tags_for_s2orc(front_dict)

    # BACK TAGS
    back_tag = soup.find('back')
    back_dict = {}
    bib_entries = {}
    # PMC1139917 doesnt have 'back' tag
    if back_tag is not None:
        back_dict = process_back_tag(back_tag=back_tag)
        # TODO: format bib entries to S2ORC format. we're already very close, but need a couple changes:
        #       - author blobs include a 'suffix' which defaults to empty string
        #       - issn defaults to empty string
        #       - rename all the bib IDs
        for i, (old_bib_key, bib_entry) in enumerate(sorted(back_dict['bib_entries'].items())):
            del bib_entry['ref_id']
            new_bib_key = f'BIBREF{i}'
            old_key_to_new_key[old_bib_key] = new_bib_key
            bib_entries[new_bib_key] = bib_entry

    if back_dict and back_dict.get('acknowledgements'):
        back_dict['acknowledgements'] = convert_acks_to_s2orc(back_dict['acknowledgements'])

    # Convert the abstract only now that the bibliography ids are in the
    # mapping; doing it earlier mapped every citation in the abstract to None.
    front_dict['abstract'] = convert_paragraphs_to_s2orc(front_dict['abstract'], old_key_to_new_key)

    # BODY TAGS
    body_tag = soup.find('body')
    # PMC1240684 doesnt have 'body' tag
    if body_tag is not None:
        body_dict = process_body_tag(body_tag=body_tag, soup=soup)
        body_text = body_dict['body_text']
    else:
        # Has no body: /disk2/gorpus/20200101/pmc/Br_Foreign_Med_Chir_Rev/PMC5163425.nxml
        body_text = []

    body_text = convert_paragraphs_to_s2orc(body_text, old_key_to_new_key)

    metadata = {
        "title": front_dict['title'],
        "authors": front_dict['authors'],
        "year": front_dict['year'],
        "venue": front_dict['journal_name'],
        "identifiers": {
            "doi": front_dict['doi'],
            "pubmed_id": front_dict['pubmed_id'],
            "pmc_id": front_dict['pmc_id']
        }
    }

    return Paper(
        paper_id=file_id,
        pdf_hash="",
        metadata=metadata,
        abstract=front_dict['abstract'],
        body_text=body_text,
        back_matter=back_dict.get('acknowledgements', []),
        bib_entries=bib_entries,
        ref_entries=ref_entries
    )
+
+
if __name__ == '__main__':
    # Run the converter over the sample articles, one after another.
    for jats_file in (
        'tests/jats/PMC5828200.nxml',
        'tests/jats/PMC6398430.nxml',
        'tests/jats/PMC7417471.nxml',
    ):
        paper = convert_jats_xml_to_s2orc_json(jats_file, 'logs')

    print('done.')
\ No newline at end of file
diff --git a/s2orc-doc2json/doc2json/jats2json/pmc_utils/__init__.py b/s2orc-doc2json/doc2json/jats2json/pmc_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/s2orc-doc2json/doc2json/jats2json/pmc_utils/all_tag_utils.py b/s2orc-doc2json/doc2json/jats2json/pmc_utils/all_tag_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9cc1a5875d226e97bc5be8a8ea42b18b52b7086
--- /dev/null
+++ b/s2orc-doc2json/doc2json/jats2json/pmc_utils/all_tag_utils.py
@@ -0,0 +1,300 @@
+from typing import Dict, List, Callable
+
+import re
+import itertools
+
+from bs4 import BeautifulSoup
+
# String placeholders that stand in for inline tags while paragraph text is
# being parsed. The '#' family marks xref tags, the '@' family marks sup tags
# and the '&' family marks sub tags. Only the xref placeholder uses a
# separator, between the mention text, the rid and the ref-type.
START_TOKENS = {"#!start#", "@!start@", "&!start&"}
SEP_TOKENS = {"#!sep#"}
END_TOKENS = {"#!end#", "@!end@", "&!end&"}
# Every placeholder marker; used to scrub leftovers out of section titles.
ALL_TOKENS = START_TOKENS | SEP_TOKENS | END_TOKENS
+
+
def replace_xref_with_string_placeholders(soup_tag, soup):
    """Swap every xref tag under ``soup_tag`` for a placeholder string.

    The placeholder keeps the xref's text together with its ``rid`` and
    ``ref-type`` attributes (rendered as ``None`` when the attribute is
    absent), separated by ``#!sep#`` markers. ``soup`` is only used to create
    the replacement strings.
    """
    for xref_tag in soup_tag.find_all("xref"):
        attributes = xref_tag.attrs
        rid = attributes.get('rid')
        ref_type = attributes.get('ref-type')
        placeholder = f"#!start#{xref_tag.text}#!sep#{rid}#!sep#{ref_type}#!end#"
        xref_tag.replace_with(soup.new_string(placeholder))
+
+
def replace_sup_sub_tags_with_string_placeholders(soup_tag, soup):
    """Swap every sup and sub tag under ``soup_tag`` for a placeholder string.

    All sup tags are handled first, then all sub tags. The tag's text is kept
    between the start and end markers; ``soup`` is only used to create the
    replacement strings.
    """
    markers = (
        ("sup", "@!start@", "@!end@"),
        ("sub", "&!start&", "&!end&"),
    )
    for tag_name, opener, closer in markers:
        for script_tag in soup_tag.find_all(tag_name):
            script_tag.replace_with(soup.new_string(opener + script_tag.text + closer))
+
+
def recurse_parse_section(sec_tag) -> List[Dict]:
    """Recursive function for getting paragraph blobs to look like
    {
        'text': ...,
        ...,
        'section': SUBSUBSECTION_NAME :: SUBSECTION_NAME :: SECTION_NAME
    }

    A section without direct ``sec`` children is parsed as a leaf. Otherwise
    only the child sections are visited, and the title of ``sec_tag`` is
    appended to the ``section`` of every blob they produce. A section that has
    no title contributes an empty name instead of raising.
    """
    subsections = sec_tag.find_all("sec", recursive=False)
    if not subsections:
        return parse_all_paragraphs_in_section(sec_tag=sec_tag)

    # Look the title up once; it is the same for every blob below.
    title_tag = sec_tag.find("title")
    parent_title = title_tag.text if title_tag is not None else ""

    outputs = []
    for child in subsections:
        child_blobs = recurse_parse_section(sec_tag=child)
        for blob in child_blobs:
            # PMC373254 - process blob['section'] to remove any span markers left in there
            section = blob['section']
            for t in ALL_TOKENS:
                section = section.replace(t, '')
            blob["section"] = section + " :: " + parent_title
        outputs.extend(child_blobs)
    return outputs
+
+
+def _reduce_args(stack: List, end_token: str) -> List[List]:
+ """Helper function for `_parse_all_paragraphs_in_section`.
+
+ Pop arguments for the xref off the top of the stack and return a list of argument lists,
+ where the outer lists represent groups divided by separators."""
+ start_token = end_token.replace('end', 'start')
+ sep_token = end_token.replace('end', 'sep')
+ args = [[]]
+ while True:
+ token = stack.pop()
+ if token == start_token:
+ return args
+ elif token == sep_token:
+ args.insert(0, [])
+ else:
+ args[0].insert(0, token)
+
+
+def _add_spans(
+ end_token: str,
+ start_pos: int,
+ text: str,
+ ref_id,
+ ref_type,
+ cite_spans: List,
+ fig_spans: List,
+ table_spans: List,
+ sup_spans: List,
+ sub_spans: List,
+):
+ """Helper function used by `_parse_all_paragraphs_in_section`."""
+ if end_token.startswith("#"): # process xref
+ blob = {
+ "start": start_pos,
+ "end": start_pos + len(text),
+ "mention": text,
+ "ref_id": ref_id,
+ }
+ if ref_type == "bibr":
+ cite_spans.append(blob)
+ elif ref_type == "fig":
+ fig_spans.append(blob)
+ elif ref_type == "table":
+ table_spans.append(blob)
+
+ else:
+ blob = {
+ "start": start_pos,
+ "end": start_pos + len(text),
+ "mention": text,
+ }
+ if end_token.startswith("@"):
+ sup_spans.append(blob)
+ else:
+ assert end_token.startswith("&")
+ sub_spans.append(blob)
+
+
def get_latex_from_formula(
        formula_tag
):
    """Return the LaTeX carried by a formula tag, or ``None``.

    The text of the tag's ``tex-math`` child is searched for a
    ``\\begin{document}...\\end{document}`` wrapper; what sits between the two
    markers is returned with leading and trailing ``$`` characters removed.
    ``None`` is returned when there is no ``tex-math`` child or no wrapper.
    """
    tex_tag = formula_tag.find('tex-math')
    if not tex_tag:
        return None
    found = re.search(r'\\begin\{document\}(.+)\\end\{document\}', tex_tag.text)
    return found.group(1).strip('$') if found else None
+
+
def get_mathml_from_formula(
        formula_tag
):
    """Return the tag's ``mml:math`` child rendered with ``str()``, or ``None`` when there is none."""
    math_tag = formula_tag.find('mml:math')
    return str(math_tag) if math_tag else None
+
+
def parse_formulas(
        para_el,
        sp,
        replace
):
    """Replace the inline formulas of a paragraph and describe each of them.

    Every ``inline-formula`` tag under ``para_el`` gets a key
    ``INLINEFORM<n>``. With ``replace`` true the tag is swapped for that key;
    otherwise it is swapped for the formula text, unless the text is the
    ``FORMULA`` stand-in, in which case the tag is left alone.

    :param para_el: the paragraph tag (modified in place)
    :param sp: the soup object used to create the replacement strings
    :param replace: whether to substitute the key rather than the formula text
    :return: dict mapping each key to ``(text, latex, mathml, id attribute)``
    """
    # The LaTeX -> MathML conversion below relies on latex2mathml, which this
    # module never imported. Import it here and treat it as optional, so a
    # formula that only ships LaTeX no longer ends in a NameError.
    try:
        from latex2mathml import converter as latex_converter
    except ImportError:
        latex_converter = None

    # sub and get corresponding spans of inline formulas
    formula_dict = dict()
    eq_ind = 0
    for ftag in para_el.find_all('inline-formula'):
        try:
            formula_key = f'INLINEFORM{eq_ind}'
            eq_ind += 1

            math_tag = ftag.find('mml:math')
            if math_tag is not None:
                formula_text = math_tag.text
            elif 'begin{document}' not in ftag.text:
                formula_text = ftag.text
            else:
                # only a LaTeX document is available; no readable text
                formula_text = "FORMULA"

            formula_latex = get_latex_from_formula(ftag)
            formula_mathml = get_mathml_from_formula(ftag)
            if not formula_mathml and formula_latex and latex_converter is not None:
                formula_mathml = latex_converter.convert(formula_latex)
            formula_dict[formula_key] = (formula_text, formula_latex, formula_mathml, ftag.get('id'))

            if replace:
                ftag.replace_with(sp.new_string(f" {formula_key} "))
            else:
                # replace with mathml text if available
                if formula_text != 'FORMULA':
                    ftag.replace_with(sp.new_string(f" {formula_text} "))
        except AttributeError:
            continue

    return formula_dict
+
+
def parse_all_paragraphs_in_section(
        sec_tag,
        par_to_text: Callable = None,
        replace_formula=True
) -> List[Dict]:
    """Internal function. Assumes section has no nested tags

    Produces one blob per ``p`` tag found anywhere under ``sec_tag``. The
    placeholder markers left in the paragraph text are consumed: the text
    between a start and end marker stays in the output text, the xref metadata
    after a separator does not, and a span is recorded for each placeholder.

    :param sec_tag: the tag whose paragraphs are parsed (inline formulas in it
        are replaced in place)
    :param par_to_text: optional function that converts the `par` tag into a
        string. by default, uses `par_tag.text`.
    :param replace_formula: passed to `parse_formulas` as its `replace` flag
    :return: list of blobs with ``text``, the span lists and ``section`` (the
        text of the first ``title`` tag under ``sec_tag``, or ``""``)
    :raises NotImplementedError: when a paragraph contains a
        ``display-formula`` or ``formula`` tag
    """
    outputs = []
    sp = BeautifulSoup('', 'lxml')
    for par_tag in sec_tag.find_all("p", recursive=True):
        cite_spans = []
        fig_spans = []
        table_spans = []
        # suppl_spans = []
        sup_spans = []
        sub_spans = []

        if par_tag.find('display-formula'):
            raise NotImplementedError('Display formula!')

        if par_tag.find('formula'):
            raise NotImplementedError('Formula!')

        formula_dict = parse_formulas(par_tag, sp, replace_formula)

        par_text = par_to_text(par_tag) if par_to_text else par_tag.text
        par_text = re.sub(
            r"[^\S\n\t]", " ", par_text
        )  # replaces whitespace but not newline or tab
        par_text = re.sub(
            r"  ", " ", par_text
        )  # replaces two spaces w/ one

        # Tokenize the text into normal text and special placeholder tokens.
        pattern = r"(#!start#)|(#!sep#)|(#!end#)|(@!start@)|(@!end@)|(&!start&)|(&!end&)"
        tokens = [tok for tok in re.split(pattern, par_text) if tok]

        # To handle nested structures, use a shift-reduce algorithm to consume the text. Placeholder tags are merged away, and related spans are registered.
        stack = []
        full_text = []
        pos = 0
        disable_count = False
        for token in tokens:
            if token in START_TOKENS:
                # remember where the span starts, then open its text group
                stack.append(token)
                stack.append(pos)
                stack.append(token.replace('start', 'sep'))
            elif token in SEP_TOKENS:
                assert stack
                stack.append(token)
                disable_count = True
            elif token in END_TOKENS:
                assert stack
                disable_count = False
                args = _reduce_args(stack, token)
                start_pos = args[0][0]
                text = "".join(args[1])
                # 2 groups: position + text (sup/sub); 4 groups: + rid + ref-type (xref)
                assert len(args) == 2 or len(args) == 4
                if len(args) == 2:
                    ref_id, ref_type = None, None
                elif len(args) == 4:
                    ref_id = args[2] and args[2][0]
                    ref_type = args[3] and args[3][0]
                # the merged text becomes plain text for any enclosing span
                stack.append(text)
                _add_spans(
                    token,
                    start_pos,
                    text,
                    ref_id,
                    ref_type,
                    cite_spans,
                    fig_spans,
                    table_spans,
                    sup_spans,
                    sub_spans,
                )
            else:  # just normal text
                stack.append(token)
                if not disable_count:  # skip metadata appearing after a separator
                    full_text.append(token)
                    pos += len(token)

        full_text = "".join(full_text)
        assert pos == len(full_text)

        title = sec_tag.find("title")
        title = title.text if title else ""

        # get all equation spans
        eq_spans = []
        for span in itertools.chain(
                re.finditer(r'(INLINEFORM\d+)', full_text),
                re.finditer(r'(DISPLAYFORM\d+)', full_text)
        ):
            try:
                matching_formula = formula_dict[span.group()]
                eq_spans.append({
                    "start": span.start(),
                    "end": span.start() + len(span.group()),
                    "text": matching_formula[0],
                    "latex": matching_formula[1],
                    "mathml": matching_formula[2],
                    "ref_id": span.group()
                })
            except KeyError:
                continue

        outputs.append(
            {
                "text": full_text,
                'cite_spans': cite_spans,
                'fig_spans': fig_spans,
                'table_spans': table_spans,
                # 'suppl_spans': suppl_spans,
                'sup_spans': sup_spans,
                'sub_spans': sub_spans,
                'eq_spans': eq_spans,
                "section": title,
            }
        )
    return outputs
diff --git a/s2orc-doc2json/doc2json/jats2json/pmc_utils/back_tag_utils.py b/s2orc-doc2json/doc2json/jats2json/pmc_utils/back_tag_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7502dff8f0fd39e9a5e925ef11b5e1b9ca9564a0
--- /dev/null
+++ b/s2orc-doc2json/doc2json/jats2json/pmc_utils/back_tag_utils.py
@@ -0,0 +1,56 @@
+from typing import Dict, List
+
+
+def _wrap_text(tag):
+ return tag.text if tag else ''
+
+
def parse_authors(authors_tag) -> List:
    """The PMC XML has a slightly different format than authors listed in front tag.

    Builds one dict per direct ``name`` child of ``authors_tag``. The
    ``given-names`` text is split on single spaces: the first piece is the
    first name and the rest are middle names. Missing parts default to ``''``
    (``[]`` for the middle names). A falsy ``authors_tag`` gives ``[]``.
    """
    if not authors_tag:
        return []

    authors = []
    for name_tag in authors_tag.find_all('name', recursive=False):
        surname_tag = name_tag.find('surname')
        given_tag = name_tag.find('given-names')
        suffix_tag = name_tag.find('suffix')

        name_parts = given_tag.text.split(' ') if given_tag else None
        if name_parts:
            first, middle = name_parts[0], name_parts[1:]
        else:
            first, middle = '', []

        authors.append({
            'first': first,
            'middle': middle,
            'last': surname_tag.text if surname_tag else '',
            'suffix': suffix_tag.text if suffix_tag else ''
        })
    return authors
+
+
def parse_bib_entries(back_tag) -> Dict:
    """Build the bibliography dict from the ``ref-list`` of the back matter.

    Entries are keyed by the ``id`` attribute of each ``ref`` tag; the text of
    the ref's ``label`` tag is stored under ``ref_id``. An empty dict is
    returned when ``back_tag`` has no ``ref-list``.
    """
    # TODO: PMC2778891 does not have 'ref-list' in its back_tag. do we even need this, or can directly .find_all('ref')?
    ref_list_tag = back_tag.find('ref-list')
    if not ref_list_tag:
        return {}

    bib_entries = {}
    for ref_tag in ref_list_tag.find_all('ref'):
        # The ref ID and label are semantically swapped between CORD-19 and PMC, lol
        entry_key = ref_tag['id']
        label_tag = ref_tag.find('label')
        authors_tag = ref_tag.find('person-group', {'person-group-type': 'author'})
        year_tag = ref_tag.find('year')
        first_page = ref_tag.find('fpage')
        last_page = ref_tag.find('lpage')

        if first_page and last_page:
            pages = f'{first_page.text}-{last_page.text}'
        else:
            pages = None

        if year_tag and year_tag.text.isdigit():
            year = int(year_tag.text)
        else:
            year = None

        doi_tags = ref_tag.find_all('pub-id', {'pub-id-type': 'doi'})

        bib_entries[entry_key] = {
            'ref_id': _wrap_text(label_tag),
            'title': _wrap_text(ref_tag.find('article-title')),
            'authors': parse_authors(authors_tag),
            'year': year,
            'venue': _wrap_text(ref_tag.find('source')),
            'volume': _wrap_text(ref_tag.find('volume')),
            # NOTE(review): 'issn' is filled from the `issue` tag - confirm this is intended
            'issn': _wrap_text(ref_tag.find('issue')),
            'pages': pages,
            'other_ids': {
                'DOI': [doi_tag.text for doi_tag in doi_tags],
            }
        }
    return bib_entries
\ No newline at end of file
diff --git a/s2orc-doc2json/doc2json/jats2json/pmc_utils/extract_utils.py b/s2orc-doc2json/doc2json/jats2json/pmc_utils/extract_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c95c456fa584097591ad5d08db48713cfb72a28
--- /dev/null
+++ b/s2orc-doc2json/doc2json/jats2json/pmc_utils/extract_utils.py
@@ -0,0 +1,106 @@
+
+from typing import Dict
+
+import bs4
+from bs4 import BeautifulSoup
+
+from doc2json.jats2json.pmc_utils.all_tag_utils import parse_all_paragraphs_in_section
+
+
def extract_fig_blobs(body_tag) -> Dict:
    """Remove every ``fig`` tag from the tree and return the figures by id.

    Every figure is extracted from ``body_tag``. Figures without an ``id``
    attribute are dropped rather than raising ``KeyError``, matching how
    tables without an id are handled. Captions are converted to paragraph
    blobs before returning.

    :param body_tag: the tag to search (modified in place)
    :return: dict mapping figure id to ``{'label': ..., 'caption': ...}``
    """
    fig_blobs = {}
    for fig_tag in body_tag.find_all('fig'):
        fig = fig_tag.extract()
        fig_id = fig.get('id')
        if not fig_id:
            # nothing in the text can reference a figure without an id
            continue
        label = fig.find('label')
        fig_blobs[fig_id] = {
            'label': label and label.text,
            'caption': fig.find('caption')
        }
    _update_fig_blobs(fig_blobs)
    return fig_blobs
+
+
def _update_fig_blobs(fig_blobs: Dict):
    """Replace each figure's caption tag by its parsed paragraph blobs, in place.

    Figures whose caption is ``None`` are left untouched. The ``section`` key
    is removed from every caption blob.
    """
    for fig_blob in fig_blobs.values():
        caption = fig_blob['caption']
        if caption is None:
            continue

        # replace non-p tags w/ p tags in figure caption (mostly dealing with title tags, which weren't being extracted before)
        for child in caption:
            if type(child) == bs4.element.Tag and child.name != 'p':
                child.name = 'p'

        caption_blobs = parse_all_paragraphs_in_section(sec_tag=caption, replace_formula=False)
        for caption_blob in caption_blobs:
            caption_blob.pop('section')
        fig_blob['caption'] = caption_blobs
+
+
def extract_table_blobs(body_tag) -> Dict:
    """Remove every ``table-wrap`` tag from the tree and return the tables by id.

    Every table is extracted from ``body_tag``; only those with an ``id``
    attribute are kept. Caption, footnote and table XML are converted to
    paragraph blobs before returning.

    :param body_tag: the tag to search (modified in place)
    :return: dict mapping table id to a dict with ``label``, ``caption``,
        ``footnote`` and ``xml``
    """
    # note 1: footnotes dont always exist for each table; hence the fallback below
    # note 2: we want to preserve the XML tags for tables, but also need to run it through the regex cleaner for xrefs and other spans
    #         hence, wrapping all of the table XML text into a fake paragraph tag
    table_blobs = {}
    for table_tag in body_tag.find_all('table-wrap'):
        table = table_tag.extract()
        # TODO: currently restricting to tables with identifiers. might want to include unreferenced tables once we care more.
        table_id = table.get('id')
        if not table_id:
            continue
        label = table.find('label')
        # NOTE(review): the two string literals below lost their markup in the
        # source. The `<p>` wrapper around the table follows note 2; the empty
        # paragraph used as footnote fallback is inferred - confirm.
        footnote = table.find('table-wrap-foot') or BeautifulSoup('<p></p>', 'xml')
        table_blobs[table_id] = {
            'label': label and label.text,
            'caption': table.find('caption'),
            'footnote': footnote,
            'xml': BeautifulSoup('<p>' + str(table.find('table')) + '</p>', 'xml')
        }
    _update_table_blobs(table_blobs)
    return table_blobs
+
+
def _update_table_blobs(table_blobs: Dict):
    """Replace each table's caption, footnote and XML by parsed paragraph blobs, in place.

    Parts that are ``None`` are left untouched. The ``section`` key is removed
    from every resulting blob.
    """
    def _parse_without_section(container, **parse_options):
        blobs = parse_all_paragraphs_in_section(
            sec_tag=container, replace_formula=False, **parse_options
        )
        for blob in blobs:
            del blob['section']
        return blobs

    for table_blob in table_blobs.values():
        caption = table_blob['caption']
        if caption is not None:
            # replace non-p tags w/ p tags in table caption (mostly dealing with title tags, which weren't being extracted before)
            for child in caption:
                if type(child) == bs4.element.Tag and child.name != 'p':
                    child.name = 'p'
            table_blob['caption'] = _parse_without_section(caption)

        footnote = table_blob['footnote']
        if footnote is not None:
            table_blob['footnote'] = _parse_without_section(footnote)

        # note: if we dont include `par_to_text` function, the parser will convert all tags to text via `par_tag.text`
        #       which actually removes all XML tags we wanted to preserve in table.
        #       by passing in str(), we ensure to keep all of those tags
        table_xml = table_blob['xml']
        if table_xml is not None:
            table_blob['xml'] = _parse_without_section(table_xml, par_to_text=str)
+
+
def extract_suppl_blobs(body_tag) -> Dict:
    """Remove every ``supplementary-material`` tag from the tree and return those with an id.

    Every supplementary tag is extracted from ``body_tag``; only those with an
    ``id`` attribute are kept. Captions are converted to paragraph blobs
    before returning.

    :param body_tag: the tag to search (modified in place)
    :return: dict mapping the id to ``{'label': ..., 'caption': ...}``
    """
    suppl_blobs = {}
    for suppl_tag in body_tag.find_all('supplementary-material'):
        suppl = suppl_tag.extract()
        # We only care about supplementary material that can be referenced (like figures/tables)
        # for example, we dont care about PMC1139917 which has supplementary material but without an ID
        # (`'id' in suppl` would look at the tag's children, not its attributes)
        suppl_id = suppl.get('id')
        if suppl_id:
            label = suppl.find('label')
            suppl_blobs[suppl_id] = {
                'label': label and label.text,
                'caption': suppl.find('caption')
            }
    _update_suppl_blobs(suppl_blobs)
    return suppl_blobs
+
+
def _update_suppl_blobs(suppl_blobs: Dict):
    """Replace each supplementary caption tag by its parsed paragraph blobs, in place.

    Entries whose caption is ``None`` are left untouched. The ``section`` key
    is removed from every caption blob.
    """
    for suppl_blob in suppl_blobs.values():
        caption = suppl_blob['caption']
        if caption is not None:
            caption_blobs = parse_all_paragraphs_in_section(sec_tag=caption)
            for caption_blob in caption_blobs:
                caption_blob.pop('section')
            suppl_blob['caption'] = caption_blobs
diff --git a/s2orc-doc2json/doc2json/jats2json/pmc_utils/front_tag_utils.py b/s2orc-doc2json/doc2json/jats2json/pmc_utils/front_tag_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d192185cc4c30aa6a77f01b25f33872f1dc6567d
--- /dev/null
+++ b/s2orc-doc2json/doc2json/jats2json/pmc_utils/front_tag_utils.py
@@ -0,0 +1,381 @@
+"""
+
+Functions for parsing specific `front_tag` soup tags
+
+"""
+
+from typing import Dict, List, Optional
+
+from collections import Counter
+
+import re
+
+
+from doc2json.jats2json.pmc_utils.all_tag_utils import recurse_parse_section, parse_all_paragraphs_in_section, \
+ replace_sup_sub_tags_with_string_placeholders, replace_xref_with_string_placeholders
+
+
class NoAuthorNamesError(Exception):
    """Raised by the author parser when a name has no ``given-names`` entry.

    Known papers that trigger:
        - PMC3462967
    """
+
+
def parse_journal_id_tag(front_tag) -> str:
    """Return the most frequent ``journal-id`` text and remove those tags.

    The front matter can repeat the journal id under several ``journal-id``
    tags, e.g. 'Neurosci J', 'Neurosci J', 'NEUROSCIENCE', or 'BMC Biochem'
    twice. The text that occurs most often wins. Every ``journal-id`` tag is
    decomposed from ``front_tag`` along the way.
    """
    occurrences = Counter()
    for id_tag in front_tag.find_all('journal-id'):
        occurrences.update([id_tag.text])
        id_tag.decompose()
    return occurrences.most_common(1)[0][0]
+
+
def parse_journal_name_tag(front_tag) -> str:
    """Extract the ``journal-title`` tag from ``front_tag`` and return its text.

    Examples of titles: 'BMC Biochemistry', 'Neuroscience Journal'.

    The lookup searches all of ``front_tag`` because not all titles are
    contained within a `journal-title-group`. See PMC1079901, whose title is
    'BioMedical Engineering OnLine'.

    Raises ``Exception`` when more than one ``journal-title`` tag is present.
    """
    title_tags = front_tag.find_all('journal-title')
    if len(title_tags) > 1:
        raise Exception('Multiple journal titles?!')
    journal_title = front_tag.find('journal-title').extract()
    return journal_title.text
+
+
def parse_pubmed_id_tag(front_tag) -> Optional[str]:
    """Extract the ``article-id`` tag of type ``pmid`` and return its text.

    Not every PMC paper has a PMID; ``None`` is returned in that case.
    """
    pmid_tag = front_tag.find('article-id', {'pub-id-type': 'pmid'})
    return None if pmid_tag is None else pmid_tag.extract().text
+
+
def parse_pmc_id_tag(front_tag) -> str:
    """Extract the ``article-id`` tag of type ``pmc`` and return its text prefixed with ``PMC``."""
    pmc_tag = front_tag.find('article-id', {'pub-id-type': 'pmc'})
    pmc_number = pmc_tag.extract().text
    return f"PMC{pmc_number}"
+
+
def parse_doi_tag(front_tag) -> Optional[str]:
    """Extract the ``article-id`` tag of type ``doi`` and return its text.

    Not all papers have a DOI; ``None`` is returned in that case.
    """
    doi_tag = front_tag.find('article-id', {'pub-id-type': 'doi'})
    if doi_tag is None:
        return None
    return doi_tag.extract().text
+
+
def parse_title_tag(front_tag) -> str:
    """Extract the ``title-group`` from ``front_tag`` and return its article title.

    Example title: "Association of Strength and Physical Functions in People
    with Parkinson's Disease".

    Want to restrict to `title-group` because sometimes title shows up in
    under self-citation.

    Raises ``Exception`` when the group holds more than one ``article-title``.
    """
    title_group = front_tag.find('title-group').extract()
    title_tags = title_group.find_all('article-title')
    if len(title_tags) > 1:
        raise Exception('Multiple article titles?!')
    return title_group.find('article-title').text
+
+
def parse_category_tag(front_tag) -> str:
    """Extract ``article-categories`` from ``front_tag`` and return its subject text.

    Example subject: 'Research Article'.

    A single string is returned (the annotation used to claim a list).

    Raises ``Exception`` when ``front_tag`` holds more than one ``subj-group``
    or more than one ``subject`` tag.
    """
    subject_groups = front_tag.find_all('subj-group')
    subjects = front_tag.find_all('subject')
    if len(subject_groups) > 1 or len(subjects) > 1:
        raise Exception('Multiple categories?!')
    article_categories = front_tag.find('article-categories').extract()
    return article_categories.find('subject').text
+
+
def parse_date_tag(front_tag) -> Dict:
    """Collect the dates of the front matter and remove their tags.

    Two sets of tags contain dates: ``pub-date`` tags, keyed by their
    ``pub-type`` attribute, and ``date`` tags, keyed by their ``date-type``
    attribute. When the attribute is missing the key is ``MISSING_PUB_TYPE``
    or ``MISSING_DATE_TYPE``.

    Each value joins the texts of the ``year``, ``month`` and ``day`` children
    that are present with ``-``, e.g. '2018-12-12' or just '2018'.
    PMC2557072 has `date` tag with no `day`, only `year` and `month`.

    Every processed tag is decomposed from ``front_tag``.
    """
    out = {}
    date_sources = (
        ('pub-date', 'pub-type', 'MISSING_PUB_TYPE'),
        ('date', 'date-type', 'MISSING_DATE_TYPE'),
    )
    for tag_name, type_attribute, fallback_key in date_sources:
        for date_tag in front_tag.find_all(tag_name):
            parts = [date_tag.find(part_name) for part_name in ('year', 'month', 'day')]
            out[date_tag.get(type_attribute, fallback_key)] = '-'.join(
                [part.text for part in parts if part is not None]
            )
            date_tag.decompose()
    return out
+
+
def parse_funding_groups(front_tag) -> List[Dict]:
    """Collect funding information from the front matter.

    One dict with the keys ``name``, ``doi`` and ``notes`` is produced for
    every ``funding-source`` or ``funding-statement`` tag that does not itself
    contain another tag of either kind. Child tags that supply a name or an
    identifier are extracted from the tree as they are consumed, so the text
    read afterwards no longer includes them.

    :param front_tag: the soup tag holding the front matter (modified in place)
    :return: list of funding dicts
    """
    outs = []
    for tag in front_tag.find_all():

        # AND statement skips cases where the two tag types nest within each other; we only process the inner one
        if (tag.name == 'funding-source' or tag.name == 'funding-statement') and tag.find('funding-source') is None and tag.find('funding-statement') is None:

            out = {
                'name': None,
                'doi': None,
                'notes': None,
                # 'raw': str(tag)  # for debugging
            }

            # handle institution
            institution_id_tag = tag.find('institution-id')
            if institution_id_tag:
                out['doi'] = institution_id_tag.extract().text.replace('http://dx.doi.org/', '')
            institution_tag = tag.find('institution')
            if institution_tag:
                out['name'] = tag.find('institution').extract().text

            # handle named content; these override the institution values above
            funder_name_tag = tag.find('named-content', {'content-type': 'funder-name'})
            if funder_name_tag:
                out['name'] = funder_name_tag.extract().text

            funder_id_tag = tag.find('named-content', {'content-type': 'funder-identifier'})
            if funder_id_tag:
                out['doi'] = funder_id_tag.extract().text.replace('http://dx.doi.org/', '')

            # handle urls; an xlink:href on the tag itself wins over the ids above
            if tag.get('xlink:href'):
                out['doi'] = tag['xlink:href']

            # fix DOIs with URLs in them
            if out['doi']:
                match = re.search(r'http(s?)://dx.doi.org/(.+)', out['doi'])
                if match:
                    out['doi'] = match.group(2)

            # remainder text is either a name or a full statement
            # (read after the extractions above, so consumed children are gone)
            text = tag.text
            if tag.name == 'funding-statement' or ('fund' in text or 'support' in text or 'provide' in text):
                out['notes'] = text
            else:
                # what if something already in 'name'? observed it's typically empty string; so ignore.
                if not out['name']:
                    out['name'] = text

            # if DOI link is in the name, remove it and parse (PMC5407128)
            if out['name'] and not out['doi']:
                pattern = r'\s*http(s?)://dx.doi.org/(.+)$'
                match = re.search(pattern, out['name'])
                if match:
                    out['doi'] = match.group(2)
                    out['name'] = re.sub(pattern, r'', out['name'])

            outs.append(out)
    return outs
+
+
# TODO: group (collab) names are not handled; they seemed rare and inconsistent, so only contribs that carry a name tag are parsed
def parse_authors(front_tag) -> List[Dict]:
    """
    Parse the individual authors listed in 'contrib' tags under `front_tag`.

    Contribs that wrap other contribs (groups) and contribs without a 'name'
    child are skipped.

    :param front_tag: tag to search for 'contrib' descendants
    :return: one dict per author with keys 'first', 'middle', 'last', 'suffix',
        'email', 'affiliation_ids', 'corresponding' and 'orcid'
    :raises NoAuthorNamesError: if a named contrib lacks 'given-names' or 'surname'
    """
    authors = []
    for contrib_tag in front_tag.find_all('contrib'):

        # skip nesting; the nested children (individual authors) are visited by this same loop
        if contrib_tag.find_all('contrib'):
            continue

        # skip contribs without a name
        name_tag = contrib_tag.find('name')
        if name_tag is None:
            continue

        # corresponding author: flagged by attribute or by an xref of type 'corresp'
        is_corresp = bool(
            contrib_tag.get('corresp') == 'yes'
            or contrib_tag.find('xref', {'ref-type': 'corresp'})
        )

        # orcid ID is sometimes a URL or just a number. standardize as hyphenized number.
        orcid_id = None
        contrib_id_tag = contrib_tag.find('contrib-id')
        if contrib_id_tag:
            orcid_id = contrib_id_tag.text
            match = re.search(r'http(s?)://orcid\.org/(.+)', orcid_id)
            if match:
                orcid_id = match.group(2)
            # anything that is not 19 characters long (e.g. other ID types) is discarded
            if len(orcid_id) != 19:
                orcid_id = None

        # Email may or may not be present.
        email_tag = contrib_tag.find('email')
        email = email_tag.text if email_tag else None

        # Map each child of the name tag (e.g. surname, given-names, suffix) to its text.
        name_info = {part.name: part.text for part in name_tag.find_all()}
        # Some entries (e.g. the erratum PMC3462967) have no given names; a missing
        # surname is reported the same way instead of leaking a bare KeyError.
        try:
            given_names = name_info['given-names'].split(' ')
            surname = name_info['surname']
        except KeyError as err:
            raise NoAuthorNamesError from err

        authors.append({
            'first': given_names[0] if given_names else None,
            'middle': given_names[1:] if given_names else None,
            'last': surname,
            'suffix': name_info.get('suffix', ''),
            'email': email,
            'affiliation_ids': [xref_tag.get('rid') for xref_tag in contrib_tag.find_all('xref', {'ref-type': 'aff'})],
            'corresponding': is_corresp,
            'orcid': orcid_id
        })

    return authors
+
+
def parse_affiliations(front_tag) -> List[Dict]:
    """
    Parse every 'aff' tag under `front_tag` into an id / other ids / text record.

    Affiliations are sometimes nested alongside the authors and sometimes
    listed separately; not all of them have an 'id' attribute.

    The first 'label' and first 'sup' child, and every 'institution-id' child,
    are decomposed while parsing, so `front_tag` is modified by this call.

    :param front_tag: tag to search for 'aff' descendants
    :return: list of dicts with keys 'id', 'other_ids' (institution-id-type -> id text)
        and 'text' (remaining surface form of the affiliation)
    """
    outs = []
    for aff_tag in front_tag.find_all('aff'):
        # get rid of unused markers so `.text` is cleaner
        label_tag = aff_tag.find('label')
        if label_tag:
            label_tag.decompose()
        sup_tag = aff_tag.find('sup')
        if sup_tag:
            sup_tag.decompose()  # same treatment as label

        aff_id = aff_tag.get('id')

        # We keep the full affiliation surface form, but pull out institution IDs
        # (e.g. ISNI, GRID) first and decompose their tags because they do not
        # contribute to the surface form. All 'institution-id' tags of the
        # affiliation are collected, regardless of how many 'institution-wrap'
        # parents there are (a debugger breakpoint used to fire for more than one).
        id_type_to_id = {}
        for institution_id_tag in aff_tag.find_all('institution-id'):
            id_type_to_id[institution_id_tag['institution-id-type']] = institution_id_tag.text
            institution_id_tag.decompose()

        # TODO: processing of text: there are a lot of random newline chars (cuz XML preserves page layout)
        #   --> replace them with whitespace if there's preceding punctuation char
        #   --> otherwise, replace them with comma
        text = aff_tag.text

        outs.append({
            'id': aff_id,
            'other_ids': id_type_to_id,
            'text': text
        })

    return outs
+
+
def parse_abstract_tag(front_tag, soup) -> List[Dict]:
    """
    Pull the abstract out of `front_tag` and return its paragraphs.

    Not every paper has an abstract, and not every abstract is structured
    into sections (see PMC1914226 for an abstract made of plain paragraphs).
    The 'abstract' tag is extracted from `front_tag` in the process.

    :param front_tag: tag that may contain an 'abstract' child
    :param soup: forwarded to the xref / sup-sub placeholder helpers
    :return: paragraph blobs, each with 'section' set to 'Abstract';
        empty list when there is no abstract
    """
    # TODO: are there abstracts that mix bare text with paragraph tags?
    located = front_tag.find('abstract')
    if not located:
        return []
    abstract_tag = located.extract()

    # replace all xref tags with string placeholders
    replace_xref_with_string_placeholders(soup_tag=abstract_tag, soup=soup)

    # replace all sup/sub tags with string placeholders
    replace_sup_sub_tags_with_string_placeholders(soup_tag=abstract_tag, soup=soup)

    if abstract_tag.find('sec'):
        paragraphs = []
        for sec_tag in abstract_tag.find_all('sec', recursive=False):
            paragraphs.extend(recurse_parse_section(sec_tag=sec_tag))
    else:
        paragraphs = parse_all_paragraphs_in_section(sec_tag=abstract_tag)

    abstract: List[Dict] = []
    for blob in paragraphs:
        # these 'sections' typically show up as empty string
        blob['section'] = 'Abstract'
        abstract.append(blob)
    return abstract
\ No newline at end of file
diff --git a/s2orc-doc2json/doc2json/jats2json/pmc_utils/tests.py b/s2orc-doc2json/doc2json/jats2json/pmc_utils/tests.py
new file mode 100644
index 0000000000000000000000000000000000000000..f296c141efa6f87651b014a4cb0cafcfa1a4f652
--- /dev/null
+++ b/s2orc-doc2json/doc2json/jats2json/pmc_utils/tests.py
@@ -0,0 +1,347 @@
+
+funding_tags_and_parsed_dicts = [
+ # is typically the top-level tag
+ #
+ # within, we see and as containing the main information we want
+ #
+ # here, we see with an 'id' attribute. we can ignore these.
+ ("""
+
+ Wellcome Trust
+
+ """, None),
+ # sometimes, there are also tags, but we can ignore these. they're funding-group specific.
+ ("""
+
+ US Department of Energy's Office of Science, Biological and Environmental Research Program
+ DE-AC02-05CH11231
+ DE-AC52-07NA27344
+ DE-AC02-06NA25396
+ DE-AC05-00OR22725
+
+
+ German Research Foundation
+ INST 599/1-2
+
+ """, None),
+
+ # is a less structured alternative to
+ ("""
+ No sources of funding were used to assist in the preparation of this study.
+ """, None),
+
+ # Rarely, there is nesting! ignore parents.
+ ("""
+
+ This work was supported by the Swedish Association for Sexuality Education (RFSU).
+
+ """, None),
+
+
+ # Sometimes both can occur, sort of duplicating the same information.
+ # For example "Cornell" is mentioned as both a and a
+ ("""
+
+
+ Cornell University Institute for the Social Sciences
+
+
+ The research was supported by a grant from the Cornell University Institute for the Social Sciences.
+ """, None),
+
+ # many
+ ("""
+
+ Brien Holden Vision Institute
+
+
+ Australian Federal Government
+
+
+ International Postgraduate Research Scholarship (Cathleen Fedtke)
+
+
+ University of New South Wales, Australia
+
+
+ National Institutes of Health
+ P30EY14801
+
+
+ Florida Lions Eye Bank
+
+
+ Bascom Palmer Eye Institute
+
+ """, None),
+
+ # institutions can optionally occur within
+ # 'institution-id-type' is common, but also optional
+ # regardless of the institution ID type, it looks like the ID is always a DOI (or URL to a DOI)
+ ("""
+
+
+
+ http://dx.doi.org/10.13039/100000025
+ National Institute of Mental Health
+
+
+ R01MH107333
+
+ Kim Woong-Ki
+
+
+ """, None),
+ ("""
+
+
+
+ Deutsche Forschungsgemeinschaft
+ http://search.crossref.org/fundref?q=501100001659
+
+
+ Re 628/16-1
+ GRK 1216
+
+ """, None),
+ ("""
+
+
+
+ National Institutes of Health
+ 10.13039/100000002
+
+
+
+ """, None),
+
+ # handing
+ ("""
+
+
+ Austrian Science Fund
+ 10.13039/501100002428
+
+ P 27625
+
+ This work was supported by Austrian Science Fund [grant number P 27625].
+ """, None),
+
+ # handling xlink:href attributes
+ ("""
+
+ Economic and Social Research Council
+ RES-360-25-0032
+
+
+ Wellcome Trust
+ 106542/Z/14/Z
+
+ """, None)
+]
+
+acknowledgement_tags_and_parsed_dicts = [
+ # variants with may/may not have a . always have but may/may not have
.
never has attributes.
+ # the text might contain or tags.
+ # the tags have required attributes 'ext-link-type' and 'xlink:href', and optional attribute 'id'. all the are URLs.
+ ("""
+ Acknowledgements
+ The authors thank the BBSRC (Project Grants BB/M025349/1 and BB/P011969/1) for its continued support, and appreciate the helpful comments of Dr Rob Young, Cardiff University School of Optometry and Vision Sciences.
+ """, {
+ 'text': 'The authors thank the BBSRC (Project Grants BB/M025349/1 and BB/P011969/1) for its continued support, and appreciate the helpful comments of Dr Rob Young, Cardiff University School of Optometry and Vision Sciences.',
+ 'funding': [{'text': 'BBSRC', 'id': 'gs0005'}],
+ 'url': None}),
+ ("""
+ Supported by AA-11431 and AA-12908 from the National Institutes of Health and the Tobacco-Related Disease Research Program Grant 17RT-0171.
+ """, {
+ 'text': 'Supported by AA-11431 and AA-12908 from the National Institutes of Health and the Tobacco-Related Disease Research Program Grant 17RT-0171.',
+ 'funding': [],
+ 'url': None}),
+ ("""
+ Acknowledgements
+ This work was supported by the National Institutes of Health,National Cancer Institute grants R01CA196967 and R01CA209886.
+ """, {
+ 'text': 'This work was supported by the National Institutes of Health,National Cancer Institute grants R01CA196967 and R01CA209886.',
+ 'funding': [],
+ 'url': None}),
+ ("""
+ Data accessibility
+ The data used is included in the RepeatABEL package available at https://cran.r-project.org/web/packages/RepeatABEL .
+ """, {
+ 'text': 'The data used is included in the RepeatABEL package available at https://cran.r-project.org/web/packages/RepeatABEL.',
+ 'funding': [],
+ 'url': 'https://cran.r-project.org/web/packages/RepeatABEL'}),
+ # variants with are similar to the above.
+ ("""
+ Acknowledgments
+ D.B.K. thanks Prof. Nigel Harper for a very useful discussion. We also thank the referees and the journal editors for exceptionally careful and thoughtful reviews that helped improve the manuscript considerably.
+ """, {
+ 'text': 'D.B.K. thanks Prof. Nigel Harper for a very useful discussion. We also thank the referees and the journal editors for exceptionally careful and thoughtful reviews that helped improve the manuscript considerably.',
+ 'funding': [],
+ 'url': None}),
+ ("""
+ Conflict of interest
+ The authors declare there is no conflict of interest associated with this manuscript.
+ """, {
+ 'text': 'The authors declare there is no conflict of interest associated with this manuscript.',
+ 'funding': [],
+ 'url': None})
+]
+
+affiliation_tags_and_parsed_dicts = [
+ # mix of tags with and without IDs
+ ("""Department of Internal Medicine, Division of Cardiology, Inha University Hospital, Incheon, South Korea """, None),
+ ("""1 Department of Cardiology, Atatürk Chest Diseases and Chest Surgery Training and Research Hospital; Ankara-Turkey """, None),
+ # there can exist a tag with/without IDs
+ ("""3 Center for Medical Education, Sapporo Medical University, Sapporo, Japan """, None),
+ # sometimes, the marker used in paper is kept also. for example, `1` in superscript.
+ # this can exist with/without the tag. as in, it's inconsistent whether the marker is encapsulated in or kept as string
+ ("""\n1 Department of Orthodontics, College of Dentistry, King Khalid University, Abha, Saudi Arabia """, None),
+ ("""1 University of Dundee """, None),
+ # tags can be straightforward; just ignore and grab text
+ ("""1 School of Chemistry, The University of Manchester, Manchester, United Kingdom """, None),
+ # sometimes tags can have SIBLING tags, like or
+ ("""2 Sr. Consultant & Head, Dept. of Neurology, National Neurosciences Centre, Peerless Hospital , Kolkata, India """, None),
+ ("""2 Institute for Transplantation Diagnostics and Cell Therapeutics, Heinrich Heine University Düsseldorf , Düsseldorf, Germany . """, None),
+ # is also a common CHILD tag; these can be either entirely structured affiliation entries (not intended for tag.text)
+ ("""
+ 7
+ VIB
+ Zwijnaarde
+ Belgium
+ """, None),
+ # or overlayed over a single affiliation string (comma-sep if call tag.text)
+ ("""
+ e
+
+ School of Public Health & Health Systems , University of Waterloo
+
+ """, None),
+ # example of a nonsense one that has TWO tags, whitespaces, the tag WITHIN
+ ("""\n
+ 7 \n
+ Brain Research Institute \n
+ University of Zürich \n
+ Zürich \n
+ Switzerland \n """, None),
+ # most common content-type within are: 'department', 'organisation-division', 'city', 'institution-name', 'postal-code', 'country-part', etc.
+
+ # is the other popular way to surface tags.
+ # They seem to always come with 1+ as children.
+
+ # finally, these wrappers can wrap multiple tags.
+ # in this example, see how the COMMA is awkwardly encapsulated within tags? Also, notice how the country is untagged outside of
+ # basically, everything is weird.
+ ("""
+ 10
+
+ 0000000123222966
+ grid.6936.a
+ Institute of Experimental Genetics, Life and Food Science Center Weihenstephan,
+ Technische Universität München,
+ Freising-Weihenstephan, Germany """, None)
+]
+
+author_tags_and_parsed_dicts = [
+ # every author seems to be in a tag.
+ # all tags seem to have a 'contrib-type' attribute, which often equals 'author' and sometimes equals 'collab'
+
+ # below is an 'author' that has , , and child tags. Also XREF to affiliation (can have multiple).
+ ("""
+ Sandström Annica
+ annica.sandstrom@ltu.se
+
+ Annica Sandström is an Associate Professor in Political Science at Luleå University of Technology. Working foremost within the field of environmental policy and management, her publications include empirical studies on the socio-political complexities of natural resource governance as well as theory-driven pieces on collaborative management, adaptive management, and policy networks.
+ """, None),
+ ("""
+ Cassidy John W.
+ 1
+ 2
+ """, None),
+
+ # below is an 'author' that contains a child tag. We can see sometimes there's other tags like an XREF to affiliation which can probably be .decomposed()
+ ("""
+ The HIV Neurobehavioral Research Programs (HNRP) Group
+ """, None),
+ ("""
+ JET EFDA contributors
+ a 3
+ """, None),
+
+ # below is a 'collab' that also contains nested tags wrapped by . Yikes!
+ # luckily, it seems is rare and always nested within an ultimate parent
+ # --> these are more like affiliations
+ ("""
+ UK Biobank Eye and Vision Consortium\n
+
+
+ Aslam Tariq
+
+
+ Bishop Paul
+
+
+ Barman Sarah
+
+
+
+
+ """, None),
+ ("""
+ WERF EPHect Working Group
+
+ Adamson G.D.
+ Allaire C.
+
+
+ """, None),
+
+ # there are optional tags instead of an
+ ("""
+ Beedle Aaron M
+ Department of Pharmaceutical and Biomedical Sciences, University of Georgia College of Pharmacy, Athens, GA 30602 USA
+ """, None),
+
+ # corresponding authors are indicated in two ways: (i) within as a 'corresp=yes' attribute, (ii) within as a 'ref-type=corresp' attribute
+ ("""
+ Kim Woong-Ki
+ kimw@evms.edu
+ 1
+ """, None),
+ ("""
+ Suero Molina Eric
+ MD, MBA
+
+
+
+ """, None),
+ # note that contrib-type 'editor' is also present, and seems to accompany tag and 'corresp=no' attribute
+ ("""
+ Greene Robert L.
+ Editor
+ """, None),
+
+ # within are optional child tags
+ # the 'contrib-id-type' seems to always be 'orcid'
+ # authentication seems optional
+ ("""
+ https://orcid.org/0000-0002-9987-6824
+ Sandeepa N. C.
+ drsandeepanc@gmail.com
+ \n2 \n
+ """, None),
+ ("""
+ http://orcid.org/0000-0003-1079-4775
+ West Ann H.
+ awest@ou.edu
+ 1
+ """, None),
+
+ # more edge cases; a tag with no --> probably just remove
+ ("""
+ on behalf of the National Advisory Committee on Blood and Blood Products
+ *
+
+ """, None),
+
+]
\ No newline at end of file
diff --git a/s2orc-doc2json/doc2json/jats2json/process_jats.py b/s2orc-doc2json/doc2json/jats2json/process_jats.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f640c48248ffe3733740eba59fb17ef424f4113
--- /dev/null
+++ b/s2orc-doc2json/doc2json/jats2json/process_jats.py
@@ -0,0 +1,104 @@
+import os
+import json
+import argparse
+import time
+from typing import Optional
+
+from doc2json.jats2json.jats_to_json import convert_jats_xml_to_s2orc_json
+
+
+BASE_TEMP_DIR = 'temp'
+BASE_OUTPUT_DIR = 'output'
+BASE_LOG_DIR = 'log'
+
+
def process_jats_stream(
        fname: str,
        stream: bytes,
        temp_dir: str=BASE_TEMP_DIR
):
    """
    Process a jats file stream: the bytes are written to
    `<temp_dir>/input/<fname>`, converted with `process_jats_file`, and the
    resulting JSON file is loaded and returned.
    :param fname: file name to store the stream under
    :param stream: raw content of the JATS file
    :param temp_dir: working directory (created if missing)
    :return: parsed contents of the output JSON, or [] if no output file exists
    """
    input_dir = os.path.join(temp_dir, 'input')
    for directory in (temp_dir, input_dir):
        os.makedirs(directory, exist_ok=True)

    input_path = os.path.join(input_dir, fname)
    with open(input_path, 'wb') as handle:
        handle.write(stream)

    output_file = process_jats_file(input_path)

    if not os.path.exists(output_file):
        return []
    with open(output_file, 'r') as handle:
        return json.load(handle)
+
+
def process_jats_file(
        jats_file: str,
        output_dir: str=BASE_OUTPUT_DIR,
        log_dir: str=BASE_LOG_DIR,
) -> Optional[str]:
    """
    Process a JATS XML file and write its JSON representation to
    `<output_dir>/<paper_id>.json`, where the paper id is the input file
    name without its extension.
    :param jats_file: path to the input JATS XML file
    :param output_dir: directory for the JSON output (created if missing)
    :param log_dir: directory handed to the converter for logs (created if missing)
    :return: path of the written JSON file
    :raises FileNotFoundError: if `jats_file` does not exist
    """
    # create directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)

    # get paper id as the name of the file; basename handles the platform's path separator
    paper_id = os.path.basename(os.path.splitext(jats_file)[0])
    output_file = os.path.join(output_dir, f'{paper_id}.json')

    # check if input file exists and output file doesn't
    if not os.path.exists(jats_file):
        raise FileNotFoundError(f"{jats_file} doesn't exist")
    if os.path.exists(output_file):
        # an existing output is only reported; it is overwritten below
        print(f'{output_file} already exists!')

    # convert to S2ORC
    paper = convert_jats_xml_to_s2orc_json(jats_file, log_dir)

    # write to file
    with open(output_file, 'w') as outf:
        json.dump(paper.release_json("jats"), outf, indent=4, sort_keys=False)

    return output_file
+
+
if __name__ == '__main__':
    # Command-line entry point: convert one JATS XML file to S2ORC JSON.
    parser = argparse.ArgumentParser(description="Run S2ORC JATS2JSON")
    parser.add_argument("-i", "--input", default=None, help="path to the input JATS XML file")
    parser.add_argument("-o", "--output", default='output', help="path to the output dir for putting json files")
    parser.add_argument("-l", "--log", default='log', help="path to the log dir")
    # still accepted so existing invocations keep working; the value is not used below
    parser.add_argument("-k", "--keep", default=False, help="keep temporary files")

    args = parser.parse_args()

    input_path = args.input
    output_path = args.output
    log_path = args.log

    # fail with a usage message instead of a TypeError deep inside os.path
    if input_path is None:
        parser.error("an input JATS XML file is required (-i/--input)")

    start_time = time.time()

    os.makedirs(output_path, exist_ok=True)

    # process_jats_file takes (jats_file, output_dir, log_dir); passing a fourth
    # positional argument raised TypeError
    process_jats_file(input_path, output_path, log_path)

    runtime = round(time.time() - start_time, 3)
    print("runtime: %s seconds " % (runtime))
    print('done.')
diff --git a/s2orc-doc2json/doc2json/s2orc.py b/s2orc-doc2json/doc2json/s2orc.py
new file mode 100644
index 0000000000000000000000000000000000000000..a92d89d980f0ec8b13fda4e6b0822c4ce3ec70d1
--- /dev/null
+++ b/s2orc-doc2json/doc2json/s2orc.py
@@ -0,0 +1,527 @@
+"""
+S2ORC classes
+"""
+
+from datetime import datetime
+from typing import Dict, List, Optional
+from doc2json.config import *
+
+
# Key renames applied to incoming bibliography / reference entry dicts in
# Paper.__init__ so they match the entry classes' constructor argument names.
CORRECT_KEYS = {
    "issn": "issue",
    "type": "type_str"
}

# Keys dropped from bibliography entry dicts in Paper.__init__ before they are
# passed to BibliographyEntry.
SKIP_KEYS = {
    'link',
    'bib_id'
}

# For each known reference type, the attributes that ReferenceEntry.as_json emits.
REFERENCE_OUTPUT_KEYS = {
    'figure': {'text', 'type_str', 'uris', 'num'},
    'table': {'text', 'type_str', 'content', 'num', 'html'},
    'footnote': {'text', 'type_str', 'num'},
    'section': {'text', 'type_str', 'num', 'parent'},
    'equation': {'text', 'type_str', 'latex', 'mathml', 'num'}
}

# Metadata keys that load_s2orc keeps when building the Metadata arguments.
METADATA_KEYS = {
    "title", "authors", "year", "venue", "identifiers"
}
+
+
class ReferenceEntry:
    """
    A non-bibliographic reference target of a paper (figure, table, footnote,
    section, equation, ...), keyed by `ref_id`.

    An example json representation of two entries (values are examples, not accurate):

    {
        "FIGREF0": {
            "text": "FIG. 2. Depth profiles of...",
            "latex": null,
            "type": "figure"
        },
        "TABREF2": {
            "text": "Diversity indices of...",
            "latex": null,
            "type": "table",
            "content": "",
            "html": ""
        }
    }
    """
    def __init__(
            self,
            ref_id: str,
            text: str,
            type_str: str,
            latex: Optional[str] = None,
            mathml: Optional[str] = None,
            content: Optional[str] = None,
            html: Optional[str] = None,
            uris: Optional[List[str]] = None,
            num: Optional[str] = None,
            parent: Optional[str] = None
    ):
        # identity of the entry
        self.ref_id = ref_id
        self.type_str = type_str
        # payload; which of these are serialized depends on type_str
        self.text = text
        self.latex = latex
        self.mathml = mathml
        self.content = content
        self.html = html
        self.uris = uris
        self.num = num
        self.parent = parent

    def as_json(self):
        """
        Serialize the entry. Known types emit only the attributes listed for
        them in REFERENCE_OUTPUT_KEYS; any other type emits every field, with
        the type stored under "type".
        """
        keep_keys = REFERENCE_OUTPUT_KEYS.get(self.type_str, None)
        if not keep_keys:
            return {
                "text": self.text,
                "type": self.type_str,
                "latex": self.latex,
                "mathml": self.mathml,
                "content": self.content,
                "html": self.html,
                "uris": self.uris,
                "num": self.num,
                "parent": self.parent
            }
        return {key: getattr(self, key) for key in keep_keys}
+
+
class BibliographyEntry:
    """
    A parsed bibliography entry of an S2ORC paper, keyed by `bib_id`.

    An example json representation (values are examples, not accurate):

    {
        "title": "Mobility Reports...",
        "authors": [
            {
                "first": "A",
                "middle": ["A"],
                "last": "Haija",
                "suffix": ""
            }
        ],
        "year": 2015,
        "venue": "IEEE Wireless Commun. Mag",
        "volume": "42",
        "issn": "9",
        "pages": "80--92",
        "other_ids": {
            "doi": [
                "10.1109/TWC.2014.2360196"
            ]
        }
    }
    """
    def __init__(
            self,
            bib_id: str,
            title: str,
            authors: List[Dict[str, str]],
            ref_id: Optional[str] = None,
            year: Optional[int] = None,
            venue: Optional[str] = None,
            volume: Optional[str] = None,
            issue: Optional[str] = None,
            pages: Optional[str] = None,
            other_ids: Dict[str, List] = None,
            num: Optional[int] = None,
            urls: Optional[List] = None,
            raw_text: Optional[str] = None,
            links: Optional[List] = None
    ):
        # identifiers
        self.bib_id = bib_id
        self.ref_id = ref_id
        # citation fields
        self.title = title
        self.authors = authors
        self.year = year
        self.venue = venue
        self.volume = volume
        self.issue = issue
        self.pages = pages
        # supplementary information
        self.other_ids = other_ids
        self.num = num
        self.urls = urls
        self.raw_text = raw_text
        self.links = links

    def as_json(self):
        """
        Serialize every field except `bib_id` (which serves as the key of the
        entry in the enclosing mapping), in a fixed key order.
        """
        exported = (
            "ref_id", "title", "authors", "year", "venue", "volume", "issue",
            "pages", "other_ids", "num", "urls", "raw_text", "links",
        )
        return {field: getattr(self, field) for field in exported}
+
+
class Affiliation:
    """
    Affiliation of an author: laboratory, institution and location.

    Example:
    {
        "laboratory": "Key Laboratory of Urban Environment and Health",
        "institution": "Chinese Academy of Sciences",
        "location": {
            "postCode": "361021",
            "settlement": "Xiamen",
            "country": "People's Republic of China"
        }
    }
    """
    def __init__(
            self,
            laboratory: str,
            institution: str,
            location: Dict
    ):
        self.laboratory, self.institution, self.location = laboratory, institution, location

    def as_json(self):
        """Return the three fields as a plain dict."""
        serialized = {}
        serialized["laboratory"] = self.laboratory
        serialized["institution"] = self.institution
        serialized["location"] = self.location
        return serialized
+
+
class Author:
    """
    An author of a paper.

    Example:

    {
        "first": "Anyi",
        "middle": [],
        "last": "Hu",
        "suffix": "",
        "affiliation": {
            "laboratory": "Key Laboratory of Urban Environment and Health",
            "institution": "Chinese Academy of Sciences",
            "location": {
                "postCode": "361021",
                "settlement": "Xiamen",
                "country": "People's Republic of China"
            }
        },
        "email": ""
    }
    """
    def __init__(
            self,
            first: str,
            middle: List[str],
            last: str,
            suffix: str,
            affiliation: Optional[Dict] = None,
            email: Optional[str] = None
    ):
        self.first = first
        self.middle = middle
        self.last = last
        self.suffix = suffix
        # a non-empty affiliation dict becomes an Affiliation; anything falsy is stored as {}
        if affiliation:
            self.affiliation = Affiliation(**affiliation)
        else:
            self.affiliation = {}
        self.email = email

    def as_json(self):
        """Return the author as a plain dict; a missing affiliation serializes as {}."""
        if self.affiliation:
            affiliation_json = self.affiliation.as_json()
        else:
            affiliation_json = {}
        return {
            "first": self.first,
            "middle": self.middle,
            "last": self.last,
            "suffix": self.suffix,
            "affiliation": affiliation_json,
            "email": self.email
        }
+
+
class Metadata:
    """
    Class for representing paper metadata

    Example:
    {
        "title": "Niche Partitioning...",
        "authors": [
            {
                "first": "Anyi",
                "middle": [],
                "last": "Hu",
                "suffix": "",
                "affiliation": {
                    "laboratory": "Key Laboratory of Urban Environment and Health",
                    "institution": "Chinese Academy of Sciences",
                    "location": {
                        "postCode": "361021",
                        "settlement": "Xiamen",
                        "country": "People's Republic of China"
                    }
                },
                "email": ""
            }
        ],
        "year": "2011-11"
    }
    """
    def __init__(
            self,
            title: str,
            authors: List[Dict],
            year: Optional[str] = None,
            venue: Optional[str] = None,
            identifiers: Optional[Dict] = None
    ):
        """
        :param title: paper title
        :param authors: author dicts; each is expanded into Author(**author)
        :param year: publication date string
        :param venue: publication venue
        :param identifiers: identifier mapping; a fresh empty dict is used when omitted
        """
        self.title = title
        self.authors = [Author(**author) for author in authors]
        self.year = year
        self.venue = venue
        # A `{}` default in the signature would be one dict shared by every
        # instance, so mutating one paper's identifiers would leak into others.
        self.identifiers = {} if identifiers is None else identifiers

    def as_json(self):
        """Return the metadata as a plain dict with authors serialized via Author.as_json."""
        return {
            "title": self.title,
            "authors": [author.as_json() for author in self.authors],
            "year": self.year,
            "venue": self.venue,
            "identifiers": self.identifiers
        }
+
+
class Paragraph:
    """
    Class for representing a parsed paragraph.
    The text is stored together with its citation spans, reference spans
    (figures, tables, ...), equation spans and section header.

    An example json representation (values are examples, not accurate):

    {
        "text": "Formal language techniques BID1 may be used to study FORMULA0 (see REF0)...",
        "cite_spans": [
            {
                "start": 27,
                "end": 31,
                "text": "[1]"
            }
        ],
        "ref_spans": [
            {
                "start": 67,
                "end": 71,
                "text": "Fig. 1"
            }
        ],
        "eq_spans": [
            {
                "start": 53,
                "end": 61,
                "text": "α = 1",
                "latex": "\\alpha = 1",
                "ref_id": null
            }
        ],
        "section": "Abstract",
        "sec_num": null
    }
    """
    def __init__(
            self,
            text: str,
            cite_spans: List[Dict],
            ref_spans: List[Dict],
            eq_spans: Optional[List[Dict]] = None,
            section: Optional = None,
            sec_num: Optional = None
    ):
        """
        :param text: paragraph text
        :param cite_spans: citation mention spans
        :param ref_spans: figure/table/etc. reference spans
        :param eq_spans: equation spans; a fresh empty list is used when omitted
        :param section: either a '::'-separated section path string, or an
            already built list of [sec_num, sec_name] pairs (stored as given)
        :param sec_num: number of the innermost section; only applied when
            `section` is given as a non-empty string
        """
        self.text = text
        self.cite_spans = cite_spans
        self.ref_spans = ref_spans
        # A `[]` default in the signature would be one list shared by every instance.
        self.eq_spans = [] if eq_spans is None else eq_spans
        if isinstance(section, str):
            if section:
                # one [number, name] pair per path component; numbers are unknown
                # except for the last component
                section_list = [[None, sec_name] for sec_name in section.split('::')]
            else:
                section_list = None
            if section_list and sec_num:
                section_list[-1][0] = sec_num
        else:
            section_list = section
        self.section = section_list

    def as_json(self):
        """
        Return the paragraph as a plain dict; the section path is flattened back
        into a '::'-joined string and `sec_num` is the innermost section's number.
        """
        return {
            "text": self.text,
            "cite_spans": self.cite_spans,
            "ref_spans": self.ref_spans,
            "eq_spans": self.eq_spans,
            "section": '::'.join([sec[1] for sec in self.section]) if self.section else "",
            "sec_num": self.section[-1][0] if self.section else None
        }
+
+
class Paper:
    """
    Class for representing a parsed S2ORC paper: metadata, abstract, body text,
    back matter, bibliography entries and reference entries.
    """
    def __init__(
            self,
            paper_id: str,
            pdf_hash: str,
            metadata: Dict,
            abstract: List[Dict],
            body_text: List[Dict],
            back_matter: List[Dict],
            bib_entries: Dict,
            ref_entries: Dict
    ):
        """
        :param paper_id: identifier of the paper
        :param pdf_hash: hash of the source PDF
        :param metadata: keyword arguments for Metadata
        :param abstract: paragraph dicts (keyword arguments for Paragraph)
        :param body_text: paragraph dicts
        :param back_matter: paragraph dicts
        :param bib_entries: mapping bib id -> bibliography entry dict
        :param ref_entries: mapping ref id -> reference entry dict
        """
        self.paper_id = paper_id
        self.pdf_hash = pdf_hash
        self.metadata = Metadata(**metadata)
        self.abstract = [Paragraph(**para) for para in abstract]
        self.body_text = [Paragraph(**para) for para in body_text]
        self.back_matter = [Paragraph(**para) for para in back_matter]
        # entry keys are renamed via CORRECT_KEYS; bib entries also drop SKIP_KEYS
        self.bib_entries = [
            BibliographyEntry(
                bib_id=key,
                **{CORRECT_KEYS[k] if k in CORRECT_KEYS else k: v for k, v in bib.items() if k not in SKIP_KEYS}
            ) for key, bib in bib_entries.items()
        ]
        self.ref_entries = [
            ReferenceEntry(
                ref_id=key,
                **{CORRECT_KEYS[k] if k in CORRECT_KEYS else k: v for k, v in ref.items() if k != 'ref_id'}
            ) for key, ref in ref_entries.items()
        ]

    def as_json(self):
        """Return the full paper as a plain dict."""
        return {
            "paper_id": self.paper_id,
            "pdf_hash": self.pdf_hash,
            "metadata": self.metadata.as_json(),
            "abstract": [para.as_json() for para in self.abstract],
            "body_text": [para.as_json() for para in self.body_text],
            "back_matter": [para.as_json() for para in self.back_matter],
            "bib_entries": {bib.bib_id: bib.as_json() for bib in self.bib_entries},
            "ref_entries": {ref.ref_id: ref.as_json() for ref in self.ref_entries}
        }

    @property
    def raw_abstract_text(self) -> str:
        """
        Get all the abstract text joined by a newline
        :return:
        """
        return '\n'.join([para.text for para in self.abstract])

    @property
    def raw_body_text(self) -> str:
        """
        Get all the body text joined by a newline
        :return:
        """
        return '\n'.join([para.text for para in self.body_text])

    def release_json(self, doc_type: str="pdf"):
        """
        Return in release JSON format
        :param doc_type: prefix of the parse key in the output ("<doc_type>_parse")
        :return: dict with paper id, generation header, metadata fields, raw
            abstract text and the structured parse
        """
        # local import: the module only imports `datetime` from the datetime package
        from datetime import timezone

        # TODO: not fully implemented; metadata format is not right; extra keys in some places
        release_dict = {"paper_id": self.paper_id}
        release_dict.update({"header": {
            "generated_with": f'{S2ORC_NAME_STRING} {S2ORC_VERSION_STRING}',
            # the format string ends in a literal 'Z' (UTC designator), so the
            # timestamp must be taken in UTC rather than local time
            "date_generated": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        }})
        release_dict.update(self.metadata.as_json())
        release_dict.update({"abstract": self.raw_abstract_text})
        release_dict.update({
            f"{doc_type}_parse": {
                "paper_id": self.paper_id,
                "_pdf_hash": self.pdf_hash,
                "abstract": [para.as_json() for para in self.abstract],
                "body_text": [para.as_json() for para in self.body_text],
                "back_matter": [para.as_json() for para in self.back_matter],
                "bib_entries": {bib.bib_id: bib.as_json() for bib in self.bib_entries},
                "ref_entries": {ref.ref_id: ref.as_json() for ref in self.ref_entries}
            }
        })
        return release_dict
+
+
def load_s2orc(paper_dict: Dict) -> Paper:
    """
    Load release S2ORC into Paper class

    Three layouts are recognised:
      * 2019 gorc parses: content under "grobid_parse", metadata under "metadata"
      * release json: content (and optionally metadata) under "pdf_parse"
      * flat json: content at the top level (detected by a non-empty "body_text")

    Bibliography entries carrying a single 'link' get it copied into a
    one-element 'links' list; this modifies the entry dicts of `paper_dict`.

    :param paper_dict: parsed S2ORC JSON
    :return: the corresponding Paper
    :raises NotImplementedError: if none of the known layouts is present
    """
    paper_id = paper_dict['paper_id']
    pdf_hash = paper_dict.get('_pdf_hash', paper_dict.get('s2_pdf_hash', None))

    # 2019 gorc parses
    if paper_dict.get("grobid_parse"):
        parse = paper_dict["grobid_parse"]
        metadata = {k: v for k, v in paper_dict["metadata"].items() if k in METADATA_KEYS}
    # current and 2020 s2orc release_json
    elif paper_dict.get("pdf_parse") or paper_dict.get("body_text"):
        # only descend into "pdf_parse" when it actually holds something; an
        # empty or null "pdf_parse" next to a top-level "body_text" means the
        # content lives at the top level
        parse = paper_dict.get("pdf_parse") or paper_dict
        if parse.get("metadata"):
            metadata = {k: v for k, v in parse.get("metadata").items() if k in METADATA_KEYS}
        # 2020 s2orc releases (metadata is separate)
        else:
            metadata = {
                "title": None,
                "authors": [],
                "year": None
            }
    else:
        print(paper_id)
        raise NotImplementedError("Unknown S2ORC file type!")

    abstract = parse.get("abstract", [])
    body_text = parse.get("body_text", [])
    back_matter = parse.get("back_matter", [])
    bib_entries = parse.get("bib_entries", {})
    for entry in bib_entries.values():
        if 'link' in entry:
            entry['links'] = [entry['link']]
    ref_entries = parse.get("ref_entries", {})

    return Paper(
        paper_id=paper_id,
        pdf_hash=pdf_hash,
        metadata=metadata,
        abstract=abstract,
        body_text=body_text,
        back_matter=back_matter,
        bib_entries=bib_entries,
        ref_entries=ref_entries
    )
\ No newline at end of file
diff --git a/s2orc-doc2json/doc2json/spp2json/__init__.py b/s2orc-doc2json/doc2json/spp2json/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/s2orc-doc2json/doc2json/spp2json/process_pdf.py b/s2orc-doc2json/doc2json/spp2json/process_pdf.py
new file mode 100644
index 0000000000000000000000000000000000000000..8558abbe69f8181e9ada4ef0e24db8ca3c433fdb
--- /dev/null
+++ b/s2orc-doc2json/doc2json/spp2json/process_pdf.py
@@ -0,0 +1,72 @@
+import os
+import json
+import argparse
+import time
+from typing import Dict
+
+from doc2json.spp2json.spp.spp_client import SppClient
+from doc2json.spp2json.spp.spp_json_to_s2orc_json import convert_spp_json_to_s2orc_json
+
+
+
+ def process_pdf_file(input_file: str, temp_dir: str, output_dir: str) -> str:
+     """
+     Process a PDF file and get JSON representation
+     :param input_file: path to the input PDF; the file name minus its extension is used as paper id
+     :param temp_dir: directory handed to the SPP client; '<paper_id>.json' is expected there afterwards
+     :param output_dir: directory where the S2ORC JSON '<paper_id>.json' is written
+     :return: path to the written output file
+     :raises FileNotFoundError: if the input PDF or the intermediate SPP JSON does not exist
+     :raises Warning: if the output file already exists
+     """
+     # get paper id as the name of the file; basename/splitext does not depend on the
+     # path separator and keeps the full name when the file has no extension
+     paper_id = os.path.splitext(os.path.basename(input_file))[0]
+     spp_json_file = os.path.join(temp_dir, f'{paper_id}.json')
+     output_file = os.path.join(output_dir, f'{paper_id}.json')
+
+     # check if input file exists and output file doesn't
+     if not os.path.exists(input_file):
+         raise FileNotFoundError(f"{input_file} doesn't exist")
+     if os.path.exists(output_file):
+         raise Warning(f'{output_file} already exists!')
+
+     # process PDF through SPP -> SPP JSON
+     client = SppClient()
+     # TODO: compute PDF hash
+     client.process(input_file, temp_dir)
+
+     # process SPP JSON -> S2ORC JSON
+     # explicit check rather than `assert`, which is stripped under `python -O`
+     if not os.path.exists(spp_json_file):
+         raise FileNotFoundError(f"{spp_json_file} doesn't exist")
+     with open(spp_json_file, 'r') as f_in:
+         spp_json = json.load(f_in)
+     paper = convert_spp_json_to_s2orc_json(spp_json=spp_json)
+
+     # write to file
+     with open(output_file, 'w') as outf:
+         json.dump(paper.release_json(), outf, indent=4, sort_keys=False)
+
+     return output_file
+
+
+ if __name__ == '__main__':
+     # command-line entry point: convert one PDF and report the elapsed time
+     parser = argparse.ArgumentParser(description="Run S2ORC PDF2JSON")
+     # the input path is mandatory: process_pdf_file cannot handle a missing (None) path
+     parser.add_argument("-i", "--input", required=True, help="path to the input PDF file")
+     parser.add_argument("-t", "--temp", default='temp/', help="path to the temp dir for putting tei xml files")
+     parser.add_argument("-o", "--output", default='output/', help="path to the output dir for putting json files")
+     # accepted for command-line compatibility; process_pdf_file takes no keep flag
+     parser.add_argument("-k", "--keep", action='store_true')
+
+     args = parser.parse_args()
+
+     input_path = args.input
+     temp_path = args.temp
+     output_path = args.output
+
+     start_time = time.time()
+
+     os.makedirs(temp_path, exist_ok=True)
+     os.makedirs(output_path, exist_ok=True)
+
+     process_pdf_file(input_path, temp_path, output_path)
+
+     runtime = round(time.time() - start_time, 3)
+     print("runtime: %s seconds " % (runtime))
+     print('done.')
\ No newline at end of file
diff --git a/s2orc-doc2json/doc2json/spp2json/spp/__init__.py b/s2orc-doc2json/doc2json/spp2json/spp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/s2orc-doc2json/doc2json/spp2json/spp/spp_client.py b/s2orc-doc2json/doc2json/spp2json/spp/spp_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..07c9c96fa303adf638a03ea27a5f5586b0838bf8
--- /dev/null
+++ b/s2orc-doc2json/doc2json/spp2json/spp/spp_client.py
@@ -0,0 +1,32 @@
+import os
+import io
+import json
+import argparse
+import time
+import glob
+import ntpath
+from typing import List
+
+
+ class SppClient:
+ # Placeholder client for ScienceParsePlus (SPP) services; nothing is implemented yet.
+ def process(self, input: str, output: str):
+ # Stub: takes an input and an output path and always raises NotImplementedError.
+ raise NotImplementedError
+
+
+ if __name__ == "__main__":
+     # command-line entry point: run SppClient.process once and print how long it took
+     arg_parser = argparse.ArgumentParser(description="Client for ScienceParsePlus (SPP) services")
+     arg_parser.add_argument("--input", default=None, help="path to the directory containing PDF to process")
+     arg_parser.add_argument("--output", default=None, help="path to the directory where to put the results")
+     cli_args = arg_parser.parse_args()
+
+     spp = SppClient()
+
+     started_at = time.time()
+     spp.process(cli_args.input, cli_args.output)
+     elapsed = round(time.time() - started_at, 3)
+
+     print("runtime: %s seconds " % (elapsed))
diff --git a/s2orc-doc2json/doc2json/spp2json/spp/spp_json_to_s2orc_json.py b/s2orc-doc2json/doc2json/spp2json/spp/spp_json_to_s2orc_json.py
new file mode 100644
index 0000000000000000000000000000000000000000..97e7bd3a133e2cc7375bbf200f04ec0bf54315a2
--- /dev/null
+++ b/s2orc-doc2json/doc2json/spp2json/spp/spp_json_to_s2orc_json.py
@@ -0,0 +1,7 @@
+from typing import *
+
+from doc2json.s2orc import Paper
+
+
+ def convert_spp_json_to_s2orc_json(spp_json: Dict) -> Paper:
+ # Stub for converting a parsed SPP JSON dict into a Paper; not implemented,
+ # so every call raises NotImplementedError.
+ raise NotImplementedError
\ No newline at end of file
diff --git a/s2orc-doc2json/doc2json/tex2json/__init__.py b/s2orc-doc2json/doc2json/tex2json/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/s2orc-doc2json/doc2json/tex2json/process_tex.py b/s2orc-doc2json/doc2json/tex2json/process_tex.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2da752a9c3d16cd167e2c02090c2085e05f89a2
--- /dev/null
+++ b/s2orc-doc2json/doc2json/tex2json/process_tex.py
@@ -0,0 +1,127 @@
+import os
+import json
+import argparse
+import time
+from typing import Optional, Dict
+
+from doc2json.tex2json.tex_to_xml import convert_latex_to_s2orc_json
+from doc2json.tex2json.xml_to_json import convert_latex_xml_to_s2orc_json
+
+
+ # default working directories; used as parameter defaults by process_tex_stream and process_tex_file
+ BASE_TEMP_DIR = 'temp'
+ BASE_OUTPUT_DIR = 'output'
+ BASE_LOG_DIR = 'log'
+
+
+ def process_tex_stream(
+     fname: str,
+     stream: bytes,
+     temp_dir: str=BASE_TEMP_DIR,
+     keep_flag: bool=False,
+     grobid_config: Optional[Dict] = None
+ ):
+     """
+     Process a gz file stream
+     :param fname: file name under which the stream is saved inside '<temp_dir>/input'
+     :param stream: raw bytes of the LaTeX archive
+     :param temp_dir: base directory for temporary files
+     :param keep_flag: passed through to process_tex_file
+     :param grobid_config: passed through to process_tex_file
+     :return: parsed contents of the output JSON file, or [] if no output file was produced
+     """
+     temp_input_dir = os.path.join(temp_dir, 'input')
+     temp_input_file = os.path.join(temp_input_dir, fname)
+
+     # also creates temp_dir, which is the parent of temp_input_dir
+     os.makedirs(temp_input_dir, exist_ok=True)
+
+     with open(temp_input_file, 'wb') as outf:
+         outf.write(stream)
+
+     output_file = process_tex_file(
+         temp_input_file, temp_dir=temp_dir, keep_flag=keep_flag, grobid_config=grobid_config
+     )
+
+     # process_tex_file returns None when the LaTeX conversion yields no XML;
+     # os.path.exists(None) would raise TypeError, so test for None first
+     if output_file and os.path.exists(output_file):
+         with open(output_file, 'r') as f:
+             contents = json.load(f)
+         return contents
+     return []
+
+
+ def process_tex_file(
+     input_file: str,
+     temp_dir: str=BASE_TEMP_DIR,
+     output_dir: str=BASE_OUTPUT_DIR,
+     log_dir: str=BASE_LOG_DIR,
+     keep_flag: bool=False,
+     grobid_config: Optional[Dict]=None
+ ) -> Optional[str]:
+     """
+     Convert the files of a TEX archive into an S2ORC JSON file
+     :param input_file: path to the LaTeX archive; its name without the last extension is the paper id
+     :param temp_dir: directory for intermediate files
+     :param output_dir: directory that receives '<paper_id>.json'
+     :param log_dir: directory handed to the XML -> S2ORC conversion
+     :param keep_flag: when False, the LaTeX conversion is asked to clean up after itself
+     :param grobid_config: forwarded to the XML -> S2ORC conversion
+     :return: path of the written JSON file, or None if no XML file was produced
+     """
+     # make sure all working directories are present
+     for directory in (temp_dir, output_dir, log_dir):
+         os.makedirs(directory, exist_ok=True)
+
+     # paper id = file name without its last extension
+     input_stem, _ = os.path.splitext(input_file)
+     paper_id = input_stem.split('/')[-1]
+     output_file = os.path.join(output_dir, f'{paper_id}.json')
+
+     # a missing input is fatal; an existing output is only reported and then overwritten
+     if not os.path.exists(input_file):
+         raise FileNotFoundError(f"{input_file} doesn't exist")
+     if os.path.exists(output_file):
+         print(f'{output_file} already exists!')
+
+     # LaTeX -> XML
+     xml_file = convert_latex_to_s2orc_json(input_file, temp_dir, not keep_flag)
+     if not xml_file:
+         return None
+
+     # XML -> S2ORC paper
+     paper = convert_latex_xml_to_s2orc_json(xml_file, log_dir, grobid_config=grobid_config)
+
+     # serialize the paper
+     with open(output_file, 'w') as outf:
+         json.dump(paper.release_json("latex"), outf, indent=4, sort_keys=False)
+
+     return output_file
+
+
+ if __name__ == '__main__':
+     def _parse_keep_flag(value: str) -> bool:
+         """Interpret the optional value given to -k/--keep as a boolean."""
+         return value.strip().lower() not in {'', '0', 'false', 'f', 'no', 'n'}
+
+     # command-line entry point: convert one LaTeX archive and report the elapsed time
+     parser = argparse.ArgumentParser(description="Run S2ORC TEX2JSON")
+     # the input path is mandatory: process_tex_file cannot handle a missing (None) path
+     parser.add_argument("-i", "--input", required=True, help="path to the input TEX zip file")
+     parser.add_argument("-t", "--temp", default='temp', help="path to a temp dir for partial files")
+     parser.add_argument("-o", "--output", default='output', help="path to the output dir for putting json files")
+     parser.add_argument("-l", "--log", default='log', help="path to the log dir")
+     # a bare `-k` means True; `-k <value>` is still accepted, but "false"/"0"/"no"
+     # now really disable the flag instead of being truthy strings
+     parser.add_argument(
+         "-k", "--keep", nargs='?', const=True, default=False, type=_parse_keep_flag,
+         help="keep temporary files"
+     )
+
+     args = parser.parse_args()
+
+     input_path = args.input
+     temp_path = args.temp
+     output_path = args.output
+     log_path = args.log
+     keep_temp = args.keep
+
+     start_time = time.time()
+
+     os.makedirs(temp_path, exist_ok=True)
+     os.makedirs(output_path, exist_ok=True)
+
+     process_tex_file(input_path, temp_path, output_path, log_path, keep_temp)
+
+     runtime = round(time.time() - start_time, 3)
+     print("runtime: %s seconds " % (runtime))
+     print('done.')
diff --git a/s2orc-doc2json/doc2json/tex2json/tex_to_xml.py b/s2orc-doc2json/doc2json/tex2json/tex_to_xml.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff96159e0bc112fa0e2239e5e004fbfb64babaaa
--- /dev/null
+++ b/s2orc-doc2json/doc2json/tex2json/tex_to_xml.py
@@ -0,0 +1,201 @@
+"""
+Process all the files in a LaTeX zip file to extract paper content
+
+1. Unzips LaTeX ZIP file
+2. Identifies primary TEX file
+3. Expands other TEX files into main TEX file using latexpand
+4. Expands BBL file into main TEX file
+5. Convert TEX file into XML using tralics
+6. Extract content of XML into S2ORC JSON
+
+"""
+
+import os
+import gzip
+import tarfile
+import zipfile
+import shutil
+from typing import Optional
+
+from doc2json.utils.latex_util import normalize, latex_to_xml
+
+
+ def _is_gzip_file(fpath):
+     """
+     Check whether a file starts with the two gzip magic bytes
+     :param fpath: path of the file to inspect
+     :return: True if the first two bytes are 0x1f 0x8b, else False
+     """
+     gzip_magic = b'\x1f\x8b'
+     with open(fpath, 'rb') as handle:
+         header = handle.read(len(gzip_magic))
+     return header == gzip_magic
+
+
+ def extract_latex(zip_file: str, latex_dir: str, cleanup=True):
+     """
+     Unzip latex zip into temp directory
+     :param zip_file: path to a '.gz', '.zip' or '.tar' file (tar, gzipped tar, gzipped single file or zip)
+     :param latex_dir: directory under which '<file_id>/' is created for the extracted content
+     :param cleanup: if True, delete zip_file after it has been extracted
+     :return: path of the extraction directory, or None if the file content is not a recognized format
+     :raises AssertionError: if zip_file does not exist or has an unsupported extension
+     """
+     # explicit raises instead of `assert` statements so the checks survive `python -O`;
+     # AssertionError is kept so callers see the same exception type as before
+     if not os.path.exists(zip_file):
+         raise AssertionError(f"{zip_file} doesn't exist")
+     if not zip_file.endswith(('.gz', '.zip', '.tar')):
+         raise AssertionError(f"{zip_file} is not a .gz, .zip or .tar file")
+
+     # get name of zip file
+     file_id = os.path.splitext(zip_file)[0].split('/')[-1]
+
+     tar_dir = os.path.join(latex_dir, file_id)
+     os.makedirs(tar_dir, exist_ok=True)
+
+     # NOTE(review): archive member paths are not validated before extraction; if the
+     # archives are untrusted, consider a tarfile extraction filter (Python 3.12+)
+     # check if tar file -> untar
+     if tarfile.is_tarfile(zip_file):
+         with tarfile.open(zip_file) as tar:
+             tar.extractall(tar_dir)
+     # check if gzip file -> un-gz and/or untar
+     elif _is_gzip_file(zip_file):
+         tar_file = os.path.join(latex_dir, f'{file_id}.tar')
+         # stream the decompressed data instead of holding all of it in memory
+         with gzip.open(zip_file, 'rb') as in_f, open(tar_file, 'wb') as out_f:
+             shutil.copyfileobj(in_f, out_f)
+         if tarfile.is_tarfile(tar_file):
+             with tarfile.open(tar_file) as tar:
+                 tar.extractall(tar_dir)
+             os.remove(tar_file)
+         else:
+             # not an archive: the decompressed file becomes '<file_id>.tex'
+             os.rename(tar_file, os.path.join(tar_dir, f'{file_id}.tex'))
+     # check if zip file -> unzip
+     elif zipfile.is_zipfile(zip_file):
+         with zipfile.ZipFile(zip_file, 'r') as in_f:
+             in_f.extractall(tar_dir)
+     else:
+         return None
+
+     # clean up if needed
+     if cleanup:
+         os.remove(zip_file)
+
+     # returns directory
+     return tar_dir if os.path.exists(tar_dir) else None
+
+
+ def normalize_latex(latex_dir: str, norm_dir: str, norm_log_file: str, cleanup=True) -> Optional[str]:
+     """
+     Run the LaTeX normalizer on one extracted source directory
+     :param latex_dir: directory with the extracted LaTeX sources; its last path component is the file id
+     :param norm_dir: base directory; the output goes to '<norm_dir>/<file_id>'
+     :param norm_log_file: file to which the file id is appended when normalization raises TypeError
+     :param cleanup: if True, delete latex_dir afterwards
+     :return: path of the output folder, or None when the file id is 'skipped'
+     """
+     file_id = latex_dir.strip('/').split('/')[-1]
+     if file_id == 'skipped':
+         return None
+
+     target_dir = os.path.join(norm_dir, file_id)
+     os.makedirs(target_dir, exist_ok=True)
+     try:
+         normalize(latex_dir, target_dir)
+     except TypeError:
+         # normalization failed: drop the partial output and record the id
+         shutil.rmtree(target_dir)
+         with open(norm_log_file, 'a+') as failures:
+             failures.write(f'{file_id}\n')
+
+     # delete latex directory if cleanup
+     if cleanup:
+         shutil.rmtree(latex_dir)
+
+     # NOTE(review): the path is returned even when the TypeError branch removed the folder
+     return target_dir
+
+
+ def norm_latex_to_xml(norm_dir: str, xml_dir: str, xml_err_file: str, xml_log_file: str, cleanup=True) -> Optional[str]:
+     """
+     Turn a normalized LaTeX folder into an XML file via latex_to_xml
+     :param norm_dir: folder holding '<file_id>.tex'; its last path component is the file id
+     :param xml_dir: base directory; the XML goes to '<xml_dir>/<file_id>/<file_id>.xml'
+     :param xml_err_file: error file path forwarded to latex_to_xml
+     :param xml_log_file: log file path forwarded to latex_to_xml
+     :param cleanup: if True, delete norm_dir afterwards
+     :return: path of the XML file if it exists after conversion, else None
+     """
+     file_id = norm_dir.strip('/').split('/')[-1]
+     xml_output_dir = os.path.join(xml_dir, file_id)
+     os.makedirs(xml_output_dir, exist_ok=True)
+     xml_file = os.path.join(xml_output_dir, f'{file_id}.xml')
+
+     latex_to_xml(
+         tex_file=os.path.join(norm_dir, f'{file_id}.tex'),
+         out_dir=xml_output_dir,
+         out_file=xml_file,
+         err_file=xml_err_file,
+         log_file=xml_log_file
+     )
+
+     # delete norm directory if cleanup
+     if cleanup:
+         shutil.rmtree(norm_dir)
+
+     return xml_file if os.path.exists(xml_file) else None
+
+
+ def convert_latex_to_xml(
+     zip_file: str, latex_dir: str, norm_dir: str, xml_dir: str, log_dir: str, cleanup=True
+ ) -> Optional[str]:
+     """
+     Run expansion, normalization, xml conversion on latex
+     :param zip_file: path to the LaTeX archive
+     :param latex_dir: base directory for the extracted sources
+     :param norm_dir: base directory for the normalized sources
+     :param xml_dir: base directory for the XML output
+     :param log_dir: directory for 'norm_error.log', 'xml_error.log' and 'xml_skip.log'
+     :param cleanup: forwarded to every step; each step deletes its input when True
+     :return: path of the XML file, or None if any step produced no output
+     """
+     # extract zip file
+     latex_output_dir = extract_latex(zip_file, latex_dir, cleanup)
+     # extract_latex returns None for unrecognized content; the next step needs a path
+     if not latex_output_dir:
+         return None
+
+     # normalize latex
+     norm_log_file = os.path.join(log_dir, 'norm_error.log')
+     norm_output_dir = normalize_latex(latex_output_dir, norm_dir, norm_log_file, cleanup)
+     # normalize_latex returns None for skipped inputs and deletes its output folder
+     # when normalization fails, so also check that the folder is really there
+     if not norm_output_dir or not os.path.isdir(norm_output_dir):
+         return None
+
+     # convert to xml
+     xml_error_file = os.path.join(log_dir, 'xml_error.log')
+     xml_log_file = os.path.join(log_dir, 'xml_skip.log')
+     xml_output_file = norm_latex_to_xml(norm_output_dir, xml_dir, xml_error_file, xml_log_file, cleanup)
+
+     return xml_output_file
+
+
+ def convert_latex_to_s2orc_json(
+     latex_zip: str,
+     base_temp_dir: str,
+     cleanup_after: bool=True
+ ) -> Optional[str]:
+     """
+     Convert a LaTeX zip file to XML (despite the name, no S2ORC JSON is produced here)
+     :param latex_zip: path to the LaTeX archive
+     :param base_temp_dir: base directory; 'latex', 'norm', 'xml' and 'log' subdirectories are created in it
+     :param cleanup_after: forwarded to convert_latex_to_xml as its cleanup flag
+     :return: whatever convert_latex_to_xml returns: the XML file path, or None
+     :raises FileNotFoundError: if latex_zip does not exist
+     """
+     if not os.path.exists(latex_zip):
+         raise FileNotFoundError("Input LaTeX ZIP file doesn't exist")
+
+     # temp directories
+     latex_expand_dir = os.path.join(base_temp_dir, 'latex')
+     latex_norm_dir = os.path.join(base_temp_dir, 'norm')
+     latex_xml_dir = os.path.join(base_temp_dir, 'xml')
+     latex_log_dir = os.path.join(base_temp_dir, 'log')
+
+     for directory in (base_temp_dir, latex_expand_dir, latex_norm_dir, latex_xml_dir, latex_log_dir):
+         os.makedirs(directory, exist_ok=True)
+
+     # convert to XML
+     xml_file = convert_latex_to_xml(
+         latex_zip, latex_expand_dir, latex_norm_dir, latex_xml_dir, latex_log_dir, cleanup_after
+     )
+     return xml_file
diff --git a/s2orc-doc2json/doc2json/tex2json/xml_to_json.py b/s2orc-doc2json/doc2json/tex2json/xml_to_json.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcb4b748a82362fabb5715cb53673f4e688fb246
--- /dev/null
+++ b/s2orc-doc2json/doc2json/tex2json/xml_to_json.py
@@ -0,0 +1,1396 @@
+import os
+import re
+import itertools
+import bs4
+from bs4 import BeautifulSoup, NavigableString
+from typing import List, Dict, Tuple, Optional
+import copy
+import latex2mathml.converter
+
+from doc2json.grobid2json.grobid.grobid_client import GrobidClient
+from doc2json.utils.grobid_util import parse_bib_entry, get_author_data_from_grobid_xml
+from doc2json.s2orc import Paper, Paragraph
+
+
+ # NOTE(review): presumably names of XML tags to ignore while walking the document -
+ # not referenced in the visible part of this file, confirm against the users
+ SKIP_TAGS = {
+ 'clearpage',
+ 'colorpool',
+ 'newpage',
+ 'tableofcontents'
+ }
+
+ # NOTE(review): presumably names of XML tags treated as text/paragraph containers -
+ # not referenced in the visible part of this file, confirm against the users
+ TEXT_TAGS = {
+ 'p',
+ 'proof',
+ 'caption'
+ }
+
+
+ def normalize_latex_id(latex_id: str):
+     """
+     Upper-case an id, drop underscores and map known prefixes to S2ORC ref tokens
+     :param latex_id: id string, e.g. 'bid12', 'cid3' or 'formula_7'
+     :return: id with BID -> BIBREF, CID -> SECREF or FORMULA -> EQREF applied; otherwise the upper-cased id
+     """
+     upper_id = latex_id.upper().replace('_', '')
+     prefix_tokens = (
+         ('BID', 'BIBREF'),
+         ('CID', 'SECREF'),
+         ('FORMULA', 'EQREF'),
+     )
+     for prefix, token in prefix_tokens:
+         if upper_id.startswith(prefix):
+             return upper_id.replace(prefix, token)
+     return upper_id
+
+
+ def process_author(
+     author_text: str,
+     grobid_client: GrobidClient,
+     logfile: str
+ ) -> List[Dict]:
+     """
+     Parse an author string with GROBID, falling back to a single raw entry
+     :param author_text: raw author string; GROBID is not called when it is empty
+     :param grobid_client: client whose process_header_names is called
+     :param logfile: log file path handed to the client
+     :return: author data extracted from the GROBID XML, or a one-element list holding author_text as last name
+     """
+     parsed_xml = grobid_client.process_header_names(author_text, logfile) if author_text else None
+     if parsed_xml:
+         return get_author_data_from_grobid_xml(BeautifulSoup(parsed_xml, 'xml'))
+
+     # no text or no GROBID result: keep the raw string as the last name
+     fallback_author = {
+         "first": "",
+         "middle": [],
+         "last": author_text,
+         "suffix": "",
+         "affiliation": {},
+         "email": ""
+     }
+     return [fallback_author]
+
+
+ def process_bibentry(bib_text: str, grobid_client: GrobidClient, logfile: str):
+     """
+     Parse the text of one bibliography entry with GROBID
+     :param bib_text: raw text of the entry, possibly spanning several lines
+     :param grobid_client: client whose process_citation is called
+     :param logfile: log file path handed to the client
+     :return: parsed bib entry dict, or None for empty text or an empty GROBID result
+     """
+     if not bib_text:
+         return None
+
+     # collapse whitespace runs inside every line, trim, then join the lines with single spaces
+     cleaned_lines = [re.sub(r'\s+', ' ', raw_line).strip() for raw_line in bib_text.split('\n')]
+     bib_string = ' '.join(cleaned_lines)
+
+     xml_str = grobid_client.process_citation(bib_string, logfile)
+     if not xml_str:
+         return None
+
+     bib_entry = parse_bib_entry(BeautifulSoup(xml_str, 'lxml'))
+     # fall back to the cleaned input when the parser found no raw text
+     if not bib_entry['raw_text']:
+         bib_entry['raw_text'] = bib_string
+     return bib_entry
+
+
+ def replace_ref_tokens(sp: BeautifulSoup, el: bs4.element.Tag, ref_map: Dict):
+     """
+     Swap citation and reference tags inside an element for plain-text tokens
+     :param sp: soup used to create the replacement strings
+     :param el: element whose 'cit' and 'ref' tags are replaced in place
+     :param ref_map: known reference ids; decides which token type a 'uid' target becomes
+     :return: the same element
+     """
+     # citations: 'bid' targets become BIBREF tokens
+     for cite_tag in el.find_all('cit'):
+         try:
+             bib_token = cite_tag.ref.get('target').replace('bid', 'BIBREF')
+             cite_tag.replace_with(sp.new_string(f" {bib_token} "))
+         except AttributeError:
+             print('Attribute error: ', cite_tag)
+             continue
+
+     # remaining references; for 'uid' targets the first prefix whose id is in ref_map wins
+     uid_prefixes = ('FIGREF', 'TABREF', 'EQREF', 'FOOTREF', 'SECREFU')
+     for ref_tag in el.find_all('ref'):
+         try:
+             raw_target = ref_tag.get('target')
+             # tags without a target and leftover citation targets are left alone
+             if not raw_target or raw_target.startswith('bid'):
+                 continue
+             if raw_target.startswith('cid'):
+                 token = raw_target.replace('cid', 'SECREF')
+             elif raw_target.startswith('uid'):
+                 candidates = (raw_target.replace('uid', prefix) for prefix in uid_prefixes)
+                 token = next(
+                     (candidate for candidate in candidates if candidate in ref_map),
+                     raw_target.upper()
+                 )
+             else:
+                 print('Weird ID!')
+                 token = raw_target.upper()
+             ref_tag.replace_with(sp.new_string(f" {token} "))
+         except AttributeError:
+             print('Attribute error: ', ref_tag)
+             continue
+
+     return el
+
+
+ def _shift_spans(spans: List[Dict], offset: int) -> List[Dict]:
+     """Return copies of the spans with 'start' and 'end' moved by offset; all other keys are kept."""
+     return [
+         {**span, "start": span['start'] + offset, "end": span['end'] + offset}
+         for span in spans
+     ]
+
+
+ def process_list_el(sp: BeautifulSoup, list_el: bs4.element.Tag, section_info: List, bib_map: Dict, ref_map: Dict):
+     """
+     Process list element
+     :param sp: soup, passed on to process_paragraph
+     :param list_el: list element whose 'item' tags are processed
+     :param section_info: section information, passed on to process_paragraph
+     :param bib_map: bibliography map, passed on to process_paragraph
+     :param ref_map: reference map, passed on to process_paragraph
+     :return: list of Paragraph objects, one per processed item
+     """
+     # TODO: currently parsing list as a list of paragraphs (append numbers to start of each entry in ordered lists)
+     list_items = []
+     for item in list_el.find_all('item'):
+         # skip itemize settings
+         item_text = item.text.strip()
+         if item_text.startswith('[') and item_text.endswith(']'):
+             continue
+         # try processing as paragraph
+         list_num = item.get('id-text', None)
+         item_as_para = process_paragraph(sp, item, section_info, bib_map, ref_map)
+         if not list_num:
+             list_items.append(item_as_para)
+             continue
+         # ordered list: put the number in front of the text and move every span by its
+         # length; spans are copied whole so keys such as 'ref_id' and 'mathml' survive
+         list_num_str = f'{list_num}. '
+         offset = len(list_num_str)
+         list_items.append(Paragraph(
+             text=list_num_str + item_as_para.text,
+             cite_spans=_shift_spans(item_as_para.cite_spans, offset),
+             ref_spans=_shift_spans(item_as_para.ref_spans, offset),
+             eq_spans=_shift_spans(item_as_para.eq_spans, offset),
+             section=item_as_para.section
+         ))
+     return list_items
+
+
+ def process_navstring(str_el: NavigableString, section_info: List):
+     """
+     Build a Paragraph from one NavigableString
+     :param str_el: string whose whitespace is normalized and whose ref tokens are located
+     :param section_info: stored as the section of the returned Paragraph
+     :return: Paragraph with cite spans (BIBREF) and ref spans (FIGREF, TABREF, EQREF, FOOTREF, SECREF, SECREFU)
+     """
+     # every run of whitespace becomes a single space
+     text = re.sub(r'\s+', ' ', str_el)
+
+     def _find_spans(patterns):
+         # spans are grouped by pattern, in the order the patterns are given
+         return [
+             {
+                 "start": match.start(),
+                 "end": match.start() + len(match.group()),
+                 "ref_id": match.group()
+             }
+             for pattern in patterns
+             for match in re.finditer(pattern, text)
+         ]
+
+     all_cite_spans = _find_spans([r'(BIBREF\d+)'])
+     all_ref_spans = _find_spans([
+         r'(FIGREF\d+)',
+         r'(TABREF\d+)',
+         r'(EQREF\d+)',
+         r'(FOOTREF\d+)',
+         r'(SECREF\d+)',
+         r'(SECREFU\d+)',
+     ])
+
+     # every span must point at its own token in the text
+     for found_span in all_cite_spans + all_ref_spans:
+         assert text[found_span['start']:found_span['end']] == found_span['ref_id']
+
+     return Paragraph(
+         text=text,
+         cite_spans=all_cite_spans,
+         ref_spans=all_ref_spans,
+         eq_spans=[],
+         section=section_info
+     )
+
+
+ def process_paragraph(sp: BeautifulSoup, para_el: bs4.element.Tag, section_info: List, bib_map: Dict, ref_map: Dict):
+ """
+ Process one paragraph: replace references and formulas with text tokens, then
+ locate those tokens in the whitespace-normalized text
+ :param sp: soup used to create the replacement strings
+ :param para_el: paragraph element; it is modified in place (tags replaced or decomposed)
+ :param section_info: stored as the section of the returned Paragraph
+ :param bib_map: maps BIBREF ids to entries whose 'num' becomes the cite span text
+ :param ref_map: maps ref ids to entries whose 'num' becomes the ref span text
+ :return: Paragraph with text, cite spans, ref spans and equation spans
+ """
+ # replace all ref tokens with special tokens
+ para_el = replace_ref_tokens(sp, para_el, ref_map)
+
+ # sub and get corresponding spans of inline formulas
+ # formula_dict maps a placeholder key to (math text, texmath text, mathml, ref id)
+ formula_dict = dict()
+ inline_key_ind = 0
+ display_key_ind = 0
+ for ftag in para_el.find_all('formula'):
+ try:
+ # if formula has ref id, treat as display formula
+ if ftag.get('id'):
+ formula_key = f'DISPLAYFORM{display_key_ind}'
+ ref_id = ftag.get('id').replace('uid', 'EQREF')
+ display_key_ind += 1
+ # else, treat as inline
+ else:
+ formula_key = f'INLINEFORM{inline_key_ind}'
+ ref_id = None
+ inline_key_ind += 1
+ # a failed MathML conversion is not fatal; the formula gets an empty mathml string
+ try:
+ formula_mathml = latex2mathml.converter.convert(ftag.texmath.text)
+ except Exception:
+ formula_mathml = ""
+ formula_dict[formula_key] = (ftag.math.text, ftag.texmath.text, formula_mathml, ref_id)
+ ftag.replace_with(sp.new_string(f" {formula_key} "))
+ # formulas lacking a math or texmath child are left in place
+ except AttributeError:
+ continue
+
+ # remove floats
+ for fl in para_el.find_all('float'):
+ print('Warning: still has !')
+ fl.decompose()
+
+ # remove notes
+ for note in para_el.find_all('note'):
+ print('Warning: still has !')
+ note.decompose()
+
+ # substitute space characters
+ text = re.sub(r'\s+', ' ', para_el.text)
+ text = re.sub(r'\s', ' ', text)
+
+ # get all cite spans
+ all_cite_spans = []
+ for span in re.finditer(r'(BIBREF\d+)', text):
+ all_cite_spans.append({
+ "start": span.start(),
+ "end": span.start() + len(span.group()),
+ "text": bib_map[span.group()]['num'] if span.group() in bib_map else None,
+ "ref_id": span.group()
+ })
+
+ # get all ref spans
+ # spans are collected pattern by pattern, so they are grouped by type, not sorted by position
+ all_ref_spans = []
+ for span in itertools.chain(
+ re.finditer(r'(FIGREF\d+)', text),
+ re.finditer(r'(TABREF\d+)', text),
+ re.finditer(r'(EQREF\d+)', text),
+ re.finditer(r'(FOOTREF\d+)', text),
+ re.finditer(r'(SECREF\d+)', text),
+ re.finditer(r'(SECREFU\d+)', text),
+ ):
+ all_ref_spans.append({
+ "start": span.start(),
+ "end": span.start() + len(span.group()),
+ "text": ref_map[span.group()]['num'] if span.group() in ref_map else None,
+ "ref_id": span.group()
+ })
+
+ # get all equation spans
+ all_eq_spans = []
+ for span in itertools.chain(
+ re.finditer(r'(INLINEFORM\d+)', text),
+ re.finditer(r'(DISPLAYFORM\d+)', text)
+ ):
+ try:
+ matching_formula = formula_dict[span.group()]
+ # NOTE(review): 'ref_id' is the placeholder token; the EQREF id stored at
+ # matching_formula[3] is not used here - confirm this is intended
+ all_eq_spans.append({
+ "start": span.start(),
+ "end": span.start() + len(span.group()),
+ "text": matching_formula[0],
+ "latex": matching_formula[1],
+ "mathml": matching_formula[2],
+ "ref_id": span.group()
+ })
+ # placeholder-like text with no matching formula is ignored
+ except KeyError:
+ continue
+
+ # assert all align
+ for cite_span in all_cite_spans:
+ assert text[cite_span['start']:cite_span['end']] == cite_span['ref_id']
+ for ref_span in all_ref_spans:
+ assert text[ref_span['start']:ref_span['end']] == ref_span['ref_id']
+
+ return Paragraph(
+ text=text,
+ cite_spans=all_cite_spans,
+ ref_spans=all_ref_spans,
+ eq_spans=all_eq_spans,
+ section=section_info
+ )
+
+
+ def decompose_tags_before_title(sp: BeautifulSoup):
+     """
+     decompose all tags before title
+     :param sp: soup whose first element after the body tag is expected to be 'std' or 'unknown'
+     :return: None; sp is modified in place
+     """
+     root_name = sp.body.next.name
+     if root_name not in ('std', 'unknown'):
+         print(f"Unknown inner tag: {root_name}")
+         return
+
+     # look the container up by the detected name, so an 'unknown' document is
+     # cleaned inside its 'unknown' tag rather than inside a 'std' tag
+     container = sp.find(root_name)
+     cld_tags = container.find_all(recursive=False)
+     # without a title marker there is nothing to anchor on; leave the document alone
+     if not any(tag.name in ('maketitle', 'title') for tag in cld_tags):
+         return
+
+     # iterate over a snapshot of the children so that removing a tag cannot
+     # disturb the iteration
+     for tag in list(container.children):
+         if isinstance(tag, bs4.element.Tag):
+             if tag.name in ('maketitle', 'title'):
+                 break
+             tag.decompose()
+     return
+
+
+ def process_metadata(sp: BeautifulSoup, grobid_client: GrobidClient, log_file: str) -> Tuple[str, List]:
+ """
+ Process metadata section in soup
+ :param sp: soup of the document; a found 'maketitle' or 'metadata' tag is decomposed
+ :param grobid_client: client used by process_author in the 'maketitle' case
+ :param log_file: log file path handed to process_author
+ :return: (title, authors); title may be "" and authors may be []
+ """
+ title = ""
+ authors = []
+
+ # no metadata container at all: a bare title tag is the only source
+ if not sp.maketitle and not sp.metadata:
+ if sp.title:
+ title = sp.title.text
+ return title, authors
+ else:
+ return title, authors
+ elif sp.maketitle:
+ try:
+ # process title
+ title = sp.maketitle.title.text
+ for formula in sp.author.find_all('formula'):
+ formula.decompose()
+ # process authors
+ author_parts = []
+ for tag in sp.author:
+ if type(tag) == NavigableString:
+ author_parts.append(tag.strip())
+ else:
+ author_parts.append(tag.text.strip())
+ author_parts = [re.sub(r'\s+', ' ', line) for line in author_parts]
+ author_parts = [re.sub(r'\s', ' ', line).strip() for line in author_parts]
+ author_parts = [part for part in author_parts if part.strip()]
+ author_string = ', '.join(author_parts)
+ authors = process_author(author_string, grobid_client, log_file)
+ sp.maketitle.decompose()
+ # a missing title or author tag ends up here; whatever was set before the error is returned
+ except AttributeError:
+ sp.maketitle.decompose()
+ return title, authors
+ elif sp.metadata:
+ try:
+ # process title and authors from metadata
+ title = sp.metadata.title.text
+ # get authors
+ for author in sp.authors:
+ # NOTE(review): every child of the author element is decomposed before its text
+ # is read - presumably meant to strip nested tags only; confirm
+ for subtag in author:
+ subtag.decompose()
+ if author.text.strip():
+ author_parts = author.text.strip().split()
+ # NOTE(review): when the last token is a suffix (jr, iii, ...) the token before it
+ # becomes 'last', but it also stays in 'middle' and 'suffix' is left empty
+ authors.append({
+ "first": author_parts[0] if len(author_parts) > 1 else "",
+ "last": author_parts[-1]
+ if author_parts[-1].lower() not in {"jr", "jr.", "iii", "iv", "v"}
+ else author_parts[-2] if len(author_parts) > 1 else author_parts[-1],
+ "middle": author_parts[1:-1],
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ })
+ sp.metadata.decompose()
+ # any AttributeError stops author collection; authors gathered so far are returned
+ except AttributeError:
+ sp.metadata.decompose()
+ return title, authors
+
+ return title, authors
+
+
+ def process_bibliography_from_tex(sp: BeautifulSoup, client, log_file) -> Dict:
+     """
+     Parse bibliography from latex
+     :param sp: soup of the document; all bibliography tags are removed from it at the end
+     :param client: GROBID client, passed on to process_bibentry
+     :param log_file: log file path, passed on to process_bibentry
+     :return: map from bib key to parsed bib entry; the key is the normalized 'bibitem' id,
+         or the bracketed / first-line key when the bibliography has no 'bibitem' tags
+     """
+     bibkey_map = dict()
+     # replace Bibliography with bibliography if needed
+     for bibl in sp.find_all("Bibliography"):
+         bibl.name = 'bibliography'
+     # construct bib map
+     for bibliography in sp.find_all('bibliography'):
+         bib_items = bibliography.find_all('bibitem')
+         # map all bib entries
+         if bib_items:
+             for bi_num, bi in enumerate(bib_items):
+                 try:
+                     if not bi.get('id'):
+                         continue
+                     # get bib entry text and process it
+                     bib_par = bi.find_parent('p')
+                     if bib_par.text:
+                         bib_entry = process_bibentry(bib_par.text, client, log_file)
+                     else:
+                         # empty paragraph: the text may sit in the next paragraph,
+                         # unless that one starts another bib item
+                         next_tag = bib_par.findNext('p')
+                         if not next_tag.find('bibitem') and next_tag.text:
+                             bib_entry = process_bibentry(next_tag.text, client, log_file)
+                         else:
+                             bib_entry = None
+                     # if processed successfully, add to map
+                     if bib_entry:
+                         # get URLs from bib entry
+                         bib_entry['urls'] = [xref.get('url') for xref in bib_par.find_all('xref')]
+                         # map to ref id
+                         ref_id = normalize_latex_id(bi.get('id'))
+                         bib_entry['ref_id'] = ref_id
+                         bib_entry['num'] = bi_num
+                         bibkey_map[ref_id] = bib_entry
+                 except AttributeError:
+                     print('Attribute error in bib item!', bi)
+                     continue
+                 except TypeError:
+                     print('Type error in bib item!', bi)
+                     continue
+         else:
+             # no bibitem tags: every paragraph of THIS bibliography is one entry
+             # (sp.bibliography would always pick the first bibliography of the document)
+             for bi_num, p in enumerate(bibliography.find_all('p')):
+                 try:
+                     bib_key, bib_entry = None, None
+                     bib_text = p.text
+                     bib_name = re.match(r'\[(.*?)\](.*)', bib_text)
+                     if bib_name:
+                         # '[key] text' form: flatten whitespace and match again so the
+                         # text group also covers lines after the first
+                         bib_text = re.sub(r'\s', ' ', bib_text)
+                         bib_name = re.match(r'\[(.*?)\](.*)', bib_text)
+                         if bib_name:
+                             bib_key = bib_name.group(1)
+                             bib_entry = process_bibentry(bib_name.group(2), client, log_file)
+                     else:
+                         # otherwise the first line is the key and the rest is the entry text
+                         bib_lines = bib_text.split('\n')
+                         bib_key = re.sub(r'\s', ' ', bib_lines[0])
+                         bib_text = re.sub(r'\s', ' ', ' '.join(bib_lines[1:]))
+                         bib_entry = process_bibentry(bib_text, client, log_file)
+                     if bib_key and bib_entry:
+                         # get URLs from bib entry
+                         bib_entry['urls'] = [xref.get('url') for xref in p.find_all('xref')]
+                         bib_entry['num'] = bi_num
+                         # map to bib id
+                         bibkey_map[bib_key] = bib_entry
+                 except AttributeError:
+                     print('Attribute error in bib item!', p)
+                     continue
+                 except TypeError:
+                     print('Type error in bib item!', p)
+                     continue
+     for bibliography in sp.find_all('bibliography'):
+         bibliography.decompose()
+     return bibkey_map
+
+
+ def get_section_name(sec):
+     """
+     Work out the name of a section from its div tag
+     :param sec: section tag
+     :return: text of the 'head' child if there is one; otherwise the leading short
+         (under 50 characters) pieces of text that come before the first 'p' tag, joined by spaces
+     """
+     if sec.head:
+         return sec.head.text
+
+     name_parts = []
+     for child in sec:
+         if type(child) == NavigableString:
+             candidate = child.strip()
+         elif child.name != 'p':
+             candidate = child.text.strip()
+         else:
+             # the first paragraph ends the heading area
+             break
+         # a long piece of text is taken to be content, not part of the name
+         if len(candidate) >= 50:
+             break
+         name_parts.append(candidate)
+     return ' '.join(name_parts).strip()
+
+
+ def get_sections_from_div(el: bs4.element.Tag, sp: BeautifulSoup, parent: Optional[str], faux_max: int) -> Dict:
+     """
+     Process section headers for one div
+     :param el: div element; it and its id-carrying 'p'/'proof' children get an 's2orc_id' attribute
+     :param sp: soup, passed along to the recursive calls
+     :param parent: ref id of the enclosing section, or None
+     :param faux_max: number used to build a 'SECREF<n>' id for an unnumbered ('nonumber') div
+     :return: map from ref id to {'num', 'text', 'ref_id', 'parent'} for this div and everything below it
+     :raises NotImplementedError: if an id contains neither 'cid' nor 'uid'
+     """
+     sec_map_dict = dict()
+     el_ref_id = None
+
+     # process divs with ids
+     if el.get('id', None):
+         sec_num = el.get('id-text', None)
+         if 'cid' in el.get('id'):
+             el_ref_id = el.get('id').replace('cid', 'SECREF')
+         elif 'uid' in el.get('id'):
+             el_ref_id = el.get('id').replace('uid', 'SECREFU')
+         else:
+             print('Unknown ID type!', el.get('id'))
+             raise NotImplementedError
+         el['s2orc_id'] = el_ref_id
+         sec_map_dict[el_ref_id] = {
+             "num": sec_num,
+             "text": get_section_name(el),
+             "ref_id": el_ref_id,
+             "parent": parent
+         }
+     # process divs without section numbers
+     elif el.get('rend') == "nonumber":
+         el_ref_id = f'SECREF{faux_max}'
+         el['s2orc_id'] = el_ref_id
+         sec_map_dict[el_ref_id] = {
+             "num": None,
+             "text": get_section_name(el),
+             "ref_id": el_ref_id,
+             "parent": parent
+         }
+
+     # process sub elements
+     for sub_el in el.find_all(recursive=False):
+         if sub_el.name.startswith('div'):
+             # add any unspecified keys
+             sec_keys = [int(k.strip('SECREF')) for k in sec_map_dict.keys() if k and k.strip('SECREF').isdigit()]
+             faux_max = max(sec_keys + [faux_max]) + 1
+             sec_map_dict.update(
+                 get_sections_from_div(sub_el, sp, el_ref_id if el_ref_id else parent, faux_max)
+             )
+         elif sub_el.name == 'p' or sub_el.name == 'proof':
+             if sub_el.get('id', None):
+                 # fall back to the 'hi' child's number only when such a child exists
+                 hi_num = sub_el.hi.get('id-text', None) if sub_el.hi else None
+                 sec_num = sub_el.get('id-text', hi_num)
+                 if 'cid' in sub_el.get('id'):
+                     sub_el_ref_id = sub_el.get('id').replace('cid', 'SECREF')
+                 elif 'uid' in sub_el.get('id'):
+                     sub_el_ref_id = sub_el.get('id').replace('uid', 'SECREFU')
+                 else:
+                     print('Unknown ID type!', sub_el.get('id'))
+                     raise NotImplementedError
+                 sub_el['s2orc_id'] = sub_el_ref_id
+                 # keyed by the sub element's own id, so the entry of the enclosing
+                 # div (el_ref_id) is not overwritten
+                 sec_map_dict[sub_el_ref_id] = {
+                     "num": sec_num,
+                     "text": sub_el.head.text if sub_el.head else sub_el.hi.text if sub_el.hi else "",
+                     "ref_id": sub_el_ref_id,
+                     "parent": el_ref_id if el_ref_id else parent
+                 }
+     return sec_map_dict
+
+
def process_sections_from_text(sp: BeautifulSoup) -> Dict:
    """
    Collect the section map of every top-level ``div0`` in the soup.

    :param sp: parsed document
    :return: dict mapping section reference id -> section entry
    """
    section_map = dict()
    # faux section ids for unnumbered divs start at 1000
    faux_floor = 999

    for top_div in sp.find_all('div0'):
        section_map.update(get_sections_from_div(top_div, sp, None, faux_floor + 1))
        # keep the faux counter above every numeric SECREF id seen so far
        numeric_ids = [
            int(key.strip('SECREF'))
            for key in section_map
            if key and key.strip('SECREF').isdigit()
        ]
        faux_floor = max(numeric_ids + [faux_floor]) + 1

    return section_map
+
+
def process_equations_from_tex(sp: BeautifulSoup) -> Dict:
    """
    Record every display formula that has an id and re-insert it as an inline
    formula wrapped in a new ``<p>``.

    :param sp: parsed document (modified in place)
    :return: dict mapping EQREF id -> {"num", "text", "mathml", "latex", "ref_id"}
    """
    equation_map = dict()

    for formula in sp.find_all('formula'):
        try:
            if formula.get('type', None) != 'display':
                continue
            formula_id = formula.get('id', None)
            if not formula_id:
                continue

            ref_id = formula_id.replace('uid', 'EQREF')
            # MathML conversion is best effort; fall back to an empty string
            try:
                mathml = latex2mathml.converter.convert(formula.texmath.text.strip())
            except Exception:
                mathml = ""
            equation_map[ref_id] = {
                "num": formula.get('id-text', None),
                "text": formula.math.text.strip(),
                "mathml": mathml,
                "latex": formula.texmath.text.strip(),
                "ref_id": ref_id
            }

            # replace with <p> containing the equation as inline
            wrapper = sp.new_tag('p')
            inline_copy = copy.copy(formula)
            inline_copy['type'] = 'inline'
            wrapper.insert(0, inline_copy)
            formula.replace_with(wrapper)
        except AttributeError:
            # formula without the expected math/texmath children: leave it untouched
            continue

    return equation_map
+
+
def process_footnotes_from_text(sp: BeautifulSoup) -> Dict:
    """
    Record every ``note`` that has an id and replace it in the soup with its
    FOOTREF token.

    :param sp: parsed document (modified in place)
    :return: dict mapping FOOTREF id -> {"num", "text", "ref_id"}
    """
    footnote_map = dict()

    for note in sp.find_all('note'):
        try:
            if not (note.name and note.get('id')):
                continue
            # normalize footnote id
            ref_id = note.get('id').replace('uid', 'FOOTREF')
            # drop the tex source of equations
            for texmath in note.find_all('texmath'):
                texmath.decompose()
            # swap every xref for its url
            for xref in note.find_all('xref'):
                xref.replace_with(sp.new_string(f" {xref.get('url')} "))
            # collapse whitespace in the remaining text
            raw_text = note.text
            if raw_text:
                footnote_text = re.sub(r'\s', ' ', re.sub(r'\s+', ' ', raw_text.strip()))
            else:
                footnote_text = None
            footnote_map[ref_id] = {
                "num": note.get('id-text', None),
                "text": footnote_text,
                "ref_id": ref_id
            }
            note.replace_with(sp.new_string(f" {ref_id} "))
        except AttributeError:
            continue

    return footnote_map
+
+
def get_figure_map_from_tex(sp: BeautifulSoup) -> Dict:
    """
    Build the figure map (ids, numbers and file names) without touching captions.

    :param sp: parsed document
    :return: dict mapping FIGREF id -> {"num", "text", "uris", "ref_id"}; "text" is a
        None placeholder
    """
    def _file_name(tag):
        # "<file>.<extension>" when both attributes exist, bare "<file>" otherwise
        stem = tag.get('file')
        ext = tag.get('extension')
        if stem and ext:
            return stem + '.' + ext
        if stem:
            return stem
        return None

    def _uris_for(fig_tag):
        # a figure either names its own file or delegates to its subfigures
        own = _file_name(fig_tag)
        if own is not None:
            return [own]
        names = (_file_name(sub) for sub in fig_tag.find_all('subfigure'))
        return [name for name in names if name is not None]

    figure_map = dict()

    # floats first, because they wrap the figures
    for flt in sp.find_all('float'):
        try:
            if not (flt.name and flt.get('name') == 'figure'):
                continue
            uris = []
            for inner_fig in flt.find_all('figure'):
                uris.extend(_uris_for(inner_fig))
            if flt.get('id'):
                ref_id = flt.get('id').replace('uid', 'FIGREF')
                figure_map[ref_id] = {
                    "num": flt.get('id-text', None),
                    "text": None,  # placeholder
                    "uris": uris,
                    "ref_id": ref_id
                }
        except AttributeError:
            print('Attribute error with figure float: ', flt.name)
            continue

    for fig in sp.find_all('figure'):
        try:
            if not (fig.name and fig.get('id')):
                continue
            # normalize figure id
            ref_id = fig.get('id').replace('uid', 'FIGREF')
            figure_map[ref_id] = {
                "num": fig.get('id-text', None),
                "text": None,  # placeholder
                "uris": _uris_for(fig),
                "ref_id": ref_id
            }
        except AttributeError:
            print('Attribute error with figure: ', fig.name)
            continue

    return figure_map
+
+
def process_figures_from_tex(sp: BeautifulSoup, ref_map: Dict) -> Dict:
    """
    Fill in figure captions in ``ref_map`` and remove the figures from the soup.

    :param sp: parsed document (modified in place)
    :param ref_map: combined reference map; must already hold the figure entries
    :return: the same ``ref_map`` with the "text" of figure entries set
    """
    def _squash(text):
        # collapse whitespace runs, then turn any leftover whitespace char into a space
        return re.sub(r'\s', ' ', re.sub(r'\s+', ' ', text))

    # floats first, because they are on the outside
    for flt in sp.find_all('float'):
        try:
            if not (flt.name and flt.get('name') == 'figure'):
                continue
            if flt.get('id'):
                ref_id = flt.get('id').replace('uid', 'FIGREF')
                # drop the tex source of equations
                for texmath in flt.find_all('texmath'):
                    texmath.decompose()
                caption_text = None
                if flt.caption:
                    flt = replace_ref_tokens(sp, flt, ref_map)
                    caption_text = _squash(flt.caption.text.strip())
                ref_map[ref_id]['text'] = caption_text
            # every figure float is removed, with or without an id
            flt.decompose()
        except AttributeError:
            print('Attribute error with figure float: ', flt.name)
            continue

    for fig in sp.find_all('figure'):
        try:
            if fig.name and fig.get('id'):
                # normalize figure id
                ref_id = fig.get('id').replace('uid', 'FIGREF')
                for texmath in fig.find_all('texmath'):
                    texmath.decompose()
                caption_text = None
                if fig.text:
                    fig = replace_ref_tokens(sp, fig, ref_map)
                    caption_text = _squash(fig.text.strip())
                ref_map[ref_id]["text"] = caption_text
        except AttributeError:
            print('Attribute error with figure: ', fig.name)
            continue
        fig.decompose()

    return ref_map
+
+
def convert_table_to_html(table_lst: List) -> str:
    """
    Render an extracted table as a single-line HTML ``<table>`` string.

    The first row and every row with a truthy 'bottom-border' are rendered with
    header cells (``<th>``); all other rows use ``<td>``.

    :param table_lst: list of row dicts, each with a 'cells' list of dicts holding 'text'
    :return: HTML string, or '' for an empty table
    """
    if not table_lst:
        return ''
    row_chunks = []
    for i, row in enumerate(table_lst):
        cell_tag = 'th' if i == 0 or row.get('bottom-border') else 'td'
        # NOTE(review): cell text is inserted verbatim (not HTML-escaped)
        cells = ''.join(f"<{cell_tag}>{cell['text']}</{cell_tag}>" for cell in row['cells'])
        row_chunks.append(f'<tr>{cells}</tr>')
    return '<table>' + ''.join(row_chunks) + '</table>'
+
+
def extract_table(table: BeautifulSoup) -> List:
    """
    Turn the ``row``/``cell`` elements of a table into a list of row dicts.

    :param table: element containing ``row`` elements
    :return: list of {"top-border", "bottom-border", "cells"}; each cell is
        {"alignment", "right-border", "left-border", "text", "latex"}
    """
    def _collapse(value):
        return re.sub(r'\s+', ' ', value)

    table_rep = []
    for row in table.find_all('row'):
        cells = []
        for cell in row.find_all('cell'):
            plain_parts = []
            tex_parts = []
            for node in cell:
                if type(node) == NavigableString:
                    plain, tex = str(node), str(node)
                elif node.name == 'formula':
                    # formulas contribute rendered math to the text and tex source to the latex
                    plain, tex = node.math.text, node.texmath.text
                else:
                    plain, tex = node.text, node.text
                plain_parts.append(plain)
                tex_parts.append(tex)

            text = re.sub(r'\s', ' ', _collapse(' '.join(plain_parts)))
            latex = _collapse(' '.join(tex_parts))

            cells.append({
                "alignment": cell.get('halign'),
                "right-border": cell.get('right-border') == 'true',
                "left-border": cell.get('left-border') == 'true',
                "text": text.strip(),
                "latex": latex.strip()
            })
        table_rep.append({
            "top-border": row.get('top-border') == "true",
            "bottom-border": row.get('bottom-border') == "true",
            "cells": cells
        })
    return table_rep
+
+
def get_table_map_from_text(sp: BeautifulSoup, keep_table_contents=True) -> Dict:
    """
    Build the table map (ids, numbers, cell contents) and strip the rows of every
    recorded table from the soup; captions are left for a later step.

    :param sp: parsed document (modified in place)
    :param keep_table_contents: when False, "content" and "html" are stored as None
    :return: dict mapping TABREF id -> {"num", "text", "content", "html", "ref_id"};
        "text" is a None placeholder
    """
    table_map = dict()

    def _register(tag):
        # normalize table id
        ref_id = tag.get('id').replace('uid', 'TABREF')
        if keep_table_contents:
            content = extract_table(tag)
            html = convert_table_to_html(content)
        else:
            content = None
            html = None
        table_map[ref_id] = {
            "num": tag.get('id-text', None),
            "text": None,  # placeholder
            "content": content,
            "html": html,
            "ref_id": ref_id
        }
        # rows are captured above; remove them from the soup
        for row in tag.find_all('row'):
            row.decompose()

    for flt in sp.find_all('float'):
        try:
            if flt.name and flt.get('name') == 'table' and flt.get('id'):
                _register(flt)
        except AttributeError:
            print('Attribute error with table float: ', flt.name)
            continue

    for tab in sp.find_all('table'):
        try:
            # skip inline tables
            if tab.get('rend') == 'inline':
                continue
            if tab.name and tab.get('id'):
                _register(tab)
        except AttributeError:
            print('Attribute error with table: ', tab.name)
            continue

    return table_map
+
+
def process_tables_from_tex(sp: BeautifulSoup, ref_map: Dict) -> Dict:
    """
    Fill in table captions in ``ref_map`` and remove the tables from the soup.

    The caption is taken from the first available of: ``caption``, ``head``, all
    ``p`` elements joined by spaces, or the whole remaining text of the element.

    :param sp: parsed document (modified in place)
    :param ref_map: combined reference map; must already hold the table entries
    :return: the same ``ref_map`` with the "text" of table entries set
    """
    def _text_without_tex(node):
        # drop the tex source of equations, then take the stripped text
        for texmath in node.find_all('texmath'):
            texmath.decompose()
        return node.text.strip()

    def _caption_for(tag):
        if tag.caption:
            text = _text_without_tex(replace_ref_tokens(sp, tag.caption, ref_map))
        elif tag.head:
            text = _text_without_tex(replace_ref_tokens(sp, tag.head, ref_map))
        elif tag.p:
            text = ' '.join(
                [_text_without_tex(replace_ref_tokens(sp, par, ref_map)) for par in tag.find_all('p')]
            )
        else:
            text = replace_ref_tokens(sp, tag, ref_map).text.strip()
        if text:
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'\s', ' ', text)
        return text

    # floats first, because they are on the outside
    for flt in sp.find_all('float'):
        try:
            if not (flt.name and flt.get('name') == 'table'):
                continue
            if flt.get('id'):
                # normalize table id
                ref_id = flt.get('id').replace('uid', 'TABREF')
                ref_map[ref_id]['text'] = _caption_for(flt)
            # every table float is removed, with or without an id
            flt.decompose()
        except AttributeError:
            print('Attribute error with table float: ', flt.name)
            continue

    for tab in sp.find_all('table'):
        try:
            # inline tables stay in the soup
            if tab.get('rend') == 'inline':
                continue
            if tab.name and tab.get('id'):
                ref_id = tab.get('id').replace('uid', 'TABREF')
                ref_map[ref_id]['text'] = _caption_for(tab)
        except AttributeError:
            print('Attribute error with table: ', tab.name)
            continue
        tab.decompose()

    return ref_map
+
+
def combine_ref_maps(eq_map: Dict, fig_map: Dict, tab_map: Dict, foot_map: Dict, sec_map: Dict):
    """
    Merge all reference maps into one, tagging every entry with its kind.

    Entries are tagged in place (a 'type' key is added to the dicts passed in).
    When the same id occurs in several maps, the later map wins.

    :param eq_map: equation entries
    :param fig_map: figure entries
    :param tab_map: table entries
    :param foot_map: footnote entries
    :param sec_map: section entries
    :return: dict mapping reference id -> entry
    """
    tagged_sources = (
        ('equation', eq_map),
        ('figure', fig_map),
        ('table', tab_map),
        ('footnote', foot_map),
        ('section', sec_map),
    )
    ref_map = dict()
    for kind, source in tagged_sources:
        for ref_id, entry in source.items():
            entry['type'] = kind
            ref_map[ref_id] = entry
    return ref_map
+
+
def collapse_formatting_tags(sp: BeautifulSoup):
    """
    Replace every ``hi`` formatting tag with its stripped text padded by one space
    on each side.

    :param sp: parsed document (modified in place)
    :return: None
    """
    for hi_tag in sp.find_all('hi'):
        inner_text = sp.new_string(hi_tag.text.strip())
        hi_tag.replace_with(f' {inner_text} ')
+
+
def process_abstract_from_tex(sp: BeautifulSoup, bib_map: Dict, ref_map: Dict) -> List[Dict]:
    """
    Extract the abstract paragraphs and remove them from the soup.

    Uses the ``abstract`` element when present; otherwise falls back to the direct
    ``p`` children of ``std`` (or, failing that, ``unknown``) that carry no
    ``s2orc_id``.

    :param sp: parsed document (modified in place)
    :param bib_map: bibliography map handed to ``process_paragraph``
    :param ref_map: reference map handed to ``process_paragraph``
    :return: list of paragraph dicts (the ``__dict__`` of each processed paragraph)
    """
    paragraphs = []

    if sp.abstract:
        for par in sp.abstract.find_all('p'):
            paragraphs.append(process_paragraph(sp, par, [(None, "Abstract")], bib_map, ref_map))
        sp.abstract.decompose()
        return [para.__dict__ for para in paragraphs]

    # no abstract element: look for loose paragraphs in <std>, then <unknown>
    container = sp.std if sp.std else sp.unknown
    if container:
        loose_pars = [node for node in container if node.name == 'p' and not node.get('s2orc_id', None)]
    else:
        loose_pars = None

    if loose_pars:
        for par in loose_pars:
            paragraphs.append(process_paragraph(sp, par, [(None, "Abstract")], bib_map, ref_map))
            par.decompose()

    return [para.__dict__ for para in paragraphs]
+
+
def build_section_list(sec_id: str, ref_map: Dict) -> List[Tuple]:
    """
    Build the chain of (num, text) section headers leading to ``sec_id``,
    outermost section first, by following the 'parent' links in ``ref_map``.

    The walk stops at a missing or unknown id and at any id that was already
    visited, so self-references and longer parent cycles terminate instead of
    recursing without bound.

    :param sec_id: reference id of the innermost section (may be None or empty)
    :param ref_map: map of reference id -> entry with 'num', 'text' and 'parent'
    :return: list of (num, text) tuples; empty if ``sec_id`` is falsy or unknown
    """
    chain = []
    visited = set()
    current = sec_id
    while current and current in ref_map and current not in visited:
        visited.add(current)
        entry = ref_map[current]
        chain.append((entry['num'], entry['text']))
        current = entry['parent']
    # collected innermost-first; callers expect outermost-first
    chain.reverse()
    return chain
+
+
def get_seclist_for_el(el: bs4.element.Tag, ref_map: Dict, default_seclist: List) -> List[Tuple]:
    """
    Return the section list for an element: the chain built from its ``s2orc_id``
    when it has one, otherwise ``default_seclist``.

    :param el: element (or plain string node) to look at
    :param ref_map: reference map used to resolve the section chain
    :param default_seclist: section list to fall back to
    :return: list of (num, text) tuples
    """
    # plain strings carry no attributes
    if type(el) == NavigableString:
        return default_seclist
    sec_id = el.get('s2orc_id', None)
    return build_section_list(sec_id, ref_map) if sec_id else default_seclist
+
+
def process_div(tag: bs4.element.Tag, secs: List, sp: BeautifulSoup, bib_map: Dict, ref_map: Dict) -> List[Dict]:
    """
    Turn one element into processed paragraphs, descending into nested divs.

    :param tag: element to process
    :param secs: section list that applies to this element
    :param sp: parsed document
    :param bib_map: bibliography map handed to the paragraph processors
    :param ref_map: reference map handed to the paragraph processors
    :return: list of processed paragraphs (empty for skipped or unknown elements)
    """
    # plain strings and skipped tags yield nothing
    if type(tag) == NavigableString:
        return []
    name = tag.name
    if name in SKIP_TAGS:
        return []

    # normal text tags
    if name in TEXT_TAGS:
        return [process_paragraph(sp, tag, secs, bib_map, ref_map)] if tag.text else []

    # lists
    if name == 'list':
        return list(process_list_el(sp, tag, secs, bib_map, ref_map)) if tag.text else []

    # bare formula: put an inline copy wrapped in <p> into the tree in its place
    if name == 'formula':
        wrapper = sp.new_tag('p')
        inline_copy = copy.copy(tag)
        inline_copy['type'] = 'inline'
        wrapper.insert(0, inline_copy)
        tag.replace_with(wrapper)
        return [process_paragraph(sp, tag, secs, bib_map, ref_map)] if tag.text else []

    # nested divs: recurse into every child tag with its own section list
    if name.startswith('div'):
        collected = []
        for child in tag:
            if type(child) == bs4.element.Tag:
                child_secs = get_seclist_for_el(child, ref_map, secs)
                collected += process_div(child, child_secs, sp, bib_map, ref_map)
        return collected

    # unknown tag type, skip for now
    print(f'Unknown tag type: {tag.name}')
    return []
+
+
def process_body_text_from_tex(sp: BeautifulSoup, bib_map: Dict, ref_map: Dict) -> List[Dict]:
    """
    Extract the body paragraphs and remove the body from the soup.

    Walks the children of each top-level body element. The section list is carried
    from one child to the next, so a child without its own ``s2orc_id`` inherits the
    section of the closest preceding sibling that had one.

    :param sp: parsed document (modified in place)
    :param bib_map: bibliography map handed to ``process_div``
    :param ref_map: reference map handed to ``process_div``
    :return: list of paragraph dicts; empty if the document has no body
    """
    body_text = []
    if sp.body is None:
        return body_text

    for tag in sp.body:
        # only real tags carry attributes; strings, comments etc. are skipped
        if not isinstance(tag, bs4.element.Tag):
            continue
        sec_list = get_seclist_for_el(tag, ref_map, [])
        for cld in tag:
            if not isinstance(cld, bs4.element.Tag):
                continue
            sec_list = get_seclist_for_el(cld, ref_map, sec_list)
            body_text += process_div(cld, sec_list, sp, bib_map, ref_map)

    # decompose everything
    sp.body.decompose()

    return [para.__dict__ for para in body_text]
+
+
def convert_xml_to_s2orc(
        sp: BeautifulSoup, file_id: str, year_str: str, log_file: str, grobid_config: Optional[Dict]=None
) -> Paper:
    """
    Convert a parsed LaTeX-XML document into a Paper object.

    The steps below run in a fixed order because most of them modify the soup in
    place: reference maps are built first, captions are filled in next, and the
    abstract and body are extracted last.

    :param sp: parsed document (modified in place)
    :param file_id: identifier used as paper_id, as arxiv_id and in log lines
    :param year_str: publication year as a string (may be empty)
    :param log_file: path of the log file that warnings are appended to
    :param grobid_config: optional configuration handed to GrobidClient
    :return: the assembled Paper
    """
    # create grobid client
    client = GrobidClient(grobid_config)

    # TODO: not sure why but have to run twice
    decompose_tags_before_title(sp)
    decompose_tags_before_title(sp)

    # process maketitle info
    title, authors = process_metadata(sp, client, log_file)

    # processing of bibliography entries
    # TODO: look into why authors aren't processing
    bibkey_map = process_bibliography_from_tex(sp, client, log_file)

    # no bibliography entries: log a warning but keep going
    if not bibkey_map:
        with open(log_file, 'a+') as bib_f:
            bib_f.write(f'{file_id},warn_no_bibs\n')

    # process section headers
    section_map = process_sections_from_text(sp)

    # process and replace non-inline equations
    equation_map = process_equations_from_tex(sp)

    # process footnote markers
    footnote_map = process_footnotes_from_text(sp)

    # get figure map
    figure_map = get_figure_map_from_tex(sp)

    # get table_map
    table_map = get_table_map_from_text(sp)

    # combine references in one dict
    refkey_map = combine_ref_maps(equation_map, figure_map, table_map, footnote_map, section_map)

    # process and replace figures
    refkey_map = process_figures_from_tex(sp, refkey_map)

    # process and replace tables
    refkey_map = process_tables_from_tex(sp, refkey_map)

    # collapse all hi tags
    collapse_formatting_tags(sp)

    # process abstract if possible
    abstract = process_abstract_from_tex(sp, bibkey_map, refkey_map)

    # process body text
    body_text = process_body_text_from_tex(sp, bibkey_map, refkey_map)

    # no body text parsed: log a warning; the paper is still returned
    if not body_text:
        with open(log_file, 'a+') as body_f:
            body_f.write(f'{file_id},warn_no_body\n')

    metadata = {
        "title": title,
        "authors": authors,
        "year": year_str,
        "venue": "",
        "identifiers": {
            "arxiv_id": file_id
        }
    }

    return Paper(
        paper_id=file_id,
        pdf_hash="",
        metadata=metadata,
        abstract=abstract,
        body_text=body_text,
        back_matter=[],
        bib_entries=bibkey_map,
        ref_entries=refkey_map
    )
+
+
def convert_latex_xml_to_s2orc_json(xml_fpath: str, log_dir: str, grobid_config: Optional[Dict]=None) -> Paper:
    """
    Read a LaTeX-XML file and convert it into a Paper.

    The file id is the file name without its extension. The year is guessed from
    the first two characters of the id (values below 40 are read as 20xx, others
    as 19xx); it is left empty when they are not digits.

    :param xml_fpath: path of the XML file; must exist
    :param log_dir: directory that holds (or will hold) 'failed.log'
    :param grobid_config: optional configuration handed on to the conversion
    :return: the converted Paper
    :raises UnicodeDecodeError: if the file cannot be decoded; the failure is logged first
    """
    assert os.path.exists(xml_fpath)

    # get file id: base name without extension, independent of the path separator
    file_id = os.path.splitext(os.path.basename(xml_fpath))[0]

    # try to get year from file name
    year_prefix = file_id.split('.')[0][:2]
    if year_prefix.isdigit():
        two_digit_year = int(year_prefix)
        century = 2000 if two_digit_year < 40 else 1900
        year = str(century + two_digit_year)
    else:
        year = ""

    # log file
    log_file = os.path.join(log_dir, 'failed.log')

    with open(xml_fpath, 'r') as f:
        try:
            xml = f.read()
            soup = BeautifulSoup(xml, "lxml")
            paper = convert_xml_to_s2orc(soup, file_id, year, log_file, grobid_config=grobid_config)
            return paper
        except UnicodeDecodeError:
            with open(log_file, 'a+') as log_f:
                log_f.write(f'{file_id},err_unicode_decode\n')
            # re-raise the caught error: UnicodeDecodeError cannot be instantiated
            # without arguments, so raising the bare class would fail with TypeError
            raise
diff --git a/s2orc-doc2json/doc2json/utils/__init__.py b/s2orc-doc2json/doc2json/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/s2orc-doc2json/doc2json/utils/citation_util.py b/s2orc-doc2json/doc2json/utils/citation_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bae480231f6a5d215c51ce04770a4ffe3c89a07
--- /dev/null
+++ b/s2orc-doc2json/doc2json/utils/citation_util.py
@@ -0,0 +1,75 @@
+# utility functions for handling failure situations with grobid-detected citation spans
+
+import re
+from typing import Dict, List, Tuple
+
+
# matches a bracketed citation marker holding one or more 1-3 digit numbers (no
# leading zero) separated by commas, semicolons, hyphens or whitespace,
# e.g. "[1]", "[2, 3]" or "[4-6]"
BRACKET_REGEX = re.compile(r'\[[1-9]\d{0,2}([,;\-\s]+[1-9]\d{0,2})*;?\]')
# NOTE(review): presumably the number of bracket matches needed to treat a paper as
# bracket-style; not used in this part of the file - confirm against callers
BRACKET_STYLE_THRESHOLD = 5

# matches a marker holding exactly one number and captures it, e.g. "[12]"
SINGLE_BRACKET_REGEX = re.compile(r'\[([1-9]\d{0,2})\]')
# characters that mark a range between two references (hyphen and en dash)
EXPANSION_CHARS = {'-', '–'}
+
+
def span_already_added(sub_start: int, sub_end: int, span_indices: List[Tuple[int, int]]) -> bool:
    """
    Tell whether [sub_start, sub_end] lies entirely inside one of the given spans.

    :param sub_start: start index of the candidate span
    :param sub_end: end index of the candidate span
    :param span_indices: (start, end) pairs of the spans already recorded
    :return: True if some recorded span covers the candidate (bounds inclusive)
    """
    return any(
        span_start <= sub_start and sub_end <= span_end
        for span_start, span_end in span_indices
    )
+
+
def is_expansion_string(between_string: str) -> bool:
    """
    Tell whether the text between two references marks a range (e.g. "-" or " –").

    :param between_string: text found between two reference markers
    :return: True if the text is at most two characters long, contains at least one
        expansion character and otherwise only spaces
    """
    if len(between_string) > 2:
        return False
    allowed_chars = EXPANSION_CHARS.union({' '})
    has_expansion_char = any(ch in EXPANSION_CHARS for ch in between_string)
    only_allowed_chars = all(ch in allowed_chars for ch in between_string)
    return has_expansion_char and only_allowed_chars
+
+
+# TODO: still cases like `09bcee03baceb509d4fcf736fa1322cb8adf507f` w/ dups like ['L Jung', 'R Hessler', 'Louis Jung', 'Roland Hessler']
+# example paper that has empties & duplicates: `09bce26cc7e825e15a4469e3e78b7a54898bb97f`
+def _clean_empty_and_duplicate_authors_from_grobid_parse(authors: List[Dict]) -> List[Dict]:
+ """
+ Within affiliation, `location` is a dict with fields , , , , etc.
+ Too much hassle, so just take the first one that's not empty.
+ """
+ # stripping empties
+ clean_authors_list = []
+ for author in authors:
+ clean_first = author['first'].strip()
+ clean_last = author['last'].strip()
+ clean_middle = [m.strip() for m in author['middle']]
+ clean_suffix = author['suffix'].strip()
+ if clean_first or clean_last or clean_middle:
+ author['first'] = clean_first
+ author['last'] = clean_last
+ author['middle'] = clean_middle
+ author['suffix'] = clean_suffix
+ clean_authors_list.append(author)
+ # combining duplicates (preserve first occurrence of author name as position)
+ key_to_author_blobs = {}
+ ordered_keys_by_author_pos = []
+ for author in clean_authors_list:
+ key = (author['first'], author['last'], ' '.join(author['middle']), author['suffix'])
+ if key not in key_to_author_blobs:
+ key_to_author_blobs[key] = author
+ ordered_keys_by_author_pos.append(key)
+ else:
+ if author['email']:
+ key_to_author_blobs[key]['email'] = author['email']
+ if author['affiliation'] and (author['affiliation']['institution'] or author['affiliation']['laboratory'] or author['affiliation']['location']):
+ key_to_author_blobs[key]['affiliation'] = author['affiliation']
+ dedup_authors_list = [key_to_author_blobs[key] for key in ordered_keys_by_author_pos]
+ return dedup_authors_list
\ No newline at end of file
diff --git a/s2orc-doc2json/doc2json/utils/grobid_util.py b/s2orc-doc2json/doc2json/utils/grobid_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cff1fcff213cfb5382443094cf95f7b8cdb6913
--- /dev/null
+++ b/s2orc-doc2json/doc2json/utils/grobid_util.py
@@ -0,0 +1,388 @@
+from typing import List, Dict, Optional
+import bs4
+from bs4 import BeautifulSoup
+import re
+from collections import defaultdict
+
+
# mixed-case tag names that clean_tags renames to their lowercase form, so that
# the helpers below can look them up with lowercase names (persname, biblscope, ...)
SUBSTITUTE_TAGS = {
    'persName',
    'orgName',
    'publicationStmt',
    'titleStmt',
    'biblScope'
}
+
+
def clean_tags(el: bs4.element.Tag):
    """
    Rename every tag listed in SUBSTITUTE_TAGS below ``el`` to its lowercase name.

    :param el: element whose descendants are renamed in place
    :return: None
    """
    for mixed_case_name in SUBSTITUTE_TAGS:
        lowered = mixed_case_name.lower()
        for found in el.find_all(mixed_case_name):
            found.name = lowered
+
+
def soup_from_path(file_path: str):
    """
    Read an XML file and parse it with the "xml" parser.

    :param file_path: path of the XML file
    :return: the parsed BeautifulSoup document
    """
    # read inside a context manager so the file handle is closed deterministically
    with open(file_path, "rb") as xml_file:
        raw_bytes = xml_file.read()
    return BeautifulSoup(raw_bytes, "xml")
+
+
def get_title_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Return the text of the first title with level "a"; fall back to the first title
    of any kind, and to "" when there is none.

    :param raw_xml: parsed entry
    :return: title text
    """
    level_a_titles = (
        entry.text
        for entry in raw_xml.find_all("title")
        if entry.has_attr("level") and entry["level"] == "a"
    )
    first_match = next(level_a_titles, None)
    if first_match is not None:
        return first_match
    try:
        return raw_xml.title.text
    except AttributeError:
        return ""
+
+
def get_author_names_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict[str, str]]:
    """
    Return one name dict per author that has a ``persname`` element.

    e.g.
        {
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix
        }

    "middle" is a list: it collects middle forenames, any additional forename typed
    "first", and every surname except the last one. Forenames without a "type"
    attribute are ignored.

    :param raw_xml: parsed entry
    :return: list of name dicts in document order
    """
    names = []

    for author in raw_xml.find_all("author"):
        if not author.persname:
            continue

        # forenames include first and middle names
        forenames = author.persname.find_all("forename")

        # surnames include last names
        surnames = author.persname.find_all("surname")

        # name suffixes
        suffixes = author.persname.find_all("suffix")

        first = ""
        middle = []
        for forename in forenames:
            # .get() rather than indexing: a forename may lack the "type" attribute
            forename_type = forename.get("type")
            if forename_type == "first":
                if not first:
                    first = forename.text
                else:
                    middle.append(forename.text)
            elif forename_type == "middle":
                middle.append(forename.text)

        last = ""
        if surnames:
            # every surname but the final one is treated as a middle name
            middle.extend(surname.text for surname in surnames[:-1])
            last = surnames[-1].text

        # join all suffix elements (empty string when there are none)
        suffix = " ".join(suffix_el.text for suffix_el in suffixes)

        names.append({
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix
        })
    return names
+
+
def get_affiliation_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict:
    """
    Extract laboratory, institution and address parts from the first affiliation.

    :param raw_xml: parsed element (may be None)
    :return: {"laboratory", "institution", "location"} when a laboratory or an
        institution name was found, otherwise {}
    """
    if not (raw_xml and raw_xml.affiliation):
        return {}

    org_names = {"laboratory": "", "institution": ""}
    location = dict()

    for node in raw_xml.affiliation:
        if node.name == "orgname":
            # only typed orgnames of the two known kinds are kept
            if node.has_attr("type") and node["type"] in org_names:
                org_names[node["type"]] = node.text
        elif node.name == "address":
            for part in node:
                if part.name and part.text:
                    location[part.name] = part.text

    if not (org_names["laboratory"] or org_names["institution"]):
        return {}

    return {
        "laboratory": org_names["laboratory"],
        "institution": org_names["institution"],
        "location": location
    }
+
+
def get_author_data_from_grobid_xml(raw_xml: BeautifulSoup) -> List[Dict]:
    """
    Return one dict per author with name parts, affiliation and email.

    e.g.
        {
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix,
            "affiliation": {
                "laboratory": "",
                "institution": "",
                "location": "",
            },
            "email": ""
        }

    Authors without a ``persname`` element are kept with empty name parts.

    :param raw_xml: parsed document
    :return: list of author dicts in document order
    """
    authors = []

    for author in raw_xml.find_all("author"):

        first = ""
        middle = []
        last = ""
        suffix = ""

        if author.persname:
            # forenames include first and middle names
            forenames = author.persname.find_all("forename")

            # surnames include last names
            surnames = author.persname.find_all("surname")

            # name suffixes
            suffixes = author.persname.find_all("suffix")

            for forename in forenames:
                forename_type = forename.get("type")
                if forename_type == "first":
                    if not first:
                        first = forename.text
                    else:
                        middle.append(forename.text)
                elif forename_type == "middle":
                    middle.append(forename.text)

            if surnames:
                # every surname but the final one is treated as a middle name
                middle.extend(surname.text for surname in surnames[:-1])
                last = surnames[-1].text

            # join all suffix elements (empty string when there are none)
            suffix = " ".join(suffix_el.text for suffix_el in suffixes)

        affiliation = get_affiliation_from_grobid_xml(author)

        email = ""
        if author.email:
            email = author.email.text

        authors.append({
            "first": first,
            "middle": middle,
            "last": last,
            "suffix": suffix,
            "affiliation": affiliation,
            "email": email
        })

    return authors
+
+
def get_year_from_grobid_xml(raw_xml: BeautifulSoup) -> Optional[int]:
    """
    Return the year at the start of the first date's "when" attribute.

    :param raw_xml: parsed entry
    :return: four-digit year starting with 19 or 20, or None if there is no such date
    """
    date_el = raw_xml.date
    if not (date_el and date_el.has_attr("when")):
        return None
    # the date format is unspecified; only a leading 19xx/20xx is accepted
    found = re.match(r"((19|20)\d{2})", date_el["when"])
    if not found:
        return None
    year = found.group(0)
    if year and year.isnumeric() and len(year) == 4:
        return int(year)
    return None
+
+
def get_venue_from_grobid_xml(raw_xml: BeautifulSoup, title_text: str) -> str:
    """
    Return the venue/journal/publisher of a bib entry.

    Grobid ref documentation: https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/
    level="j": journal title
    level="m": "non journal bibliographical item holding the cited article"
    level="s": series title

    Titles whose text equals ``title_text`` are ignored. Among the rest, level "j"
    is preferred over "m", and "m" over "s"; ties go to the earliest title.

    :param raw_xml: parsed bib entry
    :param title_text: title of the entry itself
    :return: venue text, or "" when no suitable title exists
    """
    keep_types = ["j", "m", "s"]
    # get all titles of the above types
    candidates = [
        (entry["level"], entry.text)
        for entry in raw_xml.find_all("title")
        if entry.has_attr("level") and entry["level"] in keep_types and entry.text != title_text
    ]
    if not candidates:
        return ""
    # min() returns the first of equally ranked candidates, like a stable sort would
    best = min(candidates, key=lambda pair: keep_types.index(pair[0]))
    return best[1]
+
+
def get_volume_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Return the volume number of a grobid bib entry.

    :param raw_xml: parsed bib entry
    :return: text of the first ``biblscope`` with unit "volume", or "" if there is none
    """
    volumes = (
        scope.text
        for scope in raw_xml.find_all("biblscope")
        if scope.has_attr("unit") and scope["unit"] == "volume"
    )
    return next(volumes, "")
+
+
def get_issue_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Return the issue number of a grobid bib entry.

    :param raw_xml: parsed bib entry
    :return: text of the first ``biblscope`` with unit "issue", or "" if there is none
    """
    issues = (
        scope.text
        for scope in raw_xml.find_all("biblscope")
        if scope.has_attr("unit") and scope["unit"] == "issue"
    )
    return next(issues, "")
+
+
def get_pages_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Return the page numbers of a grobid bib entry.

    :param raw_xml: parsed bib entry
    :return: "from--to" for the first page ``biblscope`` that has a "from" attribute,
        just "from" when it has no "to", or "" if there is no such element
    """
    for scope in raw_xml.find_all("biblscope"):
        is_page_range = scope.has_attr("unit") and scope["unit"] == "page" and scope.has_attr("from")
        if not is_page_range:
            continue
        from_page = scope["from"]
        if not scope.has_attr("to"):
            return from_page
        to_page = scope["to"]
        return f'{from_page}--{to_page}'
    return ""
+
+
def get_other_ids_from_grobid_xml(raw_xml: BeautifulSoup) -> Dict[str, List]:
    """
    Group the identifiers (``idno`` elements) of a grobid bib entry by their type,
    e.g. arxiv, pubmed, doi.

    :param raw_xml: parsed bib entry
    :return: defaultdict mapping idno type -> list of identifier texts; elements
        without a type or with empty text are left out
    """
    ids_by_type = defaultdict(list)

    for idno in raw_xml.find_all("idno"):
        if not (idno.has_attr("type") and idno.text):
            continue
        ids_by_type[idno["type"]].append(idno.text)

    return ids_by_type
+
+
def get_raw_bib_text_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Return the raw bibliography string of a bib entry.

    :param raw_xml: parsed bib entry
    :return: text of the first ``note`` with type "raw_reference", or "" if there is none
    """
    raw_references = (
        note.text
        for note in raw_xml.find_all("note")
        if note.has_attr("type") and note["type"] == "raw_reference"
    )
    return next(raw_references, "")
+
+
def get_publication_datetime_from_grobid_xml(raw_xml: BeautifulSoup) -> str:
    """
    Find and return the publication datetime if it exists.

    The publication statement is looked up under its lowercase name first (the
    name it has after ``clean_tags`` ran) and under its original mixed-case name
    as a fallback, so the lookup works in both situations.

    :param raw_xml: parsed document
    :return: "when" attribute of the first ``date`` child with type "published",
        or "" if there is none
    """
    publication_stmt = raw_xml.publicationstmt
    if publication_stmt is None:
        publication_stmt = raw_xml.publicationStmt
    if publication_stmt is None:
        return ""

    for child in publication_stmt:
        # text nodes have no usable tag name; only <date> elements are of interest
        if getattr(child, "name", None) != "date":
            continue
        if child.has_attr("type") and child["type"] == "published" and child.has_attr("when"):
            return child["when"]
    return ""
+
+
def parse_bib_entry(bib_entry: BeautifulSoup) -> Dict:
    """
    Parse one bib entry into a flat dict.

    :param bib_entry: parsed bib entry; its tag names are normalized in place first
    :return: dict with 'ref_id', 'title', 'authors', 'year', 'venue', 'volume',
        'issue', 'pages', 'other_ids', 'raw_text' and an empty 'urls' list
    """
    clean_tags(bib_entry)

    parsed = dict()
    parsed['ref_id'] = bib_entry.attrs.get("xml:id", None)
    parsed['title'] = get_title_from_grobid_xml(bib_entry)
    parsed['authors'] = get_author_names_from_grobid_xml(bib_entry)
    parsed['year'] = get_year_from_grobid_xml(bib_entry)
    # the entry's own title is excluded from the venue candidates
    parsed['venue'] = get_venue_from_grobid_xml(bib_entry, parsed['title'])
    parsed['volume'] = get_volume_from_grobid_xml(bib_entry)
    parsed['issue'] = get_issue_from_grobid_xml(bib_entry)
    parsed['pages'] = get_pages_from_grobid_xml(bib_entry)
    parsed['other_ids'] = get_other_ids_from_grobid_xml(bib_entry)
    parsed['raw_text'] = get_raw_bib_text_from_grobid_xml(bib_entry)
    parsed['urls'] = []
    return parsed
+
+
def is_reference_tag(tag: bs4.element.Tag) -> bool:
    """
    Tell whether ``tag`` is a bibliography reference, i.e. a ``ref`` of type "bibr".

    :param tag: element to check
    :return: True for ``<ref type="bibr">``
    """
    if tag.name != "ref":
        return False
    return tag.attrs.get("type", "") == "bibr"
+
+
def extract_paper_metadata_from_grobid_xml(tag: bs4.element.Tag) -> Dict:
    """
    Extract paper metadata (title, authors with affiliations, publication date)
    from grobid xml.

    :param tag: element holding the document header; its tag names are normalized
        in place first
    :return: dict with 'title', 'authors' and 'year' (the publication datetime string)
    """
    clean_tags(tag)
    title = tag.titlestmt.title.text
    authors = get_author_data_from_grobid_xml(tag)
    published = get_publication_datetime_from_grobid_xml(tag)
    return {
        "title": title,
        "authors": authors,
        "year": published
    }
\ No newline at end of file
diff --git a/s2orc-doc2json/doc2json/utils/latex_util.py b/s2orc-doc2json/doc2json/utils/latex_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..c43b4f8e96a45b017d98f61a5e460e21208d0401
--- /dev/null
+++ b/s2orc-doc2json/doc2json/utils/latex_util.py
@@ -0,0 +1,204 @@
+"""
+Many of the REGEX expressions and pipeline in this set of utilities are borrowed or extended from
+the unarXive project: https://github.com/IllDepence/unarXive
+
+Modifications have been made to better identify the primary latex file and expand all other latex
+files into the main file. Latexpand and tralics options have also been changed.
+"""
+import chardet
+import magic
+import os
+import re
+import glob
+import subprocess
+import tempfile
+
MAIN_TEX_PATT = re.compile(r'(\\begin\s*\{\s*document\s*\})', re.I)
# ^ with capturing parentheses so that the pattern can be used for splitting
PDF_EXT_PATT = re.compile(r'^\.pdf$', re.I)
GZ_EXT_PATT = re.compile(r'^\.gz$', re.I)
TEX_EXT_PATT = re.compile(r'^\.tex$', re.I)
NON_TEXT_PATT = re.compile(r'^\.(pdf|eps|jpg|png|gif)$', re.I)
BBL_SIGN = '\\bibitem'
# natbib fix
PRE_FIX_NATBIB = True
# both halves are raw strings: sequences such as "\[" and "\s" are regex
# escapes, and in a non-raw literal they are invalid string escapes
NATBIB_PATT = re.compile((r'\\cite(t|p|alt|alp|author|year|yearpar)\s*?\*?\s*?'
                          r'(\[[^\]]*?\]\s*?)*?\s*?\*?\s*?\{([^\}]+?)\}'),
                         re.I)
# bibitem option fix
PRE_FIX_BIBOPT = True
BIBOPT_PATT = re.compile(r'\\bibitem\s*?\[[^]]*?\]', re.I|re.M)

# ↑ above two solve most tralics problems; except for mnras style bibitems
# (https://ctan.org/pkg/mnras)

# aggressive math pre-removal
PRE_FILTER_MATH = False
FILTER_PATTS = []
for env in ['equation', 'displaymath', 'array', 'eqnarray', 'align', 'gather',
            'multline', 'flalign', 'alignat']:
    # group 1 captures the environment name together with an optional "*";
    # the backreference makes \end{...} carry exactly the same name, so a
    # starred environment is closed by its own starred \end and not by a
    # later unstarred one
    s = r'\\begin\{{({0}[*]?)\}}.+?\\end\{{\1\}}'.format(env)
    patt = re.compile(s, re.I | re.M | re.S)
    FILTER_PATTS.append(patt)
FILTER_PATTS.append(re.compile(r'\$\$.+?\$\$', re.S))
FILTER_PATTS.append(re.compile(r'\$.+?\$', re.S))
FILTER_PATTS.append(re.compile(r'\\\(.+?\\\)', re.S))
FILTER_PATTS.append(re.compile(r'\\\[.+?\\\]', re.S))
+
+
def read_file(path):
    """
    Read a file as text, falling back to encoding detection when needed

    The file is first opened in text mode with the default encoding. If that
    raises UnicodeDecodeError, the raw bytes are decoded with the encoding
    reported by ``magic``; if that also fails, with the encoding reported by
    ``chardet`` (undecodable bytes are replaced in this last attempt).

    :param path: path of the file to read
    :return: the decoded content, or '' when no usable encoding is found
    """
    try:
        with open(path) as f:
            return f.read()
    except UnicodeDecodeError:
        pass

    # read the raw bytes inside a context manager so the handle is closed
    with open(path, 'rb') as f:
        blob = f.read()

    m = magic.Magic(mime_encoding=True)
    encoding = m.from_buffer(blob)
    try:
        return blob.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # wrong guess, or a name that is not a known codec
        pass

    encoding = chardet.detect(blob)['encoding']
    if not encoding:
        return ''
    try:
        return blob.decode(encoding, errors='replace')
    except (LookupError, ValueError):
        return ''
+
+
def remove_math(latex_str):
    """
    Strip math from the document body of a latex string

    The string is split once at the first match of MAIN_TEX_PATT; every pattern
    in FILTER_PATTS is then removed from the part after that match, while the
    part before it and the match itself are kept unchanged.

    :param latex_str: latex source
    :return: the source with math removed from the body; the input is returned
        unchanged when MAIN_TEX_PATT does not match
    """
    parts = re.split(MAIN_TEX_PATT, latex_str, maxsplit=1)
    # without a match re.split returns a single element, so there is no body
    # to filter
    if len(parts) < 3:
        return latex_str
    preamble, begin_document, body = parts
    for patt in FILTER_PATTS:
        body = patt.sub('', body)
    return ''.join((preamble, begin_document, body))
+
+
def normalize(path, out_dir, write_logs=True):
    """
    Normalize an arXiv file
    Adapted from https://github.com/IllDepence/unarXive
    with modifications

    Identifies the primary *.tex file, the bibliography file,
    and expands other tex files and the bibliography into the
    main tex file

    :param path: directory containing the source files
    :param out_dir: directory that receives `<name>.tex` and
        `log_latexpand.txt` (and `log.txt` when write_logs is set), where
        `<name>` is the last component of path
    :param write_logs: whether messages are appended to `log.txt`
    :return: None; when no file containing \\begin{document} is found the
        function logs that and returns without running latexpand
    """
    def log(msg):
        if write_logs:
            with open(os.path.join(out_dir, 'log.txt'), 'a') as f:
                f.write('{}\n'.format(msg))

    def has_begin_document(file_name):
        # best effort: a file that cannot be read simply does not qualify
        try:
            cntnt = read_file(os.path.join(path, file_name))
        except Exception:
            return False
        return re.search(MAIN_TEX_PATT, cntnt) is not None

    # break path
    _, fn = os.path.split(path.strip('/'))

    # identify main tex file
    main_tex_path = None
    ignored_names = []

    # check .tex files first; when several qualify the last one listed wins
    for tfn in os.listdir(path):
        if not TEX_EXT_PATT.match(os.path.splitext(tfn)[1]):
            ignored_names.append(tfn)
            continue
        if has_begin_document(tfn):
            main_tex_path = tfn

    # try other files
    if main_tex_path is None:
        for tfn in ignored_names:
            if NON_TEXT_PATT.match(os.path.splitext(tfn)[1]):
                continue
            if has_begin_document(tfn):
                main_tex_path = tfn

    # give up: nothing to expand, so stop here rather than hand None to
    # latexpand
    if main_tex_path is None:
        log(('couldn\'t find main tex file in dump archive {}'
             '').format(fn))
        return

    # flatten to single tex file and save
    with tempfile.TemporaryDirectory() as tmp_dir_path:
        temp_tex_fn = os.path.join(tmp_dir_path, f'{fn}.tex')

        # find bbl file
        bbl_files = glob.glob(os.path.join(path, '*.bbl'))

        if bbl_files:
            latexpand_args = ['latexpand',
                              '--expand-bbl',
                              os.path.split(bbl_files[0])[1],
                              main_tex_path,
                              '--output',
                              temp_tex_fn]
        else:
            latexpand_args = ['latexpand',
                              main_tex_path,
                              '--output',
                              temp_tex_fn]

        # run latexpand (file names are relative, hence cwd=path)
        with open(os.path.join(out_dir, 'log_latexpand.txt'), 'a+') as err:
            subprocess.run(latexpand_args, stderr=err, cwd=path)

        # re-read and write to ensure utf-8 b/c latexpand doesn't
        # behave
        new_tex_fn = os.path.join(out_dir, f'{fn}.tex')
        cntnt = read_file(temp_tex_fn)
        if PRE_FIX_NATBIB:
            cntnt = NATBIB_PATT.sub(r'\\cite{\3}', cntnt)
        if PRE_FIX_BIBOPT:
            cntnt = BIBOPT_PATT.sub(r'\\bibitem', cntnt)
        if PRE_FILTER_MATH:
            cntnt = remove_math(cntnt)
        with open(new_tex_fn, mode='w', encoding='utf-8') as f:
            f.write(cntnt)
+
+
def latex_to_xml(tex_file: str, out_dir: str, out_file: str, err_file: str, log_file: str,
                 timeout: float = 5):
    """
    Convert expanded latex file to XML using tralics
    :param tex_file: latex file handed to tralics
    :param out_dir: directory passed to tralics as -output_dir
    :param out_file: path where the XML output is expected; it is only checked
        for existence, never written here
    :param err_file: file to which the stderr of tralics is appended
    :param log_file: file to which tex_file is appended (once) when the run
        timed out or left no out_file behind
    :param timeout: seconds to wait for tralics before giving up on the run
    :return: out_file when it exists after the run, otherwise None
    """
    tralics_args = ['tralics',
                    '-silent',
                    '-noxmlerror',
                    '-utf8',
                    '-oe8',
                    '-entnames=false',
                    '-nomathml',
                    f'-output_dir={out_dir}',
                    tex_file]

    with open(err_file, 'a+') as err_f, open(log_file, 'a+') as skip_f:
        timed_out = False
        try:
            subprocess.run(tralics_args, stdout=subprocess.DEVNULL, stderr=err_f,
                           timeout=timeout)
        except subprocess.TimeoutExpired:
            timed_out = True

        produced = os.path.exists(out_file)

        # a single skip entry per file, whether it timed out, produced no
        # output, or both
        if timed_out or not produced:
            skip_f.write(f'{tex_file}\n')

    return out_file if produced else None
diff --git a/s2orc-doc2json/doc2json/utils/refspan_util.py b/s2orc-doc2json/doc2json/utils/refspan_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..728ef0b2cc70065e44cbe79d0fec75b4b3b82e8b
--- /dev/null
+++ b/s2orc-doc2json/doc2json/utils/refspan_util.py
@@ -0,0 +1,115 @@
+from typing import List, Tuple
+
+
def replace_refspans(
        spans_to_replace: List[Tuple[int, int, str, str]],
        full_string: str,
        pre_padding: str = "",
        post_padding: str = "",
        btwn_padding: str = ", "
) -> str:
    """
    For each span within the full string, replace that span with new text

    Spans are processed in order of their start index. A span that starts
    inside a previously replaced span is dropped, and so is a span whose end
    index is not positive. A span that starts exactly where the previously
    replaced span ended is additionally prefixed with btwn_padding. The input
    list is not modified.

    :param spans_to_replace: list of tuples of form (start_ind, end_ind, span_text, new_substring)
    :param full_string: text that the span indices refer to
    :param pre_padding: text inserted before every replacement
    :param post_padding: text inserted after every replacement
    :param btwn_padding: text inserted between two replacements that abut
    :return: full_string with every retained span replaced
    """
    # assert all spans are equal to full_text span
    assert all([full_string[start:end] == span for start, end, span, _ in spans_to_replace])

    # assert none of the spans start with the same start ind
    start_inds = [rep[0] for rep in spans_to_replace]
    assert len(set(start_inds)) == len(start_inds)

    # work on a sorted copy and in original coordinates: the output is stitched
    # together from the untouched text between spans and the replacements, so
    # no index shifting is needed
    pieces = []
    cursor = 0
    prev_end = None
    for start, end, _, new_string in sorted(spans_to_replace, key=lambda x: x[0]):
        # skip empties
        if end <= 0:
            continue
        # overlap with the previously replaced span: drop this one
        if prev_end is not None and start < prev_end:
            continue
        pieces.append(full_string[cursor:start])
        # span abuts the previously replaced span
        if prev_end is not None and start == prev_end:
            pieces.append(btwn_padding)
        pieces.append(pre_padding + new_string + post_padding)
        cursor = end
        prev_end = end

    pieces.append(full_string[cursor:])
    return "".join(pieces)
+
+
def sub_spans_and_update_indices(
        spans_to_replace: List[Tuple[int, int, str, str]],
        full_string: str
) -> Tuple[str, List]:
    """
    Substitute every span with its surface form and report where each one lands

    :param spans_to_replace: list of tuples of form (start_ind, end_ind, token, surface);
        the list is sorted in place by start index
    :param full_string: text that the span indices refer to
    :return: tuple of (text after substitution, list of (start, end, token, surface)
        tuples whose indices refer to the new text)
    """
    # TODO: check no spans overlapping
    # TODO: check all spans well-formed

    # every span must match the text it claims to cover
    assert all(full_string[start:end] == token for start, end, token, _ in spans_to_replace)

    # start indices must be unique
    assert len({rep[0] for rep in spans_to_replace}) == len(spans_to_replace)

    # sort by start index
    spans_to_replace.sort(key=lambda x: x[0])

    # walk the spans left to right, carrying the total length change caused by
    # all earlier substitutions
    updated_spans = []
    cumulative_shift = 0
    for start, end, token, surface in spans_to_replace:
        new_start = start + cumulative_shift
        updated_spans.append((new_start, new_start + len(surface), token, surface))
        cumulative_shift += start + len(surface) - end

    # the new indices are computed before the text is rewritten
    new_text = replace_refspans(spans_to_replace, full_string, btwn_padding="")

    return new_text, updated_spans
+
+
diff --git a/s2orc-doc2json/doc2json/utils/soup_utils.py b/s2orc-doc2json/doc2json/utils/soup_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4531a64af09bb5b929c7b18ef01ce550b97981b
--- /dev/null
+++ b/s2orc-doc2json/doc2json/utils/soup_utils.py
@@ -0,0 +1,19 @@
+
+from typing import List
+
+from bs4 import BeautifulSoup
+
def destroy_unimportant_tags_inplace(soup_tag, tags_to_remove: List[str]):
    """
    Strip the named tags from soup_tag in place while keeping their contents

    Every descendant whose tag name is listed in tags_to_remove is replaced by
    its own children, so the markup disappears but the text and nested tags
    stay where they were.

    :param soup_tag: tag whose descendants are searched
    :param tags_to_remove: names of the tags to strip
    """
    for tag_to_remove in tags_to_remove:
        for match in soup_tag.find_all(tag_to_remove):
            # unwrap() is the current name of the deprecated replaceWithChildren()
            match.unwrap()
+
+
def create_new_parent_tag(soup_tag, parent_tag_name: str, soup):
    """
    Wrap soup_tag in a newly created tag and return that new tag

    :param soup_tag: tag to be wrapped
    :param parent_tag_name: name of the tag to create around it
    :param soup: object used to create the new tag (via new_tag)
    :return: the newly created parent tag
    """
    wrapper = soup.new_tag(parent_tag_name)
    # put the wrapper where soup_tag was, then append whatever replace_with
    # hands back as the wrapper's child
    wrapper.append(soup_tag.replace_with(wrapper))
    return wrapper
+
diff --git a/s2orc-doc2json/output_dir/2020.acl-main.207.json b/s2orc-doc2json/output_dir/2020.acl-main.207.json
new file mode 100644
index 0000000000000000000000000000000000000000..a410fa7020b4652162baba981191a98ab135c305
--- /dev/null
+++ b/s2orc-doc2json/output_dir/2020.acl-main.207.json
@@ -0,0 +1,3774 @@
+{
+ "paper_id": "2020",
+ "header": {
+ "generated_with": "S2ORC 1.0.0",
+ "date_generated": "2022-06-22T20:48:30.412654Z"
+ },
+ "title": "SPECTER: Document-level Representation Learning using Citation-informed Transformers",
+ "authors": [
+ {
+ "first": "Arman",
+ "middle": [],
+ "last": "Cohan",
+ "suffix": "",
+ "affiliation": {
+ "laboratory": "",
+ "institution": "University of Washington",
+ "location": {}
+ },
+ "email": "armanc@allenai.org"
+ },
+ {
+ "first": "Sergey",
+ "middle": [],
+ "last": "Feldman",
+ "suffix": "",
+ "affiliation": {
+ "laboratory": "",
+ "institution": "University of Washington",
+ "location": {}
+ },
+ "email": "sergey@allenai.org"
+ },
+ {
+ "first": "Iz",
+ "middle": [],
+ "last": "Beltagy",
+ "suffix": "",
+ "affiliation": {
+ "laboratory": "",
+ "institution": "University of Washington",
+ "location": {}
+ },
+ "email": "beltagy@allenai.org"
+ },
+ {
+ "first": "Doug",
+ "middle": [],
+ "last": "Downey",
+ "suffix": "",
+ "affiliation": {
+ "laboratory": "",
+ "institution": "University of Washington",
+ "location": {}
+ },
+ "email": "dougd@allenai.org"
+ },
+ {
+ "first": "Daniel",
+ "middle": [
+ "S"
+ ],
+ "last": "Weld",
+ "suffix": "",
+ "affiliation": {
+ "laboratory": "",
+ "institution": "University of Washington",
+ "location": {}
+ },
+ "email": ""
+ }
+ ],
+ "year": "",
+ "venue": null,
+ "identifiers": {},
+ "abstract": "Representation learning is a critical ingredient for natural language processing systems. Recent Transformer language models like BERT learn powerful textual representations, but these models are targeted towards token-and sentence-level training objectives and do not leverage information on inter-document relatedness, which limits their document-level representation power. For applications on scientific documents, such as classification and recommendation, the embeddings power strong performance on end tasks. We propose SPECTER, a new method to generate document-level embedding of scientific documents based on pretraining a Transformer language model on a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, SPECTER can be easily applied to downstream applications without task-specific fine-tuning. Additionally, to encourage further research on document-level models, we introduce SCIDOCS, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. We show that SPECTER outperforms a variety of competitive baselines on the benchmark. 1",
+ "pdf_parse": {
+ "paper_id": "2020",
+ "_pdf_hash": "",
+ "abstract": [
+ {
+ "text": "Representation learning is a critical ingredient for natural language processing systems. Recent Transformer language models like BERT learn powerful textual representations, but these models are targeted towards token-and sentence-level training objectives and do not leverage information on inter-document relatedness, which limits their document-level representation power. For applications on scientific documents, such as classification and recommendation, the embeddings power strong performance on end tasks. We propose SPECTER, a new method to generate document-level embedding of scientific documents based on pretraining a Transformer language model on a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, SPECTER can be easily applied to downstream applications without task-specific fine-tuning. Additionally, to encourage further research on document-level models, we introduce SCIDOCS, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. We show that SPECTER outperforms a variety of competitive baselines on the benchmark. 1",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Abstract",
+ "sec_num": null
+ }
+ ],
+ "body_text": [
+ {
+ "text": "As the pace of scientific publication continues to increase, Natural Language Processing (NLP) tools that help users to search, discover and understand the scientific literature have become critical. In recent years, substantial improvements in NLP tools have been brought about by pretrained neural language models (LMs) (Radford et al., 2018; Devlin et al., 2019; . While such models are widely used for representing individual words or sentences, extensions to whole-document embeddings are relatively underexplored. Likewise, methods that do use inter-document signals to produce whole-document embeddings (Tu et al., 2017; ) have yet to incorporate stateof-the-art pretrained LMs. Here, we study how to leverage the power of pretrained language models to learn embeddings for scientific documents.",
+ "cite_spans": [
+ {
+ "start": 322,
+ "end": 344,
+ "text": "(Radford et al., 2018;",
+ "ref_id": "BIBREF38"
+ },
+ {
+ "start": 345,
+ "end": 365,
+ "text": "Devlin et al., 2019;",
+ "ref_id": "BIBREF11"
+ },
+ {
+ "start": 610,
+ "end": 627,
+ "text": "(Tu et al., 2017;",
+ "ref_id": "BIBREF46"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "A paper's title and abstract provide rich semantic content about the paper, but, as we show in this work, simply passing these textual fields to an \"off-the-shelf\" pretrained language model-even a state-of-the-art model tailored to scientific text like the recent SciBERT (Beltagy et al., 2019) -does not result in accurate paper representations. The language modeling objectives used to pretrain the model do not lead it to output representations that are helpful for document-level tasks such as topic classification or recommendation.",
+ "cite_spans": [
+ {
+ "start": 272,
+ "end": 294,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "In this paper, we introduce a new method for learning general-purpose vector representations of scientific documents. Our system, SPECTER, 2 incorporates inter-document context into the Transformer (Vaswani et al., 2017) language models (e.g., SciBERT (Beltagy et al., 2019) ) to learn document representations that are effective across a wide-variety of downstream tasks, without the need for any task-specific fine-tuning of the pretrained language model. We specifically use citations as a naturally occurring, inter-document incidental supervision signal indicating which documents are most related and formulate the signal into a triplet-loss pretraining objective. Unlike many prior works, at inference time, our model does not require any citation information. This is critical for embedding new papers that have not yet been cited. In experiments, we show that SPECTER's representations substantially outperform the state-of-the-art on a variety of document-level tasks, including topic classification, citation prediction, and recommendation.",
+ "cite_spans": [
+ {
+ "start": 198,
+ "end": 220,
+ "text": "(Vaswani et al., 2017)",
+ "ref_id": "BIBREF47"
+ },
+ {
+ "start": 252,
+ "end": 274,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "As an additional contribution of this work, we introduce and release SCIDOCS 3 , a novel collection of data sets and an evaluation suite for documentlevel embeddings in the scientific domain. SCI-DOCS covers seven tasks, and includes tens of thousands of examples of anonymized user signals of document relatedness. We also release our training set (hundreds of thousands of paper titles, abstracts and citations), along with our trained embedding model and its associated code base.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "Our goal is to learn task-independent representations of academic papers. Inspired by the recent success of pretrained Transformer language models across various NLP tasks, we use the Transformer model architecture as basis of encoding the input paper. Existing LMs such as BERT, however, are primarily based on masked language modeling objective, only considering intra-document context and do not use any inter-document information. This limits their ability to learn optimal document representations. To learn high-quality documentlevel representations we propose using citations as an inter-document relatedness signal and formulate it as a triplet loss learning objective. We then pretrain the model on a large corpus of citations using this objective, encouraging it to output representations that are more similar for papers that share a citation link than for those that do not. We call our model SPECTER, which learns Scientific Paper Embeddings using Citation-informed Trans-formERs. With respect to the terminology used by Devlin et al. (2019) , unlike most existing LMs that are \"fine-tuning based\", our approach results in embeddings that can be applied to downstream tasks in a \"feature-based\" fashion, meaning the learned paper embeddings can be easily used as features, with no need for further task-specific fine-tuning. In the following, as background information, we briefly describe how pretrained LMs can be applied for document representation and then discuss the details of SPECTER.",
+ "cite_spans": [
+ {
+ "start": 1034,
+ "end": 1054,
+ "text": "Devlin et al. (2019)",
+ "ref_id": "BIBREF11"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Model 2.1 Overview",
+ "sec_num": "2"
+ },
+ {
+ "text": "3 https://github.com/allenai/scidocs Transformer (initialized with SciBERT) Related paper (P + ) Query paper (P Q ) Unrelated paper (P \u2212 )",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Model 2.1 Overview",
+ "sec_num": "2"
+ },
+ {
+ "text": "Triplet loss =max d P Q , P + \u2212 d P Q , P \u2212 + m , 0",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Model 2.1 Overview",
+ "sec_num": "2"
+ },
+ {
+ "text": "Figure 1: Overview of SPECTER.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Model 2.1 Overview",
+ "sec_num": "2"
+ },
+ {
+ "text": "Recently, pretrained Transformer networks have demonstrated success on various NLP tasks (Radford et al., 2018; Devlin et al., 2019; Liu et al., 2019) ; we use these models as the foundation for SPECTER. Specifically, we use SciBERT (Beltagy et al., 2019) which is an adaptation of the original BERT (Devlin et al., 2019) architecture to the scientific domain. The BERT model architecture (Devlin et al., 2019) uses multiple layers of Transformers (Vaswani et al., 2017) to encode the tokens in a given input sequence. Each layer consists of a self-attention sublayer followed by a feedforward sublayer. The final hidden state associated with the special [CLS] token is usually called the \"pooled output\", and is commonly used as an aggregate representation of the sequence.",
+ "cite_spans": [
+ {
+ "start": 89,
+ "end": 111,
+ "text": "(Radford et al., 2018;",
+ "ref_id": "BIBREF38"
+ },
+ {
+ "start": 112,
+ "end": 132,
+ "text": "Devlin et al., 2019;",
+ "ref_id": "BIBREF11"
+ },
+ {
+ "start": 133,
+ "end": 150,
+ "text": "Liu et al., 2019)",
+ "ref_id": null
+ },
+ {
+ "start": 233,
+ "end": 255,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ },
+ {
+ "start": 300,
+ "end": 321,
+ "text": "(Devlin et al., 2019)",
+ "ref_id": "BIBREF11"
+ },
+ {
+ "start": 389,
+ "end": 410,
+ "text": "(Devlin et al., 2019)",
+ "ref_id": "BIBREF11"
+ },
+ {
+ "start": 448,
+ "end": 470,
+ "text": "(Vaswani et al., 2017)",
+ "ref_id": "BIBREF47"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Background: Pretrained Transformers",
+ "sec_num": "2.2"
+ },
+ {
+ "text": "Our goal is to represent a given paper P as a dense vector v that best represents the paper and can be used in downstream tasks. SPECTER builds embeddings from the title and abstract of a paper. Intuitively, we would expect these fields to be sufficient to produce accurate embeddings, since they are written to provide a succinct and comprehensive summary of the paper. 4 As such, we encode the concatenated title and abstract using a Transformer LM (e.g., SciBERT) and take the final representation of the [CLS] token as the output representation of the paper:",
+ "cite_spans": [
+ {
+ "start": 371,
+ "end": 372,
+ "text": "4",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Document Representation",
+ "sec_num": null
+ },
+ {
+ "text": "EQUATION",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [
+ {
+ "start": 0,
+ "end": 8,
+ "text": "EQUATION",
+ "ref_id": "EQREF",
+ "raw_str": "5 v = Transformer(input) [CLS] ,",
+ "eq_num": "(1)"
+ }
+ ],
+ "section": "Document Representation",
+ "sec_num": null
+ },
+ {
+ "text": "where Transformer is the Transformer's forward function, and input is the concatenation of the [CLS] token and WordPieces (Wu et al., 2016) of the title and abstract of a paper, separated by the [SEP] token. We use SciBERT as our model initialization as it is optimized for scientific text, though our formulation is general and any Transformer language model instead of SciBERT. Using the above method with an \"off-the-shelf\" SciBERT does not take global inter-document information into account. This is because SciBERT, like other pretrained language models, is trained via language modeling objectives, which only predict words or sentences given their in-document, nearby textual context. In contrast, we propose to incorporate citations into the model as a signal of inter-document relatedness, while still leveraging the model's existing strength in modeling language.",
+ "cite_spans": [
+ {
+ "start": 122,
+ "end": 139,
+ "text": "(Wu et al., 2016)",
+ "ref_id": "BIBREF52"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Document Representation",
+ "sec_num": null
+ },
+ {
+ "text": "A citation from one document to another suggests that the documents are related. To encode this relatedness signal into our representations, we design a loss function that trains the Transformer model to learn closer representations for papers when one cites the other, and more distant representations otherwise. The high-level overview of the model is shown in Figure 1 . In particular, each training instance is a triplet of papers: a query paper P Q , a positive paper P + and a negative paper P \u2212 . The positive paper is a paper that the query paper cites, and the negative paper is a paper that is not cited by the query paper (but that may be cited by P + ). We then train the model using the following triplet margin loss function:",
+ "cite_spans": [],
+ "ref_spans": [
+ {
+ "start": 363,
+ "end": 371,
+ "text": "Figure 1",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Citation-Based Pretraining Objective",
+ "sec_num": "2.3"
+ },
+ {
+ "text": "L = max d P Q , P + \u2212 d P Q , P \u2212 + m , 0 (2)",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Citation-Based Pretraining Objective",
+ "sec_num": "2.3"
+ },
+ {
+ "text": "where d is a distance function and m is the loss margin hyperparameter (we empirically choose m = 1). Here, we use the L2 norm distance:",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Citation-Based Pretraining Objective",
+ "sec_num": "2.3"
+ },
+ {
+ "text": "d(P A , P B ) = v A \u2212 v B 2 ,",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Citation-Based Pretraining Objective",
+ "sec_num": "2.3"
+ },
+ {
+ "text": "where v A is the vector corresponding to the pooled output of the Transformer run on paper A (Equation 1). 6 Starting from the trained SciBERT model, we pretrain the Transformer parameters on the citation objective to learn paper representations that capture document relatedness.",
+ "cite_spans": [
+ {
+ "start": 107,
+ "end": 108,
+ "text": "6",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Citation-Based Pretraining Objective",
+ "sec_num": "2.3"
+ },
+ {
+ "text": "The choice of negative example papers P \u2212 is important when training the model. We consider two sets of negative examples: the first set simply consists of randomly selected papers from the corpus.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Selecting Negative Distractors",
+ "sec_num": "2.4"
+ },
+ {
+ "text": "Given a query paper, intuitively we would expect the model to be able to distinguish between cited papers, and uncited papers sampled randomly from the entire corpus. This inductive bias has been also found to be effective in content-based citation recommendation applications . But, random negatives may be easy for the model to distinguish from the positives. To provide a more nuanced training signal, we augment the randomly drawn negatives with a more challenging second set of negative examples. We denote as \"hard negatives\" the papers that are not cited by the query paper, but are cited by a paper cited by the query paper, i.e. if P 1 cite \u2212 \u2212 \u2192 P 2 and P 2 cite \u2212 \u2212 \u2192 P 3",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Selecting Negative Distractors",
+ "sec_num": "2.4"
+ },
+ {
+ "text": "but P 1 cite \u2212 \u2212 \u2192 P 3 , then P 3 is a candidate hard negative example for P 1 . We expect the hard negatives to be somewhat related to the query paper, but typically less related than the cited papers. As we show in our experiments ( \u00a76), including hard negatives results in more accurate embeddings compared to using random negatives alone.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Selecting Negative Distractors",
+ "sec_num": "2.4"
+ },
+ {
+ "text": "At inference time, the model receives one paper, P, and it outputs the SPECTER's Transfomer pooled output activation as the paper representation for P (Equation 1). We note that for inference, SPECTER requires only the title and abstract of the given input paper; the model does not need any citation information about the input paper. This means that SPECTER can produce embeddings even for new papers that have yet to be cited, which is critical for applications that target recent scientific papers.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Inference",
+ "sec_num": "2.5"
+ },
+ {
+ "text": "Previous evaluations of scientific document representations in the literature tend to focus on small datasets over a limited set of tasks, and extremely high (99%+) AUC scores are already possible on these data for English documents . New, larger and more diverse benchmark datasets are necessary. Here, we introduce a new comprehensive evaluation framework to measure the effectiveness of scientific paper embeddings, which we call SCIDOCS. The framework consists of diverse tasks, ranging from citation prediction, to prediction of user activity, to document classification and paper recommendation. Note that SPECTER will not be further fine-tuned on any of the tasks; we simply plug in the embeddings as features for each task. Below, we describe each of the tasks in detail and the evaluation data associated with it. In addition to our training data, we release all the datasets associated with the evaluation tasks.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "SCIDOCS Evaluation Framework",
+ "sec_num": "3"
+ },
+ {
+ "text": "An important test of a document-level embedding is whether it is predictive of the class of the document. Here, we consider two classification tasks in the scientific domain: MeSH Classification In this task, the goals is to classify scientific papers according to their Medical Subject Headings (MeSH) (Lipscomb, 2000) . 7 We construct a dataset consisting of 23K academic medical papers, where each paper is assigned one of 11 top-level disease classes such as cardiovascular diseases, diabetes, digestive diseases derived from the MeSH vocabulary. The most populated category is Neoplasms (cancer) with 5.4K instances (23.3% of the total dataset) while the category with least number of samples is Hepatitis (1.7% of the total dataset). We follow the approach of Feldman et al. (2019) in mapping the MeSH vocabulary to the disease classes.",
+ "cite_spans": [
+ {
+ "start": 303,
+ "end": 319,
+ "text": "(Lipscomb, 2000)",
+ "ref_id": "BIBREF30"
+ },
+ {
+ "start": 322,
+ "end": 323,
+ "text": "7",
+ "ref_id": null
+ },
+ {
+ "start": 766,
+ "end": 787,
+ "text": "Feldman et al. (2019)",
+ "ref_id": "BIBREF13"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Document Classification",
+ "sec_num": "3.1"
+ },
+ {
+ "text": "Paper Topic Classification This task is predicting the topic associated with a paper using the predefined topic categories of the Microsoft Academic Graph (MAG) (Sinha et al., 2015) 8 . MAG provides a database of papers, each tagged with a list of topics. The topics are organized in a hierarchy of 5 levels, where level 1 is the most general and level 5 is the most specific. For our evaluation, we derive a document classification dataset from the level 1 topics, where a paper is labeled by its corresponding level 1 MAG topic. We construct a dataset of 25K papers, almost evenly split over the 19 different classes of level 1 categories in MAG.",
+ "cite_spans": [
+ {
+ "start": 161,
+ "end": 181,
+ "text": "(Sinha et al., 2015)",
+ "ref_id": "BIBREF45"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Document Classification",
+ "sec_num": "3.1"
+ },
+ {
+ "text": "As argued above, citations are a key signal of relatedness between papers. We test how well different paper representations can reproduce this signal through citation prediction tasks. In particular, we focus on two sub-tasks: predicting direct citations, and predicting co-citations. We frame these as ranking tasks and evaluate performance using MAP and nDCG, standard ranking metrics. Direct Citations In this task, the model is asked to predict which papers are cited by a given query paper from a given set of candidate papers. The evaluation dataset includes approximately 30K total papers from a held-out pool of papers, consisting of 1K query papers and a candidate set of up to 5 cited papers and 25 (randomly selected) uncited papers. The task is to rank the cited papers higher than the uncited papers. For each embedding method, we require only comparing the L2 distance between the raw embeddings of the query and the candidates, without any additional trainable parameters.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Citation Prediction",
+ "sec_num": "3.2"
+ },
+ {
+ "text": "Co-Citations This task is similar to the direct citations but instead of predicting a cited paper, the goal is to predict a highly co-cited paper with a given paper. Intuitively, if papers A and B are cited frequently together by several papers, this shows that the papers are likely highly related and a good paper representation model should be able to identify these papers from a given candidate set. The dataset consists of 30K total papers and is constructed similar to the direct citations task.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Citation Prediction",
+ "sec_num": "3.2"
+ },
+ {
+ "text": "The embeddings for similar papers should be close to each other; we use user activity as a proxy for identifying similar papers and test the model's ability to recover this information. Multiple users consuming the same items as one another is a classic relatedness signal and forms the foundation for recommender systems and other applications (Schafer et al., 2007) . In our case, we would expect that when users look for academic papers, the papers they view in a single browsing session tend to be related. Thus, accurate paper embeddings should, all else being equal, be relatively more similar for papers that are frequently viewed in the same session than for other papers. To build benchmark datasets to test embeddings on user activity, we obtained logs of user sessions from a major academic search engine. We define the following two tasks on which we build benchmark datasets to test embeddings:",
+ "cite_spans": [
+ {
+ "start": 345,
+ "end": 367,
+ "text": "(Schafer et al., 2007)",
+ "ref_id": "BIBREF42"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "User Activity",
+ "sec_num": "3.3"
+ },
+ {
+ "text": "Co-Views Our co-views dataset consists of approximately 30K papers. To construct it, we take 1K random papers that are not in our train or development set and associate with each one up to 5 frequently co-viewed papers and 25 randomly selected papers (similar to the approach for citations). Then, we require the embedding model to rank the co-viewed papers higher than the random papers by comparing the L2 distances of raw embeddings. We evaluate performance using standard ranking metrics, nDCG and MAP.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "User Activity",
+ "sec_num": "3.3"
+ },
+ {
+ "text": "Co-Reads If the user clicks to access the PDF of a paper from the paper description page, this is a potentially stronger sign of interest in the paper. In such a case we assume the user will read at least parts of the paper and refer to this as a \"read\" action. Accordingly, we define a \"co-reads\" task and dataset analogous to the co-views dataset described above. This dataset is also approximately 30K papers.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "User Activity",
+ "sec_num": "3.3"
+ },
+ {
+ "text": "In the recommendation task, we evaluate the ability of paper embeddings to boost performance in a production recommendation system. Our recommendation task aims to help users navigate the scientific literature by ranking a set of \"similar papers\" for a given paper. We use a dataset of user clickthrough data for this task which consists of 22K clickthrough events from a public scholarly search engine. We partitioned the examples temporally into train (20K examples), validation (1K), and test (1K) sets. As is typical in clickthrough data on ranked lists, the clicks are biased toward the top of the original ranking presented to the user. To counteract this effect, we computed propensity scores using a swap experiment (Agarwal et al., 2019). The propensity scores give, for each position in the ranked list, the relative frequency that the position is over-represented in the data due to exposure bias. We can then compute de-biased evaluation metrics by dividing the score for each test example by the propensity score for the clicked position. We report propensity-adjusted versions of the standard ranking metrics Precision@1 (P@1) and Normalized Discounted Cumulative Gain (nDCG).",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Recommendation",
+ "sec_num": "3.4"
+ },
+ {
+ "text": "We test different embeddings on the recommendation task by including cosine embedding distance 9 as a feature within an existing recommendation system that includes several other informative features (title/author similarity, reference and citation overlap, etc.). Thus, the recommendation experiments measure whether the embeddings can boost the performance of a strong baseline system on an end task. For SPECTER, we also perform an online A/B test to measure whether its advantages on the offline dataset translate into improvements on the online recommendation task (\u00a75).",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Recommendation",
+ "sec_num": "3.4"
+ },
+ {
+ "text": "Training Data To train our model, we use a subset of the Semantic Scholar corpus consisting of about 146K query papers (around 26.7M tokens) with their corresponding outgoing citations, and we use an additional 32K papers for validation. For each query paper we construct up to 5 training triples comprised of a query, a positive, and a negative paper. The positive papers are sampled from the direct citations of the query, while negative papers are chosen either randomly or from citations of citations (as discussed in \u00a72.4). We empirically found it helpful to use 2 hard negatives (citations of citations) and 3 easy negatives (randomly selected papers) for each query paper. This process results in about 684K training triples and 145K validation triples.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Experiments",
+ "sec_num": "4"
+ },
+ {
+ "text": "Training and Implementation We implement our model in AllenNLP . We initialize the model from SciBERT pretrained weights (Beltagy et al., 2019) since it is the state-of-the-art pretrained language model on scientific text. We continue training all model parameters on our training objective (Equation 2). We perform minimal tuning of our model's hyperparameters based on the performance on the validation set, while baselines are extensively tuned. Based on initial experiments, we use a margin m=1 for the triplet loss. For training, we use the Adam optimizer (Kingma and Ba, 2014) following the suggested hyperparameters in Devlin et al. (2019) (LR: 2e-5, Slanted Triangular LR scheduler 10 (Howard and Ruder, 2018) with number of train steps equal to training instances and cut fraction of 0.1). We train the model on a single Titan V GPU (12G memory) for 2 epochs, with batch size of 4 (the maximum that fit in our GPU memory) and use gradient accumulation for an effective batch size of 32. Each training epoch takes approximately 1-2 days to complete on the full dataset. We release our code and data to facilitate reproducibility. 11",
+ "cite_spans": [
+ {
+ "start": 121,
+ "end": 143,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Experiments",
+ "sec_num": "4"
+ },
+ {
+ "text": "Task-Specific Model Details For the classification tasks, we used a linear SVM where embedding vectors were the only features. The C hyperparameter was tuned via a held-out validation set.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Experiments",
+ "sec_num": "4"
+ },
+ {
+ "text": "For the recommendation tasks, we use a feedforward ranking neural network that takes as input ten features designed to capture the similarity between each query and candidate paper, including the cosine similarity between the query and candidate embeddings and manually-designed features computed from the papers' citations, titles, authors, and publication dates.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Experiments",
+ "sec_num": "4"
+ },
+ {
+ "text": "Baseline Methods Our work falls into the intersection of textual representation, citation mining, and graph learning, and we evaluate against stateof-the-art baselines from each of these areas. We compare with several strong textual models: SIF (Arora et al., 2017) , a method for learning document representations by removing the first principal component of aggregated word-level embeddings which we pretrain on scientific text; SciBERT (Beltagy et al., 2019) a state-of-the-art pretrained Transformer LM for scientific text; and Sent-BERT (Reimers and Gurevych, 2019) , a model that uses negative sampling to tune BERT for producing optimal sentence embeddings. We also compare with Citeomatic , a closely related paper representation model for citation prediction which trains content-based representations with citation graph information via dynamically sampled triplets, and SGC (Wu et al., 2019a) , a state-of-the-art graph-convolutional approach. For completeness, additional baselines are also included; due to space constraints we refer to Appendix A for detailed discussion of all baselines. We tune hyperparameters of baselines to maximize performance on a separate validation set. Table 1 presents the main results corresponding to our evaluation tasks (described in \u00a73). Overall, we observe substantial improvements across all tasks with average performance of 80.0 across all metrics on all tasks which is a 3.1 point absolute improvement over the next-best baseline. We now discuss the results in detail.",
+ "cite_spans": [
+ {
+ "start": 245,
+ "end": 265,
+ "text": "(Arora et al., 2017)",
+ "ref_id": "BIBREF2"
+ },
+ {
+ "start": 439,
+ "end": 461,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ },
+ {
+ "start": 542,
+ "end": 570,
+ "text": "(Reimers and Gurevych, 2019)",
+ "ref_id": "BIBREF40"
+ },
+ {
+ "start": 885,
+ "end": 903,
+ "text": "(Wu et al., 2019a)",
+ "ref_id": "BIBREF50"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 1194,
+ "end": 1201,
+ "text": "Table 1",
+ "ref_id": "TABREF1"
+ }
+ ],
+ "eq_spans": [],
+ "section": "Experiments",
+ "sec_num": "4"
+ },
+ {
+ "text": "For document classification, we report macro F1, a standard classification metric. We observe that the classifier performance when trained on our representations is better than when trained on any other baseline. Particularly, on the MeSH (MAG) dataset, we obtain an 86.4 (82.0) F1 score which is about a \u2206= + 2.3 (+1.5) point absolute increase over the best baseline on each dataset respectively. Our evaluation of the learned representations on predicting user activity is shown in the \"User activity\" columns of Table 1 . SPECTER achieves a MAP score of 83.8 on the co-view task, and 84.5 on coread, improving over the best baseline (Citeomatic in this case) by 2.7 and 4.0 points, respectively. We observe similar trends for the \"citation\" and \"co-citation\" tasks, with our model outperforming virtually all other baselines except for SGC, which has access to the citation graph at training and test time. 12 Note that methods like SGC cannot be used in real-world setting to embed new papers that are not cited yet. On the other hand, on cocitation data our method is able to achieve the best results with nDCG of 94.8, improving over SGC with 2.3 points. Citeomatic also performs well on the citation tasks, as expected given that its primary design goal was citation prediction. Nevertheless, our method slightly outperforms Citeomatic on the direct citation task, while substantially outperforming it on co-citations (+2.0 nDCG). Finally, for recommendation task, we observe that SPECTER outperforms all other models on this task as well, with nDCG of 53.9. On the recommendations task, as opposed to previous experiments, the differences in method scores are generally smaller. This is because for this task the embeddings are used along with several other informative features in the ranking model (described under task-specific models in \u00a74), meaning that embedding variants have less opportunity for impact on overall performance.",
+ "cite_spans": [],
+ "ref_spans": [
+ {
+ "start": 515,
+ "end": 522,
+ "text": "Table 1",
+ "ref_id": "TABREF1"
+ }
+ ],
+ "eq_spans": [],
+ "section": "Results",
+ "sec_num": "5"
+ },
+ {
+ "text": "We also performed an online study to evaluate whether SPECTER embeddings offer similar advantages in a live application. We performed an online A/B test comparing our SPECTER-based recommender to an existing production recommender system for similar papers that ranks papers by a textual similarity measure. In a dataset of 4,113 clicks, we found that SPECTER ranker improved clickthrough rate over the baseline by 46.5%, demonstrating its superiority.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Results",
+ "sec_num": "5"
+ },
+ {
+ "text": "We emphasize that our citation-based pretraining objective is critical for the performance of SPECTER; removing this and using a vanilla SciBERT results in decreased performance on all tasks.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Results",
+ "sec_num": "5"
+ },
+ {
+ "text": "In this section, we analyze several design decisions in SPECTER, provide a visualization of its embedding space, and experimentally compare SPECTER's use of fixed embeddings against a finetuning approach.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Analysis",
+ "sec_num": "6"
+ },
+ {
+ "text": "Ablation Study We start by analyzing how adding or removing metadata fields from the input to SPECTER alters performance. The results are shown in the top four rows of Table 2 (for brevity, here we only report the average of the metrics from each task). We observe that removing the abstract from the textual input and relying only on the title results in a substantial decrease in performance. More surprisingly, adding authors as an input (along with title and abstract) hurts performance. 13 One possible explanation is that author names are sparse in the corpus, making it difficult for the model to infer document-level relatedness from them. As another possible reason of this behavior, tokenization using Wordpieces might be suboptimal for author names. Many author names are out-of-vocabulary for SciBERT and thus, they might be split into sub-words and shared across names that are not semantically related, leading to noisy correlation. Finally, we find that adding venues slightly decreases performance, 14 except on document classification (which makes sense, as we would expect venues to have high correlation 13 We experimented with both concatenating authors with the title and abstract and also considering them as an additional field. Neither were helpful.",
+ "cite_spans": [
+ {
+ "start": 492,
+ "end": 494,
+ "text": "13",
+ "ref_id": null
+ },
+ {
+ "start": 1123,
+ "end": 1125,
+ "text": "13",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 168,
+ "end": 175,
+ "text": "Table 2",
+ "ref_id": "TABREF3"
+ }
+ ],
+ "eq_spans": [],
+ "section": "Analysis",
+ "sec_num": "6"
+ },
+ {
+ "text": "14 Venue information in our data came directly from publisher provided metadata and thus was not normalized. with paper topics). The fact that SPECTER does not require inputs like authors or venues makes it applicable in situations where this metadata is not available, such as matching reviewers with anonymized submissions, or performing recommendations of anonymized preprints (e.g., on OpenReview). One design decision in SPECTER is to use a set of hard negative distractors in the citation-based finetuning objective. The fifth row of Table 2 shows that this is important\u2014using only easy negatives reduces performance on all tasks. While there could be other potential ways to include hard negatives in the model, our simple approach of including citations of citations is effective. The sixth row of the table shows that using a strong general-domain language model (BERT-Large) instead of SciBERT in SPECTER reduces performance considerably. This is reasonable because unlike BERT-Large, SciBERT is pretrained on scientific text.",
+ "cite_spans": [],
+ "ref_spans": [
+ {
+ "start": 540,
+ "end": 547,
+ "text": "Table 2",
+ "ref_id": "TABREF3"
+ }
+ ],
+ "eq_spans": [],
+ "section": "Analysis",
+ "sec_num": "6"
+ },
+ {
+ "text": "Visualization Figure 2 shows t-SNE (van der Maaten, 2014) projections of our embeddings (SPECTER) compared with the SciBERT baseline for a random set of papers. When comparing SPECTER embeddings with SciBERT, we observe that our embeddings are better at encoding topical information, as the clusters seem to be more compact. Further, we see some examples of crosstopic relatedness reflected in the embedding space (e.g., Engineering, Mathematics and Computer Science are close to each other, while Business and Economics are also close to each other). To quantify the comparison of visualized embeddings in Figure 2 , we use the DBScan clustering algorithm (Ester et al., 1996) on this 2D projection. We use the completeness and homogeneity clustering quality measures introduced by Rosenberg and Hirschberg (2007) . For the points corresponding to Figure 2 , the homogeneity and completeness values for SPECTER are respectively 0.41 and 0.72 compared with SciBERT's 0.19 and 0.63, a clear improvement on separating topics using the projected embeddings.",
+ "cite_spans": [
+ {
+ "start": 657,
+ "end": 677,
+ "text": "(Ester et al., 1996)",
+ "ref_id": "BIBREF12"
+ },
+ {
+ "start": 783,
+ "end": 814,
+ "text": "Rosenberg and Hirschberg (2007)",
+ "ref_id": "BIBREF41"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 14,
+ "end": 22,
+ "text": "Figure 2",
+ "ref_id": null
+ },
+ {
+ "start": 607,
+ "end": 615,
+ "text": "Figure 2",
+ "ref_id": null
+ },
+ {
+ "start": 849,
+ "end": 857,
+ "text": "Figure 2",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Analysis",
+ "sec_num": "6"
+ },
+ {
+ "text": "Comparison with Task Specific Fine-Tuning While the fact that SPECTER does not require finetuning makes its paper embeddings less costly to use, often the best performance from pretrained Transformers is obtained when the models are finetuned directly on each end task. We experiment with fine-tuning SciBERT on our tasks, and find this to be generally inferior to using our fixed representations from SPECTER. Specifically, we finetune SciBERT directly on task-specific signals instead of citations. To fine-tune on task-specific data (e.g., user activity), we used a dataset of coviews with 65K query papers, co-reads with 14K query papers, and co-citations (instead of direct citations) with 83K query papers. As the end tasks are ranking tasks, for all datasets we construct up to 5 triplets and fine-tune the model using triplet ranking loss. The positive papers are sampled from the most co-viewed (co-read, or co-cited) papers corresponding to the query paper. We also include both easy and hard distractors as when training SPECTER (for hard negatives we choose the least non-zero co-viewed (co-read, or co-cited) papers). We also consider training jointly on all task-specific training data sources in a multitask training process, where the model samples training triplets from a distribution over the sources. As illustrated in Table 3, without any additional final task-specific fine-tuning, SPECTER still outperforms a SciBERT model fine-tuned on the end tasks as well as their multitask combination, further demonstrating the effectiveness and versatility of SPECTER embeddings. 15",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Analysis",
+ "sec_num": "6"
+ },
+ {
+ "text": "Recent representation learning methods in NLP rely on training large neural language models on unsupervised data Radford et al., 2018; Devlin et al., 2019; Beltagy et al., 2019; Liu et al., 2019) . While successful at many sentenceand token-level tasks, our focus is on using the models for document-level representation learning, which has remained relatively under-explored.",
+ "cite_spans": [
+ {
+ "start": 113,
+ "end": 134,
+ "text": "Radford et al., 2018;",
+ "ref_id": "BIBREF38"
+ },
+ {
+ "start": 135,
+ "end": 155,
+ "text": "Devlin et al., 2019;",
+ "ref_id": "BIBREF11"
+ },
+ {
+ "start": 156,
+ "end": 177,
+ "text": "Beltagy et al., 2019;",
+ "ref_id": "BIBREF3"
+ },
+ {
+ "start": 178,
+ "end": 195,
+ "text": "Liu et al., 2019)",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Related Work",
+ "sec_num": "7"
+ },
+ {
+ "text": "There have been other efforts in document representation learning such as extensions of word vectors to documents (Le and Mikolov, 2014; Ganesh et al., 2016; Wu et al., 2018; Gysel et al., 2017) , convolution-based methods Zamani et al., 2018) , and variational autoencoders (Holmer and Marfurt, 2018; . Relevant to document embedding, sentence embedding is a relatively well-studied area of research. Successful approaches include seq2seq models (Kiros et al., 2015) , BiLSTM Siamese networks (Williams et al., 2018) , leveraging supervised data from other corpora (Conneau et al., 2017) , and using discourse relations (Nie et al., 2019) , and BERT-based methods (Reimers and Gurevych, 2019) . Unlike our proposed method, the majority of these approaches do not consider any notion of inter-document relatedness when embedding documents.",
+ "cite_spans": [
+ {
+ "start": 114,
+ "end": 136,
+ "text": "(Le and Mikolov, 2014;",
+ "ref_id": "BIBREF28"
+ },
+ {
+ "start": 137,
+ "end": 157,
+ "text": "Ganesh et al., 2016;",
+ "ref_id": "BIBREF14"
+ },
+ {
+ "start": 158,
+ "end": 174,
+ "text": "Wu et al., 2018;",
+ "ref_id": "BIBREF51"
+ },
+ {
+ "start": 175,
+ "end": 194,
+ "text": "Gysel et al., 2017)",
+ "ref_id": "BIBREF16"
+ },
+ {
+ "start": 223,
+ "end": 243,
+ "text": "Zamani et al., 2018)",
+ "ref_id": "BIBREF55"
+ },
+ {
+ "start": 275,
+ "end": 301,
+ "text": "(Holmer and Marfurt, 2018;",
+ "ref_id": "BIBREF19"
+ },
+ {
+ "start": 447,
+ "end": 467,
+ "text": "(Kiros et al., 2015)",
+ "ref_id": null
+ },
+ {
+ "start": 494,
+ "end": 517,
+ "text": "(Williams et al., 2018)",
+ "ref_id": "BIBREF49"
+ },
+ {
+ "start": 566,
+ "end": 588,
+ "text": "(Conneau et al., 2017)",
+ "ref_id": "BIBREF10"
+ },
+ {
+ "start": 621,
+ "end": 639,
+ "text": "(Nie et al., 2019)",
+ "ref_id": "BIBREF35"
+ },
+ {
+ "start": 665,
+ "end": 693,
+ "text": "(Reimers and Gurevych, 2019)",
+ "ref_id": "BIBREF40"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Related Work",
+ "sec_num": "7"
+ },
+ {
+ "text": "Other relevant work combines textual features with network structure (Tu et al., 2017; . These works typically do not leverage the recent pretrained contextual representations and with a few exceptions such as the recent work by , they cannot generalize to unseen documents like our SPECTER approach. Context-based citation recommendation is another related application where models rely on citation contexts (Jeong et al., 2019) to make predictions. These works are orthogonal to ours as the input to our model is just paper title and abstract. Another related line of work is graphbased representation learning methods (Bruna et al., 2014; Kipf and Welling, 2017; Hamilton et al., 2017a,b; Wu et al., 2019a,b) . Here, we compare to a graph representation learning model, SGC (Simple Graph Convolution) (Wu et al., 2019a) , which is a state-of-the-art graph convolution approach for representation learning. SPECTER uses pretrained language models in combination with graph-based citation signals, which enables it to outperform the graph-based approaches in our experiments.",
+ "cite_spans": [
+ {
+ "start": 69,
+ "end": 86,
+ "text": "(Tu et al., 2017;",
+ "ref_id": "BIBREF46"
+ },
+ {
+ "start": 409,
+ "end": 429,
+ "text": "(Jeong et al., 2019)",
+ "ref_id": "BIBREF21"
+ },
+ {
+ "start": 621,
+ "end": 641,
+ "text": "(Bruna et al., 2014;",
+ "ref_id": null
+ },
+ {
+ "start": 642,
+ "end": 665,
+ "text": "Kipf and Welling, 2017;",
+ "ref_id": "BIBREF24"
+ },
+ {
+ "start": 666,
+ "end": 691,
+ "text": "Hamilton et al., 2017a,b;",
+ "ref_id": null
+ },
+ {
+ "start": 692,
+ "end": 711,
+ "text": "Wu et al., 2019a,b)",
+ "ref_id": null
+ },
+ {
+ "start": 804,
+ "end": 822,
+ "text": "(Wu et al., 2019a)",
+ "ref_id": "BIBREF50"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Related Work",
+ "sec_num": "7"
+ },
+ {
+ "text": "SPECTER embeddings are based on only the title and abstract of the paper. Adding the full text of the paper would provide a more complete picture of the paper's content and could improve accuracy (Cohen et al., 2010; Lin, 2008; Schuemie et al., 2004) . However, the full text of many academic papers is not freely available. Further, modern language models have strict memory limits on input size, which means new techniques would be required in order to leverage the entirety of the paper within the models. Exploring how to use the full paper text within SPECTER is an item of future work.",
+ "cite_spans": [
+ {
+ "start": 196,
+ "end": 216,
+ "text": "(Cohen et al., 2010;",
+ "ref_id": "BIBREF9"
+ },
+ {
+ "start": 217,
+ "end": 227,
+ "text": "Lin, 2008;",
+ "ref_id": "BIBREF29"
+ },
+ {
+ "start": 228,
+ "end": 250,
+ "text": "Schuemie et al., 2004)",
+ "ref_id": "BIBREF43"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Related Work",
+ "sec_num": "7"
+ },
+ {
+ "text": "Finally, one pain point in academic paper recommendation research has been a lack of publicly available datasets (Chen and Lee, 2018; Kanakia et al., 2019) . To address this challenge, we release SCIDOCS, our evaluation benchmark which includes an anonymized clickthrough dataset from an online recommendations system.",
+ "cite_spans": [
+ {
+ "start": 113,
+ "end": 133,
+ "text": "(Chen and Lee, 2018;",
+ "ref_id": "BIBREF8"
+ },
+ {
+ "start": 134,
+ "end": 155,
+ "text": "Kanakia et al., 2019)",
+ "ref_id": "BIBREF22"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Related Work",
+ "sec_num": "7"
+ },
+ {
+ "text": "We present SPECTER, a model for learning representations of scientific papers, based on a Transformer language model that is pretrained on citations. We achieve substantial improvements over the strongest of a wide variety of baselines, demonstrating the effectiveness of our model. We additionally introduce SCIDOCS, a new evaluation suite consisting of seven document-level tasks and release the corresponding datasets to foster further research in this area.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusions and Future Work",
+ "sec_num": "8"
+ },
+ {
+ "text": "The landscape of Transformer language models is rapidly changing and newer and larger models are frequently introduced. It would be interesting to initialize our model weights from more recent Transformer models to investigate if additional gains are possible. Another item of future work is to develop better multitask approaches to leverage multiple signals of relatedness information during training. We used citations to build triplets for our loss function, however there are other metrics that have good support from the bibliometrics literature (Klavans and Boyack, 2006) that warrant exploring as a way to create relatedness graphs. Including other information such as outgoing citations as additional input to the model would be yet another area to explore in future.",
+ "cite_spans": [
+ {
+ "start": 552,
+ "end": 578,
+ "text": "(Klavans and Boyack, 2006)",
+ "ref_id": "BIBREF26"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusions and Future Work",
+ "sec_num": "8"
+ },
+ {
+ "text": "Appendix A - Baseline Details 1. Random Zero-mean 25-dimensional vectors were used as representations for each document.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusions and Future Work",
+ "sec_num": "8"
+ },
+ {
+ "text": "2. Doc2Vec Doc2Vec is one of the earlier neural document/paragraph representation methods (Le and Mikolov, 2014) , and is a natural comparison. We trained Doc2Vec on our training subset using Gensim (\u0158eh\u016f\u0159ek and Sojka, 2010) , and chose the hyperparameter grid using suggestions from Lau and Baldwin (2016). The hyperparameter grid used:",
+ "cite_spans": [
+ {
+ "start": 90,
+ "end": 112,
+ "text": "(Le and Mikolov, 2014)",
+ "ref_id": "BIBREF28"
+ },
+ {
+ "start": 199,
+ "end": 224,
+ "text": "(\u0158eh\u016f\u0159ek and Sojka, 2010)",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusions and Future Work",
+ "sec_num": "8"
+ },
+ {
+ "text": "{'window': [5, 10, 15] , 'sample': [0, 10 ** -6, 10 ** -5], 'epochs': [50, 100, 200 ]}, for a total of 27 models. The other parameters were set as follows: vector_size=300, min_count=3, alpha=0.025, min_alpha=0.0001, negative=5, dm=0, dbow=1, dbow_words=0. 3. Fasttext-Sum This simple baseline is a weighted sum of pretrained word vectors. We trained our own 300 dimensional fasttext embeddings (Bojanowski et al., 2017) on a corpus of around 3.1B tokens from scientific papers which is similar in size to the SciBERT corpus (Beltagy et al., 2019) . We found that these pretrained embeddings substantially outperform alternative off-theshelf embeddings. We also use these embeddings in other baselines that require pretrained word vectors (i.e., SIF and SGC that are described below). The summed bag of words representation has a number of weighting options, which are extensively tuned on a validation set for best performance. 4. SIF The SIF method of Arora et al. (2017) is a strong text representation baseline that takes a weighted sum of pretrained word vectors (we use fasttext embeddings described above), then computes the first principal component of the document embedding matrix and subtracts out each document embedding's projection to the first principal component.",
+ "cite_spans": [
+ {
+ "start": 11,
+ "end": 14,
+ "text": "[5,",
+ "ref_id": null
+ },
+ {
+ "start": 15,
+ "end": 18,
+ "text": "10,",
+ "ref_id": null
+ },
+ {
+ "start": 19,
+ "end": 22,
+ "text": "15]",
+ "ref_id": null
+ },
+ {
+ "start": 70,
+ "end": 74,
+ "text": "[50,",
+ "ref_id": null
+ },
+ {
+ "start": 75,
+ "end": 79,
+ "text": "100,",
+ "ref_id": null
+ },
+ {
+ "start": 80,
+ "end": 83,
+ "text": "200",
+ "ref_id": null
+ },
+ {
+ "start": 395,
+ "end": 420,
+ "text": "(Bojanowski et al., 2017)",
+ "ref_id": "BIBREF5"
+ },
+ {
+ "start": 525,
+ "end": 547,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ },
+ {
+ "start": 954,
+ "end": 973,
+ "text": "Arora et al. (2017)",
+ "ref_id": "BIBREF2"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusions and Future Work",
+ "sec_num": "8"
+ },
+ {
+ "text": "We used a held-out validation set to choose a from the range [1.0e-5, 1.0e-3] spaced evenly on a log scale. The word probability p(w) was estimated on the training set only. When computing term-frequency values for SIF, we used scikit-learn's TfidfVectorizer with the same parameters as enumerated in the preceding section. sublinear_tf, binary, use_idf, smooth_idf were all set to False. Since SIF is a sum of pretrained fasttext vectors, the resulting dimensionality is 300. provides contextualized representations of tokens in a document. It can provide paragraph or document embeddings by averaging each token's representation for all 3 LSTM layers. We used the 768-dimensional pretrained ELMo model in AllenNLP .",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusions and Future Work",
+ "sec_num": "8"
+ },
+ {
+ "text": "6. Citeomatic The most relevant baseline is Citeomatic , which is an academic paper representation model that is trained on the citation graph via sampled triplets. Citeomatic representations are an L2 normalized weighted sum of title and abstract embeddings, which are trained on the citation graph with dynamic negative sampling. Citeomatic embeddings are 75-dimensional. 7. SGC Since our algorithm is trained on data from the citation graph, we also compare to a state-ofthe-art graph representation learning model: SGC (Simple Graph Convolution) (Wu et al., 2019a) , which is a graph convolution network. An alternative comparison would have been Graph-SAGE (Hamilton et al., 2017b) , but SGC (with no learning) outperformed an unsupervised variant of GraphSAGE on the Reddit dataset 16 , Note that SGC with no learning boils down to graph propagation on node features (in our case nodes are academic documents). Following Hamilton et al. (2017a), we used SIF features as node representations, and applied SGC with a range of parameter k, which is the number of times the normalized adjacency is multiplied by the SIF feature matrix. Our range of k was 1 through 8 (inclusive), and was chosen with a validation set. For the node features, we chose the SIF model with a = 0.0001, as this model was observed to be a high-performing one. This baseline is also 300 dimensional.",
+ "cite_spans": [
+ {
+ "start": 550,
+ "end": 568,
+ "text": "(Wu et al., 2019a)",
+ "ref_id": "BIBREF50"
+ },
+ {
+ "start": 662,
+ "end": 686,
+ "text": "(Hamilton et al., 2017b)",
+ "ref_id": "BIBREF18"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "ELMo ELMo",
+ "sec_num": "5."
+ },
+ {
+ "text": "8. SciBERT To isolate the advantage of SPECTER's citation-based fine-tuning objective, we add a controlled comparison with SciBERT (Beltagy et al., 2019) . Following Devlin et al. (2019) we take the last layer hidden state corresponding to the [CLS] token as the aggregate document representation. 17 9. Sentence BERT Sentence BERT (Reimers and Gurevych, 2019 ) is a general-domain pretrained model aimed at embedding sentences. The authors fine-tuned BERT using a triplet loss, where positive sentences were from the same document section as the seed sentence, and distractor sentences came from other document sections. The model is designed to encode sentences as opposed to paragraphs, so we embed the title and each sentence in the abstract separately, sum the embeddings, and L2 normalize the result to produce a final 768-dimensional paper embedding. 18 During hyperparameter optimization we chose how to compute TF and IDF values weights by taking the following non-redundant combinations of scikit-learn's TfidfVectorizer (Pedregosa et al., 2011) parameters: sublinear_tf, binary, use_idf, smooth_idf. There were a total of 9 parameter combinations. The IDF values were estimated on the training set. The other parameters were set as follows: min_df=3, max_df=0.75, strip_accents='ascii', stop_words='english', norm=None, lowercase=True. For training of fasttext, we used all default parameters with the exception of setting dimension to 300 and minCount was set to 25 due to the large corpus.",
+ "cite_spans": [
+ {
+ "start": 131,
+ "end": 153,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ },
+ {
+ "start": 166,
+ "end": 186,
+ "text": "Devlin et al. (2019)",
+ "ref_id": "BIBREF11"
+ },
+ {
+ "start": 332,
+ "end": 359,
+ "text": "(Reimers and Gurevych, 2019",
+ "ref_id": "BIBREF40"
+ },
+ {
+ "start": 858,
+ "end": 860,
+ "text": "18",
+ "ref_id": null
+ },
+ {
+ "start": 1031,
+ "end": 1055,
+ "text": "(Pedregosa et al., 2011)",
+ "ref_id": "BIBREF36"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "ELMo ELMo",
+ "sec_num": "5."
+ },
+ {
+ "text": "SPECTER: Scientific Paper Embeddings using Citationinformed TransformERs",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "We also experimented with additional fields such as venues and authors but did not find any empirical advantage in using those (see \u00a76). See \u00a77 for a discussion of using the full text of the paper as input.5 It is also possible to encode title and abstracts individually and then concatenate or combine them to get the final embedding. However, in our experiments this resulted in sub-optimal performance.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "We also experimented with other distance functions (e..g, normalized cosine), but they underperformed the L2 loss.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "https://www.nlm.nih.gov/mesh/meshhome. html 8 https://academic.microsoft.com/",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "Embeddings are L2 normalized and in this case cosine distance is equivalent to L2 distance.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "Learning rate linear warmup followed by linear decay. 11 https://github.com/allenai/specter",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "For SGC, we remove development and test set citations and co-citations during training. We also remove incoming citations from development and test set queries as these would not be available at test time in production.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "We also experimented with further task-specific finetuning of our SPECTER on the end tasks but we did not observe additional improvements.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "There were no other direct comparisons inWu et al. (2019a) 17 We also tried the alternative of averaging all token representations, but this resulted in a slight performance decrease compared with the [CLS] pooled token.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "We used the 'bert-base-wikipedia-sections-mean-tokens' model released by the authors: https://github.com/ UKPLab/sentence-transformers",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ }
+ ],
+ "back_matter": [
+ {
+ "text": "We thank Kyle Lo, Daniel King and Oren Etzioni for helpful research discussions, Russel Reas for setting up the public API, Field Cady for help in initial data collection and the anonymous reviewers (especially Reviewer 1) for comments and suggestions. This work was supported in part by NSF Convergence Accelerator award 1936940, ONR grant N00014-18-1-2193, and the University of Washington WRF/Cable Professorship.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Acknowledgements",
+ "sec_num": null
+ }
+ ],
+ "bib_entries": {
+ "BIBREF0": {
+ "ref_id": "b0",
+ "title": "Estimating position bias without intrusive interventions",
+ "authors": [
+ {
+ "first": "K",
+ "middle": [],
+ "last": "Anant",
+ "suffix": ""
+ },
+ {
+ "first": "Ivan",
+ "middle": [],
+ "last": "Agarwal",
+ "suffix": ""
+ },
+ {
+ "first": "Xuanhui",
+ "middle": [],
+ "last": "Zaitsev",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Yen",
+ "middle": [],
+ "last": "Cheng",
+ "suffix": ""
+ },
+ {
+ "first": "Marc",
+ "middle": [],
+ "last": "Li",
+ "suffix": ""
+ },
+ {
+ "first": "Thorsten",
+ "middle": [],
+ "last": "Najork",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Joachims",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "WSDM",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Anant K. Agarwal, Ivan Zaitsev, Xuanhui Wang, Cheng Yen Li, Marc Najork, and Thorsten Joachims. 2019. Estimating position bias without intrusive in- terventions. In WSDM.",
+ "links": null
+ },
+ "BIBREF1": {
+ "ref_id": "b1",
+ "title": "Construction of the literature graph in semantic scholar",
+ "authors": [
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ },
+ {
+ "first": "Dirk",
+ "middle": [],
+ "last": "Groeneveld",
+ "suffix": ""
+ },
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": ""
+ },
+ {
+ "first": "Iz",
+ "middle": [],
+ "last": "Beltagy",
+ "suffix": ""
+ },
+ {
+ "first": "Miles",
+ "middle": [],
+ "last": "Crawford",
+ "suffix": ""
+ },
+ {
+ "first": "Doug",
+ "middle": [],
+ "last": "Downey",
+ "suffix": ""
+ },
+ {
+ "first": "Jason",
+ "middle": [],
+ "last": "Dunkelberger",
+ "suffix": ""
+ },
+ {
+ "first": "Ahmed",
+ "middle": [],
+ "last": "Elgohary",
+ "suffix": ""
+ },
+ {
+ "first": "Sergey",
+ "middle": [],
+ "last": "Feldman",
+ "suffix": ""
+ },
+ {
+ "first": "Vu",
+ "middle": [],
+ "last": "Ha",
+ "suffix": ""
+ },
+ {
+ "first": "Rodney",
+ "middle": [],
+ "last": "Kinney",
+ "suffix": ""
+ },
+ {
+ "first": "Sebastian",
+ "middle": [],
+ "last": "Kohlmeier",
+ "suffix": ""
+ },
+ {
+ "first": "Kyle",
+ "middle": [],
+ "last": "Lo",
+ "suffix": ""
+ },
+ {
+ "first": "Tyler",
+ "middle": [
+ "C"
+ ],
+ "last": "Murray",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Hsu-Han",
+ "suffix": ""
+ },
+ {
+ "first": "Matthew",
+ "middle": [
+ "E"
+ ],
+ "last": "Ooi",
+ "suffix": ""
+ },
+ {
+ "first": "Joanna",
+ "middle": [],
+ "last": "Peters",
+ "suffix": ""
+ },
+ {
+ "first": "Sam",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ },
+ {
+ "first": "Lucy",
+ "middle": [
+ "Lu"
+ ],
+ "last": "Skjonsberg",
+ "suffix": ""
+ },
+ {
+ "first": "Christopher",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Zheng",
+ "middle": [],
+ "last": "Wilhelm",
+ "suffix": ""
+ },
+ {
+ "first": "Madeleine",
+ "middle": [],
+ "last": "Yuan",
+ "suffix": ""
+ },
+ {
+ "first": "Oren",
+ "middle": [],
+ "last": "Van Zuylen",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Etzioni",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "NAACL-HLT",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Waleed Ammar, Dirk Groeneveld, Chandra Bha- gavatula, Iz Beltagy, Miles Crawford, Doug Downey, Jason Dunkelberger, Ahmed Elgohary, Sergey Feldman, Vu Ha, Rodney Kinney, Sebas- tian Kohlmeier, Kyle Lo, Tyler C. Murray, Hsu- Han Ooi, Matthew E. Peters, Joanna Power, Sam Skjonsberg, Lucy Lu Wang, Christopher Wilhelm, Zheng Yuan, Madeleine van Zuylen, and Oren Et- zioni. 2018. Construction of the literature graph in semantic scholar. In NAACL-HLT.",
+ "links": null
+ },
+ "BIBREF2": {
+ "ref_id": "b2",
+ "title": "A simple but tough-to-beat baseline for sentence embeddings",
+ "authors": [
+ {
+ "first": "Sanjeev",
+ "middle": [],
+ "last": "Arora",
+ "suffix": ""
+ },
+ {
+ "first": "Yingyu",
+ "middle": [],
+ "last": "Liang",
+ "suffix": ""
+ },
+ {
+ "first": "Tengyu",
+ "middle": [],
+ "last": "Ma",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ICLR",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Sanjeev Arora, Yingyu Liang, and Tengyu Ma. 2017. A simple but tough-to-beat baseline for sentence em- beddings. In ICLR.",
+ "links": null
+ },
+ "BIBREF3": {
+ "ref_id": "b3",
+ "title": "SciB-ERT: A Pretrained Language Model for Scientific Text",
+ "authors": [
+ {
+ "first": "Iz",
+ "middle": [],
+ "last": "Beltagy",
+ "suffix": ""
+ },
+ {
+ "first": "Kyle",
+ "middle": [],
+ "last": "Lo",
+ "suffix": ""
+ },
+ {
+ "first": "Arman",
+ "middle": [],
+ "last": "Cohan",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Iz Beltagy, Kyle Lo, and Arman Cohan. 2019. SciB- ERT: A Pretrained Language Model for Scientific Text. In EMNLP.",
+ "links": null
+ },
+ "BIBREF4": {
+ "ref_id": "b4",
+ "title": "Content-Based Citation Recommendation",
+ "authors": [
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": ""
+ },
+ {
+ "first": "Sergey",
+ "middle": [],
+ "last": "Feldman",
+ "suffix": ""
+ },
+ {
+ "first": "Russell",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ },
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Chandra Bhagavatula, Sergey Feldman, Russell Power, and Waleed Ammar. 2018. Content-Based Citation Recommendation. In NAACL-HLT.",
+ "links": null
+ },
+ "BIBREF5": {
+ "ref_id": "b5",
+ "title": "Enriching word vectors with subword information",
+ "authors": [
+ {
+ "first": "Piotr",
+ "middle": [],
+ "last": "Bojanowski",
+ "suffix": ""
+ },
+ {
+ "first": "Edouard",
+ "middle": [],
+ "last": "Grave",
+ "suffix": ""
+ },
+ {
+ "first": "Armand",
+ "middle": [],
+ "last": "Joulin",
+ "suffix": ""
+ },
+ {
+ "first": "Tomas",
+ "middle": [],
+ "last": "Mikolov",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.1162/tacl_a_00051"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Piotr Bojanowski, Edouard Grave, Armand Joulin, and Tomas Mikolov. 2017. Enriching word vectors with subword information. TACL.",
+ "links": null
+ },
+ "BIBREF7": {
+ "ref_id": "b7",
+ "title": "Improving textual network embedding with global attention via optimal transport",
+ "authors": [
+ {
+ "first": "Liqun",
+ "middle": [],
+ "last": "Chen",
+ "suffix": ""
+ },
+ {
+ "first": "Guoyin",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Chenyang",
+ "middle": [],
+ "last": "Tao",
+ "suffix": ""
+ },
+ {
+ "first": "Dinghan",
+ "middle": [],
+ "last": "Shen",
+ "suffix": ""
+ },
+ {
+ "first": "Pengyu",
+ "middle": [],
+ "last": "Cheng",
+ "suffix": ""
+ },
+ {
+ "first": "Xinyuan",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Wenlin",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Yizhe",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Lawrence",
+ "middle": [],
+ "last": "Carin",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Liqun Chen, Guoyin Wang, Chenyang Tao, Ding- han Shen, Pengyu Cheng, Xinyuan Zhang, Wenlin Wang, Yizhe Zhang, and Lawrence Carin. 2019. Im- proving textual network embedding with global at- tention via optimal transport. In ACL.",
+ "links": null
+ },
+ "BIBREF8": {
+ "ref_id": "b8",
+ "title": "Research Paper Recommender Systems on Big Scholarly Data",
+ "authors": [
+ {
+ "first": "Maria",
+ "middle": [],
+ "last": "Tsung Teng Chen",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Lee",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "Knowledge Management and Acquisition for Intelligent Systems",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Tsung Teng Chen and Maria Lee. 2018. Research Pa- per Recommender Systems on Big Scholarly Data. In Knowledge Management and Acquisition for In- telligent Systems.",
+ "links": null
+ },
+ "BIBREF9": {
+ "ref_id": "b9",
+ "title": "The structural and content aspects of abstracts versus bodies of full text journal articles are different",
+ "authors": [
+ {
+ "first": "K",
+ "middle": [],
+ "last": "Cohen",
+ "suffix": ""
+ },
+ {
+ "first": "Helen",
+ "middle": [
+ "L"
+ ],
+ "last": "Johnson",
+ "suffix": ""
+ },
+ {
+ "first": "Karin",
+ "middle": [
+ "M"
+ ],
+ "last": "Verspoor",
+ "suffix": ""
+ },
+ {
+ "first": "Christophe",
+ "middle": [],
+ "last": "Roeder",
+ "suffix": ""
+ },
+ {
+ "first": "Lawrence",
+ "middle": [],
+ "last": "Hunter",
+ "suffix": ""
+ }
+ ],
+ "year": 2010,
+ "venue": "BMC Bioinformatics",
+ "volume": "11",
+ "issue": "",
+ "pages": "492--492",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "K. Bretonnel Cohen, Helen L. Johnson, Karin M. Ver- spoor, Christophe Roeder, and Lawrence Hunter. 2010. The structural and content aspects of abstracts versus bodies of full text journal articles are different. BMC Bioinformatics, 11:492-492.",
+ "links": null
+ },
+ "BIBREF10": {
+ "ref_id": "b10",
+ "title": "Supervised Learning of Universal Sentence Representations from Natural Language Inference Data",
+ "authors": [
+ {
+ "first": "Alexis",
+ "middle": [],
+ "last": "Conneau",
+ "suffix": ""
+ },
+ {
+ "first": "Douwe",
+ "middle": [],
+ "last": "Kiela",
+ "suffix": ""
+ },
+ {
+ "first": "Holger",
+ "middle": [],
+ "last": "Schwenk",
+ "suffix": ""
+ },
+ {
+ "first": "Lo\u00efc",
+ "middle": [],
+ "last": "Barrault",
+ "suffix": ""
+ },
+ {
+ "first": "Antoine",
+ "middle": [],
+ "last": "Bordes",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.18653/v1/D17-1070"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Alexis Conneau, Douwe Kiela, Holger Schwenk, Lo\u00efc Barrault, and Antoine Bordes. 2017. Supervised Learning of Universal Sentence Representations from Natural Language Inference Data. In EMNLP.",
+ "links": null
+ },
+ "BIBREF11": {
+ "ref_id": "b11",
+ "title": "BERT: Pre-training of deep bidirectional transformers for language understanding",
+ "authors": [
+ {
+ "first": "Jacob",
+ "middle": [],
+ "last": "Devlin",
+ "suffix": ""
+ },
+ {
+ "first": "Ming-Wei",
+ "middle": [],
+ "last": "Chang",
+ "suffix": ""
+ },
+ {
+ "first": "Kenton",
+ "middle": [],
+ "last": "Lee",
+ "suffix": ""
+ },
+ {
+ "first": "Kristina",
+ "middle": [],
+ "last": "Toutanova",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "NAACL-HLT",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of deep bidirectional transformers for language under- standing. In NAACL-HLT.",
+ "links": null
+ },
+ "BIBREF12": {
+ "ref_id": "b12",
+ "title": "A Density-based Algorithm for Discovering Clusters in Large Spatial Databases with Noise",
+ "authors": [
+ {
+ "first": "Martin",
+ "middle": [],
+ "last": "Ester",
+ "suffix": ""
+ },
+ {
+ "first": "Hans-Peter",
+ "middle": [],
+ "last": "Kriegel",
+ "suffix": ""
+ },
+ {
+ "first": "J\u00f6rg",
+ "middle": [],
+ "last": "Sander",
+ "suffix": ""
+ },
+ {
+ "first": "Xiaowei",
+ "middle": [],
+ "last": "Xu",
+ "suffix": ""
+ }
+ ],
+ "year": 1996,
+ "venue": "KDD",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Martin Ester, Hans-Peter Kriegel, J\u00f6rg Sander, Xiaowei Xu, et al. 1996. A Density-based Algorithm for Dis- covering Clusters in Large Spatial Databases with Noise. In KDD.",
+ "links": null
+ },
+ "BIBREF13": {
+ "ref_id": "b13",
+ "title": "Quantifying Sex Bias in Clinical Studies at Scale With Automated Data Extraction",
+ "authors": [
+ {
+ "first": "Sergey",
+ "middle": [],
+ "last": "Feldman",
+ "suffix": ""
+ },
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ },
+ {
+ "first": "Kyle",
+ "middle": [],
+ "last": "Lo",
+ "suffix": ""
+ },
+ {
+ "first": "Elly",
+ "middle": [],
+ "last": "Trepman",
+ "suffix": ""
+ },
+ {
+ "first": "Madeleine",
+ "middle": [],
+ "last": "Van Zuylen",
+ "suffix": ""
+ },
+ {
+ "first": "Oren",
+ "middle": [],
+ "last": "Etzioni",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "JAMA",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.1001/jamanetworkopen.2019.6700"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Sergey Feldman, Waleed Ammar, Kyle Lo, Elly Trep- man, Madeleine van Zuylen, and Oren Etzioni. 2019. Quantifying Sex Bias in Clinical Studies at Scale With Automated Data Extraction. JAMA.",
+ "links": null
+ },
+ "BIBREF14": {
+ "ref_id": "b14",
+ "title": "Doc2sent2vec: A novel two-phase approach for learning document representation",
+ "authors": [
+ {
+ "first": "J",
+ "middle": [],
+ "last": "Ganesh",
+ "suffix": ""
+ },
+ {
+ "first": "Manish",
+ "middle": [],
+ "last": "Gupta",
+ "suffix": ""
+ },
+ {
+ "first": "Vijay",
+ "middle": [
+ "K"
+ ],
+ "last": "Varma",
+ "suffix": ""
+ }
+ ],
+ "year": 2016,
+ "venue": "SIGIR",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "J Ganesh, Manish Gupta, and Vijay K. Varma. 2016. Doc2sent2vec: A novel two-phase approach for learning document representation. In SIGIR.",
+ "links": null
+ },
+ "BIBREF15": {
+ "ref_id": "b15",
+ "title": "AllenNLP: A Deep Semantic Natural Language Processing Platform",
+ "authors": [
+ {
+ "first": "Matt",
+ "middle": [],
+ "last": "Gardner",
+ "suffix": ""
+ },
+ {
+ "first": "Joel",
+ "middle": [],
+ "last": "Grus",
+ "suffix": ""
+ },
+ {
+ "first": "Mark",
+ "middle": [],
+ "last": "Neumann",
+ "suffix": ""
+ },
+ {
+ "first": "Oyvind",
+ "middle": [],
+ "last": "Tafjord",
+ "suffix": ""
+ },
+ {
+ "first": "Pradeep",
+ "middle": [],
+ "last": "Dasigi",
+ "suffix": ""
+ },
+ {
+ "first": "Nelson",
+ "middle": [
+ "F"
+ ],
+ "last": "Liu",
+ "suffix": ""
+ },
+ {
+ "first": "Matthew",
+ "middle": [],
+ "last": "Peters",
+ "suffix": ""
+ },
+ {
+ "first": "Michael",
+ "middle": [],
+ "last": "Schmitz",
+ "suffix": ""
+ },
+ {
+ "first": "Luke",
+ "middle": [],
+ "last": "Zettlemoyer",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "Proceedings of Workshop for NLP Open Source Software",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.18653/v1/W18-2501"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Matt Gardner, Joel Grus, Mark Neumann, Oyvind Tafjord, Pradeep Dasigi, Nelson F. Liu, Matthew Pe- ters, Michael Schmitz, and Luke Zettlemoyer. 2018. AllenNLP: A Deep Semantic Natural Language Pro- cessing Platform. In Proceedings of Workshop for NLP Open Source Software (NLP-OSS).",
+ "links": null
+ },
+ "BIBREF16": {
+ "ref_id": "b16",
+ "title": "Neural Vector Spaces for Unsupervised Information Retrieval",
+ "authors": [
+ {
+ "first": "Christophe",
+ "middle": [],
+ "last": "Van Gysel",
+ "suffix": ""
+ },
+ {
+ "first": "Maarten",
+ "middle": [],
+ "last": "De Rijke",
+ "suffix": ""
+ },
+ {
+ "first": "Evangelos",
+ "middle": [],
+ "last": "Kanoulas",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ACM Trans. Inf. Syst",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Christophe Van Gysel, Maarten de Rijke, and Evange- los Kanoulas. 2017. Neural Vector Spaces for Un- supervised Information Retrieval. ACM Trans. Inf. Syst.",
+ "links": null
+ },
+ "BIBREF17": {
+ "ref_id": "b17",
+ "title": "Inductive Representation Learning on Large Graphs",
+ "authors": [
+ {
+ "first": "Will",
+ "middle": [],
+ "last": "Hamilton",
+ "suffix": ""
+ },
+ {
+ "first": "Zhitao",
+ "middle": [],
+ "last": "Ying",
+ "suffix": ""
+ },
+ {
+ "first": "Jure",
+ "middle": [],
+ "last": "Leskovec",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "NIPS",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Will Hamilton, Zhitao Ying, and Jure Leskovec. 2017a. Inductive Representation Learning on Large Graphs. In NIPS.",
+ "links": null
+ },
+ "BIBREF18": {
+ "ref_id": "b18",
+ "title": "Inductive representation learning on large graphs",
+ "authors": [
+ {
+ "first": "William",
+ "middle": [
+ "L"
+ ],
+ "last": "Hamilton",
+ "suffix": ""
+ },
+ {
+ "first": "Zhitao",
+ "middle": [],
+ "last": "Ying",
+ "suffix": ""
+ },
+ {
+ "first": "Jure",
+ "middle": [],
+ "last": "Leskovec",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "NIPS",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "William L. Hamilton, Zhitao Ying, and Jure Leskovec. 2017b. Inductive representation learning on large graphs. In NIPS.",
+ "links": null
+ },
+ "BIBREF19": {
+ "ref_id": "b19",
+ "title": "Explaining away syntactic structure in semantic document representations",
+ "authors": [
+ {
+ "first": "Erik",
+ "middle": [],
+ "last": "Holmer",
+ "suffix": ""
+ },
+ {
+ "first": "Andreas",
+ "middle": [],
+ "last": "Marfurt",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "ArXiv",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Erik Holmer and Andreas Marfurt. 2018. Explaining away syntactic structure in semantic document rep- resentations. ArXiv, abs/1806.01620.",
+ "links": null
+ },
+ "BIBREF20": {
+ "ref_id": "b20",
+ "title": "Universal Language Model Fine-tuning for Text Classification",
+ "authors": [
+ {
+ "first": "Jeremy",
+ "middle": [],
+ "last": "Howard",
+ "suffix": ""
+ },
+ {
+ "first": "Sebastian",
+ "middle": [],
+ "last": "Ruder",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.18653/v1/P18-1031"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Jeremy Howard and Sebastian Ruder. 2018. Universal Language Model Fine-tuning for Text Classification. In ACL.",
+ "links": null
+ },
+ "BIBREF21": {
+ "ref_id": "b21",
+ "title": "A context-aware citation recommendation model with bert and graph convolutional networks",
+ "authors": [
+ {
+ "first": "Chanwoo",
+ "middle": [],
+ "last": "Jeong",
+ "suffix": ""
+ },
+ {
+ "first": "Sion",
+ "middle": [],
+ "last": "Jang",
+ "suffix": ""
+ },
+ {
+ "first": "Hyuna",
+ "middle": [],
+ "last": "Shin",
+ "suffix": ""
+ },
+ {
+ "first": "Lucy",
+ "middle": [],
+ "last": "Eunjeong",
+ "suffix": ""
+ },
+ {
+ "first": "Sungchul",
+ "middle": [],
+ "last": "Park",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Choi",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "ArXiv",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Chanwoo Jeong, Sion Jang, Hyuna Shin, Eun- jeong Lucy Park, and Sungchul Choi. 2019. A context-aware citation recommendation model with bert and graph convolutional networks. ArXiv, abs/1903.06464.",
+ "links": null
+ },
+ "BIBREF22": {
+ "ref_id": "b22",
+ "title": "A Scalable Hybrid Research Paper Recommender System for Microsoft Academic",
+ "authors": [
+ {
+ "first": "Anshul",
+ "middle": [],
+ "last": "Kanakia",
+ "suffix": ""
+ },
+ {
+ "first": "Zhihong",
+ "middle": [],
+ "last": "Shen",
+ "suffix": ""
+ },
+ {
+ "first": "Darrin",
+ "middle": [],
+ "last": "Eide",
+ "suffix": ""
+ },
+ {
+ "first": "Kuansan",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "WWW",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Anshul Kanakia, Zhihong Shen, Darrin Eide, and Kuansan Wang. 2019. A Scalable Hybrid Research Paper Recommender System for Microsoft Aca- demic. In WWW.",
+ "links": null
+ },
+ "BIBREF23": {
+ "ref_id": "b23",
+ "title": "Adam: A Method for Stochastic Optimization",
+ "authors": [
+ {
+ "first": "P",
+ "middle": [],
+ "last": "Diederik",
+ "suffix": ""
+ },
+ {
+ "first": "Jimmy",
+ "middle": [],
+ "last": "Kingma",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Ba",
+ "suffix": ""
+ }
+ ],
+ "year": 2014,
+ "venue": "ArXiv",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Diederik P. Kingma and Jimmy Ba. 2014. Adam: A Method for Stochastic Optimization. ArXiv, abs/1412.6980.",
+ "links": null
+ },
+ "BIBREF24": {
+ "ref_id": "b24",
+ "title": "Semisupervised classification with graph convolutional networks",
+ "authors": [
+ {
+ "first": "N",
+ "middle": [],
+ "last": "Thomas",
+ "suffix": ""
+ },
+ {
+ "first": "Max",
+ "middle": [],
+ "last": "Kipf",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Welling",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Thomas N Kipf and Max Welling. 2017. Semi- supervised classification with graph convolutional networks. ICLR.",
+ "links": null
+ },
+ "BIBREF25": {
+ "ref_id": "b25",
+ "title": "Raquel Urtasun, and Sanja Fidler. 2015. Skip-thought vectors",
+ "authors": [
+ {
+ "first": "Ryan",
+ "middle": [],
+ "last": "Kiros",
+ "suffix": ""
+ },
+ {
+ "first": "Yukun",
+ "middle": [],
+ "last": "Zhu",
+ "suffix": ""
+ },
+ {
+ "first": "Ruslan",
+ "middle": [],
+ "last": "Salakhutdinov",
+ "suffix": ""
+ },
+ {
+ "first": "Richard",
+ "middle": [
+ "S"
+ ],
+ "last": "Zemel",
+ "suffix": ""
+ },
+ {
+ "first": "Antonio",
+ "middle": [],
+ "last": "Torralba",
+ "suffix": ""
+ }
+ ],
+ "year": null,
+ "venue": "NIPS",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel, Antonio Torralba, Raquel Urta- sun, and Sanja Fidler. 2015. Skip-thought vectors. In NIPS.",
+ "links": null
+ },
+ "BIBREF26": {
+ "ref_id": "b26",
+ "title": "Identifying a better measure of relatedness for mapping science",
+ "authors": [
+ {
+ "first": "Richard",
+ "middle": [],
+ "last": "Klavans",
+ "suffix": ""
+ },
+ {
+ "first": "Kevin",
+ "middle": [
+ "W"
+ ],
+ "last": "Boyack",
+ "suffix": ""
+ }
+ ],
+ "year": 2006,
+ "venue": "Journal of the Association for Information Science and Technology",
+ "volume": "57",
+ "issue": "",
+ "pages": "251--263",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Richard Klavans and Kevin W. Boyack. 2006. Iden- tifying a better measure of relatedness for mapping science. Journal of the Association for Information Science and Technology, 57:251-263.",
+ "links": null
+ },
+ "BIBREF27": {
+ "ref_id": "b27",
+ "title": "An empirical evaluation of doc2vec with practical insights into document embedding generation",
+ "authors": [
+ {
+ "first": "Han",
+ "middle": [],
+ "last": "Jey",
+ "suffix": ""
+ },
+ {
+ "first": "Timothy",
+ "middle": [],
+ "last": "Lau",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Baldwin",
+ "suffix": ""
+ }
+ ],
+ "year": 2016,
+ "venue": "Rep4NLP@ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Jey Han Lau and Timothy Baldwin. 2016. An empirical evaluation of doc2vec with practical in- sights into document embedding generation. In Rep4NLP@ACL.",
+ "links": null
+ },
+ "BIBREF28": {
+ "ref_id": "b28",
+ "title": "Distributed Representations of Sentences and Documents",
+ "authors": [
+ {
+ "first": "Quoc",
+ "middle": [],
+ "last": "Le",
+ "suffix": ""
+ },
+ {
+ "first": "Tomas",
+ "middle": [],
+ "last": "Mikolov",
+ "suffix": ""
+ }
+ ],
+ "year": 2014,
+ "venue": "ICML",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Quoc Le and Tomas Mikolov. 2014. Distributed Repre- sentations of Sentences and Documents. In ICML.",
+ "links": null
+ },
+ "BIBREF29": {
+ "ref_id": "b29",
+ "title": "Is searching full text more effective than searching abstracts?",
+ "authors": [
+ {
+ "first": "Jimmy",
+ "middle": [
+ "J"
+ ],
+ "last": "Lin",
+ "suffix": ""
+ }
+ ],
+ "year": 2008,
+ "venue": "BMC Bioinformatics",
+ "volume": "10",
+ "issue": "",
+ "pages": "46--46",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Jimmy J. Lin. 2008. Is searching full text more effec- tive than searching abstracts? BMC Bioinformatics, 10:46-46.",
+ "links": null
+ },
+ "BIBREF30": {
+ "ref_id": "b30",
+ "title": "Bulletin of the Medical Library Association",
+ "authors": [
+ {
+ "first": "Carolyn",
+ "middle": [
+ "E"
+ ],
+ "last": "Lipscomb",
+ "suffix": ""
+ }
+ ],
+ "year": 2000,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Carolyn E Lipscomb. 2000. Medical Subject Headings (MeSH). Bulletin of the Medical Library Associa- tion.",
+ "links": null
+ },
+ "BIBREF31": {
+ "ref_id": "b31",
+ "title": "Unsupervised Document Embedding with CNNs",
+ "authors": [
+ {
+ "first": "Chundi",
+ "middle": [],
+ "last": "Liu",
+ "suffix": ""
+ },
+ {
+ "first": "Shunan",
+ "middle": [],
+ "last": "Zhao",
+ "suffix": ""
+ },
+ {
+ "first": "Maksims",
+ "middle": [],
+ "last": "Volkovs",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "ArXiv",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Chundi Liu, Shunan Zhao, and Maksims Volkovs. 2018. Unsupervised Document Embedding with CNNs. ArXiv, abs/1711.04168v3.",
+ "links": null
+ },
+ "BIBREF32": {
+ "ref_id": "b32",
+ "title": "A Model of Extended Paragraph Vector for Document Categorization and Trend Analysis",
+ "authors": [
+ {
+ "first": "Pengfei",
+ "middle": [],
+ "last": "Liu",
+ "suffix": ""
+ },
+ {
+ "first": "King",
+ "middle": [
+ "Keung"
+ ],
+ "last": "Wu",
+ "suffix": ""
+ },
+ {
+ "first": "Helen",
+ "middle": [
+ "M"
+ ],
+ "last": "Meng",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Pengfei Liu, King Keung Wu, and Helen M. Meng. 2017. A Model of Extended Paragraph Vector for Document Categorization and Trend Analysis. IJCNN.",
+ "links": null
+ },
+ "BIBREF34": {
+ "ref_id": "b34",
+ "title": "Accelerating t-SNE Using Tree-based Algorithms",
+ "authors": [
+ {
+ "first": "Laurens",
+ "middle": [],
+ "last": "Van Der Maaten",
+ "suffix": ""
+ }
+ ],
+ "year": 2014,
+ "venue": "Journal of Machine Learning Research",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Laurens van der Maaten. 2014. Accelerating t-SNE Using Tree-based Algorithms. Journal of Machine Learning Research.",
+ "links": null
+ },
+ "BIBREF35": {
+ "ref_id": "b35",
+ "title": "DisSent: Learning Sentence Representations from Explicit Discourse Relations",
+ "authors": [
+ {
+ "first": "Allen",
+ "middle": [],
+ "last": "Nie",
+ "suffix": ""
+ },
+ {
+ "first": "Erin",
+ "middle": [],
+ "last": "Bennett",
+ "suffix": ""
+ },
+ {
+ "first": "Noah",
+ "middle": [],
+ "last": "Goodman",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.18653/v1/P19-1442"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Allen Nie, Erin Bennett, and Noah Goodman. 2019. DisSent: Learning Sentence Representations from Explicit Discourse Relations. In ACL.",
+ "links": null
+ },
+ "BIBREF36": {
+ "ref_id": "b36",
+ "title": "Scikit-learn: Machine learning in Python",
+ "authors": [
+ {
+ "first": "F",
+ "middle": [],
+ "last": "Pedregosa",
+ "suffix": ""
+ },
+ {
+ "first": "G",
+ "middle": [],
+ "last": "Varoquaux",
+ "suffix": ""
+ },
+ {
+ "first": "A",
+ "middle": [],
+ "last": "Gramfort",
+ "suffix": ""
+ },
+ {
+ "first": "V",
+ "middle": [],
+ "last": "Michel",
+ "suffix": ""
+ },
+ {
+ "first": "B",
+ "middle": [],
+ "last": "Thirion",
+ "suffix": ""
+ },
+ {
+ "first": "O",
+ "middle": [],
+ "last": "Grisel",
+ "suffix": ""
+ },
+ {
+ "first": "M",
+ "middle": [],
+ "last": "Blondel",
+ "suffix": ""
+ },
+ {
+ "first": "P",
+ "middle": [],
+ "last": "Prettenhofer",
+ "suffix": ""
+ },
+ {
+ "first": "R",
+ "middle": [],
+ "last": "Weiss",
+ "suffix": ""
+ },
+ {
+ "first": "V",
+ "middle": [],
+ "last": "Dubourg",
+ "suffix": ""
+ },
+ {
+ "first": "J",
+ "middle": [],
+ "last": "Vanderplas",
+ "suffix": ""
+ },
+ {
+ "first": "A",
+ "middle": [],
+ "last": "Passos",
+ "suffix": ""
+ },
+ {
+ "first": "D",
+ "middle": [],
+ "last": "Cournapeau",
+ "suffix": ""
+ },
+ {
+ "first": "M",
+ "middle": [],
+ "last": "Brucher",
+ "suffix": ""
+ },
+ {
+ "first": "M",
+ "middle": [],
+ "last": "Perrot",
+ "suffix": ""
+ },
+ {
+ "first": "E",
+ "middle": [],
+ "last": "Duchesnay",
+ "suffix": ""
+ }
+ ],
+ "year": 2011,
+ "venue": "Journal of Machine Learning Research",
+ "volume": "12",
+ "issue": "",
+ "pages": "2825--2830",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "F. Pedregosa, G. Varoquaux, A. Gramfort, V. Michel, B. Thirion, O. Grisel, M. Blondel, P. Prettenhofer, R. Weiss, V. Dubourg, J. Vanderplas, A. Passos, D. Cournapeau, M. Brucher, M. Perrot, and E. Duch- esnay. 2011. Scikit-learn: Machine learning in Python. Journal of Machine Learning Research, 12:2825-2830.",
+ "links": null
+ },
+ "BIBREF37": {
+ "ref_id": "b37",
+ "title": "Deep Contextualized Word Representations",
+ "authors": [
+ {
+ "first": "Matthew",
+ "middle": [
+ "E"
+ ],
+ "last": "Peters",
+ "suffix": ""
+ },
+ {
+ "first": "Mark",
+ "middle": [],
+ "last": "Neumann",
+ "suffix": ""
+ },
+ {
+ "first": "Mohit",
+ "middle": [],
+ "last": "Iyyer",
+ "suffix": ""
+ },
+ {
+ "first": "Matt",
+ "middle": [],
+ "last": "Gardner",
+ "suffix": ""
+ },
+ {
+ "first": "Christopher",
+ "middle": [],
+ "last": "Clark",
+ "suffix": ""
+ },
+ {
+ "first": "Kenton",
+ "middle": [],
+ "last": "Lee",
+ "suffix": ""
+ },
+ {
+ "first": "Luke",
+ "middle": [],
+ "last": "Zettlemoyer",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Matthew E. Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, and Luke Zettlemoyer. 2018. Deep Contextualized Word Rep- resentations.",
+ "links": null
+ },
+ "BIBREF38": {
+ "ref_id": "b38",
+ "title": "Improving language understanding by generative pre-training",
+ "authors": [
+ {
+ "first": "Alec",
+ "middle": [],
+ "last": "Radford",
+ "suffix": ""
+ },
+ {
+ "first": "Karthik",
+ "middle": [],
+ "last": "Narasimhan",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. 2018. Improving language under- standing by generative pre-training. arXiv.",
+ "links": null
+ },
+ "BIBREF39": {
+ "ref_id": "b39",
+ "title": "Software Framework for Topic Modelling with Large Corpora",
+ "authors": [
+ {
+ "first": "Petr",
+ "middle": [],
+ "last": "Radim\u0159eh\u016f\u0159ek",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Sojka",
+ "suffix": ""
+ }
+ ],
+ "year": 2010,
+ "venue": "LREC",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Radim\u0158eh\u016f\u0159ek and Petr Sojka. 2010. Software Frame- work for Topic Modelling with Large Corpora. In LREC.",
+ "links": null
+ },
+ "BIBREF40": {
+ "ref_id": "b40",
+ "title": "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ "authors": [
+ {
+ "first": "Nils",
+ "middle": [],
+ "last": "Reimers",
+ "suffix": ""
+ },
+ {
+ "first": "Iryna",
+ "middle": [],
+ "last": "Gurevych",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Nils Reimers and Iryna Gurevych. 2019. Sentence- BERT: Sentence Embeddings using Siamese BERT- Networks. In EMNLP.",
+ "links": null
+ },
+ "BIBREF41": {
+ "ref_id": "b41",
+ "title": "Vmeasure: A Conditional Entropy-based External Cluster Evaluation Measure",
+ "authors": [
+ {
+ "first": "Andrew",
+ "middle": [],
+ "last": "Rosenberg",
+ "suffix": ""
+ },
+ {
+ "first": "Julia",
+ "middle": [],
+ "last": "Hirschberg",
+ "suffix": ""
+ }
+ ],
+ "year": 2007,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Andrew Rosenberg and Julia Hirschberg. 2007. V- measure: A Conditional Entropy-based External Cluster Evaluation Measure. In EMNLP.",
+ "links": null
+ },
+ "BIBREF42": {
+ "ref_id": "b42",
+ "title": "Collaborative filtering recommender systems",
+ "authors": [
+ {
+ "first": "Ben",
+ "middle": [],
+ "last": "Schafer",
+ "suffix": ""
+ },
+ {
+ "first": "Dan",
+ "middle": [],
+ "last": "Frankowski",
+ "suffix": ""
+ },
+ {
+ "first": "Jon",
+ "middle": [],
+ "last": "Herlocker",
+ "suffix": ""
+ },
+ {
+ "first": "Shilad",
+ "middle": [],
+ "last": "Sen",
+ "suffix": ""
+ }
+ ],
+ "year": 2007,
+ "venue": "The adaptive web",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "J Ben Schafer, Dan Frankowski, Jon Herlocker, and Shilad Sen. 2007. Collaborative filtering recom- mender systems. In The adaptive web. Springer.",
+ "links": null
+ },
+ "BIBREF43": {
+ "ref_id": "b43",
+ "title": "Distribution of information in biomedical abstracts and full-text publications",
+ "authors": [
+ {
+ "first": "J",
+ "middle": [],
+ "last": "Martijn",
+ "suffix": ""
+ },
+ {
+ "first": "Marc",
+ "middle": [],
+ "last": "Schuemie",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Weeber",
+ "suffix": ""
+ },
+ {
+ "first": "J",
+ "middle": [
+ "A"
+ ],
+ "last": "Bob",
+ "suffix": ""
+ },
+ {
+ "first": "Erik",
+ "middle": [
+ "M"
+ ],
+ "last": "Schijvenaars",
+ "suffix": ""
+ },
+ {
+ "first": "C",
+ "middle": [],
+ "last": "Van Mulligen",
+ "suffix": ""
+ },
+ {
+ "first": "Rob",
+ "middle": [],
+ "last": "Christiaan Van Der Eijk",
+ "suffix": ""
+ },
+ {
+ "first": "Barend",
+ "middle": [],
+ "last": "Jelier",
+ "suffix": ""
+ },
+ {
+ "first": "Jan",
+ "middle": [
+ "A"
+ ],
+ "last": "Mons",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Kors",
+ "suffix": ""
+ }
+ ],
+ "year": 2004,
+ "venue": "",
+ "volume": "20",
+ "issue": "",
+ "pages": "2597--604",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Martijn J. Schuemie, Marc Weeber, Bob J. A. Schijve- naars, Erik M. van Mulligen, C. Christiaan van der Eijk, Rob Jelier, Barend Mons, and Jan A. Kors. 2004. Distribution of information in biomedical ab- stracts and full-text publications. Bioinformatics, 20(16):2597-604.",
+ "links": null
+ },
+ "BIBREF44": {
+ "ref_id": "b44",
+ "title": "Improved semantic-aware network embedding with fine-grained word alignment",
+ "authors": [
+ {
+ "first": "Dinghan",
+ "middle": [],
+ "last": "Shen",
+ "suffix": ""
+ },
+ {
+ "first": "Xinyuan",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Ricardo",
+ "middle": [],
+ "last": "Henao",
+ "suffix": ""
+ },
+ {
+ "first": "Lawrence",
+ "middle": [],
+ "last": "Carin",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Dinghan Shen, Xinyuan Zhang, Ricardo Henao, and Lawrence Carin. 2018. Improved semantic-aware network embedding with fine-grained word align- ment. In EMNLP.",
+ "links": null
+ },
+ "BIBREF45": {
+ "ref_id": "b45",
+ "title": "An Overview of Microsoft Academic Service (MAS) and Applications",
+ "authors": [
+ {
+ "first": "Arnab",
+ "middle": [],
+ "last": "Sinha",
+ "suffix": ""
+ },
+ {
+ "first": "Zhihong",
+ "middle": [],
+ "last": "Shen",
+ "suffix": ""
+ },
+ {
+ "first": "Yang",
+ "middle": [],
+ "last": "Song",
+ "suffix": ""
+ },
+ {
+ "first": "Hao",
+ "middle": [],
+ "last": "Ma",
+ "suffix": ""
+ },
+ {
+ "first": "Darrin",
+ "middle": [],
+ "last": "Eide",
+ "suffix": ""
+ },
+ {
+ "first": "Bo-June Paul",
+ "middle": [],
+ "last": "Hsu",
+ "suffix": ""
+ },
+ {
+ "first": "Kuansan",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ }
+ ],
+ "year": 2015,
+ "venue": "WWW",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Arnab Sinha, Zhihong Shen, Yang Song, Hao Ma, Dar- rin Eide, Bo-June Paul Hsu, and Kuansan Wang. 2015. An Overview of Microsoft Academic Service (MAS) and Applications. In WWW.",
+ "links": null
+ },
+ "BIBREF46": {
+ "ref_id": "b46",
+ "title": "Cane: Context-aware network embedding for relation modeling",
+ "authors": [
+ {
+ "first": "Cunchao",
+ "middle": [],
+ "last": "Tu",
+ "suffix": ""
+ },
+ {
+ "first": "Han",
+ "middle": [],
+ "last": "Liu",
+ "suffix": ""
+ },
+ {
+ "first": "Zhiyuan",
+ "middle": [],
+ "last": "Liu",
+ "suffix": ""
+ },
+ {
+ "first": "Maosong",
+ "middle": [],
+ "last": "Sun",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Cunchao Tu, Han Liu, Zhiyuan Liu, and Maosong Sun. 2017. Cane: Context-aware network embedding for relation modeling. In ACL.",
+ "links": null
+ },
+ "BIBREF47": {
+ "ref_id": "b47",
+ "title": "Attention Is All You Need",
+ "authors": [
+ {
+ "first": "Ashish",
+ "middle": [],
+ "last": "Vaswani",
+ "suffix": ""
+ },
+ {
+ "first": "Noam",
+ "middle": [],
+ "last": "Shazeer",
+ "suffix": ""
+ },
+ {
+ "first": "Niki",
+ "middle": [],
+ "last": "Parmar",
+ "suffix": ""
+ },
+ {
+ "first": "Jakob",
+ "middle": [],
+ "last": "Uszkoreit",
+ "suffix": ""
+ },
+ {
+ "first": "Llion",
+ "middle": [],
+ "last": "Jones",
+ "suffix": ""
+ },
+ {
+ "first": "Aidan",
+ "middle": [
+ "N"
+ ],
+ "last": "Gomez",
+ "suffix": ""
+ },
+ {
+ "first": "Lukasz",
+ "middle": [],
+ "last": "Kaiser",
+ "suffix": ""
+ },
+ {
+ "first": "Illia",
+ "middle": [],
+ "last": "Polosukhin",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "NIPS",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. In NIPS.",
+ "links": null
+ },
+ "BIBREF48": {
+ "ref_id": "b48",
+ "title": "Improving textual network learning with variational homophilic embeddings",
+ "authors": [
+ {
+ "first": "Wenlin",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Chenyang",
+ "middle": [],
+ "last": "Tao",
+ "suffix": ""
+ },
+ {
+ "first": "Zhe",
+ "middle": [],
+ "last": "Gan",
+ "suffix": ""
+ },
+ {
+ "first": "Guoyin",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Liqun",
+ "middle": [],
+ "last": "Chen",
+ "suffix": ""
+ },
+ {
+ "first": "Xinyuan",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Ruiyi",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Qian",
+ "middle": [],
+ "last": "Yang",
+ "suffix": ""
+ },
+ {
+ "first": "Ricardo",
+ "middle": [],
+ "last": "Henao",
+ "suffix": ""
+ },
+ {
+ "first": "Lawrence",
+ "middle": [],
+ "last": "Carin",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "Advances in Neural Information Processing Systems",
+ "volume": "",
+ "issue": "",
+ "pages": "2074--2085",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Wenlin Wang, Chenyang Tao, Zhe Gan, Guoyin Wang, Liqun Chen, Xinyuan Zhang, Ruiyi Zhang, Qian Yang, Ricardo Henao, and Lawrence Carin. 2019. Improving textual network learning with variational homophilic embeddings. In Advances in Neural In- formation Processing Systems, pages 2074-2085.",
+ "links": null
+ },
+ "BIBREF49": {
+ "ref_id": "b49",
+ "title": "A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference",
+ "authors": [
+ {
+ "first": "Adina",
+ "middle": [],
+ "last": "Williams",
+ "suffix": ""
+ },
+ {
+ "first": "Nikita",
+ "middle": [],
+ "last": "Nangia",
+ "suffix": ""
+ },
+ {
+ "first": "Samuel",
+ "middle": [],
+ "last": "Bowman",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.18653/v1/N18-1101"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Adina Williams, Nikita Nangia, and Samuel Bowman. 2018. A Broad-Coverage Challenge Corpus for Sen- tence Understanding through Inference. In NAACL- HLT.",
+ "links": null
+ },
+ "BIBREF50": {
+ "ref_id": "b50",
+ "title": "Simplifying graph convolutional networks",
+ "authors": [
+ {
+ "first": "Felix",
+ "middle": [],
+ "last": "Wu",
+ "suffix": ""
+ },
+ {
+ "first": "H",
+ "middle": [],
+ "last": "Amauri",
+ "suffix": ""
+ },
+ {
+ "first": "Tianyi",
+ "middle": [],
+ "last": "Souza",
+ "suffix": ""
+ },
+ {
+ "first": "Christopher",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Tao",
+ "middle": [],
+ "last": "Fifty",
+ "suffix": ""
+ },
+ {
+ "first": "Kilian",
+ "middle": [
+ "Q"
+ ],
+ "last": "Yu",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Weinberger",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "ICML",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Felix Wu, Amauri H. Souza, Tianyi Zhang, Christo- pher Fifty, Tao Yu, and Kilian Q. Weinberger. 2019a. Simplifying graph convolutional networks. In ICML.",
+ "links": null
+ },
+ "BIBREF51": {
+ "ref_id": "b51",
+ "title": "Word Mover's Embedding: From Word2Vec to Document Embedding",
+ "authors": [
+ {
+ "first": "Lingfei",
+ "middle": [],
+ "last": "Wu",
+ "suffix": ""
+ },
+ {
+ "first": "Ian",
+ "middle": [],
+ "last": "En-Hsu Yen",
+ "suffix": ""
+ },
+ {
+ "first": "Kun",
+ "middle": [],
+ "last": "Xu",
+ "suffix": ""
+ },
+ {
+ "first": "Fangli",
+ "middle": [],
+ "last": "Xu",
+ "suffix": ""
+ },
+ {
+ "first": "Avinash",
+ "middle": [],
+ "last": "Balakrishnan",
+ "suffix": ""
+ },
+ {
+ "first": "Pin-Yu",
+ "middle": [],
+ "last": "Chen",
+ "suffix": ""
+ },
+ {
+ "first": "Pradeep",
+ "middle": [],
+ "last": "Ravikumar",
+ "suffix": ""
+ },
+ {
+ "first": "Michael",
+ "middle": [
+ "J"
+ ],
+ "last": "Witbrock",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Lingfei Wu, Ian En-Hsu Yen, Kun Xu, Fangli Xu, Avinash Balakrishnan, Pin-Yu Chen, Pradeep Ravikumar, and Michael J Witbrock. 2018. Word Mover's Embedding: From Word2Vec to Document Embedding. In EMNLP.",
+ "links": null
+ },
+ "BIBREF52": {
+ "ref_id": "b52",
+ "title": "Google's neural machine translation system: Bridging the gap between human and machine translation",
+ "authors": [
+ {
+ "first": "Yonghui",
+ "middle": [],
+ "last": "Wu",
+ "suffix": ""
+ },
+ {
+ "first": "Mike",
+ "middle": [],
+ "last": "Schuster",
+ "suffix": ""
+ },
+ {
+ "first": "Zhifeng",
+ "middle": [],
+ "last": "Chen",
+ "suffix": ""
+ },
+ {
+ "first": "V",
+ "middle": [],
+ "last": "Quoc",
+ "suffix": ""
+ },
+ {
+ "first": "Mohammad",
+ "middle": [],
+ "last": "Le",
+ "suffix": ""
+ },
+ {
+ "first": "Wolfgang",
+ "middle": [],
+ "last": "Norouzi",
+ "suffix": ""
+ },
+ {
+ "first": "Maxim",
+ "middle": [],
+ "last": "Macherey",
+ "suffix": ""
+ },
+ {
+ "first": "Yuan",
+ "middle": [],
+ "last": "Krikun",
+ "suffix": ""
+ },
+ {
+ "first": "Qin",
+ "middle": [],
+ "last": "Cao",
+ "suffix": ""
+ },
+ {
+ "first": "Klaus",
+ "middle": [],
+ "last": "Gao",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Macherey",
+ "suffix": ""
+ }
+ ],
+ "year": 2016,
+ "venue": "ArXiv",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. 2016. Google's neural machine translation system: Bridging the gap between human and machine translation. ArXiv, abs/1609.08144.",
+ "links": null
+ },
+ "BIBREF54": {
+ "ref_id": "b54",
+ "title": "Xlnet: Generalized autoregressive pretraining for language understanding",
+ "authors": [
+ {
+ "first": "Zhilin",
+ "middle": [],
+ "last": "Yang",
+ "suffix": ""
+ },
+ {
+ "first": "Zihang",
+ "middle": [],
+ "last": "Dai",
+ "suffix": ""
+ },
+ {
+ "first": "Yiming",
+ "middle": [],
+ "last": "Yang",
+ "suffix": ""
+ },
+ {
+ "first": "Jaime",
+ "middle": [
+ "G"
+ ],
+ "last": "Carbonell",
+ "suffix": ""
+ },
+ {
+ "first": "Ruslan",
+ "middle": [],
+ "last": "Salakhutdinov",
+ "suffix": ""
+ },
+ {
+ "first": "V",
+ "middle": [],
+ "last": "Quoc",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Le",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "ArXiv",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Zhilin Yang, Zihang Dai, Yiming Yang, Jaime G. Car- bonell, Ruslan Salakhutdinov, and Quoc V. Le. 2019. Xlnet: Generalized autoregressive pretraining for language understanding. ArXiv, abs/1906.08237.",
+ "links": null
+ },
+ "BIBREF55": {
+ "ref_id": "b55",
+ "title": "From neural re-ranking to neural ranking: Learning a sparse representation for inverted indexing",
+ "authors": [
+ {
+ "first": "Hamed",
+ "middle": [],
+ "last": "Zamani",
+ "suffix": ""
+ },
+ {
+ "first": "Mostafa",
+ "middle": [],
+ "last": "Dehghani",
+ "suffix": ""
+ },
+ {
+ "first": "W",
+ "middle": [
+ "Bruce"
+ ],
+ "last": "Croft",
+ "suffix": ""
+ },
+ {
+ "first": "Erik",
+ "middle": [
+ "G"
+ ],
+ "last": "",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "CIKM",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Hamed Zamani, Mostafa Dehghani, W. Bruce Croft, Erik G. Learned-Miller, and Jaap Kamps. 2018. From neural re-ranking to neural ranking: Learn- ing a sparse representation for inverted indexing. In CIKM.",
+ "links": null
+ },
+ "BIBREF56": {
+ "ref_id": "b56",
+ "title": "Diffusion maps for textual network embedding",
+ "authors": [
+ {
+ "first": "Xinyuan",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Yitong",
+ "middle": [],
+ "last": "Li",
+ "suffix": ""
+ },
+ {
+ "first": "Dinghan",
+ "middle": [],
+ "last": "Shen",
+ "suffix": ""
+ },
+ {
+ "first": "Lawrence",
+ "middle": [],
+ "last": "Carin",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Xinyuan Zhang, Yitong Li, Dinghan Shen, and Lawrence Carin. 2018. Diffusion maps for textual network embedding. In NeurIPS.",
+ "links": null
+ }
+ },
+ "ref_entries": {
+ "FIGREF0": {
+ "text": "t-SNE visualization of paper embeddings and their corresponding MAG topics.",
+ "type_str": "figure",
+ "uris": null,
+ "num": null
+ },
+ "TABREF1": {
+ "type_str": "table",
+ "text": "Results on the SCIDOCS evaluation suite consisting of 7 tasks.",
+ "content": "",
+ "html": null,
+ "num": null
+ },
+ "TABREF3": {
+ "type_str": "table",
+ "text": "",
+ "content": "",
+ "html": null,
+ "num": null
+ },
+ "TABREF4": {
+ "type_str": "table",
+ "text": "SciBERT fine-tune on co-view 83.0 84.2 84.1 36.4 76.0 SciBERT fine-tune on co-read 82.3 85.4 86.7 36.3 77.1 SciBERT fine-tune on co-citation 82.9 84.3 85.2 36.6 76.4 SciBERT fine-tune on multitask 83.3 86.1 88.2 36.0 78.0",
+ "content": "Training signal CLS USR CITE REC All SPECTER 84.2 88.4 91.5 36.9 80.0
",
+ "html": null,
+ "num": null
+ },
+ "TABREF5": {
+ "type_str": "table",
+ "text": "Comparison with task-specific fine-tuning.",
+ "content": "",
+ "html": null,
+ "num": null
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/s2orc-doc2json/output_dir/N18-3011.json b/s2orc-doc2json/output_dir/N18-3011.json
new file mode 100644
index 0000000000000000000000000000000000000000..b54d02485439a19654deaff791bfa507ce20caaa
--- /dev/null
+++ b/s2orc-doc2json/output_dir/N18-3011.json
@@ -0,0 +1,2134 @@
+{
+ "paper_id": "N18-3011",
+ "header": {
+ "generated_with": "S2ORC 1.0.0",
+ "date_generated": "2022-06-22T20:47:59.855613Z"
+ },
+ "title": "Construction of the Literature Graph in Semantic Scholar",
+ "authors": [
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": "",
+ "affiliation": {},
+ "email": "waleeda@allenai.org"
+ },
+ {
+ "first": "Dirk",
+ "middle": [],
+ "last": "Groeneveld",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Iz",
+ "middle": [],
+ "last": "Beltagy",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Miles",
+ "middle": [],
+ "last": "Crawford",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Doug",
+ "middle": [],
+ "last": "Downey",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Jason",
+ "middle": [],
+ "last": "Dunkelberger",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Ahmed",
+ "middle": [],
+ "last": "Elgohary",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Sergey",
+ "middle": [],
+ "last": "Feldman",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Vu",
+ "middle": [],
+ "last": "Ha",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Rodney",
+ "middle": [],
+ "last": "Kinney",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Sebastian",
+ "middle": [],
+ "last": "Kohlmeier",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Kyle",
+ "middle": [],
+ "last": "Lo",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Tyler",
+ "middle": [],
+ "last": "Murray",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Hsu-Han",
+ "middle": [],
+ "last": "Ooi",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Matthew",
+ "middle": [],
+ "last": "Peters",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Joanna",
+ "middle": [],
+ "last": "Power",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Sam",
+ "middle": [],
+ "last": "Skjonsberg",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Lucy",
+ "middle": [
+ "Lu"
+ ],
+ "last": "Wang",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Chris",
+ "middle": [],
+ "last": "Wilhelm",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Zheng",
+ "middle": [],
+ "last": "Yuan",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Madeleine",
+ "middle": [],
+ "last": "Van Zuylen",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Oren",
+ "middle": [],
+ "last": "Etzioni",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ }
+ ],
+ "year": "",
+ "venue": null,
+ "identifiers": {},
+ "abstract": "We describe a deployed scalable system for organizing published scientific literature into a heterogeneous graph to facilitate algorithmic manipulation and discovery. The resulting literature graph consists of more than 280M nodes, representing papers, authors, entities and various interactions between them (e.g., authorships, citations, entity mentions). We reduce literature graph construction into familiar NLP tasks (e.g., entity extraction and linking), point out research challenges due to differences from standard formulations of these tasks, and report empirical results for each task. The methods described in this paper are used to enable semantic features in www.semanticscholar.org.",
+ "pdf_parse": {
+ "paper_id": "N18-3011",
+ "_pdf_hash": "",
+ "abstract": [
+ {
+ "text": "We describe a deployed scalable system for organizing published scientific literature into a heterogeneous graph to facilitate algorithmic manipulation and discovery. The resulting literature graph consists of more than 280M nodes, representing papers, authors, entities and various interactions between them (e.g., authorships, citations, entity mentions). We reduce literature graph construction into familiar NLP tasks (e.g., entity extraction and linking), point out research challenges due to differences from standard formulations of these tasks, and report empirical results for each task. The methods described in this paper are used to enable semantic features in www.semanticscholar.org.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Abstract",
+ "sec_num": null
+ }
+ ],
+ "body_text": [
+ {
+ "text": "The goal of this work is to facilitate algorithmic discovery in the scientific literature. Despite notable advances in scientific search engines, data mining and digital libraries (e.g., Wu et al., 2014) , researchers remain unable to answer simple questions such as:",
+ "cite_spans": [
+ {
+ "start": 187,
+ "end": 203,
+ "text": "Wu et al., 2014)",
+ "ref_id": "BIBREF25"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "What is the percentage of female subjects in depression clinical trials?",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "Which of my co-authors published one or more papers on coreference resolution?",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "Which papers discuss the effects of Ranibizumab on the Retina?",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "In this paper, we focus on the problem of extracting structured data from scientific documents, which can later be used in natural language interfaces (e.g., Iyer et al., 2017) or to improve ranking of results in academic search (e.g., Xiong et al., Figure 1 : Part of the literature graph. 2017). We describe methods used in a scalable deployed production system for extracting structured information from scientific documents into the literature graph (see Fig. 1 ). The literature graph is a directed property graph which summarizes key information in the literature and can be used to answer the queries mentioned earlier as well as more complex queries. For example, in order to compute the Erd\u0151s number of an author X, the graph can be queried to find the number of nodes on the shortest undirected path between author X and Paul Erd\u0151s such that all edges on the path are labeled \"authored\".",
+ "cite_spans": [
+ {
+ "start": 158,
+ "end": 176,
+ "text": "Iyer et al., 2017)",
+ "ref_id": "BIBREF12"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 250,
+ "end": 258,
+ "text": "Figure 1",
+ "ref_id": null
+ },
+ {
+ "start": 459,
+ "end": 465,
+ "text": "Fig. 1",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "We reduce literature graph construction into familiar NLP tasks such as sequence labeling, entity linking and relation extraction, and address some of the impractical assumptions commonly made in the standard formulations of these tasks. For example, most research on named entity recognition tasks report results on large labeled datasets such as CoNLL-2003 and ACE-2005 (e.g., Lample et al., 2016 , and assume that entity types in the test set match those labeled in the training set (including work on domain adaptation, e.g., Daum\u00e9, 2007) . These assumptions, while useful for developing and benchmarking new methods, are unrealistic for many domains and applications. The paper also serves as an overview of the approach we adopt at www.semanticscholar.org in a step towards more intelligent academic search engines (Etzioni, 2011) .",
+ "cite_spans": [
+ {
+ "start": 348,
+ "end": 358,
+ "text": "CoNLL-2003",
+ "ref_id": null
+ },
+ {
+ "start": 359,
+ "end": 371,
+ "text": "and ACE-2005",
+ "ref_id": null
+ },
+ {
+ "start": 372,
+ "end": 398,
+ "text": "(e.g., Lample et al., 2016",
+ "ref_id": null
+ },
+ {
+ "start": 530,
+ "end": 542,
+ "text": "Daum\u00e9, 2007)",
+ "ref_id": "BIBREF6"
+ },
+ {
+ "start": 821,
+ "end": 836,
+ "text": "(Etzioni, 2011)",
+ "ref_id": "BIBREF8"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "In the next section, we start by describing our symbolic representation of the literature. Then, we discuss how we extract metadata associated with a paper such as authors and references, then how we extract the entities mentioned in paper text. Before we conclude, we briefly describe other research challenges we are actively working on in order to improve the quality of the literature graph.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "The literature graph is a property graph with directed edges. Unlike Resource Description Framework (RDF) graphs, nodes and edges in property graphs have an internal structure which is more suitable for representing complex data types such as papers and entities. In this section, we describe the attributes associated with nodes and edges of different types in the literature graph.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Structure of The Literature Graph",
+ "sec_num": "2"
+ },
+ {
+ "text": "Papers. We obtain metadata and PDF files of papers via partnerships with publishers (e.g., Springer, Nature), catalogs (e.g., DBLP, MED-LINE), pre-publishing services (e.g., arXiv, bioRxive), as well as web-crawling. Paper nodes are associated with a set of attributes such as 'title', 'abstract', 'full text', 'venues' and 'publication year'. While some of the paper sources provide these attributes as metadata, it is often necessary to extract them from the paper PDF (details in \u00a73). We deterministically remove duplicate papers based on string similarity of their metadata, resulting in 37M unique paper nodes. Papers in the literature graph cover a variety of scientific disciplines, including computer science, molecular biology, microbiology and neuroscience.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Node Types",
+ "sec_num": "2.1"
+ },
+ {
+ "text": "Authors. Each node of this type represents a unique author, with attributes such as 'first name' and 'last name'. The literature graph has 12M nodes of this type.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Node Types",
+ "sec_num": "2.1"
+ },
+ {
+ "text": "Entities. Each node of this type represents a unique scientific concept discussed in the literature, with attributes such as 'canonical name', 'aliases' and 'description'. Our literature graph has 0.4M nodes of this type. We describe how we populate entity nodes in \u00a74.3.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Node Types",
+ "sec_num": "2.1"
+ },
+ {
+ "text": "Entity mentions. Each node of this type represents a textual reference of an entity in one of the papers, with attributes such as 'mention text', 'context', and 'confidence'. We describe how we populate the 237M mentions in the literature graph in \u00a74.1.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Node Types",
+ "sec_num": "2.1"
+ },
+ {
+ "text": "Citations. We instantiate a directed citation edge from paper nodes p 1 ! p 2 for each p 2 referenced in p 1 . Citation edges have attributes such as 'from paper id', 'to paper id' and 'contexts' (the textual contexts where p 2 is referenced in p 1 ). While some of the paper sources provide these attributes as metadata, it is often necessary to extract them from the paper PDF as detailed in \u00a73.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Edge Types",
+ "sec_num": "2.2"
+ },
+ {
+ "text": "Authorship. We instantiate a directed authorship edge between an author node and a paper node a ! p for each author of that paper.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Edge Types",
+ "sec_num": "2.2"
+ },
+ {
+ "text": "Entity linking edges. We instantiate a directed edge from an extracted entity mention node to the entity it refers to.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Edge Types",
+ "sec_num": "2.2"
+ },
+ {
+ "text": "Mention-mention relations. We instantiate a directed edge between a pair of mentions in the same sentential context if the textual relation extraction model predicts one of a predefined list of relation types between them in a sentential context. 1 We encode a symmetric relation between m 1 and m 2 as two directed edges m 1 ! m 2 and m 2 ! m 1 .",
+ "cite_spans": [
+ {
+ "start": 247,
+ "end": 248,
+ "text": "1",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Edge Types",
+ "sec_num": "2.2"
+ },
+ {
+ "text": "Entity-entity relations. While mentionmention edges represent relations between mentions in a particular context, entity-entity edges represent relations between abstract entities. These relations may be imported from an existing knowledge base (KB) or inferred from other edges in the graph.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Edge Types",
+ "sec_num": "2.2"
+ },
+ {
+ "text": "In the previous section, we described the overall structure of the literature graph. Next, we discuss how we populate paper nodes, author nodes, authorship edges, and citation edges.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Although some publishers provide sufficient metadata about their papers, many papers are provided with incomplete metadata. Also, papers obtained via web-crawling are not associated with any metadata. To fill in this gap, we built the Sci-enceParse system to predict structured data from the raw PDFs using recurrent neural networks (RNNs). 2 For each paper, the system extracts the paper title, list of authors, and list of references; each reference consists of a title, a list of authors, a venue, and a year.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Preparing the input layer. We split each PDF into individual pages, and feed each page to Apache's PDFBox library 3 to convert it into a sequence of tokens, where each token has features, e.g., 'text', 'font size', 'space width', 'position on the page'.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "We normalize the token-level features before feeding them as inputs to the model. For each of the 'font size' and 'space width' features, we compute three normalized values (with respect to current page, current document, and the whole training corpus), each value ranging between -0.5 to +0.5. The token's 'position on the page' is given in XY coordinate points. We scale the values linearly to range from . 0:5; 0:5/ at the top-left corner of the page to .0:5; 0:5/ at the bottom-right corner.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "In order to capture case information, we add seven numeric features to the input representation of each token: whether the first/second letter is uppercase/lowercase, the fraction of uppercase/lowercase letters and the fraction of digits.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "To help the model make correct predictions for metadata which tend to appear at the beginning (e.g., titles and authors) or at the end of papers (e.g., references), we provide the current page number as two discrete variables (relative to the beginning and end of the PDF file) with values 0, 1 and 2+. These features are repeated for each token on the same page.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "For the k-th token in the sequence, we compute the input representation i k by concatenating the numeric features, an embedding of the 'font size', and the word embedding of the lowercased token. Word embeddings are initialized with GloVe (Pennington et al., 2014) .",
+ "cite_spans": [
+ {
+ "start": 239,
+ "end": 264,
+ "text": "(Pennington et al., 2014)",
+ "ref_id": "BIBREF19"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Model. The input token representations are passed through one fully-connected layer and then ",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "g ! k D LSTM.Wi k ; g ! k 1 /; g k D OEg ! k I g k ; h ! k D LSTM.g k ; h ! k 1 /; h k D OEh ! k I g k",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "where W is a weight matrix, g k and h k are defined similarly to g ! k and h ! k but process token sequences in the opposite direction.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Following Collobert et al. 2011, we feed the output of the second layer h k into a dense layer to predict unnormalized label weights for each token and learn label bigram feature weights (often described as a conditional random field layer when used in neural architectures) to account for dependencies between labels.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Training. The ScienceParse system is trained on a snapshot of the data at PubMed Central. It consists of 1.4M PDFs and their associated metadata, which specify the correct titles, authors, and bibliographies. We use a heuristic labeling process that finds the strings from the metadata in the tokenized PDFs to produce labeled tokens. This labeling process succeeds for 76% of the documents. The remaining documents are not used in the training process. During training, we only use pages which have at least one token with a label that is not \"none\".",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Decoding. At test time, we use Viterbi decoding to find the most likely global sequence, with no further constraints. To get the title, we use the longest continuous sequence of tokens with the \"title\" label. Since there can be multiple authors, we use all continuous sequences of tokens with the \"author\" label as authors, but require that all authors of a paper are mentioned on the same page. If the author labels are predicted in multiple pages, we use the one with the largest number of authors.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Results. We run our final tests on a held-out set from PubMed Central, consisting of about 54K documents. The results are detailed in Table 1 . We use a conservative evaluation where an instance is correct if it exactly matches the gold annotation, with no credit for partial matching.",
+ "cite_spans": [],
+ "ref_spans": [
+ {
+ "start": 134,
+ "end": 141,
+ "text": "Table 1",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "To give an example for the type of errors our model makes, consider the paper (Wang et al., 2013) titled \"Clinical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a systematic review and metaanalysis.\" The title we extract for this paper omits the first part \"Clinical review:\". This is likely to be a result of the pattern \"Foo: Bar Baz\" appearing in many training examples with only \"Bar Baz\" labeled as the title.",
+ "cite_spans": [
+ {
+ "start": 78,
+ "end": 97,
+ "text": "(Wang et al., 2013)",
+ "ref_id": "BIBREF23"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "In the previous section, we described how we populate the backbone of the literature graph, i.e., paper nodes, author nodes and citation edges. Next, we discuss how we populate mentions and entities in the literature graph using entity extraction and linking on the paper text. In order to focus on more salient entities in a given paper, we only use the title and abstract.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction and Linking",
+ "sec_num": "4"
+ },
+ {
+ "text": "We experiment with three approaches for entity extraction and linking: I. Statistical: uses one or more statistical models for predicting mention spans, then uses another statistical model to link mentions to candidate entities in a KB.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Approaches",
+ "sec_num": "4.1"
+ },
+ {
+ "text": "II. Hybrid: defines a small number of handengineered, deterministic rules for string-based matching of the input text to candidate entities in the KB, then uses a statistical model to disambiguate the mentions. 4 III. Off-the-shelf: uses existing libraries, namely (Ferragina and Scaiella, 2010, TagMe) 5 and (Demner-Fushman et al., 2017, MetaMap Lite) 6 , with minimal post-processing to extract and link entities to the KB. Table 2 : Document-level evaluation of three approaches in two scientific areas: computer science (CS) and biomedical (Bio).",
+ "cite_spans": [
+ {
+ "start": 211,
+ "end": 212,
+ "text": "4",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 426,
+ "end": 433,
+ "text": "Table 2",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Approaches",
+ "sec_num": "4.1"
+ },
+ {
+ "text": "We evaluate the performance of each approach in two broad scientific areas: computer science (CS) and biomedical research (Bio). For each unique (paper ID, entity ID) pair predicted by one of the approaches, we ask human annotators to label each mention extracted for this entity in the paper. We use CrowdFlower to manage human annotations and only include instances where three or more annotators agree on the label. If one or more of the entity mentions in that paper is judged to be correct, the pair (paper ID, entity ID) counts as one correct instance. Otherwise, it counts as an incorrect instance. We report 'yield' in lieu of 'recall' due to the difficulty of doing a scalable comprehensive annotation. Table 2 shows the results based on 500 papers using v1.1.2 of our entity extraction and linking components. In both domains, the statistical approach gives the highest precision and the lowest yield. The hybrid approach consistently gives the highest yield, but sacrifices precision. The TagMe off-the-shelf library used for the CS domain gives surprisingly good results, with precision within 1 point from the statistical models. However, the MetaMap Lite off-the-shelf library we used for the biomedical domain suffered a huge loss in precision. Our error analysis showed that each of the approaches is able to predict entities not predicted by the other approaches so we decided to pool their outputs in our deployed system, which gives significantly higher yield than any individual approach while maintaining reasonably high precision.",
+ "cite_spans": [],
+ "ref_spans": [
+ {
+ "start": 712,
+ "end": 719,
+ "text": "Table 2",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Approaches",
+ "sec_num": "4.1"
+ },
+ {
+ "text": "Given the token sequence t 1 ; : : : ; t N in a sentence, we need to identify spans which correspond to entity mentions. We use the BILOU scheme to encode labels at the token level. Unlike most formulations of named entity recognition problems (NER), we do not identify the entity type (e.g., protein, drug, chemical, disease) for each mention since the output mentions are further grounded in a KB with further information about the entity (including its type), using an entity linking module.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "Model. First, we construct the token embedding x k D OEc k I w k for each token t k in the input sequence, where c k is a character-based representation computed using a convolutional neural network (CNN) with filter of size 3 characters, and w k are learned word embeddings initialized with the GloVe embeddings (Pennington et al., 2014) .",
+ "cite_spans": [
+ {
+ "start": 313,
+ "end": 338,
+ "text": "(Pennington et al., 2014)",
+ "ref_id": "BIBREF19"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "We also compute context-sensitive word embeddings, denoted as lm k D OElm ! k I lm k , by concatenating the projected outputs of forward and backward recurrent neural network language models (RNN-LM) at position k. The language model (LM) for each direction is trained independently and consists of a single layer long short-term memory (LSTM) network followed by a linear project layer. While training the LM parameters, lm ! k is used to predict t kC1 and lm k is used to predict t k 1 . We fix the LM parameters during training of the entity extraction model. See and for more details.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "Given the x k and lm k embeddings for each token k 2 f1; : : : ; N g, we use a two-layer bidirectional LSTM to encode the sequence with x k and lm k feeding into the first and second layer, respectively. That is,",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "g ! k D LSTM.x k ; g ! k 1 /; g k D OEg ! k I g k ; h ! k D LSTM.OEg k I lm k ; h ! k 1 /; h k D OEh ! k I h k ;",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "where g k and h k are defined similarly to g ! k and h ! k but process token sequences in the opposite direction. Similar to the model described in \u00a73, we feed the output of the second LSTM into a dense layer to predict unnormalized label weights for each token and learn label bigram feature weights to account for dependencies between labels.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "Results. We use the standard data splits of the SemEval-2017 Task 10 on entity (and relation) extraction from scientific papers (Augenstein et al., 2017) . Table 3 compares three variants of our entity extraction model. The first line omits the LM embeddings lm k , while the second line is the full model (including LM embeddings) showing a large improvement of 4.2 F1 points. The third line shows that creating an ensemble of 15 models further improves the results by 1.1 F1 points.",
+ "cite_spans": [
+ {
+ "start": 128,
+ "end": 153,
+ "text": "(Augenstein et al., 2017)",
+ "ref_id": "BIBREF1"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 156,
+ "end": 163,
+ "text": "Table 3",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "Model instances. In the deployed system, we use three instances of the entity extraction model Description F1 Without LM 49.9",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "With LM 54.1 Avg. of 15 models with LM 55.2 Table 3 : Results of the entity extraction model on the development set of SemEval-2017 task 10. with a similar architecture, but trained on different datasets. Two instances are trained on the BC5CDR (Li et al., 2016) and the CHEMDNER datasets (Krallinger et al., 2015) to extract key entity mentions in the biomedical domain such as diseases, drugs and chemical compounds. The third instance is trained on mention labels induced from Wikipedia articles in the computer science domain.",
+ "cite_spans": [
+ {
+ "start": 245,
+ "end": 262,
+ "text": "(Li et al., 2016)",
+ "ref_id": "BIBREF16"
+ },
+ {
+ "start": 289,
+ "end": 314,
+ "text": "(Krallinger et al., 2015)",
+ "ref_id": "BIBREF14"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 44,
+ "end": 51,
+ "text": "Table 3",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "The output of all model instances are pooled together and combined with the rule-based entity extraction module, then fed into the entity linking model (described below).",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "In this section, we describe the construction of entity nodes and entity-entity edges. Unlike other knowledge extraction systems such as the Never-Ending Language Learner (NELL) 7 and OpenIE 4, 8 we use existing knowledge bases (KBs) of entities to reduce the burden of identifying coherent concepts. Grounding the entity mentions in a manually-curated KB also increases user confidence in automated predictions. We use two KBs: UMLS: The UMLS metathesaurus integrates information about concepts in specialized ontologies in several biomedical domains, and is funded by the U.S. National Library of Medicine. DBpedia: DBpedia provides access to structured information in Wikipedia. Rather than including all Wikipedia pages, we used a short list of Wikipedia categories about CS and included all pages up to depth four in their trees in order to exclude irrelevant entities, e.g., \"Lord of the Rings\" in DBpedia.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Knowledge Bases",
+ "sec_num": "4.3"
+ },
+ {
+ "text": "Given a text span s identified by the entity extraction model in \u00a74.2 (or with heuristics) and a reference KB, the goal of the entity linking model is to associate the span with the entity it refers to. A span and its surrounding words are collectively referred to as a mention. We first identify a set of candidate entities that a given mention may refer to. Then, we rank the candidate entities based on a score computed using a neural model trained on labeled data.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "For example, given the string \". . . database of facts, an ILP system will . . . \", the entity extraction model identifies the span \"ILP\" as a possible entity and the entity linking model associates it with \"Inductive_Logic_Programming\" as the referent entity (from among other candidates like \"Integer_Linear_Programming\" or \"Instruction-level_Parallelism\").",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "Datasets. We used two datasets: i) a biomedical dataset formed by combining MSH (Jimeno-Yepes et al., 2011) and BC5CDR (Li et al., 2016) with UMLS as the reference KB, and ii) a CS dataset we curated using Wikipedia articles about CS concepts with DBpedia as the reference KB.",
+ "cite_spans": [
+ {
+ "start": 119,
+ "end": 136,
+ "text": "(Li et al., 2016)",
+ "ref_id": "BIBREF16"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "Candidate selection. In a preprocessing step, we build an index which maps any token used in a labeled mention or an entity name in the KB to associated entity IDs, along with the frequency this token is associated with that entity. This is similar to the index used in previous entity linking systems (e.g., Bhagavatula et al., 2015) to estimate the probability that a given mention refers to an entity. At train and test time, we use this index to find candidate entities for a given mention by looking up the tokens in the mention. This method also serves as our baseline in Table 4 by selecting the entity with the highest frequency for a given mention.",
+ "cite_spans": [
+ {
+ "start": 309,
+ "end": 334,
+ "text": "Bhagavatula et al., 2015)",
+ "ref_id": "BIBREF3"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 578,
+ "end": 585,
+ "text": "Table 4",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "Scoring candidates. Given a mention (m) and a candidate entity (e), the neural model constructs a vector encoding of the mention and the entity. We encode the mention and entity using the functions f and g, respectively, as follows:",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "f.m/ D OEv m.name I avg.v m.lc ; v m.rc /;",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "g.e/ D OEv e.name I v e.def ; where m.surface, m.lc and m.rc are the mention's surface form, left and right contexts, and e.name and e.def are the candidate entity's name and definition, respectively. v text is a bag-of-words sum encoder for text. We use the same encoder for the mention surface form and the candidate name, and another encoder for the mention contexts and entity definition.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "Additionally, we include numerical features to estimate the confidence of a candidate entity based on the statistics collected in the index described Table 4 : The Bag of Concepts F1 score of the baseline and neural model on the two curated datasets.",
+ "cite_spans": [],
+ "ref_spans": [
+ {
+ "start": 150,
+ "end": 157,
+ "text": "Table 4",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "earlier. We compute two scores based on the word overlap of (i) mention's context and candidate's definition and (ii) mention's surface span and the candidate entity's name. Finally, we feed the concatenation of the cosine similarity between f.m/ and g.e/ and the intersection-based scores into an affine transformation followed by a sigmoid nonlinearity to compute the final score for the pair (m, e).",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "Results. We use the Bag of Concepts F1 metric (Ling et al., 2015) for comparison. Table 4 compares the performance of the most-frequent-entity baseline and our neural model described above.",
+ "cite_spans": [
+ {
+ "start": 46,
+ "end": 65,
+ "text": "(Ling et al., 2015)",
+ "ref_id": "BIBREF17"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 82,
+ "end": 89,
+ "text": "Table 4",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "In the previous sections, we discussed how we construct the main components of the literature graph. In this section, we briefly describe several other related challenges we are actively working on.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "Author disambiguation. Despite initiatives to have global author IDs ORCID and ResearcherID, most publishers provide author information as names (e.g., arXiv). However, author names cannot be used as a unique identifier since several people often share the same name. Moreover, different venues and sources use different conventions in reporting the author names, e.g., \"first initial, last name\" vs. \"last name, first name\". Inspired by Culotta et al. (2007) , we train a supervised binary classifier for merging pairs of author instances and use it to incrementally create author clusters. We only consider merging two author instances if they have the same last name and share the first initial. If the first name is spelled out (rather than abbreviated) in both author instances, we also require that the first name matches.",
+ "cite_spans": [
+ {
+ "start": 438,
+ "end": 459,
+ "text": "Culotta et al. (2007)",
+ "ref_id": "BIBREF5"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "Ontology matching. Popular concepts are often represented in multiple KBs. For example, the concept of \"artificial neural networks\" is represented as entity ID D016571 in the MESH ontology, and represented as page ID '21523' in DBpedia. Ontology matching is the problem of identifying semantically-equivalent entities across KBs or ontologies. 9 Limited KB coverage. The convenience of grounding entities in a hand-curated KB comes at the cost of limited coverage. Introduction of new concepts and relations in the scientific literature occurs at a faster pace than KB curation, resulting in a large gap in KB coverage of scientific concepts. In order to close this gap, we need to develop models which can predict textual relations as well as detailed concept descriptions in scientific papers. For the same reasons, we also need to augment the relations imported from the KB with relations extracted from text. Our approach to address both entity and relation coverage is based on distant supervision (Mintz et al., 2009) . In short, we train two models for identifying entity definitions and relations expressed in natural language in scientific documents, and automatically generate labeled data for training these models using known definitions and relations in the KB.",
+ "cite_spans": [
+ {
+ "start": 344,
+ "end": 345,
+ "text": "9",
+ "ref_id": null
+ },
+ {
+ "start": 1003,
+ "end": 1023,
+ "text": "(Mintz et al., 2009)",
+ "ref_id": "BIBREF18"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "We note that the literature graph currently lacks coverage for important entity types (e.g., affiliations) and domains (e.g., physics). Covering affiliations requires small modifications to the metadata extraction model followed by an algorithm for matching author names with their affiliations. In order to cover additional scientific domains, more agreements need to be signed with publishers.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "Figure and table extraction. Non-textual components such as charts, diagrams and tables provide key information in many scientific documents, but the lack of large labeled datasets has impeded the development of data-driven methods for scientific figure extraction. In Siegel et al. (2018) , we induced high-quality training labels for the task of figure extraction in a large number of scientific documents, with no human intervention. To accomplish this we leveraged the auxiliary data provided in two large web collections of scientific documents (arXiv and PubMed) to locate figures and their associated captions in the rasterized PDF. We use the resulting dataset to train a deep neural network for end-to-end figure detection, yielding a model that can be more easily extended to new domains compared to previous work.",
+ "cite_spans": [
+ {
+ "start": 269,
+ "end": 289,
+ "text": "Siegel et al. (2018)",
+ "ref_id": "BIBREF21"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "Understanding and predicting citations. The citation edges in the literature graph provide a wealth of information (e.g., at what rate a paper is being cited and whether it is accelerating), and opens the door for further research to better understand and predict citations. For example, in order to allow users to better understand what impact a paper had and effectively navigate its citations, we experimented with methods for classifying a citation as important or incidental, as well as more finegrained classes (Valenzuela et al., 2015) . The citation information also enables us to develop models for estimating the potential of a paper or an author. In Weihs and Etzioni (2017), we predict citationbased metrics such as an author's h-index and the citation rate of a paper in the future. Also related is the problem of predicting which papers should be cited in a given draft (Bhagavatula et al., 2018) , which can help improve the quality of a paper draft before it is submitted for peer review, or used to supplement the list of references after a paper is published.",
+ "cite_spans": [
+ {
+ "start": 517,
+ "end": 542,
+ "text": "(Valenzuela et al., 2015)",
+ "ref_id": "BIBREF22"
+ },
+ {
+ "start": 884,
+ "end": 910,
+ "text": "(Bhagavatula et al., 2018)",
+ "ref_id": "BIBREF2"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "In this paper, we discuss the construction of a graph, providing a symbolic representation of the scientific literature. We describe deployed models for identifying authors, references and entities in the paper text, and provide experimental results to evaluate the performance of each model. Three research directions follow from this work and other similar projects, e.g., Hahn-Powell et al. (2017) ; Wu et al. (2014) : i) improving quality and enriching content of the literature graph (e.g., ontology matching and knowledge base population). ii) aggregating domain-specific extractions across many papers to enable a better understanding of the literature as a whole (e.g., identifying demographic biases in clinical trial participants and summarizing empirical results on important tasks). iii) exploring the literature via natural language interfaces.",
+ "cite_spans": [
+ {
+ "start": 375,
+ "end": 400,
+ "text": "Hahn-Powell et al. (2017)",
+ "ref_id": "BIBREF10"
+ },
+ {
+ "start": 403,
+ "end": 419,
+ "text": "Wu et al. (2014)",
+ "ref_id": "BIBREF25"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusion and Future Work",
+ "sec_num": "6"
+ },
+ {
+ "text": "In order to help future research efforts, we make the following resources publicly available: metadata for over 20 million papers, 10 meaningful citations dataset, 11 models for figure and table extraction, 12 models for predicting citations in a paper draft 13 and models for extracting paper metadata, 14 among other resources. 15 ",
+ "cite_spans": [
+ {
+ "start": 330,
+ "end": 332,
+ "text": "15",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusion and Future Work",
+ "sec_num": "6"
+ },
+ {
+ "text": "Due to space constraints, we opted not to discuss our relation extraction models in this draft.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "The ScienceParse libraries can be found at http:// allenai.org/software/.3 https://pdfbox.apache.org",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "We also experimented with a \"pure\" rules-based approach which disambiguates deterministically but the hybrid approach consistently gave better results.5 The TagMe APIs are described at https://sobigdata. d4science.org/web/tagme/tagme-help6 We use v3.4 (L0) of MetaMap Lite, available at https: //metamap.nlm.nih.gov/MetaMapLite.shtml",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "http://rtw.ml.cmu.edu/rtw/ 8 https://github.com/allenai/ openie-standalone",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "Variants of this problem are also known as deduplication or record linkage.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ }
+ ],
+ "back_matter": [],
+ "bib_entries": {
+ "BIBREF0": {
+ "ref_id": "b0",
+ "title": "The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction",
+ "authors": [
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ },
+ {
+ "first": "Matthew",
+ "middle": [
+ "E"
+ ],
+ "last": "Peters",
+ "suffix": ""
+ },
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": ""
+ },
+ {
+ "first": "Russell",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ACL workshop (SemEval)",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).",
+ "links": null
+ },
+ "BIBREF1": {
+ "ref_id": "b1",
+ "title": "Semeval 2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications",
+ "authors": [
+ {
+ "first": "Isabelle",
+ "middle": [],
+ "last": "Augenstein",
+ "suffix": ""
+ },
+ {
+ "first": "Mrinal",
+ "middle": [],
+ "last": "Das",
+ "suffix": ""
+ },
+ {
+ "first": "Sebastian",
+ "middle": [],
+ "last": "Riedel",
+ "suffix": ""
+ },
+ {
+ "first": "Lakshmi",
+ "middle": [],
+ "last": "Vikraman",
+ "suffix": ""
+ },
+ {
+ "first": "Andrew",
+ "middle": [
+ "D"
+ ],
+ "last": "Mccallum",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval 2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).",
+ "links": null
+ },
+ "BIBREF2": {
+ "ref_id": "b2",
+ "title": "Content-based citation recommendation",
+ "authors": [
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": ""
+ },
+ {
+ "first": "Sergey",
+ "middle": [],
+ "last": "Feldman",
+ "suffix": ""
+ },
+ {
+ "first": "Russell",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ },
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "NAACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Chandra Bhagavatula, Sergey Feldman, Russell Power, and Waleed Ammar. 2018. Content-based citation recommendation. In NAACL.",
+ "links": null
+ },
+ "BIBREF3": {
+ "ref_id": "b3",
+ "title": "TabEL: entity linking in web tables. In ISWC",
+ "authors": [
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": ""
+ },
+ {
+ "first": "Thanapon",
+ "middle": [],
+ "last": "Noraset",
+ "suffix": ""
+ },
+ {
+ "first": "Doug",
+ "middle": [],
+ "last": "Downey",
+ "suffix": ""
+ }
+ ],
+ "year": 2015,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Chandra Bhagavatula, Thanapon Noraset, and Doug Downey. 2015. TabEL: entity linking in web tables. In ISWC.",
+ "links": null
+ },
+ "BIBREF4": {
+ "ref_id": "b4",
+ "title": "Natural language processing (almost) from scratch",
+ "authors": [
+ {
+ "first": "Ronan",
+ "middle": [],
+ "last": "Collobert",
+ "suffix": ""
+ },
+ {
+ "first": "Jason",
+ "middle": [],
+ "last": "Weston",
+ "suffix": ""
+ },
+ {
+ "first": "L\u00e9on",
+ "middle": [],
+ "last": "Bottou",
+ "suffix": ""
+ },
+ {
+ "first": "Michael",
+ "middle": [],
+ "last": "Karlen",
+ "suffix": ""
+ },
+ {
+ "first": "Koray",
+ "middle": [],
+ "last": "Kavukcuoglu",
+ "suffix": ""
+ },
+ {
+ "first": "Pavel",
+ "middle": [
+ "P"
+ ],
+ "last": "Kuksa",
+ "suffix": ""
+ }
+ ],
+ "year": 2011,
+ "venue": "JMLR",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Ronan Collobert, Jason Weston, L\u00e9on Bottou, Michael Karlen, Koray Kavukcuoglu, and Pavel P. Kuksa. 2011. Natural language processing (almost) from scratch. In JMLR.",
+ "links": null
+ },
+ "BIBREF5": {
+ "ref_id": "b5",
+ "title": "Author disambiguation using error-driven machine learning with a ranking loss function",
+ "authors": [
+ {
+ "first": "Aron",
+ "middle": [],
+ "last": "Culotta",
+ "suffix": ""
+ },
+ {
+ "first": "Pallika",
+ "middle": [],
+ "last": "Kanani",
+ "suffix": ""
+ },
+ {
+ "first": "Robert",
+ "middle": [],
+ "last": "Hall",
+ "suffix": ""
+ },
+ {
+ "first": "Michael",
+ "middle": [],
+ "last": "Wick",
+ "suffix": ""
+ },
+ {
+ "first": "Andrew",
+ "middle": [
+ "D"
+ ],
+ "last": "Mccallum",
+ "suffix": ""
+ }
+ ],
+ "year": 2007,
+ "venue": "IIWeb Workshop",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Aron Culotta, Pallika Kanani, Robert Hall, Michael Wick, and Andrew D. McCallum. 2007. Author disambiguation using error-driven machine learning with a ranking loss function. In IIWeb Workshop.",
+ "links": null
+ },
+ "BIBREF6": {
+ "ref_id": "b6",
+ "title": "Frustratingly easy domain adaptation",
+ "authors": [
+ {
+ "first": "Hal",
+ "middle": [],
+ "last": "Daum\u00e9",
+ "suffix": ""
+ }
+ ],
+ "year": 2007,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Hal Daum\u00e9. 2007. Frustratingly easy domain adapta- tion. In ACL.",
+ "links": null
+ },
+ "BIBREF7": {
+ "ref_id": "b7",
+ "title": "MetaMap Lite: an evaluation of a new Java implementation of MetaMap",
+ "authors": [
+ {
+ "first": "Dina",
+ "middle": [],
+ "last": "Demner-Fushman",
+ "suffix": ""
+ },
+ {
+ "first": "Willie",
+ "middle": [
+ "J"
+ ],
+ "last": "Rogers",
+ "suffix": ""
+ },
+ {
+ "first": "Alan",
+ "middle": [
+ "R"
+ ],
+ "last": "Aronson",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "JAMIA",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Dina Demner-Fushman, Willie J. Rogers, and Alan R. Aronson. 2017. MetaMap Lite: an evaluation of a new Java implementation of MetaMap. In JAMIA.",
+ "links": null
+ },
+ "BIBREF8": {
+ "ref_id": "b8",
+ "title": "Search needs a shake-up",
+ "authors": [
+ {
+ "first": "Oren",
+ "middle": [
+ "Etzioni"
+ ],
+ "last": "",
+ "suffix": ""
+ }
+ ],
+ "year": 2011,
+ "venue": "Nature",
+ "volume": "476",
+ "issue": "",
+ "pages": "25--31",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Oren Etzioni. 2011. Search needs a shake-up. Nature 476 7358:25-6.",
+ "links": null
+ },
+ "BIBREF9": {
+ "ref_id": "b9",
+ "title": "TAGME: on-the-fly annotation of short text fragments (by wikipedia entities)",
+ "authors": [
+ {
+ "first": "Paolo",
+ "middle": [],
+ "last": "Ferragina",
+ "suffix": ""
+ },
+ {
+ "first": "Ugo",
+ "middle": [],
+ "last": "Scaiella",
+ "suffix": ""
+ }
+ ],
+ "year": 2010,
+ "venue": "CIKM",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Paolo Ferragina and Ugo Scaiella. 2010. TAGME: on-the-fly annotation of short text fragments (by wikipedia entities). In CIKM.",
+ "links": null
+ },
+ "BIBREF10": {
+ "ref_id": "b10",
+ "title": "Swanson linking revisited: Accelerating literature-based discovery across domains using a conceptual influence graph",
+ "authors": [
+ {
+ "first": "Gus",
+ "middle": [],
+ "last": "Hahn-Powell",
+ "suffix": ""
+ },
+ {
+ "first": "Marco",
+ "middle": [
+ "Antonio"
+ ],
+ "last": "Valenzuela-Escarcega",
+ "suffix": ""
+ },
+ {
+ "first": "Mihai",
+ "middle": [],
+ "last": "Surdeanu",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Gus Hahn-Powell, Marco Antonio Valenzuela- Escarcega, and Mihai Surdeanu. 2017. Swanson linking revisited: Accelerating literature-based dis- covery across domains using a conceptual influence graph. In ACL.",
+ "links": null
+ },
+ "BIBREF11": {
+ "ref_id": "b11",
+ "title": "Long short-term memory",
+ "authors": [
+ {
+ "first": "Sepp",
+ "middle": [],
+ "last": "Hochreiter",
+ "suffix": ""
+ },
+ {
+ "first": "J\u00fcrgen",
+ "middle": [],
+ "last": "Schmidhuber",
+ "suffix": ""
+ }
+ ],
+ "year": 1997,
+ "venue": "Neural computation",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation .",
+ "links": null
+ },
+ "BIBREF12": {
+ "ref_id": "b12",
+ "title": "Learning a neural semantic parser from user feedback",
+ "authors": [
+ {
+ "first": "Srinivasan",
+ "middle": [],
+ "last": "Iyer",
+ "suffix": ""
+ },
+ {
+ "first": "Ioannis",
+ "middle": [],
+ "last": "Konstas",
+ "suffix": ""
+ },
+ {
+ "first": "Alvin",
+ "middle": [],
+ "last": "Cheung",
+ "suffix": ""
+ },
+ {
+ "first": "Jayant",
+ "middle": [],
+ "last": "Krishnamurthy",
+ "suffix": ""
+ },
+ {
+ "first": "Luke",
+ "middle": [
+ "S"
+ ],
+ "last": "Zettlemoyer",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Srinivasan Iyer, Ioannis Konstas, Alvin Cheung, Jayant Krishnamurthy, and Luke S. Zettlemoyer. 2017. Learning a neural semantic parser from user feed- back. In ACL.",
+ "links": null
+ },
+ "BIBREF13": {
+ "ref_id": "b13",
+ "title": "Exploiting mesh indexing in medline to generate a data set for word sense disambiguation",
+ "authors": [
+ {
+ "first": "J",
+ "middle": [],
+ "last": "Antonio",
+ "suffix": ""
+ },
+ {
+ "first": "Bridget",
+ "middle": [
+ "T"
+ ],
+ "last": "Jimeno-Yepes",
+ "suffix": ""
+ },
+ {
+ "first": "Alan",
+ "middle": [
+ "R"
+ ],
+ "last": "Mcinnes",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Aronson",
+ "suffix": ""
+ }
+ ],
+ "year": 2011,
+ "venue": "BMC bioinformatics",
+ "volume": "12",
+ "issue": "1",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Antonio J. Jimeno-Yepes, Bridget T. McInnes, and Alan R. Aronson. 2011. Exploiting mesh indexing in medline to generate a data set for word sense dis- ambiguation. BMC bioinformatics 12(1):223.",
+ "links": null
+ },
+ "BIBREF14": {
+ "ref_id": "b14",
+ "title": "CHEMDNER: The drugs and chemical names extraction challenge",
+ "authors": [
+ {
+ "first": "Martin",
+ "middle": [],
+ "last": "Krallinger",
+ "suffix": ""
+ },
+ {
+ "first": "Florian",
+ "middle": [],
+ "last": "Leitner",
+ "suffix": ""
+ },
+ {
+ "first": "Obdulia",
+ "middle": [],
+ "last": "Rabal",
+ "suffix": ""
+ },
+ {
+ "first": "Miguel",
+ "middle": [],
+ "last": "Vazquez",
+ "suffix": ""
+ }
+ ],
+ "year": 2015,
+ "venue": "In J. Cheminformatics",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Martin Krallinger, Florian Leitner, Obdulia Rabal, Miguel Vazquez, Julen Oyarzabal, and Alfonso Va- lencia. 2015. CHEMDNER: The drugs and chemi- cal names extraction challenge. In J. Cheminformat- ics.",
+ "links": null
+ },
+ "BIBREF15": {
+ "ref_id": "b15",
+ "title": "Neural architectures for named entity recognition",
+ "authors": [
+ {
+ "first": "Guillaume",
+ "middle": [],
+ "last": "Lample",
+ "suffix": ""
+ },
+ {
+ "first": "Miguel",
+ "middle": [],
+ "last": "Ballesteros",
+ "suffix": ""
+ },
+ {
+ "first": "K",
+ "middle": [],
+ "last": "Sandeep",
+ "suffix": ""
+ },
+ {
+ "first": "Kazuya",
+ "middle": [],
+ "last": "Subramanian",
+ "suffix": ""
+ },
+ {
+ "first": "Chris",
+ "middle": [],
+ "last": "Kawakami",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Dyer",
+ "suffix": ""
+ }
+ ],
+ "year": 2016,
+ "venue": "HLT-NAACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Guillaume Lample, Miguel Ballesteros, Sandeep K Subramanian, Kazuya Kawakami, and Chris Dyer. 2016. Neural architectures for named entity recog- nition. In HLT-NAACL.",
+ "links": null
+ },
+ "BIBREF16": {
+ "ref_id": "b16",
+ "title": "Biocreative v cdr task corpus: a resource for chemical disease relation extraction. Database : the journal of biological databases and curation",
+ "authors": [
+ {
+ "first": "Jiao",
+ "middle": [],
+ "last": "Li",
+ "suffix": ""
+ },
+ {
+ "first": "Yueping",
+ "middle": [],
+ "last": "Sun",
+ "suffix": ""
+ },
+ {
+ "first": "Robin",
+ "middle": [
+ "J"
+ ],
+ "last": "Johnson",
+ "suffix": ""
+ },
+ {
+ "first": "Daniela",
+ "middle": [],
+ "last": "Sciaky",
+ "suffix": ""
+ },
+ {
+ "first": "Chih-Hsuan",
+ "middle": [],
+ "last": "Wei",
+ "suffix": ""
+ },
+ {
+ "first": "Robert",
+ "middle": [],
+ "last": "Leaman",
+ "suffix": ""
+ },
+ {
+ "first": "Allan",
+ "middle": [
+ "Peter"
+ ],
+ "last": "Davis",
+ "suffix": ""
+ },
+ {
+ "first": "Carolyn",
+ "middle": [
+ "J"
+ ],
+ "last": "Mattingly",
+ "suffix": ""
+ },
+ {
+ "first": "Thomas",
+ "middle": [
+ "C"
+ ],
+ "last": "Wiegers",
+ "suffix": ""
+ },
+ {
+ "first": "Zhiyong",
+ "middle": [],
+ "last": "Lu",
+ "suffix": ""
+ }
+ ],
+ "year": 2016,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Jiao Li, Yueping Sun, Robin J. Johnson, Daniela Sci- aky, Chih-Hsuan Wei, Robert Leaman, Allan Peter Davis, Carolyn J. Mattingly, Thomas C. Wiegers, and Zhiyong Lu. 2016. Biocreative v cdr task cor- pus: a resource for chemical disease relation extrac- tion. Database : the journal of biological databases and curation 2016.",
+ "links": null
+ },
+ "BIBREF17": {
+ "ref_id": "b17",
+ "title": "Design challenges for entity linking",
+ "authors": [
+ {
+ "first": "Xiao",
+ "middle": [],
+ "last": "Ling",
+ "suffix": ""
+ },
+ {
+ "first": "Sameer",
+ "middle": [],
+ "last": "Singh",
+ "suffix": ""
+ },
+ {
+ "first": "Daniel",
+ "middle": [
+ "S"
+ ],
+ "last": "Weld",
+ "suffix": ""
+ }
+ ],
+ "year": 2015,
+ "venue": "Transactions of the Association for Computational Linguistics",
+ "volume": "3",
+ "issue": "",
+ "pages": "315--328",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Xiao Ling, Sameer Singh, and Daniel S. Weld. 2015. Design challenges for entity linking. Transactions of the Association for Computational Linguistics 3:315-328.",
+ "links": null
+ },
+ "BIBREF18": {
+ "ref_id": "b18",
+ "title": "Distant supervision for relation extraction without labeled data",
+ "authors": [
+ {
+ "first": "Mike",
+ "middle": [],
+ "last": "Mintz",
+ "suffix": ""
+ },
+ {
+ "first": "Steven",
+ "middle": [],
+ "last": "Bills",
+ "suffix": ""
+ }
+ ],
+ "year": 2009,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Mike Mintz, Steven Bills, Rion Snow, and Daniel Ju- rafsky. 2009. Distant supervision for relation extrac- tion without labeled data. In ACL.",
+ "links": null
+ },
+ "BIBREF19": {
+ "ref_id": "b19",
+ "title": "GloVe: Global vectors for word representation",
+ "authors": [
+ {
+ "first": "Jeffrey",
+ "middle": [],
+ "last": "Pennington",
+ "suffix": ""
+ },
+ {
+ "first": "Richard",
+ "middle": [],
+ "last": "Socher",
+ "suffix": ""
+ },
+ {
+ "first": "Christopher",
+ "middle": [
+ "D"
+ ],
+ "last": "Manning",
+ "suffix": ""
+ }
+ ],
+ "year": 2014,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global vectors for word rep- resentation. In EMNLP.",
+ "links": null
+ },
+ "BIBREF20": {
+ "ref_id": "b20",
+ "title": "Semi-supervised sequence tagging with bidirectional language models",
+ "authors": [
+ {
+ "first": "Matthew",
+ "middle": [
+ "E"
+ ],
+ "last": "Peters",
+ "suffix": ""
+ },
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ },
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": ""
+ },
+ {
+ "first": "Russell",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Matthew E. Peters, Waleed Ammar, Chandra Bhagavat- ula, and Russell Power. 2017. Semi-supervised se- quence tagging with bidirectional language models. In ACL.",
+ "links": null
+ },
+ "BIBREF21": {
+ "ref_id": "b21",
+ "title": "Extracting scientific figures with distantly supervised neural networks",
+ "authors": [
+ {
+ "first": "Noah",
+ "middle": [],
+ "last": "Siegel",
+ "suffix": ""
+ },
+ {
+ "first": "Nicholas",
+ "middle": [],
+ "last": "Lourie",
+ "suffix": ""
+ },
+ {
+ "first": "Russell",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ },
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Noah Siegel, Nicholas Lourie, Russell Power, and Waleed Ammar. 2018. Extracting scientific figures with distantly supervised neural networks. In JCDL.",
+ "links": null
+ },
+ "BIBREF22": {
+ "ref_id": "b22",
+ "title": "Identifying meaningful citations",
+ "authors": [
+ {
+ "first": "Marco",
+ "middle": [],
+ "last": "Valenzuela",
+ "suffix": ""
+ },
+ {
+ "first": "Vu",
+ "middle": [],
+ "last": "Ha",
+ "suffix": ""
+ },
+ {
+ "first": "Oren",
+ "middle": [],
+ "last": "Etzioni",
+ "suffix": ""
+ }
+ ],
+ "year": 2015,
+ "venue": "AAAI Workshop (Scholarly Big Data)",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Marco Valenzuela, Vu Ha, and Oren Etzioni. 2015. Identifying meaningful citations. In AAAI Workshop (Scholarly Big Data).",
+ "links": null
+ },
+ "BIBREF23": {
+ "ref_id": "b23",
+ "title": "Clinical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a systematic review and meta-analysis",
+ "authors": [
+ {
+ "first": "Xiang",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Yan",
+ "middle": [],
+ "last": "Dong",
+ "suffix": ""
+ },
+ {
+ "first": "Yi-Ming",
+ "middle": [],
+ "last": "Xiang Qian Qi",
+ "suffix": ""
+ },
+ {
+ "first": "Cheng-Guang",
+ "middle": [],
+ "last": "Li",
+ "suffix": ""
+ },
+ {
+ "first": "Lijun",
+ "middle": [],
+ "last": "Huang",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Hou",
+ "suffix": ""
+ }
+ ],
+ "year": 2013,
+ "venue": "Critical care",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Xiang Wang, Yan Dong, Xiang qian Qi, Yi-Ming Li, Cheng-Guang Huang, and Lijun Hou. 2013. Clin- ical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a system- atic review and meta-analysis. In Critical care.",
+ "links": null
+ },
+ "BIBREF24": {
+ "ref_id": "b24",
+ "title": "Learning to predict citation-based impact measures",
+ "authors": [
+ {
+ "first": "Luca",
+ "middle": [],
+ "last": "Weihs",
+ "suffix": ""
+ },
+ {
+ "first": "Oren",
+ "middle": [],
+ "last": "Etzioni",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "JCDL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Luca Weihs and Oren Etzioni. 2017. Learning to pre- dict citation-based impact measures. In JCDL.",
+ "links": null
+ },
+ "BIBREF25": {
+ "ref_id": "b25",
+ "title": "CiteSeerX: AI in a digital library search engine",
+ "authors": [
+ {
+ "first": "Jian",
+ "middle": [],
+ "last": "Wu",
+ "suffix": ""
+ },
+ {
+ "first": "Kyle",
+ "middle": [],
+ "last": "Williams",
+ "suffix": ""
+ },
+ {
+ "first": "Hung-Hsuan",
+ "middle": [],
+ "last": "Chen",
+ "suffix": ""
+ },
+ {
+ "first": "Madian",
+ "middle": [],
+ "last": "Khabsa",
+ "suffix": ""
+ },
+ {
+ "first": "Cornelia",
+ "middle": [],
+ "last": "Caragea",
+ "suffix": ""
+ },
+ {
+ "first": "Alexander",
+ "middle": [],
+ "last": "Ororbia",
+ "suffix": ""
+ },
+ {
+ "first": "Douglas",
+ "middle": [],
+ "last": "Jordan",
+ "suffix": ""
+ },
+ {
+ "first": "C. Lee",
+ "middle": [],
+ "last": "Giles",
+ "suffix": ""
+ }
+ ],
+ "year": 2014,
+ "venue": "AAAI",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Jian Wu, Kyle Williams, Hung-Hsuan Chen, Madian Khabsa, Cornelia Caragea, Alexander Ororbia, Dou- glas Jordan, and C. Lee Giles. 2014. CiteSeerX: AI in a digital library search engine. In AAAI.",
+ "links": null
+ },
+ "BIBREF26": {
+ "ref_id": "b26",
+ "title": "Explicit semantic ranking for academic search via knowledge graph embedding",
+ "authors": [
+ {
+ "first": "Chenyan",
+ "middle": [],
+ "last": "Xiong",
+ "suffix": ""
+ },
+ {
+ "first": "Russell",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ },
+ {
+ "first": "Jamie",
+ "middle": [],
+ "last": "Callan",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Chenyan Xiong, Russell Power, and Jamie Callan. 2017. Explicit semantic ranking for academic search via knowledge graph embedding. In WWW.",
+ "links": null
+ }
+ },
+ "ref_entries": {}
+ }
+}
\ No newline at end of file
diff --git a/s2orc-doc2json/requirements.txt b/s2orc-doc2json/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..27a60dae9532abd4852100657ce554c7dab1a233
--- /dev/null
+++ b/s2orc-doc2json/requirements.txt
@@ -0,0 +1,8 @@
+tqdm
+beautifulsoup4==4.7.1
+boto3==1.9.147
+requests==2.21.0
+Flask==1.0.2
+lxml
+python-magic==0.4.18
+latex2mathml==2.16.2
\ No newline at end of file
diff --git a/s2orc-doc2json/scripts/run_grobid.sh b/s2orc-doc2json/scripts/run_grobid.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ca6a933e6d09db4e0ad2dc7570740c11172196a2
--- /dev/null
+++ b/s2orc-doc2json/scripts/run_grobid.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Start Grobid from the grobid-0.6.1 checkout under $HOME.
+# Abort if the checkout is missing so gradlew is never run from the wrong directory.
+cd "$HOME/grobid-0.6.1" || exit 1
+## Start Grobid
+./gradlew run
\ No newline at end of file
diff --git a/s2orc-doc2json/scripts/setup_grobid.sh b/s2orc-doc2json/scripts/setup_grobid.sh
new file mode 100644
index 0000000000000000000000000000000000000000..bc85d56c989869bbb0bdac85138bb404377755ba
--- /dev/null
+++ b/s2orc-doc2json/scripts/setup_grobid.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Download and build Grobid 0.6.1 under $HOME, then copy this repo's Grobid config files into it.
+# put in your pdf2json directory here
+export PDF2JSON_HOME=/app/s2orc-doc2json
+
+# Download Grobid
+cd "$HOME" || exit 1
+wget https://github.com/kermitt2/grobid/archive/0.6.1.zip
+unzip 0.6.1.zip
+rm 0.6.1.zip
+cd "$HOME/grobid-0.6.1" || exit 1  # stop here if the download or unzip failed
+./gradlew clean install
+
+## Grobid configurations
+# increase max.connections to slightly more than number of processes
+# decrease logging level
+# this isn't necessary but is nice to have if you are processing lots of files
+cp "$PDF2JSON_HOME/doc2json/grobid2json/grobid/config.yaml" "$HOME/grobid-0.6.1/grobid-service/config/config.yaml"
+cp "$PDF2JSON_HOME/doc2json/grobid2json/grobid/grobid.properties" "$HOME/grobid-0.6.1/grobid-home/config/grobid.properties"
+
+# ## Start Grobid
+# ./gradlew run
diff --git a/s2orc-doc2json/setup.py b/s2orc-doc2json/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..251e978c0d9dced43448f98b1cfc84ed9752fb75
--- /dev/null
+++ b/s2orc-doc2json/setup.py
@@ -0,0 +1,15 @@
+#!/usr/bin/python
+import setuptools
+
+setuptools.setup(
+ name='doc2json',
+ version='0.1',
+ packages=setuptools.find_packages(),
+ install_requires=[
+ ],
+ tests_require=[
+ ],
+ zip_safe=False,
+ test_suite='py.test',
+ entry_points='',
+)
\ No newline at end of file
diff --git a/s2orc-doc2json/temp_dir/2020.acl-main.207.tei.xml b/s2orc-doc2json/temp_dir/2020.acl-main.207.tei.xml
new file mode 100644
index 0000000000000000000000000000000000000000..f829459d21cdf9b70527f3f6e878dfb3738b751e
--- /dev/null
+++ b/s2orc-doc2json/temp_dir/2020.acl-main.207.tei.xml
@@ -0,0 +1,1746 @@
+
+
+
+
+
+ SPECTER: Document-level Representation Learning using Citation-informed Transformers
+
+
+
+
+
+
+
+
+
+ Arman Cohan
+ armanc@allenai.org
+
+ Allen Institute for Artificial Intelligence ‡ Paul G. Allen School of Computer Science & Engineering
+ University of Washington
+
+
+
+ Sergey Feldman
+ sergey@allenai.org
+
+ Allen Institute for Artificial Intelligence ‡ Paul G. Allen School of Computer Science & Engineering
+ University of Washington
+
+
+
+ Iz Beltagy
+ beltagy@allenai.org
+
+ Allen Institute for Artificial Intelligence ‡ Paul G. Allen School of Computer Science & Engineering
+ University of Washington
+
+
+
+ Doug Downey
+ dougd@allenai.org
+
+ Allen Institute for Artificial Intelligence ‡ Paul G. Allen School of Computer Science & Engineering
+ University of Washington
+
+
+
+ Daniel S Weld
+
+ Allen Institute for Artificial Intelligence ‡ Paul G. Allen School of Computer Science & Engineering
+ University of Washington
+
+
+
+
+ Introduction
+
+
+ SPECTER: Document-level Representation Learning using Citation-informed Transformers
+
+
+
+
+
+
+
+
+
+
+
+
+ GROBID - A machine learning software for extracting information from scholarly documents
+
+
+
+
+
+
+ Representation learning is a critical ingredient for natural language processing systems. Recent Transformer language models like BERT learn powerful textual representations, but these models are targeted towards token-and sentence-level training objectives and do not leverage information on inter-document relatedness, which limits their document-level representation power. For applications on scientific documents, such as classification and recommendation, the embeddings power strong performance on end tasks. We propose SPECTER, a new method to generate document-level embedding of scientific documents based on pretraining a Transformer language model on a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, SPECTER can be easily applied to downstream applications without task-specific fine-tuning. Additionally, to encourage further research on document-level models, we introduce SCIDOCS, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. We show that SPECTER outperforms a variety of competitive baselines on the benchmark. 1
+
+
+
+
+
+Introduction
As the pace of scientific publication continues to increase, Natural Language Processing (NLP) tools that help users to search, discover and understand the scientific literature have become critical. In recent years, substantial improvements in NLP tools have been brought about by pretrained neural language models (LMs) [(Radford et al., 2018;](#b38)[Devlin et al., 2019)](#b11). While such models are widely used for representing individual words or sentences, extensions to whole-document embeddings are relatively underexplored. Likewise, methods that do use inter-document signals to produce whole-document embeddings [(Tu et al., 2017)](#b46) have yet to incorporate state-of-the-art pretrained LMs. Here, we study how to leverage the power of pretrained language models to learn embeddings for scientific documents.
A paper's title and abstract provide rich semantic content about the paper, but, as we show in this work, simply passing these textual fields to an "off-the-shelf" pretrained language model-even a state-of-the-art model tailored to scientific text like the recent SciBERT [(Beltagy et al., 2019)](#b3)-does not result in accurate paper representations. The language modeling objectives used to pretrain the model do not lead it to output representations that are helpful for document-level tasks such as topic classification or recommendation.
In this paper, we introduce a new method for learning general-purpose vector representations of scientific documents. Our system, SPECTER, 2 incorporates inter-document context into the Transformer [(Vaswani et al., 2017)](#b47) language models (e.g., SciBERT [(Beltagy et al., 2019)](#b3)) to learn document representations that are effective across a wide-variety of downstream tasks, without the need for any task-specific fine-tuning of the pretrained language model. We specifically use citations as a naturally occurring, inter-document incidental supervision signal indicating which documents are most related and formulate the signal into a triplet-loss pretraining objective. Unlike many prior works, at inference time, our model does not require any citation information. This is critical for embedding new papers that have not yet been cited. In experiments, we show that SPECTER's representations substantially outperform the state-of-the-art on a variety of document-level tasks, including topic classification, citation prediction, and recommendation.
As an additional contribution of this work, we introduce and release SCIDOCS 3 , a novel collection of data sets and an evaluation suite for document-level embeddings in the scientific domain. SCIDOCS covers seven tasks, and includes tens of thousands of examples of anonymized user signals of document relatedness. We also release our training set (hundreds of thousands of paper titles, abstracts and citations), along with our trained embedding model and its associated code base.
+Model 2.1 Overview
Our goal is to learn task-independent representations of academic papers. Inspired by the recent success of pretrained Transformer language models across various NLP tasks, we use the Transformer model architecture as basis of encoding the input paper. Existing LMs such as BERT, however, are primarily based on masked language modeling objective, only considering intra-document context and do not use any inter-document information. This limits their ability to learn optimal document representations. To learn high-quality documentlevel representations we propose using citations as an inter-document relatedness signal and formulate it as a triplet loss learning objective. We then pretrain the model on a large corpus of citations using this objective, encouraging it to output representations that are more similar for papers that share a citation link than for those that do not. We call our model SPECTER, which learns Scientific Paper Embeddings using Citation-informed Trans-formERs. With respect to the terminology used by [Devlin et al. (2019)](#b11), unlike most existing LMs that are "fine-tuning based", our approach results in embeddings that can be applied to downstream tasks in a "feature-based" fashion, meaning the learned paper embeddings can be easily used as features, with no need for further task-specific fine-tuning. In the following, as background information, we briefly describe how pretrained LMs can be applied for document representation and then discuss the details of SPECTER.
3 https://github.com/allenai/scidocs Transformer (initialized with SciBERT) Related paper ($P^+$) Query paper ($P^Q$) Unrelated paper ($P^-$)
Triplet loss $= \max\left\{ d\left(P^Q, P^+\right) - d\left(P^Q, P^-\right) + m,\ 0 \right\}$ Figure 1: Overview of SPECTER.
+Background: Pretrained Transformers
Recently, pretrained Transformer networks have demonstrated success on various NLP tasks [(Radford et al., 2018;](#b38)[Devlin et al., 2019;](#b11)[Liu et al., 2019)](#b33); we use these models as the foundation for SPECTER. Specifically, we use SciBERT [(Beltagy et al., 2019)](#b3) which is an adaptation of the original BERT [(Devlin et al., 2019)](#b11) architecture to the scientific domain. The BERT model architecture [(Devlin et al., 2019)](#b11) uses multiple layers of Transformers [(Vaswani et al., 2017)](#b47) to encode the tokens in a given input sequence. Each layer consists of a self-attention sublayer followed by a feedforward sublayer. The final hidden state associated with the special [CLS] token is usually called the "pooled output", and is commonly used as an aggregate representation of the sequence.
+Document Representation
Our goal is to represent a given paper P as a dense vector v that best represents the paper and can be used in downstream tasks. SPECTER builds embeddings from the title and abstract of a paper. Intuitively, we would expect these fields to be sufficient to produce accurate embeddings, since they are written to provide a succinct and comprehensive summary of the paper. [4] As such, we encode the concatenated title and abstract using a Transformer LM (e.g., SciBERT) and take the final representation of the [CLS] token as the output representation of the paper:
[5] $$v = \text{Transformer}(\text{input})_{[\text{CLS}]}, \tag{1}$$ where Transformer is the Transformer's forward function, and input is the concatenation of the [CLS] token and WordPieces [(Wu et al., 2016)](#b52) of the title and abstract of a paper, separated by the [SEP] token. We use SciBERT as our model initialization as it is optimized for scientific text, though our formulation is general and any Transformer language model could be used instead of SciBERT. Using the above method with an "off-the-shelf" SciBERT does not take global inter-document information into account. This is because SciBERT, like other pretrained language models, is trained via language modeling objectives, which only predict words or sentences given their in-document, nearby textual context. In contrast, we propose to incorporate citations into the model as a signal of inter-document relatedness, while still leveraging the model's existing strength in modeling language.
+Citation-Based Pretraining Objective
A citation from one document to another suggests that the documents are related. To encode this relatedness signal into our representations, we design a loss function that trains the Transformer model to learn closer representations for papers when one cites the other, and more distant representations otherwise. The high-level overview of the model is shown in [Figure 1]. In particular, each training instance is a triplet of papers: a query paper $P^Q$, a positive paper $P^+$ and a negative paper $P^-$. The positive paper is a paper that the query paper cites, and the negative paper is a paper that is not cited by the query paper (but that may be cited by $P^+$). We then train the model using the following triplet margin loss function:
$$\mathcal{L} = \max\left\{ d\left(P^Q, P^+\right) - d\left(P^Q, P^-\right) + m,\ 0 \right\} \tag{2}$$ where $d$ is a distance function and $m$ is the loss margin hyperparameter (we empirically choose $m = 1$). Here, we use the L2 norm distance:
$d(P^A, P^B) = \lVert v_A - v_B \rVert_2$, where $v_A$ is the vector corresponding to the pooled output of the Transformer run on paper $A$ (Equation 1). [6] Starting from the trained SciBERT model, we pretrain the Transformer parameters on the citation objective to learn paper representations that capture document relatedness.
+Selecting Negative Distractors
The choice of negative example papers P − is important when training the model. We consider two sets of negative examples: the first set simply consists of randomly selected papers from the corpus.
Given a query paper, intuitively we would expect the model to be able to distinguish between cited papers, and uncited papers sampled randomly from the entire corpus. This inductive bias has been also found to be effective in content-based citation recommendation applications. But, random negatives may be easy for the model to distinguish from the positives. To provide a more nuanced training signal, we augment the randomly drawn negatives with a more challenging second set of negative examples. We denote as "hard negatives" the papers that are not cited by the query paper, but are cited by a paper cited by the query paper, i.e. if $P^1 \xrightarrow{\text{cite}} P^2$ and $P^2 \xrightarrow{\text{cite}} P^3$
but $P^1 \not\xrightarrow{\text{cite}} P^3$, then $P^3$ is a candidate hard negative example for $P^1$. We expect the hard negatives to be somewhat related to the query paper, but typically less related than the cited papers. As we show in our experiments (§6), including hard negatives results in more accurate embeddings compared to using random negatives alone.
+Inference
At inference time, the model receives one paper, P, and it outputs SPECTER's Transformer pooled output activation as the paper representation for P (Equation 1). We note that for inference, SPECTER requires only the title and abstract of the given input paper; the model does not need any citation information about the input paper. This means that SPECTER can produce embeddings even for new papers that have yet to be cited, which is critical for applications that target recent scientific papers.
+SCIDOCS Evaluation Framework
Previous evaluations of scientific document representations in the literature tend to focus on small datasets over a limited set of tasks, and extremely high (99%+) AUC scores are already possible on these data for English documents . New, larger and more diverse benchmark datasets are necessary. Here, we introduce a new comprehensive evaluation framework to measure the effectiveness of scientific paper embeddings, which we call SCIDOCS. The framework consists of diverse tasks, ranging from citation prediction, to prediction of user activity, to document classification and paper recommendation. Note that SPECTER will not be further fine-tuned on any of the tasks; we simply plug in the embeddings as features for each task. Below, we describe each of the tasks in detail and the evaluation data associated with it. In addition to our training data, we release all the datasets associated with the evaluation tasks.
+Document Classification
An important test of a document-level embedding is whether it is predictive of the class of the document. Here, we consider two classification tasks in the scientific domain: MeSH Classification In this task, the goal is to classify scientific papers according to their Medical Subject Headings (MeSH) [(Lipscomb, 2000)](#b30). [7] We construct a dataset consisting of 23K academic medical papers, where each paper is assigned one of 11 top-level disease classes such as cardiovascular diseases, diabetes, digestive diseases derived from the MeSH vocabulary. The most populated category is Neoplasms (cancer) with 5.4K instances (23.3% of the total dataset) while the category with the least number of samples is Hepatitis (1.7% of the total dataset). We follow the approach of [Feldman et al. (2019)](#b13) in mapping the MeSH vocabulary to the disease classes.
Paper Topic Classification This task is predicting the topic associated with a paper using the predefined topic categories of the Microsoft Academic Graph (MAG) [(Sinha et al., 2015)](#b45) 8 . MAG provides a database of papers, each tagged with a list of topics. The topics are organized in a hierarchy of 5 levels, where level 1 is the most general and level 5 is the most specific. For our evaluation, we derive a document classification dataset from the level 1 topics, where a paper is labeled by its corresponding level 1 MAG topic. We construct a dataset of 25K papers, almost evenly split over the 19 different classes of level 1 categories in MAG.
+Citation Prediction
As argued above, citations are a key signal of relatedness between papers. We test how well different paper representations can reproduce this signal through citation prediction tasks. In particular, we focus on two sub-tasks: predicting direct citations, and predicting co-citations. We frame these as ranking tasks and evaluate performance using MAP and nDCG, standard ranking metrics. Direct Citations In this task, the model is asked to predict which papers are cited by a given query paper from a given set of candidate papers. The evaluation dataset includes approximately 30K total papers from a held-out pool of papers, consisting of 1K query papers and a candidate set of up to 5 cited papers and 25 (randomly selected) uncited papers. The task is to rank the cited papers higher than the uncited papers. For each embedding method, we require only comparing the L2 distance between the raw embeddings of the query and the candidates, without any additional trainable parameters.
Co-Citations This task is similar to the direct citations but instead of predicting a cited paper, the goal is to predict a highly co-cited paper with a given paper. Intuitively, if papers A and B are cited frequently together by several papers, this shows that the papers are likely highly related and a good paper representation model should be able to identify these papers from a given candidate set. The dataset consists of 30K total papers and is constructed similar to the direct citations task.
+User Activity
The embeddings for similar papers should be close to each other; we use user activity as a proxy for identifying similar papers and test the model's ability to recover this information. Multiple users consuming the same items as one another is a classic relatedness signal and forms the foundation for recommender systems and other applications [(Schafer et al., 2007)](#b42). In our case, we would expect that when users look for academic papers, the papers they view in a single browsing session tend to be related. Thus, accurate paper embeddings should, all else being equal, be relatively more similar for papers that are frequently viewed in the same session than for other papers. To build benchmark datasets to test embeddings on user activity, we obtained logs of user sessions from a major academic search engine. We define the following two tasks on which we build benchmark datasets to test embeddings:
Co-Views Our co-views dataset consists of approximately 30K papers. To construct it, we take 1K random papers that are not in our train or development set and associate with each one up to 5 frequently co-viewed papers and 25 randomly selected papers (similar to the approach for citations). Then, we require the embedding model to rank the co-viewed papers higher than the random papers by comparing the L2 distances of raw embeddings. We evaluate performance using standard ranking metrics, nDCG and MAP.
Co-Reads If the user clicks to access the PDF of a paper from the paper description page, this is a potentially stronger sign of interest in the paper. In such a case we assume the user will read at least parts of the paper and refer to this as a "read" action. Accordingly, we define a "co-reads" task and dataset analogous to the co-views dataset described above. This dataset is also approximately 30K papers.
+Recommendation
In the recommendation task, we evaluate the ability of paper embeddings to boost performance in a production recommendation system. Our recommendation task aims to help users navigate the scientific literature by ranking a set of "similar papers" for a given paper. We use a dataset of user clickthrough data for this task which consists of 22K clickthrough events from a public scholarly search engine. We partitioned the examples temporally into train (20K examples), validation (1K), and test (1K) sets. As is typical in clickthrough data on ranked lists, the clicks are biased toward the top of original ranking presented to the user. To counteract this effect, we computed propensity scores using a swap experiment (Agarwal et al., 2019). The propensity scores give, for each position in the ranked list, the relative frequency that the position is over-represented in the data due to exposure bias. We can then compute de-biased evaluation metrics by dividing the score for each test example by the propensity score for the clicked position. We report propensity-adjusted versions of the standard ranking metrics Precision@1 (P @1) and Normalized Discounted Cumulative Gain (nDCG).
We test different embeddings on the recommendation task by including cosine embedding distance 9 as a feature within an existing recommendation system that includes several other informative features (title/author similarity, reference and citation overlap, etc.). Thus, the recommendation experiments measure whether the embeddings can boost the performance of a strong baseline system on an end task. For SPECTER, we also perform an online A/B test to measure whether its advantages on the offline dataset translate into improvements on the online recommendation task ( §5).
+Experiments
Training Data To train our model, we use a subset of the Semantic Scholar corpus consisting of about 146K query papers (around 26.7M tokens) with their corresponding outgoing citations, and we use an additional 32K papers for validation. For each query paper we construct up to 5 training triples comprised of a query, a positive, and a negative paper. The positive papers are sampled from the direct citations of the query, while negative papers are chosen either randomly or from citations of citations (as discussed in §2.4). We empirically found it helpful to use 2 hard negatives (citations of citations) and 3 easy negatives (randomly selected papers) for each query paper. This process results in about 684K training triples and 145K validation triples.
Training and Implementation We implement our model in AllenNLP . We initialize the model from SciBERT pretrained weights [(Beltagy et al., 2019)](#b3) since it is the stateof-the-art pretrained language model on scientific text. We continue training all model parameters on our training objective (Equation 2). We perform minimal tuning of our model's hyperparameters based on the performance on the validation set, while baselines are extensively tuned. Based on initial experiments, we use a margin m=1 for the triplet loss. For training, we use the Adam optimizer (Kingma and Ba, 2014) following the suggested hyperparameters in Devlin et al. (2019) (LR: 2e-5, Slanted Triangular LR scheduler 10 (Howard and Ruder, 2018) with number of train steps equal to training instances and cut fraction of 0.1). We train the model on a single Titan V GPU (12G memory) for 2 epochs, with batch size of 4 (the maximum that fit in our GPU memory) and use gradient accumulation for an effective batch size of 32. Each training epoch takes approximately 1-2 days to complete on the full dataset. We release our code and data to facilitate reproducibility. 11
Task-Specific Model Details For the classification tasks, we used a linear SVM where embedding vectors were the only features. The C hyperparameter was tuned via a held-out validation set.
For the recommendation tasks, we use a feedforward ranking neural network that takes as input ten features designed to capture the similarity between each query and candidate paper, including the cosine similarity between the query and candidate embeddings and manually-designed features computed from the papers' citations, titles, authors, and publication dates.
Baseline Methods Our work falls into the intersection of textual representation, citation mining, and graph learning, and we evaluate against stateof-the-art baselines from each of these areas. We compare with several strong textual models: SIF [(Arora et al., 2017)](#b2), a method for learning document representations by removing the first principal component of aggregated word-level embeddings which we pretrain on scientific text; SciBERT [(Beltagy et al., 2019)](#b3) a state-of-the-art pretrained Transformer LM for scientific text; and Sent-BERT [(Reimers and Gurevych, 2019)](#b40), a model that uses negative sampling to tune BERT for producing optimal sentence embeddings. We also compare with Citeomatic , a closely related paper representation model for citation prediction which trains content-based representations with citation graph information via dynamically sampled triplets, and SGC [(Wu et al., 2019a)](#b50), a state-of-the-art graph-convolutional approach. For completeness, additional baselines are also included; due to space constraints we refer to Appendix A for detailed discussion of all baselines. We tune hyperparameters of baselines to maximize performance on a separate validation set. [Table 1](#tab_1) presents the main results corresponding to our evaluation tasks (described in §3). Overall, we observe substantial improvements across all tasks with average performance of 80.0 across all metrics on all tasks which is a 3.1 point absolute improvement over the next-best baseline. We now discuss the results in detail.
+Results
For document classification, we report macro F1, a standard classification metric. We observe that the classifier performance when trained on our representations is better than when trained on any other baseline. Particularly, on the MeSH (MAG) dataset, we obtain an 86.4 (82.0) F1 score which is about a ∆= + 2.3 (+1.5) point absolute increase over the best baseline on each dataset respectively. Our evaluation of the learned representations on predicting user activity is shown in the "User activity" columns of [Table 1](#tab_1). SPECTER achieves a MAP score of 83.8 on the co-view task, and 84.5 on coread, improving over the best baseline (Citeomatic in this case) by 2.7 and 4.0 points, respectively. We observe similar trends for the "citation" and "co-citation" tasks, with our model outperforming virtually all other baselines except for SGC, which has access to the citation graph at training and test time. 12 Note that methods like SGC cannot be used in real-world setting to embed new papers that are not cited yet. On the other hand, on cocitation data our method is able to achieve the best results with nDCG of 94.8, improving over SGC with 2.3 points. Citeomatic also performs well on the citation tasks, as expected given that its primary design goal was citation prediction. Nevertheless, our method slightly outperforms Citeomatic on the direct citation task, while substantially outperforming it on co-citations (+2.0 nDCG). Finally, for recommendation task, we observe that SPECTER outperforms all other models on this task as well, with nDCG of 53.9. On the recommendations task, as opposed to previous experiments, the differences in method scores are generally smaller. This is because for this task the embeddings are used along with several other informative features in the ranking model (described under task-specific models in §4), meaning that embedding variants have less opportunity for impact on overall performance.
We also performed an online study to evaluate whether SPECTER embeddings offer similar advantages in a live application. We performed an online A/B test comparing our SPECTER-based recommender to an existing production recommender system for similar papers that ranks papers by a textual similarity measure. In a dataset of 4,113 clicks, we found that SPECTER ranker improved clickthrough rate over the baseline by 46.5%, demonstrating its superiority.
We emphasize that our citation-based pretraining objective is critical for the performance of SPECTER; removing this and using a vanilla SciBERT results in decreased performance on all tasks.
+Analysis
In this section, we analyze several design decisions in SPECTER, provide a visualization of its embedding space, and experimentally compare SPECTER's use of fixed embeddings against a finetuning approach.
Ablation Study We start by analyzing how adding or removing metadata fields from the input to SPECTER alters performance. The results are shown in the top four rows of [Table 2](#tab_3) (for brevity, here we only report the average of the metrics from each task). We observe that removing the abstract from the textual input and relying only on the title results in a substantial decrease in performance. More surprisingly, adding authors as an input (along with title and abstract) hurts performance. [13] One possible explanation is that author names are sparse in the corpus, making it difficult for the model to infer document-level relatedness from them. As another possible reason of this behavior, tokenization using Wordpieces might be suboptimal for author names. Many author names are out-of-vocabulary for SciBERT and thus, they might be split into sub-words and shared across names that are not semantically related, leading to noisy correlation. Finally, we find that adding venues slightly decreases performance, 14 except on document classification (which makes sense, as we would expect venues to have high correlation [13] We experimented with both concatenating authors with the title and abstract and also considering them as an additional field. Neither were helpful.
14 Venue information in our data came directly from publisher provided metadata and thus was not normalized. with paper topics). The fact that SPECTER does not require inputs like authors or venues makes it applicable in situations where this metadata is not available, such as matching reviewers with anonymized submissions, or performing recommendations of anonymized preprints (e.g., on OpenReview). One design decision in SPECTER is to use a set of hard negative distractors in the citation-based finetuning objective. The fifth row of [Table 2](#tab_3) shows that this is important-using only easy negatives reduces performance on all tasks. While there could be other potential ways to include hard negatives in the model, our simple approach of including citations of citations is effective. The sixth row of the table shows that using a strong general-domain language model (BERT-Large) instead of SciBERT in SPECTER reduces performance considerably. This is reasonable because unlike BERT-Large, SciB-ERT is pretrained on scientific text.
Visualization [Figure 2] shows t-SNE (van der Maaten, 2014) projections of our embeddings (SPECTER) compared with the SciBERT baseline for a random set of papers. When comparing SPECTER embeddings with SciBERT, we observe that our embeddings are better at encoding topical information, as the clusters seem to be more compact. Further, we see some examples of crosstopic relatedness reflected in the embedding space (e.g., Engineering, Mathematics and Computer Science are close to each other, while Business and Economics are also close to each other). To quantify the comparison of visualized embeddings in [Figure 2], we use the DBScan clustering algorithm [(Ester et al., 1996)](#b12) on this 2D projection. We use the completeness and homogeneity clustering quality measures introduced by [Rosenberg and Hirschberg (2007)](#b41). For the points corresponding to [Figure 2], the homogeneity and completeness values for SPECTER are respectively 0.41 and 0.72 compared with SciBERT's 0.19 and 0.63, a clear improvement on separating topics using the projected embeddings.
Comparison with Task Specific Fine-Tuning While the fact that SPECTER does not require finetuning makes its paper embeddings less costly to use, often the best performance from pretrained Transformers is obtained when the models are finetuned directly on each end task. We experiment with fine-tuning SciBERT on our tasks, and find this to be generally inferior to using our fixed representations from SPECTER. Specifically, we finetune SciBERT directly on task-specific signals instead of citations. To fine-tune on task-specific data (e.g., user activity), we used a dataset of coviews with 65K query papers, co-reads with 14K query papers, and co-citations (instead of direct citations) with 83K query papers. As the end tasks are ranking tasks, for all datasets we construct up to 5 triplets and fine-tune the model using triplet ranking loss. The positive papers are sampled from the most co-viewed (co-read, or co-cited) papers corresponding to the query paper. We also include both easy and hard distractors as when training SPECTER (for hard negatives we choose the least non-zero co-viewed (co-read, or co-cited) papers). We also consider training jointly on all task-specific training data sources in a multitask training process, where the model samples training triplets from a distribution over the sources. As illustrated in Table 3, without any additional final task-specific fine-tuning, SPECTER still outperforms a SciBERT model fine-tuned on the end tasks as well as their multitask combination, further demonstrating the effectiveness and versatility of SPECTER embeddings. 15
+Related Work
Recent representation learning methods in NLP rely on training large neural language models on unsupervised data [(Radford et al., 2018;](#b38)[Devlin et al., 2019;](#b11)[Beltagy et al., 2019;](#b3)[Liu et al., 2019)](#b33). While successful at many sentence- and token-level tasks, our focus is on using the models for document-level representation learning, which has remained relatively under-explored.
There have been other efforts in document representation learning such as extensions of word vectors to documents [(Le and Mikolov, 2014;](#b28)[Ganesh et al., 2016;](#b14)[Wu et al., 2018;](#b51)[Gysel et al., 2017)](#b16), convolution-based methods [(Zamani et al., 2018)](#b55), and variational autoencoders [(Holmer and Marfurt, 2018)](#b19). Relevant to document embedding, sentence embedding is a relatively well-studied area of research. Successful approaches include seq2seq models [(Kiros et al., 2015)], BiLSTM Siamese networks [(Williams et al., 2018)](#b49), leveraging supervised data from other corpora [(Conneau et al., 2017)](#b10), and using discourse relations [(Nie et al., 2019)](#b35), and BERT-based methods [(Reimers and Gurevych, 2019)](#b40). Unlike our proposed method, the majority of these approaches do not consider any notion of inter-document relatedness when embedding documents.
Other relevant work combines textual features with network structure [(Tu et al., 2017)](#b46). These works typically do not leverage the recent pretrained contextual representations and with a few exceptions such as the recent work by Wang et al. (2019), they cannot generalize to unseen documents like our SPECTER approach. Context-based citation recommendation is another related application where models rely on citation contexts [(Jeong et al., 2019)](#b21) to make predictions. These works are orthogonal to ours as the input to our model is just paper title and abstract. Another related line of work is graph-based representation learning methods [(Bruna et al., 2014;](#b6)[Kipf and Welling, 2017;](#b24)[Hamilton et al., 2017a,b;][Wu et al., 2019a,b)]. Here, we compare to a graph representation learning model, SGC (Simple Graph Convolution) [(Wu et al., 2019a)](#b50), which is a state-of-the-art graph convolution approach for representation learning. SPECTER uses pretrained language models in combination with graph-based citation signals, which enables it to outperform the graph-based approaches in our experiments.
SPECTER embeddings are based on only the title and abstract of the paper. Adding the full text of the paper would provide a more complete picture of the paper's content and could improve accuracy [(Cohen et al., 2010;](#b9)[Lin, 2008;](#b29)[Schuemie et al., 2004)](#b43). However, the full text of many academic papers is not freely available. Further, modern language models have strict memory limits on input size, which means new techniques would be required in order to leverage the entirety of the paper within the models. Exploring how to use the full paper text within SPECTER is an item of future work.
Finally, one pain point in academic paper recommendation research has been a lack of publicly available datasets [(Chen and Lee, 2018;](#b8)[Kanakia et al., 2019)](#b22). To address this challenge, we release SCIDOCS, our evaluation benchmark which includes an anonymized clickthrough dataset from an online recommendations system.
+Conclusions and Future Work
We present SPECTER, a model for learning representations of scientific papers, based on a Transformer language model that is pretrained on citations. We achieve substantial improvements over the strongest of a wide variety of baselines, demonstrating the effectiveness of our model. We additionally introduce SCIDOCS, a new evaluation suite consisting of seven document-level tasks and release the corresponding datasets to foster further research in this area.
The landscape of Transformer language models is rapidly changing and newer and larger models are frequently introduced. It would be interesting to initialize our model weights from more recent Transformer models to investigate if additional gains are possible. Another item of future work is to develop better multitask approaches to leverage multiple signals of relatedness information during training. We used citations to build triplets for our loss function, however there are other metrics that have good support from the bibliometrics literature [(Klavans and Boyack, 2006)](#b26) that warrant exploring as a way to create relatedness graphs. Including other information such as outgoing citations as additional input to the model would be yet another area to explore in future.
A Appendix A -Baseline Details 1. Random Zero-mean 25-dimensional vectors were used as representations for each document.
2. Doc2Vec Doc2Vec is one of the earlier neural document/paragraph representation methods [(Le and Mikolov, 2014)](#b28), and is a natural comparison. We trained Doc2Vec on our training subset using Gensim [(Řehůřek and Sojka, 2010)], and chose the hyperparameter grid using suggestions from Lau and Baldwin (2016). The hyperparameter grid used:
{'window': [5, 10, 15], 'sample': [0, 10 ** -6, 10 ** -5], 'epochs': [50, 100, 200]}, for a total of 27 models. The other parameters were set as follows: vector_size=300, min_count=3, alpha=0.025, min_alpha=0.0001, negative=5, dm=0, dbow=1, dbow_words=0. 3. Fasttext-Sum This simple baseline is a weighted sum of pretrained word vectors. We trained our own 300 dimensional fasttext embeddings [(Bojanowski et al., 2017)](#b5) on a corpus of around 3.1B tokens from scientific papers which is similar in size to the SciBERT corpus [(Beltagy et al., 2019)](#b3). We found that these pretrained embeddings substantially outperform alternative off-the-shelf embeddings. We also use these embeddings in other baselines that require pretrained word vectors (i.e., SIF and SGC that are described below). The summed bag of words representation has a number of weighting options, which are extensively tuned on a validation set for best performance. 4. SIF The SIF method of [Arora et al. (2017)](#b2) is a strong text representation baseline that takes a weighted sum of pretrained word vectors (we use fasttext embeddings described above), then computes the first principal component of the document embedding matrix and subtracts out each document embedding's projection to the first principal component.
We used a held-out validation set to choose a from the range [1.0e-5, 1.0e-3] spaced evenly on a log scale. The word probability p(w) was estimated on the training set only. When computing term-frequency values for SIF, we used scikit-learn's TfidfVectorizer with the same parameters as enumerated in the preceding section. sublinear_tf, binary, use_idf, smooth_idf were all set to False. Since SIF is a sum of pretrained fasttext vectors, the resulting dimensionality is 300. provides contextualized representations of tokens in a document. It can provide paragraph or document embeddings by averaging each token's representation for all 3 LSTM layers. We used the 768-dimensional pretrained ELMo model in AllenNLP .
+ELMo ELMo
6. Citeomatic The most relevant baseline is Citeomatic (Bhagavatula et al., 2018), which is an academic paper representation model that is trained on the citation graph via sampled triplets. Citeomatic representations are an L2 normalized weighted sum of title and abstract embeddings, which are trained on the citation graph with dynamic negative sampling. Citeomatic embeddings are 75-dimensional. 7. SGC Since our algorithm is trained on data from the citation graph, we also compare to a state-of-the-art graph representation learning model: SGC (Simple Graph Convolution) [(Wu et al., 2019a)](#b50), which is a graph convolution network. An alternative comparison would have been GraphSAGE [(Hamilton et al., 2017b)](#b18), but SGC (with no learning) outperformed an unsupervised variant of GraphSAGE on the Reddit dataset. 16 Note that SGC with no learning boils down to graph propagation on node features (in our case nodes are academic documents). Following Hamilton et al. (2017a), we used SIF features as node representations, and applied SGC with a range of parameter k, which is the number of times the normalized adjacency is multiplied by the SIF feature matrix. Our range of k was 1 through 8 (inclusive), and was chosen with a validation set. For the node features, we chose the SIF model with a = 0.0001, as this model was observed to be a high-performing one. This baseline is also 300 dimensional.
8. SciBERT To isolate the advantage of SPECTER's citation-based fine-tuning objective, we add a controlled comparison with SciBERT [(Beltagy et al., 2019)](#b3). Following [Devlin et al. (2019)](#b11) we take the last layer hidden state corresponding to the [CLS] token as the aggregate document representation. 17 9. Sentence BERT Sentence BERT [(Reimers and Gurevych, 2019)](#b40) is a general-domain pretrained model aimed at embedding sentences. The authors fine-tuned BERT using a triplet loss, where positive sentences were from the same document section as the seed sentence, and distractor sentences came from other document sections. The model is designed to encode sentences as opposed to paragraphs, so we embed the title and each sentence in the abstract separately, sum the embeddings, and L2 normalize the result to produce a final 768-dimensional paper embedding. [18] During hyperparameter optimization we chose how to compute TF and IDF values weights by taking the following non-redundant combinations of scikit-learn's TfidfVectorizer [(Pedregosa et al., 2011)](#b36) parameters: sublinear_tf, binary, use_idf, smooth_idf. There were a total of 9 parameter combinations. The IDF values were estimated on the training set. The other parameters were set as follows: min_df=3, max_df=0.75, strip_accents='ascii', stop_words='english', norm=None, lowercase=True. For training of fasttext, we used all default parameters with the exception of setting dimension to 300 and minCount was set to 25 due to the large corpus.
t-SNE visualization of paper embeddings and their corresponding MAG topics.
+Table 1 :1 Results on the SCIDOCS evaluation suite consisting of 7 tasks.
+Table 2 :2 Ablations: Numbers are averages of metrics for each evaluation task: CLS: classification, USR: User activity, CITE: Citation prediction, REC: Recommendation, Avg.: average over all tasks & metrics.
+SciBERT fine-tune on co-view 83.0 84.2 84.1 36.4 76.0 SciBERT fine-tune on co-read 82.3 85.4 86.7 36.3 77.1 SciBERT fine-tune on co-citation 82.9 84.3 85.2 36.6 76.4 SciBERT fine-tune on multitask 83.3 86.1 88.2 36.0 78.0 Training signal | CLS USR CITE REC All |
SPECTER | 84.2 88.4 91.5 36.9 80.0 |
+Table 3 :3 Comparison with task-specific fine-tuning.
+ SPECTER: Scientific Paper Embeddings using Citation-informed TransformERs
+ We also experimented with additional fields such as venues and authors but did not find any empirical advantage in using those (see §6). See §7 for a discussion of using the full text of the paper as input.5 It is also possible to encode title and abstracts individually and then concatenate or combine them to get the final embedding. However, in our experiments this resulted in sub-optimal performance.
+ We also experimented with other distance functions (e.g., normalized cosine), but they underperformed the L2 loss.
+ https://www.nlm.nih.gov/mesh/meshhome.html 8 https://academic.microsoft.com/
+ Embeddings are L2 normalized and in this case cosine distance is equivalent to L2 distance.
+ Learning rate linear warmup followed by linear decay. 11 https://github.com/allenai/specter
+ For SGC, we remove development and test set citations and co-citations during training. We also remove incoming citations from development and test set queries as these would not be available at test time in production.
+ We also experimented with further task-specific finetuning of our SPECTER on the end tasks but we did not observe additional improvements.
+ There were no other direct comparisons in [Wu et al. (2019a)](#b50). 17 We also tried the alternative of averaging all token representations, but this resulted in a slight performance decrease compared with the [CLS] pooled token.
+ We used the 'bert-base-wikipedia-sections-mean-tokens' model released by the authors: https://github.com/UKPLab/sentence-transformers
+
+
+
+
+
Acknowledgements
We thank Kyle Lo, Daniel King and Oren Etzioni for helpful research discussions, Russel Reas for setting up the public API, Field Cady for help in initial data collection and the anonymous reviewers (especially Reviewer 1) for comments and suggestions. This work was supported in part by NSF Convergence Accelerator award 1936940, ONR grant N00014-18-1-2193, and the University of Washington WRF/Cable Professorship.
+
+
+
+
+
+
+
+
+ Estimating position bias without intrusive interventions
+
+ K Anant
+
+
+ Ivan Agarwal
+
+
+ Xuanhui Zaitsev
+
+
+ Wang
+
+
+ Yen Cheng
+
+
+ Marc Li
+
+
+ Thorsten Najork
+
+
+ Joachims
+
+
+
+ WSDM
+
+
+
+
+ Anant K. Agarwal, Ivan Zaitsev, Xuanhui Wang, Cheng Yen Li, Marc Najork, and Thorsten Joachims. 2019. Estimating position bias without intrusive interventions. In WSDM.
+
+
+
+
+ Construction of the literature graph in semantic scholar
+
+ Waleed Ammar
+
+
+ Dirk Groeneveld
+
+
+ Chandra Bhagavatula
+
+
+ Iz Beltagy
+
+
+ Miles Crawford
+
+
+ Doug Downey
+
+
+ Jason Dunkelberger
+
+
+ Ahmed Elgohary
+
+
+ Sergey Feldman
+
+
+ Vu Ha
+
+
+ Rodney Kinney
+
+
+ Sebastian Kohlmeier
+
+
+ Kyle Lo
+
+
+ Tyler C Murray
+
+
+ Hsu-Han
+
+
+ Matthew E Ooi
+
+
+ Joanna Peters
+
+
+ Sam Power
+
+
+ Lucy Lu Skjonsberg
+
+
+ Christopher Wang
+
+
+ Zheng Wilhelm
+
+
+ Madeleine Yuan
+
+
+ Oren Van Zuylen
+
+
+ Etzioni
+
+
+
+ NAACL-HLT
+
+
+
+
+ Waleed Ammar, Dirk Groeneveld, Chandra Bhagavatula, Iz Beltagy, Miles Crawford, Doug Downey, Jason Dunkelberger, Ahmed Elgohary, Sergey Feldman, Vu Ha, Rodney Kinney, Sebastian Kohlmeier, Kyle Lo, Tyler C. Murray, Hsu-Han Ooi, Matthew E. Peters, Joanna Power, Sam Skjonsberg, Lucy Lu Wang, Christopher Wilhelm, Zheng Yuan, Madeleine van Zuylen, and Oren Etzioni. 2018. Construction of the literature graph in semantic scholar. In NAACL-HLT.
+
+
+
+
+ A simple but tough-to-beat baseline for sentence embeddings
+
+ Sanjeev Arora
+
+
+ Yingyu Liang
+
+
+ Tengyu Ma
+
+
+
+ ICLR
+
+
+
+
+ Sanjeev Arora, Yingyu Liang, and Tengyu Ma. 2017. A simple but tough-to-beat baseline for sentence embeddings. In ICLR.
+
+
+
+
+ SciBERT: A Pretrained Language Model for Scientific Text
+
+ Iz Beltagy
+
+
+ Kyle Lo
+
+
+ Arman Cohan
+
+
+
+ EMNLP
+
+
+
+
+ Iz Beltagy, Kyle Lo, and Arman Cohan. 2019. SciBERT: A Pretrained Language Model for Scientific Text. In EMNLP.
+
+
+
+
+ Content-Based Citation Recommendation
+
+ Chandra Bhagavatula
+
+
+ Sergey Feldman
+
+
+ Russell Power
+
+
+ Waleed Ammar
+
+ NAACL-HLT
+
+
+
+
+ Chandra Bhagavatula, Sergey Feldman, Russell Power, and Waleed Ammar. 2018. Content-Based Citation Recommendation. In NAACL-HLT.
+
+
+
+
+ Enriching word vectors with subword information
+
+ Piotr Bojanowski
+
+
+ Edouard Grave
+
+
+ Armand Joulin
+
+
+ Tomas Mikolov
+
+ 10.1162/tacl_a_00051
+
+
+
+
+ Piotr Bojanowski, Edouard Grave, Armand Joulin, and Tomas Mikolov. 2017. Enriching word vectors with subword information. TACL.
+
+
+
+
+
+
+ Joan Bruna
+
+
+ Wojciech Zaremba
+
+
+ Arthur Szlam
+
+
+ Yann Lecun
+
+
+
+
+
+ Joan Bruna, Wojciech Zaremba, Arthur Szlam, and Yann LeCun. 2014. Spectral networks and locally connected networks on graphs. ICLR.
+
+
+
+
+ Improving textual network embedding with global attention via optimal transport
+
+ Liqun Chen
+
+
+ Guoyin Wang
+
+
+ Chenyang Tao
+
+
+ Dinghan Shen
+
+
+ Pengyu Cheng
+
+
+ Xinyuan Zhang
+
+
+ Wenlin Wang
+
+
+ Yizhe Zhang
+
+
+ Lawrence Carin
+
+
+
+ ACL
+
+
+
+
+ Liqun Chen, Guoyin Wang, Chenyang Tao, Ding- han Shen, Pengyu Cheng, Xinyuan Zhang, Wenlin Wang, Yizhe Zhang, and Lawrence Carin. 2019. Im- proving textual network embedding with global at- tention via optimal transport. In ACL.
+
+
+
+
+ Research Paper Recommender Systems on Big Scholarly Data
+
+ Maria Tsung Teng Chen
+
+
+ Lee
+
+
+
+ Knowledge Management and Acquisition for Intelligent Systems
+
+
+
+
+ Tsung Teng Chen and Maria Lee. 2018. Research Pa- per Recommender Systems on Big Scholarly Data. In Knowledge Management and Acquisition for In- telligent Systems.
+
+
+
+
+ The structural and content aspects of abstracts versus bodies of full text journal articles are different
+
+ K Cohen
+
+
+ Helen L Johnson
+
+
+ Karin M Verspoor
+
+
+ Christophe Roeder
+
+
+ Lawrence Hunter
+
+
+
+ BMC Bioinformatics
+
+ 11
+
+
+
+
+ K. Bretonnel Cohen, Helen L. Johnson, Karin M. Ver- spoor, Christophe Roeder, and Lawrence Hunter. 2010. The structural and content aspects of abstracts versus bodies of full text journal articles are different. BMC Bioinformatics, 11:492-492.
+
+
+
+
+ Supervised Learning of Universal Sentence Representations from Natural Language Inference Data
+
+ Alexis Conneau
+
+
+ Douwe Kiela
+
+
+ Holger Schwenk
+
+
+ Loïc Barrault
+
+
+ Antoine Bordes
+
+ 10.18653/v1/D17-1070
+
+
+ EMNLP
+
+
+
+
+ Alexis Conneau, Douwe Kiela, Holger Schwenk, Loïc Barrault, and Antoine Bordes. 2017. Supervised Learning of Universal Sentence Representations from Natural Language Inference Data. In EMNLP.
+
+
+
+
+ BERT: Pre-training of deep bidirectional transformers for language understanding
+
+ Jacob Devlin
+
+
+ Ming-Wei Chang
+
+
+ Kenton Lee
+
+
+ Kristina Toutanova
+
+
+
+ NAACL-HLT
+
+
+
+
+ Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of deep bidirectional transformers for language under- standing. In NAACL-HLT.
+
+
+
+
+ A Density-based Algorithm for Discovering Clusters in Large Spatial Databases with Noise
+
+ Martin Ester
+
+
+ Hans-Peter Kriegel
+
+
+ Jörg Sander
+
+
+ Xiaowei Xu
+
+
+
+ KDD
+
+
+
+
+ Martin Ester, Hans-Peter Kriegel, Jörg Sander, Xiaowei Xu, et al. 1996. A Density-based Algorithm for Dis- covering Clusters in Large Spatial Databases with Noise. In KDD.
+
+
+
+
+ Quantifying Sex Bias in Clinical Studies at Scale With Automated Data Extraction
+
+ Sergey Feldman
+
+
+ Waleed Ammar
+
+
+ Kyle Lo
+
+
+ Elly Trepman
+
+
+ Madeleine Van Zuylen
+
+
+ Oren Etzioni
+
+ 10.1001/jamanetworkopen.2019.6700
+
+
+ JAMA
+
+
+
+
+ Sergey Feldman, Waleed Ammar, Kyle Lo, Elly Trep- man, Madeleine van Zuylen, and Oren Etzioni. 2019. Quantifying Sex Bias in Clinical Studies at Scale With Automated Data Extraction. JAMA.
+
+
+
+
+ Doc2sent2vec: A novel two-phase approach for learning document representation
+
+ J Ganesh
+
+
+ Manish Gupta
+
+
+ Vijay K Varma
+
+
+
+ SIGIR
+
+
+
+
+ J Ganesh, Manish Gupta, and Vijay K. Varma. 2016. Doc2sent2vec: A novel two-phase approach for learning document representation. In SIGIR.
+
+
+
+
+ AllenNLP: A Deep Semantic Natural Language Processing Platform
+
+ Matt Gardner
+
+
+ Joel Grus
+
+
+ Mark Neumann
+
+
+ Oyvind Tafjord
+
+
+ Pradeep Dasigi
+
+
+ Nelson F Liu
+
+
+ Matthew Peters
+
+
+ Michael Schmitz
+
+
+ Luke Zettlemoyer
+
+ 10.18653/v1/W18-2501
+
+
+ Proceedings of Workshop for NLP Open Source Software
+ Workshop for NLP Open Source Software
+
+
+
+
+ NLP-OSS
+
+
+ Matt Gardner, Joel Grus, Mark Neumann, Oyvind Tafjord, Pradeep Dasigi, Nelson F. Liu, Matthew Pe- ters, Michael Schmitz, and Luke Zettlemoyer. 2018. AllenNLP: A Deep Semantic Natural Language Pro- cessing Platform. In Proceedings of Workshop for NLP Open Source Software (NLP-OSS).
+
+
+
+
+ Neural Vector Spaces for Unsupervised Information Retrieval
+
+ Christophe Van Gysel
+
+
+ Maarten De Rijke
+
+
+ Evangelos Kanoulas
+
+
+
+ ACM Trans. Inf. Syst
+
+
+
+
+ Christophe Van Gysel, Maarten de Rijke, and Evange- los Kanoulas. 2017. Neural Vector Spaces for Un- supervised Information Retrieval. ACM Trans. Inf. Syst.
+
+
+
+
+ Inductive Representation Learning on Large Graphs
+
+ Will Hamilton
+
+
+ Zhitao Ying
+
+
+ Jure Leskovec
+
+
+
+ NIPS
+
+
+
+
+ Will Hamilton, Zhitao Ying, and Jure Leskovec. 2017a. Inductive Representation Learning on Large Graphs. In NIPS.
+
+
+
+
+ Inductive representation learning on large graphs
+
+ William L Hamilton
+
+
+ Zhitao Ying
+
+
+ Jure Leskovec
+
+
+
+ NIPS
+
+
+
+
+ William L. Hamilton, Zhitao Ying, and Jure Leskovec. 2017b. Inductive representation learning on large graphs. In NIPS.
+
+
+
+
+ Explaining away syntactic structure in semantic document representations
+
+ Erik Holmer
+
+
+ Andreas Marfurt
+
+ abs/1806.01620
+
+
+ ArXiv
+
+
+
+
+ Erik Holmer and Andreas Marfurt. 2018. Explaining away syntactic structure in semantic document rep- resentations. ArXiv, abs/1806.01620.
+
+
+
+
+ Universal Language Model Fine-tuning for Text Classification
+
+ Jeremy Howard
+
+
+ Sebastian Ruder
+
+ 10.18653/v1/P18-1031
+
+
+ ACL
+
+
+
+
+ Jeremy Howard and Sebastian Ruder. 2018. Universal Language Model Fine-tuning for Text Classification. In ACL.
+
+
+
+
+ A context-aware citation recommendation model with bert and graph convolutional networks
+
+ Chanwoo Jeong
+
+
+ Sion Jang
+
+
+ Hyuna Shin
+
+
+ Lucy Eunjeong
+
+
+ Sungchul Park
+
+
+ Choi
+
+ abs/1903.06464
+
+
+ ArXiv
+
+
+
+
+ Chanwoo Jeong, Sion Jang, Hyuna Shin, Eun- jeong Lucy Park, and Sungchul Choi. 2019. A context-aware citation recommendation model with bert and graph convolutional networks. ArXiv, abs/1903.06464.
+
+
+
+
+ A Scalable Hybrid Research Paper Recommender System for Microsoft Academic
+
+ Anshul Kanakia
+
+
+ Zhihong Shen
+
+
+ Darrin Eide
+
+
+ Kuansan Wang
+
+
+
+ WWW
+
+
+
+
+ Anshul Kanakia, Zhihong Shen, Darrin Eide, and Kuansan Wang. 2019. A Scalable Hybrid Research Paper Recommender System for Microsoft Aca- demic. In WWW.
+
+
+
+
+ Adam: A Method for Stochastic Optimization
+
+ P Diederik
+
+
+ Jimmy Kingma
+
+
+ Ba
+
+ abs/1412.6980
+
+
+ ArXiv
+
+
+
+
+ Diederik P. Kingma and Jimmy Ba. 2014. Adam: A Method for Stochastic Optimization. ArXiv, abs/1412.6980.
+
+
+
+
+ Semi-supervised classification with graph convolutional networks
+
+ N Thomas
+
+
+ Max Kipf
+
+
+ Welling
+
+
+
+
+
+ Thomas N Kipf and Max Welling. 2017. Semi- supervised classification with graph convolutional networks. ICLR.
+
+
+
+
+ Raquel Urtasun, and Sanja Fidler. 2015. Skip-thought vectors
+
+ Ryan Kiros
+
+
+ Yukun Zhu
+
+
+ Ruslan Salakhutdinov
+
+
+ Richard S Zemel
+
+
+ Antonio Torralba
+
+
+
+ NIPS
+
+
+ Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel, Antonio Torralba, Raquel Urta- sun, and Sanja Fidler. 2015. Skip-thought vectors. In NIPS.
+
+
+
+
+ Identifying a better measure of relatedness for mapping science
+
+ Richard Klavans
+
+
+ Kevin W Boyack
+
+
+
+ Journal of the Association for Information Science and Technology
+
+ 57
+
+
+
+
+ Richard Klavans and Kevin W. Boyack. 2006. Iden- tifying a better measure of relatedness for mapping science. Journal of the Association for Information Science and Technology, 57:251-263.
+
+
+
+
+ An empirical evaluation of doc2vec with practical insights into document embedding generation
+
+ Han Jey
+
+
+ Timothy Lau
+
+
+ Baldwin
+
+
+
+ Rep4NLP@ACL
+
+
+
+
+ Jey Han Lau and Timothy Baldwin. 2016. An empirical evaluation of doc2vec with practical in- sights into document embedding generation. In Rep4NLP@ACL.
+
+
+
+
+ Distributed Representations of Sentences and Documents
+
+ Quoc Le
+
+
+ Tomas Mikolov
+
+
+
+ ICML
+
+
+
+
+ Quoc Le and Tomas Mikolov. 2014. Distributed Repre- sentations of Sentences and Documents. In ICML.
+
+
+
+
+ Is searching full text more effective than searching abstracts?
+
+ Jimmy J Lin
+
+
+
+ BMC Bioinformatics
+
+ 10
+
+
+
+
+ Jimmy J. Lin. 2008. Is searching full text more effec- tive than searching abstracts? BMC Bioinformatics, 10:46-46.
+
+
+
+
+ Bulletin of the Medical Library Association
+
+ Carolyn E Lipscomb
+
+
+
+
+
+ Medical Subject Headings (MeSH)
+ Carolyn E Lipscomb. 2000. Medical Subject Headings (MeSH). Bulletin of the Medical Library Associa- tion.
+
+
+
+
+ Unsupervised Document Embedding with CNNs
+
+ Chundi Liu
+
+
+ Shunan Zhao
+
+
+ Maksims Volkovs
+
+ abs/1711.04168v3
+
+
+ ArXiv
+
+
+
+
+ Chundi Liu, Shunan Zhao, and Maksims Volkovs. 2018. Unsupervised Document Embedding with CNNs. ArXiv, abs/1711.04168v3.
+
+
+
+
+ A Model of Extended Paragraph Vector for Document Categorization and Trend Analysis
+
+ Pengfei Liu
+
+
+ King Keung Wu
+
+
+ Helen M Meng
+
+
+
+
+
+ IJCNN
+ Pengfei Liu, King Keung Wu, and Helen M. Meng. 2017. A Model of Extended Paragraph Vector for Document Categorization and Trend Analysis. IJCNN.
+
+
+
+
+
+
+ Yinhan Liu
+
+
+ Myle Ott
+
+
+ Naman Goyal
+
+
+ Jingfei Du
+
+
+ Mandar S Joshi
+
+
+ Danqi Chen
+
+
+ Omer Levy
+
+
+ Mike Lewis
+
+
+ Luke S Zettlemoyer
+
+
+ Veselin Stoyanov
+
+ abs/1907.11692
+
+
+ RoBERTa: A Robustly Optimized BERT Pretraining Approach. ArXiv
+
+
+
+
+ Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Man- dar S. Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke S. Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretrain- ing Approach. ArXiv, abs/1907.11692.
+
+
+
+
+ Accelerating t-SNE Using Tree-based Algorithms
+
+ Laurens Van Der Maaten
+
+
+
+ Journal of Machine Learning Research
+
+
+
+
+ Laurens van der Maaten. 2014. Accelerating t-SNE Using Tree-based Algorithms. Journal of Machine Learning Research.
+
+
+
+
+ DisSent: Learning Sentence Representations from Explicit Discourse Relations
+
+ Allen Nie
+
+
+ Erin Bennett
+
+
+ Noah Goodman
+
+ 10.18653/v1/P19-1442
+
+
+ ACL
+
+
+
+
+ Allen Nie, Erin Bennett, and Noah Goodman. 2019. DisSent: Learning Sentence Representations from Explicit Discourse Relations. In ACL.
+
+
+
+
+ Scikit-learn: Machine learning in Python
+
+ F Pedregosa
+
+
+ G Varoquaux
+
+
+ A Gramfort
+
+
+ V Michel
+
+
+ B Thirion
+
+
+ O Grisel
+
+
+ M Blondel
+
+
+ P Prettenhofer
+
+
+ R Weiss
+
+
+ V Dubourg
+
+
+ J Vanderplas
+
+
+ A Passos
+
+
+ D Cournapeau
+
+
+ M Brucher
+
+
+ M Perrot
+
+
+ E Duchesnay
+
+
+
+ Journal of Machine Learning Research
+
+ 12
+
+
+
+
+ F. Pedregosa, G. Varoquaux, A. Gramfort, V. Michel, B. Thirion, O. Grisel, M. Blondel, P. Prettenhofer, R. Weiss, V. Dubourg, J. Vanderplas, A. Passos, D. Cournapeau, M. Brucher, M. Perrot, and E. Duch- esnay. 2011. Scikit-learn: Machine learning in Python. Journal of Machine Learning Research, 12:2825-2830.
+
+
+
+
+
+ Matthew E Peters
+
+
+ Mark Neumann
+
+
+ Mohit Iyyer
+
+
+ Matt Gardner
+
+
+ Christopher Clark
+
+
+ Kenton Lee
+
+
+ Luke Zettlemoyer
+
+ Deep Contextualized Word Representations
+
+
+
+
+ Matthew E. Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, and Luke Zettlemoyer. 2018. Deep Contextualized Word Rep- resentations.
+
+
+
+
+ Improving language understanding by generative pre-training
+
+ Alec Radford
+
+
+ Karthik Narasimhan
+
+
+
+
+
+ arXiv
+ Tim Salimans, and Ilya Sutskever
+ Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. 2018. Improving language under- standing by generative pre-training. arXiv.
+
+
+
+
+ Software Framework for Topic Modelling with Large Corpora
+
+ Petr Radimřehůřek
+
+
+ Sojka
+
+
+
+ LREC
+
+
+
+
+ Radim Řehůřek and Petr Sojka. 2010. Software Framework for Topic Modelling with Large Corpora. In LREC.
+
+
+
+
+ Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks
+
+ Nils Reimers
+
+
+ Iryna Gurevych
+
+
+
+ EMNLP
+
+
+
+
+ Nils Reimers and Iryna Gurevych. 2019. Sentence- BERT: Sentence Embeddings using Siamese BERT- Networks. In EMNLP.
+
+
+
+
+ V-measure: A Conditional Entropy-based External Cluster Evaluation Measure
+
+ Andrew Rosenberg
+
+
+ Julia Hirschberg
+
+
+
+ EMNLP
+
+
+
+
+ Andrew Rosenberg and Julia Hirschberg. 2007. V- measure: A Conditional Entropy-based External Cluster Evaluation Measure. In EMNLP.
+
+
+
+
+ Collaborative filtering recommender systems
+
+ Ben Schafer
+
+
+ Dan Frankowski
+
+
+ Jon Herlocker
+
+
+ Shilad Sen
+
+
+
+ The adaptive web
+
+ Springer
+
+
+
+ J Ben Schafer, Dan Frankowski, Jon Herlocker, and Shilad Sen. 2007. Collaborative filtering recom- mender systems. In The adaptive web. Springer.
+
+
+
+
+
+ J Martijn
+
+
+ Marc Schuemie
+
+
+ Weeber
+
+
+ J A Bob
+
+
+ Erik M Schijvenaars
+
+
+ C Van Mulligen
+
+
+ Rob Christiaan Van Der Eijk
+
+
+ Barend Jelier
+
+
+ Jan A Mons
+
+
+ Kors
+
+
+
+ Distribution of information in biomedical abstracts and full-text publications
+
+
+ 20
+
+
+
+ Martijn J. Schuemie, Marc Weeber, Bob J. A. Schijve- naars, Erik M. van Mulligen, C. Christiaan van der Eijk, Rob Jelier, Barend Mons, and Jan A. Kors. 2004. Distribution of information in biomedical ab- stracts and full-text publications. Bioinformatics, 20(16):2597-604.
+
+
+
+
+ Improved semantic-aware network embedding with fine-grained word alignment
+
+ Dinghan Shen
+
+
+ Xinyuan Zhang
+
+
+ Ricardo Henao
+
+
+ Lawrence Carin
+
+
+
+ EMNLP
+
+
+
+
+ Dinghan Shen, Xinyuan Zhang, Ricardo Henao, and Lawrence Carin. 2018. Improved semantic-aware network embedding with fine-grained word align- ment. In EMNLP.
+
+
+
+
+ An Overview of Microsoft Academic Service (MAS) and Applications
+
+ Arnab Sinha
+
+
+ Zhihong Shen
+
+
+ Yang Song
+
+
+ Hao Ma
+
+
+ Darrin Eide
+
+
+ Bo-June Paul Hsu
+
+
+ Kuansan Wang
+
+
+
+ WWW
+
+
+
+
+ Arnab Sinha, Zhihong Shen, Yang Song, Hao Ma, Dar- rin Eide, Bo-June Paul Hsu, and Kuansan Wang. 2015. An Overview of Microsoft Academic Service (MAS) and Applications. In WWW.
+
+
+
+
+ Cane: Context-aware network embedding for relation modeling
+
+ Cunchao Tu
+
+
+ Han Liu
+
+
+ Zhiyuan Liu
+
+
+ Maosong Sun
+
+
+
+ ACL
+
+
+
+
+ Cunchao Tu, Han Liu, Zhiyuan Liu, and Maosong Sun. 2017. Cane: Context-aware network embedding for relation modeling. In ACL.
+
+
+
+
+ Attention Is All You Need
+
+ Ashish Vaswani
+
+
+ Noam Shazeer
+
+
+ Niki Parmar
+
+
+ Jakob Uszkoreit
+
+
+ Llion Jones
+
+
+ Aidan N Gomez
+
+
+ Lukasz Kaiser
+
+
+ Illia Polosukhin
+
+
+
+ NIPS
+
+
+
+
+ Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. In NIPS.
+
+
+
+
+ Improving textual network learning with variational homophilic embeddings
+
+ Wenlin Wang
+
+
+ Chenyang Tao
+
+
+ Zhe Gan
+
+
+ Guoyin Wang
+
+
+ Liqun Chen
+
+
+ Xinyuan Zhang
+
+
+ Ruiyi Zhang
+
+
+ Qian Yang
+
+
+ Ricardo Henao
+
+
+ Lawrence Carin
+
+
+
+ Advances in Neural Information Processing Systems
+
+
+
+
+
+ Wenlin Wang, Chenyang Tao, Zhe Gan, Guoyin Wang, Liqun Chen, Xinyuan Zhang, Ruiyi Zhang, Qian Yang, Ricardo Henao, and Lawrence Carin. 2019. Improving textual network learning with variational homophilic embeddings. In Advances in Neural In- formation Processing Systems, pages 2074-2085.
+
+
+
+
+ A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference
+
+ Adina Williams
+
+
+ Nikita Nangia
+
+
+ Samuel Bowman
+
+ 10.18653/v1/N18-1101
+ NAACL-HLT
+
+
+
+
+ Adina Williams, Nikita Nangia, and Samuel Bowman. 2018. A Broad-Coverage Challenge Corpus for Sen- tence Understanding through Inference. In NAACL- HLT.
+
+
+
+
+ Simplifying graph convolutional networks
+
+ Felix Wu
+
+
+ H Amauri
+
+
+ Tianyi Souza
+
+
+ Christopher Zhang
+
+
+ Tao Fifty
+
+
+ Kilian Q Yu
+
+
+ Weinberger
+
+
+
+ ICML
+
+
+
+
+ Felix Wu, Amauri H. Souza, Tianyi Zhang, Christo- pher Fifty, Tao Yu, and Kilian Q. Weinberger. 2019a. Simplifying graph convolutional networks. In ICML.
+
+
+
+
+ Word Mover's Embedding: From Word2Vec to Document Embedding
+
+ Lingfei Wu
+
+
+ Ian En-Hsu Yen
+
+
+ Kun Xu
+
+
+ Fangli Xu
+
+
+ Avinash Balakrishnan
+
+
+ Pin-Yu Chen
+
+
+ Pradeep Ravikumar
+
+
+ Michael J Witbrock
+
+
+
+ EMNLP
+
+
+
+
+ Lingfei Wu, Ian En-Hsu Yen, Kun Xu, Fangli Xu, Avinash Balakrishnan, Pin-Yu Chen, Pradeep Ravikumar, and Michael J Witbrock. 2018. Word Mover's Embedding: From Word2Vec to Document Embedding. In EMNLP.
+
+
+
+
+ Google's neural machine translation system: Bridging the gap between human and machine translation
+
+ Yonghui Wu
+
+
+ Mike Schuster
+
+
+ Zhifeng Chen
+
+
+ V Quoc
+
+
+ Mohammad Le
+
+
+ Wolfgang Norouzi
+
+
+ Maxim Macherey
+
+
+ Yuan Krikun
+
+
+ Qin Cao
+
+
+ Klaus Gao
+
+
+ Macherey
+
+ abs/1609.08144
+
+
+ ArXiv
+
+
+
+
+ Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. 2016. Google's neural machine translation system: Bridging the gap between human and machine translation. ArXiv, abs/1609.08144.
+
+
+
+
+
+
+ Zonghan Wu
+
+
+ Shirui Pan
+
+
+ Fengwen Chen
+
+
+ Guodong Long
+
+
+ Chengqi Zhang
+
+
+ Philip S Yu
+
+ abs/1901.00596
+
+
+ A Comprehensive Survey on Graph Neural Networks. ArXiv
+
+
+
+
+ Zonghan Wu, Shirui Pan, Fengwen Chen, Guodong Long, Chengqi Zhang, and Philip S Yu. 2019b. A Comprehensive Survey on Graph Neural Networks. ArXiv, abs/1901.00596.
+
+
+
+
+ Xlnet: Generalized autoregressive pretraining for language understanding
+
+ Zhilin Yang
+
+
+ Zihang Dai
+
+
+ Yiming Yang
+
+
+ Jaime G Carbonell
+
+
+ Ruslan Salakhutdinov
+
+
+ V Quoc
+
+
+ Le
+
+ abs/1906.08237
+
+
+ ArXiv
+
+
+
+
+ Zhilin Yang, Zihang Dai, Yiming Yang, Jaime G. Carbonell, Ruslan Salakhutdinov, and Quoc V. Le. 2019. Xlnet: Generalized autoregressive pretraining for language understanding. ArXiv, abs/1906.08237.
+
+
+
+
+ From neural re-ranking to neural ranking: Learning a sparse representation for inverted indexing
+
+ Hamed Zamani
+
+
+ Mostafa Dehghani
+
+
+ W Bruce Croft
+
+
+ Erik G
+
+
+
+ CIKM
+
+
+
+
+ Learned-Miller, and Jaap Kamps
+ Hamed Zamani, Mostafa Dehghani, W. Bruce Croft, Erik G. Learned-Miller, and Jaap Kamps. 2018. From neural re-ranking to neural ranking: Learning a sparse representation for inverted indexing. In CIKM.
+
+
+
+
+ Diffusion maps for textual network embedding
+
+ Xinyuan Zhang
+
+
+ Yitong Li
+
+
+ Dinghan Shen
+
+
+ Lawrence Carin
+
+
+
+
+
+ In NeurIPS
+ Xinyuan Zhang, Yitong Li, Dinghan Shen, and Lawrence Carin. 2018. Diffusion maps for textual network embedding. In NeurIPS.
+
+
+
+
+
+
+
diff --git a/s2orc-doc2json/temp_dir/N18-3011.tei.xml b/s2orc-doc2json/temp_dir/N18-3011.tei.xml
new file mode 100644
index 0000000000000000000000000000000000000000..c3285cf63283af7d6da0ec6364713203cb7a10b6
--- /dev/null
+++ b/s2orc-doc2json/temp_dir/N18-3011.tei.xml
@@ -0,0 +1,833 @@
+
+
+
+
+
+ Construction of the Literature Graph in Semantic Scholar
+
+
+
+
+
+
+
+
+
+ Waleed Ammar
+ waleeda@allenai.org
+
+
+ Dirk Groeneveld
+
+
+ Chandra Bhagavatula
+
+
+ Iz Beltagy
+
+
+ Miles Crawford
+
+
+ Doug Downey
+
+
+ Jason Dunkelberger
+
+
+ Ahmed Elgohary
+
+
+ Sergey Feldman
+
+
+ Vu Ha
+
+
+ Rodney Kinney
+
+
+ Sebastian Kohlmeier
+
+
+ Kyle Lo
+
+
+ Tyler Murray
+
+
+ Hsu-Han Ooi
+
+
+ Matthew Peters
+
+
+ Joanna Power
+
+
+ Sam Skjonsberg
+
+
+ Lucy Lu Wang
+
+
+ Chris Wilhelm
+
+
+ Zheng Yuan
+
+
+ Madeleine Van Zuylen
+
+
+ Oren Etzioni
+
+
+
+ Allen Institute for Artificial Intelligence
+
+ 98103
+ Seattle
+ WA
+ USA
+
+
+
+
+
+ Northwestern University
+
+ 60208
+ Evanston
+ IL
+ USA
+
+
+
+
+
+ Introduction
+
+
+ Construction of the Literature Graph in Semantic Scholar
+
+
+
+
+
+
+
+
+
+
+
+
+ GROBID - A machine learning software for extracting information from scholarly documents
+
+
+
+
+
+
+ We describe a deployed scalable system for organizing published scientific literature into a heterogeneous graph to facilitate algorithmic manipulation and discovery. The resulting literature graph consists of more than 280M nodes, representing papers, authors, entities and various interactions between them (e.g., authorships, citations, entity mentions). We reduce literature graph construction into familiar NLP tasks (e.g., entity extraction and linking), point out research challenges due to differences from standard formulations of these tasks, and report empirical results for each task. The methods described in this paper are used to enable semantic features in www.semanticscholar.org.
+
+
+
+
+
+Introduction
The goal of this work is to facilitate algorithmic discovery in the scientific literature. Despite notable advances in scientific search engines, data mining and digital libraries (e.g., [Wu et al., 2014](#b25)), researchers remain unable to answer simple questions such as:
What is the percentage of female subjects in depression clinical trials?
Which of my co-authors published one or more papers on coreference resolution?
Which papers discuss the effects of Ranibizumab on the Retina?
In this paper, we focus on the problem of extracting structured data from scientific documents, which can later be used in natural language interfaces (e.g., [Iyer et al., 2017](#b12)) or to improve ranking of results in academic search (e.g., Xiong et al., 2017). [Figure 1]: Part of the literature graph. We describe methods used in a scalable deployed production system for extracting structured information from scientific documents into the literature graph (see [Fig. 1]). The literature graph is a directed property graph which summarizes key information in the literature and can be used to answer the queries mentioned earlier as well as more complex queries. For example, in order to compute the Erdős number of an author X, the graph can be queried to find the number of nodes on the shortest undirected path between author X and Paul Erdős such that all edges on the path are labeled "authored".
We reduce literature graph construction into familiar NLP tasks such as sequence labeling, entity linking and relation extraction, and address some of the impractical assumptions commonly made in the standard formulations of these tasks. For example, most research on named entity recognition tasks reports results on large labeled datasets such as CoNLL-2003 and ACE-2005 (e.g., Lample et al., 2016), and assumes that entity types in the test set match those labeled in the training set (including work on domain adaptation, e.g., [Daumé, 2007](#b6)). These assumptions, while useful for developing and benchmarking new methods, are unrealistic for many domains and applications. The paper also serves as an overview of the approach we adopt at www.semanticscholar.org in a step towards more intelligent academic search engines [(Etzioni, 2011)](#b8).
In the next section, we start by describing our symbolic representation of the literature. Then, we discuss how we extract metadata associated with a paper such as authors and references, then how we extract the entities mentioned in paper text. Before we conclude, we briefly describe other research challenges we are actively working on in order to improve the quality of the literature graph.
+Structure of The Literature Graph
The literature graph is a property graph with directed edges. Unlike Resource Description Framework (RDF) graphs, nodes and edges in property graphs have an internal structure which is more suitable for representing complex data types such as papers and entities. In this section, we describe the attributes associated with nodes and edges of different types in the literature graph.
+Node Types
Papers. We obtain metadata and PDF files of papers via partnerships with publishers (e.g., Springer, Nature), catalogs (e.g., DBLP, MEDLINE), pre-publishing services (e.g., arXiv, bioRxiv), as well as web-crawling. Paper nodes are associated with a set of attributes such as 'title', 'abstract', 'full text', 'venues' and 'publication year'. While some of the paper sources provide these attributes as metadata, it is often necessary to extract them from the paper PDF (details in §3). We deterministically remove duplicate papers based on string similarity of their metadata, resulting in 37M unique paper nodes. Papers in the literature graph cover a variety of scientific disciplines, including computer science, molecular biology, microbiology and neuroscience.
Authors. Each node of this type represents a unique author, with attributes such as 'first name' and 'last name'. The literature graph has 12M nodes of this type.
Entities. Each node of this type represents a unique scientific concept discussed in the literature, with attributes such as 'canonical name', 'aliases' and 'description'. Our literature graph has 0.4M nodes of this type. We describe how we populate entity nodes in §4.3.
Entity mentions. Each node of this type represents a textual reference of an entity in one of the papers, with attributes such as 'mention text', 'context', and 'confidence'. We describe how we populate the 237M mentions in the literature graph in §4.1.
+Edge Types
Citations. We instantiate a directed citation edge from paper nodes $p_1 \rightarrow p_2$ for each $p_2$ referenced in $p_1$. Citation edges have attributes such as 'from paper id', 'to paper id' and 'contexts' (the textual contexts where $p_2$ is referenced in $p_1$). While some of the paper sources provide these attributes as metadata, it is often necessary to extract them from the paper PDF as detailed in §3.
Authorship. We instantiate a directed authorship edge between an author node and a paper node $a \rightarrow p$ for each author of that paper.
Entity linking edges. We instantiate a directed edge from an extracted entity mention node to the entity it refers to.
Mention-mention relations. We instantiate a directed edge between a pair of mentions in the same sentential context if the textual relation extraction model predicts one of a predefined list of relation types between them in a sentential context. [1] We encode a symmetric relation between $m_1$ and $m_2$ as two directed edges $m_1 \rightarrow m_2$ and $m_2 \rightarrow m_1$.
Entity-entity relations. While mention-mention edges represent relations between mentions in a particular context, entity-entity edges represent relations between abstract entities. These relations may be imported from an existing knowledge base (KB) or inferred from other edges in the graph.
+Extracting Metadata
In the previous section, we described the overall structure of the literature graph. Next, we discuss how we populate paper nodes, author nodes, authorship edges, and citation edges.
Although some publishers provide sufficient metadata about their papers, many papers are provided with incomplete metadata. Also, papers obtained via web-crawling are not associated with any metadata. To fill in this gap, we built the ScienceParse system to predict structured data from the raw PDFs using recurrent neural networks (RNNs). 2 For each paper, the system extracts the paper title, list of authors, and list of references; each reference consists of a title, a list of authors, a venue, and a year.
Preparing the input layer. We split each PDF into individual pages, and feed each page to Apache's PDFBox library 3 to convert it into a sequence of tokens, where each token has features, e.g., 'text', 'font size', 'space width', 'position on the page'.
We normalize the token-level features before feeding them as inputs to the model. For each of the 'font size' and 'space width' features, we compute three normalized values (with respect to current page, current document, and the whole training corpus), each value ranging between -0.5 and +0.5. The token's 'position on the page' is given in XY coordinate points. We scale the values linearly to range from $(-0.5, -0.5)$ at the top-left corner of the page to $(0.5, 0.5)$ at the bottom-right corner.
In order to capture case information, we add seven numeric features to the input representation of each token: whether the first/second letter is uppercase/lowercase, the fraction of uppercase/lowercase letters and the fraction of digits.
To help the model make correct predictions for metadata which tend to appear at the beginning (e.g., titles and authors) or at the end of papers (e.g., references), we provide the current page number as two discrete variables (relative to the beginning and end of the PDF file) with values 0, 1 and 2+. These features are repeated for each token on the same page.
For the $k$-th token in the sequence, we compute the input representation $i_k$ by concatenating the numeric features, an embedding of the 'font size', and the word embedding of the lowercased token. Word embeddings are initialized with GloVe [(Pennington et al., 2014)](#b19).
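Model. The input token representations are passed through one fully-connected layer and then fed into a two-layer bidirectional LSTM, i.e.,
$$\overrightarrow{g}_k = \mathrm{LSTM}(W i_k, \overrightarrow{g}_{k-1}), \quad g_k = [\overrightarrow{g}_k; \overleftarrow{g}_k], \quad \overrightarrow{h}_k = \mathrm{LSTM}(g_k, \overrightarrow{h}_{k-1}), \quad h_k = [\overrightarrow{h}_k; \overleftarrow{h}_k],$$ where $W$ is a weight matrix, $\overleftarrow{g}_k$ and $\overleftarrow{h}_k$ are defined similarly to $\overrightarrow{g}_k$ and $\overrightarrow{h}_k$ but process token sequences in the opposite direction.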
Following Collobert et al. (2011), we feed the output of the second layer $h_k$ into a dense layer to predict unnormalized label weights for each token and learn label bigram feature weights (often described as a conditional random field layer when used in neural architectures) to account for dependencies between labels.
Training. The ScienceParse system is trained on a snapshot of the data at PubMed Central. It consists of 1.4M PDFs and their associated metadata, which specify the correct titles, authors, and bibliographies. We use a heuristic labeling process that finds the strings from the metadata in the tokenized PDFs to produce labeled tokens. This labeling process succeeds for 76% of the documents. The remaining documents are not used in the training process. During training, we only use pages which have at least one token with a label that is not "none".
Decoding. At test time, we use Viterbi decoding to find the most likely global sequence, with no further constraints. To get the title, we use the longest continuous sequence of tokens with the "title" label. Since there can be multiple authors, we use all continuous sequences of tokens with the "author" label as authors, but require that all authors of a paper are mentioned on the same page. If the author labels are predicted in multiple pages, we use the one with the largest number of authors.
Results. We run our final tests on a held-out set from PubMed Central, consisting of about 54K documents. The results are detailed in [Table 1]. We use a conservative evaluation where an instance is correct if it exactly matches the gold annotation, with no credit for partial matching.
To give an example for the type of errors our model makes, consider the paper [(Wang et al., 2013)](#b23) titled "Clinical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage - a systematic review and meta-analysis." The title we extract for this paper omits the first part "Clinical review:". This is likely to be a result of the pattern "Foo: Bar Baz" appearing in many training examples with only "Bar Baz" labeled as the title.
+Entity Extraction and Linking
In the previous section, we described how we populate the backbone of the literature graph, i.e., paper nodes, author nodes and citation edges. Next, we discuss how we populate mentions and entities in the literature graph using entity extraction and linking on the paper text. In order to focus on more salient entities in a given paper, we only use the title and abstract.
+Approaches
We experiment with three approaches for entity extraction and linking: I. Statistical: uses one or more statistical models for predicting mention spans, then uses another statistical model to link mentions to candidate entities in a KB.
II. Hybrid: defines a small number of hand-engineered, deterministic rules for string-based matching of the input text to candidate entities in the KB, then uses a statistical model to disambiguate the mentions. [4] III. Off-the-shelf: uses existing libraries, namely (Ferragina and Scaiella, 2010, TagMe) 5 and (Demner-Fushman et al., 2017, MetaMap Lite) 6 , with minimal post-processing to extract and link entities to the KB. [Table 2]: Document-level evaluation of three approaches in two scientific areas: computer science (CS) and biomedical (Bio).
We evaluate the performance of each approach in two broad scientific areas: computer science (CS) and biomedical research (Bio). For each unique (paper ID, entity ID) pair predicted by one of the approaches, we ask human annotators to label each mention extracted for this entity in the paper. We use CrowdFlower to manage human annotations and only include instances where three or more annotators agree on the label. If one or more of the entity mentions in that paper is judged to be correct, the pair (paper ID, entity ID) counts as one correct instance. Otherwise, it counts as an incorrect instance. We report 'yield' in lieu of 'recall' due to the difficulty of doing a scalable comprehensive annotation. [Table 2] shows the results based on 500 papers using v1.1.2 of our entity extraction and linking components. In both domains, the statistical approach gives the highest precision and the lowest yield. The hybrid approach consistently gives the highest yield, but sacrifices precision. The TagMe off-the-shelf library used for the CS domain gives surprisingly good results, with precision within 1 point from the statistical models. However, the MetaMap Lite off-the-shelf library we used for the biomedical domain suffered a huge loss in precision. Our error analysis showed that each of the approaches is able to predict entities not predicted by the other approaches so we decided to pool their outputs in our deployed system, which gives significantly higher yield than any individual approach while maintaining reasonably high precision.
+Entity Extraction Models
Given the token sequence $t_1, \ldots, t_N$ in a sentence, we need to identify spans which correspond to entity mentions. We use the BILOU scheme to encode labels at the token level. Unlike most formulations of named entity recognition problems (NER), we do not identify the entity type (e.g., protein, drug, chemical, disease) for each mention since the output mentions are further grounded in a KB with further information about the entity (including its type), using an entity linking module.
Model. First, we construct the token embedding $x_k = [c_k; w_k]$ for each token $t_k$ in the input sequence, where $c_k$ is a character-based representation computed using a convolutional neural network (CNN) with filter of size 3 characters, and $w_k$ are learned word embeddings initialized with the GloVe embeddings [(Pennington et al., 2014)](#b19).
We also compute context-sensitive word embeddings, denoted as $lm_k = [\overrightarrow{lm}_k; \overleftarrow{lm}_k]$, by concatenating the projected outputs of forward and backward recurrent neural network language models (RNN-LM) at position $k$. The language model (LM) for each direction is trained independently and consists of a single layer long short-term memory (LSTM) network followed by a linear projection layer. While training the LM parameters, $\overrightarrow{lm}_k$ is used to predict $t_{k+1}$ and $\overleftarrow{lm}_k$ is used to predict $t_{k-1}$. We fix the LM parameters during training of the entity extraction model. See Peters et al. (2017) and Ammar et al. (2017) for more details.
Given the $x_k$ and $lm_k$ embeddings for each token $k \in \{1, \ldots, N\}$, we use a two-layer bidirectional LSTM to encode the sequence with $x_k$ and $lm_k$ feeding into the first and second layer, respectively. That is,
$$\overrightarrow{g}_k = \mathrm{LSTM}(x_k, \overrightarrow{g}_{k-1}), \quad g_k = [\overrightarrow{g}_k; \overleftarrow{g}_k], \quad \overrightarrow{h}_k = \mathrm{LSTM}([g_k; lm_k], \overrightarrow{h}_{k-1}), \quad h_k = [\overrightarrow{h}_k; \overleftarrow{h}_k],$$ where $\overleftarrow{g}_k$ and $\overleftarrow{h}_k$ are defined similarly to $\overrightarrow{g}_k$ and $\overrightarrow{h}_k$ but process token sequences in the opposite direction. Similar to the model described in §3, we feed the output of the second LSTM into a dense layer to predict unnormalized label weights for each token and learn label bigram feature weights to account for dependencies between labels.
Results. We use the standard data splits of the SemEval-2017 Task 10 on entity (and relation) extraction from scientific papers [(Augenstein et al., 2017)](#b1). [Table 3] compares three variants of our entity extraction model. The first line omits the LM embeddings $lm_k$, while the second line is the full model (including LM embeddings) showing a large improvement of 4.2 F1 points. The third line shows that creating an ensemble of 15 models further improves the results by 1.1 F1 points.
Model instances. In the deployed system, we use three instances of the entity extraction model with a similar architecture, but trained on different datasets. Two instances are trained on the BC5CDR [(Li et al., 2016)](#b16) and the CHEMDNER datasets [(Krallinger et al., 2015)](#b14) to extract key entity mentions in the biomedical domain such as diseases, drugs and chemical compounds. The third instance is trained on mention labels induced from Wikipedia articles in the computer science domain.
| Description | F1 |
| --- | --- |
| Without LM | 49.9 |
| With LM | 54.1 |
| Avg. of 15 models with LM | 55.2 |
[Table 3]: Results of the entity extraction model on the development set of SemEval-2017 task 10.
The output of all model instances are pooled together and combined with the rule-based entity extraction module, then fed into the entity linking model (described below).
+Knowledge Bases
In this section, we describe the construction of entity nodes and entity-entity edges. Unlike other knowledge extraction systems such as the Never-Ending Language Learner (NELL) 7 and OpenIE 4, 8 we use existing knowledge bases (KBs) of entities to reduce the burden of identifying coherent concepts. Grounding the entity mentions in a manually-curated KB also increases user confidence in automated predictions. We use two KBs: UMLS: The UMLS metathesaurus integrates information about concepts in specialized ontologies in several biomedical domains, and is funded by the U.S. National Library of Medicine. DBpedia: DBpedia provides access to structured information in Wikipedia. Rather than including all Wikipedia pages, we used a short list of Wikipedia categories about CS and included all pages up to depth four in their trees in order to exclude irrelevant entities, e.g., "Lord of the Rings" in DBpedia.
+Entity Linking Models
Given a text span s identified by the entity extraction model in §4.2 (or with heuristics) and a reference KB, the goal of the entity linking model is to associate the span with the entity it refers to. A span and its surrounding words are collectively referred to as a mention. We first identify a set of candidate entities that a given mention may refer to. Then, we rank the candidate entities based on a score computed using a neural model trained on labeled data.
For example, given the string ". . . database of facts, an ILP system will . . . ", the entity extraction model identifies the span "ILP" as a possible entity and the entity linking model associates it with "Inductive_Logic_Programming" as the referent entity (from among other candidates like "Integer_Linear_Programming" or "Instruction-level_Parallelism").
Datasets. We used two datasets: i) a biomedical dataset formed by combining MSH (Jimeno-Yepes et al., 2011) and BC5CDR [(Li et al., 2016)](#b16) with UMLS as the reference KB, and ii) a CS dataset we curated using Wikipedia articles about CS concepts with DBpedia as the reference KB.
Candidate selection. In a preprocessing step, we build an index which maps any token used in a labeled mention or an entity name in the KB to associated entity IDs, along with the frequency this token is associated with that entity. This is similar to the index used in previous entity linking systems (e.g., [Bhagavatula et al., 2015](#b3)) to estimate the probability that a given mention refers to an entity. At train and test time, we use this index to find candidate entities for a given mention by looking up the tokens in the mention. This method also serves as our baseline in [Table 4] by selecting the entity with the highest frequency for a given mention.
Scoring candidates. Given a mention (m) and a candidate entity (e), the neural model constructs a vector encoding of the mention and the entity. We encode the mention and entity using the functions f and g, respectively, as follows:
$$f(m) = [v_{m.\text{name}}; \mathrm{avg}(v_{m.\text{lc}}, v_{m.\text{rc}})], \quad g(e) = [v_{e.\text{name}}; v_{e.\text{def}}],$$ where m.surface, m.lc and m.rc are the mention's surface form, left and right contexts, and e.name and e.def are the candidate entity's name and definition, respectively. $v_{\text{text}}$ is a bag-of-words sum encoder for text. We use the same encoder for the mention surface form and the candidate name, and another encoder for the mention contexts and entity definition.
Additionally, we include numerical features to estimate the confidence of a candidate entity based on the statistics collected in the index described [Table 4]: The Bag of Concepts F1 score of the baseline and neural model on the two curated datasets.
earlier. We compute two scores based on the word overlap of (i) mention's context and candidate's definition and (ii) mention's surface span and the candidate entity's name. Finally, we feed the concatenation of the cosine similarity between $f(m)$ and $g(e)$ and the intersection-based scores into an affine transformation followed by a sigmoid nonlinearity to compute the final score for the pair (m, e).
Results. We use the Bag of Concepts F1 metric [(Ling et al., 2015)](#b17) for comparison. [Table 4] compares the performance of the most-frequent-entity baseline and our neural model described above.
+Other Research Problems
In the previous sections, we discussed how we construct the main components of the literature graph. In this section, we briefly describe several other related challenges we are actively working on.
Author disambiguation. Despite initiatives to have global author IDs ORCID and ResearcherID, most publishers provide author information as names (e.g., arXiv). However, author names cannot be used as a unique identifier since several people often share the same name. Moreover, different venues and sources use different conventions in reporting the author names, e.g., "first initial, last name" vs. "last name, first name". Inspired by [Culotta et al. (2007)](#b5), we train a supervised binary classifier for merging pairs of author instances and use it to incrementally create author clusters. We only consider merging two author instances if they have the same last name and share the first initial. If the first name is spelled out (rather than abbreviated) in both author instances, we also require that the first name matches.
Ontology matching. Popular concepts are often represented in multiple KBs. For example, the concept of "artificial neural networks" is represented as entity ID D016571 in the MESH ontology, and represented as page ID '21523' in DBpedia. Ontology matching is the problem of identifying semantically-equivalent entities across KBs or ontologies. [9] Limited KB coverage. The convenience of grounding entities in a hand-curated KB comes at the cost of limited coverage. Introduction of new concepts and relations in the scientific literature occurs at a faster pace than KB curation, resulting in a large gap in KB coverage of scientific concepts. In order to close this gap, we need to develop models which can predict textual relations as well as detailed concept descriptions in scientific papers. For the same reasons, we also need to augment the relations imported from the KB with relations extracted from text. Our approach to address both entity and relation coverage is based on distant supervision [(Mintz et al., 2009)](#b18). In short, we train two models for identifying entity definitions and relations expressed in natural language in scientific documents, and automatically generate labeled data for training these models using known definitions and relations in the KB.
We note that the literature graph currently lacks coverage for important entity types (e.g., affiliations) and domains (e.g., physics). Covering affiliations requires small modifications to the metadata extraction model followed by an algorithm for matching author names with their affiliations. In order to cover additional scientific domains, more agreements need to be signed with publishers.
Figure and table extraction. Non-textual components such as charts, diagrams and tables provide key information in many scientific documents, but the lack of large labeled datasets has impeded the development of data-driven methods for scientific figure extraction. In [Siegel et al. (2018)](#b21), we induced high-quality training labels for the task of figure extraction in a large number of scientific documents, with no human intervention. To accomplish this we leveraged the auxiliary data provided in two large web collections of scientific documents (arXiv and PubMed) to locate figures and their associated captions in the rasterized PDF. We use the resulting dataset to train a deep neural network for end-to-end figure detection, yielding a model that can be more easily extended to new domains compared to previous work.
Understanding and predicting citations. The citation edges in the literature graph provide a wealth of information (e.g., at what rate a paper is being cited and whether it is accelerating), and open the door for further research to better understand and predict citations. For example, in order to allow users to better understand what impact a paper had and effectively navigate its citations, we experimented with methods for classifying a citation as important or incidental, as well as more fine-grained classes [(Valenzuela et al., 2015)](#b22). The citation information also enables us to develop models for estimating the potential of a paper or an author. In Weihs and Etzioni (2017), we predict citation-based metrics such as an author's h-index and the citation rate of a paper in the future. Also related is the problem of predicting which papers should be cited in a given draft [(Bhagavatula et al., 2018)](#b2), which can help improve the quality of a paper draft before it is submitted for peer review, or used to supplement the list of references after a paper is published.
+Conclusion and Future Work
In this paper, we discuss the construction of a graph, providing a symbolic representation of the scientific literature. We describe deployed models for identifying authors, references and entities in the paper text, and provide experimental results to evaluate the performance of each model. Three research directions follow from this work and other similar projects, e.g., [Hahn-Powell et al. (2017)](#b10); [Wu et al. (2014)](#b25): i) improving quality and enriching content of the literature graph (e.g., ontology matching and knowledge base population). ii) aggregating domain-specific extractions across many papers to enable a better understanding of the literature as a whole (e.g., identifying demographic biases in clinical trial participants and summarizing empirical results on important tasks). iii) exploring the literature via natural language interfaces.
In order to help future research efforts, we make the following resources publicly available: metadata for over 20 million papers, 10 meaningful citations dataset, 11 models for figure and table extraction, 12 models for predicting citations in a paper draft 13 and models for extracting paper metadata, 14 among other resources. [15]
Due to space constraints, we opted not to discuss our relation extraction models in this draft.
+ The ScienceParse libraries can be found at http://allenai.org/software/. 3 https://pdfbox.apache.org
+ We also experimented with a "pure" rules-based approach which disambiguates deterministically but the hybrid approach consistently gave better results. 5 The TagMe APIs are described at https://sobigdata.d4science.org/web/tagme/tagme-help 6 We use v3.4 (L0) of MetaMap Lite, available at https://metamap.nlm.nih.gov/MetaMapLite.shtml
+ http://rtw.ml.cmu.edu/rtw/ 8 https://github.com/allenai/openie-standalone
+ Variants of this problem are also known as deduplication or record linkage.
+
+
+
+
+
+
+
+
+ The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction
+
+ Waleed Ammar
+
+
+ Matthew E Peters
+
+
+ Chandra Bhagavatula
+
+
+ Russell Power
+
+
+
+ ACL workshop (SemEval)
+
+
+
+
+ Waleed Ammar, Matthew E. Peters, Chandra Bhagavatula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).
+
+
+
+
+
+ Isabelle Augenstein
+
+
+ Mrinal Das
+
+
+ Sebastian Riedel
+
+
+ Lakshmi Vikraman
+
+
+ Andrew D Mccallum
+
+ Semeval 2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications
+
+
+
+
+ ACL workshop (SemEval)
+ Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval 2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).
+
+
+
+
+ Content-based citation recommendation
+
+ Chandra Bhagavatula
+
+
+ Sergey Feldman
+
+
+ Russell Power
+
+
+ Waleed Ammar
+
+
+
+ NAACL
+
+
+
+
+ Chandra Bhagavatula, Sergey Feldman, Russell Power, and Waleed Ammar. 2018. Content-based citation recommendation. In NAACL.
+
+
+
+
+
+ Chandra Bhagavatula
+
+
+ Thanapon Noraset
+
+
+ Doug Downey
+
+ TabEL: entity linking in web tables. In ISWC
+
+
+
+
+ Chandra Bhagavatula, Thanapon Noraset, and Doug Downey. 2015. TabEL: entity linking in web tables. In ISWC.
+
+
+
+
+ Natural language processing (almost) from scratch
+
+ Ronan Collobert
+
+
+ Jason Weston
+
+
+ Léon Bottou
+
+
+ Michael Karlen
+
+
+ Koray Kavukcuoglu
+
+
+ Pavel P Kuksa
+
+
+
+ JMLR
+
+
+
+
+ Ronan Collobert, Jason Weston, Léon Bottou, Michael Karlen, Koray Kavukcuoglu, and Pavel P. Kuksa. 2011. Natural language processing (almost) from scratch. In JMLR.
+
+
+
+
+ Author disambiguation using error-driven machine learning with a ranking loss function
+
+ Aron Culotta
+
+
+ Pallika Kanani
+
+
+ Robert Hall
+
+
+ Michael Wick
+
+
+ Andrew D Mccallum
+
+
+
+ IIWeb Workshop
+
+
+
+
+ Aron Culotta, Pallika Kanani, Robert Hall, Michael Wick, and Andrew D. McCallum. 2007. Author disambiguation using error-driven machine learning with a ranking loss function. In IIWeb Workshop.
+
+
+
+
+ Frustratingly easy domain adaptation
+
+ Hal Daumé
+
+
+
+ ACL
+
+
+
+
+ Hal Daumé. 2007. Frustratingly easy domain adapta- tion. In ACL.
+
+
+
+
+ MetaMap Lite: an evaluation of a new Java implementation of MetaMap
+
+ Dina Demner-Fushman
+
+
+ Willie J Rogers
+
+
+ Alan R Aronson
+
+
+
+ JAMIA
+
+
+
+
+ Dina Demner-Fushman, Willie J. Rogers, and Alan R. Aronson. 2017. MetaMap Lite: an evaluation of a new Java implementation of MetaMap. In JAMIA.
+
+
+
+
+ Search needs a shake-up
+
+ Oren Etzioni
+
+
+
+ Nature
+
+ 476
+
+
+
+
+ Oren Etzioni. 2011. Search needs a shake-up. Nature 476 7358:25-6.
+
+
+
+
+ TAGME: on-the-fly annotation of short text fragments (by wikipedia entities)
+
+ Paolo Ferragina
+
+
+ Ugo Scaiella
+
+
+
+ CIKM
+
+
+
+
+ Paolo Ferragina and Ugo Scaiella. 2010. TAGME: on-the-fly annotation of short text fragments (by wikipedia entities). In CIKM.
+
+
+
+
+ Swanson linking revisited: Accelerating literature-based discovery across domains using a conceptual influence graph
+
+ Gus Hahn-Powell
+
+
+ Marco Antonio Valenzuela-Escarcega
+
+
+ Mihai Surdeanu
+
+
+
+ ACL
+
+
+
+
+ Gus Hahn-Powell, Marco Antonio Valenzuela- Escarcega, and Mihai Surdeanu. 2017. Swanson linking revisited: Accelerating literature-based dis- covery across domains using a conceptual influence graph. In ACL.
+
+
+
+
+ Long short-term memory
+
+ Sepp Hochreiter
+
+
+ Jürgen Schmidhuber
+
+
+
+ Neural computation
+
+
+
+
+ Sepp Hochreiter and Jürgen Schmidhuber. 1997. Long short-term memory. Neural computation .
+
+
+
+
+ Learning a neural semantic parser from user feedback
+
+ Srinivasan Iyer
+
+
+ Ioannis Konstas
+
+
+ Alvin Cheung
+
+
+ Jayant Krishnamurthy
+
+
+ Luke S Zettlemoyer
+
+
+
+ ACL
+
+
+
+
+ Srinivasan Iyer, Ioannis Konstas, Alvin Cheung, Jayant Krishnamurthy, and Luke S. Zettlemoyer. 2017. Learning a neural semantic parser from user feed- back. In ACL.
+
+
+
+
+ Exploiting mesh indexing in medline to generate a data set for word sense disambiguation
+
+ J Antonio
+
+
+ Bridget T Jimeno-Yepes
+
+
+ Alan R Mcinnes
+
+
+ Aronson
+
+
+
+ BMC bioinformatics
+
+ 12
+ 1
+ 223
+
+
+
+ Antonio J. Jimeno-Yepes, Bridget T. McInnes, and Alan R. Aronson. 2011. Exploiting mesh indexing in medline to generate a data set for word sense dis- ambiguation. BMC bioinformatics 12(1):223.
+
+
+
+
+ CHEMDNER: The drugs and chemical names extraction challenge
+
+ Martin Krallinger
+
+
+ Florian Leitner
+
+
+ Obdulia Rabal
+
+
+ Miguel Vazquez
+
+
+
+ In J. Cheminformatics
+
+
+
+
+ Julen Oyarzabal, and Alfonso Valencia
+ Martin Krallinger, Florian Leitner, Obdulia Rabal, Miguel Vazquez, Julen Oyarzabal, and Alfonso Va- lencia. 2015. CHEMDNER: The drugs and chemi- cal names extraction challenge. In J. Cheminformat- ics.
+
+
+
+
+ Neural architectures for named entity recognition
+
+ Guillaume Lample
+
+
+ Miguel Ballesteros
+
+
+ K Sandeep
+
+
+ Kazuya Subramanian
+
+
+ Chris Kawakami
+
+
+ Dyer
+
+
+
+ HLT-NAACL
+
+
+
+
+ Guillaume Lample, Miguel Ballesteros, Sandeep K Subramanian, Kazuya Kawakami, and Chris Dyer. 2016. Neural architectures for named entity recog- nition. In HLT-NAACL.
+
+
+
+
+ Biocreative v cdr task corpus: a resource for chemical disease relation extraction. Database : the journal of biological databases and curation
+
+ Jiao Li
+
+
+ Yueping Sun
+
+
+ Robin J Johnson
+
+
+ Daniela Sciaky
+
+
+ Chih-Hsuan Wei
+
+
+ Robert Leaman
+
+
+ Allan Peter Davis
+
+
+ Carolyn J Mattingly
+
+
+ Thomas C Wiegers
+
+
+ Zhiyong Lu
+
+
+
+
+
+ Jiao Li, Yueping Sun, Robin J. Johnson, Daniela Sci- aky, Chih-Hsuan Wei, Robert Leaman, Allan Peter Davis, Carolyn J. Mattingly, Thomas C. Wiegers, and Zhiyong Lu. 2016. Biocreative v cdr task cor- pus: a resource for chemical disease relation extrac- tion. Database : the journal of biological databases and curation 2016.
+
+
+
+
+ Design challenges for entity linking
+
+ Xiao Ling
+
+
+ Sameer Singh
+
+
+ Daniel S Weld
+
+
+
+ Transactions of the Association for Computational Linguistics
+
+ 3
+
+
+
+
+ Xiao Ling, Sameer Singh, and Daniel S. Weld. 2015. Design challenges for entity linking. Transactions of the Association for Computational Linguistics 3:315-328.
+
+
+
+
+ Distant supervision for relation extraction without labeled data
+
+ Mike Mintz
+
+
+ Steven Bills
+
+
+
+ ACL
+
+
+
+
+ Rion Snow, and Daniel Jurafsky
+ Mike Mintz, Steven Bills, Rion Snow, and Daniel Ju- rafsky. 2009. Distant supervision for relation extrac- tion without labeled data. In ACL.
+
+
+
+
+ GloVe: Global vectors for word representation
+
+ Jeffrey Pennington
+
+
+ Richard Socher
+
+
+ Christopher D Manning
+
+
+
+ EMNLP
+
+
+
+
+ Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global vectors for word rep- resentation. In EMNLP.
+
+
+
+
+ Semi-supervised sequence tagging with bidirectional language models
+
+ Matthew E Peters
+
+
+ Waleed Ammar
+
+
+ Chandra Bhagavatula
+
+
+ Russell Power
+
+
+
+ ACL
+
+
+
+
+ Matthew E. Peters, Waleed Ammar, Chandra Bhagavat- ula, and Russell Power. 2017. Semi-supervised se- quence tagging with bidirectional language models. In ACL.
+
+
+
+
+ Extracting scientific figures with distantly supervised neural networks
+
+ Noah Siegel
+
+
+ Nicholas Lourie
+
+
+ Russell Power
+
+
+ Waleed Ammar
+
+
+
+
+
+ In JCDL
+ Noah Siegel, Nicholas Lourie, Russell Power, and Waleed Ammar. 2018. Extracting scientific figures with distantly supervised neural networks. In JCDL.
+
+
+
+
+ Identifying meaningful citations
+
+ Marco Valenzuela
+
+
+ Vu Ha
+
+
+ Oren Etzioni
+
+
+
+ AAAI Workshop (Scholarly Big Data)
+
+
+
+
+ Marco Valenzuela, Vu Ha, and Oren Etzioni. 2015. Identifying meaningful citations. In AAAI Workshop (Scholarly Big Data).
+
+
+
+
+ Clinical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a systematic review and meta-analysis
+
+ Xiang Wang
+
+
+ Yan Dong
+
+
+ Yi-Ming Xiang Qian Qi
+
+
+ Cheng-Guang Li
+
+
+ Lijun Huang
+
+
+ Hou
+
+
+
+ Critical care
+
+
+
+
+ Xiang Wang, Yan Dong, Xiang qian Qi, Yi-Ming Li, Cheng-Guang Huang, and Lijun Hou. 2013. Clin- ical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a system- atic review and meta-analysis. In Critical care.
+
+
+
+
+ Learning to predict citation-based impact measures
+
+ Luca Weihs
+
+
+ Oren Etzioni
+
+
+
+ JCDL
+
+
+
+
+ Luca Weihs and Oren Etzioni. 2017. Learning to pre- dict citation-based impact measures. In JCDL.
+
+
+
+
+ CiteSeerX: AI in a digital library search engine
+
+ Jian Wu
+
+
+ Kyle Williams
+
+
+ Hung-Hsuan Chen
+
+
+ Madian Khabsa
+
+
+ Cornelia Caragea
+
+
+ Alexander Ororbia
+
+
+ Douglas Jordan
+
+
+ C. Lee Giles
+
+
+
+ AAAI
+
+
+
+
+ Jian Wu, Kyle Williams, Hung-Hsuan Chen, Madian Khabsa, Cornelia Caragea, Alexander Ororbia, Dou- glas Jordan, and C. Lee Giles. 2014. CiteSeerX: AI in a digital library search engine. In AAAI.
+
+
+
+
+ Explicit semantic ranking for academic search via knowledge graph embedding
+
+ Chenyan Xiong
+
+
+ Russell Power
+
+
+ Jamie Callan
+
+
+
+ WWW
+
+
+ Chenyan Xiong, Russell Power, and Jamie Callan. 2017. Explicit semantic ranking for academic search via knowledge graph embedding. In WWW.
+
+
+
+
+
+
+
diff --git a/s2orc-doc2json/tests/jats/PMC5828200.nxml b/s2orc-doc2json/tests/jats/PMC5828200.nxml
new file mode 100644
index 0000000000000000000000000000000000000000..91c0253725351e100cbf17d1e28cc47cceaab51b
--- /dev/null
+++ b/s2orc-doc2json/tests/jats/PMC5828200.nxml
@@ -0,0 +1,2 @@
+
+Oncotarget Oncotarget Oncotarget ImpactJ Oncotarget 1949-2553 Impact Journals LLC 29535835 5828200 24369 10.18632/oncotarget.24369 Research Paper Curcuminoid submicron particle ameliorates cognitive deficits and decreases amyloid pathology in Alzheimer’s disease mouse model Tai Yi-Heng 1 Lin Yu-Yi 1 Wang Kai-Chen 2 Chang Chao-Lin 3 Chen Ru-Yin 3 Wu Chia-Chu 3 Cheng Irene H. 1 4 1 Institute of Brain Science, National Yang-Ming University, Taipei, Taiwan2 Department of Neurology, Cheng-Hsin General Hospital, Taipei, Taiwan3 Food Industry Research and Development Institute, Hsinchu, Taiwan4 Brain Research Center, National Yang-Ming University, Taipei, TaiwanCorrespondence to: Irene H. Cheng, hjcheng@ym.edu.tw 13 2 2018 31 1 2018 9 12 10681 10697 29 8 2017 25 1 2018 Copyright: © 2018 Tai et al. 2018 This article is distributed under the terms of the Creative Commons Attribution License (CC-BY), which permits unrestricted use and redistribution provided that the original author and source are credited. Alzheimer's disease (AD) is the most prevalent neurodegenerative disorder and is triggered via abnormal accumulation of amyloid-β peptide (Aβ). Aggregated Aβ is responsible for disrupting calcium homeostasis, inducing neuroinflammation, and promoting neurodegeneration. In this study, we generated curcuminoid submicron particle (CSP), which reduce the average size to ~60 nm in diameter. CSP had elevated the bioavailability in vivo and better neuroprotective effect against oligomeric Aβ than un-nanosized curcuminoids in vitro . Two months of CSP consumption reversed spatial memory deficits and the loss of a calcium binding protein calbindin-D28k in the hippocampus of AD mouse model. In addition, CSP consumption lowered amyloid plaques and astrogliosis in vivo and enhanced microglial Aβ phagocytosis in vitro , implying that the beneficial effects of CSP also mediated via modulating neuroinflammation and enhancing amyloid clearance. 
Taken together, our study demonstrated the protective effects of CSP toward ameliorating the memory impairment and pathological deficits in AD mouse model.
Alzheimer’s disease curcuminoid submicron particle APP transgenic mouse amyloid curcumin INTRODUCTION Alzheimer's disease (AD) is the most common form of dementia affecting more than 46 million patients worldwide. Abnormal accumulation of extracellular amyloid-β peptide (Aβ) into amyloid plaques in the brain is one of the pathological hallmarks of AD. Aβ is produced through the sequential proteolysis processing of amyloid precursor protein (APP) by β- and γ-secretases. Among different length of Aβ, Aβ40 is the most abundant species and accounts for 90% of total Aβ in the brain. However, Aβ42 is more aggregation-prone and more neurotoxic than other Aβ species, and thus plays the major pathogenic role in AD [1 , 2 ]. Overexpression of Aβ deteriorates the cognitive function; on the contrary, reduced levels of Aβ are often associated with alleviating the cognitive deficits [3 ]. Impaired clearance of Aβ is one of the major factors that result in the cognitive dysfunction in sporadic AD patients [4 ].
Neuroinflammation triggered by the activation of astrocyte and microglia plays a central role in the pathogenesis of AD [5 , 6 ]. Astrocytes are a key regulator of neuroinflammation and important for maintaining neuronal functions. Extensive proliferation of astrocytes induced by Aβ with a reactive phenotype and abnormal regulation leads to cognitive decline in AD [7 ]. Microglia, the primary immune cells of the brain, play an important role in maintaining neuronal function and protecting the brain from insults. The activation of microglia has both beneficial and detrimental roles in AD. Activated microglia could be classified into two phenotypes: M1 inflammatory microglia and M2 anti-inflammatory microglia [8 ]. M1 phenotype microglia can be triggered by Aβ to produce pro-inflammatory cytokines, which drive downstream cytokine storm to induce cytotoxicity [9 ]. In contrast, M2 phenotype microglia play protective roles against Aβ -induced damages [10 –12 ].
Curcuminoid is a group of natural polyphenol consisted of diarylheptanoid compounds derived from the rhizomes of Curcuma Longa , such as curcumin and demethoxycurcumin. Epidemiological studies suggested that curcuminoid consumption is highly associated with the lower prevalence of AD in India [13 , 14 ]. In addition, curcumin could inhibit neuroinflammation and reduce amyloid deposition in AD mouse model [15 –17 ]. However, curcumin intake failed to reduce the amyloid levels in AD patients in the clinical trial [18 ]. The major limitation using curcuminoid as a treatment/protective agent is its low bioavailability, which is caused by its poor water solubility and low absorption rate in the gastrointestinal tract [19 , 20 ]. Several approaches have been applied to overcome this problem, including structural modifications, pharmaceutical adjuvants, liposomes, and nanoparticles [19 , 21 , 22 ]. However, none of them significantly improve the spatial memory deficits in AD mouse model. Nanoparticle technology has emerged as a promising access to enhance bioavailability of lipophilic molecules such as curcumin [22 –24 ]. The advantage of this nanoparticle technology is to reduce the average size of curcuminoid without structure modification that may alter curcuminoid nature properties.
In this study, we produced curcuminoid submicron particle (CSP) to average size around 60 nm in diameter and investigate the neuroprotective effects of CSP in vitro and in vivo . We found that CSP had higher bioavailability, improved spatial learning and memory, and reduced amyloid pathology in APP transgenic mouse. Furthermore, CSP could inhibit neuroinflammation and promote phagocytosis to clear Aβ. Our study suggested a potential use of CSP for future AD intervention.
RESULTS CSP had better protective effect against oligomeric Aβ in vitro To compare the neuroprotective effect, oligomeric Aβ (oAβ) treated SH-SY5Y human neuroblastoma cells were co-incubated with 5 μM curcuminoid submicron particle (CSP) or un-nanosized curcuminoids (C) for 48 hours, and their viability was determined by MTT assay. The survival rate in oAβ treated cells was significantly decreased compared with non-treated cells. Co-incubation with 0.044 and 0.22 μM CSP or un-nanosized curcuminoids significantly reversed oAβ-induced neuronal death. In particular, 0.22 μM CSP treated cells had significantly higher survival rate than un-nanosized curcuminoids under Aβ stress (Figure 1 ). Because CSP had better neuroprotective effect against Aβ in vitro , we further tested the potential use of it to prevent neurodegeneration in vivo .
Figure 1 CSP had better protective effect against oligomeric Aβ (oAβ) than un-nanosized curcuminoid in vitro SH-SY5Y cells were pre-treated with 0, 0.044 or 0.22 μM curcuminoid submicron particle (CSP) or 0.22 μM un-nanosized curcuminoid (C) for 4 hours and then co-treated with 5 μM oAβ for 48 hours. The cell survival rate was determined by MTT assay in 3 independent experiments (N = 8 per experiment). Results were analyzed by one-way ANOVA. *** , P < 0.001; ** , P < 0.01; * , P < 0.05. The survival rate of SH-SY5Y cells treated with vesicle control was set as 1.
![]()
Genotoxicity and biosafety of CSP Before applying to AD animal model, we first examined the biosafety of CSP after oral consumption. The acute genotoxicity of CSP was determined using micronucleus assay. ICR male mice were administrated with 0.03, 0.3, and 3 g/kg of CSP, vesicle (negative control), or cyclophosphamide (positive control). The percentage of micronucleated erythrocytes in plasma was used as an indicator of chemical-induced genotoxicity. The percentage of micronucleated erythrocytes in CSP treated groups had no significant difference in comparison with vesicle control group, suggesting that CSP did not induce significant genomic instability and toxicity (Supplementary Table 1 ).
The biosafety of CSP was determined by administration of 0, 0.1, 0.5, and 1.0 g/kg /day of CSP to male and female Sprague-Dawley (SD) rats for continuous 28 days (short-term) and 90 days (long-term). All groups had no apparent adverse effects and recorded death (Supplementary Table 2 ). Furthermore, there were no significant changes in average body weight (Figure 2A-2B ), and organ weights (Supplementary Table 3-4 ) among these groups.
Figure 2 Body weight of SD rats in the oral toxicity study of CSP Male (black) and female (red) SD rats received 0 (●), 0.1 (▄), 0.5 (▲), and 1 (▼) g/kg/day of CSP (A) for 28 days and (B) 90 days. N = 6 mice/group in 28-day test. N = 8 mice/group in 90-day test. Results were analyzed by one-way ANOVA.
![]()
Bioavailability of CSP Low bioavailability is one of the major hindrances of curcuminoid to be applied as a therapeutic agent [18 ]. To assess the bioavailability of CSP, ICR mice were gavaged with low dose (0.2 g/kg) or high dose (2.5 g/kg) of un-nanosized curcuminoids (C) or curcuminoid submicron particle (CSP), and their plasma were collected at 0, 15, 30, 45, 60, 120, and 300 minutes after gavaging. The level of un-nanosized curcuminoids or CSP in plasma was determined by high-pressure liquid chromatography (HPLC). The pharmacokinetic analysis indicated that the CSP had 35-fold higher absorbability than un-nanosized curcuminoids in high dose group (Table 1 ), illustrating that reducing the particle size could effectively improve the pharmacokinetic properties of curcuminoids.
Table 1 Pharmacokinetic analysis of un-nanosized curcuminoid (C) and curcuminoid submicron particle (CSP) Treatment Dosage (g/kg) Cmax (μg/ml) Tmax (min) AUC (μg/ml∙min) C 0.2 0.47±0.33 45±11 36±12 CSP 5.96±0.72 75±42 1,250±56 Ratio (CSP/C) 12.62 - 35.16 C 2.5 1.83±0.19 48±7 276±21 CSP 12.70±1.01 55±7 1,884±57 Ratio (CSP/C) 6.96 - 6.82
Cmax : peak concentration. Tmax : time to reach peak concentration. AUC: area under curve (plasma concentration-time). C: curcuminoid. CSP: curcuminoid submicron particle. Quantitative data are listed as the mean ± standard deviation.
CSP ameliorated spatial learning and memory deficit of APP mice The neuroprotective effects of CSP in vivo were examined using APP transgenic mouse (line J20), which generates high level of Aβ [25 ] and has age-dependent functional and pathological deficits onset at 4 months of age [26 , 27 ]. APP mice and wild-type (WT) littermate controls were administered with CSP or vesicle control at 0.75 mg/ml in drinking water starting at 3 months of age for two months. The average amount of CSP consumption was 37.5 mg/kg/day. These mice had no significant difference in average body weight (Figure 3A ) and water intake (Figure 3B ) after 2 months treatment. AD-like functional and pathological deficits were examined in following four groups: wild-type fed with vesicle (WT), wild-type fed with CSP (WT/CSP), APP fed with vesicle, and APP fed with CSP (APP/CSP).
Figure 3 Body weight and water intake of WT and APP mice during CSP treatment APP and wildtype littermate control (WT) mice received 0 or 0.75 mg/mL/day of CSP for 2 months. (A) Body weight and (B) water intake were recorded weekly during CSP administration. N = 12-17 mice/group. Results were analyzed by one-way ANOVA.
![]()
To elucidate whether CSP could improve cognitive impairment of APP mice [26 , 28 ], the Morris water maze was adopted to evaluate spatial memory deficits in these 4 groups of mice. In memory acquisition session, the escape latency to reach the hidden platform was longer in APP mice than WT mice. APP/CSP mice exhibited significantly shorter escape latency than APP mice in the last 4 days (Figure 4A ). In probe trial for memory retention, APP mice spent significantly less time in the platform region than WT, and APP/CSP mice spent significantly longer time in the platform region than APP mice (Figure 4B-4C ). There was no difference between WT and WT/CSP groups in all the tests. The swimming speeds among each group had no significant difference (Figure 4D ), suggesting that the reversal of memory deficits in APP/CSP mice was not due to an impairment in motor function. Our results demonstrated that CSP significantly improved both memory acquisition and the memory retention deficits in APP mice.
Figure 4 CSP ameliorated the spatial learning and memory of APP mice in the Morris water maze (A) In hidden platform test, APP/CSP mice had a lower escape latency than APP mice given the control diet. (B) In probe trial, APP/CSP mice spent more time in platform region than APP mice. (C) Representative traces of each group in the probe trial. ○ = platform location. (D) Swimming speed had no significant difference among all four groups. N = 12-17/group. Results were analyzed by one-way ANOVA.* , p < 0.05; *** , p < 0.001.
![]()
CSP did not alter anxiety and locomotor behaviors of APP mice Before and after CSP consumption, we used the elevated plus maze to screen for anxiety-related behavior, and the open field test to monitor anxiety and locomotor activity. Compared with WT mice, APP mice spent more time in the open arm of the elevated plus maze (Figure 5A-5B ), and traveled a longer distance and explored in the center region more frequently in the open field (Figure 5C-5F ), consistent with previous findings [12 , 27 , 28 ]. In the elevated plus maze, 2 months of CSP treatment did not significantly reverse the higher open arm time in APP mice (Figure 5A-5B ). Nevertheless, in the open field, 2 months of CSP treatment reduced the number of center entries (Figure 5C-5D ) but did not alter the total distance moved in APP mice (Figure 5E-5F ). Compared with WT mice, WT/CSP mice had no change in anxiety-related behavioral or locomotor activity. In summary, CSP did not influence locomotor activity and only partially reversed anxiety-related behavioral in APP mice.
Figure 5 Effect of CSP on anxiety-related behavior and locomotor activities of APP mice (A-B) In elevated plus maze, anxiety behavior of these mice was measured by time in open arms before (A) and after (B) CSP treatment. (C-F) In open field, anxiety was measured by the number of center entries (C, D) and locomotor activity was measured by total distance traveled (E, F) before and after CSP consumption. N = 12-17 mice/group. Results were analyzed by two-way ANOVA. * , p < 0.05; ** , p < 0.01; *** , p < 0.001.
![]()
CSP reversed the calbindin-D28K level in the hippocampus of APP mice Memory deficits in APP mice are correlated with the reduced levels of a calcium-binding protein calbindin-D28K and calcium dysregulation in the dentate gyrus [26 , 29 ]. Therefore, the expression of calbindin-D28K was used as a marker to examine calcium homeostasis in our mice. We found that the level of calbindin-D28K was significantly reduced in APP mice compared to WT mice, but this reduction can be alleviated by CSP consumption in APP mice (Figure 6A-6D ), implying that memory decline in APP mice rescued by CSP could be associated with the reversed levels of calbindin-D28K in the dentate gyrus.
Figure 6 CSP reversed the calbindin-D28K level in the hippocampus of APP mice (A) Representative calbindin-D28K images in the hippocampus of WT, APP, APP/CSP mice. (B) Normalized intensity of calbindin-D28K in the dentate gyrus of the hippocampus in each group. N = 6 mice/group, 8-10 brain slices per mouse. Results were analyzed by one-way ANOVA. *** , p < 0.001. Scale bar = 200 μm.
![]()
CSP decreased the amyloid level and astrogliosis in the hippocampus of APP mice Aβ deposition is one of the most important pathological hallmarks of AD. Among different length of Aβ peptides, Aβ42 is more aggregation-prone and more neurotoxic than other Aβ species [3 ]. To investigate whether CSP alters Aβ level in APP mice, we monitored the appearance of amyloid plaques with thioflavin-S staining and the level of Aβ with enzyme-linked immunosorbent assay (ELISA). The number of amyloid plaques (Figure 7A-7B ) and the level of Aβ42 (Figure 7C ) were significantly decreased in the hippocampus of APP/CSP mice compared to those of APP mice. However, there were no significant reductions in total Aβ level (Figure 7D ) and Aβ42 to total Aβ ratio (Figure 7E ) between APP/CSP mice and APP mice. These results indicated that CSP could effectively inhibit the amyloid and neurotoxic Aβ 42 levels in AD.
Figure 7 CSP decreased the amyloid deposition in the hippocampus of APP mice (A) Representative images of β-sheet amyloid plaques in the hippocampus of APP and APP/CSP mice stained by Thioflavin-S. Scale bar = 200 μm. (B) Quantitative analysis of the number of the plaques in the hippocampus. N = 6 mice/group, 6-10 slices per mouse. (C-D) The levels of Aβ42 (C) and total Aβ (D) in the hippocampal lysate were determined by ELISA. (E) Aβ42/total Aβ were unchanged in APP mice treated with CSP. N = 13-17 mice/group. Results were analyzed by t test. ** , p < 0.01; *** , p < 0.001 versus APP mice.
![]()
In addition to amyloid pathology, the inflammatory response in these mice was measured by the activation of astrocyte or microglia. The immunoreactive signals of glial fibrillary acidic protein (GFAP) and ionized calcium-binding adapter molecule 1 (Iba1) were used as astrocyte and microglial markers [30 ]. We found that APP mice had higher GFAP and Iba1 intensity than WT mice. Consumption of CSP reduced the intensity of GFAP positive astrocyte in the hippocampus of APP/CSP mice (Figure 8A-8B ). CSP did not alter the intensity of Iba1-positive microglia in the hippocampus of APP mice (Figure 8C-8D ). Taken together, CSP could mitigate amyloid pathology and inflammatory reaction in APP mice.
Figure 8 CSP inhibited astrocyte activation in the hippocampus of APP mice (A) Representative images of GFAP+ astrocyte activation in the hippocampus of WT, APP and APP/CSP mice. (B) Normalized GFAP intensity in the hippocampus in each group. (C) Representative images of Iba1+ microglia in the hippocampus of WT, APP and APP/CSP mice. (D) The normalized Iba1 intensity in the hippocampus in each group. N = 6 mice/group, 8-10 brain slices per mouse. Results were analyzed by one-way ANOVA. * , p < 0.05; ** , p < 0.01; *** , p < 0.001. Scale bar = 200 μm.
![]()
CSP promoted the microglial phagocytosis and Aβ uptake in BV2 microglia Microglia activation could promote phagocytosis to clear Aβ. Although CSP did not reduce activated microglia, CSP decreased the number of plaques in the hippocampus of APP mice. Therefore, we further identified whether CSP could affect the Aβ clearance through enhancing the microglial phagocytosis [3 ]. To address this question, the BV2 microglial cell was treated with 0 or 2.2 μM CSP for 1 hour and followed by adding 0.001% fluorescent microspheres beads (Figure 9A-9B ). We found that CSP treated cells had significantly higher percentage of phagocytosed cells (Figure 9C ). Furthermore, after co-incubating oAβ with or without CSP, the Aβ level in the medium of Aβ+CSP-treated microglia was significantly lower than Aβ only microglia (Figure 9D ). These results demonstrated that CSP enhances the Aβ -clearance ability of microglia, thereby ameliorating Aβ-induced neurodegeneration.
Figure 9 CSP increased phagocytosis of BV2 microglia (A-B) Representative images of ingested microspheres (red) in the BV2 cells (green) treated with 0 or 2.2 μM CSP for 1 hr. Scale bar = 25 μm. (C) CSP significantly increased the phagocytosis index. N = 11,480 cells in control group; N = 18,199 cells in CSP treated group. (D) BV2 cells were treated with 1 μM oAβ and 0 or 2.2 μM CSP for 24 hrs. Levels of residual Aβ in the media were lower in the CSP treated group. Results were analyzed by t test. * , p < 0.05; ** , p < 0.01.
![]()
We further determined whether CSP could modulate the ratio of M1/M2 phenotype of microglia. The exposure of BV2 to 5 μM oAβ could significantly increase the inducible nitric oxide synthase (iNOS), which is one of the direct consequences of an inflammatory process and is commonly used as a marker for M1 microglia [31 , 32 ]. The iNOS level was decreased in BV2 microglia co-treated with 2.2 μM CSP and oAβ (Figure 10A-10B ). However, there was no change in M2 anti-inflammatory markers YM1 and IL-4 (Figure 10C-10D ). These results suggested that CSP may decrease neuroinflammation but did not alter M1/M2 microglia phenotypes.
Figure 10 CSP reduced the M1 but did not change the M2 type of microglia under oAβ stress BV2 microglia was treated with 5 μM oAβ and 2.2 μM CSP for 24 hours. (A) Representative immunoblot images for pro-inflammatory M1 type microglia marker iNOS in BV2 cells. (B) The level of iNOS in oAβ+CSP treated microglia was significantly lower than oAβ treated microglia. (C-D) Expression of anti-inflammatory M2 type microglia markers YM1 and IL-4 RNA had no significant difference among all groups. Results were analyzed by one-way ANOVA. ** , p < 0.01; *** , p < 0.001.
![]()
CSP did not have anti-aggregation effect on Aβ The reduced Aβ level in APP/CSP mice may also be due to the inhibition of Aβ aggregation by CSP. Curcumin has been reported to reduce the amounts of higher molecular Aβ aggregates [16 ]. To monitor the effect of CSP on Aβ aggregation, 5 μM monomeric Aβ was incubated with 0, 22, 44, 220, 440 μM CSP for 24 and 48 hours and subjected to immunoblot analysis. We found that Aβ aggregates into high molecular weight assemblies (>180 kDa) faster in the presence of CSP, indicating that the reduction of Aβ deposition in the APP/CSP mice is not due to the blockage of Aβ aggregation (Figure 11 ).
Figure 11 CSP did not inhibit Aβ aggregation in vitro Representative images of Aβ aggregation states in the presence of CSP. 5μM monomeric Aβ were co-incubated with 0, 22, 44, 220, and 440 μM CSPs for (A) 24 hours and (B) 48 hours. The size distribution of aggregated Aβ was immediately examined by western blot.
![]()
DISCUSSION This study demonstrated that the nano-sized curcuminoid, CSP, had higher bioavailability and better neuroprotective effects than un-nanosized curcuminoid. CSP treatment significantly ameliorated the cognitive function, reduced the amyloid deposition, decreased astrogliosis, reversed calbindin-D28k and enhanced microglial phagocytosis. These findings indicate that CSP has potential to be applied as a prevention agent for AD. Curcumin and curcuminoid do not induce apparent adverse effects up to 8,000 mg/day in healthy adults [33 –35 ]. We demonstrated that both short term and long term CSP consumption is safe under 1000 mg/kg/day. In this study, the amount of CSP consumption for the mouse was 187.5 mg/kg/day, which is approximately equated to 15.2 mg/kg/day for human [36 ]. For a 60 kg adult, daily intake of CSP needs to be 912 mg to reach the similar neuroprotective effect, which is within the safe range for curcuminoid [37 –39 ].
Multiple approaches have been applied to increase the bioavailability and enhance the neuroprotective effect of curcumin or curcuminoid. The two most common approaches were structural modification and size reduction [22 –24 ]. After oral gavage, the plasma concentration of CSP was 6-35 times higher than un-nanosized curcuminoid, which is similar or even better than the bioavailability of curcumin modified by other approaches [22 –24 ]. Whether curcumin could reverse the memory-related deficits in AD mouse models have diverse results. In other curcumin treated AD mouse model, although curcumin could reverse working memory deficits in Y-maze test [38 , 40 ], it did not significantly improve the memory retention deficits in the probe trial of the Morris water maze test, which is more related to the condition in AD patients [37 , 39 , 41 ]. The PLGA nanoparticles modified curcumin could only attenuate memory deficits when co-delivering with Aβ generation inhibitor through intraperitoneal injection to AD mouse model [42 ]. In our study, APP/CSP mice performed significantly better in both hidden platform and probe trial tests, suggesting that the improvement in both memory acquisition and retention of AD mouse model. Our results indicated that the therapeutic efficacy of orally administered CSP was greatly increased in comparison with previous studies.
Aβ -induced neuroinflammation is mostly mediated through CNS-resident cells, including astrocyte and microglia, rather than invading immune cells [43 ]. Reactive astrocytes cause disruptions in synaptic connectivity, imbalance of neurotransmitter homeostasis, and neurodegeneration in AD [44 , 45 ]. Furthermore, the degree of astrogliosis is correlated with cognitive decline in the brain of AD patients [46 ]. In our study, CSP could inhibit reactive astrocyte activation in APP mice, suggesting that CSP could reduce astrocyte-mediated neuroinflammation. On the other hand, although CSP consumption did not alter the intensity of Iba1+ microglia in APP mice, CSP treatment enhanced phagocytic percentage in BV2 microglia cells. Therefore, the reduction of amyloid plaques in the hippocampus of APP mice might be mediated by CSP through the microglial engulfment. Dysfunction of microglial phagocytosis in AD patients has been linked to the disrupted clearance of Aβ, and the enhanced memory impairment [47 , 48 ]. A curcuminoid compound (bisdemethoxycurcumin) has been shown to increase the Aβ phagocytosis in the brains of AD patients [49 ]. Our results provided consistent evidence that CSP enhances microglial phagocytosis.
Calcium-binding proteins could regulate calcium homeostasis and protect neuron against calcium-mediated neurotoxicity [50 ]. The disruption of calcium-binding proteins signaling impairs the synaptic function [51 ]. In the hippocampus, calbindin-D28k containing neurons play roles in memory formation and long-term potentiation [52 ]. The level of calbindin-D28k is highly correlated with memory retention deficits of APP mice [26 ]. In our study, the reduced expression of calbindin-D28k in APP mice was reversed after CSP consumption, which provided the first indication linking the effects of curcuminoid to the levels of calbindin-D28k in AD model.
In summary, this study demonstrated the beneficial effects of CSP on spatial memory deficits and pathological changes in APP mice. CSP can be easily administered to animal model in drinking water as a stable suspension without noticeable adverse effects. Altogether, CSP could be a potential food supplement for long-term treatment of AD.
MATERIALS AND METHODS Animals Short-term (28-day) and long-term (90-day) biosafety tests were performed on male and female SD rats. Micronucleus assay was performed on ICR mice. AD mouse model used in this study is APP transgenic mice (line J20) carrying the human APP minigene with the Swedish (K670N/M671L) and Indiana (V717F) familial mutations. Animals were housed in a specific pathogen-free facility with a light/dark cycle of 12 hours light and 12 hours dark. Food and water for mice were provided ad libitum. Drinking water with 0.75 mg/mL CSP was administered to mice from 3 to 5 months of age. The open field and elevated plus maze were performed before and after treatment, and the Morris water maze was carried out after 2 months of CSP consumption. Mice were sacrificed with transcardial perfusion with 0.9% NaCl 2 days after behavioral tests. One hemibrain was drop-fixed in 4% paraformaldehyde for 48 h, and the other hemibrain was immediately frozen at −70°C. The study was approved by the Institutional Animal Care and Use Committee of National Yang-Ming University. All experimental procedures involving animals and their care were carried out in accordance with the Guide for the Care and Use of Laboratory Animals published by the United States National Institutes of Health.
Preparation of aqueous dispersion with CSP To prepare stabilizer for CSP, 2.5 g L-α-phosphatidylcholine (P7443, Sigma, USA) and 3.42 g sucrose esters (Gemfont Corporation, Taiwan) were sequentially incorporated into 400 mL water. The mixed stabilizer materials were stirred at 25°C, and 40 g curcuminoid powder with curcumin, demethoxycurcumin, and bisdemethoxycurcumin (Toong Yeuan, Taiwan) were added to form a 10 % curcuminoid aqueous solution. This non-homogenously mixed solution was subjected to a high-speed homogenization pretreatment from 4,000 to 6,000g for 10 minutes using a PRO250 homogenizer (PRO Scientific, USA). Next, a nano-grade wet grinder (Netzsch-Fein mahltechnik GmbH, Germany) carried on yttria-stabilized tetragonal zirconia for circulation milling with 0.2 mm beads for 180 minutes to obtain the aqueous dispersion. The average diameter of un-nanosized curcuminoid was 5140±178 nm, and the average diameter of CSP was 59±1 nm. Finally, the nanosized CSP composed of curcumin (83.56%), demethoxycurcumin (14.13%) and bisdemethoxycurcumin (2.31%) was obtained. Right before oral administration, CSP was diluted into the drinking water at concentration 0.75 mg/mL. The vesicle control in this study contain the same stabilizer and went through the same preparation process without adding curcuminoid.
Bioavailability and biosafety analysis For the pharmacokinetics analysis, ICR mice were administered by oral gavage with 0.2 g/kg or 2.5 g/kg of un-nanosized curcuminoid or CSP. At 15, 30, 45, 60, 120, and 300 minutes after gavage, plasma was collected and processed with sulfatase for 2 hours. The level of curcuminoid in plasma was determined by high-pressure liquid chromatography (HPLC).
For biosafety test, including genotoxic analysis, short-term and long-term tests were adopted to investigate the potentially harmful effects of CSP. Micronucleation assay was applied to identify the genotoxicity of CSP. ICR mice were administered orally with 0, 0.03, 0.3, 3.0 g/kg of CSP or 0.1 g/kg cyclophosphamide (positive control) for 48 hours. 200 μL plasma was incubated with 50 μL 100 U/mL sulfatase for 2 hours, and 120 μL processed sample was then fixed by frozen methanol (−80°C). 12 mL buffer (1.8g NaCl and 0.089g NaHCO3 in 200 mL sterile water, 4°C) was added to the fixed sample and centrifuged at 1,000 ×g for 5 minutes to remove supernatant. Cells were resuspended with 80 μL buffer (10 μL/mL CD71-FITC and 1 mg/mL RNase) for 30 minutes at 4°C, and then 1 mL protease inhibitor (2.5 μg/mL) was added. The percentage of micronucleated reticulocytes in the blood was analyzed by flow cytometry (Becton Dickinson FACSCalibur and CellQuest).
In short-term and long-term safety tests, SD rats were administered with 0.1, 0.5 and 1.0 g/kg/day CSP, or sterile water, daily for 28 days and 90 days. The clinical condition, body weights, organ weights studies were performed after 28 days and 90 days.
Morris water maze The water maze consisted of a water pool (122 cm in diameter) containing opaque water and a platform (10 cm in diameter) submerged 1 cm below the water surface. The hidden platform test consisted of 10 sessions over 5 days, and each session comprised three 60-second trials with 15-minute inter-trial intervals. The platform location remained constant during the hidden platform sessions, and the entry points were changed semi-randomly between days. One day after the final day of the hidden platform training sessions, a probe trial was conducted by removing the platform and allowing mice to explore in the pool for 1 minute. The time spent in platform region and swim speed were recorded and analyzed with an EthoVision video tracking system (Noldus, Wageningen, Netherlands).
Elevated plus maze Elevated plus maze consists of two open arms, two closed arms, and a center area. Mice were habituated in the testing room for 1 hour and then were placed individually at the center of the apparatus to explore for 10 minutes. The time spent and distances moved in each of the arms were recorded and analyzed with an EthoVision video tracking system.
Open field Mice were habituated in the testing room for 1 hour and were placed in an open arena (24.32 × 24.32 cm2 ) for 15 minutes. Two infrared photobeam sensor frames, each consisting of a 32 × 32 photobeam array, were used to detect movements in the horizontal and vertical plane (Version 2.0, TRU Scan Photobeam LINC, Coulbourn Instruments, PA, USA). The distance mice traveled and center entries were used as parameters to analyze the general activity and anxiety of mice.
Enzyme-linked immunosorbent assay (ELISA) Frozen hippocampi were homogenized in 5M guanidine/5mM Tris buffer (pH 8.0). The samples were diluted with 0.25 % casein blocking buffer containing 0.5 M guanidine and protease inhibitor mix (04693116001, Roche, Switzerland). Total Aβ and Aβ42 levels in the soluble fraction were analyzed with ELISA kits (27729 and 27711, INL, Germany) according to the manufacturer's instructions.
Immunohistochemistry and thioflavin-S staining Paraformaldehyde-fixed brains were sectioned coronally (at 20μm thickness) using a sliding microtome (CM1900; Leica, Germany). For immunohistochemistry (IHC), slices were blocked with phosphate-buffered saline (PBS) containing 10% fetal bovine serum (FBS) and 0.5% Triton X-100 for 1.5 hours, and incubated with anti-GFAP (Z0334; Dako Cytomation, Denmark), anti-Iba1 (019-19741; Wako, Japan), anti-YM1 (#01404, Stemcell Technologies, Canada) and anti-Calbindin-D28K (CB38; Swant, Switzerland) antibodies at 4°C overnight. Slices were then incubated with Alexa Fluor 488-conjugated AffiniPure Goat anti-rabbit IgG secondary antibody (111–545–003; Jackson ImmunoResearch, USA) for 1.5 hours. For Thioflavin-S staining, slices were stained with 0.015% Thioflavin-S (T1892; Sigma, USA) for 15 minutes at room temperature. After mounting, slides were imaged using a Zeiss fluorescence microscope (Axio Observer A1; Zeiss, Germany).
Cell culture SH-SY5Y human neuroblastoma cell line was maintained in Minimum Essential Medium (MEM, 41500–034; Gibco, USA) plus F-12 nutrient mixture (21700–075; Gibco, USA) supplemented with 10% fetal bovine serum (FBS, SH3007; HyClone, USA), 0.11 g/L sodium pyruvate, and 1.69 g/L sodium bicarbonate. BV2 microglia cell line was maintained in Dulbecco's modified Eagle's medium (DMEM, 12100–046; Gibco, USA) supplemented with 10% FBS, 1% L-glutamine (GLL01; Caisson laboratory, USA), and 1.85 g/L sodium bicarbonate. Cell lines were grown at 37°C in a humidified 5% CO2 chamber.
In vitro Aβ aggregation HFIP-treated Aβ (Ultra-pure Aβ42, HFIP, A1163-2; Kelowna) was dissolved with DMSO, and the Aβ-DMSO solution was then added into 10 mM Tris/PBS buffer to form 100 μM Aβ solution. The pellet was then removed from the Aβ solution by centrifugation (17000 ×g, 15 min, 4°C). The supernatant of 100 μM Aβ solution was then kept at 4°C for 24 h to form oligomer Aβ. The oligomeric Aβ was characterized by using immunoblotting (Figure 11A , CSP=0 μM).
To determine whether CSP could alter Aβ aggregation, 5 μM HFIP-Aβ was incubated with 0, 22, 44, 220, and 440 μM CSP for 24 and 48 hours, and their aggregation status was determined by immunoblotting.
Immunoblotting Cell lysates (10 μg of total protein) or aggregated Aβ peptide were separated via 10% Tris-glycine polyacrylamide gel electrophoresis and transferred to polyvinylidene difluoride (PVDF, IPVH00010; Millipore, Germany) membranes. Membranes were blocked in casein blocking buffer (B6429; Sigma Aldrich, USA) for 1 hour and probed with primary anti-iNOS (610328; BD Bioscience, USA), anti-GAPDH (G8795; Sigma Aldrich, USA) or anti-Aβ (6E10, SIG-39320; Covance, USA) antibodies. Membranes were washed with TBST buffer (150 mM NaCl, 10 mM Tris-HCl, and 0.05% Tween-20, pH 8.0) and probed with horseradish peroxidase (HRP) conjugated goat anti-mouse IgG and goat anti-rabbit IgG (12–349, 12-348; Merck Millipore, Germany). Protein signals were visualized using a chemiluminescent HRP substrate ECL detection system (WBKLS0500; Merck Millipore, Germany) and quantified by a luminescence imaging system (LAS-4000; Fujifilm, Japan).
MTT assay SH-SY5Y cells reaching 75% confluence were incubated with different concentrations of CSP or un-nanosized curcuminoid for 4 hours at 37°C, and then oligomeric Aβ was added at a final concentration of 5 μM. After 48 hours incubation, medium was removed and 10 μL of 3-(4, 5-Dimethylthiazol-2-yl)-2, 5-diphenyltetrazolium bromide (MTT, 10 mg/mL) solution was added for 4 hours incubation. Cells were then lysed with 100 μl of lysis buffer (10% SDS and 20 mM HCl) at 37°C overnight. Cell survival was determined according to the optical density at 570 nm with the ELISA reader (TECAN Sunrise™ Absorbance Reader, Switzerland).
Phagocytosis assay BV2 microglial cells were seeded in 24-well plates at a density of approximately 8×104 cells in each well. BV2 cells were incubated in the presence or absence of 2.2 μM CSP for 1 hour and then incubated with 0.001% fluorescent microspheres beads (F8821; Molecular Probes, USA) coated with fetal calf serum for 3 hours incubation at 37°C. Cells were washed with PBS for 3 times and then stained with Iba1 antibody. The cells and beads were visualized using fluorescence microscopy. The average number of ingested beads per cells was determined as phagocytosis index.
For Aβ clearance, BV2 microglia cells were seeded at a density of 2×105 cells/well on poly-D-lysine coated coverslips. Attached microglia were treated with 1 μM oAβ with 0 or 2.2 μM CSP for 24 hours. The levels of Aβ remaining in the media were determined by Aβ42 ELISA.
Quantitative real-time PCR (Q-PCR) The RNA from oAβ and CSP treated microglia was purified using TRI reagent (T9424, Sigma, MO, USA), and then immediately reverse transcribed into cDNA by MMLV high-performance reverse transcriptase (RT80125K, Epicentre, WI, USA). The mRNA expression levels were analyzed by using primers mixed with SYBR Green PCR Master Mix (10476600, Roche, Penzberg, Germany). A StepOnePlus Real-Time PCR System (Applied Biosystem, ABI, MA, USA) was used to monitor the changes of fluorescence intensity from PCR products. GAPDH was used as internal control. Primer sequences were: IL-4 F: 5’GAC GCC ATG CAC GGA GAT3’, R: 5’TCT CTG TGG TGT TCT TCG TTG CT3’; YM1 F: 5’TTC TGG TGA AGG AAA TGC GTA AA3’, R: 5’GCA GCC TTG GAA TGT CTT TCT C3’; GAPDH F: 5’GCA TCC ACT GGT GCT GCC3’, R: 5’TCA TCA TAC TTG GCA GGT TTC3’. The data were analyzed using StepOne software version 2.0.
Statistical analysis Statistical analyses were performed with GraphPad Prism (Version 5.0; GraphPad, USA). Differences among multiple means were assessed by one-way, two-way ANOVA, followed by Bonferroni's post-hoc test or Tukey's multiple comparison test. Differences between two means were assessed by paired or unpaired t test. The threshold for significance was defined as p< 0.05. All data are presented as mean ±SEM.
SUPPLEMENTARY MATERIALS TABLES ACKNOWLEDGMENTS AND FUNDING Behavioral studies were carried out at the Animal Behavioral Core at Brain Research Center, National Yang-Ming University. The technical services of confocal images were provided by Imaging Core Facility of Nanotechnology of the UST-NYMU.
This work was supported by Taiwan Ministry of Science and Technology grant (MOST-105-2320-B-010-031), National Health Research Institutes (NHRI-EX106-10614NI), Cheng Hsin General Hospital (105F003C27), Yen Tjing Ling Medical Foundation (CI-106-2), Ministry of Economic Affairs, ROC (103-EC-17-A-03-04-0332), and Taiwan Ministry of Education Aim for Top University Grant.
Author contributions
YHT and YYL performed the AD-related experiment and wrote the manuscript. CLC, RYC, and CCW prepared CSP and performed bioavailability and biosafety tests. IHC and KCW designed the experiment and wrote the manuscript.
CONFLICTS OF INTEREST
The authors declare no conflicts of interest.
REFERENCES 1 Mori H Takio K Ogawara M Selkoe DJ Mass spectrometry of purified amyloid beta protein in Alzheimer's disease J Biol Chem 1992 267 17082 6 1512246 2 Bitan G Kirkitadze MD Lomakin A Vollers SS Benedek GB Teplow DB Amyloid beta -protein (abeta) assembly: abeta 40 and abeta 42 oligomerize through distinct pathways Proc Natl Acad Sci U S A 2003 100 330 5 12506200 3 Querfurth HW LaFerla FM Alzheimer's disease N Engl J Med 2010 362 329 44 20107219 4 Mawuenyega KG Sigurdson W Ovod V Munsell L Kasten T Morris JC Yarasheski KE Bateman RJ Decreased clearance of CNS amyloid-β in Alzheimer's disease Science 2010 330 1774 21148344 5 Becher B Spath S Goverman J Cytokine networks in neuroinflammation Nat Rev Immunol 2017 17 49 59 27916979 6 Heneka MT Carson MJ Khoury JE Landreth GE Brosseron F Feinstein DL Jacobs AH Wyss-Coray T Vitorica J Ransohoff RM Herrup K Frautschy SA Finsen B Neuroinflammation in Alzheimer's disease Lancet Neurol 2015 14 388 405 25792098 7 Rodríguez-Arellano JJ Parpura V Zorec R Verkhratsky A Astrocytes in physiological aging and Alzheimer's disease Neuroscience 2016 323 170 82 25595973 8 Tang Y Le W Differential roles of M1 and M2 microglia in neurodegenerative diseases Mol Neurobiol 2016 53 1181 94 25598354 9 Prokop S Miller KR Heppner FL Microglia actions in Alzheimer's disease Acta Neuropathol 2013 126 461 77 24224195 10 Heneka MT Kummer MP Stutz A Delekate A Schwartz S Vieira-Saecker A Griep A Axt D Remus A Tzeng TC Gelpi E Halle A Korte M NLRP3 is activated in Alzheimer's disease and contributes to pathology in APP/PS1 mice Nature 2013 493 674 8 23254930 11 Cherry JD Olschowka JA O'Banion MK Neuroinflammation and M2 microglia: the good, the bad, and the inflamed J Neuroinflammation 2014 11 98 24889886 12 Liu YL Chen WT Lin YY Lu PH Hsieh SL Cheng IH Amelioration of amyloid-β-induced deficits by DcR3 in an Alzheimer's disease model Mol Neurodegener 2017 12 30 28438208 13 Chandra V Pandav R Dodge HH Johnston JM Belle SH DeKosky ST Ganguli M 
Incidence of Alzheimer's disease in a rural community in india: the indo-us study Neurology 2001 57 985 9 11571321 14 Vas CJ Pinto C Panikker D Noronha S Deshpande N Kulkarni L Sachdeva S Prevalence of dementia in an urban indian population Int Psychogeriatr 2001 13 439 50 12003250 15 Lim GP Chu T Yang FS Beech W Frautschy SA Cole GM The curry spice curcumin reduces oxidative damage and amyloid pathology in an Alzheimer transgenic mouse J Neurosci 2001 21 8370 7 11606625 16 Yang F Lim GP Begum AN Ubeda OJ Simmons MR Ambegaokar SS Chen PP Kayed R Glabe CG Frautschy SA Cole GM Curcumin inhibits formation of amyloid beta oligomers and fibrils, binds plaques, and reduces amyloid in vivo J Biol Chem 2005 280 5892 901 15590663 17 Frautschy SA Cole GM Why pleiotropic interventions are needed for Alzheimer's disease Mol Neurobiol 2010 41 392 409 20437209 18 Ringman JM Frautschy SA Teng E Begum AN Bardens J Beigi M Gylys KH Badmaev V Heath DD Apostolova LG Porter V Vanek Z Marshall GA Oral curcumin for Alzheimer's disease: tolerability and efficacy in a 24-week randomized, double blind, placebo-controlled study Alzheimers Res Ther 2012 4 43 23107780 19 Anand P Kunnumakkara AB Newman RA Aggarwal BB Bioavailability of curcumin: problems and promises Mol Pharm 2007 4 807 18 17999464 20 Begum AN Jones MR Lim GP Morihara T Kim P Heath DD Rock CL Pruitt MA Yang F Hudspeth B Hu S Faull KF Teter B Curcumin structure-function, bioavailability, and efficacy in models of neuroinflammation and Alzheimer's disease J Pharmacol Exp Ther 2008 326 196 208 18417733 21 Dolai S Shi W Corbo C Sun C Averick S Obeysekera D Farid M Alonso A Banerjee P Raja K “Clicked” sugar-curcumin conjugate: modulator of amyloid-beta and tau peptide aggregation at ultralow concentrations ACS Chem Neurosci 2011 2 694 9 22860163 22 Mourtas S Canovi M Zona C Aurilia D Niarakis A La Ferla B Salmona M Nicotra F Gobbi M Antimisiaris SG Curcumin-decorated nanoliposomes with very high affinity for amyloid-beta1-42 
peptide Biomaterials 2011 32 1635 45 21131044 23 Gao Y Li Z Sun M Guo C Yu A Xi Y Cui J Lou H Zhai G Preparation and characterization of intravenously injectable curcumin nanosuspension Drug Deliv 2011 18 131 42 20939679 24 Prasad S Tyagi AK Aggarwal BB Recent developments in delivery, bioavailability, absorption and metabolism of curcumin: the golden pigment from golden spice Cancer Res Treat 2014 46 2 18 24520218 25 Mucke L Masliah E Yu GQ Mallory M Rockenstein EM Tatsuno G Hu K Kholodenko D Johnson-Wood K McConlogue L High-level neuronal expression of abeta 1-42 in wild-type human amyloid protein precursor transgenic mice: synaptotoxicity without plaque formation J Neurosci 2000 20 4050 8 10818140 26 Palop JJ Jones B Kekonius L Chin J Yu GQ Raber J Masliah E Mucke L Neuronal depletion of calcium-dependent proteins in the dentate gyrus is tightly linked to Alzheimer's disease-related cognitive deficits Proc Natl Acad Sci U S A 2003 100 9572 7 12881482 27 Cheng IH Scearce-Levie K Legleiter J Palop JJ Gerstein H Bien-Ly N Puoliväli J Lesné S Ashe KH Muchowski PJ Mucke L Accelerating amyloid-beta fibrillization reduces oligomer levels and functional deficits in Alzheimer disease mouse models J Biol Chem 2007 282 23818 28 17548355 28 Chang WH Chen MC Cheng IH Antroquinonol lowers brain amyloid-β levels and improves spatial learning and memory in a transgenic mouse model of Alzheimer's disease Sci Rep 2015 5 15067 26469245 29 Chin J Palop JJ Puolivali J Massaro C Bien-Ly N Gerstein H Scearce-Levie K Masliah E Mucke L Fyn kinase induces synaptic and cognitive impairments in a transgenic mouse model of Alzheimer's disease J Neurosci 2005 25 9694 703 16237174 30 Wang Y Yin H Wang L Shuboy A Lou J Han B Zhang X Li J Curcumin as a potential treatment for Alzheimer's disease: a study of the effects of curcumin on hippocampal expression of glial fibrillary acidic protein Am J Chin Med 2013 41 59 70 23336507 31 Kraft AD Harry GJ Features of microglia and neuroinflammation 
relevant to environmental exposure and neurotoxicity Int J Environ Res Public Health 2011 8 2980 3018 21845170 32 Hensley K Maidt ML Yu Z Sang H Markesbery WR Floyd RA Electrochemical analysis of protein nitrotyrosine and dityrosine in the Alzheimer brain indicates region-specific accumulation J Neurosci 1998 18 8126 32 9763459 33 Lao CD Ruffin MT 4th Normolle D Heath DD Murray SI Bailey JM Boggs ME Crowell J Rock CL Brenner DE Dose escalation of a curcuminoid formulation BMC Complement Altern Med 2006 6 10 16545122 34 Cheng AL Hsu CH Lin JK Hsu MM Ho YF Shen TS Ko JY Lin JT Lin BR Ming-Shiang W Yu HS Jee SH Chen GS Phase I clinical trial of curcumin, a chemopreventive agent, in patients with high-risk or pre-malignant lesions Anticancer Res 2001 21 2895 900 11712783 35 Mishra S Palanivelu K The effect of curcumin (turmeric) on Alzheimer's disease: an overview Ann Indian Acad Neurol 2008 11 13 9 19966973 36 Reagan-Shaw S Nihal M Ahmad N Dose translation from animal to human studies revisited FASEB J 2008 22 659 61 17942826 37 Frautschy SA Hu W Kim P Miller SA Chu T Harris-White ME Cole GM Phenolic anti-inflammatory antioxidant reversal of Abeta-induced cognitive deficits and neuropathology Neurobiol Aging 2001 22 993 1005 11755008 38 Zhang L Fang Y Xu Y Lian Y Xie N Wu T Zhang H Sun L Zhang R Wang Z Curcumin improves amyloid beta-peptide (1-42) induced spatial memory deficits through BDNF-ERK signaling pathway PLoS One 2015 10 e0131525 26114940 39 Dong S Zeng Q Mitchell ES Xiu J Duan Y Li C Tiwari JK Hu Y Cao X Zhao Z Curcumin enhances neurogenesis and cognition in aged rats: implications for transcriptional interactions related to growth and synaptic plasticity PLoS One 2012 7 e31211 22359574 40 Ma QL Yang F Rosario ER Ubeda OJ Beech W Gant DJ Chen PP Hudspeth B Chen C Zhao Y Vinters HV Frautschy SA Cole GM Beta-amyloid oligomers induce phosphorylation of tau and inactivation of insulin receptor substrate via c-Jun N-terminal kinase signaling: suppression by 
omega-3 fatty acids and curcumin J Neurosci 2009 29 9078 89 19605645 41 Wang C Zhang X Teng Z Zhang T Li Y Downregulation of PI3K/Akt/mTOR signaling pathway in curcumin-induced autophagy in APP/PS1 double transgenic mice Eur J Pharmacol 2014 740 312 20 25041840 42 Huang N Lu S Liu XG Zhu J Wang YJ Liu RT PLGA nanoparticles modified with a BBB-penetrating peptide co-delivering Aβ generation inhibitor and curcumin attenuate memory deficits and neuropathology in Alzheimer's disease mice Oncotarget 2017 8 81001 13 https://doi.org/10.18632/oncotarget.20944 29113362 43 Heppner FL Ransohoff RM Becher B Immune attack: the role of inflammation in Alzheimer disease Nat Rev Neurosci 2015 16 358 72 25991443 44 Carson MJ Thrash JC Walter B The cellular response in neuroinflammation: the role of leukocytes, microglia and astrocytes in neuronal death and survival Clin Neurosci Res 2006 6 237 45 19169437 45 Sofroniew MV Molecular dissection of reactive astrogliosis and glial scar formation Trends Neurosci 2009 32 638 47 19782411 46 Simpson JE Ince PG Lace G Forster G Shaw PJ Matthews F Savva G Brayne C Wharton SB MRC Cognitive Function and Ageing Neuropathology Study Group Astrocyte phenotype in relation to Alzheimer-type pathology in the ageing brain Neurobiol Aging 2010 31 578 90 18586353 47 Fiala M Lin J Ringman J Kermani-Arab V Tsao G Patel A Lossinsky AS Graves MC Gustavson A Sayre J Sofroni E Suarez T Chiappelli F Bernard G Ineffective phagocytosis of amyloid-beta by macrophages of Alzheimer's disease patients J Alzheimers Dis 2005 7 221 32 discussion 55-62 16006665 48 Bard F Cannon C Barbour R Burke RL Games D Grajeda H Guido T Hu K Huang J Johnson-Wood K Khan K Kholodenko D Lee M Peripherally administered antibodies against amyloid beta-peptide enter the central nervous system and reduce pathology in a mouse model of Alzheimer disease Nat Med 2000 6 916 9 10932230 49 Fiala M Liu PT Espinosa-Jeffrey A Rosenthal MJ Bernard G Ringman JM Sayre J Zhang L Zaghi J Dejbakhsh S 
Chiang B Hui J Mahanian M Innate immunity and transcription of MGAT-III and toll-like receptors in Alzheimer's disease patients are improved by bisdemethoxycurcumin Proc Natl Acad Sci U S A 2007 104 12849 54 17652175 50 Nagerl UV Mody I Jeub M Lie AA Elger CE Beck H Surviving granule cells of the sclerotic human hippocampus have reduced Ca(2+) influx because of a loss of calbindin-D(28k) in temporal lobe epilepsy J Neurosci 2000 20 1831 6 10684884 51 Kook SY Jeong H Kang MJ Park R Shin HJ Han SH Son SM Song H Baik SH Moon M Yi EC Hwang D Mook-Jung I Crucial role of calbindin-D28k in the pathogenesis of Alzheimer's disease mouse model Cell Death Differ 2014 21 1575 87 24853300 52 Molinari S Battini R Ferrari S Pozzi L Killcross AS Robbins TW Jouvenceau A Billard JM Dutar P Lamour Y Baker WA Cox H Emson PC Deficits in memory and hippocampal long-term potentiation in mice with reduced calbindin D28K expression Proc Natl Acad Sci U S A 1996 93 8028 33 8755597
\ No newline at end of file
diff --git a/s2orc-doc2json/tests/jats/PMC6398430.nxml b/s2orc-doc2json/tests/jats/PMC6398430.nxml
new file mode 100644
index 0000000000000000000000000000000000000000..a425a17cbae6c44de6774c27a94af4770f916ea4
--- /dev/null
+++ b/s2orc-doc2json/tests/jats/PMC6398430.nxml
@@ -0,0 +1,279 @@
+
+Behav Ecol Behav. Ecol beheco Behavioral Ecology 1045-2249 1465-7279 Oxford University Press UK 30846892 6398430 10.1093/beheco/ary157 ary157 Original Articles Counting crows: population structure and group size variation in an urban population of crows http://orcid.org/0000-0002-3609-5983 Uhl Florian 1 http://orcid.org/0000-0002-4530-4919 Ringler Max 2 3 Miller Rachael 1 4 Deventer Sarah A 1 Bugnyar Thomas 1 Schwab Christine 1 1 Department of Cognitive Biology, University of Vienna, Vienna, Austria2 Department of Ecology and Evolutionary Biology, University of California Los Angeles, Los Angeles, CA, USA3 Department of Integrative Zoology, University of Vienna, Vienna, Austria4 Department of Psychology, University of Cambridge, Cambridge, UKAddress correspondence to F. Uhl. E-mail: FloUhl@gmx.at . Jan-Feb 2019 08 12 2018 08 12 2018 30 1 57 67 03 5 2017 09 10 2018 06 11 2018 © The Author(s) 2018. Published by Oxford University Press on behalf of the International Society for Behavioral Ecology. 2018 This is an Open Access article distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/4.0/ ), which permits unrestricted reuse, distribution, and reproduction in any medium, provided the original work is properly cited. With data collected over a 1-year period, we detail the spatio-temporal structure of a flock of crows displaying distinct social categories. Presence of carrion and hooded crows was affected by the environmental factors that modulate the party size component of fission–fusion dynamics and thereby influence social complexity.
Abstract Social complexity arises from the formation of social relationships like social bonds and dominance hierarchies. In turn, these aspects may be affected by the degree of fission–fusion dynamics, i.e., changes in group size and composition over time. Whilst fission–fusion dynamics has been studied in mammals, birds have received comparably little attention, despite some species having equally complex social lives. Here, we investigated the influence of environmental factors on aspects of fission–fusion dynamics in a free-ranging population of carrion and hooded crows (Corvus corone ssp .) in the urban zoo of Vienna, Austria over a 1-year period. We investigated 1) the size and 2) spatio-temporal structure of the local flock, and 3) environmental influences on local flock and subgroup size. The local flock size varied considerably over the year, with fewest birds being present during the breeding season. The spatio-temporal structure of the local flock showed 4 distinct presence categories, of which the proportions changed significantly throughout the year. Environmental effects on both local flock and subgroup size were time of day, season, temperature, and weather, with additional pronounced effects of the structure of the surroundings and age class on subgroup size. Our findings show environmental influences on party size at the local flock and subgroup level, as well as indications of structured party composition in respect to the 4 presence categories. These results suggest that environmental factors have significant effects on fission–fusion dynamics in free-ranging crows, thereby influencing social complexity.
Corvus corone crows fission fusion dynamics group size population structure Vienna Science and Technology Fund 10.13039/501100001821 CS11-008 Austrian Science Fund 10.13039/501100002428 Y366-B17 W1234-G17 J3868-B25 P24788-B22 Tiergarten Schönbrunn INTRODUCTION Living in social groups can facilitate predator protection and enhance foraging opportunities, though it may also increase food competition and social complexity (Krause and Ruxton 2002 ). The benefits of group living typically correspond with an increase in group size, for instance, as more individuals are more likely to spot a predator (Hamilton 1971 ; Treisman 1975 ). Social complexity arises mainly due to the formation of social relationships like social bonds and dominance relations (Harcourt and de Waal 1992 ; Cords and Aureli 2000 ), which in turn help individuals to cope with competition (Scheiber et al. 2005 ; Smith et al. 2008 ). In societies structured by social relationships, the number and spatio-temporal distribution of potential interaction partners may further contribute to a species’ level of social complexity (Kappeler and van Schaik 2002 ).
Fission–fusion dynamics—changes in group size and composition over time (Kummer 1971 )—affects the ratio and likelihood of meeting particular individuals. These dynamics may enhance certain cognitive skills, like impulse control and inferential reasoning, as individuals that have been away from the group for a period of time may need to readjust to new situations, like changes in the dominance rank hierarchy or alliances (Paz-y-Mino et al. 2004 ; Amici et al. 2008 ; Aureli et al. 2008 ). Following the introduction of the term, research on fission–fusion dynamics has focused mainly on mammals (Aureli et al. 2008 ) and comparatively few studies have addressed such dynamics in birds (Silk et al. 2014 ). This is surprising as many bird species show high variation in group size and composition, particularly outside the breeding season (Clayton and Emery 2007 ).
The organization of avian groups is highly variable from individuals living in pairs to those living in family groups and communally in large mixed-sex and age groups (Bond et al. 2003 ). In most systems, however, male–female pairs represent the key social unit (also termed primary relationship) (Cockburn 2006 ; Boucherie et al. 2016 ), as pair partners cooperate for reproduction, but also to gain/maintain dominance status and/or access resources (Scheiber et al. 2005 ; Emery et al. 2007 ). A recent study on adult rooks found that, in addition to pair bonds, individuals have secondary relationships with other colony members, suggesting that larger corvid groups are structured by different social layers (Boucherie et al. 2016 ). Similar patterns have been proposed for geese (barnacle geese, Branta leucopsis , Kurvers et al. 2013 ; greylag geese, Anser anser , Scheiber et al. 2013 ) and parrots (spectacled parrotlets, Forpus conspicillatus , Wanker 1999 ; review in Bradbury and Balsby 2016 ). A refined social structure can also be seen in family units of cooperatively breeding corvid species (Brown 1970 ; Woolfenden and Fitzpatrick 1984 ; Baglione et al. 2002 ) and, to some extent, even in nonbreeder flocks. Raven nonbreeders, for instance, show different degrees of vagrancy (Braun et al. 2012 ; Loretto et al. 2017 ), whereby birds with low vagrancy status (“residents”) engage in sophisticated interactions, including third-party interventions in others’ conflicts (Szipl et al. 2017 ) and bonding attempts (Massen et al. 2014 ). Taken together, it appears that the social system of some avian species is more complex than simply brief aggregation at shared resources (Vander Wall and Balda 1977 ). Rather, these avian systems are characterized by individualized membership and the formation of social relationships outside of the breeding pair.
Exploration of the ecological factors affecting group formation, specifically its composition and size, can be informative for studies on fission–fusion dynamics. Studies on white-throated magpie-jays (Calocitta formosa ), for instance, showed a positive relationship between food availability and group size (Langen and Vehrencamp 1998 ). Similar effects were also found in primates (Chapman et al. 1995 ; Chapman and Pavelka 2005 ), and in lions (Panthera leo , Caraco and Wolf 1975 ). Another ecological factor that can influence group size is the openness of the habitat, for example, the presence of larger groups in more open habitats (Peek et al. 1974 ; Thirgood 1996 ). However, surprisingly little is known about the impact of environmental factors on the grouping behavior of opportunistic corvids like carrion crows (Corvus corone ).
The present study focused on a population of wild, free-ranging carrion crows utilizing the area of Vienna Zoo (Tiergarten Schönbrunn) in Vienna, Austria (hereafter “local flock”). Carrion crows are highly opportunistic in terms of foraging and habitat use (Coombs 1978 ; von Blotzheim 1993 ). To our knowledge, carrion crows and common ravens do not differ significantly in their social structure: individuals aggregate at food sources and groups show hierarchies determined by age, sex, and body size (Heinrich 1989 ; Richner 1989a ; Braun and Bugnyar 2012 ). In contrast to ravens (Braun et al. 2012 ; Loretto et al. 2015 ) and American crows (Stouffer and Caccamise 1991 ), fission–fusion dynamics have not yet been explored in carrion crows.
The aim of our study was to examine key aspects of fission–fusion dynamics in this focal population of crows. According to the framework proposed by Aureli et al. (2008) , the degree of fission–fusion dynamics in a species can be determined via 3 components: variation in party size, party composition, and spatial cohesion. Here, we draw on this framework and used it as a heuristic device to establish whether crows at Vienna Zoo show temporal variation in social structure across time. To make the distinction between Aureli et al.’s framework and our measurements clear, we use the term “party” only when referring to components of the framework itself and use different expressions when referring to our own empirical measures. Specifically, we investigated variation in party size by examining changes in the number of crows in the zoo area, which we refer to as “local flock size,” and the number of birds foraging together, which we refer to as “subgroup size.” These two measures have also been used for describing party size in ravens from a global and local perspective (Braun and Bugnyar 2012 ), i.e., whether birds join a flock in a specific area (e.g., zoo) and whether they join particular foraging groups within this area (e.g., at particular enclosures). Furthermore, we took a first step towards investigating the variation in party composition at the level of the local flock by looking at changes in the crow’s residency status (categories due to presence/absence patterns) across seasons. Note that we did not focus on variation in the identity of individuals within a subgroup, which would be expected when mapping the framework of Aureli et al. (2008) directly onto the crow social system. We hypothesized that the local flock 1) varies over the year in size and 2) has a nonrandom spatio-temporal social structure, i.e., the residency status of individuals follows specific patterns. 
Environmental factors likely influence the size of 3) the local flock and 4) the subgroups of crows present at the zoo.
Our predictions were: 1) more crows would be present in the zoo outside the breeding season, as territorial pairs would fend off nonbreeding birds, and offspring of the breeders would still be present after the breeding season (Schwab, personal observation). 2) The local flock would be structured as local birds, likely territorial breeders, and nonbreeders with different degrees of vagrancy, which resembles the social organization of ravens (Heinrich et al. 1994 ; Braun and Bugnyar 2012 ) and show a similar ecology to American crows (Marzluff and Angell 2005 ). Notably, the residency/vagrancy status of individual crows could vary across the year, hinting towards changes in flock composition. 3) The size of the local flock would be influenced by key environmental factors. Weather would influence the local flock size. Fewer birds would be present in rainy weather conditions as nonresident crows would be less likely to fly to the zoo. Alternatively, the crows would be more likely to visit the zoo during rainy weather due to increased food security, as the food is provided for the zoo animals irrespective of the weather. The time of the day would affect the local flock size, as more individuals may arrive when certain zoo animals are being fed, in order to exploit this high-quality food source, such as meat to the carnivores. A higher number of human visitors would increase local flock size as more food becomes available when visitors accidentally drop food or actively feed the birds. 4) Subgroup size would vary with overall local flock size. However, factors relating to the immediate surroundings of the birds (forestation, type of enclosure/visitor area) would also play an important role. Specifically, larger subgroups would be expected in larger open areas for predator protection (Jarman 1974 ) and in areas with more widely dispersed food, as the feeding competition is lower in such locations (Asensio et al. 2008 ). 
Additionally, we expected to find an influence of age on subgroup composition with younger birds showing a stronger tendency to form groups than adult birds, as younger birds are more likely to be part of nonbreeder flocks (Richner 1989b ).
METHODS Study site and population The present study was conducted at Vienna Zoo (Tiergarten Schönbrunn) (48°10′54.6′′N 16°18′14.3′′E) in Austria (see Figure 1 ), which is located within the city limits. The Zoo consists of parkland and a forested area. Since 2010, crows in the zoo have been caught using “ladder” and Larsen-traps (Kirchmeir et al., in preparation), and individually color-ringed for identification before being immediately released at the site of capture. At the onset of the present study in 2014, 297 crows were ringed, which rose to 322 by the end of the study in 2015. Of these marked individuals, 129 were sighted again following release during the duration of the present study. Permission for catching and marking the birds was obtained from the municipal authorities of Vienna (Magistrat der Stadt Wien: MA 22–425/2011/6) and the Austrian Ministry for Science and Research (BMWF-66.006/0009-II/3b/2012).
Figure 1 The study area within Vienna Zoo is outlined in red. The black line represents the observation transect and the blue cross indicates the starting point. Dotted lines show temporal deviations when the regular transect was not accessible due to construction work and/or hazardous weather conditions. At dead ends in transects, we only recorded crows in one travel direction.
![]()
The study area is ideal for conducting behavioral observations on group formation and dynamics as the crows use several parts of the zoo for foraging on a wide variety of food (Miller et al. 2014 ; Deventer et al. 2016 ), are easy to spot and are well habituated to human presence. Vienna lies within a hybrid overlapping zone with presence of both carrion and hooded crows, which are classed as subspecies (Corvus corone corone and Corvus corone cornix , von Blotzheim 1993 ; de Knijff 2014 ) and regularly interbreed (Randler 2008 ). Hence, the local flock consists of individuals of both subspecies and their hybrids. The hybridization is interesting under genetical and evolutionary aspects because there is still substantial gene-flow between the 2 subspecies (Poelstra et al. 2014 ) despite evidence for conspecific assortative mating (Risch and Andersen 1998 ; Randler 2007 ). In the area around Vienna, the 2 subspecies interbreed regularly (Randler 2008 ).
The subspecies of each marked individual was visually assessed and recorded during handling when the birds were caught. However, it was difficult to reliably visually identify the subspecies of some individuals during observations in the field, due to challenging environmental conditions, including partly obstructed bodies or variations in lighting. This difficulty is due to some hybrid birds showing similarities to one subspecies (e.g., having greyish plumage parts) over the other, which could lead to mistakes in identifying these individuals in the field (e.g., as a hooded crow rather than a hybrid). We therefore refrained from including subspecies as a factor in our analysis.
Data collection We monitored the crows during daylight hours from 8 January 2014 to 31 January 2015, in order to avoid any seasonal bias and observe changes occurring across an entire year (cf. Marra et al. 2015 ). In winter (8 January to 17 April and 17 October 2014 to 31 January 2015), we conducted 2 observational sessions (“transects” hereafter): one 2-h transect between 0900 and 1300 h (“Morning”) and one 2-h transect between 1300 and 1600 h (“Afternoon”). When daylight hours increased (23 April 2014 to 14 October 2014), we increased to three 2-h transects (per day) between 0800–1200 h (“Morning”), 1200–1500 h (“Noon”), and 1500–1900 h (“Afternoon”). Minor fluctuations in transect duration occurred due to the varying time needed to enter data. Between 27 January–9 February and 10–16 March 2014, no surveys were conducted due to observer illness. We collected data via scan observations along a fixed transect covering the entire zoo (see Figure 1 ); which took on average 2 h. Occasionally, we had to make minor changes to the transect when some paths were closed temporarily. While we kept the starting point of the transect constant, the travel direction was alternated (clockwise/counter-clockwise).
We entered each observation of a crow, marked or unmarked, into a digital map of the zoo using a GPS enabled pocket PC (MobileMapper 10, SpectraPrecision) with a mobile GIS software (ArcPad 10.2, ESRI). Data for the background map were obtained from the Municipal Authorities of Vienna (MA41) and Vienna Zoo. With each observation, we recorded individual parameters of the bird (age class, ID if individual was marked) as well as behavioral and environmental parameters (Supplementary Table S1 ). Regarding age class, we differentiated juveniles (birds in their first summer) from older birds (subadults and adults) as the very distinct appearance of juveniles (slender silhouette, begging behavior) could be reliably determined even under difficult lighting conditions. We refrained from using color-based features (von Blotzheim 1993 ) as these aid in differentiating age classes under good lighting conditions only. We excluded sex because we could not reliably visually determine the sex of unmarked birds in the field. In addition, we recorded the observed subgroup size for individuals seen within spatially associated clusters. We defined subgroups as aggregations of nonflying individuals with a nearest neighbor with a direct line of sight within 5 m (Smolker et al. 1992 ; Wolf et al. 2007 ; Hobson et al. 2014 ). Subgroups were rarely spread across more than one enclosure as most enclosures were separated by physical barriers (trees/walls). We performed a total of 271 transects (104 Morning, 60 Noon, 107 Afternoon), which took place on 122 days (2–3 days a week) with an average of 2.22 (SD: ±0.67) transects per day. These observations were carried out based on observer availability.
Analysis Spatial data were analyzed in the GIS software ArcGIS 10.0 (ESRI) and statistical analyses were performed in R Version 3.4.2 (R Core Team 2017 ). We obtained information on structural characteristics at observation locations via spatial joins from the background map in ArcGIS 10. We characterized and partitioned the study area using the following parameters: openness of the area (“Forested Area”: in forest – out of forest), availability of man-made structures (“On Building”: on building – off building), and type of food found in an area (“Food”: grass, vegetarian, mixed, mainly meat, human gastronomical area) (listed in Supplementary Table S2 ).
We grouped the data into 3 seasons of equal duration according to the birds’ breeding ecology: the breeding season (February – May), the parental care season (June – September) and the nonbreeder season (October – January). The breeding season lasts from nest building by the territorial breeding pairs until fledging of the chicks. The parental care season starts with the formation of larger groups of juvenile birds with their parents and ends when juveniles become independent from their parents. At this point, the nonbreeder season starts, which ends with the start of the new breeding season.
In order to investigate the changes in the size of the local flock, we calculated a conservative minimum estimate for the local flock size based on the total crow observations during a day (prediction 1). As we could not reliably identify unmarked crows, it is possible that we repeatedly registered unmarked individuals during an observational session. We therefore calculated the ratio of resightings of marked birds as a correction factor (CF) to correct for resightings of unmarked birds as
$CF = \frac{CM_{t} - CM_{ind}}{CM_{t}}$ (1) where CMt is the total number of sightings of marked birds including resightings and CMind is the absolute number of unique individuals seen that day.
Using this correction factor, we then calculated a local flock size estimate as
$FS_{e} = CM_{ind} + \frac{CU - CU \cdot CF}{T}$ (2) where FSe is the local flock size estimate per day, CU is the total number of sightings of unmarked crows including potential resightings, and T is the number of transects on a given day to account for sampling effort (only used for unmarked individuals as we know the exact number of unique marked individuals seen per day). These data are used for graphical representation as well as the analysis of temporal autocorrelation. For our model calculations on local flock size, we used the flock size estimate per transect (i.e. the total number of crows per transect times the correction factor of the day). We chose this method in favor of capture–recapture calculations (Pradel et al. 1997 ; Thomas et al. 2002 ) as we wanted to use our count data with adjustments for potential resightings as a conservative measurement for our models.
We conducted a cluster analysis on the presence–absence data of marked, individually identifiable crows in the zoo to assess the spatio-temporal structure (hereafter: “presence categories”) for the local flock (prediction 2). The parameters included in the analysis were: the number of days an individual was seen, the longest period in days without observation of an individual, and the standard deviation for these periods without observations of an individual. For this analysis, we only used marked individuals that were observed on 5 days or more (N = 82), the others (N = 47) were classified as rare visitors. We calculated Euclidean distance matrices for the standardized values of these variables and used hierarchical clustering (hclust, method: complete, package: stats). We chose clusters that were well formed and showed long branches to determine the presence categories. To investigate the spatio-temporal structure, and in particular, whether the presence and absence of individuals was significantly different between categories and seasons (prediction 3), we then compared the percentage of unique individuals per day for each presence category between seasons by calculating Friedman tests and 2-tailed paired Wilcoxon tests using a nonparametric bootstrap (sample size 38 and 10,000 iterations). We also applied a Bonferroni correction (α = 0.017) for multiple testing. We calculated the relative number of days that birds of each presence category (days seen/total days with observations) were seen between seasons using approximative Friedman tests based on Monte-Carlo resamplings (10,000) and exact paired Wilcoxon signed-rank tests in the coin package (Hothorn et al. 2008 ) with Bonferroni correction (α = 0.017). Lastly, we compared the group sizes between the different presence categories by calculating Kruskal–Wallis and pairwise post-hoc 2-tailed Mann–Whitney U tests using Bonferroni correction (α = 0.008).
We used generalized linear mixed models to evaluate the impact of environmental factors on both local flock size and subgroup size of unmarked and marked birds (predictions 3 and 4). For the analysis of the local flock size, the response variable was the estimate of the local flock size per transect, in order to include the effect of transect time into the model. The full model included date as a random factor and the following fixed factors: weather, temperature, transect time, season, visitors for both flock size estimate and subgroup size, age class, forested area, food, and on building (for the respective levels of the fixed factors, see Supplementary Tables S1 and S2 ). The response variable for the latter subgroup-size models was the subgroup size minus one to fit the negative binomial distribution for the model. The full model included date as a random factor and the following fixed factors: weather, temperature, observational session, season, number of visitors, number of crows present during a session, age class, forested area, predation risk, food in enclosure, on building and area (for the respective levels of the fixed factors, see Supplementary Tables S1 and S2 ).
We calculated all possible models (32 for the local flock size estimate and 1024 for the subgroup size) via lme4 (Bates et al., 2015 ) in MuMIn (Barton 2016 ) and used Akaike’s Information Criterion for model selection (Anderson and Burnham 2002 ; Burnham and Anderson 2004 ). We formed model averages from the weighted estimates of all models (Anderson 2008 ). We also tested the intercorrelation between the factors for both models (Supplementary Tables S3 and S4 ) and for temporal autocorrelation (Supplementary Table S5 ).
RESULTS Spatio-temporal structure of the local population of crows In total, we obtained 17,645 observations of marked and unmarked crows between 8 January 2014 and 31 January 2015. The daily estimated minimum zoo population size was 65.2 ± 23.2 crows (mean ± SD) across the 122 days of monitoring, of which 64% (41.4 ± 23.1) were unmarked individuals (Figure 2 ). The correction factor (CF) found was on average 0.174 ± 0.1 (mean ± SD).
Figure 2 Sighting histories of marked birds in the zoo. The 3 categories determined by the cluster analysis are shown in different colors. The order of the individuals corresponds to the leaves of the dendrogram for the cluster analysis. Rare visitors were not included in the cluster analysis and are ordered in this plot by order of first sighting. The individuals with available breeding related data are denoted at the top of the graph (dark red = 2013, bright red = 2014). Inverted triangles indicate when birds either died (black) or were newly ringed (orange). The flock size estimates per day of observation are shown in the right graph. The colors correspond to the different seasons. This graph also shows the proportion of juvenile birds in the study area designated in black.
![]()
Of 322 marked crows, 129 individuals (40%) were observed at the zoo during the study period. Of these 129 birds, 81 were males (63%), 40 were females (31%), and 8 were of unknown sex (6%). 13 were juveniles in their first year (10%) and 116 were older than a year (90%), as indicated by the color of the inner beak (von Blotzheim 1993 ).
Not all observed marked birds could be identified (2894 out of 3356 sightings; 86.2% identification rate) as some birds had lost rings, and sometimes lighting and environmental conditions, such as high grass, foliage and observation distance, prevented identification. We omitted these observations in the analysis. The resighting rate varied widely among the individuals, ranging from 67.2% (sighted on 82 out of 122 days) to 0.8% of all observational days (sighted on 1 day only).
The organization of the local flock was mostly in line with our prediction (2), in that it was similar to the social organization of ravens, with local birds and nonbreeders with differing degrees of vagrancy (Heinrich et al. 1994 ; Braun and Bugnyar 2012 ). However, we found evidence for one further category in the crows as the result of our cluster analysis partitioned the birds into 3 well defined clusters—presence categories—representing “resident” birds (N = 26), “continuous” visitors (N = 41), and “periodic” visitors (N = 15) (Supplementary Table S6 ). Crows with fewer than 5 observational days (N = 47) were a priori considered as “rare” visitors. The temporal occurrence of the birds at the Zoo is shown in Figure 2 , which also shows breeding information of marked individuals breeding in the Zoo in 2013 and 2014 (resident birds: 11/26; continuous visitors: 2/41, periodic visitors: 3/15).
The contribution of crows from all 4 presence categories to the overall population size was significantly different between seasons (Friedman test, nonparametric bootstrap 10,000 iterations, mean ± SE; resident birds: χ 2 = 15.594 ± 0.064, N = 38, P = 0.01 ± 0.0004; continuous visitors: χ 2 = 22.067 ± 0.071, N = 38, P = 0.001 ± 0.0001; periodic visitors: χ 2 = 27.925 ± 0.08, N = 38, P = 0.0002 ± 0.00003; rare visitors: χ 2 = 6.772, N = 38, P = 0.144 ± 0.002; Figure 3 ). The proportion of resident birds was significantly higher during nonbreeder season than during parental care season, with the 2 other seasons not differing significantly from one another (Table 1 ). The proportion of continuous visitors was significantly higher during the parental care season than during breeding and nonbreeder seasons, which did not differ significantly from one another. The daily proportion of periodic visitors was significantly higher during the breeding season than during the parental care and nonbreeder seasons, with no significant difference between the latter 2 seasons. We found no significant differences for the proportion of rare visitors between seasons.
Figure 3 Relative proportions of presence categories per season.
![]()
Table 1 Pairwise comparisons of the presence categories’ proportions between seasons
Resident birds Continuous visitors Periodic visitors Rare visitors BS/PCS
+V = 387.92 ± 0.676
+V = 77.65 ± 0.376
+V = 606.9 ± 0.554
+V = 318.34 ± 0.686
+P = 0.468 ± 0.003
+P = 0.0005 ± 0.00004
+
+P = 0.0001 ± 0.000008
+
+P = 0.24 ± 0.0027 BS/NBS
+V = 160.45 ± 0.539
+V = 288.8 ± 0.667
+V = 593.8 ± 0.592
+V = 318.15 ± 0.699
+P = 0.023 ± 0.0006
+P = 0.355 ± 0.003
+P = 0.00006 ± 0.000006
+
+P = 0.106 ± 0.0018 PCS/NBS
+V = 114.18 ± 0.455
+V = 601.19 ± 0.528
+V = 276.93 ± 0.663
+V = 203 ± 0.591
+P = 0.003 ± 0.0002
+
+P = 0.006 ± 0.528
+
+P = 0.487 ± 0.003
+P = 0.416 ± 0.003
Paired Wilcoxon signed-rank test, α = 0.017, significant differences in bold, nonparametric bootstrap N = 38, 10,000 iterations, mean ± SE.
BS, breeding season; NBS, nonbreeder season; PCS, parental care season.
The relative number of days that individuals across the different presence categories were present was significantly different between seasons for resident birds and continuous visitors (Approximative Friedman test, 10,000 Monte-Carlo iterations; resident birds: N = 26, χ 2 = 17.146, P < 0.001; continuous visitors: N = 41, χ 2 = 19.858, P < 0.001; periodic visitors: N = 15, χ 2 = 3.444, P = 0.178; rare visitors: N = 47, χ 2 = 3.475, P = 0.172). Breeding season differed significantly in resident birds and parental care season differed significantly in continuous visitors (exact paired Wilcoxon signed-rank tests; Resident Birds, N = 26: Nonbreeder Season ~ Breeding Season, Z = 3.318, P < 0.001, Parental Care Season ~ Breeding Season, Z = 2.666, P = 0.003, Nonbreeder Season ~ Parental Care Season, Z = 0.102, P = 0.925; Continuous Visitors, N = 41: Nonbreeder Season ~ Breeding Season, Z = 1.067, P = 0.291, Parental Care Season ~ Breeding Season, Z = 3.411, P < 0.001, Nonbreeder Season ~ Parental Care Season, Z = −3.069, P < 0.001). Specifically, the resident birds were seen significantly less during breeding season (mean ± SD; breeding season: 13.03 ± 5.92, parental care season: 19.23 ± 6.57, nonbreeder season: 19.15 ± 7.27) and continuous visitors were seen significantly more often during parental care season (mean ± SD; breeding season: 3.71 ± 3.12, parental care season: 7.85 ± 5.03, nonbreeder season: 5.76 ± 4.49).
Environmental influences on local flock size In the final model that was derived from the weighted average of all models, the factors with the strongest effect on local flock size were: transect time, then season, temperature, and weather (prediction 1 and 3, Table 2 ).
Table 2 Influences on local flock size, average model
Estimate ± SE Pr(>|z|) (Intercept) 4.007 ± 0.052 <0.001 Transect Time Noon −0.020 ± 0.006 0.001 Afternoon 0.534 ± 0.004 <0.001 Season Breeding Season −0.303 ± 0.073 <0.001 Parental care season 0.146 ± 0.072 0.042 Temperature Warm −0.285 ± 0.014 <0.001 Hot −0.189 ± 0.020 <0.001 Weather Clouds 0.154 ± 0.011 <0.001 Sun 0.177 ± 0.012 <0.001 Risk of Enclosure High Risk 0.031 ± 0.007 <0.001 Visitors Some 0.004 ± 0.004 0.391 Many 0.027 ± 0.012 0.018
Calculated from all models (weighted full average), the factors are ordered by influence from high to low and by their category.
Subgroup size Subgroup size, as determined by our definition of all individuals within 5 m of one another with a direct line of sight to at least one other crow, was found to range from 1 up to 33 individuals (Figure 4 ). The mean subgroup size ± SD was 1.85 ± 1.64, N = 9525 (Quantiles: 0%: 1; 25%: 1; 50%: 1; 75%: 2; 100%: 33). When considering only subgroups of 2 individuals or more, the mean subgroup size ± SD was 3.07 ± 2.00, N = 3931 (Quantiles: 0%: 2; 25%: 2; 50%: 2; 75%: 3; 100%: 33). Note that the subgroup sizes differed between the 4 presence categories (Kruskal–Wallis test, H 3 = 51.222, P < 0.001). However, when correcting for multiple pairwise comparisons by setting α to 0.0083, the only statistical significance remained for resident birds. These birds were found in smaller subgroups compared with the birds in other presence categories; continuous and periodic visitors did not differ significantly in subgroup size (Table 3 ).
Figure 4 Histogram of the observed subgroup sizes; y axis log-transformed. Numbers in bars show the exact number of observed subgroups.
![]()
Table 3 Pairwise comparisons for the subgroup sizes across the 4 presence categories
+N
+ Mean ± SD Resident birds Continuous visitors Periodic visitors Resident Birds 1754 2.46 ± 2.91 Continuous Visitors 779 3.34 ± 4.56 W = 585,130
+P < 0.001 Periodic Visitors 183 2.96 ± 3.30 W = 136,960
+P < 0.001 W = 71,442
+P = 0.960 Rare Visitors 88 3.66 ± 5.03 W = 59,463
+P < 0.001 W = 31,384
+P = 0.181 W = 7331
+P = 0.219
Wilcoxon signed-rank tests, α = 0.008.
Influences on subgroup size We derived the final model by averaging all models. We found that all factors had a significant effect on subgroup size (prediction 4, Table 4 ).
Table 4 Influences on subgroup size, average model
Estimate ± SE Pr(>|z|) (Intercept) −0.549 ± 0.152 <0.001 Season Breeding Season −0.633 ± 0.102 <0.001 Parental Care Season −0.169 ± 0.094 0.071 Forested Area Out of Forest 0.605 ± 0.034 <0.001 On Building Off 0.489 ± 0.040 <0.001 Age Nonjuvenile −0.451 ± 0.032 <0.001 Food Vegetarian 0.423 ± 0.024 <0.001 Mixed −0.034 ± 0.047 0.473 Mainly Meat 0.263 ± 0.039 <0.001 Human Gastronomical Area −0.291 ± 0.044 <0.001 Weather Clouds 0.317 ± 0.065 <0.001 Sun 0.138 ± 0.067 0.040 Transect Time Noon −0.113 ± 0.035 0.002 Afternoon 0.280 ± 0.025 <0.001 Visitors Some −0.099 ± 0.026 <0.001 Many 0.195 ± 0.055 <0.001 Number of Crows Present 0.005 ± 0.002 0.009
Calculated from all models (weighted full average), the factors are ordered by influence from high to low and by their category.
DISCUSSION Our findings support the majority of our hypotheses regarding grouping behavior and the influence of environmental factors on population, group size and their composition with regard to residency status (presence categories) in wild carrion/hooded crows utilizing Vienna Zoo. The local flock’s size changed considerably throughout the year and was significantly smaller during the breeding season than other seasons (prediction 1). The composition of the crow local flock resembled that of wild common ravens (Braun and Bugnyar 2012 ), and consisted of resident birds and birds that visit the area either continuously, periodically or only rarely (prediction 2). Environmental factors such as time of day, season, temperature, and weather had a significant effect on the size of the local flock (prediction 3). There were also significant environmental effects on subgroup size (prediction 4), in particular season, structure of the surroundings, age class of the birds, and weather. Relating these findings to the framework for fission–fusion dynamics proposed by Aureli et al. (2008) , temporal variation in party size is evident on the flock and subgroup level, whereas hints towards a temporal variation in party composition are found on the level of the local flock with regard to changes in the residency status of birds.
Crow social structure and dynamics From a socio-cognitive perspective, the structure of the local flock is interesting. Our data fit the well-known picture of corvids forming “open” groups when utilizing resources, with individuals coming and going at different times and rates (Coombs 1978 ; Richner 1989b ; Marzluff and Heinrich 1991 ). Further, they corroborate recent findings in ravens that such “groups” (termed here as the local flock) are composed of individuals with different degrees of residency and vagrancy, respectively, with some individuals showing stronger preferences for a given site than others (Braun and Bugnyar 2012 ; Loretto et al. 2016 ). Moreover, our current analyses of the presence patterns show that a more detailed differentiation is possible: in addition to “resident” birds staying in the Zoo and “continuous visitors” coming to the Zoo regularly, we can distinguish “periodic visitors” coming to the Zoo only for certain time periods from “rare visitors” only rarely being sighted within the Zoo area. From the first 3 categories, some crows could be identified through observations as breeders. The majority of birds in all categories, however, seems to be nonbreeders that are either too young to breed or that failed to find a partner and/or defend a breeding territory.
The proportion of individuals belonging to the 4 presence categories underwent significant changes throughout the year. The proportion of resident birds was higher during the nonbreeder season, which could be due to food availability or to decreased competition for breeding sites in fall and winter. The lower proportion of birds during the breeding season could also be due to resident breeders being less visible while present at the nest. Indeed, the relative number of days that resident individuals were seen is significantly lower during the breeding season. The proportion of continuous visitors was significantly higher during the parental care season than the other seasons, which could stem from small family groups preferably living in the areas surrounding the Zoo, though coming in to the Zoo to raise their young in an area with higher food density. While resident birds and continuous visitors were more similar according to the cluster analysis, periodic visitors were more distinct regarding their presence. The proportion of periodic visitors in the population was higher during the breeding season than the rest of the year. Therefore, we assume birds within this category may come to the Zoo to attempt breeding as the area presents a secure feeding site.
Rare visitors were sighted evenly across all seasons. Therefore, we assume that these individuals could be vagrant birds using large areas, similar to previous findings in ravens (Loretto et al. 2016 ). They may use the Zoo only as a stopover during vagrancy, or the Zoo may be situated at the periphery of their home ranges and the birds seldom visit the area. During observations, we did not observe all the individuals that were marked as part of the longer-term project. This could in part be due to death, as indicated in Figure 3 where 3 individuals were known to have died during the period of the present study, albeit not all in the Zoo. However, we found evidence that some individuals that were marked at the Zoo seemed to have left the area for several months before being sighted there again. This is particularly apparent in the “Rare Visitors” and “Periodic Visitors” category. Therefore, it is plausible that a certain proportion of birds only return irregularly to our study area and some individuals may not return to the Zoo at all.
Although our categories are constructs that describe the crows’ presence patterns along a continuum, it is worth noting that residents and continuous visitors should differ in their likelihood of meeting one another, when compared with periodic or rare visitors. Hence, the different presence patterns may have direct effects on the birds’ social knowledge and behavior. For instance, the birds with spatio-temporally stable patterns may come to recognize each other in individual terms, whereas those with high degrees of vagrancy may be treated according to rules of thumb, such as always supporting the aggressor in agonistic interactions. Further studies are needed to test such assumptions.
Overall, the number of crows using the Zoo differed significantly between the breeding season and the rest of the year, with fewer birds being present during the breeding season, which is consistent with our prediction. This finding could be explained by the territoriality of breeding pairs trying to defend food resources needed for their offspring, similar to American crows (Marzluff and Heinrich 1991 ; Webb et al. 2012 ). As breeding territories cover most of the Zoo area, breeding pairs could possibly repel nonbreeders from foraging there. A study on the same local flock found the crows to be highly tolerant towards other individuals foraging in the same area even in the late breeding season and parental care season, which was likely due to the high availability of food (Miller et al. 2014 ). However, this study did not investigate any seasonal effects on tolerance and measured tolerance via social interactions, rather than our present measures of local flock and subgroup size. Therefore, it is possible that the high tolerance between crows found in the Miller et al. (2014) study may be habitat/population specific. Alternatively, it could be a “dear enemy” effect (Ydenberg et al. 1988 ), where individuals sharing territories close to one another allow foraging in close vicinity. This tolerance towards conspecifics could be a common trait among corvid species. Studies on social networks in New Caledonian Crows (Corvus moneduloides ) and ravens show that information flow within groups is fast and flexible, and can be predicted by association patterns within groups but less so between groups that are less closely associated spatially (St Clair et al. 2015 ; Kulahci et al. 2016 ). Future studies could investigate seasonal differences and habitat specific differences on tolerance in a foraging context within other carrion/hooded crow populations. 
In regard to the present study, despite the generally high number of crows present and ensuing high competition, the high availability of food in the Zoo may explain why breeding pairs appeared to defend their territories during the breeding season only.
Environmental effects Our measures of local flock size and subgroup size are likely linked, as a larger local flock size could increase the possibility of larger subgroups forming, as indicated in sea birds (Beauchamp 2011 ). Hence, the environmental factors that influenced the local flock size affected subgroup size in a similar manner. The crows’ activity patterns are strongly influenced by the time of day, weather and temperature. Hot temperatures (>25 °C) and rainy weather likely lowered the crows’ mobility, leading to smaller local flock and subgroup sizes as fewer crows with territories outside of the Zoo congregated within the Zoo area. Local flock and subgroup size seem to have a strong link to the availability of food in the Zoo, as they increase significantly in times when food availability in the Zoo is likely to be higher than outside this area. For instance, food availability may be higher within the Zoo in colder mean temperature parts of the year (local flock size only), as well as in the afternoon when the birds may have a higher chance of stealing food from animal enclosures, and feeding on the food dropped by visitors, similar to foraging strategies of habituated grizzly bears (Albert and Bowyer 1991 ). The relative abundance of food appears to be a strong social facilitator in terms of both subgroup size and local flock size, similar to ravens converging on carcasses during winter months (Heinrich 1989 ).
Forest cover, habitat openness and man-made structures influenced subgroup size (prediction 4). Large subgroups of crows were more likely to form in the more open, nonforested part of the zoo. This may be due to larger subgroups providing more protection from potential predation, which is more likely to occur in open areas (Jarman 1974 ), or potential risk from humans. We also consider potential sampling limitations with lower visibility in forested areas during the summertime. Foliage may prevent crows that were sitting in trees from being observed, therefore lowering the estimate for the population or subgroup size, while better visibility during winter may lead to higher estimates in either measure. However, a study in another population of hooded crows also found a preference for nesting in more open areas (Kövér et al. 2015 ), suggesting that there may be a general preference for crows to make use of open areas.
Alternatively, the availability and distribution of food in relation to habitat structure may play an important role. Availability of food was generally high as the crows had access both to food provided for zoo animals as well as food dropped or fed to the crows by visitors. Despite this generally high availability of food, the largest recorded subgroup (n = 33) was found in an enclosure with widely scattered food (black crowned crane, Balearica pavonia ). This wide scattering of food likely allowed for a large subgroup of birds to forage and feed together, while lowering the chances of conflicts, similar to findings in spider monkeys (Asensio et al. 2008 , 2009 ). In support of this explanation, subgroup size tended to be larger in the mainly “vegetarian” animal enclosures of the zoo, where food is generally scattered across larger areas compared with enclosures housing carnivorous predator species, where meat/fish is usually presented in a concentrated manner. Similar results have been found in white-throated magpie-jays, where higher food quantity allowed for larger foraging parties (Langen and Vehrencamp 1998 ).
The number of human visitors present at the Zoo has a significant effect on subgroup size, where a high number of human visitors is associated with an increase in subgroup size. It is possible that crows avoid crowds of visitors in the public areas by preferentially spending time within animal enclosures, thereby increasing the likelihood for subgroup formation. Another influential factor for subgroup size was the age class of the crows. Similar to other corvids (Goodwin 1976 ), nonjuvenile crows were less likely to form subgroups, as they tended to have formed pair bonds and engaged in territorial behavior compared with juveniles.
In summary, our findings suggest that carrion and hooded crows in our local flock showed fission–fusion dynamics. The local flock and its subgroup dynamics were influenced by environmental factors. These findings have interesting implications for social complexity in these crows as, for instance, different areas in the Zoo appear to have positive or negative effects on subgroup size. In addition, we identified different levels of temporal flock membership that require further exploration into the potential relationships within and between the different presence categories of crows as well as implications on the information flow between birds. Further, future studies may aim to compare urban and rural populations of crows to explore whether our findings are habitat specific or applicable more generally to this species, as well as other avian species.
FUNDING This work was supported by the Vienna Science and Technology Fund (CS11-008 to C.S.); the Austrian Science Fund (Y366-B17 and W1234-G17 to T.B., J3868-B25 and P24788-B22 [PI: Eva Ringler] to M.R.); and by the Tiergarten Schönbrunn (Zoo Vienna).
Data accessibility: Analyses reported in this article can be reproduced using the original data set provided by Uhl et al. (2018).
Supplementary Material Supplementary Material Click here for additional data file.
We thank the staff at Zoo Vienna for their hands-on experience and Andreas Futschik and Marlies Dolezal for their help with the cluster analysis.
REFERENCES
+Albert DM , Bowyer RT
+1991
+Factors related to grizzly bear: human interactions in Denali National Park . Wildl Soc Bull . 19 :339 –349 .
+Amici F , Aureli F , Call J
+2008
+Fission-fusion dynamics, behavioral flexibility, and inhibitory control in primates . Curr Biol . 18 :1415 –1419 .18804375
+Anderson DR
+2008
+Model based inference in the life sciences: a primer on evidence . New York : Springer Science+Business Media, LLC .
+Anderson DR , Burnham KP
+2002
+Avoiding pitfalls when using information-theoretic methods . J Wildl Manage . 66 :912 –918 .
+Asensio N , Korstjens AH , Aureli F
+2009
+Fissioning minimizes ranging costs in spider monkeys: a multiple-level approach . Behav Ecol Sociobiol . 63 :649 –659 .
+Asensio N , Korstjens AH , Schaffner CM , Aureli F
+2008
+Intragroup aggression, fission-fusion dynamics and feeding competition in spider monkeys . Behaviour . 145 :983 –1001 .
+Aureli F , Schaffner CM , Boesch C , Bearder SK , Call J , Chapman CA , Connor R , Di Fiore A , Dunbar RIM , Henzi SP , et al
+2008
+Fission-fusion dynamics: new research frameworks . Curr Anthropol . 49 :627 –654 .
+Baglione V , Marcos J , Canestrari D , Murphy M
+2002
+Cooperatively breeding groups of carrion crow (Corvus corone corone ) in northern Spain . Auk . 119 :790 –799 .
+Barton K
+2017
+MuMIn: multi model inference . R package version 1.40.0 . Available from: https://CRAN.R-project.org/package=MuMIn . Accessed 27 November 2017
+Bates D , Maechler M , Bolker B , Walker S
+2015
+Fitting linear mixed models using lme4 . J Stat Softw . 67 :1 –48 .
+Beauchamp G
+2011
+Functional relationship between group size and population density in Northwest Atlantic seabirds . Mar Ecol Prog Ser . 435 :225 –233 .
+von Blotzheim GU
+1993
+Handbuch der Voegel Mitteleuropas . Wiesbaden : Aula-Verlag .
+Bond AB , Kamil AC , Balda RP
+2003
+Social complexity and transitive inference in corvids . Anim Behav . 65 :479 –487 .
+Boucherie PH , Mariette MM , Bret C , Dufour V
+2016
+Bonding beyond the pair in a monogamous bird: impact on social structure in adult rooks (Corvus frugilegus ) . Behaviour . 153 :897 –925 .
+Bradbury JW , Balsby TJS
+2016
+The functions of vocal learning in parrots . Behav Ecol Sociobiol . 70 :293 –312 .
+Braun A , Bugnyar T
+2012
+Social bonds and rank acquisition in raven nonbreeder aggregations . Anim Behav . 84 :1507 –1515 .23264693
+Braun A , Walsdorff T , Fraser ON , Bugnyar T
+2012
+Socialized sub-groups in a temporary stable Raven flock ? J Ornithol . 153 :97 –104 .25892747
+Brown JL
+1970
+Cooperative breeding and altruistic behaviour in the Mexican jay, Aphelocoma ultramarina . Anim Behav . 18 :366 –378 .
+Burnham KP , Anderson DR
+2004
+Multimodel inference . Sociol Methods Res . 33 :261 –304 .
+Caraco T , Wolf LL
+1975
+Ecological determinants of group sizes of foraging lions . Am Nat . 109 :343 –352 .
+Chapman CA , Pavelka MS
+2005
+Group size in folivorous primates: ecological constraints and the possible influence of social factors . Primates . 46 :1 –9 .15197599
+Chapman CA , Wrangham RW , Chapman LJ
+1995
+Ecological constraints on group size : an analysis of spider monkey and chimpanzee subgroups . Behav Ecol Sociobiol . 36 :59 –70 .
+Clayton NS , Emery NJ
+2007
+The social life of corvids . Curr Biol . 17 :R652 –R656 .17714658
+Cockburn A
+2006
+Prevalence of different modes of parental care in birds . Proc Biol Sci . 273 :1375 –1383 .16777726
+Coombs F
+1978
+The crows: a study of the corvids of Europe . In: The crows: a study of the corvids of Europe . London (UK) : BT Batsford Limited .
+Cords M , Aureli F
+2000
+Reconciliation and relationship qualities . In: Aureli F , De Waal FBM , editors. Natural conflict resolution . Berkeley and Los Angeles : University of California Press p. 177 –198 .
+Deventer SA , Uhl F , Bugnyar T , Miller R , Fitch WT , Schiestl M , Ringler M , Schwab C
+2016
+Behavioural type affects space use in a wild population of crows (Corvus corone ) . Ethology . 122 :881 –891 .27840464
+Emery NJ , Seed AM , von Bayern AM , Clayton NS
+2007
+Cognitive adaptations of social bonding in birds . Philos Trans R Soc Lond B Biol Sci . 362 :489 –505 .17255008
+Goodwin D
+1976
+Crows of the world . 1st ed. London : The British Museum (Natural History) .
+Hamilton WD
+1971
+Geometry for the selfish herd . J Theor Biol . 31 :295 –311 .5104951
+Harcourt AH , de Waal FBM
+1992
+Coalitions and alliances in humans and other animals . Oxford : Oxford University Press .
+Heinrich B
+1989
+Ravens in winter . New York : Summit Books of Simon & Schuster .
+Heinrich B , Kaye D , Knight T , Schaumburg K
+1994
+Dispersal and association among common ravens . Condor . 96 :545 –551 .
+Hobson EA , Avery ML , Wright TF
+2014
+The socioecology of monk parakeets: insights into parrot social complexity . Auk . 131 :756 –775 .
+Hothorn T , Hornik K , van de Wiel MA , Zeileis A
+2008
+Implementing a class of permutation tests: the coin package . J Stat Softw . 28 :1 –23 .27774042
+Jarman PJ
+1974
+The social organisation of antelope in relation to their ecology . Behaviour . 48 :215 –267 .
+Kappeler PM , van Schaik CP
+2002
+Evolution of primate social systems . Int J Primatol . 23 :707 –740 .
+de Knijff P
+2014
+How carrion and hooded crows defeat Linnaeus’s curse . Science . 344 :1345 –1346 .24948724
+Kövér L , Gyüre P , Balogh P , Huettmann F , Lengyel S , Juhász L
+2015
+Recent colonization and nest site selection of the Hooded Crow (Corvus corone cornix L.) in an urban environment . Landsc Urban Plan . 133 :78 –86 .
+Krause J , Ruxton GD
+2002
+Living in groups . Oxford : Oxford University Press .
+Kulahci IG , Rubenstein DI , Bugnyar T , Hoppitt W , Mikus N , Schwab C
+2016
+Social networks predict selective observation and information spread in ravens . R Soc Open Sci . 3 :160256 .27493780
+Kummer H
+1971
+Primate Societies: group techniques of ecological adaptation . Chicago (IL) : Aldine Publishing Company .
+Kurvers RHJM , Adamczyk VMAP , Kraus RHS , Hoffman JI , van Wieren SE , van der Jeugd HP , Amos W , Prins HHT , Jonker RM
+2013
+Contrasting context dependence of familiarity and kinship in animal social networks . Anim Behav . 86 :993 –1001 .
+Langen TA , Vehrencamp SL
+1998
+Ecological factors affecting group and territory size in white-throated magpie-jays . Auk . 115 :327 –339 .
+Loretto M-C , Reimann S , Schuster R , Graulich DM , Bugnyar T
+2015
+Shared space, individually used: spatial behaviour of non-breeding ravens (Corvus corax ) close to a permanent anthropogenic food source . J Ornithol . 157 :1 –12 .
+Loretto MC , Schuster R , Bugnyar T
+2016
+GPS tracking of non-breeding ravens reveals the importance of anthropogenic food sources during their dispersal in the Eastern Alps . Curr Zool . 62 :337 –344 .29491922
+Loretto MC , Schuster R , Itty C , Marchand P , Genero F , Bugnyar T
+2017
+Fission-fusion dynamics over large distances in raven non-breeders . Sci Rep . 7 :380 .28336913
+Marra PP , Cohen EB , Loss SR , Rutter JE , Tonra CM
+2015
+A call for full annual cycle research in animal ecology . Biol Lett . 11 :20150552 .26246337
+Marzluff JM , Angell T
+2005
+In the company of crows and ravens . New Haven (CT) : Yale University Press .
+Marzluff JM , Heinrich B
+1991
+Foraging by common ravens in the presence and absence of territory holders: an experimental analysis of social foraging . Anim Behav . 42 :755 –770 .
+Massen JJ , Pašukonis A , Schmidt J , Bugnyar T
+2014
+Ravens notice dominance reversals among conspecifics within and outside their social group . Nat Commun . 5 :3679 .24755739
+Miller R , Schiestl M , Whiten A , Schwab C , Bugnyar T
+2014
+Tolerance and social facilitation in the foraging behaviour of free-ranging crows (Corvus corone corone ; C. c. cornix) . Ethology . 120 :1248 –1255 .25937686
+Paz-y-Mino G , Bond AB , Kamil AC , Balda RP
+2004
+Pinyon jays use transitive inference to predict social dominance . Nature . 430 :778 –781 .15306809
+Peek JM , LeResche RE , Stevens DR
+1974
+Dynamics of moose aggregations in Alaska, Minnesota, and Montana . J Mammal . 55 :126 –137 .
+Poelstra JW , Vijay N , Bossu CM , Lantz H , Ryll B , Müller I , Baglione V , Unneberg P , Wikelski M , Grabherr MG , et al
+2014
+The genomic landscape underlying phenotypic integrity in the face of gene flow in crows . Science . 344 :1410 –1414 .24948738
+Pradel R , Hines JE , Lebreton J , Nichols JD
+1997
+Capture-recapture survival models taking account of transients . Biometrics . 53 :60 .
+Randler C
+2007
+Assortative mating of carrion Corvus corone and hooded crows C. cornix in the Hybrid Zone in Eastern Germany . Ardea . 95 :143 –149 .
+Randler C
+2008
+Mating patterns in avian hybrid zones - a meta-analysis and review . Ardea . 96 :73 –80 .
+R Core Team
+2017
+R: a language and environment for statistical computing . Version 3.4.2. Vienna (Austria): R Foundation for Statistical Computing. Available from: https://www.R-project.org/ . Accessed 28 September 2017.
+Richner H
+1989a
+Habitat-specific growth and fitness in carrion crows (Corvus corone corone ) . J Anim Ecol . 58 :427 –440 .
+Richner H
+1989b
+Phenotypic correlates of dominance in carrion crows and their effects on access to food . Anim Behav . 38 :606 –612 .
+Risch M , Andersen L
+1998
+Selektive partnerwahl der aaskrähe (Corvus corone ) in der hybridisierungszone von rabenkrähe (C. c. corone ) und nebelkrähe (C. c. cornix ) . J Ornithol . 139 :173 –177 .
+Scheiber IBR , Kotrschal K , Weiß BM , Hemetsberger J
+2013
+The social life of greylag geese . Cambridge : Cambridge University Press .
+Scheiber IB , Weiß BM , Frigerio D , Kotrschal K
+2005
+Active and passive social support in families of greylag geese (Anser anser ) . Behaviour . 142 :1535 –1557 .21984839
+Silk MJ , Croft DP , Tregenza T , Bearhop S
+2014
+The importance of fission – fusion social group dynamics in birds . Ibis (Lond. 1859) . 156 :701 –715 .
+Smith JE , Kolowski JM , Graham KE , Dawes SE , Holekamp KE
+2008
+Social and ecological determinants of fission–fusion dynamics in the spotted hyaena . Anim Behav . 76 :619 –636 .
+Smolker RA , Richards AF , Connor RC , Pepper JW
+1992
+Sex differences in patterns of association among Indian Ocean bottlenose dolphins . Behaviour . 123 :38 –69 .
+St Clair JJ , Burns ZT , Bettaney EM , Morrissey MB , Otis B , Ryder TB , Fleischer RC , James R , Rutz C
+2015
+Experimental resource pulses influence social-network dynamics and the potential for information flow in tool-using crows . Nat Commun . 6 :7197 .26529116
+Stouffer PC , Caccamise DF
+1991
+Roosting and diurnal movements of radio-tagged American crows . Wilson Bull . 103 :387 –400 .
+Szipl G , Ringler E , Spreafico M , Bugnyar T
+2017
+Calls during agonistic interactions vary with arousal and raise audience attention in ravens . Front Zool . 14 :57 .29299036
+Thirgood SJ
+1996
+Ecological factors influencing sexual segregation and group size in fallow deer (Dama dama ) . J Zool . 239 :783 –797 .
+Thomas L , Buckland ST , Burnham KP , Anderson DR , Laake JL , Borchers DL , Strindberg S
+2002
+Distance sampling . In: ElShaarawi AH , Piegorschs WW , editors. Encyclopedia of environmetrics . Chichester (UK) : John Wiley & Sons, Ltd .
+Treisman M
+1975
+Predation and the evolution of gregariousness. II. An economic model for predator-prey interaction . Anim Behav . 23 :801 –825 .
+Uhl F , Ringler M , Miller R , Deventer S , Bugnyar T , Schwab C
+2018
+Data from: counting crows: flock structure and subgroup size variation in an urban population of crows . Dryad Digital Repository . 10.5061/dryad.t0g149j .
+Vander Wall SB , Balda RP
+1977
+Coadaptations of the Clark’s Nutcracker and the pinon pine for efficient seed harvest and dispersal . Ecol Monogr . 47 :89 –111 .
+Wanker R
+1999
+Socialization in spectacled parrotlets (Forpus conspicillatus ): how juveniles compensate for the lack of siblings . Acta Ethol . 2 :23 –28 .
+Webb WC , Marzluff JM , Hepinstall-Cymerman J
+2012
+Differences in space use by common ravens in relation to sex, breeding status, and kinship . Condor . 114 :584 –594 .
+Wolf JBW , Mawdsley D , Trillmich F , James R
+2007
+Social structure in a colonial mammal: unravelling hidden structural layers and their foundations by network analysis . Anim Behav . 74 :1293 –1302 .
+Woolfenden GE , Fitzpatrick JW
+1984
+The Florida scrub jay: demography of a cooperative-breeding bird . Princeton (NJ) : Princeton University Press .
+Ydenberg RC , Giraldeau LA , Falls JB
+1988
+Neighbours, strangers, and the asymmetric war of attrition . Anim Behav . 36 :343 –347 .
\ No newline at end of file
diff --git a/s2orc-doc2json/tests/jats/PMC7417471.nxml b/s2orc-doc2json/tests/jats/PMC7417471.nxml
new file mode 100644
index 0000000000000000000000000000000000000000..bfadf6d064d4be79801bd46793a35e75bff8030c
--- /dev/null
+++ b/s2orc-doc2json/tests/jats/PMC7417471.nxml
@@ -0,0 +1,39 @@
+
+Nano Converg Nano Converg Nano Convergence 2196-5404 Springer Singapore Singapore 32776254 7417471 237 10.1186/s40580-020-00237-4 Review Graphene impregnated electrospun nanofiber sensing materials: a comprehensive overview on bridging laboratory set-up to industry Al-Dhahebi Adel Mohammed 1 2 Gopinath Subash Chandra Bose 3 4 http://orcid.org/0000-0002-4620-889X Saheed Mohamed Shuaib Mohamed shuaib.saheed@utp.edu.my 2 5 1 grid.444487.f 0000 0004 0634 0540 Department of Fundamental & Applied Sciences, Universiti Teknologi PETRONAS, 32610 Seri Iskandar, Perak Darul Ridzuan Malaysia 2 grid.444487.f 0000 0004 0634 0540 Centre of Innovative Nanostructure & Nanodevices (COINN), Universiti Teknologi PETRONAS, 32610 Seri Iskandar, Perak Darul Ridzuan Malaysia 3 grid.430704.4 0000 0000 9363 8679 School of Bioprocess Engineering, Universiti Malaysia Perlis, 02600 Arau, Perlis Malaysia 4 grid.430704.4 0000 0000 9363 8679 Institute of Nano Electronic Engineering, Universiti Malaysia Perlis, 01000 Kangar, Perlis Malaysia 5 grid.444487.f 0000 0004 0634 0540 Department of Mechanical Engineering , Universiti Teknologi PETRONAS , 32610, Seri Iskandar, Perak Darul Ridzuan Malaysia 10 8 2020 10 8 2020 12 2020 7 27 27 12 2019 7 7 2020 © The Author(s) 2020 Open Access This article is licensed under a Creative Commons Attribution 4.0 International License, which permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons licence, and indicate if changes were made. The images or other third party material in this article are included in the article's Creative Commons licence, unless indicated otherwise in a credit line to the material. 
If material is not included in the article's Creative Commons licence and your intended use is not permitted by statutory regulation or exceeds the permitted use, you will need to obtain permission directly from the copyright holder. To view a copy of this licence, visit http://creativecommons.org/licenses/by/4.0/ .Owing to the unique structural characteristics as well as outstanding physio–chemical and electrical properties, graphene enables significant enhancement with the performance of electrospun nanofibers, leading to the generation of promising applications in electrospun-mediated sensor technologies. Electrospinning is a simple, cost-effective, and versatile technique relying on electrostatic repulsion between the surface charges to continuously synthesize various scalable assemblies from a wide array of raw materials with diameters down to few nanometers. Recently, electrospun nanocomposites have emerged as promising substrates with a great potential for constructing nanoscale biosensors due to their exceptional functional characteristics such as complex pore structures, high surface area, high catalytic and electron transfer, controllable surface conformation and modification, superior electric conductivity and unique mat structure. This review comprehends graphene-based nanomaterials (GNMs) (graphene, graphene oxide (GO), reduced GO and graphene quantum dots) impregnated electrospun polymer composites for the electro-device developments, which bridges the laboratory set-up to the industry. Different techniques in the base polymers (pre-processing methods) and surface modification methods (post-processing methods) to impregnate GNMs within electrospun polymer nanofibers are critically discussed. The performance and the usage as the electrochemical biosensors for the detection of wide range analytes are further elaborated. 
This overview aims to attract great interest and inspire various new opportunities across a wide range of disciplines and designs of miniaturized point-of-care devices.
Keywords Electrospinning Electrospun nanofibers Nanocomposites Graphene Graphene oxide Reduced graphene oxide Graphene quantum dots Electrochemical biosensors URIF 015LB0-020 Saheed Mohamed Shuaib Mohamed issue-copyright-statement © The Author(s) 2020 Introduction Recently, the demands for highly sensitive, selective, and low detection limit biosensors to detect the low abundance of analyte molecules have increased substantially not only in biomedical applications but also in food industries, agriculture and environmental monitoring [1 ]. The development of ultrasensitive devices and new detection approaches for the efficient point-of-care testing with low-cost and high accuracy is an urgent need in the healthcare industry. Biosensors have received tremendous attention as an alternative to the conventional analytical methods due to the unparalleled specificity, sensitivity, rapidity of analysis and the ability to provide a long-term monitoring and a wide range of detection capabilities, including glucose, blood oxygen level, antibodies, mycotoxins, heavy metals in drinking water, pesticides, nucleic acid and body motions pesticides [2 ]. A variety of approaches have been exploited, including electrochemical biosensors [3 –5 ], fluorescent biosensors [6 ], colorimetric biosensors [7 , 8 ], potentiometric biosensors [9 , 10 ], optical biosensors [11 ], and Raman spectroscopy-based platforms [12 , 13 ]. Compared with other detection methods, electrochemistry biosensing platforms provide a more facile, cost-effective and a highly sensitive detection method which enables the fast response-recovery times, monitoring different analytes, and a very low detection limit [14 –16 ]. 
Recent efforts have focused on improving the sensing features of electrochemical biosensors by increasing the specific surface area of the transducers (interacting materials with the target analyte), where the larger the surface area of the sensing materials, the higher their ability to interact with the medium (analytes) [2 ].
In recent years, nanocomposite transducers comprising nano-sized materials and polymer matrices have captivated immense attention in the field of advanced materials science due to their remarkably improved thermal, chemical and dimensional stabilities, applicability, electrical conductivity, mechanical and functional properties that can be achieved at relatively lower filler loading [17 ]. The improved properties are mainly attributed to a very high aspect ratio (in the range of 100–1000) of nano sized fillers, yielding light-weight composites with alterable multifunctional properties which makes them potential candidates for several advanced applications including diagnostics and repair human tissues [18 , 19 ], aid in cellular growth and proliferation [18 ], detection of pathogens and heavy metals and offer unparalleled platforms for electrochemical biosensing. In particular, nanocomposites made of graphene based nanomaterials (GNMs) with polymers and or nanoparticles such as metals, carbon nanotubes (CNTs), quantum dots, etc., could provide abundant opportunities for fabricating novel sensors and biosensors with enhanced performance [17 , 20 , 21 ].
GNMs including graphene, graphene oxide (GO), reduced graphene oxide (rGO) and graphene quantum dots (GQD) have attracted extensive interest in research/industrial applications because of their potential and unique properties. GNMs are suitable for fabricating a wide range of novel biosensors with improved functionalities and analytical capacities thus providing fascinating opportunities for point-of-care detection, lab-on-chip devices, wearable and flexible electronics, foodborne detection, and environmental monitoring [2 , 22 , 23 ]. The attractiveness of GNMs transducers relies not only on their ability to act as efficient and stabilizing platforms for the biorecognition elements, but also on their large surface area, small size, physio-chemical properties, high reactivity, high catalytic efficiency, strong adsorption ability, controlled morphology and structure, biocompatibility, and electrocatalytic properties [18 , 24 ]. The favourable structural and compositional synergy of GNMs allows them to be excellent electrode materials for fabricating various sensing platforms [1 ]. Specifically, the integration of GNMs and electrochemical biosensors has created various ingenious biosensing strategies for applications in the areas of food safety and clinical diagnosis [25 ].
Despite the great potential of GNMs and polymer nanocomposites, conventional nanocomposite methods including solvent processing, in situ polymerization and the allied processing encounter several issues such as the agglomeration and aggregation of graphene in the polymer matrix solution, the reduction of the electrical and mechanical properties of GNMs as a results of the insulating polymer matrix and poor dispersion of GNMs nanofillers. The aggregation of graphene is caused by its strong intermolecular π–π interaction, and van der Waals forces resulting in a poor dispersion in the polymer matrix [26 , 27 ]. To circumvent these obstacles, electrospinning provides a facile and effective way of incorporating GNMs [28 , 29 ] e.g. GO sheets with very high aspect ratios into the polymer solution overcome the problem of agglomeration since the polymer matrix is converted to nanosized fibers instead of continuous sheets, thus facilitating better dispersion of the exfoliated GO [30 ]. More importantly, properties such as porosity, elasticity, hydrophobicity, mechanical strength, percolation limit and conductivity can also be tuned by controlling the nanofiller size as well as the electrospinning parameters and solution parameters [31 ]. Apart from this, GNMs can be decorated on the surface of electrospun nanofibers (ESNFs) using post-processing methods enabling the possibility to fabricate multifunctional GNMs nanostructures with novel and/or improved biosensing performance. GNMs-polymer nanocomposites prepared by electrospinning possess both the advantages of polymers such as lightweight, flexibility and moldability, and special functionality of GNMs such as high strength, thermal stability and electrochemical properties [32 ]. 
Furthermore, the functionality and the dispersity of GNMs can be further improved by incorporating secondary phases such as precious metals, metal oxides, gold nanoparticles, CNTs, and hydroxyapatite either during electrospinning or in the post-processing methods, e.g. wet chemical treatment [33 ]. Owing to their remarkable properties, synergistic effects, unique structures and the excellent electron and mass transportation, the ESNF-GNMs composites are potential candidates to improve current technology and open the door to fabricating and commercializing extremely miniaturized new generation biosensors and smart wearable electronics for point-of-care detection in biomedicine and healthcare fields [1 , 34 , 35 ].
Electrospinning (electrostatic spinning) involves an electrohydrodynamic process, during which a liquid droplet is electrified to generate a jet, followed by stretching and elongation to generate fibers [36 ]. Electrospinning setup comprises four essential components namely, a spinneret with a metallic needle (a hypodermic needle with blunt tip) and capillary tube, a syringe pump, a high-voltage–power supply, and a grounded (conductive) metal collecting screen (e.g. aluminum alloy) [37 ]. The procedure of electrospinning can be elucidated based on four main stages which are electrification, jet initiation and extension, bending instability and further elongation, and solidification of the jet into fibers [38 ]. ESNFs diameter and morphology play an essential role in constructing biosensors and are controlled by the process parameters (applied voltage, receiving distance and feed rate), solution and solvent conditions (viscosity, concentration, conductivity, surface tension, volatility) and ambient conditions (humidity, temperature, pressure) [28 ]. Electrospinning has been extensively reviewed with respect to its development, principle and fundamentals, and the critical parameters influencing the fiber diameter and morphology in several recent reviews such as [34 , 39 –45 ].
+Due to the lack of comprehensive reviews on electrospinning design of GNMs for electrochemical biosensors, this overview aims to adequately explore the role of electrospun GNMs nanocomposites for designing electrochemical biosensors and sensors with high sensitivity and selectivity and with low detection limits. Additionally, methods for impregnating GNMs into ESNFs, either during the electrospinning process using pre-processing methods or after electrospinning as surface modification and functionalization using post-processing methods, are presented. Besides, the properties of electrospun GNMs nanocomposites (electrochemical, mechanical, thermal stability and electrical conductivity) and their role in electrochemical biosensors design are critically addressed. This review covers a range of electrochemical biosensors and sensors that use electrospun GNMs nanocomposites for the detection of various analytes.
+Graphene-based Nanomaterials (GNMs) Graphene (the first ever reported 2D paper like lightweight material) consists of sp$^2$-hybridized carbon atoms that are tightly arranged into hexagonal structures to form a 2D monolayer of graphitic structure analogous to a polycyclic aromatic hydrocarbon of quasi infinite size [46 ]. As a basic building block of other carbon dimensionalities (allotropes), graphene can be wrapped to generate 0D “buckyballs” (e.g. fullerenes), rolled up to form 1D nanotubes, and stacked to produce 3D graphite [47 –49 ]. Since its discovery in 2004 [50 ], graphene has been recognized as a “wonder material” mainly due to its atomic crystal multifunctionality which combines remarkable properties such as high electron mobilities at room temperature ($250{,}000\ \mathrm{cm^2/(V\,s)}$) at electron densities of $2 \times 10^{11}\ \mathrm{cm^{-2}}$ [51 , 52 ], unparalleled thermal conductivity in the order of 5000 W/mK [53 ], superlative mechanical strength (Young’s modulus of ~ 1 TPa) [54 ], large surface area ($2630\ \mathrm{m^2/g}$) [55 ], and electronic properties, making it attractive for several applications including sensors, biosensors, electronic devices, supercapacitors, spintronic, photonics, flexible and next generation electronics, biomedical applications, energy storage and solar cells [46 , 56 –65 ].
There are excellent recent reviews on the use of graphene for medicine and biology applications [66 ], graphene metal nanocomposites for electrochemical biosensing applications [67 ], graphene nanocomposites for various applications [68 ], graphene based biosensors for food contaminates detection [69 ], graphene for biosensors [70 –74 ], electrochemical sensors [75 –79 ] and sensors [80 –82 ] for biomedical and other downstream applications [73 , 77 , 78 , 83 –88 ].
GNMs fabrication GNMs include 2D, 3D graphene sheets, GO, rGO, and GQDs can be prepared following two types of fabrication methods: (i) top-down and (ii) bottom-up approaches (Fig. 1 a) [89 ]. The former approach relies on exfoliating stacked layers of graphite by chemical, physical, and thermal treatments to form graphene and it includes micromechanical exfoliation [50 ], supramolecular assembly [90 ], conducting polymers [91 ] and water-soluble polymers [92 ]. The latter includes chemical vapor deposition (CVD) and chemical synthesis methods [93 , 94 ]. The electrochemistry of graphene and its derivatives depends on the number of defects, functional groups, stacked layers, size of graphene sheets and dopants or impurities present [95 –100 ]. CVD is a vacuum deposition process used to harvest graphene sheets (single or multilayer) with high quality, fine aromatic structures with limited defects, compact constitutes, high reactive surface, electrical conductivity and elasticity making it highly attractive for electrochemical sensing [101 ] and bioelectrodes to detect molecules and bio-organisms [58 , 102 –105 ]. Single-layer graphene (SLG) possess higher electron conductivity at room temperature [250,000 cm2 /(V s)] [106 ], thus promoting its applicability for electronics and optoelectronic devices. In principle, the CVD procedure is the shortest and most useful method that allows growing graphene flakes on several substrates (transition metals) such as Ge [107 , 108 ], Ni [109 , 110 ], Cu [111 , 112 ], Rh [113 , 114 ], and etc.Fig. 1 a Major fabrication methods of graphene: Top-down and bottom-up fabrication methods. Principal top-down methods include liquid-phase exfoliation and micromechanical cleavage of graphite. An additional method involves the exfoliation of initially oxidized graphite, leading to GO, which is chemically and/or thermally reduced to graphene. 
The bottom-up fabrication of graphene is usually performed by epitaxial growth on SiC or chemical vapour deposition, typically on Cu using small molecules, such as methane, as precursors. Reproduced with permission from [174 ] Copyright 2017 Nature Publishing Group. b SEM (a, c, e) images and TEM images (b, d, f) of nanofibers (a, b), nanofibers-rGO-5 (c, d), and nanofibers-rGO-10 (e, f) with different magnifications
Reproduced with permission from [173 ] Copyright 2019 Wiley
![]()
Hummer’s method is a top-down approach to fabricate high-quality and scalable oxidized graphene sheets with different nanosized, good solution process-ability, oxygen content, and sheet layers [115 ]. GO is an excellent form of graphene [116 ] having a simultaneous hydrophobic sp2− and sp3− bonded carbon and abundant carboxylic acid groups, epoxide and hydrophilic hydroxyl, especially on the edge and defects of the nanosheet, hence forming a sheet-like amphiphilic colloid [117 ]. GO, due to its abundant residual sp2− and hydrophilic groups can form stable suspension in aqueous and several polar solvents and form π–π interactions with aromatic molecules [118 ]. Furthermore, the polar chemical groups, carboxyl acid, epoxide, and hydroxyl on the basal plane allow GO to undergo weak interactions for example strong electrostatic interactions or hydrogen bonding and metal ion complexes which also provide abundant chemically reactive groups for surface grafting/anchoring of polymers and or nanoparticles [2 ]. The oxidized functional groups of GO improve its dispersion in polymer matrices and minimize the aggregation and phase separation. GO, due to its amphiphilic sheet-like characteristics acts as a surfactant reagent to react with other nanomaterials [117 ]. RGO, can be obtained by chemical or physical reduction of GO by thermal, chemical, and irradiation methods which are cost-effective approaches to fabricate graphene sheets with a good electrical conductivity. Compared to graphene and GO, rGO has more balanced physical and chemical properties regarding surface chemical groups, electrical, mechanical, solvent dispersibility, optical, and thermal performances [118 ]. Due to these properties, rGO nanosheets are potential candidates for the next-generation electronics, sensors and transistors. 
GQDs are nanometer-sized single layer-fragments (their sizes are less than 20 nm in diameter) of graphene and GO, which are typically synthesized via a top-down approach through “cutting” of graphene or GO nanosheets [119 ]. GQDs exhibit several remarkable physical properties such as the edge defects induced luminescence and the quantum confinement, making GQDs suitable for interesting applications including cell imaging, bioelectrodes and molecular recognition [120 –122 ].
Electrospun nanofibers containing GNMs Research pertaining to electrospinning has gained significant traction in recent years, as it provides a versatile and viable tools for generating various matrices in a continuous process and with uniform pore sizes, where the fiber diameters are adjusted from nanometers to sub-microns [40 , 123 , 124 ]. ESNFs with diameters lower than 1 nm (subnanometers) have also been recently reported [125 , 126 ]. Although, there are several analogous nanofiber production methods such as nanolithography, self-assembly, melt fibrillation, drawing and template synthesis, electrospinning combines simplicity, low cost and versatility with superior capabilities to manufacture high quality nanofibers with diverse and controlled morphologies and complex nanofibrous assemblies [127 –129 ]. Electrospinning has been successfully applied to produce nanofibers from a wide range of materials, including organic and inorganic polymers, ceramics, metals, graphene, carbon nanotubes, small molecules, and their combinations as well as bacteria, viruses, biomolecules [40 , 130 , 131 ]. The incorporation of GNMs into ESNFs enables significant enhancement towards biosensing capability either by improving the response characteristic of the transducer or acting as the immobilisation matrix for a bioreceptor [132 ]. GNMs can be incorporated into the ESNFs using two main strategies: (i) pre-processing methods (direct blending and in situ synthesis) and (ii) post-processing methods (e.g. physical dip-coating, ultrasonication, plasma treatment, wet chemical method and radiation treatment [68 , 133 ].
Electrospinning design of GNMs NF composites using pre-processing methods Introducing GNMs into the polymer solution matrices for electrospinning is a simple and effective method to fabricate electrospun composites for various advanced applications such as sensing and biosensing [28 ]. In principle, the pre-processing methods consider the size distribution and interface interactions during the encapsulation of GNMs within the polymer nanofibers. In this case, the GNMs should be more stable to ensure the long-term storage stability and excellent reusability of the GNMs ESNFs composite biosensors. In GNMs ESNFs prepared by the pre-processing methods for electrochemical biosensing applications, the GNMs act as the electron transfer platform while the polymers act as a selective adsorptive for bio-tests thus both GNMs and polymers work as a device for electrochemical biosensor electrode. GNMs ESNFs act as a bridge between the test biomolecules and the signal transduction system and thus plays a critical role in both sensor and conductor parts of electrochemical biosensors. High dispersion and even distribution of GNMs within the polymer matrices enable the fabrication of nanofiber composites with highly functional nanofiber composites, novel hierarchical architectures, high specific surface area and tuned porosity, excellent chemical, thermal, electrical and electrochemical properties offering unparalleled performance for point-of-care detection and lab-on-chip devices [132 ]. There are two effective strategies to ensure uniform distribution of GNMs into polymer nanofibers; the direct blending or mixing of GNMs with polymer matrix before electrospinning and in situ synthesis during electrospinning.
Direct blending of GNMs in polymer nanofibers Blending of GNMs into polymer matrix solution is the basic and straightforward way to fabricate GNMs NFs composites. In this strategy, the direct doping of GNMs into polymer matrix may decrease the surface energy of GNMs which in turn tends to cause local cross-linking between GNMs and polymers. In the case of electrochemical biosensors, the even distribution and dispersion of GNMs within the polymer solution matrices is an essential attribute for improving the linear detection range, sensitivity and limit of detection. Therefore, other ways to improve the dispersity and homogeneity of GNMs within the polymer solution matrices should be investigated.
Dispersing GNMs using external forces One of the main challenges in fabricating nanofibers GNMs is the fact that they have high specific surface area and free energy and tend to agglomerate and/or aggregate which compromise their final performances for biosensor applications [20 ]. The agglomeration of GNMs may be ascribed to their short-range interactions with the polymeric molecules and the overlapping of interfacial layers of neighbouring graphene nanofillers or polymers. Therefore, if GNMs are not well dispersed and distributed into the polymer matrices at a nanoscale level, the weak molecular interactions take place and the inhomogeneous dispersion may complicate the electrospinnability of solutions, thus reducing the graphene loading capacity and influencing the overall material properties. To overcome these issues, treating the solution with an external force to aid dispersity of GNMs such as manifold repetition of blending and violent stirring, ultrasonic dispersion methods (ultrasonication bath and ultrasonication probe) or by modifying the surface of graphene materials with active surface agents (adding additive to promote the dispersity of GNMs). Adding additives allows mitigating the huge gap in surface energy between the GNMs and the polymer matrices to obtain a better solvability and suitable nano-scaled distribution thus improving their spinnability. Several spacers have been introduced into GNMs to improve the dispersity and to enhance the specific surface area to provide extra adsorption sites for bio and sensing molecules such as metals and metal oxide nanoparticles [134 , 135 ], organic moieties [136 , 137 ], and polymers [134 , 135 ]. 
Functionalization of graphene using chemical, electrochemical and sonochemical methods improved its dispersion within polymer matrices, for example functionalized graphene such as GO enhances its dispersion in various polymer matrices due to the interfacial interactions between the functionalized graphene and the polymer [138 , 139 ]. Several studies have used external forces and/or adding additives to improve the dispersity and distribution of GNMs in polymer matrix as reported in Table 1 .Table 1 Summary of recent significant works on electrospinning design of GNMs with polymer matrices using pre-processing methods
+GNMs Polymer Solvent Additives Dispersion method/external force ES parameters: (distance; voltage; feed rate) Refs. GO PVDF DMF: acetone
4:1 wt/wt%
– Hydrophobic modification of GO with subsequent sonication and stirring (27.7 cm; 24.1 kV; 1.23 mL/h) [140 ] rGO PANCMA DMF TiO2 Ultrasonication and microwave heating (30 cm; 14 kV; 0.02 mL/h) [141 ] GO Poly (lactic acid) (PLA)/poly(butylene carbonate) DMF solvent PBC Stirring 18 kV [142 ] GO PCL DMF: DCM 1:1 – Stirring 14 cm; 18 kV; 10 mL/h [143 ] rGO poly (ester amide) (PEA) Ultrasonication bath (12 cm; 20 kV; 0.1 mL/h) [144 ] GR PLA DCM: TFA
2:1 v/v
– Ultrasonication (15 cm; 10–20 kV, 2 mL/h) [33 ] GR PU THF: DMAC
3:2 w/v
– Ultrasonication (10 cm; 15 kV; 1 mL/h) [145 ] GO PAN DMF – Probe and bath sonication and stirring (15 cm; 18 kV; 0.2 mL/h) [146 ] Gr 66nylon TFA: acetone
1:1 v/v
– Bath and tip sonication (15–20 cm; 15–20 kV; 0.17 to 0.5 mL/h) [147 ] Gr Polycaprolactone DMF – Stirring (10 cm; 10–14 kV; 0.4–0.5 mL/h) [148 ] GO PLGA 1,1,1,3,3,3-Hexafluoroisopropanol (HFIP) – Stirring (10 cm; 40 kV; 0.07–0.1 mL/min) [149 ] GO/MWCNT PEO DMF Sonication and vigorous stirring (15 cm; 18.4 kV; 0.5 mL/h) [150 ] GR Polyamide 66 Formic acid – Stirring (15 cm; 20 kV;0.25 mL/h) [151 ] GO PVDF DMF – Sonication and stirring (15 cm; 18 kV; 1 mL/h) [152 ] GO-ZnO Gum arabic (GA) and PVA – Stirring and heating (130 mm; 0–50 kV) [130 ] GO Polyurethane (PU) DMF Ag Stirring and heating (18 cm; 18 kV;1 mL/h) [153 ] GO poly(Acrylonitrile-co-maleic acid DMF – Microwave heating and ultrasonication (12; 25 kV; 0.03 mL/h) [154 ] Graphene Nano sheets poly (Trimethylene terephthalate) TFA – Stirring (14 cm; [155 ] GO CA DMF: acetone
2:3 wt/wt%
– Sonication and heating and stirring (15 cm; 27 kV; 0.13 mL/h) [156 ] GO PLA DMF – Stirring (6 cm; 20 kV; 1 mL/h) [157 ] rGO Polystyrene (PS) (DMF: THF) 1:1 – Magnetic stirrer 22 kV [158 ] GQD PAN DMF – DMF Magnetic stirring 240 cm; 15 kV; 0.63 mL/h [159 ] GO CA Acetone/DMAc (w/w 2:1) – Stirring (8–10 cm; 20–25 kV; 1.5 mL/h) [160 ] Fluorine-doped GO, GO, and GOCOOH PVDF DMAC: acetone (v/v 4:6) 1 g of selectfluor and 0.1 g silver nitrate Stirring (12 cm; 25 kV; 0.5 mL/h) [161 ] rGO PVP/Chitosan Acetic acid: water
9:1 (w/v)
– Stirring (6 cm; 22 kV; 0.5 mL/h) [162 ] rGO PMMA/PANI DMF Stirring and sonication (15 cm;18–20 kV; 0.3 mL/h) [163 ] GO PLA/PCL CF: DMF (v/v = 4/1) – Magnetic stirring and sonication (20 cm; 20 kV) [164 ] GR PVDF Sonication and stirring (17 cm; 20 kV; 1 mL/h) [165 ] GR PCL Acetic acid Gelatin Sonication and stirring (15 cm, 10–20 kV; 0.2–1.8 mL/h) [166 ] Gr and GO PVDF Ultrasonication probe (100 W, 40 kHz, 15 mints) (100 mm; 16 kV; 2 mL/h) [167 ]
+In-situ synthesis of GNMs in polymer nanofibers Similar to blending, the in situ synthesis is an effective strategy to disperse GNMs into the polymer solution to form GNMs NF composites using several methods such as hydrothermal reaction, sol–gel synthesis, oxidation–reduction reaction and hydrolysis. In this strategy, GNMs dispersity in the polymer matrix can be assisted using reactions triggered by light, heat, electrochemistry and reactive additives to uniformly distribute GNMs ions inside the polymer matrix with controlled sizes and uniformity while avoiding the agglomeration of GNMs. Sahatiya and Badhulika [168 ] reported a facile one step method for in situ synthesis and alignment of a single graphene-doped zinc oxide electrospun nanofiber composite. They optimized the calcination temperature and the time-dependent electrospinning to fabricate aligned graphene-ZnO composite nanofibers across the gold electrode. The reported method is a cost-effective way to detect UV and it can be extended to a variety of sensing applications. He et al. [169 ] reported in situ synthesis, carbonization and electrospinning to fabricate porous graphene-doped copper indium disulfide/carbon (p-GN@CuInS$_2$/C) composite nanofibers in which graphene nanosheets anchored with CuInS$_2$ nanocrystals of 7–12 nm in diameter were overlapped and embedded in a carbon matrix, aligning along the fiber axial direction. The resultant graphene nanofiber composite exhibited smaller charge-transfer resistance, larger surface area, and better electrocatalytic activity than CuInS$_2$/C and p-CuInS$_2$/C samples.
Dispersion of GNMs using electrospinning Electrospinning applies electrostatic stretching forces to overcome any entanglement and agglomeration of GNMs by increasing their interface contact with the polymer matrix thereby making possible chemical bonds between them. It also provides shear stress transfer mechanics from the polymer matrices to the nanometric of GNMs thus improving the dispersion of GNMs and prevents their aggregation. Additionally, during the electrospinning, the high elongation of the polymer jet improves the orientation and alignment of GNMs along the fiber axis and embeds them in the fiber core thereby achieving highly distributed GNMs-ESNF composites. The content of GNMs influences their dispersion and induces the changes to the solution rheological and physical properties such as electrical conductivity and viscosity and the diameter of the nanofibers. For instance, the increase of GNMs content induces a higher viscosity which in turn results in forming thicker fibers. Meanwhile, the electrical conductivity will rise with the increase of the GNMs content which favours the stretching of thinner fibers [170 ]. Due to these opposite behaviors, some studies have shown variable fiber diameters as the loading of the nanomaterial is increased [170 , 171 ]. Recently, [172 ] reported a dual method comprising of electrospinning and electrospraying to overcome the difficulty of blending and dispersing polyacrylonitrile (PAN) and GO in the same solvent. Shan et al. [173 ] reported the fabrication of a free-standing nitrogen-doped reduced graphene oxide nanofibers using electrospinning technique. The developed nanofibers showed high electronic conductivity and thus has the potential to be used for chemical sensing, separation and drug delivery. Figure 1 b) depicts the scanning electron microscope (SEM) results for the developed PAN-GO ESNF mats.
+Electrospinning design of GNMs NF composites using post-processing methods Although direct blending of GNMs is the simplest and most effective method, one of the critical limitations of blending GNMs into the polymer solution is that as-prepared nanofiber composites may show relatively low conductivity because the conductivity of GNMs could be warped within the insulating polymers. An alternative approach is to impregnate GNMs onto the surface of ESNFs after the electrospinning process using the surface modification methods (post-processing methods). This approach aims at avoiding the problems associated with pre-mixing GNMs into the polymer matrix (e.g. agglomeration and low conductivity) and providing a robust strategy to improve the physiochemical and biological properties of ESNFs. In principle, post-processing methods impregnate or coat GNMs on the surface of the desired ESNFs using chemical or physical strategies to alter the surface of the nanofibers by giving them new features (e.g. surface activation, enhancing surface conductivity) [175 ]. This induces a large number of active sites for further biomolecular immobilization while considering the surface properties of the nanofibers, which mainly depend on the chemical composition of the spinning solution and the surface structure of the fibers [176 ]. This approach is essentially simple and easy to implement and is economically more feasible at an industrial scale than direct mixing of polymers with GNMs. It is worth noting that the arrangement of GNMs should be made to transfer more GNMs to the electrospun polymer nanofiber surface to increase the chance of the interaction between GNMs and bio-analyses which is of great benefit for biosensors [177 ]. The methods for incorporating electrospun nanofibers with GNMs for sensing applications include physical adsorption and coating, surface graft polymerization, layer-by-layer, plasma modification, chemical doping, heteroatoms doping, wet chemical methods etc. 
Table 2 summarizes the recent post processing methods used to impregnate ESNFs with GNMs.Table 2 Summary of recent significant works on electrospinning design of GNMs with polymer matrices using post-processing methods
GNMs ES NFs Postprocessing method Mechanism Potential applications Refs. Ag-AQGO PEO/PVA Wet chemical route method The ESNFs were immersed into the as-prepared Ag-AQRGO solution to self-assemble the negatively Ag-AQRGO onto the positively charged NFs in an aqueous solution. The Ag-AQRGO was further washed away with deionized water. After drying in air, the AgNP-3D-AQRGO sensor was obtained Gas sensors [32 ] PEDOT-CNT/rGO PVDF-TrFE Spray coating PEDOT-CNT/rGO is decorated on ES PVDF-TrFE NFs following these steps:
1. Functionalization of PVDF-TrFE ES NF: using dip coating of ethanol, potassium hydroxide and potassium permanganate and finally hydrogen peroxide.
2. Spraying of the positively charged MCNTs suspension and negatively biased rGO solution on the functionalized PVDF-TrFE ESNF
3. Coating of PEDOT on the substrate to further enhance the electrical conductivity and sensitivity.
Piezo-electric pressure sensor and wearable smart textiles [33 ] rGO PVP/InCl3 Ultrasonic dispersion The hybrid nanofibers (NFI-rGO) were obtained via ultrasonic dispersion of 2 mg NFI in a rGO aqueous suspension (0.1 mg mL−1 ) for 5 min Gas sensing in different environments, with a 44 ppb detection limit and a response time of 17 s [34 ] rGO PVA Cross linking and chemical radiation modification method The PVA nanofibers were crosslinked (to make them stable and water resistant) with UV-light of 253.7 nm (UV-340 lamp) at 30 W with different durations (15, 30, 45 and 75 min) and then they were kept in both water and PBS solutions to optimize crosslinking duration Filtration, sensors/biosensors, thin films and packaging [35 ]
Among the simplest, fastest and easiest methods to endow electrospun nanofibers with GNMs active sites for target interactions is physical dip-coating. This method relies on the interaction between the sensitive probe molecules and the nanofibers, which often involves van der Waals forces, hydrophobic forces, electrostatic forces, and hydrogen bonding [178 ]. However, the efficiency and strength of biomolecular immobilization in this case are relatively weak [179 ]. To overcome this limitation, the plasma treatment method increases the efficiency of physical adsorption onto the hydrophobic nanofibers by creating a more hydrophilic surface, thus enhancing biomolecule attachment because of the large availability of carboxyl and hydrophilic surface groups. The layer-by-layer method offers a versatile way to modify the surface of ESNFs by utilizing electrostatic attraction to assemble polyelectrolyte multilayers, allowing nanoscale control over composition and structure and thereby manipulating the physicochemical, mechanical and biological properties. Chemical doping with atoms is an effective strategy to obtain intrinsic modification of carbon nanomaterials to improve their electrochemical properties [180 ].
Recently, [181 ] prepared pristine SnO2 nanotubes (NTs) by one-step electrospinning and GO was doped into the as-prepared SnO2 NTs nanofibers by calcination treatment as shown in Fig. 2 (1). First the prepared electrospun SnO2 nanotube fibers were annealed at 600 °C for 2 h to remove polymers and the organic residuals and to oxide the inorganic precursors into SnO2 nanostructures. Next, 0.03 g of pristine SnO2 NTs bundles were dipped into 1 ml of GO (mixed in DI water) solution and dried in the air for several hours. Finally, GO-loaded SnO2 were obtained after thermal annealing at 200 °C. SEM images are presented in Fig. 2 (2) and the obtained results revealed that the modification of SnO2 nanotubes by GO shows the improved sensing properties (e.g. faster response) attributed to the large interfacial interaction between the GO and the SnO2 NTs.Fig. 2 (1): Pristine and GO-SnO2 NTs preparation and gas sensor mechanism and (2) SEM images of (a ) as-prepared Sn + poly (vinyl pyrrolidone) (PVP) nanofibers (b , c ) pristine SnO2 , and (d , e ) GO incorporate SnO2 NTs, (f ) Histogram of GO-SnO2 NT diameters
Reproduced with permission from [181 ] Copyright 2019 Elsevier
![]()
Tambakoozadeh et al. [182 ] utilized in situ polymerization to prepare polyaniline (PANI)/graphene–coated polyamide nanofiber composite for electrochemical applications. The composite PANI/GO nanofibers were treated with monohydrate to reduce GO to graphene, and this was followed by the re-oxidation of PANI. The electrical conductivity of the composite PANI/graphene-coated nanofiber was enhanced mainly due to the presence of graphene as well as the increase of aniline concentration in the polymerization process. In terms of the mechanical properties, the presence of GO enhanced the tenacity of the coated nanofibers, which is ascribed to the homogenous dispersion of graphene nanosheets and thus the effective load transfer from the matrix to graphene because of their strong interfacial adhesion. As for the electrochemical properties, the cyclic voltammetry (CV) curves of the coated nanofibers at a scan rate of 10 V/s and with a potential window from 0 to 0.9 V are shown in Additional file 1 : Figure S1.
Zheng et al. [183 ] assembled RGO onto the polyurethane (PU) electrospun nanofiber composite assisted by ultrasonication to obtain a polymer core-RGO shell structure. First PU was dissolved in dimethylformamide (DMF) solvent and stirred for 12 h at 60 °C to produce a homogenous solution. The solution was then placed in a syringe and the electrospinning was processed at a flow rate of 1 ml/h, a voltage of 15 kV and the receiving distance was 15 cm. RGO solution was prepared by dispersing RGO in ethanol, water or acetone solvents and ultrasonicated for 0.5 h. The resultant ES PU nanofibers were dipped in the dispersed RGO solution under ultrasonication for different duration from 10 s to 20 min during which RGO nanosheets were gradually assembled on the nanofiber surface to form the core–shell structure. Finally, the RGO decorated composite mat was obtained after washing with ethanol and drying at 60 °C for 12 h. Samani et al. [148 ] observed an increase in the conductivity and mechanical properties when adding graphene in the polymer matrix for electrospinning. Gozutok et al. [184 ] dispersed rGO in the poly (vinyl alcohol) (PVA) solutions without using any co-solvent and then electrospinning was used to fabricate nanofiber mats. By adding rGO, the properties of the PVA/rGO NF composite such as the porosity, inter fiber, pore size, and average fiber diameter were relatively improved. It was also observed that, the increase in rGO content improved the mechanical properties, thermal stability and electrical conductivity while the crystal structure of PVA did not change.
Properties of electrospun GNMs nanocomposites ESNFs differentiate themselves by their remarkable functional features such as an extremely high surface-area-to volume ratio, ultra-fine diameter, high aspect ratio of length to diameter and molecular orientation along fiber axis, a complex and large porous structure with excellent pore-interconnectivity and tunability, a great mechanical performance, diverse fibrous morphologies, physio–chemical and electrical properties and adjustable structure and diameter [31 , 43 ]. Due to their specialized features, ultrathin diameters and controlled porosity, electrospun nanofiber have demonstrated high potential for a wide spectrum of applications that includes enhancing the performance of analytical devices, biomedical applications, sensor and biosensor technologies [40 ].
Impregnating GNMs into ESNFs, either during electrospinning through pre-processing or after electrospinning using post-processing methods, imparts the nanofibers with remarkable properties and morphological structures, useful for electrochemical sensing and biosensing. In terms of electrochemical properties, the 3D interconnected hierarchical structures of GNMs facilitate the diffusion of different types of biomolecules as well as maintain their biocatalytic bioactivity functions, thereby improving the sensitivity and functionality of biosensors. Owing to their intrinsically high strength derived from the very strong carbon bonds as well as their interactions with the polymer solution matrix and their degree of dispersion, the addition of GNMs can overwhelmingly improve the tensile strength and Young’s modulus of the ESNFs. GNMs are remarkable additives to improve the mechanical and electrical properties of electrospun nanofibers [185 ]. Dispersing GNMs into polymer matrices has been reported to improve the electrical, mechanical, thermal and other properties of polysulfones [186 , 187 ], polyimide [188 , 189 ], polycarbonates [190 , 191 ], polyamides [192 , 193 ], polyethylene terephthalate [192 , 193 ] and polybutylene terephthalate [194 , 195 ]. Gorji et al. [196 ] reported that the incorporation of GO into electrospun PU and pH-sensitive dyes contributed to a faster response (7 s) and improved the sensor’s sensitivity to detect pH in chemical vapor solution. Table 3 summarizes the recent studies on impregnating GNMs into ESNFs and the subsequent improved properties. Choi et al. [153 ] reported a stretchable and transparent nanofiber-networked electrode (STNNE) based on intrinsically stretchable electrospun nanofibers of polyurethane (PU)/reduced graphene oxide (rGO)/silver nanoparticles (AgNPs) (Fig. 3 ).
It was found that the AgNPs highly dispersed in the PU/rGO nanofibers improved the electrical conductivity and mechanical stretchability. Furthermore, the presence of rGO and the formation of fused intersections between the nanofibers, which occurred during the electrospinning process, together improved the electrical stability of the fabricated STNNE. The fabricated STNNE was successfully demonstrated as a stretchable capacitive touch sensor on an elastomeric substrate.
Table 3 Summary of studies on the improvement of various properties when adding GNMs in the polymer spinnable solution
GNMs/polymer Spinning parameters Improved properties Potential applications Remarks Refs. Polyacrylonitrile (PAN)/GO (15 cm; 15 kV; 0.8 mL/h)
GO = 0.4%wt
Mechanical strength (by 3–4 times), the thermal stability and hydrophilicity (by 50%) Water treatment and battery performance The content of GO influences its dispersion and thus may affect the fiber formation as well as the final performance and properties of ES fibers [36 ] GO/GR/Halloysite NT/PVDF (10 cm; 16 kV; 2 mL/h) Piezoelectric and pyroelectric. The thermal stability (by 94%) Young’s modulus increased by 20 times Wearable electronics and energy harvesting applications from body movements The content of the nanofillers shows significant effect on piezoelectric responses due to enhancement of electroactive β‐phase [37 ] GO/PEO/PAN (20 cm; 20 kV; 1 mL/h) Electrolyte uptake ionic conductivity The content of GO influences the fiber diameter and ionic conductivity Homogenous distribution of GO fillers in the polymer matrix causes increase in the electrolyte uptake and electrical conductivity of the nanofibers [38 ] PCL/rGO (10 cm; 10, 15 kV, 6 mL/h) s Mechanical behaviour, electrical conductivity and thermal stability Human tissue repair The evaluated properties were affected according to the amount of rGO used and the applied voltage [39 ] GO/PET (12 cm; 10 kV; 0.1 mL/h) Young modulus by 50% MPa and the electroconductivity Improve cell attachment and proliferation The GO, spinning parameters and concentration control the electroconductivity, mechanical properties and the uniformity of NF [40 ] PU-GO-PDA (20 cm; 20 kV; 0.2 mL/h) Wettability, water absorption, and both cell attachment and proliferation Bone regeneration of tissues PU/GO was prepared by electrospinning and then PDA was coated by immersing PU/GO NF in dopamine hydrochloride solution under constant stirring (1.5 mg/L in 10 mM of Tris buffer pH = 8.5) at room temperature in a dark environment. 
After 24 h, the scaffold was washed with deionized water three times, and air dried [41 ] GO/(Sulfonated PVA) (17 cm; 15 to 18 kV; 300 to 800 µL/h) Thermal and hydrothermal stability, conductivity retention of humidity Ionic polyelectrolyte membrane The combination of the sulfonation, the crosslinking, and the addition of GO enhanced the proton conductivity [42 ] PVA/rGO (6–10 cm; 15–25 kV; 10–20 µL/min) Tensile strength (~ 5 MPa) and the elastic modulus (~ 1.5 GPa). Thermal stability and the electrical conductivities Biosensors, sensors etc. The increment of rGO (1% wt) improved PVA NF properties due to the strong interfacial interaction between rGO and PVA
rGO dispersion in the PVA solution did not alter the crystal structure of PVA
[35 ] Ag/rGO/Polyamide (PI) (20 kV; DMAc: THF = 3:2, wt:wt%
Stirring
The λ, Tg and THRI values of the (Ag/rGO)/PI nanocomposites were all increased with increasing the Ag/rGO filler loading – The aggregation of rGO can be effectively restricted by introducing Ag nanoparticles
–
[43 ] PVDF-Pt–Pd/RGO-CeO2 15 cm; 12 kV; 1 mL/h Increased thermal and catalytic properties DMFC applications Novel electrospinning of PVDF-Pt–Pd/RGO-CeO2 nanocomposites [44 ] GO-doped PVDF/CuO/Al 15 cm; 0.07 mm/min Heat of reaction and reaction efficiency of PVDF/CuO/Al nanocomposites
Strong anti-oxidation capability
– Electrospinning and GO doping can improve the reaction efficiency due to the improvement of microstructure quality and nanocomposites performance [45 ] PU and PU/rGO‐Ag 17 cm; 18 kV; 0.3 mL/h Tensile strength
Electrical conductivity
Cardiogenic differentiation
Potential
Wettability
Cardiac tissue engineering Adding rGO‐Ag and concentration influence the fiber diameter and the final properties [46 ] MnO2-GO 10 cm; 15 kV; 0.5 mL/h Electrochemical dielectric behaviors higher charge mobility, diffusivity, and conductivity. Future energy storage devices – [47 ] PCL/GO-Gelatin 13 cm; 14 kV; 3 mL/h Tensile stress and Young’s modulus Anti-tumor effect of classical therapies The presence of 1 wt% graphene oxide increased mechanical strength of PCL/Gel [48 ]
Fig. 3 a Technological flow chart of the patterned STNNE. b FESEM image of the networked nanofibers. c FESEM image of the intersections of the nanofibers. d Optical photographs of the stretchable and transparent networked nanofibers film. Dispersion of PU/rGO/AgNPs in nanofibers. e Raman spectra of PU/GO/AgNPs nanofiber and PU/rGO/AgNPs nanofiber samples with a GO:AgNPs loading ratio of 1:1.25. f TEM images of nanofibers with diameters of ~ 290, ~ 484, and ~ 933 nm. g Schematics of the functional groups on GO, chemical structure of polyurethane, and negative surface charges of AgNPs. GO nanosheets can be hydrogen-bonded to the PU matrix by the functional moieties of the carboxyl and hydroxyl groups. h Optical transmittance-sheet resistance of the networked nanofibers for different types of nanofibers: rGO-coated PU, PU/rGO, PU/AgNPs, and PU/rGO/AgNPs nanofibers with that of copper nanowires, PEDOT: PSS/Zonyl/DMSO and graphene. i Stress–strain curves of PU/rGO and PU/rGO/AgNPs nanofibers. Evaluation of STNNEs under stretching conditions. j Resistance change (ΔR/R0 ) versus elongation of the PU/rGO and PU/rGO/AgNPs nanofiber electrodes on PDMS substrates. k Resistance change (ΔR/R0) versus low strain under tensile and compressive bending of STNNEs
Reproduced with permission from [153 ] Copyright 2019 Royal Society of Chemistry
![]()
Ruan et al. [158 ] reported an increase in the thermal conductivity of polystyrene (PS) as a result of the co-electrospinning of PS with thermally reduced graphene oxide (TRG). More specifically, the addition of 15 wt% TRG could increase the thermally conductive coefficient (λ) value of pure PS from 0.226 to 0.689 W/mK, the thermal diffusivity ($a$) value from 0.2157 to 0.6545 mm2 /s, the glass transition temperature ($T_{\text{g}}$) value from 90.3 to 95.0 °C and the heat-resistance index ($T_{\text{HRI}}$) value from 184.2 to 194.3 °C. Gozutok et al. [184 ] observed that adding rGO to PVA improved the thermal stability as shown in Fig. 4 c. Abdali and Ajji [163 ] reported that the thermal stability of PANI improved in the presence of graphene as shown in Fig. 4 d, e.
Fig. 4 Differential scanning calorimetry (DSC) (a ) and thermogravimetric analysis (TGA) (b ) curves of pure PS matrix and the TRG/PS nanocomposites. Reproduced with permission from [158 ] Copyright 2018 Elsevier. c TGA curves of electrospun PVA mats mixed with GO. Reproduced with permission from [184 ] Copyright 2019 American Scientific Publishers. d TGA curves of rGO, rGO and AM-rGO. e TGA curves of electrospun PMMA/PANI/AM-rGO, PMMA/PANI/rGO and PMMA/PANI nanofibers. As shown in e , the thermal degradation temperature of PMMA/PANI/AM-rGO nanofibers increased to ~ 441 °C, a magnitude higher than that of the PMMA/PANI samples at ~ 348 °C. Both d , e are reproduced with permission from [163 ] Copyright 2017 MDPI
![]()
Gebrekrstos et al. [161 ] reported that the addition of fluoro-doped graphene derivatives (GO, GOF and GOOCH) during electrospinning of polyvinylidene fluoride (PVDF) offered remarkable properties including enhanced electroactive β phase, high energy density and improved piezoelectric coefficient. This drastic enhancement can be ascribed to the increase in the amount of β in PVDF/GO fibers and the charge separation induced by the fluorine which acts as a polarization center. Additional file 1 : Figure S2a, b show the piezoelectric response using PFM. Additional file 1 : Figure S2c, d show that, adding GO and GOF provided significantly enhanced dielectric constant of PVDF composites due to the fluorine groups that could trap and accumulate large electrons at the interface. Additional file 1 : Figure S2e depicts the P-E loops for PVDF and GO, GOF and GOOCH.
Electrochemical biosensors based electrospun GNMs nanocomposites Biosensors are analytical devices capable of transferring the response of bio-tests into current signals which comprises two parts, biological detection part and the transduction part. The former is the main part of biosensors which compose of biosensing element (e.g. aptamer, enzyme) that provides selective identification of the bio-tests and converts this detection into processable (current) signals by redox reaction. The latter serves as a platform to transforms the resulting signal from the biomolecule (bioreceptor)-analytes interactions as a current signal to a receiving system for further measurement and quantification. Recently, incorporating GNMs into ESNF to create electrochemical sensors is gaining a wide consideration from researchers mainly because ES GNMs provide a remarkably improved sensitivity and low detection limit caused by their electrochemical probable space, low charge conformation, well-demarcated redox crests, electrocatalytic properties and electron transfer kinetics [197 ]. Additionally, GNMs possess other excellent characteristics such as high surface area, low-cost, and mass electron transfer ability [155 ]. In terms of GNMs NFs biosensors, ESNFs serves as the upholder to GNMs as well as the bioreceptors because they possess no reactive ability and thus, do not involve in the detection and transduction parts. Meanwhile, the GNMs act as the detection and transduction parts due to their high adsorption and reactive and abilities for target analytes via chemical bonding or physical adsorption. 
Highly and uniformly dispersed and distributed GNMs into ESNFs improves the reactivity, speeds up the both adsorption or releases mechanisms and provides large number of GNMs active sites to act as immobilization matrices for bioreceptors (biorecognition elements) in electrochemical biosensors which enhances the electron transfer rate between the biomolecule and the transducer as well as help to preserve their bioactivity on the sensing electrodes [198 ]. Furthermore, the morphology of ES GNMs NF (porous, core–shell and hollow) contains channels and pores that allows a fluid (e.g. biochemical or chemical species, solvents, gas, etc.) to pass through with minimally reduced mass resistance thereby increasing the analyte diffusion toward the surface of the electrode and provide accurate and ultrasensitive detections [199 , 200 ]. Table 4 shows the summary of ES GNMs and polymer NF composites for sensing applications.Table 4 Summary of studies on ES GNMs and polymer NF composites for biosensing and sensing applications
GNMs/Polymer Spinning parameters Limit of detection Biomolecule Target References rGO/PVP/Chi (12 cm; 22 kV; 0.5 mL/h) 0.15 pmol L−1 Laccase enzyme EE2 [162 ] GQDs/PVP (10 cm; 20 kV; 0.5 mL/h) 12 µM – Glucose [203 ] GO/PAN heat treatment > CNT/RGO 15 cm; 15 kV; 1.6 mL/h – Electrochemical detection of l -cysteine [172 ] PAN/GO 15 cm; 10 kV; 0.5 mL/h
DMF, sonication and stirring
0.25 for lidocaine and 0.5 for prilocaine
2.5 for 2,6-xylidine
1.25 for o-toluidine
Extraction of lidocaine and prilocaine [204 ]
Electrospinning is a facile and convenient technique to fabricate nanofiber-based biosensors from a wide range of macroporous and mesoporous materials [132 ]. Electrospinning endows the polymer nanofibers with predictable and controlled pore geometries, desired diameter and thickness, conformations and chemical functionalities, which benefit the fabrication of novel nanostructured materials with biosensing capabilities [199 ]. Moreover, the opportunity to modify and functionalize ES NFs on a large scale allows this technique to meet a vast range of sensing requirements better than other methods, mainly due to the high surface area, high porosity, control of the chemical compositions and the direct electrospinning on a conductive electrode [201 ]. ESNFs can be functionalized by incorporating GNMs during electrospinning or after electrospinning onto the surface of the as-prepared nanofibers to enhance the essential properties for fabricating electrochemical biosensors (electrical conductivity, electrochemical properties, electron transfer, catalytic reactions). Due to their high specific surface area and high porosity, ESNFs provide immobilization sites and thus can bind to biorecognition elements through EDC/NHS chemistry, enabling a biorecognition–analyte interface and enhancing the current response for the test biomolecules.
Zhang et al. [202 ] reported a facile fabrication of a highly sensitive, efficient, stable, and reproducible electrochemical biosensor for H2 O2 detection by electrospinning PVA with GQDs onto glass carbon electrode (GCE) (Fig. 5 ). GQDs were added into 0.5 g PVA followed by ultrasonication for 2 h and incubation for 10 h. The final concentration of GQDs was 10–50 mM and the obtained homogeneous solution was used for electrospinning PVA/GQDs nanofibrous membrane. The electrospinning parameters were set to 15 kV applied voltage, 12 cm receiving distance, and 0.3–0.5 mL/h flow rate. The ES GQDs electrochemical biosensor showed a linear detection range of 0.1–200 mM and a detection limit of 0.53 μM. It was found that, GQDs can replace the traditional semiconductor QDs and preserve the electrochemical properties of carbon materials.Fig. 5 a Schematic presentation of electrospinning for producing PVA/GQD onto GCE for electrochemical biosensing and catalyzing of H2 O2 , b the possible detection mechanism, c Zeta potentials of GQDs, PVA, and PVA/GQD nanofibrous membranes at varied pH, d CVs of GCEs modified with PVA and PVA/GQD nanofibrous membranes, sensitivity of the biosensor at different potentials (inset), e CVs of the PVA/GQD nanofibrous membranes modified GCE 0.1 M PBS with different addition of H2 O2
(Reproduced with permission from [202 ], Copyright 2015 Royal Society of Chemistry)
![]()
Pavinatto et al. [162 ] proposed a novel ultrasensitive and highly selective electrochemical biosensor based on polyvinylpyrrolidone/chitosan/reduced graphene oxide ES NFs for 17α-Ethinylestradiol (EE2) detection. The spinnable solution was prepared by dispersing 4% w/v of PVP in ethanol and 1.2% w/v chitosan in acetic acid/water (9:1 w/v). Both solutions were mixed and stirred overnight at room temperature before adding 0.035% w/v of rGO which was dissolved in ethanol. The spinning parameters were 22 kV applied voltage, 12 cm receiving distance, and 0.5 mL/h feed rate. The nanofiber composite was deposited on FTO electrodes attached to a metallic collector with a deposition time of 2.5 h. Upon the characterization of the fibers and prior to immobilizing Laccase enzyme, the fabricated PVP/Chi/rGO ESNFs were treated with glutaraldehyde solution and subsequent crosslinking solution was applied to the nanofiber composite to activate the amine (–NH2 ) and the hydroxyl (–OH) groups from chitosan and graphene sheets, respectively. Covalent bonding was utilized to immobilize the Laccase enzyme to the nanofiber composite through NH2 groups of Laccase enzyme and the activated groups from the nanofiber composite. The PVP/Chi/rGO/Laccase electrode was used to detect EE2. It was revealed that, the integration of Chi and PVP with rGO increased the charge transfer leading to the excellent electrochemical biosensing properties. Figure 6 a reveals the formation of the electrochemical biosensor in terms of coating Laccase enzyme into the FTO/PVP/Chi/rGO nanofiber composite and the CV, electrochemical impedance spectroscopy (EIS) and amperometry measurements are shown in Fig. 6 b–d respectively. 
Recently, Nathani and Sharma [129 ] demonstrated the use of electrospun mesoporous poly (Styrene-Block-methylmethacrylate) nanofibers (ES PS-b-PMMA NF) to enhance the analytical performance of electrochemical biosensor by exploiting the effect of porosity and surface area on the sensing ability of electrospun nanofibers. EDC-NHS chemistry was chosen to biofunctionalize the PS-b-PMMA NFs and the redox response was utilized to study the presence of the carboxyl group. The fabricated electrochemical porous biosensor showed an increase of the sensitivity by 2.7-fold, a detection range of 10 fM–10 nM and a detection limit of 0.37 fM along with good selectivity. Figure 6 e shows the voltammetry results of the developed electrochemical biosensor.
Fig. 6 a Schematic representation of the fabrication of EE2 electrochemical biosensor. b Cyclic voltammetry measurements using a PBS buffer solution (pH 7.4) and scan rate of 100 mV s−1 for FTO, PVP/Chi/rGO ES NFs and PVP/Chi/rGO ES NFs coated with Laccase enzyme. c Nyquist plots of EIS for (a) FTO, (b) PVP nanofibers, (c) PVP/Chi nanofibers, (d) PVP/Chi/rGO nanofibers and (e) PVP/Chi/rGO nanofibers coated with Laccase in a 5 mmol $\mathrm{L}^{-1}$ $[\mathrm{Fe(CN)}_{6}]^{3-/4-}$ solution with 0.1 mol L−1 KCl. d Amperometric response upon successive additions of EE2 ethanol solution recorded at PVP/Chi/rGO_Laccase coated electrode in a phosphate buffer solution pH 7.0 in concentrations ranging from 0.25 to 20 pmol L−1 at a fixed potential of − 0.3 V. The inset shows the calibration curve with the respective linear fit. a –d reproduced with permission from [162 ] Copyright 2018 Elsevier. e Schematic of cyclic voltammetry showing the electrochemical behaviour of BSA/BH/PNF/GCE in the presence of [Fe(CN)6 ]3−/4− at different scan rates (20–160 mV/s). It can be seen that the increase in the peak-to-peak voltage difference is also an indication of the progressive immobilization, and the anodic peak shifts towards the higher potential value whereas the cathodic peaks shift towards lower potential value with the increase in the scan rate
Reproduced with permission from [129 ] Copyright 2019 Wiley
![]()
Future outlook Electrospinning has become one of the most vital techniques to fabricate functional nanofiber composites with the desired structure and compositions. However, several challenges hinder the transition of the electrospinning method from laboratory scale to industrial scale production, such as spinneret configuration, rheology, solution concentration, electric field intensity and distribution, humidity and temperature, flowrate, receiving distance and collector geometry. These parameters could also influence the reproducibility of ESNFs over time and in different locations. On the other hand, the integration of GNMs and polymer nanofibers using electrospinning has proved to be an excellent strategy to fabricate efficient sensing materials, taking the dual advantages of the wonderful functional properties of GNMs and electrospun polymeric nanofibers. However, to attain high-performance electrochemical biosensors, some challenges should be circumvented, such as increasing the GNMs content without agglomeration or aggregation and increasing the number of immobilization sites for bio-test molecules. Additionally, optimizing the synergistic effects between graphene and other nanomaterials as well as improving the electrocatalytic efficiency for electrochemical sensors are mandatory. There are two appropriate routes for the modification and fabrication of GNMs and polymer nanofibers for biosensor design via electrospinning: pre- and post-processing methods. The former involves mixing the polymers with GNMs before electrospinning, which is a universal and efficient method to fabricate ES GNMs nanostructures for biosensors with enhanced stability, physical and chemical properties, reusability, and long-term storage stability. The latter involves coating or decorating the GNMs onto the surface of as-prepared nanofibers for immediate interface with biomolecules, which in turn leads to the enhanced performance of electrochemical biosensors.
The pre-processing methods show more advantages for biosensing performance; however, they require some harsh conditions like violent stirring, in situ growth of GNMs and/or the use of complicated devices such as coaxial electrospinning. Additional challenges of pre-processing methods include the dispersion, alignment and the appropriate loading of GNMs within the polymer matrices. Furthermore, more studies are required to control the synergistic effect of GNMs and their interactions with the polymer matrices during the electrospinning process to ensure uniformity and dispersity of GNMs. The post-processing methods typically have higher efficiency in utilizing GNMs directly for biosensing applications due to the possibility of decorating a large surface area of as-prepared nanofibers with GNMs, thus maximizing the potential interface between GNMs and biomolecules to facilitate ultrasensitive detection of bio-tested analytes. The major challenge of post-processing methods lies in their ability to establish accurate interactions between the GNMs and the polymer nanofibers, because GNMs cannot easily be integrated with the as-prepared nanofibers. Therefore, more studies are required to optimize the coating or to develop novel coating strategies of GNMs onto electrospun nanofibers to increase the interfacial bonding between GNMs and electrospun nanofibers. Recently, [205 ] reported a facile strategy to realize a strong connection between multi-walled carbon nanotubes (MWCNTs) and poly (vinylidene fluoride-co -hexafluoropropylene) ESNFs via thermal-induced welding. Ren et al. [206 ] reported an effective strategy to improve the structural integrity between CVD graphene and polyacrylonitrile (PAN) ESNFs via an annealing process to fabricate a transparent sensor with enhanced conductivity, mechanical strength, sensitivity, stability and a low detection limit.
This review elucidated the recent achievements in the electrospun design of functional nanostructures for biosensing applications by exploiting the remarkable properties of GNMs using pre-processing and post-processing methods. It can be concluded that the appropriate modification of GNMs with surface functional groups (e.g. reduction of GO to rGO and/or adding additives) improves their dispersion within the polymer matrices, thereby enhancing the electrical conductivity, thermal stability, electrochemical and mechanical properties of the electrospun nanostructured composites. Additionally, the modification of electrospun nanofibers as well as optimizing the electrospinning design to fabricate porous, core–shell and hollow nanostructures increases the surface area and therefore the number of immobilization sites for biomolecules. This overview highlighted the recent progress on graphene fabrication materials, the remarkable role of GNMs in constructing next-generation electro-sensing devices and the importance of electrospinning designs of nanostructured composites towards bridging the laboratory set-up to the industry.
Supplementary information Additional file 1: Figure S1. Cyclic voltammetry curves of (A) polyaniline (PANI)–coated nanofibers (NP3) and (B) PANI/graphene–coated nanofibers (NP3G2). Schematic representation of the charge/discharge curves of (C) polyaniline (PANI)–coated nanofibers (NP3) and (D) PANI/graphene–coated nanofibers (NP3G2) Ref. [31]. Figure S2. Piezoelectric force microscopy (PFM) amplitude vs dc voltage (voltage varying from −12 to 12 V) hysteresis loops for (a) PVDF/GOCOOH and (b) PVDF/GOF. Frequency dependence of (c) dielectric constant and (d) dielectric loss for PVDF/GO, PVDF/GOCOOH, and PVDF/GOF composites. (e) Schematic representing the P–E loops for PVDF/GO, PVDF/GOCOOH, and PVDF/GOF at room temperature. (a–e) are obtained from Ref. [24]. Copyright ACS.
Abbreviations CV Cyclic voltammetry
CVD Chemical vapor deposition
CNTs Carbon nanotubes
DMF Dimethylformamide
EIS Electrochemical impedance spectroscopy
ESNFs Electrospun nanofibers
GNMs Graphene-based nanomaterials
GO Graphene oxide
rGO Reduced graphene oxide
GQD Graphene quantum dots
PAN Polyacrylonitrile
PANI Polyaniline
PS Polystyrene
PU Polyurethane
PVA Poly (vinyl alcohol)
PVDF Polyvinylidene fluoride
PVP Poly (vinyl pyrrolidone)
SEM Scanning electron microscope
Publisher's Note
Springer Nature remains neutral with regard to jurisdictional claims in published maps and institutional affiliations.
Supplementary information Supplementary information accompanies this paper at 10.1186/s40580-020-00237-4.
Acknowledgements The authors gratefully acknowledge UTP for the research facilities provided to conduct experiments.
Authors’ contributions AMAD, SCBG, and MSMS wrote the manuscript. All authors reviewed the final manuscript. All authors read and approved the final manuscript.
Funding This work is financially supported by University Research Internal Fund (URIF) with Grant No.: 015LB0-020.
Availability of data and materials All data and material will be made available upon request.
Competing interests The authors declare that they have no known competing financial interests that could have appeared to influence the work reported in this paper.
References 1. Krishnan SK Singh E Singh P Meyyappan M Nalwa HS A review on graphene-based nanocomposites for electrochemical and fluorescent biosensors RSC Adv. 2019 9 8778 8781 10.1039/c8ra09577a 2. Wongkaew N Simsek M Griesche C Baeumner AJ Functional nanomaterials and nanostructures enhancing electrochemical biosensors and lab-on-a-chip performances: recent progress, applications, and future perspective Chem. Rev. 2019 119 120 194 10.1021/acs.chemrev.8b00172 30247026 3. Asif MH Razaq A Akbar N Danielsson B Sultana I Facile synthesis of multisegment Au/Ni/Au nanowire for high performance electrochemical glucose sensor Mater. Res. Express. 2019 6 95028 4. Asmatulu R Veisi Z Uddin MN Mahapatro A Highly sensitive and reliable electrospun polyaniline nanofiber based biosensor as a robust platform for COX-2 enzyme detections Fibers Polym. 2019 20 966 974 10.1007/s12221-019-1096-x 5. Niu Y Xie H Luo G Weng W Ruan C Li G Sun W Electrochemical performance of myoglobin based on TiO2 -doped carbon nanofiber decorated electrode and its applications in biosensing RSC Adv. 2019 9 4480 4487 10.1039/c8ra07910b 6. Patel R Zaveri P Mukherjee A Agarwal PK More P Munshi NS Development of fluorescent protein-based biosensing strains: a new tool for the detection of aromatic hydrocarbon pollutants in the environment Ecotoxicol. Environ. Saf. 2019 182 109450 10.1016/j.ecoenv.2019.109450 31349104 7. Dhawane M Deshpande A Jain R Dandekar P Colorimetric point-of-care detection of cholesterol using chitosan nanofibers Sensors Actuators, B Chem. 2019 281 72 79 10.1016/j.snb.2018.10.060 8. Hassan F Gentry-weeks C Reynolds M Li YV Study on microstructure and mechanical properties of polydiacetylene composite biosensors J. Appl. Polym. Sci. 2019 47877 1 14 10.1002/app.47877 9. 
Mengarda P Dias FAL Peixoto JVC Osiecki R Bergamini MF Marcolino LH Determination of lactate levels in biological fluids using a disposable ion-selective potentiometric sensor based on polypyrrole films Sensors Actuators B Chem. 2019 126 663 10. Dantism S Röhlen D Selmer T Wagner T Wagner P Schöning MJ Quantitative differential monitoring of the metabolic activity of Corynebacterium glutamicum cultures utilizing a light-addressable potentiometric sensor system Biosens. Bioelectron. 2019 139 111332 31132723 11. Vizzini P Braidot M Vidic J Manzano M Electrochemical and optical biosensors for the detection of campylobacter and listeria: an update look Micromachines. 2019 10 500 12. Banbury C Rickard JJS Mahajan S GoldbergOppenheimer P Tuneable metamaterial-like platforms for surface-enhanced raman scattering via three-dimensional block co-polymer-based nanoarchitectures ACS Appl. Mater. Interfaces. 2019 11 14437 14444 30880378 13. Jiang Y Sun D-W Pu H Wei Q Ultrasensitive analysis of kanamycin residue in milk by SERS-based aptasensor Talanta 2019 197 151 158 30771917 14. Rezaei Z Mahmoudifard M Pivotal role of electrospun nanofibers in microfluidic diagnostic systems—a review J. Mater. Chem. B. 2019 7 4602 4619 10.1039/c9tb00682f 31364667 15. Zhang J Zhang F Song J Liu L Si Y Yu J Ding B Electrospun flexible nanofibrous membranes for oil/water separation J. Mater. Chem. A. 2019 7 20075 20102 10.1039/c9ta07296a 16. Tite T Chiticaru EA Burns JS Ioniţă M Impact of nano-morphology, lattice defects and conductivity on the performance of graphene based electrochemical biosensors J. Nanobiotechnol. 2019 17 1 22 10.1186/s12951-019-0535-6 17. Park G Kim S Chae S Han H Le TH Yang KS Chang M Kim H Yoon H Combining SWNT and graphene in polymer nanofibers: a route to unique carbon precursors for electrochemical capacitor electrodes Langmuir 2019 35 3077 3086 10.1021/acs.langmuir.8b03766 30703325 18. 
Correa E Moncada ME Gutiérrez OD Vargas CA Zapata VH Characterization of polycaprolactone/rGO nanocomposite scaffolds obtained by electrospinning Mater. Sci. Eng. C 2019 103 109773 10.1016/j.msec.2019.109773 19. SuganyaBharathi B Stalin T Cerium oxide and peppermint oil loaded polyethylene oxide/graphene oxide electrospun nanofibrous mats as antibacterial wound dressings Mater. Today Commun. 2019 21 100664 10.1016/j.mtcomm.2019.100664 20. Prajapati DG Kandasubramanian B Progress in the development of intrinsically conducting polymer composites as biosensors Macromol. Chem. Phys. 2019 220 1 26 10.1002/macp.201800561 21. Avossa J Paolesse R Di Natale C Zampetti E Bertoni G De Cesare F Scarascia-Mugnozza G Macagnano A Electrospinning of polystyrene/polyhydroxybutyrate nanofibers doped with porphyrin and graphene for chemiresistor gas sensors Nanomaterials. 2019 9 280 10.3390/nano9020280 22. Lee J-H Park S Choi J-W Electrical property of graphene and its application to electrochemical biosensing Nanomaterials. 2019 9 297 10.3390/nano9020297 23. Sengupta J Hussain CM Graphene and its derivatives for analytical lab on chip platforms TrAC 2019 114 326 337 10.1016/j.trac.2019.03.015 24. Muniandy S Teh SJ Thong KL Thiha A Dinshaw IJ Lai CW Ibrahim F Leo BF Carbon nanomaterial-based electrochemical biosensors for foodborne bacterial detection Crit. Rev. Anal. Chem. 2019 10.1080/10408347.2018.1561243 25. Song Y Luo Y Zhu C Li H Du D Lin Y Recent advances in electrochemical biosensors based on graphene two-dimensional nanomaterials Biosens. Bioelectron. 2016 76 195 212 10.1016/j.bios.2015.07.002 26187396 26. Liang A Jiang X Hong X Jiang Y Shao Z Zhu D Recent developments concerning the dispersion methods and mechanisms of graphene Coatings. 2018 8 33 10.3390/coatings8010033 27. 
Guo Y Yang X Ruan K Kong J Dong M Zhang J Gu J Guo Z Reduced graphene oxide heterostructured silver nanoparticles significantly enhanced thermal conductivities in hot-pressed electrospun polyimide nanocomposites ACS Appl. Mater. Interfaces. 2019 11 25465 25473 10.1021/acsami.9b10161 31268646 28. Mercante LA Scagion VP Migliorini FL Mattoso LHC Correa DS Electrospinning-based (bio)sensors for food and agricultural applications: a review TrAC 2017 91 91 103 10.1016/j.trac.2017.04.004 29. Zhang M Li Y Su Z Wei G Recent advances in the synthesis and applications of graphene–polymer nanocomposites Polym. Chem. 2015 6 6107 6124 30. Abdel-Mottaleb MM Khalil A Karim S Osman TA Khattab A High performance of PAN/GO-ZnO composite nanofibers for photocatalytic degradation under visible irradiation J. Mech. Behav. Biomed. Mater. 2019 96 118 124 31035062 31. Lee JKY Chen N Peng S Li L Tian L Thakor N Ramakrishna S Polymer-based composites by electrospinning: preparation and functionalization with nanocarbons Prog. Polym. Sci. 2018 86 40 84 32. DemiroğluMustafov S Mohanty AK Misra M Seydibeyoğlu MÖ Fabrication of conductive Lignin/PAN carbon nanofiber with enhanced graphene for the modified electrode Carbon N. Y. 2019 147 262 275 10.1016/j.carbon.2019.02.058 33. Darzi ME Golestaneh SI Kamali M Karimi G Thermal and electrical performance analysis of co-electrospun-electrosprayed PCM nanofiber composites in the presence of graphene and carbon fiber powder Renew. Energy. 2019 135 719 728 10.1016/j.renene.2018.12.028 34. Jiang S Chen Y Duan G Mei C Greiner A Agarwal S Electrospun nanofiber reinforced composites: a review Polym. Chem. 2018 9 2685 2720 10.1039/c8py00378e 35. Lu X Li M Wang H Wang C Advanced electrospun nanomaterials for highly efficient electrocatalysis Inorg. Chem. Front. 2019 10.1039/c9qi00799g 36. Liu Q Chen Z Pei X Guo C Teng K Hu Y Xu Z Qian X Review: applications, effects and the prospects for electrospun nanofibrous mats in membrane separation J. Mater. Sci. 
2019 10.1007/s10853-019-04012-7 37. Reneker DH Yarin AL Electrospinning jets and polymer nanofibers Polymer (Guildf). 2008 49 2387 2425 38. Ray SS Chen S-S Li C-W Nguyen NC Nguyen HT A comprehensive review: electrospinning technique for fabrication and surface modification of membranes for water treatment application RSC Adv. 2016 6 85495 85514 10.1039/C6RA14952A 39. Haider A Haider S Kang I-K A comprehensive review summarizing the effect of electrospinning parameters and potential applications of nanofibers in biomedical and biotechnology Arab. J. Chem. 2018 11 1165 1188 40. Xue J Wu T Dai Y Xia Y States U Electrospinning and electrospun nano fibers: methods, materials, and applications Chem. Rev. 2019 10.1021/acs.chemrev.8b00593 41. Wang C Wang J Zeng L Qiao Z Liu X Liu H Zhang J Ding J Fabrication of electrospun polymer nanofibers with diverse morphologies Molecules 2019 10.3390/molecules24050834 42. Wang L Yang G Peng S Wang J Yan W Ramakrishna S One-dimensional nanomaterials toward electrochemical sodium-ion storage applications via electrospinning Energy Storage Mater. 2019 10.1016/j.ensm.2019.09.036 43. Sun Y Cheng S Lu W Wang Y Zhang P Yao Q Electrospun fibers and their application in drug controlled release, biological dressings, tissue repair, and enzyme immobilization RSC Adv. 2019 9 25712 25729 10.1039/C9RA05012D 44. Huang Y Song J Yang C Long Y Wu H Scalable manufacturing and applications of nanofibers Mater. Today 2019 28 98 113 10.1016/j.mattod.2019.04.018 45. Mishra RK Mishra P Verma K Mondal A Chaudhary RG Abolhasani MM Loganathan S Electrospinning production of nanofibrous membranes Springer Int. Publ. 2018 10.1007/s10311-018-00838-w 46. Kumar S Chatterjee K Comprehensive review on the use of graphene-based substrates for regenerative medicine and biomedical devices ACS Appl. Mater. Interfaces. 2016 8 26431 26457 10.1021/acsami.6b09801 27662057 47. 
Kong Y-C Wu P-R Dong J-W Ding H-L Liu Z Cheng Z-L Oxalic acid assisted expansion–reduction exfoliation of graphene oxide into graphene nanosheets Mater. Lett. 2018 231 51 55 48. Che Y Zhang G Zhang Y Cao X Cao M Yu Y Dai H Yao J Solution-processed graphene phototransistor functionalized with P3HT/graphene bulk heterojunction Opt. Commun. 2018 425 161 165 49. Braga SF Coluci VR Legoas SB Giro R Galvão DS Baughman RH Structure and dynamics of carbon nanoscrolls Nano Lett. 2004 4 881 884 50. Novoselov KS Geim AK Morozov SV Jiang D Zhang Y Dubonos SV Grigorieva IV Firsov AA Electric field effect in atomically thin carbon films Science (80−) 2004 306 666 669 51. Bolotin KI Sikes KJ Jiang Z Klima M Fudenberg G Hone J Kim P Stormer HL Ultrahigh electron mobility in suspended graphene Solid State Commun. 2008 146 351 355 52. Novoselov KS Geim AK Morozov S Jiang D Katsnelson M Grigorieva I Dubonos S Firsov AA Two-dimensional gas of massless Dirac fermions in graphene Nature. 2005 438 197 16281030 53. Balandin AA Ghosh S Bao W Calizo I Teweldebrhan D Miao F Lau CN Superior thermal conductivity of single-layer graphene Nano Lett. 2008 8 902 907 18284217 54. Lee C Wei X Kysar JW Hone J Measurement of the elastic properties and intrinsic strength of monolayer graphene Science (80−) 2008 321 385 388 55. Zhu Y Murali S Cai W Li X Suk JW Potts JR Ruoff RS Graphene and graphene oxide: synthesis, properties, and applications Adv. Mater. 2010 22 3906 3924 10.1002/adma.201001068 20706983 56. Zhu X Liu Y Li P Nie Z Li J Applications of graphene and its derivatives in intracellular biosensing and bioimaging Analyst. 2016 141 4541 4553 10.1039/c6an01090c 27373227 57. Dasari Shareena TP McShan D Dasmahapatra AK Tchounwou PB A review on graphene-based nanomaterials in biomedical applications and risks in environment and health Nano-Micro Lett. 2018 10 1 34 10.1007/s40820-018-0206-4 58. 
Kumar S Bukkitgar SD Singhratibha S Singh V Reddy KR Shetti NP Venkata Reddy C Sadhu V Naveen S Electrochemical sensors and biosensors based on graphene functionalized with metal oxide nanostructures for healthcare applications ChemistrySelect. 2019 4 5322 5337 10.1002/slct.201803871 59. Papageorgiou DG Kinloch IA Young RJ Mechanical properties of graphene and graphene-based nanocomposites Prog. Mater Sci. 2017 90 75 127 10.1016/j.pmatsci.2017.07.004 60. Yadav SK Cho JW Functionalized graphene nanoplatelets for enhanced mechanical and thermal properties of polyurethane nanocomposites Appl. Surf. Sci. 2013 266 360 367 61. Tong W Zhang Y Yu L Luan X An Q Zhang Q Lv F Chu PK Shen B Zhang Z Novel method for the fabrication of flexible film with oriented arrays of graphene in poly (vinylidene fluoride-co-hexafluoropropylene) with low dielectric loss J. Phys. Chem. C 2014 118 10567 10573 62. Yi M Shen Z A review on mechanical exfoliation for the scalable production of graphene J. Mater. Chem. A. 2015 3 11700 11715 63. Edwards RS Coleman KS Graphene synthesis: relationship to applications Nanoscale. 2013 5 38 51 23160190 64. Javed K Oolo M Savest N Krumme A A review on graphene-based electrospun conductive nanofibers, supercapacitors, anodes, and cathodes for lithium-ion batteries Crit. Rev. Solid State Mater. Sci. 2018 10.1080/10408436.2018.1492367 65. Garain S Jana S Sinha TK Mandal D Design of in situ poled Ce3+ -doped electrospun PVDF/graphene composite nanofibers for fabrication of nanopressure sensor and ultrasensitive acoustic nanogenerator ACS Appl. Mater. Interfaces. 2016 8 4532 4540 10.1021/acsami.5b11356 26829464 66. Priyadarsini S Mohanty S Mukherjee S Basu S Mishra M Graphene and graphene oxide as nanomaterials for medicine and biology application J. Nanostruct. Chem. 2018 8 123 137 10.1007/s40097-018-0265-6 67. Khalil I Rahmati S MuhdJulkapli N Yehye WA Graphene metal nanocomposites—recent progress in electrochemical biosensing applications J. Ind. Eng. 
Chem. 2018 59 425 439 10.1016/j.jiec.2017.11.001 68. Lawal AT Graphene-based nano composites and their applications. A review Biosens. Bioelectron. 2019 141 111384 10.1016/j.bios.2019.111384 31195196 69. Bobrinetskiy II Knezevic NZ Graphene-based biosensors for on-site detection of contaminants in food Anal. Methods 2018 10 5061 5070 10.1039/c8ay01913d 70. Chauhan N Maekawa T Kumar DNS Graphene based biosensors—Accelerating medical diagnostics to new-dimensions J. Mater. Res. 2017 32 2860 2882 10.1557/jmr.2017.91 71. Terse-Thakoor T Badhulika S Mulchandani A Graphene based biosensors for healthcare J. Mater. Res. 2017 32 2905 2929 10.1557/jmr.2017.175 72. Suvarnaphaet P Pechprasarn S Graphene-based materials for biosensors: a review Sensors 2017 10.3390/s17102161 73. Peña-Bahamonde J Nguyen HN Fanourakis SK Rodrigues DF Recent advances in graphene-based biosensor technology with applications in life sciences J. Nanobiotechnol. 2018 16 1 17 10.1186/s12951-018-0400-z 74. Lawal AT Progress in utilisation of graphene for electrochemical biosensors Biosens. Bioelectron. 2018 106 149 178 10.1016/j.bios.2018.01.030 29414083 75. Gnana Kumar G Amala G Gowtham SM Recent advancements, key challenges and solutions in non-enzymatic electrochemical glucose sensors based on graphene platforms RSC Adv. 2017 7 36949 36976 10.1039/c7ra02845h 76. Cinti S Arduini F Graphene-based screen-printed electrochemical (bio) sensors and their applications: efforts and criticisms Biosens. Bioelectron. 2017 89 107 122 27522348 77. Zhang C Zhang Z Yang Q Chen W Graphene-based electrochemical glucose sensors: fabrication and sensing properties Electroanalysis 2018 30 2504 2524 10.1002/elan.201800522 78. Song H Zhang X Liu Y Su Z Developing graphene-based nanohybrids for electrochemical sensing Chem. Rec. 2019 19 534 549 10.1002/tcr.201800084 30182467 79. Bo X Zhou M Guo L Electrochemical sensors and biosensors based on less aggregated graphene Biosens. Bioelectron. 
2017 89 167 186 10.1016/j.bios.2016.05.002 27161575 80. Justino CIL Gomes AR Freitas AC Duarte AC Rocha-Santos TAP Graphene based sensors and biosensors TrAC 2017 91 53 66 10.1016/j.trac.2017.04.003 81. Tung TT Nine MJ Krebsz M Pasinszki T Coghlan CJ Tran DNH Losic D Recent advances in sensing applications of graphene assemblies and their composites Adv. Funct. Mater. 2017 27 1 57 10.1002/adfm.201702891 82. Nag A Mitra A Mukhopadhyay SC Graphene and its sensor-based applications: a review Sensors Actuat. A Phys. 2018 270 177 194 10.1016/j.sna.2017.12.028 83. Wang L Wu A Wei G Graphene-based aptasensors: from molecule-interface interactions to sensor design and biomedical diagnostics Analyst. 2018 143 1526 1543 10.1039/c8an00081f 29528071 84. Singh DP Herrera CE Singh B Singh S Singh RK Kumar R Graphene oxide: an efficient material and recent approach for biotechnological and biomedical applications Mater. Sci. Eng. C 2018 86 173 197 10.1016/j.msec.2018.01.004 85. Taniselass S Arshad MKM Gopinath SCB Graphene-based electrochemical biosensors for monitoring noncommunicable disease biomarkers Biosens. Bioelectron. 2019 130 276 292 10.1016/j.bios.2019.01.047 30771717 86. Wang W Su H Wu Y Zhou T Li T Review-biosensing and biomedical applications of graphene: a review of current progress and future prospect J. Electrochem. Soc. 2019 166 B505 B520 10.1149/2.1231906jes 87. Nie C Ma L Li S Fan X Yang Y Cheng C Zhao W Zhao C Recent progresses in graphene based bio-functional nanostructures for advanced biological and cellular interfaces Nano Today. 2019 26 57 97 10.1016/j.nantod.2019.03.003 88. Reina G González-Domínguez JM Criado A Vázquez E Bianco A Prato M Promises, facts and challenges for graphene in biomedical applications Chem. Soc. Rev. 2017 46 4400 4416 10.1039/c7cs00363c 28722038 89. Park CS Yoon H Kwon OS Graphene-based nanoelectronic biosensors J. Ind. Eng. Chem. 2016 38 13 22 90. 
Haar S Ciesielski A Clough J Yang H Mazzaro R Richard F Conti S Merstorf N Cecchini M Morandi V A supramolecular strategy to leverage the liquid-phase exfoliation of graphene in the presence of surfactants: unraveling the role of the length of fatty acids Small 2015 11 1691 1702 25504589 91. Choi H Ahn K Lee Y Noh S Yoon H Free-standing, multilayered graphene/polyaniline-glue/graphene nanostructures for flexible, solid-state electrochemical capacitor application Adv. Mater. Interfaces. 2015 2 1500117 92. Xu Y Bai H Lu G Li C Shi G Flexible graphene films via the filtration of water-soluble noncovalent functionalized graphene sheets J. Am. Chem. Soc. 2008 130 5856 5857 18399634 93. Chua CK Pumera M Chemical reduction of graphene oxide: a synthetic chemistry viewpoint Chem. Soc. Rev. 2014 43 291 312 24121318 94. Novoselov KS Fal VI Colombo L Gellert PR Schwab MG Kim K A roadmap for graphene Nature 2012 490 192 23060189 95. Vasilescu A Hayat A Gáspár S Marty JL Advantages of carbon nanomaterials in electrochemical aptasensors for food analysis Electroanalysis 2018 30 2 19 10.1002/elan.201700578 96. Pumera M Graphene-based nanomaterials and their electrochemistry Chem. Soc. Rev. 2010 39 4146 4157 20623061 97. Ambrosi A Chua CK Bonanni A Pumera M Electrochemistry of graphene and related materials Chem. Rev. 2014 114 7150 7188 24895834 98. Pumera M Electrochemistry of graphene: new horizons for sensing and energy storage Chem. Rec. 2009 9 211 223 19739147 99. Pumera M Electrochemistry of graphene, graphene oxide and other graphenoids Electrochem. Commun. 2013 36 14 18 100. Kaplan A Yuan Z Benck JD Govind Rajan A Chu XS Wang QH Strano MS Current and future directions in electron transfer chemistry of graphene Chem. Soc. Rev. 2017 46 4530 4571 10.1039/c7cs00181a 28621376 101. Wang X Gao D Li M Li H Li C Wu X Yang B CVD graphene as an electrochemical sensing platform for simultaneous detection of biomolecules Sci. Rep. 2017 7 1 9 10.1038/s41598-017-07646-2 28127051 102. 
Zhang T Liu J Wang C Leng X Xiao Y Fu L Synthesis of graphene and related two-dimensional materials for bioelectronics devices Biosens. Bioelectron. 2017 89 28 42 27396820 103. Sprinkle M Ruan M Hu Y Hankinson J Rubio-Roy M Zhang B Wu X Berger C De Heer WA Scalable templated growth of graphene nanoribbons on SiC Nat. Nanotechnol. 2010 5 727 20890273 104. Lee DH Kim JE Han TH Hwang JW Jeon S Choi S Hong SH Lee WJ Ruoff RS Kim SO Versatile carbon hybrid films composed of vertical carbon nanotubes grown on mechanically compliant graphene films Adv. Mater. 2010 22 1247 1252 20437513 105. Li X Cai W An J Kim S Nah J Yang D Piner R Velamakanni A Jung I Tutuc E Large-area synthesis of high-quality and uniform graphene films on copper foils Science (80−) 2009 324 1312 1314 106. Zhang YI Zhang L Zhou C Review of chemical vapor deposition of graphene and related applications Acc. Chem. Res. 2013 46 2329 2339 23480816 107. Pasternak I Wesolowski M Jozwik I Lukosius M Lupina G Dabrowski P Baranowski JM Strupinski W Graphene growth on Ge (100)/Si (100) substrates by CVD method Sci. Rep. 2016 6 21773 26899732 108. Dabrowski J Lippert G Avila J Baringhaus J Colambo I Dedkov YS Herziger F Lupina G Maultzsch J Schaffus T Understanding the growth mechanism of graphene on Ge/Si (001) surfaces Sci. Rep. 2016 6 31639 27531322 109. Baraton L He ZB Lee CS Cojocaru CS Châtelet M Maurice J-L Lee YH Pribat D On the mechanisms of precipitation of graphene on nickel thin films EPL. 2011 96 46003 110. Losurdo M Giangregorio MM Capezzuto P Bruno G Graphene CVD growth on copper and nickel: role of hydrogen in kinetics and structure Phys. Chem. Chem. Phys. 2011 13 20836 20843 22006173 111. Zhang J Wang Z Niu T Wang S Li Z Chen W Elementary process for CVD graphene on Cu (110): size-selective carbon clusters Sci. Rep. 2014 4 4431 24651211 112. Niu T Zhou M Zhang J Feng Y Chen W Growth intermediates for CVD graphene on Cu (111): carbon clusters and defective graphene J. Am. Chem. Soc. 
2013 135 8409 8414 23675983 113. Liu M Gao Y Zhang Y Zhang Y Ma D Ji Q Gao T Chen Y Liu Z Single and polycrystalline graphene on Rh (111) following different growth mechanisms Small 2013 9 1360 1366 23436758 114. Kordatos A Kelaidis N Giamini SA Marquez-Velasco J Xenogiannopoulou E Tsipas P Kordas G Dimoulas A AB stacked few layer graphene growth by chemical vapor deposition on single crystal Rh (1 1 1) and electronic structure characterization Appl. Surf. Sci. 2016 369 251 256 115. Yang S Lohe MR Müllen K Feng X New-generation graphene from electrochemical approaches: production and applications Adv. Mater. 2016 28 6213 6221 26836313 116. Li F Peng H Xia D Yang J Yang K Yin F Yuan W Highly sensitive, selective, and flexible NO2 chemiresistors based on multilevel structured three-dimensional reduced graphene oxide fiber scaffold modified with aminoanthroquinone moieties and Ag nanoparticles ACS Appl. Mater. Interfaces. 2019 11 9309 9316 10.1021/acsami.8b20462 30758937 117. Kim J Cote LJ Kim F Yuan W Shull KR Huang J Graphene oxide sheets at interfaces J. Am. Chem. Soc. 2010 132 8180 8186 20527938 118. Cheng C Li S Thomas A Kotov NA Haag R Functional graphene nanomaterials based architectures: biointeractions, fabrications, and emerging biological applications Chem. Rev. 2017 117 1826 1914 10.1021/acs.chemrev.6b00520 28075573 119. Bacon M Bradley SJ Nann T Graphene quantum dots Part. Part. Syst. Charact. 2014 31 415 428 120. Suzuki N Wang Y Elvati P Qu Z-B Kim K Jiang S Baumeister E Lee J Yeom B Bahng JH Chiral graphene quantum dots ACS Nano 2016 10 1744 1755 26743467 121. Chong Y Ma Y Shen H Tu X Zhou X Xu J Dai J Fan S Zhang Z The in vitro and in vivo toxicity of graphene quantum dots Biomaterials 2014 35 5041 5048 24685264 122. Ye Q Guo L Wu D Yang B Tao Y Deng L Kong Y Covalent functionalization of bovine serum albumin with graphene quantum dots for stereospecific molecular recognition Anal. Chem. 2019 91 11864 11871 10.1021/acs.analchem.9b02605 31415149 123. 
Cleeton C Keirouz A Chen X Radacsi N Electrospun nanofibers for drug delivery and biosensing ACS Biomater. Sci. Eng. 2019 5 4183 4205 10.1021/acsbiomaterials.9b00853 124. Ding Y Li W Zhang F Liu Z ZanjanizadehEzazi N Liu D Santos HA Electrospun fibrous architectures for drug delivery, tissue engineering and cancer therapy Adv. Funct. Mater. 2019 29 1 35 10.1002/adfm.201802852 125. Jian S Zhu J Jiang S Chen S Fang H Song Y Duan G Zhang Y Hou H Nanofibers with diameter below one nanometer from electrospinning† RSC Adv. 2018 8 4794 4802 10.1039/c7ra13444d 126. Kenry CT Lim, nanofiber technology: current status and emerging developments Prog. Polym. Sci. 2017 70 1 17 10.1016/j.progpolymsci.2017.03.002 127. Aliheidari N Aliahmad N Agarwal M Dalir H Electrospun nanofibers for label-free sensor applications Sensors. 2019 19 3587 10.3390/s19163587 128. Aydogdu A Sumnu G Sahin S A novel electrospun hydroxypropyl methylcellulose/polyethylene oxide blend nanofibers: morphology and physicochemical properties Carbohydr. Polym. 2018 181 234 246 10.1016/j.carbpol.2017.10.071 29253968 129. Nathani A Sharma CS Electrospun mesoporous poly(styrene-block-methylmethacrylate) nanofibers as biosensing platform: effect of fibers porosity on sensitivity Electroanalysis. 2019 1 8 10.1002/elan.201800796 130. Silvestri D Mikšíček J Wacławek S Torres-Mendieta R Padil VVT Černík M Production of electrospun nanofibers based on graphene oxide/gum Arabic Int. J. Biol. Macromol. 2019 124 396 402 10.1016/j.ijbiomac.2018.11.243 30500492 131. Zhao R Lu X Wang C Electrospinning based all-nano composite materials: recent achievements and perspectives Compos. Commun. 2018 10 140 150 10.1016/j.coco.2018.09.005 132. Zhang M Zhao X Zhang G Wei G Su Z Electrospinning design of functional nanostructures for biosensor applications J. Mater. Chem. B. 2017 5 1699 1711 10.1039/c6tb03121h 32263911 133. 
Salavagione HJ Gómez-Fatou MA Shuttleworth PS Ellis GJ New perspectives on graphene/polymer fibers and fabrics for smart textiles: the relevance of the polymer/graphene interphase Front. Mater. 2018 5 1 6 10.3389/fmats.2018.00018 134. Wu Q Xu Y Yao Z Liu A Shi G Supercapacitors based on flexible graphene/polyaniline nanofiber composite films ACS Nano 2010 4 1963 1970 20355733 135. Kumar NA Choi H-J Shin YR Chang DW Dai L Baek J-B Polyaniline-grafted reduced graphene oxide for efficient electrochemical supercapacitors ACS Nano 2012 6 1715 1723 22276770 136. Si Y Samulski ET Synthesis of water soluble graphene Nano Lett. 2008 8 1679 1682 18498200 137. Wu Q Sun Y Bai H Shi G High-performance supercapacitor electrodes based on graphene hydrogels modified with 2-aminoanthraquinone moieties Phys. Chem. Chem. Phys. 2011 13 11193 11198 21562653 138. Cai D Song M Recent advance in functionalized graphene/polymer nanocomposites J. Mater. Chem. 2010 20 7906 7915 139. Ramanathan T Abdala AA Stankovich S Dikin DA Herrera-Alonso M Piner RD Adamson DH Schniepp HC Chen X Ruoff RS Functionalized graphene sheets for polymer nanocomposites Nat. Nanotechnol. 2008 3 327 18654541 140. Li H Shi W Zeng X Huang S Zhang H Qin X Improved desalination properties of hydrophobic GO-incorporated PVDF electrospun nanofibrous composites for vacuum membrane distillation Sep. Purif. Technol. 2020 230 115889 10.1016/j.seppur.2019.115889 141. Du F Sun L Huang Z Chen Z Xu Z Ruan G Zhao C Electrospun reduced graphene oxide/TiO2/poly(acrylonitrile-co-maleic acid) composite nanofibers for efficient adsorption and photocatalytic removal of malachite green and leucomalachite green Chemosphere 2020 239 124764 10.1016/j.chemosphere.2019.124764 31527004 142. Gu X Li Y Cao R Liu S Fu C Feng S Yang C Cheng W Wang Y Novel electrospun poly(lactic acid)/poly(butylene carbonate)/graphene oxide nanofiber membranes for antibacterial applications AIP Adv. 2019 10.1063/1.5100109 143. 
Parandeh S Kharaziha M Karimzadeh F An eco-friendly triboelectric hybrid nanogenerators based on graphene oxide incorporated polycaprolactone fibers and cellulose paper Nano Energy. 2019 59 412 421 10.1016/j.nanoen.2019.02.058 144. Stone H Lin S Mequanint K Preparation and characterization of electrospun rGO-poly(ester amide) conductive scaffolds Mater. Sci. Eng. C 2019 98 324 332 10.1016/j.msec.2018.12.122 145. Bahrami S Solouk A Mirzadeh H Seifalian AM Electroconductive polyurethane/graphene nanocomposite for biomedical applications Compos. Part B Eng. 2019 168 421 431 10.1016/j.compositesb.2019.03.044 146. Ghaderi G Tavanai H Bazarganipour M Electrospun graphene oxide incorporated PAN nanofibers, before and after activation Mater. Res. Express. 2019 6 105047 147. Maccaferri E Mazzocchetti L Benelli T Zucchelli A Giorgini L Morphology, thermal, mechanical properties and ageing of nylon 6,6/graphene nanofibers as Nano2 materials Compos. Part B Eng. 2019 166 120 129 10.1016/j.compositesb.2018.11.096 148. Samani DA Doostmohammadi A Nilforoushan MR Nazari H Electrospun polycaprolactone/graphene/baghdadite composite nanofibres with improved mechanical and biological properties Fibers Polym. 2019 20 982 990 10.1007/s12221-019-1161-5 149. Pan S Qi Z Li Q Ma Y Fu C Zheng S Kong W Liu Q Yang X Graphene oxide-PLGA hybrid nanofibres for the local delivery of IGF-1 and BDNF in spinal cord repair Artif. Cells Nanomed. Biotechnol. 2019 47 651 664 10.1080/21691401.2019.1575843 30829545 150. Banitaba SN Semnani D Heydari-Soureshjani E Rezaei B Ensafi AA Nanofibrous poly(ethylene oxide)-based structures incorporated with multi-walled carbon nanotube and graphene oxide as all-solid-state electrolytes for lithium ion batteries Polym. Int. 2019 10.1002/pi.5889 151. Jahan Biglari M Semnani Rahbar R Shabanian M Khonakdar HA Novel composite nanofibers based on polyamide 66/graphene oxide- grafted aliphatic- aromatic polyamide: preparation and characterization Polym. Plast. Technol. 
Eng. 2018 10.1080/03602559.2018.1542712 152. Ren J Woo YC Yao M Lim S Tijing LD Shon HK Nanoscale zero-valent iron (nZVI) immobilization onto graphene oxide (GO)-incorporated electrospun polyvinylidene fluoride (PVDF) nanofiber membrane for groundwater remediation via gravity-driven membrane filtration Sci. Total Environ. 2019 688 787 796 10.1016/j.scitotenv.2019.05.393 31255817 153. Choi YI Hwang BU Meeseepong M Hanif A Ramasundaram S Trung TQ Lee NE Stretchable and transparent nanofiber-networked electrodes based on nanocomposites of polyurethane/reduced graphene oxide/silver nanoparticles with high dispersion and fused junctions Nanoscale. 2019 11 4015 4024 10.1039/c8nr10170a 30768112 154. Weng R Sun L Jiang L Li N Ruan G Li J Du F Electrospun graphene oxide-doped nanofiber-based solid phase extraction followed by high-performance liquid chromatography for the determination of tetracycline antibiotic residues in food samples Food Anal. Methods 2019 12 1594 1603 10.1007/s12161-019-01495-7 155. Huang CL Wu HH Jeng YC Liang WZ Electrospun graphene nanosheet-filled poly(trimethylene terephthalate) composite fibers: Effects of the graphene nanosheet content on morphologies, electrical conductivity, crystallization behavior, and mechanical properties Polymers 2019 10.3390/polym11010164 156. Aboamera NM Mohamed A Salama A Osman TA Khattab A An effective removal of organic dyes using surface functionalized cellulose acetate/graphene oxide composite nanofibers Cellulose 2018 25 4155 4166 10.1007/s10570-018-1870-8 157. Davoodi AH Mazinani S Sharif F Ranaei-Siadat SO GO nanosheets localization by morphological study on PLA-GO electrospun nanocomposite nanofibers J. Polym. Res. 2018 25 16 19 10.1007/s10965-018-1589-0 158. Ruan K Guo Y Tang Y Zhang Y Zhang J He M Kong J Gu J Improved thermal conductivities in polystyrene nanocomposites by incorporating thermal reduced graphene oxide via electrospinning-hot press technique Compos. Commun. 
2018 10 68 72 10.1016/j.coco.2018.07.003 159. Ruiz V Pérez-Marquez A Maudes J Grande HJ Murillo N Enhanced photostability and sensing performance of graphene quantum dots encapsulated in electrospun polyacrylonitrile nanofibrous filtering membranes Sensors Actuators B Chem. 2018 262 902 912 10.1016/j.snb.2018.02.081 160. Javed K Krumme A Viirsalu M Krasnou I Plamus T Vassiljeva V Tarasova E Savest N Mere A Mikli V Danilson M Kaljuvee T Lange S Yuan Q Topham PD Chen CM A method for producing conductive graphene biopolymer nanofibrous fabrics by exploitation of an ionic liquid dispersant in electrospinning Carbon N. Y. 2018 140 148 156 10.1016/j.carbon.2018.08.034 161. Gebrekrstos A Madras G Bose S Piezoelectric response in electrospun poly(vinylidene fluoride) fibers containing fluoro-doped graphene derivatives ACS Omega. 2018 3 5317 5326 10.1021/acsomega.8b00237 31458741 162. Pavinatto A Mercante LA Facure MHM Pena RB Sanfelice RC Mattoso LHC Correa DS Ultrasensitive biosensor based on polyvinylpyrrolidone/chitosan/reduced graphene oxide electrospun nanofibers for 17α – Ethinylestradiol electrochemical detection Appl. Surf. Sci. 2018 458 431 437 10.1016/j.apsusc.2018.07.035 163. Abdali H Ajji A Preparation of electrospun nanocomposite nanofibers of polyaniline/poly(methyl methacrylate) with amino-functionalized graphene Polymers (Basel). 2017 9 453 10.3390/polym9090453 164. Wang X Gao Y Li X Xu Y Jiang J Hou J Li Q Turng LS Selective localization of graphene oxide in electrospun polylactic acid/poly(ε-caprolactone) blended nanofibers Polym. Test. 2017 59 396 403 10.1016/j.polymertesting.2017.02.022 165. Abolhasani MM Shirvanimoghaddam K Naebe M PVDF/graphene composite nanofibers with enhanced piezoelectric performance for development of robust nanogenerators Compos. Sci. Technol. 2017 138 49 56 10.1016/j.compscitech.2016.11.017 166. 
Heidari M Bahrami H Ranjbar-Mohammadi M Fabrication, optimization and characterization of electrospun poly(caprolactone)/gelatin/graphene nanofibrous mats Mater. Sci. Eng. C 2017 78 218 229 10.1016/j.msec.2017.04.095 167. Abbasipour M Khajavi R Yousefi AA Yazdanshenas ME Razaghian F The piezoelectric response of electrospun PVDF nanofibers with graphene oxide, graphene, and halloysite nanofillers: a comparative study J. Mater. Sci.: Mater. Electron. 2017 28 15942 15952 10.1007/s10854-017-7491-4 168. Sahatiya P Badhulika S One-step in situ synthesis of single aligned graphene-ZnO nanofiber for UV sensing RSC Adv. 2015 5 82481 82487 10.1039/c5ra15351d 169. He J Zhou M Wang L Zhao S Wang Q Ding B Cui S Electrospinning in situ synthesis of graphene-doped porous copper indium disulfide/carbon composite nanofibers for highly efficient counter electrode in dye-sensitized solar cells Electrochim. Acta 2016 215 626 636 170. Huang Y-L Baji A Tien H-W Yang Y-K Yang S-Y Ma C-CM Liu H-Y Mai Y-W Wang N-H Self-assembly of graphene onto electrospun polyamide 66 nanofibers as transparent conductive thin films Nanotechnology. 2011 22 475603 22056343 171. Jose MV Steinert BW Thomas V Dean DR Abdalla MA Price G Janowski GM Morphology and mechanical properties of Nylon 6/MWNT nanofibers Polymer (Guildf). 2007 48 1096 1104 172. Jasso-Ramos LE Ojeda-Hernández A Guerrero-Bermea C Garcia-Gómez NA Manriquez J Sepulveda-Guzmán S Cruz-Silva R Simultaneous intercalated assembly of mesostructured hybrid carbon nanofiber/reduced graphene oxide and its use in electrochemical sensing Nanotechnology. 2019 10.1088/1361-6528/aae879 173. Shan C Wang Y Xie S Guan H Argueta M Yue Y Free-standing nitrogen-doped graphene-carbon nanofiber composite mats: electrospinning synthesis and application as anode material for lithium-ion batteries J. Chem. Technol. Biotechnol. 2019 10.1002/jctb.6114 174. Wang X-Y Narita A Müllen K Precision synthesis versus bulk-scale fabrication of graphenes Nat. Rev. Chem. 
2017 2 1 10 10.1038/s41570-017-0100 175. Feng ZQ Wu F Jin L Wang T Dong W Zheng J Graphene nanofibrous foam designed as an efficient oil absorbent Ind. Eng. Chem. Res. 2019 58 3000 3008 10.1021/acs.iecr.8b05646 176. Soikkeli M Kurppa K Kainlauri M Arpiainen S Paananen A Gunnarsson D Joensuu JJ Laaksonen P Prunnila M Linder MB Graphene biosensor programming with genetically engineered fusion protein monolayers ACS Appl. Mater. Interfaces. 2016 8 8257 8264 26960769 177. Zhang P Huang Y Lu X Zhang S Li J Wei G Su Z One-step synthesis of large-scale graphene film doped with gold nanoparticles at liquid–air interface for electrochemistry and Raman detection applications Langmuir 2014 30 8980 8989 25015184 178. Beachley V Wen X Polymer nanofibrous structures: fabrication, biofunctionalization, and cell interactions Prog. Polym. Sci. 2010 35 868 892 20582161 179. Koh HS Yong T Chan CK Ramakrishna S Enhancement of neurite outgrowth using nano-structured scaffolds coupled with laminin Biomaterials 2008 29 3574 3582 18533251 180. Liu C Shi G Wang G Mishra P Jia S Jiang X Zhang P Dong Y Wang Z Preparation and electrochemical studies of electrospun phosphorus doped porous carbon nanofibers RSC Adv. 2019 9 6898 6906 10.1039/c8ra10193k 181. Reddy CS Murali G Reddy AS Park S In I GO incorporated SnO2 nanotubes as fast response sensors for ethanol vapor in different atmospheres J. Alloys Compd. 2019 813 152251 10.1016/j.jallcom.2019.152251 182. Tambakoozadeh N Youssefi M Semnani D A composite polyaniline/graphene–coated polyamide6 nanofiber mat for electrochemical applications Polym. Adv. Technol. 2019 10.1002/pat.4714 183. Zheng N Song Y Wang L Gao J Wang Y Dong X Improved electrical and mechanical properties for the reduced graphene oxide-decorated polymer nanofiber composite with a core-shell structure Ind. Eng. Chem. Res. 2019 58 15470 15478 10.1021/acs.iecr.9b01766 184. 
Gozutok M Sadhu V Sasmazel HT Development of poly(vinyl alcohol) (PVA)/reduced graphene oxide (rGO) electrospun mats J. Nanosci. Nanotechnol. 2019 19 4292 4298 10.1166/jnn.2019.16290 30765008 185. Nazari H Azadi S Hatamie S Zomorrod MS Ashtari K Soleimani M Hosseinzadeh S Fabrication of graphene-silver/polyurethane nanofibrous scaffolds for cardiac tissue engineering Polym. Adv. Technol. 2019 30 2086 2099 10.1002/pat.4641 186. Ionita M Pandele AM Crica L Pilan L Improving the thermal and mechanical properties of polysulfone by incorporation of graphene oxide Compos. Part B Eng. 2014 59 133 139 187. Rezaee R Nasseri S Mahvi AH Nabizadeh R Mousavi SA Rashidi A Jafari A Nazmara S Fabrication and characterization of a polysulfone-graphene oxide nanocomposite membrane for arsenate rejection from water J. Environ. Heal. Sci. Eng. 2015 13 61 188. Yoonessi M Shi Y Scheiman DA Lebron-Colon M Tigelaar DM Weiss RA Meador MA Graphene polyimide nanocomposites; thermal, mechanical, and high-temperature shape memory effects ACS Nano 2012 6 7644 7655 22931435 189. Tseng I Chang J Huang S Tsai M Enhanced thermal conductivity and dimensional stability of flexible polyimide nanocomposite film by addition of functionalized graphene oxide Polym. Int. 2013 62 827 835 190. Yoonessi M Gaier JR Highly conductive multifunctional graphene polycarbonate nanocomposites ACS Nano 2010 4 7211 7220 21082818 191. Gedler G Antunes M Realinho V Velasco JI Thermal stability of polycarbonate-graphene nanocomposite foams Polym. Degrad. Stab. 2012 97 1297 1304 192. Zheng D Tang G Zhang H-B Yu Z-Z Yavari F Koratkar N Lim S-H Lee M-W In situ thermal reduction of graphene oxide for high electrical conductivity and low percolation threshold in polyamide 6 nanocomposites Compos. Sci. Technol. 2012 72 284 289 193. Liu H Hou L Peng W Zhang Q Zhang X Fabrication and characterization of polyamide 6-functionalized graphene nanocomposite fiber J. Mater. Sci. 2012 47 8052 8060 194. 
Bian J Lin HL He FX Wang L Wei XW Chang I-T Sancaktar E Processing and assessment of high-performance poly (butylene terephthalate) nanocomposites reinforced with microwave exfoliated graphite oxide nanosheets Eur. Polym. J. 2013 49 1406 1423 195. Chen H Huang C Yu W Zhou C Effect of thermally reduced graphite oxide (TrGO) on the polymerization kinetics of poly (butylene terephthalate)(pCBT)/TrGO nanocomposites prepared by in situ ring-opening polymerization of cyclic butylene terephthalate Polymer (Guildf). 2013 54 1603 1611 196. Gorji M Sadeghianmaryan A Rajabinejad H Nasherolahkam S Chen X Development of highly pH-sensitive hybrid membranes by simultaneous electrospinning of amphiphilic nanofibers reinforced with graphene oxide J. Funct. Biomater. 2019 10.3390/jfb10020023 197. Nosrati H Mamoory RS Dabir F Le DQS Bünger CE Perez MC Rodriguez MA Effects of hydrothermal pressure on in situ synthesis of 3D graphene-hydroxyapatite nano structured powders Ceram. Int. 2019 45 1761 1769 198. Zhang C Zhao F He Y She Y Hong S Ma J Wang M Cao Z Li T EI-Aty AMA Ping J Ying Y Wang J A disposable electrochemical sensor based on electrospinning of molecularly imprinted nanohybrid films for highly sensitive determination of the organotin acaricide cyhexatin Microchim. Acta. 2019 10.1007/s00604-019-3631-2 199. Macagnano A Zampetti E Kny E Electrospinning for high performance sensors 2015 Berlin Springer 200. Agarwal S Greiner A Wendorff JH Functional materials by electrospinning of polymers Prog. Polym. Sci. 2013 38 963 991 10.1016/j.progpolymsci.2013.02.001 201. Zhang CL Yu SH Nanoparticles meet electrospinning: recent advances and future prospects Chem. Soc. Rev. 2014 43 4423 4448 10.1039/c3cs60426h 24695773 202. Zhang P Zhao X Ji Y Ouyang Z Wen X Li J Su Z Wei G Electrospinning graphene quantum dots into a nanofibrous membrane for dual-purpose fluorescent and electrochemical biosensors J. Mater. Chem. B. 2015 3 2487 2496 10.1039/c4tb02092h 32262123 203. 
Cabral TS Sgobbi LF Delezuk J Pessoa RS Lobo AO Rodrigues BVM Glucose sensing via a green and low-cost platform from electrospun poly (vinyl alcohol)/graphene quantum dots fibers Mater. Today Proc. 2019 14 694 699 10.1016/j.matpr.2019.02.008 204. Karimiyan H Uheida A Hadjmohammadi M Moein MM Abdel-Rehim M Polyacrylonitrile/graphene oxide nanofibers for packed sorbent microextraction of drugs and their metabolites from human plasma samples Talanta 2019 201 474 479 10.1016/j.talanta.2019.04.027 31122453 205. Li H Zhang W Ding Q Jin X Ke Q Li Z Wang D Huang C Facile strategy for fabrication of flexible, breathable, and washable piezoelectric sensors via welding of nanofibers with multiwalled carbon nanotubes (MWCNTs) ACS Appl. Mater. Interfaces. 2019 10.1021/acsami.9b10886 206. Ren H Zheng L Wang G Gao X Tan Z Shan J Cui L Li K Jian M Zhu L Zhang Y Peng H Wei D Liu Z Transfer-medium-free nanofiber-reinforced graphene film and applications in wearable transparent pressure sensors ACS Nano 2019 13 5541 5548 10.1021/acsnano.9b00395 31034773
\ No newline at end of file
diff --git a/s2orc-doc2json/tests/pdf/2020.acl-main.207.json b/s2orc-doc2json/tests/pdf/2020.acl-main.207.json
new file mode 100644
index 0000000000000000000000000000000000000000..e945a209d6ea7365db44953174e02616b2164192
--- /dev/null
+++ b/s2orc-doc2json/tests/pdf/2020.acl-main.207.json
@@ -0,0 +1,3723 @@
+{
+ "paper_id": "2020",
+ "header": {
+ "generated_with": "S2ORC 1.0.0",
+ "date_generated": "2021-02-12T10:05:38.729693Z"
+ },
+ "title": "SPECTER: Document-level Representation Learning using Citation-informed Transformers",
+ "authors": [
+ {
+ "first": "Arman",
+ "middle": [],
+ "last": "Cohan",
+ "suffix": "",
+ "affiliation": {
+ "laboratory": "",
+ "institution": "University of Washington",
+ "location": {}
+ },
+ "email": "armanc@allenai.org"
+ },
+ {
+ "first": "Sergey",
+ "middle": [],
+ "last": "Feldman",
+ "suffix": "",
+ "affiliation": {
+ "laboratory": "",
+ "institution": "University of Washington",
+ "location": {}
+ },
+ "email": "sergey@allenai.org"
+ },
+ {
+ "first": "Iz",
+ "middle": [],
+ "last": "Beltagy",
+ "suffix": "",
+ "affiliation": {
+ "laboratory": "",
+ "institution": "University of Washington",
+ "location": {}
+ },
+ "email": "beltagy@allenai.org"
+ },
+ {
+ "first": "Doug",
+ "middle": [],
+ "last": "Downey",
+ "suffix": "",
+ "affiliation": {
+ "laboratory": "",
+ "institution": "University of Washington",
+ "location": {}
+ },
+ "email": "dougd@allenai.org"
+ },
+ {
+ "first": "Daniel",
+ "middle": [
+ "S"
+ ],
+ "last": "Weld",
+ "suffix": "",
+ "affiliation": {
+ "laboratory": "",
+ "institution": "University of Washington",
+ "location": {}
+ },
+ "email": ""
+ }
+ ],
+ "year": "",
+ "abstract": "Representation learning is a critical ingredient for natural language processing systems. Recent Transformer language models like BERT learn powerful textual representations, but these models are targeted towards token-and sentence-level training objectives and do not leverage information on inter-document relatedness, which limits their document-level representation power. For applications on scientific documents, such as classification and recommendation, the embeddings power strong performance on end tasks. We propose SPECTER, a new method to generate document-level embedding of scientific documents based on pretraining a Transformer language model on a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, SPECTER can be easily applied to downstream applications without task-specific fine-tuning. Additionally, to encourage further research on document-level models, we introduce SCIDOCS, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. We show that SPECTER outperforms a variety of competitive baselines on the benchmark. 1",
+ "pdf_parse": {
+ "paper_id": "2020",
+ "_pdf_hash": "",
+ "abstract": [
+ {
+ "text": "Representation learning is a critical ingredient for natural language processing systems. Recent Transformer language models like BERT learn powerful textual representations, but these models are targeted towards token-and sentence-level training objectives and do not leverage information on inter-document relatedness, which limits their document-level representation power. For applications on scientific documents, such as classification and recommendation, the embeddings power strong performance on end tasks. We propose SPECTER, a new method to generate document-level embedding of scientific documents based on pretraining a Transformer language model on a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, SPECTER can be easily applied to downstream applications without task-specific fine-tuning. Additionally, to encourage further research on document-level models, we introduce SCIDOCS, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. We show that SPECTER outperforms a variety of competitive baselines on the benchmark. 1",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Abstract",
+ "sec_num": null
+ }
+ ],
+ "body_text": [
+ {
+ "text": "As the pace of scientific publication continues to increase, Natural Language Processing (NLP) tools that help users to search, discover and understand the scientific literature have become critical. In recent years, substantial improvements in NLP tools have been brought about by pretrained neural language models (LMs) (Radford et al., 2018; Devlin et al., 2019; . While such models are widely used for representing individual words or sentences, extensions to whole-document embeddings are relatively underexplored. Likewise, methods that do use inter-document signals to produce whole-document embeddings (Tu et al., 2017; ) have yet to incorporate stateof-the-art pretrained LMs. Here, we study how to leverage the power of pretrained language models to learn embeddings for scientific documents.",
+ "cite_spans": [
+ {
+ "start": 322,
+ "end": 344,
+ "text": "(Radford et al., 2018;",
+ "ref_id": "BIBREF38"
+ },
+ {
+ "start": 345,
+ "end": 365,
+ "text": "Devlin et al., 2019;",
+ "ref_id": "BIBREF11"
+ },
+ {
+ "start": 610,
+ "end": 627,
+ "text": "(Tu et al., 2017;",
+ "ref_id": "BIBREF46"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "A paper's title and abstract provide rich semantic content about the paper, but, as we show in this work, simply passing these textual fields to an \"off-the-shelf\" pretrained language model-even a state-of-the-art model tailored to scientific text like the recent SciBERT (Beltagy et al., 2019) -does not result in accurate paper representations. The language modeling objectives used to pretrain the model do not lead it to output representations that are helpful for document-level tasks such as topic classification or recommendation.",
+ "cite_spans": [
+ {
+ "start": 272,
+ "end": 294,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "In this paper, we introduce a new method for learning general-purpose vector representations of scientific documents. Our system, SPECTER, 2 incorporates inter-document context into the Transformer (Vaswani et al., 2017) language models (e.g., SciBERT (Beltagy et al., 2019) ) to learn document representations that are effective across a wide-variety of downstream tasks, without the need for any task-specific fine-tuning of the pretrained language model. We specifically use citations as a naturally occurring, inter-document incidental supervision signal indicating which documents are most related and formulate the signal into a triplet-loss pretraining objective. Unlike many prior works, at inference time, our model does not require any citation information. This is critical for embedding new papers that have not yet been cited. In experiments, we show that SPECTER's representations substantially outperform the state-SPECTER: Scientific Paper Embeddings using Citationinformed TransformERs of-the-art on a variety of document-level tasks, including topic classification, citation prediction, and recommendation.",
+ "cite_spans": [
+ {
+ "start": 198,
+ "end": 220,
+ "text": "(Vaswani et al., 2017)",
+ "ref_id": "BIBREF47"
+ },
+ {
+ "start": 252,
+ "end": 274,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "As an additional contribution of this work, we introduce and release SCIDOCS 3 , a novel collection of data sets and an evaluation suite for documentlevel embeddings in the scientific domain. SCI-DOCS covers seven tasks, and includes tens of thousands of examples of anonymized user signals of document relatedness. We also release our training set (hundreds of thousands of paper titles, abstracts and citations), along with our trained embedding model and its associated code base.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "Our goal is to learn task-independent representations of academic papers. Inspired by the recent success of pretrained Transformer language models across various NLP tasks, we use the Transformer model architecture as basis of encoding the input paper. Existing LMs such as BERT, however, are primarily based on masked language modeling objective, only considering intra-document context and do not use any inter-document information. This limits their ability to learn optimal document representations. To learn high-quality documentlevel representations we propose using citations as an inter-document relatedness signal and formulate it as a triplet loss learning objective. We then pretrain the model on a large corpus of citations using this objective, encouraging it to output representations that are more similar for papers that share a citation link than for those that do not. We call our model SPECTER, which learns Scientific Paper Embeddings using Citation-informed Trans-formERs. With respect to the terminology used by Devlin et al. (2019) , unlike most existing LMs that are \"fine-tuning based\", our approach results in embeddings that can be applied to downstream tasks in a \"feature-based\" fashion, meaning the learned paper embeddings can be easily used as features, with no need for further task-specific fine-tuning. In the following, as background information, we briefly describe how pretrained LMs can be applied for document representation and then discuss the details of SPECTER.",
+ "cite_spans": [
+ {
+ "start": 1034,
+ "end": 1054,
+ "text": "Devlin et al. (2019)",
+ "ref_id": "BIBREF11"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Model 2.1 Overview",
+ "sec_num": "2"
+ },
+ {
+ "text": "https://github.com/allenai/scidocs Transformer (initialized with SciBERT) Related paper (P + ) Query paper (P Q ) Unrelated paper (P \u2212 )",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Model 2.1 Overview",
+ "sec_num": "2"
+ },
+ {
+ "text": "Triplet loss =max d P Q , P + \u2212 d P Q , P \u2212 + m , 0",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Model 2.1 Overview",
+ "sec_num": "2"
+ },
+ {
+ "text": "Figure 1: Overview of SPECTER.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Model 2.1 Overview",
+ "sec_num": "2"
+ },
+ {
+ "text": "Recently, pretrained Transformer networks have demonstrated success on various NLP tasks (Radford et al., 2018; Devlin et al., 2019; Liu et al., 2019) ; we use these models as the foundation for SPECTER. Specifically, we use SciBERT (Beltagy et al., 2019) which is an adaptation of the original BERT (Devlin et al., 2019) architecture to the scientific domain. The BERT model architecture (Devlin et al., 2019) uses multiple layers of Transformers (Vaswani et al., 2017) to encode the tokens in a given input sequence. Each layer consists of a self-attention sublayer followed by a feedforward sublayer. The final hidden state associated with the special [CLS] token is usually called the \"pooled output\", and is commonly used as an aggregate representation of the sequence.",
+ "cite_spans": [
+ {
+ "start": 89,
+ "end": 111,
+ "text": "(Radford et al., 2018;",
+ "ref_id": "BIBREF38"
+ },
+ {
+ "start": 112,
+ "end": 132,
+ "text": "Devlin et al., 2019;",
+ "ref_id": "BIBREF11"
+ },
+ {
+ "start": 133,
+ "end": 150,
+ "text": "Liu et al., 2019)",
+ "ref_id": null
+ },
+ {
+ "start": 233,
+ "end": 255,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ },
+ {
+ "start": 300,
+ "end": 321,
+ "text": "(Devlin et al., 2019)",
+ "ref_id": "BIBREF11"
+ },
+ {
+ "start": 389,
+ "end": 410,
+ "text": "(Devlin et al., 2019)",
+ "ref_id": "BIBREF11"
+ },
+ {
+ "start": 448,
+ "end": 470,
+ "text": "(Vaswani et al., 2017)",
+ "ref_id": "BIBREF47"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Background: Pretrained Transformers",
+ "sec_num": "2.2"
+ },
+ {
+ "text": "Our goal is to represent a given paper P as a dense vector v that best represents the paper and can be used in downstream tasks. SPECTER builds embeddings from the title and abstract of a paper. Intuitively, we would expect these fields to be sufficient to produce accurate embeddings, since they are written to provide a succinct and comprehensive summary of the paper. 4 As such, we encode the concatenated title and abstract using a Transformer LM (e.g., SciBERT) and take the final representation of the [CLS] token as the output representation of the paper:",
+ "cite_spans": [
+ {
+ "start": 371,
+ "end": 372,
+ "text": "4",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Document Representation",
+ "sec_num": null
+ },
+ {
+ "text": "EQUATION",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [
+ {
+ "start": 0,
+ "end": 8,
+ "text": "EQUATION",
+ "ref_id": "EQREF",
+ "raw_str": "5 v = Transformer(input) [CLS] ,",
+ "eq_num": "(1)"
+ }
+ ],
+ "section": "Document Representation",
+ "sec_num": null
+ },
+ {
+ "text": "where Transformer is the Transformer's forward function, and input is the concatenation of the [CLS] token and WordPieces (Wu et al., 2016) of the title and abstract of a paper, separated by the [SEP] token. We use SciBERT as our model initialization as it is optimized for scientific text, though our formulation is general and any Transformer language model instead of SciBERT. Using the above method with an \"off-the-shelf\" SciBERT does not take global inter-document information into account. This is because SciBERT, like other pretrained language models, is trained via language modeling objectives, which only predict words or sentences given their in-document, nearby textual context. In contrast, we propose to incorporate citations into the model as a signal of inter-document relatedness, while still leveraging the model's existing strength in modeling language.",
+ "cite_spans": [
+ {
+ "start": 122,
+ "end": 139,
+ "text": "(Wu et al., 2016)",
+ "ref_id": "BIBREF52"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Document Representation",
+ "sec_num": null
+ },
+ {
+ "text": "A citation from one document to another suggests that the documents are related. To encode this relatedness signal into our representations, we design a loss function that trains the Transformer model to learn closer representations for papers when one cites the other, and more distant representations otherwise. The high-level overview of the model is shown in Figure 1 . In particular, each training instance is a triplet of papers: a query paper P Q , a positive paper P + and a negative paper P \u2212 . The positive paper is a paper that the query paper cites, and the negative paper is a paper that is not cited by the query paper (but that may be cited by P + ). We then train the model using the following triplet margin loss function:",
+ "cite_spans": [],
+ "ref_spans": [
+ {
+ "start": 363,
+ "end": 371,
+ "text": "Figure 1",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Citation-Based Pretraining Objective",
+ "sec_num": "2.3"
+ },
+ {
+ "text": "L = max d P Q , P + \u2212 d P Q , P \u2212 + m , 0 (2)",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Citation-Based Pretraining Objective",
+ "sec_num": "2.3"
+ },
+ {
+ "text": "where d is a distance function and m is the loss margin hyperparameter (we empirically choose m = 1). Here, we use the L2 norm distance:",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Citation-Based Pretraining Objective",
+ "sec_num": "2.3"
+ },
+ {
+ "text": "d(P A , P B ) = v A \u2212 v B 2 ,",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Citation-Based Pretraining Objective",
+ "sec_num": "2.3"
+ },
+ {
+ "text": "where v A is the vector corresponding to the pooled output of the Transformer run on paper A (Equation 1). 6 Starting from the trained SciBERT model, we pretrain the Transformer parameters on the citation objective to learn paper representations that capture document relatedness.",
+ "cite_spans": [
+ {
+ "start": 107,
+ "end": 108,
+ "text": "6",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Citation-Based Pretraining Objective",
+ "sec_num": "2.3"
+ },
+ {
+ "text": "The choice of negative example papers P \u2212 is important when training the model. We consider two sets of negative examples: the first set simply consists of randomly selected papers from the corpus.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Selecting Negative Distractors",
+ "sec_num": "2.4"
+ },
+ {
+ "text": "We also experimented with other distance functions (e..g, normalized cosine), but they underperformed the L2 loss.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Selecting Negative Distractors",
+ "sec_num": "2.4"
+ },
+ {
+ "text": "Given a query paper, intuitively we would expect the model to be able to distinguish between cited papers, and uncited papers sampled randomly from the entire corpus. This inductive bias has been also found to be effective in content-based citation recommendation applications . But, random negatives may be easy for the model to distinguish from the positives. To provide a more nuanced training signal, we augment the randomly drawn negatives with a more challenging second set of negative examples. We denote as \"hard negatives\" the papers that are not cited by the query paper, but are cited by a paper cited by the query paper, i.e. if P 1 cite \u2212 \u2212 \u2192 P 2 and P 2 cite \u2212 \u2212 \u2192 P 3",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Selecting Negative Distractors",
+ "sec_num": "2.4"
+ },
+ {
+ "text": "but P 1 cite \u2212 \u2212 \u2192 P 3 , then P 3 is a candidate hard negative example for P 1 . We expect the hard negatives to be somewhat related to the query paper, but typically less related than the cited papers. As we show in our experiments ( \u00a76), including hard negatives results in more accurate embeddings compared to using random negatives alone.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Selecting Negative Distractors",
+ "sec_num": "2.4"
+ },
+ {
+ "text": "At inference time, the model receives one paper, P, and it outputs the SPECTER's Transfomer pooled output activation as the paper representation for P (Equation 1). We note that for inference, SPECTER requires only the title and abstract of the given input paper; the model does not need any citation information about the input paper. This means that SPECTER can produce embeddings even for new papers that have yet to be cited, which is critical for applications that target recent scientific papers.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Inference",
+ "sec_num": "2.5"
+ },
+ {
+ "text": "Previous evaluations of scientific document representations in the literature tend to focus on small datasets over a limited set of tasks, and extremely high (99%+) AUC scores are already possible on these data for English documents . New, larger and more diverse benchmark datasets are necessary. Here, we introduce a new comprehensive evaluation framework to measure the effectiveness of scientific paper embeddings, which we call SCIDOCS. The framework consists of diverse tasks, ranging from citation prediction, to prediction of user activity, to document classification and paper recommendation. Note that SPECTER will not be further fine-tuned on any of the tasks; we simply plug in the embeddings as features for each task. Below, we describe each of the tasks in detail and the evaluation data associated with it. In addition to our training data, we release all the datasets associated with the evaluation tasks.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "SCIDOCS Evaluation Framework",
+ "sec_num": "3"
+ },
+ {
+ "text": "An important test of a document-level embedding is whether it is predictive of the class of the document. Here, we consider two classification tasks in the scientific domain: MeSH Classification In this task, the goals is to classify scientific papers according to their Medical Subject Headings (MeSH) (Lipscomb, 2000) . 7 We construct a dataset consisting of 23K academic medical papers, where each paper is assigned one of 11 top-level disease classes such as cardiovascular diseases, diabetes, digestive diseases derived from the MeSH vocabulary. The most populated category is Neoplasms (cancer) with 5.4K instances (23.3% of the total dataset) while the category with least number of samples is Hepatitis (1.7% of the total dataset). We follow the approach of Feldman et al. (2019) in mapping the MeSH vocabulary to the disease classes.",
+ "cite_spans": [
+ {
+ "start": 303,
+ "end": 319,
+ "text": "(Lipscomb, 2000)",
+ "ref_id": "BIBREF30"
+ },
+ {
+ "start": 322,
+ "end": 323,
+ "text": "7",
+ "ref_id": null
+ },
+ {
+ "start": 766,
+ "end": 787,
+ "text": "Feldman et al. (2019)",
+ "ref_id": "BIBREF13"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Document Classification",
+ "sec_num": "3.1"
+ },
+ {
+ "text": "Paper Topic Classification This task is predicting the topic associated with a paper using the predefined topic categories of the Microsoft Academic Graph (MAG) (Sinha et al., 2015) 8 . MAG provides a database of papers, each tagged with a list of topics. The topics are organized in a hierarchy of 5 levels, where level 1 is the most general and level 5 is the most specific. For our evaluation, we derive a document classification dataset from the level 1 topics, where a paper is labeled by its corresponding level 1 MAG topic. We construct a dataset of 25K papers, almost evenly split over the 19 different classes of level 1 categories in MAG.",
+ "cite_spans": [
+ {
+ "start": 161,
+ "end": 181,
+ "text": "(Sinha et al., 2015)",
+ "ref_id": "BIBREF45"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Document Classification",
+ "sec_num": "3.1"
+ },
+ {
+ "text": "As argued above, citations are a key signal of relatedness between papers. We test how well different paper representations can reproduce this signal through citation prediction tasks. In particular, we focus on two sub-tasks: predicting direct citations, and predicting co-citations. We frame these as ranking tasks and evaluate performance using MAP and nDCG, standard ranking metrics. Direct Citations In this task, the model is asked to predict which papers are cited by a given query paper from a given set of candidate papers. The evaluation dataset includes approximately 30K total papers from a held-out pool of papers, consisting of 1K query papers and a candidate set of up to 5 cited papers and 25 (randomly selected) uncited papers. The task is to rank the cited papers higher than the uncited papers. For each embedding method, we require only comparing the L2 distance between the raw embeddings of the query and the candidates, without any additional trainable parameters.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Citation Prediction",
+ "sec_num": "3.2"
+ },
+ {
+ "text": "Co-Citations This task is similar to the direct citations but instead of predicting a cited paper, the goal is to predict a highly co-cited paper with a given paper. Intuitively, if papers A and B are cited frequently together by several papers, this shows that the papers are likely highly related and a good paper representation model should be able to identify these papers from a given candidate set. The dataset consists of 30K total papers and is constructed similar to the direct citations task.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Citation Prediction",
+ "sec_num": "3.2"
+ },
+ {
+ "text": "The embeddings for similar papers should be close to each other; we use user activity as a proxy for identifying similar papers and test the model's ability to recover this information. Multiple users consuming the same items as one another is a classic relatedness signal and forms the foundation for recommender systems and other applications (Schafer et al., 2007) . In our case, we would expect that when users look for academic papers, the papers they view in a single browsing session tend to be related. Thus, accurate paper embeddings should, all else being equal, be relatively more similar for papers that are frequently viewed in the same session than for other papers. To build benchmark datasets to test embeddings on user activity, we obtained logs of user sessions from a major academic search engine. We define the following two tasks on which we build benchmark datasets to test embeddings:",
+ "cite_spans": [
+ {
+ "start": 345,
+ "end": 367,
+ "text": "(Schafer et al., 2007)",
+ "ref_id": "BIBREF42"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "User Activity",
+ "sec_num": "3.3"
+ },
+ {
+ "text": "Co-Views Our co-views dataset consists of approximately 30K papers. To construct it, we take 1K random papers that are not in our train or development set and associate with each one up to 5 frequently co-viewed papers and 25 randomly selected papers (similar to the approach for citations). Then, we require the embedding model to rank the co-viewed papers higher than the random papers by comparing the L2 distances of raw embeddings. We evaluate performance using standard ranking metrics, nDCG and MAP.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "User Activity",
+ "sec_num": "3.3"
+ },
+ {
+ "text": "Co-Reads If the user clicks to access the PDF of a paper from the paper description page, this is a potentially stronger sign of interest in the paper. In such a case we assume the user will read at least parts of the paper and refer to this as a \"read\" action. Accordingly, we define a \"co-reads\" task and dataset analogous to the co-views dataset described above. This dataset is also approximately 30K papers.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "User Activity",
+ "sec_num": "3.3"
+ },
+ {
+ "text": "In the recommendation task, we evaluate the ability of paper embeddings to boost performance in a production recommendation system. Our recommendation task aims to help users navigate the scientific literature by ranking a set of \"similar papers\" for a given paper. We use a dataset of user clickthrough data for this task which consists of 22K clickthrough events from a public scholarly search engine. We partitioned the examples temporally into train (20K examples), validation (1K), and test (1K) sets. As is typical in clickthrough data on ranked lists, the clicks are biased toward the top of original ranking presented to the user. To counteract this effect, we computed propensity scores using a swap experiment (Agarwal et al., 2019). The propensity scores give, for each position in the ranked list, the relative frequency that the position is over-represented in the data due to exposure bias. We can then compute de-biased evaluation metrics by dividing the score for each test example by the propensity score for the clicked position. We report propensity-adjusted versions of the standard ranking metrics Precision@1 (P @1) and Normalized Discounted Cumulative Gain (nDCG).",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Recommendation",
+ "sec_num": "3.4"
+ },
+ {
+ "text": "We test different embeddings on the recommendation task by including cosine embedding distance 9 as a feature within an existing recommendation system that includes several other informative features (title/author similarity, reference and citation overlap, etc.). Thus, the recommendation experiments measure whether the embeddings can boost the performance of a strong baseline system on an end task. For SPECTER, we also perform an online A/B test to measure whether its advantages on the offline dataset translate into improvements on the online recommendation task ( \u00a75).",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Recommendation",
+ "sec_num": "3.4"
+ },
+ {
+ "text": "Training Data To train our model, we use a subset of the Semantic Scholar corpus consisting of about 146K query papers (around 26.7M tokens) with their corresponding outgoing citations, and we use an additional 32K papers for validation. For each query paper we construct up to 5 training triples comprised of a query, a positive, and a negative paper. The positive papers are sampled from the direct citations of the query, while negative papers are chosen either randomly or from citations of citations (as discussed in \u00a72.4). We empirically found it helpful to use 2 hard negatives (citations of citations) and 3 easy negatives (randomly selected papers) for each query paper. This process results in about 684K training triples and 145K validation triples.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Experiments",
+ "sec_num": "4"
+ },
+ {
+ "text": "Training and Implementation We implement our model in AllenNLP . We initialize the model from SciBERT pretrained weights (Beltagy et al., 2019) since it is the state-of-the-art pretrained language model on scientific text. We continue training all model parameters on our training objective (Equation 2). We perform minimal tuning of our model's hyperparameters based on the performance on the validation set, while baselines are extensively tuned. Based on initial experiments, we use a margin m=1 for the triplet loss. For training, we use the Adam optimizer (Kingma and Ba, 2014) following the suggested hyperparameters in Devlin et al. (2019) (LR: 2e-5, Slanted Triangular LR scheduler 10 (Howard and Ruder, 2018) with number of train steps equal to training instances and cut fraction of 0.1). We train the model on a single Titan V GPU (12G memory) for 2 epochs, with batch size of 4 (the maximum that fit in our GPU memory) and use gradient accumulation for an effective batch size of 32. Each training epoch takes approximately 1-2 days to complete on the full dataset. We release our code and data to facilitate reproducibility. 11",
+ "cite_spans": [
+ {
+ "start": 121,
+ "end": 143,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Experiments",
+ "sec_num": "4"
+ },
+ {
+ "text": "Task-Specific Model Details For the classification tasks, we used a linear SVM where embedding vectors were the only features. The C hyperparameter was tuned via a held-out validation set.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Experiments",
+ "sec_num": "4"
+ },
+ {
+ "text": "For the recommendation tasks, we use a feedforward ranking neural network that takes as input ten features designed to capture the similarity between each query and candidate paper, including the cosine similarity between the query and candidate embeddings and manually-designed features computed from the papers' citations, titles, authors, and publication dates.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Experiments",
+ "sec_num": "4"
+ },
+ {
+ "text": "Baseline Methods Our work falls into the intersection of textual representation, citation mining, and graph learning, and we evaluate against stateof-the-art baselines from each of these areas. We compare with several strong textual models: SIF (Arora et al., 2017) , a method for learning document representations by removing the first principal component of aggregated word-level embeddings which we pretrain on scientific text; SciBERT (Beltagy et al., 2019) a state-of-the-art pretrained Transformer LM for scientific text; and Sent-BERT (Reimers and Gurevych, 2019) , a model that uses negative sampling to tune BERT for producing optimal sentence embeddings. We also compare with Citeomatic , a closely related paper representation model for citation prediction which trains content-based representations with citation graph information via dynamically sampled triplets, and SGC (Wu et al., 2019a) , a state-of-the-art graph-convolutional approach. For completeness, additional baselines are also included; due to space constraints we refer to Appendix A for detailed discussion of all baselines. We tune hyperparameters of baselines to maximize performance on a separate validation set. Table 1 presents the main results corresponding to our evaluation tasks (described in \u00a73). Overall, we observe substantial improvements across all tasks with average performance of 80.0 across all metrics on all tasks which is a 3.1 point absolute improvement over the next-best baseline. We now discuss the results in detail.",
+ "cite_spans": [
+ {
+ "start": 245,
+ "end": 265,
+ "text": "(Arora et al., 2017)",
+ "ref_id": "BIBREF2"
+ },
+ {
+ "start": 542,
+ "end": 570,
+ "text": "(Reimers and Gurevych, 2019)",
+ "ref_id": "BIBREF40"
+ },
+ {
+ "start": 885,
+ "end": 903,
+ "text": "(Wu et al., 2019a)",
+ "ref_id": "BIBREF50"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 1194,
+ "end": 1201,
+ "text": "Table 1",
+ "ref_id": "TABREF1"
+ }
+ ],
+ "eq_spans": [],
+ "section": "Experiments",
+ "sec_num": "4"
+ },
+ {
+ "text": "For document classification, we report macro F1, a standard classification metric. We observe that the classifier performance when trained on our representations is better than when trained on any other baseline. Particularly, on the MeSH (MAG) dataset, we obtain an 86.4 (82.0) F1 score which is about a \u2206= + 2.3 (+1.5) point absolute increase over the best baseline on each dataset respectively. Our evaluation of the learned representations on predicting user activity is shown in the \"User activity\" columns of Table 1 . SPECTER achieves a MAP score of 83.8 on the co-view task, and 84.5 on co-read, improving over the best baseline (Citeomatic in this case) by 2.7 and 4.0 points, respectively. We observe similar trends for the \"citation\" and \"co-citation\" tasks, with our model outperforming virtually all other baselines except for SGC, which has access to the citation graph at training and test time. 12 Note that methods like SGC cannot be used in real-world settings to embed new papers that are not cited yet. On the other hand, on co-citation data our method is able to achieve the best results with nDCG of 94.8, improving over SGC with 2.3 points. Citeomatic also performs well on the citation tasks, as expected given that its primary design goal was citation prediction. Nevertheless, our method slightly outperforms Citeomatic on the direct citation task, while substantially outperforming it on co-citations (+2.0 nDCG). Finally, for the recommendation task, we observe that SPECTER outperforms all other models on this task as well, with nDCG of 53.9. On the recommendations task, as opposed to previous experiments, the differences in method scores are generally smaller. This is because for this task the embeddings are used along with several other informative features in the ranking model (described under task-specific models in \u00a74), meaning that embedding variants have less opportunity for impact on overall performance.",
+ "cite_spans": [],
+ "ref_spans": [
+ {
+ "start": 515,
+ "end": 522,
+ "text": "Table 1",
+ "ref_id": "TABREF1"
+ }
+ ],
+ "eq_spans": [],
+ "section": "Results",
+ "sec_num": "5"
+ },
+ {
+ "text": "We also performed an online study to evaluate whether SPECTER embeddings offer similar advantages in a live application. We performed an online A/B test comparing our SPECTER-based recommender to an existing production recommender system for similar papers that ranks papers by a textual similarity measure. In a dataset of 4,113 clicks, we found that SPECTER ranker improved clickthrough rate over the baseline by 46.5%, demonstrating its superiority.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Results",
+ "sec_num": "5"
+ },
+ {
+ "text": "We emphasize that our citation-based pretraining objective is critical for the performance of SPECTER; removing this and using a vanilla SciBERT results in decreased performance on all tasks.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Results",
+ "sec_num": "5"
+ },
+ {
+ "text": "For SGC, we remove development and test set citations and co-citations during training. We also remove incoming citations from development and test set queries as these would not be available at test time in production. ",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Results",
+ "sec_num": "5"
+ },
+ {
+ "text": "In this section, we analyze several design decisions in SPECTER, provide a visualization of its embedding space, and experimentally compare SPECTER's use of fixed embeddings against a finetuning approach.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Analysis",
+ "sec_num": "6"
+ },
+ {
+ "text": "Ablation Study We start by analyzing how adding or removing metadata fields from the input to SPECTER alters performance. The results are shown in the top four rows of Table 2 (for brevity, here we only report the average of the metrics from each task). We observe that removing the abstract from the textual input and relying only on the title results in a substantial decrease in performance. More surprisingly, adding authors as an input (along with title and abstract) hurts performance. 13 One possible explanation is that author names are sparse in the corpus, making it difficult for the model to infer document-level relatedness from them. As another possible reason of this behavior, tokenization using Wordpieces might be suboptimal for author names. Many author names are out-of-vocabulary for SciBERT and thus, they might be split into sub-words and shared across names that are not semantically related, leading to noisy correlation. Finally, we find that adding venues slightly decreases performance, 14 except on document classification (which makes sense, as we would expect venues to have high correlation 13 We experimented with both concatenating authors with the title and abstract and also considering them as an additional field. Neither were helpful.",
+ "cite_spans": [
+ {
+ "start": 492,
+ "end": 494,
+ "text": "13",
+ "ref_id": null
+ },
+ {
+ "start": 1123,
+ "end": 1125,
+ "text": "13",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 168,
+ "end": 175,
+ "text": "Table 2",
+ "ref_id": "TABREF3"
+ }
+ ],
+ "eq_spans": [],
+ "section": "Analysis",
+ "sec_num": "6"
+ },
+ {
+ "text": "14 Venue information in our data came directly from publisher provided metadata and thus was not normalized. Venue normalization could help improve results. with paper topics). The fact that SPECTER does not require inputs like authors or venues makes it applicable in situations where this metadata is not available, such as matching reviewers with anonymized submissions, or performing recommendations of anonymized preprints (e.g., on OpenReview). One design decision in SPECTER is to use a set of hard negative distractors in the citation-based finetuning objective. The fifth row of Table 2 shows that this is important\u2014using only easy negatives reduces performance on all tasks. While there could be other potential ways to include hard negatives in the model, our simple approach of including citations of citations is effective. The sixth row of the table shows that using a strong general-domain language model (BERT-Large) instead of SciBERT in SPECTER reduces performance considerably. This is reasonable because unlike BERT-Large, SciBERT is pretrained on scientific text.",
+ "cite_spans": [],
+ "ref_spans": [
+ {
+ "start": 588,
+ "end": 595,
+ "text": "Table 2",
+ "ref_id": "TABREF3"
+ }
+ ],
+ "eq_spans": [],
+ "section": "Analysis",
+ "sec_num": "6"
+ },
+ {
+ "text": "Visualization Figure 2 shows t-SNE (van der Maaten, 2014) projections of our embeddings (SPECTER) compared with the SciBERT baseline for a random set of papers. When comparing SPECTER embeddings with SciBERT, we observe that our embeddings are better at encoding topical information, as the clusters seem to be more compact. Further, we see some examples of crosstopic relatedness reflected in the embedding space (e.g., Engineering, Mathematics and Computer Science are close to each other, while Business and Economics are also close to each other). To quantify the comparison of visualized embeddings in Figure 2 , we use the DBScan clustering algorithm (Ester et al., 1996) on this 2D projection. We use the completeness and homogeneity clustering quality measures introduced by Rosenberg and Hirschberg (2007) . For the points corresponding to Figure 2 , the homogeneity and completeness values for SPECTER are respectively 0.41 and 0.72 compared with SciBERT's 0.19 and 0.63, a clear improvement on separating topics using the projected embeddings.",
+ "cite_spans": [
+ {
+ "start": 657,
+ "end": 677,
+ "text": "(Ester et al., 1996)",
+ "ref_id": "BIBREF12"
+ },
+ {
+ "start": 783,
+ "end": 814,
+ "text": "Rosenberg and Hirschberg (2007)",
+ "ref_id": "BIBREF41"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 14,
+ "end": 22,
+ "text": "Figure 2",
+ "ref_id": null
+ },
+ {
+ "start": 607,
+ "end": 615,
+ "text": "Figure 2",
+ "ref_id": null
+ },
+ {
+ "start": 849,
+ "end": 857,
+ "text": "Figure 2",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Analysis",
+ "sec_num": "6"
+ },
+ {
+ "text": "Comparison with Task Specific Fine-Tuning While the fact that SPECTER does not require finetuning makes its paper embeddings less costly to use, often the best performance from pretrained Transformers is obtained when the models are finetuned directly on each end task. We experiment with fine-tuning SciBERT on our tasks, and find this to be generally inferior to using our fixed representations from SPECTER. Specifically, we finetune SciBERT directly on task-specific signals instead of citations. To fine-tune on task-specific data (e.g., user activity), we used a dataset of co-views with 65K query papers, co-reads with 14K query papers, and co-citations (instead of direct citations) with 83K query papers. As the end tasks are ranking tasks, for all datasets we construct up to 5 triplets and fine-tune the model using triplet ranking loss. The positive papers are sampled from the most co-viewed (co-read, or co-cited) papers corresponding to the query paper. We also include both easy and hard distractors as when training SPECTER (for hard negatives we choose the least non-zero co-viewed (co-read, or co-cited) papers). We also consider training jointly on all task-specific training data sources in a multitask training process, where the model samples training triplets from a distribution over the sources. As illustrated in Table 3, without any additional final task-specific fine-tuning, SPECTER still outperforms a SciBERT model fine-tuned on the end tasks as well as their multitask combination, further demonstrating the effectiveness and versatility of SPECTER embeddings. 15",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Analysis",
+ "sec_num": "6"
+ },
+ {
+ "text": "Recent representation learning methods in NLP rely on training large neural language models on unsupervised data Radford et al., 2018; Devlin et al., 2019; Beltagy et al., 2019; Liu et al., 2019) . While successful at many sentenceand token-level tasks, our focus is on using the models for document-level representation learning, which has remained relatively under-explored.",
+ "cite_spans": [
+ {
+ "start": 113,
+ "end": 134,
+ "text": "Radford et al., 2018;",
+ "ref_id": "BIBREF38"
+ },
+ {
+ "start": 135,
+ "end": 155,
+ "text": "Devlin et al., 2019;",
+ "ref_id": "BIBREF11"
+ },
+ {
+ "start": 156,
+ "end": 177,
+ "text": "Beltagy et al., 2019;",
+ "ref_id": "BIBREF3"
+ },
+ {
+ "start": 178,
+ "end": 195,
+ "text": "Liu et al., 2019)",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Related Work",
+ "sec_num": "7"
+ },
+ {
+ "text": "There have been other efforts in document representation learning such as extensions of word vectors to documents (Le and Mikolov, 2014; Ganesh et al., 2016; Wu et al., 2018; Gysel et al., 2017) , convolution-based methods Zamani et al., 2018) , and variational autoencoders (Holmer and Marfurt, 2018; . Relevant to document embedding, sentence embedding is a relatively well-studied area of research. Successful approaches include seq2seq models (Kiros et al., 2015) , BiLSTM Siamese networks (Williams et al., 2018) , leveraging supervised data from other corpora (Conneau et al., 2017) , and using discourse relations (Nie et al., 2019) , and BERT-based methods (Reimers and Gurevych, 2019) . Unlike our proposed method, the majority of these approaches do not consider any notion of inter-document relatedness when embedding documents.",
+ "cite_spans": [
+ {
+ "start": 114,
+ "end": 136,
+ "text": "(Le and Mikolov, 2014;",
+ "ref_id": "BIBREF28"
+ },
+ {
+ "start": 137,
+ "end": 157,
+ "text": "Ganesh et al., 2016;",
+ "ref_id": "BIBREF14"
+ },
+ {
+ "start": 158,
+ "end": 174,
+ "text": "Wu et al., 2018;",
+ "ref_id": "BIBREF51"
+ },
+ {
+ "start": 175,
+ "end": 194,
+ "text": "Gysel et al., 2017)",
+ "ref_id": "BIBREF16"
+ },
+ {
+ "start": 223,
+ "end": 243,
+ "text": "Zamani et al., 2018)",
+ "ref_id": "BIBREF55"
+ },
+ {
+ "start": 275,
+ "end": 301,
+ "text": "(Holmer and Marfurt, 2018;",
+ "ref_id": "BIBREF19"
+ },
+ {
+ "start": 447,
+ "end": 467,
+ "text": "(Kiros et al., 2015)",
+ "ref_id": null
+ },
+ {
+ "start": 494,
+ "end": 517,
+ "text": "(Williams et al., 2018)",
+ "ref_id": "BIBREF49"
+ },
+ {
+ "start": 566,
+ "end": 588,
+ "text": "(Conneau et al., 2017)",
+ "ref_id": "BIBREF10"
+ },
+ {
+ "start": 621,
+ "end": 639,
+ "text": "(Nie et al., 2019)",
+ "ref_id": "BIBREF35"
+ },
+ {
+ "start": 665,
+ "end": 693,
+ "text": "(Reimers and Gurevych, 2019)",
+ "ref_id": "BIBREF40"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Related Work",
+ "sec_num": "7"
+ },
+ {
+ "text": "Other relevant work combines textual features with network structure (Tu et al., 2017; . These works typically do not leverage the recent pretrained contextual representations and with a few exceptions such as the recent work by , they cannot generalize to unseen documents like our SPECTER approach. Context-based citation recommendation is another related application where models rely on citation contexts (Jeong et al., 2019) to make predictions. These works are orthogonal to ours as the input to our model is just paper title and abstract. Another related line of work is graphbased representation learning methods (Bruna et al., 2014; Kipf and Welling, 2017; Hamilton et al., 2017a,b; Wu et al., 2019a,b) . Here, we compare to a graph representation learning model, SGC (Simple Graph Convolution) (Wu et al., 2019a) , which is a state-of-the-art graph convolution approach for representation learning. SPECTER uses pretrained language models in combination with graph-based citation signals, which enables it to outperform the graph-based approaches in our experiments. SPECTER embeddings are based on only the title and abstract of the paper. Adding the full text of the paper would provide a more complete picture of the paper's content and could improve accuracy (Cohen et al., 2010; Lin, 2008; Schuemie et al., 2004) . However, the full text of many academic papers is not freely available. Further, modern language models have strict memory limits on input size, which means new techniques would be required in order to leverage the entirety of the paper within the models. Exploring how to use the full paper text within SPECTER is an item of future work.",
+ "cite_spans": [
+ {
+ "start": 69,
+ "end": 86,
+ "text": "(Tu et al., 2017;",
+ "ref_id": "BIBREF46"
+ },
+ {
+ "start": 409,
+ "end": 429,
+ "text": "(Jeong et al., 2019)",
+ "ref_id": "BIBREF21"
+ },
+ {
+ "start": 621,
+ "end": 641,
+ "text": "(Bruna et al., 2014;",
+ "ref_id": null
+ },
+ {
+ "start": 642,
+ "end": 665,
+ "text": "Kipf and Welling, 2017;",
+ "ref_id": "BIBREF24"
+ },
+ {
+ "start": 666,
+ "end": 691,
+ "text": "Hamilton et al., 2017a,b;",
+ "ref_id": null
+ },
+ {
+ "start": 692,
+ "end": 711,
+ "text": "Wu et al., 2019a,b)",
+ "ref_id": null
+ },
+ {
+ "start": 804,
+ "end": 822,
+ "text": "(Wu et al., 2019a)",
+ "ref_id": "BIBREF50"
+ },
+ {
+ "start": 1273,
+ "end": 1293,
+ "text": "(Cohen et al., 2010;",
+ "ref_id": "BIBREF9"
+ },
+ {
+ "start": 1294,
+ "end": 1304,
+ "text": "Lin, 2008;",
+ "ref_id": "BIBREF29"
+ },
+ {
+ "start": 1305,
+ "end": 1327,
+ "text": "Schuemie et al., 2004)",
+ "ref_id": "BIBREF43"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Related Work",
+ "sec_num": "7"
+ },
+ {
+ "text": "Finally, one pain point in academic paper recommendation research has been a lack of publicly available datasets (Chen and Lee, 2018; Kanakia et al., 2019) . To address this challenge, we release SCIDOCS, our evaluation benchmark which includes an anonymized clickthrough dataset from an online recommendations system.",
+ "cite_spans": [
+ {
+ "start": 113,
+ "end": 133,
+ "text": "(Chen and Lee, 2018;",
+ "ref_id": "BIBREF8"
+ },
+ {
+ "start": 134,
+ "end": 155,
+ "text": "Kanakia et al., 2019)",
+ "ref_id": "BIBREF22"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Related Work",
+ "sec_num": "7"
+ },
+ {
+ "text": "We present SPECTER, a model for learning representations of scientific papers, based on a Transformer language model that is pretrained on citations. We achieve substantial improvements over the strongest of a wide variety of baselines, demonstrating the effectiveness of our model. We additionally introduce SCIDOCS, a new evaluation suite consisting of seven document-level tasks and release the corresponding datasets to foster further research in this area.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusions and Future Work",
+ "sec_num": "8"
+ },
+ {
+ "text": "The landscape of Transformer language models is rapidly changing and newer and larger models are frequently introduced. It would be interesting to initialize our model weights from more recent Transformer models to investigate if additional gains are possible. Another item of future work is to develop better multitask approaches to leverage multiple signals of relatedness information during training. We used citations to build triplets for our loss function, however there are other metrics that have good support from the bibliometrics literature (Klavans and Boyack, 2006) that warrant exploring as a way to create relatedness graphs. Including other information such as outgoing citations as additional input to the model would be yet another area to explore in future.",
+ "cite_spans": [
+ {
+ "start": 552,
+ "end": 578,
+ "text": "(Klavans and Boyack, 2006)",
+ "ref_id": "BIBREF26"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusions and Future Work",
+ "sec_num": "8"
+ },
+ {
+ "text": "4. SIF The SIF method of Arora et al. (2017) is a strong text representation baseline that takes a weighted sum of pretrained word vectors (we use fasttext embeddings described above), then computes the first principal component of the document embedding matrix and subtracts out each document embedding's projection to the first principal component.",
+ "cite_spans": [
+ {
+ "start": 25,
+ "end": 44,
+ "text": "Arora et al. (2017)",
+ "ref_id": "BIBREF2"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusions and Future Work",
+ "sec_num": "8"
+ },
+ {
+ "text": "We used a held-out validation set to choose a from the range [1.0e-5, 1.0e-3] spaced evenly on a log scale. The word probability p(w) was estimated on the training set only. When computing term-frequency values for SIF, we used scikit-learn's TfidfVectorizer with the same parameters as enumerated in the preceding section. sublinear_tf, binary, use_idf, smooth_idf were all set to False. Since SIF is a sum of pretrained fasttext vectors, the resulting dimensionality is 300. provides contextualized representations of tokens in a document. It can provide paragraph or document embeddings by averaging each token's representation for all 3 LSTM layers. We used the 768-dimensional pretrained ELMo model in AllenNLP .",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusions and Future Work",
+ "sec_num": "8"
+ },
+ {
+ "text": "6. Citeomatic The most relevant baseline is Citeomatic , which is an academic paper representation model that is trained on the citation graph via sampled triplets. Citeomatic representations are an L2 normalized weighted sum of title and abstract embeddings, which are trained on the citation graph with dynamic negative sampling. Citeomatic embeddings are 75-dimensional. 7. SGC Since our algorithm is trained on data from the citation graph, we also compare to a state-ofthe-art graph representation learning model: SGC (Simple Graph Convolution) (Wu et al., 2019a) , which is a graph convolution network. An alternative comparison would have been Graph-SAGE (Hamilton et al., 2017b) , but SGC (with no learning) outperformed an unsupervised variant of GraphSAGE on the Reddit dataset 16 , Note that SGC with no learning boils down to graph propagation on node features (in our case nodes are academic documents). Following Hamilton et al. (2017a), we used SIF features as node representations, and applied SGC with a range of parameter k, which is the number of times the normalized adjacency is multiplied by the SIF feature matrix. Our range of k was 1 through 8 (inclusive), and was chosen with a validation set. For the node features, we chose the SIF model with a = 0.0001, as this model was observed to be a high-performing one. This baseline is also 300 dimensional.",
+ "cite_spans": [
+ {
+ "start": 550,
+ "end": 568,
+ "text": "(Wu et al., 2019a)",
+ "ref_id": "BIBREF50"
+ },
+ {
+ "start": 662,
+ "end": 686,
+ "text": "(Hamilton et al., 2017b)",
+ "ref_id": "BIBREF18"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "ELMo ELMo",
+ "sec_num": "5."
+ },
+ {
+ "text": "8. SciBERT To isolate the advantage of SPECTER's citation-based fine-tuning objective, we add a controlled comparison with SciBERT (Beltagy et al., 2019) . Following Devlin et al. (2019) we take the last layer hidden state corresponding to the [CLS] token as the aggregate document representation. 17 There were no other direct comparisons in Wu et al. (2019a) 17 We also tried the alternative of averaging all token representations, but this resulted in a slight performance decrease compared with the [CLS] pooled token. 9. Sentence BERT Sentence BERT (Reimers and Gurevych, 2019 ) is a general-domain pretrained model aimed at embedding sentences. The authors fine-tuned BERT using a triplet loss, where positive sentences were from the same document section as the seed sentence, and distractor sentences came from other document sections. The model is designed to encode sentences as opposed to paragraphs, so we embed the title and each sentence in the abstract separately, sum the embeddings, and L2 normalize the result to produce a final 768-dimensional paper embedding. 18 During hyperparameter optimization we chose how to compute TF and IDF values weights by taking the following non-redundant combinations of scikit-learn's TfidfVectorizer (Pedregosa et al., 2011) parameters: sublinear_tf, binary, use_idf, smooth_idf. There were a total of 9 parameter combinations. The IDF values were estimated on the training set. The other parameters were set as follows: min_df=3, max_df=0.75, strip_accents='ascii', stop_words='english', norm=None, lowercase=True. For training of fasttext, we used all default parameters with the exception of setting dimension to 300 and minCount was set to 25 due to the large corpus.",
+ "cite_spans": [
+ {
+ "start": 131,
+ "end": 153,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ },
+ {
+ "start": 166,
+ "end": 186,
+ "text": "Devlin et al. (2019)",
+ "ref_id": "BIBREF11"
+ },
+ {
+ "start": 298,
+ "end": 300,
+ "text": "17",
+ "ref_id": null
+ },
+ {
+ "start": 343,
+ "end": 360,
+ "text": "Wu et al. (2019a)",
+ "ref_id": "BIBREF50"
+ },
+ {
+ "start": 554,
+ "end": 581,
+ "text": "(Reimers and Gurevych, 2019",
+ "ref_id": "BIBREF40"
+ },
+ {
+ "start": 1080,
+ "end": 1082,
+ "text": "18",
+ "ref_id": null
+ },
+ {
+ "start": 1253,
+ "end": 1277,
+ "text": "(Pedregosa et al., 2011)",
+ "ref_id": "BIBREF36"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "ELMo ELMo",
+ "sec_num": "5."
+ },
+ {
+ "text": "We used the 'bert-base-wikipedia-sections-mean-tokens' model released by the authors: https://github.com/ UKPLab/sentence-transformers",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "ELMo ELMo",
+ "sec_num": "5."
+ },
+ {
+ "text": "We also experimented with additional fields such as venues and authors but did not find any empirical advantage in using those (see \u00a76). See \u00a77 for a discussion of using the full text of the paper as input.5 It is also possible to encode title and abstracts individually and then concatenate or combine them to get the final embedding. However, in our experiments this resulted in sub-optimal performance.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "https://www.nlm.nih.gov/mesh/meshhome. html 8 https://academic.microsoft.com/",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "Embeddings are L2 normalized and in this case cosine distance is equivalent to L2 distance.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "Learning rate linear warmup followed by linear decay. 11 https://github.com/allenai/specter",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "We also experimented with further task-specific finetuning of our SPECTER on the end tasks but we did not observe additional improvements.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ }
+ ],
+ "back_matter": [
+ {
+ "text": "We thank Kyle Lo, Daniel King and Oren Etzioni for helpful research discussions, Russel Reas for setting up the public API, Field Cady for help in initial data collection and the anonymous reviewers (especially Reviewer 1) for comments and suggestions. This work was supported in part by NSF Convergence Accelerator award 1936940, ONR grant N00014-18-1-2193, and the University of Washington WRF/Cable Professorship.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Acknowledgements",
+ "sec_num": null
+ },
+ {
+ "text": "A Appendix A -Baseline Details 1. Random Zero-mean 25-dimensional vectors were used as representations for each document.2. Doc2Vec Doc2Vec is one of the earlier neural document/paragraph representation methods (Le and Mikolov, 2014) , and is a natural comparison. We trained Doc2Vec on our training subset using Gensim (\u0158eh\u016f\u0159ek and Sojka, 2010) , and chose the hyperparameter grid using suggestions from Lau and Baldwin (2016). The hyperparameter grid used:{'window': [5, 10, 15] , 'sample': [0, 10 ** -6, 10 ** -5], 'epochs': [50, 100, 200]}, for a total of 27 models. The other parameters were set as follows: vector_size=300, min_count=3, alpha=0.025, min_alpha=0.0001, negative=5, dm=0, dbow=1, dbow_words=0. 3. Fasttext-Sum This simple baseline is a weighted sum of pretrained word vectors. We trained our own 300 dimensional fasttext embeddings (Bojanowski et al., 2017) on a corpus of around 3.1B tokens from scientific papers which is similar in size to the SciBERT corpus (Beltagy et al., 2019) . We found that these pretrained embeddings substantially outperform alternative off-theshelf embeddings. We also use these embeddings in other baselines that require pretrained word vectors (i.e., SIF and SGC that are described below). The summed bag of words representation has a number of weighting options, which are extensively tuned on a validation set for best performance.",
+ "cite_spans": [
+ {
+ "start": 211,
+ "end": 233,
+ "text": "(Le and Mikolov, 2014)",
+ "ref_id": "BIBREF28"
+ },
+ {
+ "start": 320,
+ "end": 345,
+ "text": "(\u0158eh\u016f\u0159ek and Sojka, 2010)",
+ "ref_id": null
+ },
+ {
+ "start": 469,
+ "end": 472,
+ "text": "[5,",
+ "ref_id": null
+ },
+ {
+ "start": 473,
+ "end": 476,
+ "text": "10,",
+ "ref_id": null
+ },
+ {
+ "start": 477,
+ "end": 480,
+ "text": "15]",
+ "ref_id": null
+ },
+ {
+ "start": 852,
+ "end": 877,
+ "text": "(Bojanowski et al., 2017)",
+ "ref_id": "BIBREF5"
+ },
+ {
+ "start": 982,
+ "end": 1004,
+ "text": "(Beltagy et al., 2019)",
+ "ref_id": "BIBREF3"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "annex",
+ "sec_num": null
+ }
+ ],
+ "bib_entries": {
+ "BIBREF0": {
+ "ref_id": "b0",
+ "title": "Estimating position bias without intrusive interventions",
+ "authors": [
+ {
+ "first": "K",
+ "middle": [],
+ "last": "Anant",
+ "suffix": ""
+ },
+ {
+ "first": "Ivan",
+ "middle": [],
+ "last": "Agarwal",
+ "suffix": ""
+ },
+ {
+ "first": "Xuanhui",
+ "middle": [],
+ "last": "Zaitsev",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Yen",
+ "middle": [],
+ "last": "Cheng",
+ "suffix": ""
+ },
+ {
+ "first": "Marc",
+ "middle": [],
+ "last": "Li",
+ "suffix": ""
+ },
+ {
+ "first": "Thorsten",
+ "middle": [],
+ "last": "Najork",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Joachims",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "WSDM",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Anant K. Agarwal, Ivan Zaitsev, Xuanhui Wang, Cheng Yen Li, Marc Najork, and Thorsten Joachims. 2019. Estimating position bias without intrusive in- terventions. In WSDM.",
+ "links": null
+ },
+ "BIBREF1": {
+ "ref_id": "b1",
+ "title": "Construction of the literature graph in semantic scholar",
+ "authors": [
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ },
+ {
+ "first": "Dirk",
+ "middle": [],
+ "last": "Groeneveld",
+ "suffix": ""
+ },
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": ""
+ },
+ {
+ "first": "Iz",
+ "middle": [],
+ "last": "Beltagy",
+ "suffix": ""
+ },
+ {
+ "first": "Miles",
+ "middle": [],
+ "last": "Crawford",
+ "suffix": ""
+ },
+ {
+ "first": "Doug",
+ "middle": [],
+ "last": "Downey",
+ "suffix": ""
+ },
+ {
+ "first": "Jason",
+ "middle": [],
+ "last": "Dunkelberger",
+ "suffix": ""
+ },
+ {
+ "first": "Ahmed",
+ "middle": [],
+ "last": "Elgohary",
+ "suffix": ""
+ },
+ {
+ "first": "Sergey",
+ "middle": [],
+ "last": "Feldman",
+ "suffix": ""
+ },
+ {
+ "first": "Vu",
+ "middle": [],
+ "last": "Ha",
+ "suffix": ""
+ },
+ {
+ "first": "Rodney",
+ "middle": [],
+ "last": "Kinney",
+ "suffix": ""
+ },
+ {
+ "first": "Sebastian",
+ "middle": [],
+ "last": "Kohlmeier",
+ "suffix": ""
+ },
+ {
+ "first": "Kyle",
+ "middle": [],
+ "last": "Lo",
+ "suffix": ""
+ },
+ {
+ "first": "Tyler",
+ "middle": [
+ "C"
+ ],
+ "last": "Murray",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Hsu-Han",
+ "suffix": ""
+ },
+ {
+ "first": "Matthew",
+ "middle": [
+ "E"
+ ],
+ "last": "Ooi",
+ "suffix": ""
+ },
+ {
+ "first": "Joanna",
+ "middle": [],
+ "last": "Peters",
+ "suffix": ""
+ },
+ {
+ "first": "Sam",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ },
+ {
+ "first": "Lucy",
+ "middle": [
+ "Lu"
+ ],
+ "last": "Skjonsberg",
+ "suffix": ""
+ },
+ {
+ "first": "Christopher",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Zheng",
+ "middle": [],
+ "last": "Wilhelm",
+ "suffix": ""
+ },
+ {
+ "first": "Madeleine",
+ "middle": [],
+ "last": "Yuan",
+ "suffix": ""
+ },
+ {
+ "first": "Oren",
+ "middle": [],
+ "last": "Van Zuylen",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Etzioni",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "NAACL-HLT",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Waleed Ammar, Dirk Groeneveld, Chandra Bha- gavatula, Iz Beltagy, Miles Crawford, Doug Downey, Jason Dunkelberger, Ahmed Elgohary, Sergey Feldman, Vu Ha, Rodney Kinney, Sebas- tian Kohlmeier, Kyle Lo, Tyler C. Murray, Hsu- Han Ooi, Matthew E. Peters, Joanna Power, Sam Skjonsberg, Lucy Lu Wang, Christopher Wilhelm, Zheng Yuan, Madeleine van Zuylen, and Oren Et- zioni. 2018. Construction of the literature graph in semantic scholar. In NAACL-HLT.",
+ "links": null
+ },
+ "BIBREF2": {
+ "ref_id": "b2",
+ "title": "A simple but tough-to-beat baseline for sentence embeddings",
+ "authors": [
+ {
+ "first": "Sanjeev",
+ "middle": [],
+ "last": "Arora",
+ "suffix": ""
+ },
+ {
+ "first": "Yingyu",
+ "middle": [],
+ "last": "Liang",
+ "suffix": ""
+ },
+ {
+ "first": "Tengyu",
+ "middle": [],
+ "last": "Ma",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ICLR",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Sanjeev Arora, Yingyu Liang, and Tengyu Ma. 2017. A simple but tough-to-beat baseline for sentence em- beddings. In ICLR.",
+ "links": null
+ },
+ "BIBREF3": {
+ "ref_id": "b3",
+ "title": "SciB-ERT: A Pretrained Language Model for Scientific Text",
+ "authors": [
+ {
+ "first": "Iz",
+ "middle": [],
+ "last": "Beltagy",
+ "suffix": ""
+ },
+ {
+ "first": "Kyle",
+ "middle": [],
+ "last": "Lo",
+ "suffix": ""
+ },
+ {
+ "first": "Arman",
+ "middle": [],
+ "last": "Cohan",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Iz Beltagy, Kyle Lo, and Arman Cohan. 2019. SciB- ERT: A Pretrained Language Model for Scientific Text. In EMNLP.",
+ "links": null
+ },
+ "BIBREF4": {
+ "ref_id": "b4",
+ "title": "Content-Based Citation Recommendation",
+ "authors": [
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": ""
+ },
+ {
+ "first": "Sergey",
+ "middle": [],
+ "last": "Feldman",
+ "suffix": ""
+ },
+ {
+ "first": "Russell",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ },
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Chandra Bhagavatula, Sergey Feldman, Russell Power, and Waleed Ammar. 2018. Content-Based Citation Recommendation. In NAACL-HLT.",
+ "links": null
+ },
+ "BIBREF5": {
+ "ref_id": "b5",
+ "title": "Enriching word vectors with subword information",
+ "authors": [
+ {
+ "first": "Piotr",
+ "middle": [],
+ "last": "Bojanowski",
+ "suffix": ""
+ },
+ {
+ "first": "Edouard",
+ "middle": [],
+ "last": "Grave",
+ "suffix": ""
+ },
+ {
+ "first": "Armand",
+ "middle": [],
+ "last": "Joulin",
+ "suffix": ""
+ },
+ {
+ "first": "Tomas",
+ "middle": [],
+ "last": "Mikolov",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.1162/tacl_a_00051"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Piotr Bojanowski, Edouard Grave, Armand Joulin, and Tomas Mikolov. 2017. Enriching word vectors with subword information. TACL.",
+ "links": null
+ },
+ "BIBREF7": {
+ "ref_id": "b7",
+ "title": "Improving textual network embedding with global attention via optimal transport",
+ "authors": [
+ {
+ "first": "Liqun",
+ "middle": [],
+ "last": "Chen",
+ "suffix": ""
+ },
+ {
+ "first": "Guoyin",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Chenyang",
+ "middle": [],
+ "last": "Tao",
+ "suffix": ""
+ },
+ {
+ "first": "Dinghan",
+ "middle": [],
+ "last": "Shen",
+ "suffix": ""
+ },
+ {
+ "first": "Pengyu",
+ "middle": [],
+ "last": "Cheng",
+ "suffix": ""
+ },
+ {
+ "first": "Xinyuan",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Wenlin",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Yizhe",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Lawrence",
+ "middle": [],
+ "last": "Carin",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Liqun Chen, Guoyin Wang, Chenyang Tao, Ding- han Shen, Pengyu Cheng, Xinyuan Zhang, Wenlin Wang, Yizhe Zhang, and Lawrence Carin. 2019. Im- proving textual network embedding with global at- tention via optimal transport. In ACL.",
+ "links": null
+ },
+ "BIBREF8": {
+ "ref_id": "b8",
+ "title": "Research Paper Recommender Systems on Big Scholarly Data",
+ "authors": [
+ {
+ "first": "Maria",
+ "middle": [],
+ "last": "Tsung Teng Chen",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Lee",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "Knowledge Management and Acquisition for Intelligent Systems",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Tsung Teng Chen and Maria Lee. 2018. Research Pa- per Recommender Systems on Big Scholarly Data. In Knowledge Management and Acquisition for In- telligent Systems.",
+ "links": null
+ },
+ "BIBREF9": {
+ "ref_id": "b9",
+ "title": "The structural and content aspects of abstracts versus bodies of full text journal articles are different",
+ "authors": [
+ {
+ "first": "K",
+ "middle": [],
+ "last": "Cohen",
+ "suffix": ""
+ },
+ {
+ "first": "Helen",
+ "middle": [
+ "L"
+ ],
+ "last": "Johnson",
+ "suffix": ""
+ },
+ {
+ "first": "Karin",
+ "middle": [
+ "M"
+ ],
+ "last": "Verspoor",
+ "suffix": ""
+ },
+ {
+ "first": "Christophe",
+ "middle": [],
+ "last": "Roeder",
+ "suffix": ""
+ },
+ {
+ "first": "Lawrence",
+ "middle": [],
+ "last": "Hunter",
+ "suffix": ""
+ }
+ ],
+ "year": 2010,
+ "venue": "BMC Bioinformatics",
+ "volume": "11",
+ "issue": "",
+ "pages": "492--492",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "K. Bretonnel Cohen, Helen L. Johnson, Karin M. Ver- spoor, Christophe Roeder, and Lawrence Hunter. 2010. The structural and content aspects of abstracts versus bodies of full text journal articles are different. BMC Bioinformatics, 11:492-492.",
+ "links": null
+ },
+ "BIBREF10": {
+ "ref_id": "b10",
+ "title": "Supervised Learning of Universal Sentence Representations from Natural Language Inference Data",
+ "authors": [
+ {
+ "first": "Alexis",
+ "middle": [],
+ "last": "Conneau",
+ "suffix": ""
+ },
+ {
+ "first": "Douwe",
+ "middle": [],
+ "last": "Kiela",
+ "suffix": ""
+ },
+ {
+ "first": "Holger",
+ "middle": [],
+ "last": "Schwenk",
+ "suffix": ""
+ },
+ {
+ "first": "Lo\u00efc",
+ "middle": [],
+ "last": "Barrault",
+ "suffix": ""
+ },
+ {
+ "first": "Antoine",
+ "middle": [],
+ "last": "Bordes",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.18653/v1/D17-1070"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Alexis Conneau, Douwe Kiela, Holger Schwenk, Lo\u00efc Barrault, and Antoine Bordes. 2017. Supervised Learning of Universal Sentence Representations from Natural Language Inference Data. In EMNLP.",
+ "links": null
+ },
+ "BIBREF11": {
+ "ref_id": "b11",
+ "title": "BERT: Pre-training of deep bidirectional transformers for language understanding",
+ "authors": [
+ {
+ "first": "Jacob",
+ "middle": [],
+ "last": "Devlin",
+ "suffix": ""
+ },
+ {
+ "first": "Ming-Wei",
+ "middle": [],
+ "last": "Chang",
+ "suffix": ""
+ },
+ {
+ "first": "Kenton",
+ "middle": [],
+ "last": "Lee",
+ "suffix": ""
+ },
+ {
+ "first": "Kristina",
+ "middle": [],
+ "last": "Toutanova",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "NAACL-HLT",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of deep bidirectional transformers for language under- standing. In NAACL-HLT.",
+ "links": null
+ },
+ "BIBREF12": {
+ "ref_id": "b12",
+ "title": "A Density-based Algorithm for Discovering Clusters in Large Spatial Databases with Noise",
+ "authors": [
+ {
+ "first": "Martin",
+ "middle": [],
+ "last": "Ester",
+ "suffix": ""
+ },
+ {
+ "first": "Hans-Peter",
+ "middle": [],
+ "last": "Kriegel",
+ "suffix": ""
+ },
+ {
+ "first": "J\u00f6rg",
+ "middle": [],
+ "last": "Sander",
+ "suffix": ""
+ },
+ {
+ "first": "Xiaowei",
+ "middle": [],
+ "last": "Xu",
+ "suffix": ""
+ }
+ ],
+ "year": 1996,
+ "venue": "KDD",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Martin Ester, Hans-Peter Kriegel, J\u00f6rg Sander, Xiaowei Xu, et al. 1996. A Density-based Algorithm for Dis- covering Clusters in Large Spatial Databases with Noise. In KDD.",
+ "links": null
+ },
+ "BIBREF13": {
+ "ref_id": "b13",
+ "title": "Quantifying Sex Bias in Clinical Studies at Scale With Automated Data Extraction",
+ "authors": [
+ {
+ "first": "Sergey",
+ "middle": [],
+ "last": "Feldman",
+ "suffix": ""
+ },
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ },
+ {
+ "first": "Kyle",
+ "middle": [],
+ "last": "Lo",
+ "suffix": ""
+ },
+ {
+ "first": "Elly",
+ "middle": [],
+ "last": "Trepman",
+ "suffix": ""
+ },
+ {
+ "first": "Madeleine",
+ "middle": [],
+ "last": "Van Zuylen",
+ "suffix": ""
+ },
+ {
+ "first": "Oren",
+ "middle": [],
+ "last": "Etzioni",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "JAMA",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.1001/jamanetworkopen.2019.6700"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Sergey Feldman, Waleed Ammar, Kyle Lo, Elly Trep- man, Madeleine van Zuylen, and Oren Etzioni. 2019. Quantifying Sex Bias in Clinical Studies at Scale With Automated Data Extraction. JAMA.",
+ "links": null
+ },
+ "BIBREF14": {
+ "ref_id": "b14",
+ "title": "Doc2sent2vec: A novel two-phase approach for learning document representation",
+ "authors": [
+ {
+ "first": "J",
+ "middle": [],
+ "last": "Ganesh",
+ "suffix": ""
+ },
+ {
+ "first": "Manish",
+ "middle": [],
+ "last": "Gupta",
+ "suffix": ""
+ },
+ {
+ "first": "Vijay",
+ "middle": [
+ "K"
+ ],
+ "last": "Varma",
+ "suffix": ""
+ }
+ ],
+ "year": 2016,
+ "venue": "SIGIR",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "J Ganesh, Manish Gupta, and Vijay K. Varma. 2016. Doc2sent2vec: A novel two-phase approach for learning document representation. In SIGIR.",
+ "links": null
+ },
+ "BIBREF15": {
+ "ref_id": "b15",
+ "title": "AllenNLP: A Deep Semantic Natural Language Processing Platform",
+ "authors": [
+ {
+ "first": "Matt",
+ "middle": [],
+ "last": "Gardner",
+ "suffix": ""
+ },
+ {
+ "first": "Joel",
+ "middle": [],
+ "last": "Grus",
+ "suffix": ""
+ },
+ {
+ "first": "Mark",
+ "middle": [],
+ "last": "Neumann",
+ "suffix": ""
+ },
+ {
+ "first": "Oyvind",
+ "middle": [],
+ "last": "Tafjord",
+ "suffix": ""
+ },
+ {
+ "first": "Pradeep",
+ "middle": [],
+ "last": "Dasigi",
+ "suffix": ""
+ },
+ {
+ "first": "Nelson",
+ "middle": [
+ "F"
+ ],
+ "last": "Liu",
+ "suffix": ""
+ },
+ {
+ "first": "Matthew",
+ "middle": [],
+ "last": "Peters",
+ "suffix": ""
+ },
+ {
+ "first": "Michael",
+ "middle": [],
+ "last": "Schmitz",
+ "suffix": ""
+ },
+ {
+ "first": "Luke",
+ "middle": [],
+ "last": "Zettlemoyer",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "Proceedings of Workshop for NLP Open Source Software",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.18653/v1/W18-2501"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Matt Gardner, Joel Grus, Mark Neumann, Oyvind Tafjord, Pradeep Dasigi, Nelson F. Liu, Matthew Pe- ters, Michael Schmitz, and Luke Zettlemoyer. 2018. AllenNLP: A Deep Semantic Natural Language Pro- cessing Platform. In Proceedings of Workshop for NLP Open Source Software (NLP-OSS).",
+ "links": null
+ },
+ "BIBREF16": {
+ "ref_id": "b16",
+ "title": "Neural Vector Spaces for Unsupervised Information Retrieval",
+ "authors": [
+ {
+ "first": "Christophe",
+ "middle": [],
+ "last": "Van Gysel",
+ "suffix": ""
+ },
+ {
+ "first": "Maarten",
+ "middle": [],
+ "last": "De Rijke",
+ "suffix": ""
+ },
+ {
+ "first": "Evangelos",
+ "middle": [],
+ "last": "Kanoulas",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ACM Trans. Inf. Syst",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Christophe Van Gysel, Maarten de Rijke, and Evange- los Kanoulas. 2017. Neural Vector Spaces for Un- supervised Information Retrieval. ACM Trans. Inf. Syst.",
+ "links": null
+ },
+ "BIBREF17": {
+ "ref_id": "b17",
+ "title": "Inductive Representation Learning on Large Graphs",
+ "authors": [
+ {
+ "first": "Will",
+ "middle": [],
+ "last": "Hamilton",
+ "suffix": ""
+ },
+ {
+ "first": "Zhitao",
+ "middle": [],
+ "last": "Ying",
+ "suffix": ""
+ },
+ {
+ "first": "Jure",
+ "middle": [],
+ "last": "Leskovec",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "NIPS",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Will Hamilton, Zhitao Ying, and Jure Leskovec. 2017a. Inductive Representation Learning on Large Graphs. In NIPS.",
+ "links": null
+ },
+ "BIBREF18": {
+ "ref_id": "b18",
+ "title": "Inductive representation learning on large graphs",
+ "authors": [
+ {
+ "first": "William",
+ "middle": [
+ "L"
+ ],
+ "last": "Hamilton",
+ "suffix": ""
+ },
+ {
+ "first": "Zhitao",
+ "middle": [],
+ "last": "Ying",
+ "suffix": ""
+ },
+ {
+ "first": "Jure",
+ "middle": [],
+ "last": "Leskovec",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "NIPS",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "William L. Hamilton, Zhitao Ying, and Jure Leskovec. 2017b. Inductive representation learning on large graphs. In NIPS.",
+ "links": null
+ },
+ "BIBREF19": {
+ "ref_id": "b19",
+ "title": "Explaining away syntactic structure in semantic document representations",
+ "authors": [
+ {
+ "first": "Erik",
+ "middle": [],
+ "last": "Holmer",
+ "suffix": ""
+ },
+ {
+ "first": "Andreas",
+ "middle": [],
+ "last": "Marfurt",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "ArXiv",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Erik Holmer and Andreas Marfurt. 2018. Explaining away syntactic structure in semantic document rep- resentations. ArXiv, abs/1806.01620.",
+ "links": null
+ },
+ "BIBREF20": {
+ "ref_id": "b20",
+ "title": "Universal Language Model Fine-tuning for Text Classification",
+ "authors": [
+ {
+ "first": "Jeremy",
+ "middle": [],
+ "last": "Howard",
+ "suffix": ""
+ },
+ {
+ "first": "Sebastian",
+ "middle": [],
+ "last": "Ruder",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.18653/v1/P18-1031"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Jeremy Howard and Sebastian Ruder. 2018. Universal Language Model Fine-tuning for Text Classification. In ACL.",
+ "links": null
+ },
+ "BIBREF21": {
+ "ref_id": "b21",
+ "title": "A context-aware citation recommendation model with bert and graph convolutional networks",
+ "authors": [
+ {
+ "first": "Chanwoo",
+ "middle": [],
+ "last": "Jeong",
+ "suffix": ""
+ },
+ {
+ "first": "Sion",
+ "middle": [],
+ "last": "Jang",
+ "suffix": ""
+ },
+ {
+ "first": "Hyuna",
+ "middle": [],
+ "last": "Shin",
+ "suffix": ""
+ },
+ {
+ "first": "Lucy",
+ "middle": [],
+ "last": "Eunjeong",
+ "suffix": ""
+ },
+ {
+ "first": "Sungchul",
+ "middle": [],
+ "last": "Park",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Choi",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "ArXiv",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Chanwoo Jeong, Sion Jang, Hyuna Shin, Eun- jeong Lucy Park, and Sungchul Choi. 2019. A context-aware citation recommendation model with bert and graph convolutional networks. ArXiv, abs/1903.06464.",
+ "links": null
+ },
+ "BIBREF22": {
+ "ref_id": "b22",
+ "title": "A Scalable Hybrid Research Paper Recommender System for Microsoft Academic",
+ "authors": [
+ {
+ "first": "Anshul",
+ "middle": [],
+ "last": "Kanakia",
+ "suffix": ""
+ },
+ {
+ "first": "Zhihong",
+ "middle": [],
+ "last": "Shen",
+ "suffix": ""
+ },
+ {
+ "first": "Darrin",
+ "middle": [],
+ "last": "Eide",
+ "suffix": ""
+ },
+ {
+ "first": "Kuansan",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "WWW",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Anshul Kanakia, Zhihong Shen, Darrin Eide, and Kuansan Wang. 2019. A Scalable Hybrid Research Paper Recommender System for Microsoft Aca- demic. In WWW.",
+ "links": null
+ },
+ "BIBREF23": {
+ "ref_id": "b23",
+ "title": "Adam: A Method for Stochastic Optimization",
+ "authors": [
+ {
+ "first": "P",
+ "middle": [],
+ "last": "Diederik",
+ "suffix": ""
+ },
+ {
+ "first": "Jimmy",
+ "middle": [],
+ "last": "Kingma",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Ba",
+ "suffix": ""
+ }
+ ],
+ "year": 2014,
+ "venue": "ArXiv",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Diederik P. Kingma and Jimmy Ba. 2014. Adam: A Method for Stochastic Optimization. ArXiv, abs/1412.6980.",
+ "links": null
+ },
+ "BIBREF24": {
+ "ref_id": "b24",
+ "title": "Semisupervised classification with graph convolutional networks",
+ "authors": [
+ {
+ "first": "N",
+ "middle": [],
+ "last": "Thomas",
+ "suffix": ""
+ },
+ {
+ "first": "Max",
+ "middle": [],
+ "last": "Kipf",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Welling",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Thomas N Kipf and Max Welling. 2017. Semi- supervised classification with graph convolutional networks. ICLR.",
+ "links": null
+ },
+ "BIBREF25": {
+ "ref_id": "b25",
+ "title": "Raquel Urtasun, and Sanja Fidler. 2015. Skip-thought vectors",
+ "authors": [
+ {
+ "first": "Ryan",
+ "middle": [],
+ "last": "Kiros",
+ "suffix": ""
+ },
+ {
+ "first": "Yukun",
+ "middle": [],
+ "last": "Zhu",
+ "suffix": ""
+ },
+ {
+ "first": "Ruslan",
+ "middle": [],
+ "last": "Salakhutdinov",
+ "suffix": ""
+ },
+ {
+ "first": "Richard",
+ "middle": [
+ "S"
+ ],
+ "last": "Zemel",
+ "suffix": ""
+ },
+ {
+ "first": "Antonio",
+ "middle": [],
+ "last": "Torralba",
+ "suffix": ""
+ }
+ ],
+ "year": null,
+ "venue": "NIPS",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel, Antonio Torralba, Raquel Urta- sun, and Sanja Fidler. 2015. Skip-thought vectors. In NIPS.",
+ "links": null
+ },
+ "BIBREF26": {
+ "ref_id": "b26",
+ "title": "Identifying a better measure of relatedness for mapping science",
+ "authors": [
+ {
+ "first": "Richard",
+ "middle": [],
+ "last": "Klavans",
+ "suffix": ""
+ },
+ {
+ "first": "Kevin",
+ "middle": [
+ "W"
+ ],
+ "last": "Boyack",
+ "suffix": ""
+ }
+ ],
+ "year": 2006,
+ "venue": "Journal of the Association for Information Science and Technology",
+ "volume": "57",
+ "issue": "",
+ "pages": "251--263",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Richard Klavans and Kevin W. Boyack. 2006. Iden- tifying a better measure of relatedness for mapping science. Journal of the Association for Information Science and Technology, 57:251-263.",
+ "links": null
+ },
+ "BIBREF27": {
+ "ref_id": "b27",
+ "title": "An empirical evaluation of doc2vec with practical insights into document embedding generation",
+ "authors": [
+ {
+ "first": "Han",
+ "middle": [],
+ "last": "Jey",
+ "suffix": ""
+ },
+ {
+ "first": "Timothy",
+ "middle": [],
+ "last": "Lau",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Baldwin",
+ "suffix": ""
+ }
+ ],
+ "year": 2016,
+ "venue": "Rep4NLP@ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Jey Han Lau and Timothy Baldwin. 2016. An empirical evaluation of doc2vec with practical in- sights into document embedding generation. In Rep4NLP@ACL.",
+ "links": null
+ },
+ "BIBREF28": {
+ "ref_id": "b28",
+ "title": "Distributed Representations of Sentences and Documents",
+ "authors": [
+ {
+ "first": "Quoc",
+ "middle": [],
+ "last": "Le",
+ "suffix": ""
+ },
+ {
+ "first": "Tomas",
+ "middle": [],
+ "last": "Mikolov",
+ "suffix": ""
+ }
+ ],
+ "year": 2014,
+ "venue": "ICML",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Quoc Le and Tomas Mikolov. 2014. Distributed Repre- sentations of Sentences and Documents. In ICML.",
+ "links": null
+ },
+ "BIBREF29": {
+ "ref_id": "b29",
+ "title": "Is searching full text more effective than searching abstracts?",
+ "authors": [
+ {
+ "first": "Jimmy",
+ "middle": [
+ "J"
+ ],
+ "last": "Lin",
+ "suffix": ""
+ }
+ ],
+ "year": 2008,
+ "venue": "BMC Bioinformatics",
+ "volume": "10",
+ "issue": "",
+ "pages": "46--46",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Jimmy J. Lin. 2008. Is searching full text more effec- tive than searching abstracts? BMC Bioinformatics, 10:46-46.",
+ "links": null
+ },
+ "BIBREF30": {
+ "ref_id": "b30",
+ "title": "Bulletin of the Medical Library Association",
+ "authors": [
+ {
+ "first": "Carolyn",
+ "middle": [
+ "E"
+ ],
+ "last": "Lipscomb",
+ "suffix": ""
+ }
+ ],
+ "year": 2000,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Carolyn E Lipscomb. 2000. Medical Subject Headings (MeSH). Bulletin of the Medical Library Associa- tion.",
+ "links": null
+ },
+ "BIBREF31": {
+ "ref_id": "b31",
+ "title": "Unsupervised Document Embedding with CNNs",
+ "authors": [
+ {
+ "first": "Chundi",
+ "middle": [],
+ "last": "Liu",
+ "suffix": ""
+ },
+ {
+ "first": "Shunan",
+ "middle": [],
+ "last": "Zhao",
+ "suffix": ""
+ },
+ {
+ "first": "Maksims",
+ "middle": [],
+ "last": "Volkovs",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "ArXiv",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Chundi Liu, Shunan Zhao, and Maksims Volkovs. 2018. Unsupervised Document Embedding with CNNs. ArXiv, abs/1711.04168v3.",
+ "links": null
+ },
+ "BIBREF32": {
+ "ref_id": "b32",
+ "title": "A Model of Extended Paragraph Vector for Document Categorization and Trend Analysis",
+ "authors": [
+ {
+ "first": "Pengfei",
+ "middle": [],
+ "last": "Liu",
+ "suffix": ""
+ },
+ {
+ "first": "King",
+ "middle": [
+ "Keung"
+ ],
+ "last": "Wu",
+ "suffix": ""
+ },
+ {
+ "first": "Helen",
+ "middle": [
+ "M"
+ ],
+ "last": "Meng",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Pengfei Liu, King Keung Wu, and Helen M. Meng. 2017. A Model of Extended Paragraph Vector for Document Categorization and Trend Analysis. IJCNN.",
+ "links": null
+ },
+ "BIBREF34": {
+ "ref_id": "b34",
+ "title": "Accelerating t-SNE Using Tree-based Algorithms",
+ "authors": [
+ {
+ "first": "Laurens",
+ "middle": [],
+ "last": "Van Der Maaten",
+ "suffix": ""
+ }
+ ],
+ "year": 2014,
+ "venue": "Journal of Machine Learning Research",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Laurens van der Maaten. 2014. Accelerating t-SNE Using Tree-based Algorithms. Journal of Machine Learning Research.",
+ "links": null
+ },
+ "BIBREF35": {
+ "ref_id": "b35",
+ "title": "DisSent: Learning Sentence Representations from Explicit Discourse Relations",
+ "authors": [
+ {
+ "first": "Allen",
+ "middle": [],
+ "last": "Nie",
+ "suffix": ""
+ },
+ {
+ "first": "Erin",
+ "middle": [],
+ "last": "Bennett",
+ "suffix": ""
+ },
+ {
+ "first": "Noah",
+ "middle": [],
+ "last": "Goodman",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.18653/v1/P19-1442"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Allen Nie, Erin Bennett, and Noah Goodman. 2019. DisSent: Learning Sentence Representations from Explicit Discourse Relations. In ACL.",
+ "links": null
+ },
+ "BIBREF36": {
+ "ref_id": "b36",
+ "title": "Scikit-learn: Machine learning in Python",
+ "authors": [
+ {
+ "first": "F",
+ "middle": [],
+ "last": "Pedregosa",
+ "suffix": ""
+ },
+ {
+ "first": "G",
+ "middle": [],
+ "last": "Varoquaux",
+ "suffix": ""
+ },
+ {
+ "first": "A",
+ "middle": [],
+ "last": "Gramfort",
+ "suffix": ""
+ },
+ {
+ "first": "V",
+ "middle": [],
+ "last": "Michel",
+ "suffix": ""
+ },
+ {
+ "first": "B",
+ "middle": [],
+ "last": "Thirion",
+ "suffix": ""
+ },
+ {
+ "first": "O",
+ "middle": [],
+ "last": "Grisel",
+ "suffix": ""
+ },
+ {
+ "first": "M",
+ "middle": [],
+ "last": "Blondel",
+ "suffix": ""
+ },
+ {
+ "first": "P",
+ "middle": [],
+ "last": "Prettenhofer",
+ "suffix": ""
+ },
+ {
+ "first": "R",
+ "middle": [],
+ "last": "Weiss",
+ "suffix": ""
+ },
+ {
+ "first": "V",
+ "middle": [],
+ "last": "Dubourg",
+ "suffix": ""
+ },
+ {
+ "first": "J",
+ "middle": [],
+ "last": "Vanderplas",
+ "suffix": ""
+ },
+ {
+ "first": "A",
+ "middle": [],
+ "last": "Passos",
+ "suffix": ""
+ },
+ {
+ "first": "D",
+ "middle": [],
+ "last": "Cournapeau",
+ "suffix": ""
+ },
+ {
+ "first": "M",
+ "middle": [],
+ "last": "Brucher",
+ "suffix": ""
+ },
+ {
+ "first": "M",
+ "middle": [],
+ "last": "Perrot",
+ "suffix": ""
+ },
+ {
+ "first": "E",
+ "middle": [],
+ "last": "Duchesnay",
+ "suffix": ""
+ }
+ ],
+ "year": 2011,
+ "venue": "Journal of Machine Learning Research",
+ "volume": "12",
+ "issue": "",
+ "pages": "2825--2830",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "F. Pedregosa, G. Varoquaux, A. Gramfort, V. Michel, B. Thirion, O. Grisel, M. Blondel, P. Prettenhofer, R. Weiss, V. Dubourg, J. Vanderplas, A. Passos, D. Cournapeau, M. Brucher, M. Perrot, and E. Duch- esnay. 2011. Scikit-learn: Machine learning in Python. Journal of Machine Learning Research, 12:2825-2830.",
+ "links": null
+ },
+ "BIBREF37": {
+ "ref_id": "b37",
+ "title": "Deep Contextualized Word Representations",
+ "authors": [
+ {
+ "first": "Matthew",
+ "middle": [
+ "E"
+ ],
+ "last": "Peters",
+ "suffix": ""
+ },
+ {
+ "first": "Mark",
+ "middle": [],
+ "last": "Neumann",
+ "suffix": ""
+ },
+ {
+ "first": "Mohit",
+ "middle": [],
+ "last": "Iyyer",
+ "suffix": ""
+ },
+ {
+ "first": "Matt",
+ "middle": [],
+ "last": "Gardner",
+ "suffix": ""
+ },
+ {
+ "first": "Christopher",
+ "middle": [],
+ "last": "Clark",
+ "suffix": ""
+ },
+ {
+ "first": "Kenton",
+ "middle": [],
+ "last": "Lee",
+ "suffix": ""
+ },
+ {
+ "first": "Luke",
+ "middle": [],
+ "last": "Zettlemoyer",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Matthew E. Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, and Luke Zettlemoyer. 2018. Deep Contextualized Word Rep- resentations.",
+ "links": null
+ },
+ "BIBREF38": {
+ "ref_id": "b38",
+ "title": "Improving language understanding by generative pre-training",
+ "authors": [
+ {
+ "first": "Alec",
+ "middle": [],
+ "last": "Radford",
+ "suffix": ""
+ },
+ {
+ "first": "Karthik",
+ "middle": [],
+ "last": "Narasimhan",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. 2018. Improving language under- standing by generative pre-training. arXiv.",
+ "links": null
+ },
+ "BIBREF39": {
+ "ref_id": "b39",
+ "title": "Software Framework for Topic Modelling with Large Corpora",
+ "authors": [
+ {
+ "first": "Petr",
+ "middle": [],
+ "last": "Radim\u0159eh\u016f\u0159ek",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Sojka",
+ "suffix": ""
+ }
+ ],
+ "year": 2010,
+ "venue": "LREC",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Radim\u0158eh\u016f\u0159ek and Petr Sojka. 2010. Software Frame- work for Topic Modelling with Large Corpora. In LREC.",
+ "links": null
+ },
+ "BIBREF40": {
+ "ref_id": "b40",
+ "title": "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+ "authors": [
+ {
+ "first": "Nils",
+ "middle": [],
+ "last": "Reimers",
+ "suffix": ""
+ },
+ {
+ "first": "Iryna",
+ "middle": [],
+ "last": "Gurevych",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Nils Reimers and Iryna Gurevych. 2019. Sentence- BERT: Sentence Embeddings using Siamese BERT- Networks. In EMNLP.",
+ "links": null
+ },
+ "BIBREF41": {
+ "ref_id": "b41",
+ "title": "Vmeasure: A Conditional Entropy-based External Cluster Evaluation Measure",
+ "authors": [
+ {
+ "first": "Andrew",
+ "middle": [],
+ "last": "Rosenberg",
+ "suffix": ""
+ },
+ {
+ "first": "Julia",
+ "middle": [],
+ "last": "Hirschberg",
+ "suffix": ""
+ }
+ ],
+ "year": 2007,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Andrew Rosenberg and Julia Hirschberg. 2007. V- measure: A Conditional Entropy-based External Cluster Evaluation Measure. In EMNLP.",
+ "links": null
+ },
+ "BIBREF42": {
+ "ref_id": "b42",
+ "title": "Collaborative filtering recommender systems",
+ "authors": [
+ {
+ "first": "Ben",
+ "middle": [],
+ "last": "Schafer",
+ "suffix": ""
+ },
+ {
+ "first": "Dan",
+ "middle": [],
+ "last": "Frankowski",
+ "suffix": ""
+ },
+ {
+ "first": "Jon",
+ "middle": [],
+ "last": "Herlocker",
+ "suffix": ""
+ },
+ {
+ "first": "Shilad",
+ "middle": [],
+ "last": "Sen",
+ "suffix": ""
+ }
+ ],
+ "year": 2007,
+ "venue": "The adaptive web",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "J Ben Schafer, Dan Frankowski, Jon Herlocker, and Shilad Sen. 2007. Collaborative filtering recom- mender systems. In The adaptive web. Springer.",
+ "links": null
+ },
+ "BIBREF43": {
+ "ref_id": "b43",
+ "title": "Distribution of information in biomedical abstracts and full-text publications",
+ "authors": [
+ {
+ "first": "J",
+ "middle": [],
+ "last": "Martijn",
+ "suffix": ""
+ },
+ {
+ "first": "Marc",
+ "middle": [],
+ "last": "Schuemie",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Weeber",
+ "suffix": ""
+ },
+ {
+ "first": "J",
+ "middle": [
+ "A"
+ ],
+ "last": "Bob",
+ "suffix": ""
+ },
+ {
+ "first": "Erik",
+ "middle": [
+ "M"
+ ],
+ "last": "Schijvenaars",
+ "suffix": ""
+ },
+ {
+ "first": "C",
+ "middle": [],
+ "last": "Van Mulligen",
+ "suffix": ""
+ },
+ {
+ "first": "Rob",
+ "middle": [],
+ "last": "Christiaan Van Der Eijk",
+ "suffix": ""
+ },
+ {
+ "first": "Barend",
+ "middle": [],
+ "last": "Jelier",
+ "suffix": ""
+ },
+ {
+ "first": "Jan",
+ "middle": [
+ "A"
+ ],
+ "last": "Mons",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Kors",
+ "suffix": ""
+ }
+ ],
+ "year": 2004,
+ "venue": "",
+ "volume": "20",
+ "issue": "",
+ "pages": "2597--604",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Martijn J. Schuemie, Marc Weeber, Bob J. A. Schijve- naars, Erik M. van Mulligen, C. Christiaan van der Eijk, Rob Jelier, Barend Mons, and Jan A. Kors. 2004. Distribution of information in biomedical ab- stracts and full-text publications. Bioinformatics, 20(16):2597-604.",
+ "links": null
+ },
+ "BIBREF44": {
+ "ref_id": "b44",
+ "title": "Improved semantic-aware network embedding with fine-grained word alignment",
+ "authors": [
+ {
+ "first": "Dinghan",
+ "middle": [],
+ "last": "Shen",
+ "suffix": ""
+ },
+ {
+ "first": "Xinyuan",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Ricardo",
+ "middle": [],
+ "last": "Henao",
+ "suffix": ""
+ },
+ {
+ "first": "Lawrence",
+ "middle": [],
+ "last": "Carin",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Dinghan Shen, Xinyuan Zhang, Ricardo Henao, and Lawrence Carin. 2018. Improved semantic-aware network embedding with fine-grained word align- ment. In EMNLP.",
+ "links": null
+ },
+ "BIBREF45": {
+ "ref_id": "b45",
+ "title": "An Overview of Microsoft Academic Service (MAS) and Applications",
+ "authors": [
+ {
+ "first": "Arnab",
+ "middle": [],
+ "last": "Sinha",
+ "suffix": ""
+ },
+ {
+ "first": "Zhihong",
+ "middle": [],
+ "last": "Shen",
+ "suffix": ""
+ },
+ {
+ "first": "Yang",
+ "middle": [],
+ "last": "Song",
+ "suffix": ""
+ },
+ {
+ "first": "Hao",
+ "middle": [],
+ "last": "Ma",
+ "suffix": ""
+ },
+ {
+ "first": "Darrin",
+ "middle": [],
+ "last": "Eide",
+ "suffix": ""
+ },
+ {
+ "first": "Bo-June Paul",
+ "middle": [],
+ "last": "Hsu",
+ "suffix": ""
+ },
+ {
+ "first": "Kuansan",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ }
+ ],
+ "year": 2015,
+ "venue": "WWW",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Arnab Sinha, Zhihong Shen, Yang Song, Hao Ma, Dar- rin Eide, Bo-June Paul Hsu, and Kuansan Wang. 2015. An Overview of Microsoft Academic Service (MAS) and Applications. In WWW.",
+ "links": null
+ },
+ "BIBREF46": {
+ "ref_id": "b46",
+ "title": "Cane: Context-aware network embedding for relation modeling",
+ "authors": [
+ {
+ "first": "Cunchao",
+ "middle": [],
+ "last": "Tu",
+ "suffix": ""
+ },
+ {
+ "first": "Han",
+ "middle": [],
+ "last": "Liu",
+ "suffix": ""
+ },
+ {
+ "first": "Zhiyuan",
+ "middle": [],
+ "last": "Liu",
+ "suffix": ""
+ },
+ {
+ "first": "Maosong",
+ "middle": [],
+ "last": "Sun",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Cunchao Tu, Han Liu, Zhiyuan Liu, and Maosong Sun. 2017. Cane: Context-aware network embedding for relation modeling. In ACL.",
+ "links": null
+ },
+ "BIBREF47": {
+ "ref_id": "b47",
+ "title": "Attention Is All You Need",
+ "authors": [
+ {
+ "first": "Ashish",
+ "middle": [],
+ "last": "Vaswani",
+ "suffix": ""
+ },
+ {
+ "first": "Noam",
+ "middle": [],
+ "last": "Shazeer",
+ "suffix": ""
+ },
+ {
+ "first": "Niki",
+ "middle": [],
+ "last": "Parmar",
+ "suffix": ""
+ },
+ {
+ "first": "Jakob",
+ "middle": [],
+ "last": "Uszkoreit",
+ "suffix": ""
+ },
+ {
+ "first": "Llion",
+ "middle": [],
+ "last": "Jones",
+ "suffix": ""
+ },
+ {
+ "first": "Aidan",
+ "middle": [
+ "N"
+ ],
+ "last": "Gomez",
+ "suffix": ""
+ },
+ {
+ "first": "Lukasz",
+ "middle": [],
+ "last": "Kaiser",
+ "suffix": ""
+ },
+ {
+ "first": "Illia",
+ "middle": [],
+ "last": "Polosukhin",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "NIPS",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. In NIPS.",
+ "links": null
+ },
+ "BIBREF48": {
+ "ref_id": "b48",
+ "title": "Improving textual network learning with variational homophilic embeddings",
+ "authors": [
+ {
+ "first": "Wenlin",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Chenyang",
+ "middle": [],
+ "last": "Tao",
+ "suffix": ""
+ },
+ {
+ "first": "Zhe",
+ "middle": [],
+ "last": "Gan",
+ "suffix": ""
+ },
+ {
+ "first": "Guoyin",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Liqun",
+ "middle": [],
+ "last": "Chen",
+ "suffix": ""
+ },
+ {
+ "first": "Xinyuan",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Ruiyi",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Qian",
+ "middle": [],
+ "last": "Yang",
+ "suffix": ""
+ },
+ {
+ "first": "Ricardo",
+ "middle": [],
+ "last": "Henao",
+ "suffix": ""
+ },
+ {
+ "first": "Lawrence",
+ "middle": [],
+ "last": "Carin",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "Advances in Neural Information Processing Systems",
+ "volume": "",
+ "issue": "",
+ "pages": "2074--2085",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Wenlin Wang, Chenyang Tao, Zhe Gan, Guoyin Wang, Liqun Chen, Xinyuan Zhang, Ruiyi Zhang, Qian Yang, Ricardo Henao, and Lawrence Carin. 2019. Improving textual network learning with variational homophilic embeddings. In Advances in Neural In- formation Processing Systems, pages 2074-2085.",
+ "links": null
+ },
+ "BIBREF49": {
+ "ref_id": "b49",
+ "title": "A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference",
+ "authors": [
+ {
+ "first": "Adina",
+ "middle": [],
+ "last": "Williams",
+ "suffix": ""
+ },
+ {
+ "first": "Nikita",
+ "middle": [],
+ "last": "Nangia",
+ "suffix": ""
+ },
+ {
+ "first": "Samuel",
+ "middle": [],
+ "last": "Bowman",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {
+ "DOI": [
+ "10.18653/v1/N18-1101"
+ ]
+ },
+ "num": null,
+ "urls": [],
+ "raw_text": "Adina Williams, Nikita Nangia, and Samuel Bowman. 2018. A Broad-Coverage Challenge Corpus for Sen- tence Understanding through Inference. In NAACL- HLT.",
+ "links": null
+ },
+ "BIBREF50": {
+ "ref_id": "b50",
+ "title": "Simplifying graph convolutional networks",
+ "authors": [
+ {
+ "first": "Felix",
+ "middle": [],
+ "last": "Wu",
+ "suffix": ""
+ },
+ {
+ "first": "H",
+ "middle": [],
+ "last": "Amauri",
+ "suffix": ""
+ },
+ {
+ "first": "Tianyi",
+ "middle": [],
+ "last": "Souza",
+ "suffix": ""
+ },
+ {
+ "first": "Christopher",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Tao",
+ "middle": [],
+ "last": "Fifty",
+ "suffix": ""
+ },
+ {
+ "first": "Kilian",
+ "middle": [
+ "Q"
+ ],
+ "last": "Yu",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Weinberger",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "ICML",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Felix Wu, Amauri H. Souza, Tianyi Zhang, Christo- pher Fifty, Tao Yu, and Kilian Q. Weinberger. 2019a. Simplifying graph convolutional networks. In ICML.",
+ "links": null
+ },
+ "BIBREF51": {
+ "ref_id": "b51",
+ "title": "Word Mover's Embedding: From Word2Vec to Document Embedding",
+ "authors": [
+ {
+ "first": "Lingfei",
+ "middle": [],
+ "last": "Wu",
+ "suffix": ""
+ },
+ {
+ "first": "Ian",
+ "middle": [],
+ "last": "En-Hsu Yen",
+ "suffix": ""
+ },
+ {
+ "first": "Kun",
+ "middle": [],
+ "last": "Xu",
+ "suffix": ""
+ },
+ {
+ "first": "Fangli",
+ "middle": [],
+ "last": "Xu",
+ "suffix": ""
+ },
+ {
+ "first": "Avinash",
+ "middle": [],
+ "last": "Balakrishnan",
+ "suffix": ""
+ },
+ {
+ "first": "Pin-Yu",
+ "middle": [],
+ "last": "Chen",
+ "suffix": ""
+ },
+ {
+ "first": "Pradeep",
+ "middle": [],
+ "last": "Ravikumar",
+ "suffix": ""
+ },
+ {
+ "first": "Michael",
+ "middle": [
+ "J"
+ ],
+ "last": "Witbrock",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Lingfei Wu, Ian En-Hsu Yen, Kun Xu, Fangli Xu, Avinash Balakrishnan, Pin-Yu Chen, Pradeep Ravikumar, and Michael J Witbrock. 2018. Word Mover's Embedding: From Word2Vec to Document Embedding. In EMNLP.",
+ "links": null
+ },
+ "BIBREF52": {
+ "ref_id": "b52",
+ "title": "Google's neural machine translation system: Bridging the gap between human and machine translation",
+ "authors": [
+ {
+ "first": "Yonghui",
+ "middle": [],
+ "last": "Wu",
+ "suffix": ""
+ },
+ {
+ "first": "Mike",
+ "middle": [],
+ "last": "Schuster",
+ "suffix": ""
+ },
+ {
+ "first": "Zhifeng",
+ "middle": [],
+ "last": "Chen",
+ "suffix": ""
+ },
+ {
+ "first": "V",
+ "middle": [],
+ "last": "Quoc",
+ "suffix": ""
+ },
+ {
+ "first": "Mohammad",
+ "middle": [],
+ "last": "Le",
+ "suffix": ""
+ },
+ {
+ "first": "Wolfgang",
+ "middle": [],
+ "last": "Norouzi",
+ "suffix": ""
+ },
+ {
+ "first": "Maxim",
+ "middle": [],
+ "last": "Macherey",
+ "suffix": ""
+ },
+ {
+ "first": "Yuan",
+ "middle": [],
+ "last": "Krikun",
+ "suffix": ""
+ },
+ {
+ "first": "Qin",
+ "middle": [],
+ "last": "Cao",
+ "suffix": ""
+ },
+ {
+ "first": "Klaus",
+ "middle": [],
+ "last": "Gao",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Macherey",
+ "suffix": ""
+ }
+ ],
+ "year": 2016,
+ "venue": "ArXiv",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. 2016. Google's neural machine translation system: Bridging the gap between human and machine translation. ArXiv, abs/1609.08144.",
+ "links": null
+ },
+ "BIBREF54": {
+ "ref_id": "b54",
+ "title": "Xlnet: Generalized autoregressive pretraining for language understanding",
+ "authors": [
+ {
+ "first": "Zhilin",
+ "middle": [],
+ "last": "Yang",
+ "suffix": ""
+ },
+ {
+ "first": "Zihang",
+ "middle": [],
+ "last": "Dai",
+ "suffix": ""
+ },
+ {
+ "first": "Yiming",
+ "middle": [],
+ "last": "Yang",
+ "suffix": ""
+ },
+ {
+ "first": "Jaime",
+ "middle": [
+ "G"
+ ],
+ "last": "Carbonell",
+ "suffix": ""
+ },
+ {
+ "first": "Ruslan",
+ "middle": [],
+ "last": "Salakhutdinov",
+ "suffix": ""
+ },
+ {
+ "first": "V",
+ "middle": [],
+ "last": "Quoc",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Le",
+ "suffix": ""
+ }
+ ],
+ "year": 2019,
+ "venue": "ArXiv",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Zhilin Yang, Zihang Dai, Yiming Yang, Jaime G. Car- bonell, Ruslan Salakhutdinov, and Quoc V. Le. 2019. Xlnet: Generalized autoregressive pretraining for language understanding. ArXiv, abs/1906.08237.",
+ "links": null
+ },
+ "BIBREF55": {
+ "ref_id": "b55",
+ "title": "From neural re-ranking to neural ranking: Learning a sparse representation for inverted indexing",
+ "authors": [
+ {
+ "first": "Hamed",
+ "middle": [],
+ "last": "Zamani",
+ "suffix": ""
+ },
+ {
+ "first": "Mostafa",
+ "middle": [],
+ "last": "Dehghani",
+ "suffix": ""
+ },
+ {
+ "first": "W",
+ "middle": [
+ "Bruce"
+ ],
+ "last": "Croft",
+ "suffix": ""
+ },
+ {
+ "first": "Erik",
+ "middle": [
+ "G"
+ ],
+ "last": "",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "CIKM",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Hamed Zamani, Mostafa Dehghani, W. Bruce Croft, Erik G. Learned-Miller, and Jaap Kamps. 2018. From neural re-ranking to neural ranking: Learn- ing a sparse representation for inverted indexing. In CIKM.",
+ "links": null
+ },
+ "BIBREF56": {
+ "ref_id": "b56",
+ "title": "Diffusion maps for textual network embedding",
+ "authors": [
+ {
+ "first": "Xinyuan",
+ "middle": [],
+ "last": "Zhang",
+ "suffix": ""
+ },
+ {
+ "first": "Yitong",
+ "middle": [],
+ "last": "Li",
+ "suffix": ""
+ },
+ {
+ "first": "Dinghan",
+ "middle": [],
+ "last": "Shen",
+ "suffix": ""
+ },
+ {
+ "first": "Lawrence",
+ "middle": [],
+ "last": "Carin",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Xinyuan Zhang, Yitong Li, Dinghan Shen, and Lawrence Carin. 2018. Diffusion maps for textual network embedding. In NeurIPS.",
+ "links": null
+ }
+ },
+ "ref_entries": {
+ "FIGREF0": {
+ "uris": null,
+ "type_str": "figure",
+ "num": null,
+ "text": "t-SNE visualization of paper embeddings and their corresponding MAG topics."
+ },
+ "TABREF1": {
+ "content": "",
+ "type_str": "table",
+ "num": null,
+ "text": "Results on the SCIDOCS evaluation suite consisting of 7 tasks."
+ },
+ "TABREF3": {
+ "content": ": Ablations: Numbers are averages of metrics for each evaluation task: CLS: classification, USR: User activity, CITE: Citation prediction, REC: Recom- mendation, Avg. average over all tasks & metrics.
",
+ "type_str": "table",
+ "num": null,
+ "text": ""
+ },
+ "TABREF4": {
+ "content": "Training signal CLS USR CITE REC All SPECTER 84.2 88.4 91.5 36.9 80.0
",
+ "type_str": "table",
+ "num": null,
+ "text": "SciBERT fine-tune on co-view 83.0 84.2 84.1 36.4 76.0 SciBERT fine-tune on co-read 82.3 85.4 86.7 36.3 77.1 SciBERT fine-tune on co-citation 82.9 84.3 85.2 36.6 76.4 SciBERT fine-tune on multitask 83.3 86.1 88.2 36.0 78.0"
+ },
+ "TABREF5": {
+ "content": "",
+ "type_str": "table",
+ "num": null,
+ "text": "Comparison with task-specific fine-tuning."
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/s2orc-doc2json/tests/pdf/2020.acl-main.207.tei.xml b/s2orc-doc2json/tests/pdf/2020.acl-main.207.tei.xml
new file mode 100644
index 0000000000000000000000000000000000000000..2f63cacea5370e5abde1b22a441f5b1c3c93a622
--- /dev/null
+++ b/s2orc-doc2json/tests/pdf/2020.acl-main.207.tei.xml
@@ -0,0 +1,1739 @@
+
+
+
+
+
+ SPECTER: Document-level Representation Learning using Citation-informed Transformers
+
+
+
+
+
+
+
+
+
+ Arman Cohan
+ armanc@allenai.org
+
+ Allen Institute for Artificial Intelligence ‡ Paul G. Allen School of Computer Science & Engineering
+ University of Washington
+
+
+
+ Sergey Feldman
+ sergey@allenai.org
+
+ Allen Institute for Artificial Intelligence ‡ Paul G. Allen School of Computer Science & Engineering
+ University of Washington
+
+
+
+ Iz Beltagy
+ beltagy@allenai.org
+
+ Allen Institute for Artificial Intelligence ‡ Paul G. Allen School of Computer Science & Engineering
+ University of Washington
+
+
+
+ Doug Downey
+ dougd@allenai.org
+
+ Allen Institute for Artificial Intelligence ‡ Paul G. Allen School of Computer Science & Engineering
+ University of Washington
+
+
+
+ Daniel S Weld
+
+ Allen Institute for Artificial Intelligence ‡ Paul G. Allen School of Computer Science & Engineering
+ University of Washington
+
+
+
+
+ Introduction
+
+
+ SPECTER: Document-level Representation Learning using Citation-informed Transformers
+
+
+
+
+
+
+
+
+
+
+
+
+ GROBID - A machine learning software for extracting information from scholarly documents
+
+
+
+
+
+
+ Representation learning is a critical ingredient for natural language processing systems. Recent Transformer language models like BERT learn powerful textual representations, but these models are targeted towards token-and sentence-level training objectives and do not leverage information on inter-document relatedness, which limits their document-level representation power. For applications on scientific documents, such as classification and recommendation, the embeddings power strong performance on end tasks. We propose SPECTER, a new method to generate document-level embedding of scientific documents based on pretraining a Transformer language model on a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, SPECTER can be easily applied to downstream applications without task-specific fine-tuning. Additionally, to encourage further research on document-level models, we introduce SCIDOCS, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. We show that SPECTER outperforms a variety of competitive baselines on the benchmark. 1
+
+
+
+
+
+Introduction
As the pace of scientific publication continues to increase, Natural Language Processing (NLP) tools that help users to search, discover and understand the scientific literature have become critical. In recent years, substantial improvements in NLP tools have been brought about by pretrained neural language models (LMs) [(Radford et al., 2018;](#b38)[Devlin et al., 2019;](#b11). While such models are widely used for representing individual words or sentences, extensions to whole-document embeddings are relatively underexplored. Likewise, methods that do use inter-document signals to produce whole-document embeddings [(Tu et al., 2017;](#b46)) have yet to incorporate stateof-the-art pretrained LMs. Here, we study how to leverage the power of pretrained language models to learn embeddings for scientific documents.
A paper's title and abstract provide rich semantic content about the paper, but, as we show in this work, simply passing these textual fields to an "off-the-shelf" pretrained language model-even a state-of-the-art model tailored to scientific text like the recent SciBERT [(Beltagy et al., 2019)](#b3)-does not result in accurate paper representations. The language modeling objectives used to pretrain the model do not lead it to output representations that are helpful for document-level tasks such as topic classification or recommendation.
In this paper, we introduce a new method for learning general-purpose vector representations of scientific documents. Our system, SPECTER, 2 incorporates inter-document context into the Transformer [(Vaswani et al., 2017)](#b47) language models (e.g., SciBERT [(Beltagy et al., 2019)](#b3)) to learn document representations that are effective across a wide-variety of downstream tasks, without the need for any task-specific fine-tuning of the pretrained language model. We specifically use citations as a naturally occurring, inter-document incidental supervision signal indicating which documents are most related and formulate the signal into a triplet-loss pretraining objective. Unlike many prior works, at inference time, our model does not require any citation information. This is critical for embedding new papers that have not yet been cited. In experiments, we show that SPECTER's representations substantially outperform the state-SPECTER: Scientific Paper Embeddings using Citationinformed TransformERs of-the-art on a variety of document-level tasks, including topic classification, citation prediction, and recommendation.
As an additional contribution of this work, we introduce and release SCIDOCS 3 , a novel collection of data sets and an evaluation suite for document-level embeddings in the scientific domain. SCIDOCS covers seven tasks, and includes tens of thousands of examples of anonymized user signals of document relatedness. We also release our training set (hundreds of thousands of paper titles, abstracts and citations), along with our trained embedding model and its associated code base.
+Model 2.1 Overview
Our goal is to learn task-independent representations of academic papers. Inspired by the recent success of pretrained Transformer language models across various NLP tasks, we use the Transformer model architecture as basis of encoding the input paper. Existing LMs such as BERT, however, are primarily based on masked language modeling objective, only considering intra-document context and do not use any inter-document information. This limits their ability to learn optimal document representations. To learn high-quality documentlevel representations we propose using citations as an inter-document relatedness signal and formulate it as a triplet loss learning objective. We then pretrain the model on a large corpus of citations using this objective, encouraging it to output representations that are more similar for papers that share a citation link than for those that do not. We call our model SPECTER, which learns Scientific Paper Embeddings using Citation-informed Trans-formERs. With respect to the terminology used by [Devlin et al. (2019)](#b11), unlike most existing LMs that are "fine-tuning based", our approach results in embeddings that can be applied to downstream tasks in a "feature-based" fashion, meaning the learned paper embeddings can be easily used as features, with no need for further task-specific fine-tuning. In the following, as background information, we briefly describe how pretrained LMs can be applied for document representation and then discuss the details of SPECTER.
https://github.com/allenai/scidocs Transformer (initialized with SciBERT) Related paper (P + ) Query paper (P Q ) Unrelated paper (P − )
Triplet loss $= \max\{ d(P^Q, P^+) - d(P^Q, P^-) + m,\ 0 \}$ Figure 1: Overview of SPECTER.
+Background: Pretrained Transformers
Recently, pretrained Transformer networks have demonstrated success on various NLP tasks [(Radford et al., 2018;](#b38)[Devlin et al., 2019;](#b11)[Liu et al., 2019)](#b33); we use these models as the foundation for SPECTER. Specifically, we use SciBERT [(Beltagy et al., 2019)](#b3) which is an adaptation of the original BERT [(Devlin et al., 2019)](#b11) architecture to the scientific domain. The BERT model architecture [(Devlin et al., 2019)](#b11) uses multiple layers of Transformers [(Vaswani et al., 2017)](#b47) to encode the tokens in a given input sequence. Each layer consists of a self-attention sublayer followed by a feedforward sublayer. The final hidden state associated with the special [CLS] token is usually called the "pooled output", and is commonly used as an aggregate representation of the sequence.
+Document Representation
Our goal is to represent a given paper P as a dense vector v that best represents the paper and can be used in downstream tasks. SPECTER builds embeddings from the title and abstract of a paper. Intuitively, we would expect these fields to be sufficient to produce accurate embeddings, since they are written to provide a succinct and comprehensive summary of the paper. [4] As such, we encode the concatenated title and abstract using a Transformer LM (e.g., SciBERT) and take the final representation of the [CLS] token as the output representation of the paper:
5 $v = \mathrm{Transformer}(\mathrm{input})_{\mathrm{[CLS]}}$, (1) where Transformer is the Transformer's forward function, and input is the concatenation of the [CLS] token and WordPieces [(Wu et al., 2016)](#b52) of the title and abstract of a paper, separated by the [SEP] token. We use SciBERT as our model initialization as it is optimized for scientific text, though our formulation is general and any Transformer language model could be used instead of SciBERT. Using the above method with an "off-the-shelf" SciBERT does not take global inter-document information into account. This is because SciBERT, like other pretrained language models, is trained via language modeling objectives, which only predict words or sentences given their in-document, nearby textual context. In contrast, we propose to incorporate citations into the model as a signal of inter-document relatedness, while still leveraging the model's existing strength in modeling language.
+Citation-Based Pretraining Objective
A citation from one document to another suggests that the documents are related. To encode this relatedness signal into our representations, we design a loss function that trains the Transformer model to learn closer representations for papers when one cites the other, and more distant representations otherwise. The high-level overview of the model is shown in [Figure 1]. In particular, each training instance is a triplet of papers: a query paper $P^Q$, a positive paper $P^+$ and a negative paper $P^-$. The positive paper is a paper that the query paper cites, and the negative paper is a paper that is not cited by the query paper (but that may be cited by $P^+$). We then train the model using the following triplet margin loss function:
$\mathcal{L} = \max\{ d(P^Q, P^+) - d(P^Q, P^-) + m,\ 0 \}$ (2) where d is a distance function and m is the loss margin hyperparameter (we empirically choose m = 1). Here, we use the L2 norm distance:
$d(P^A, P^B) = \lVert v_A - v_B \rVert_2$, where $v_A$ is the vector corresponding to the pooled output of the Transformer run on paper A (Equation 1). [6] Starting from the trained SciBERT model, we pretrain the Transformer parameters on the citation objective to learn paper representations that capture document relatedness.
+Selecting Negative Distractors
The choice of negative example papers P − is important when training the model. We consider two sets of negative examples: the first set simply consists of randomly selected papers from the corpus.
We also experimented with other distance functions (e.g., normalized cosine), but they underperformed the L2 loss.
Given a query paper, intuitively we would expect the model to be able to distinguish between cited papers, and uncited papers sampled randomly from the entire corpus. This inductive bias has been also found to be effective in content-based citation recommendation applications . But, random negatives may be easy for the model to distinguish from the positives. To provide a more nuanced training signal, we augment the randomly drawn negatives with a more challenging second set of negative examples. We denote as "hard negatives" the papers that are not cited by the query paper, but are cited by a paper cited by the query paper, i.e. if $P^1 \xrightarrow{\text{cite}} P^2$ and $P^2 \xrightarrow{\text{cite}} P^3$
but $P^1 \not\xrightarrow{\text{cite}} P^3$, then $P^3$ is a candidate hard negative example for $P^1$. We expect the hard negatives to be somewhat related to the query paper, but typically less related than the cited papers. As we show in our experiments ( §6), including hard negatives results in more accurate embeddings compared to using random negatives alone.
+Inference
At inference time, the model receives one paper, P, and it outputs SPECTER's Transformer pooled output activation as the paper representation for P (Equation 1). We note that for inference, SPECTER requires only the title and abstract of the given input paper; the model does not need any citation information about the input paper. This means that SPECTER can produce embeddings even for new papers that have yet to be cited, which is critical for applications that target recent scientific papers.
+SCIDOCS Evaluation Framework
Previous evaluations of scientific document representations in the literature tend to focus on small datasets over a limited set of tasks, and extremely high (99%+) AUC scores are already possible on these data for English documents . New, larger and more diverse benchmark datasets are necessary. Here, we introduce a new comprehensive evaluation framework to measure the effectiveness of scientific paper embeddings, which we call SCIDOCS. The framework consists of diverse tasks, ranging from citation prediction, to prediction of user activity, to document classification and paper recommendation. Note that SPECTER will not be further fine-tuned on any of the tasks; we simply plug in the embeddings as features for each task. Below, we describe each of the tasks in detail and the evaluation data associated with it. In addition to our training data, we release all the datasets associated with the evaluation tasks.
+Document Classification
An important test of a document-level embedding is whether it is predictive of the class of the document. Here, we consider two classification tasks in the scientific domain: MeSH Classification In this task, the goal is to classify scientific papers according to their Medical Subject Headings (MeSH) [(Lipscomb, 2000)](#b30). [7] We construct a dataset consisting of 23K academic medical papers, where each paper is assigned one of 11 top-level disease classes such as cardiovascular diseases, diabetes, digestive diseases derived from the MeSH vocabulary. The most populated category is Neoplasms (cancer) with 5.4K instances (23.3% of the total dataset) while the category with least number of samples is Hepatitis (1.7% of the total dataset). We follow the approach of [Feldman et al. (2019)](#b13) in mapping the MeSH vocabulary to the disease classes.
Paper Topic Classification This task is predicting the topic associated with a paper using the predefined topic categories of the Microsoft Academic Graph (MAG) [(Sinha et al., 2015)](#b45) 8 . MAG provides a database of papers, each tagged with a list of topics. The topics are organized in a hierarchy of 5 levels, where level 1 is the most general and level 5 is the most specific. For our evaluation, we derive a document classification dataset from the level 1 topics, where a paper is labeled by its corresponding level 1 MAG topic. We construct a dataset of 25K papers, almost evenly split over the 19 different classes of level 1 categories in MAG.
+Citation Prediction
As argued above, citations are a key signal of relatedness between papers. We test how well different paper representations can reproduce this signal through citation prediction tasks. In particular, we focus on two sub-tasks: predicting direct citations, and predicting co-citations. We frame these as ranking tasks and evaluate performance using MAP and nDCG, standard ranking metrics. Direct Citations In this task, the model is asked to predict which papers are cited by a given query paper from a given set of candidate papers. The evaluation dataset includes approximately 30K total papers from a held-out pool of papers, consisting of 1K query papers and a candidate set of up to 5 cited papers and 25 (randomly selected) uncited papers. The task is to rank the cited papers higher than the uncited papers. For each embedding method, we require only comparing the L2 distance between the raw embeddings of the query and the candidates, without any additional trainable parameters.
Co-Citations This task is similar to the direct citations but instead of predicting a cited paper, the goal is to predict a highly co-cited paper with a given paper. Intuitively, if papers A and B are cited frequently together by several papers, this shows that the papers are likely highly related and a good paper representation model should be able to identify these papers from a given candidate set. The dataset consists of 30K total papers and is constructed similar to the direct citations task.
+User Activity
The embeddings for similar papers should be close to each other; we use user activity as a proxy for identifying similar papers and test the model's ability to recover this information. Multiple users consuming the same items as one another is a classic relatedness signal and forms the foundation for recommender systems and other applications [(Schafer et al., 2007)](#b42). In our case, we would expect that when users look for academic papers, the papers they view in a single browsing session tend to be related. Thus, accurate paper embeddings should, all else being equal, be relatively more similar for papers that are frequently viewed in the same session than for other papers. To build benchmark datasets to test embeddings on user activity, we obtained logs of user sessions from a major academic search engine. We define the following two tasks on which we build benchmark datasets to test embeddings:
Co-Views Our co-views dataset consists of approximately 30K papers. To construct it, we take 1K random papers that are not in our train or development set and associate with each one up to 5 frequently co-viewed papers and 25 randomly selected papers (similar to the approach for citations). Then, we require the embedding model to rank the co-viewed papers higher than the random papers by comparing the L2 distances of raw embeddings. We evaluate performance using standard ranking metrics, nDCG and MAP.
Co-Reads If the user clicks to access the PDF of a paper from the paper description page, this is a potentially stronger sign of interest in the paper. In such a case we assume the user will read at least parts of the paper and refer to this as a "read" action. Accordingly, we define a "co-reads" task and dataset analogous to the co-views dataset described above. This dataset is also approximately 30K papers.
+Recommendation
In the recommendation task, we evaluate the ability of paper embeddings to boost performance in a production recommendation system. Our recommendation task aims to help users navigate the scientific literature by ranking a set of "similar papers" for a given paper. We use a dataset of user clickthrough data for this task which consists of 22K clickthrough events from a public scholarly search engine. We partitioned the examples temporally into train (20K examples), validation (1K), and test (1K) sets. As is typical in clickthrough data on ranked lists, the clicks are biased toward the top of original ranking presented to the user. To counteract this effect, we computed propensity scores using a swap experiment (Agarwal et al., 2019). The propensity scores give, for each position in the ranked list, the relative frequency that the position is over-represented in the data due to exposure bias. We can then compute de-biased evaluation metrics by dividing the score for each test example by the propensity score for the clicked position. We report propensity-adjusted versions of the standard ranking metrics Precision@1 (P @1) and Normalized Discounted Cumulative Gain (nDCG).
We test different embeddings on the recommendation task by including cosine embedding distance 9 as a feature within an existing recommendation system that includes several other informative features (title/author similarity, reference and citation overlap, etc.). Thus, the recommendation experiments measure whether the embeddings can boost the performance of a strong baseline system on an end task. For SPECTER, we also perform an online A/B test to measure whether its advantages Embeddings are L2 normalized and in this case cosine distance is equivalent to L2 distance. on the offline dataset translate into improvements on the online recommendation task ( §5).
+Experiments
Training Data To train our model, we use a subset of the Semantic Scholar corpus consisting of about 146K query papers (around 26.7M tokens) with their corresponding outgoing citations, and we use an additional 32K papers for validation. For each query paper we construct up to 5 training triples comprised of a query, a positive, and a negative paper. The positive papers are sampled from the direct citations of the query, while negative papers are chosen either randomly or from citations of citations (as discussed in §2.4). We empirically found it helpful to use 2 hard negatives (citations of citations) and 3 easy negatives (randomly selected papers) for each query paper. This process results in about 684K training triples and 145K validation triples.
Training and Implementation We implement our model in AllenNLP . We initialize the model from SciBERT pretrained weights [(Beltagy et al., 2019)](#b3) since it is the stateof-the-art pretrained language model on scientific text. We continue training all model parameters on our training objective (Equation 2). We perform minimal tuning of our model's hyperparameters based on the performance on the validation set, while baselines are extensively tuned. Based on initial experiments, we use a margin m=1 for the triplet loss. For training, we use the Adam optimizer (Kingma and Ba, 2014) following the suggested hyperparameters in Devlin et al. (2019) (LR: 2e-5, Slanted Triangular LR scheduler 10 (Howard and Ruder, 2018) with number of train steps equal to training instances and cut fraction of 0.1). We train the model on a single Titan V GPU (12G memory) for 2 epochs, with batch size of 4 (the maximum that fit in our GPU memory) and use gradient accumulation for an effective batch size of 32. Each training epoch takes approximately 1-2 days to complete on the full dataset. We release our code and data to facilitate reproducibility. 11
Task-Specific Model Details For the classification tasks, we used a linear SVM where embedding vectors were the only features. The C hyperparameter was tuned via a held-out validation set.
For the recommendation tasks, we use a feedforward ranking neural network that takes as input ten features designed to capture the similarity between each query and candidate paper, including the cosine similarity between the query and candidate embeddings and manually-designed features computed from the papers' citations, titles, authors, and publication dates.
Baseline Methods Our work falls into the intersection of textual representation, citation mining, and graph learning, and we evaluate against stateof-the-art baselines from each of these areas. We compare with several strong textual models: SIF [(Arora et al., 2017)](#b2), a method for learning document representations by removing the first principal component of aggregated word-level embeddings which we pretrain on scientific text; SciBERT [(Beltagy et al., 2019)](#b3) a state-of-the-art pretrained Transformer LM for scientific text; and Sent-BERT [(Reimers and Gurevych, 2019)](#b40), a model that uses negative sampling to tune BERT for producing optimal sentence embeddings. We also compare with Citeomatic , a closely related paper representation model for citation prediction which trains content-based representations with citation graph information via dynamically sampled triplets, and SGC [(Wu et al., 2019a)](#b50), a state-of-the-art graph-convolutional approach. For completeness, additional baselines are also included; due to space constraints we refer to Appendix A for detailed discussion of all baselines. We tune hyperparameters of baselines to maximize performance on a separate validation set. [Table 1](#tab_1) presents the main results corresponding to our evaluation tasks (described in §3). Overall, we observe substantial improvements across all tasks with average performance of 80.0 across all metrics on all tasks which is a 3.1 point absolute improvement over the next-best baseline. We now discuss the results in detail.
+Results
For document classification, we report macro F1, a standard classification metric. We observe that the classifier performance when trained on our representations is better than when trained on any other baseline. Particularly, on the MeSH (MAG) dataset, we obtain an 86.4 (82.0) F1 score which is about a ∆= + 2.3 (+1.5) point absolute increase over the best baseline on each dataset respectively. Our evaluation of the learned representations on predicting user activity is shown in the "User activity" columns of [Table 1](#tab_1). SPECTER achieves a MAP score of 83.8 on the co-view task, and 84.5 on coread, improving over the best baseline (Citeomatic in this case) by 2.7 and 4.0 points, respectively. We observe similar trends for the "citation" and "co-citation" tasks, with our model outperforming virtually all other baselines except for SGC, which has access to the citation graph at training and test time. 12 Note that methods like SGC cannot be used in real-world setting to embed new papers that are not cited yet. On the other hand, on cocitation data our method is able to achieve the best results with nDCG of 94.8, improving over SGC with 2.3 points. Citeomatic also performs well on the citation tasks, as expected given that its primary design goal was citation prediction. Nevertheless, our method slightly outperforms Citeomatic on the direct citation task, while substantially outperforming it on co-citations (+2.0 nDCG). Finally, for recommendation task, we observe that SPECTER outperforms all other models on this task as well, with nDCG of 53.9. On the recommendations task, as opposed to previous experiments, the differences in method scores are generally smaller. This is because for this task the embeddings are used along with several other informative features in the ranking model (described under task-specific models in §4), meaning that embedding variants have less opportunity for impact on overall performance.
We also performed an online study to evaluate whether SPECTER embeddings offer similar advantages in a live application. We performed an online A/B test comparing our SPECTER-based recommender to an existing production recommender system for similar papers that ranks papers by a textual similarity measure. In a dataset of 4,113 clicks, we found that SPECTER ranker improved clickthrough rate over the baseline by 46.5%, demonstrating its superiority.
We emphasize that our citation-based pretraining objective is critical for the performance of SPECTER; removing this and using a vanilla SciBERT results in decreased performance on all tasks.
For SGC, we remove development and test set citations and co-citations during training. We also remove incoming citations from development and test set queries as these would not be available at test time in production.
+Analysis
In this section, we analyze several design decisions in SPECTER, provide a visualization of its embedding space, and experimentally compare SPECTER's use of fixed embeddings against a finetuning approach.
Ablation Study We start by analyzing how adding or removing metadata fields from the input to SPECTER alters performance. The results are shown in the top four rows of [Table 2](#tab_3) (for brevity, here we only report the average of the metrics from each task). We observe that removing the abstract from the textual input and relying only on the title results in a substantial decrease in performance. More surprisingly, adding authors as an input (along with title and abstract) hurts performance. [13] One possible explanation is that author names are sparse in the corpus, making it difficult for the model to infer document-level relatedness from them. As another possible reason of this behavior, tokenization using Wordpieces might be suboptimal for author names. Many author names are out-of-vocabulary for SciBERT and thus, they might be split into sub-words and shared across names that are not semantically related, leading to noisy correlation. Finally, we find that adding venues slightly decreases performance, 14 except on document classification (which makes sense, as we would expect venues to have high correlation [13] We experimented with both concatenating authors with the title and abstract and also considering them as an additional field. Neither were helpful.
14 Venue information in our data came directly from publisher provided metadata and thus was not normalized. with paper topics). The fact that SPECTER does not require inputs like authors or venues makes it applicable in situations where this metadata is not available, such as matching reviewers with anonymized submissions, or performing recommendations of anonymized preprints (e.g., on OpenReview). One design decision in SPECTER is to use a set of hard negative distractors in the citation-based finetuning objective. The fifth row of [Table 2](#tab_3) shows that this is important-using only easy negatives reduces performance on all tasks. While there could be other potential ways to include hard negatives in the model, our simple approach of including citations of citations is effective. The sixth row of the table shows that using a strong general-domain language model (BERT-Large) instead of SciBERT in SPECTER reduces performance considerably. This is reasonable because unlike BERT-Large, SciB-ERT is pretrained on scientific text.
Visualization [Figure 2] shows t-SNE (van der Maaten, 2014) projections of our embeddings (SPECTER) compared with the SciBERT baseline for a random set of papers. When comparing SPECTER embeddings with SciBERT, we observe that our embeddings are better at encoding topical information, as the clusters seem to be more compact. Further, we see some examples of crosstopic relatedness reflected in the embedding space (e.g., Engineering, Mathematics and Computer Science are close to each other, while Business and Economics are also close to each other). To quantify the comparison of visualized embeddings in [Figure 2], we use the DBScan clustering algorithm [(Ester et al., 1996)](#b12) on this 2D projection. We use the completeness and homogeneity clustering quality measures introduced by [Rosenberg and Hirschberg (2007)](#b41). For the points corresponding to [Figure 2], the homogeneity and completeness values for SPECTER are respectively 0.41 and 0.72 compared with SciBERT's 0.19 and 0.63, a clear improvement on separating topics using the projected embeddings.
Comparison with Task Specific Fine-Tuning While the fact that SPECTER does not require finetuning makes its paper embeddings less costly to use, often the best performance from pretrained Transformers is obtained when the models are finetuned directly on each end task. We experiment with fine-tuning SciBERT on our tasks, and find this to be generally inferior to using our fixed representations from SPECTER. Specifically, we finetune SciBERT directly on task-specific signals instead of citations. To fine-tune on task-specific data (e.g., user activity), we used a dataset of coviews with 65K query papers, co-reads with 14K query papers, and co-citations (instead of direct citations) with 83K query papers. As the end tasks are ranking tasks, for all datasets we construct up to 5 triplets and fine-tune the model using triplet ranking loss. The positive papers are sampled from the most co-viewed (co-read, or co-cited) papers corresponding to the query paper. We also include both easy and hard distractors as when training SPECTER (for hard negatives we choose the least non-zero co-viewed (co-read, or co-cited) papers). We also consider training jointly on all task-specific training data sources in a multitask training process, where the model samples training triplets from a distribution over the sources. As illustrated in Table 3, without any additional final task-specific fine-tuning, SPECTER still outperforms a SciBERT model fine-tuned on the end tasks as well as their multitask combination, further demonstrating the effectiveness and versatility of SPECTER embeddings. 15
+Related Work
Recent representation learning methods in NLP rely on training large neural language models on unsupervised data [(Radford et al., 2018;](#b38)[Devlin et al., 2019;](#b11)[Beltagy et al., 2019;](#b3)[Liu et al., 2019)](#b33). While successful at many sentence- and token-level tasks, our focus is on using the models for document-level representation learning, which has remained relatively under-explored.
There have been other efforts in document representation learning such as extensions of word vectors to documents [(Le and Mikolov, 2014;](#b28)[Ganesh et al., 2016;](#b14)[Wu et al., 2018;](#b51)[Gysel et al., 2017)](#b16), convolution-based methods [(Zamani et al., 2018)](#b55), and variational autoencoders [(Holmer and Marfurt, 2018)](#b19). Relevant to document embedding, sentence embedding is a relatively well-studied area of research. Successful approaches include seq2seq models [(Kiros et al., 2015)], BiLSTM Siamese networks [(Williams et al., 2018)](#b49), leveraging supervised data from other corpora [(Conneau et al., 2017)](#b10), and using discourse relations [(Nie et al., 2019)](#b35), and BERT-based methods [(Reimers and Gurevych, 2019)](#b40). Unlike our proposed method,
(Footnote 15: We also experimented with further task-specific fine-tuning of our SPECTER on the end tasks but we did not observe additional improvements.) the majority of these approaches do not consider any notion of inter-document relatedness when embedding documents.
Other relevant work combines textual features with network structure [(Tu et al., 2017)](#b46). These works typically do not leverage the recent pretrained contextual representations and with a few exceptions such as the recent work by Wang et al. (2019), they cannot generalize to unseen documents like our SPECTER approach. Context-based citation recommendation is another related application where models rely on citation contexts [(Jeong et al., 2019)](#b21) to make predictions. These works are orthogonal to ours as the input to our model is just paper title and abstract. Another related line of work is graph-based representation learning methods [(Bruna et al., 2014;](#b6)[Kipf and Welling, 2017;](#b24)[Hamilton et al., 2017a,b;][Wu et al., 2019a,b)]. Here, we compare to a graph representation learning model, SGC (Simple Graph Convolution) [(Wu et al., 2019a)](#b50), which is a state-of-the-art graph convolution approach for representation learning. SPECTER uses pretrained language models in combination with graph-based citation signals, which enables it to outperform the graph-based approaches in our experiments.
SPECTER embeddings are based on only the title and abstract of the paper. Adding the full text of the paper would provide a more complete picture of the paper's content and could improve accuracy [(Cohen et al., 2010;](#b9)[Lin, 2008;](#b29)[Schuemie et al., 2004)](#b43). However, the full text of many academic papers is not freely available. Further, modern language models have strict memory limits on input size, which means new techniques would be required in order to leverage the entirety of the paper within the models. Exploring how to use the full paper text within SPECTER is an item of future work.
Finally, one pain point in academic paper recommendation research has been a lack of publicly available datasets [(Chen and Lee, 2018;](#b8)[Kanakia et al., 2019)](#b22). To address this challenge, we release SCIDOCS, our evaluation benchmark which includes an anonymized clickthrough dataset from an online recommendations system.
+Conclusions and Future Work
We present SPECTER, a model for learning representations of scientific papers, based on a Transformer language model that is pretrained on citations. We achieve substantial improvements over the strongest of a wide variety of baselines, demonstrating the effectiveness of our model. We additionally introduce SCIDOCS, a new evaluation suite consisting of seven document-level tasks and release the corresponding datasets to foster further research in this area.
The landscape of Transformer language models is rapidly changing and newer and larger models are frequently introduced. It would be interesting to initialize our model weights from more recent Transformer models to investigate if additional gains are possible. Another item of future work is to develop better multitask approaches to leverage multiple signals of relatedness information during training. We used citations to build triplets for our loss function, however there are other metrics that have good support from the bibliometrics literature [(Klavans and Boyack, 2006)](#b26) that warrant exploring as a way to create relatedness graphs. Including other information such as outgoing citations as additional input to the model would be yet another area to explore in future.
A Appendix A -Baseline Details 1. Random Zero-mean 25-dimensional vectors were used as representations for each document.
2. Doc2Vec Doc2Vec is one of the earlier neural document/paragraph representation methods [(Le and Mikolov, 2014)](#b28), and is a natural comparison. We trained Doc2Vec on our training subset using Gensim [(Řehůřek and Sojka, 2010)], and chose the hyperparameter grid using suggestions from Lau and Baldwin (2016). The hyperparameter grid used:
{'window': [5, 10, 15], 'sample': [0, 10 ** -6, 10 ** -5], 'epochs': [50, 100, 200]}, for a total of 27 models. The other parameters were set as follows: vector_size=300, min_count=3, alpha=0.025, min_alpha=0.0001, negative=5, dm=0, dbow=1, dbow_words=0. 3. Fasttext-Sum This simple baseline is a weighted sum of pretrained word vectors. We trained our own 300 dimensional fasttext embeddings [(Bojanowski et al., 2017)](#b5) on a corpus of around 3.1B tokens from scientific papers which is similar in size to the SciBERT corpus [(Beltagy et al., 2019)](#b3). We found that these pretrained embeddings substantially outperform alternative off-the-shelf embeddings. We also use these embeddings in other baselines that require pretrained word vectors (i.e., SIF and SGC that are described below). The summed bag of words representation has a number of weighting options, which are extensively tuned on a validation set for best performance. 4. SIF The SIF method of [Arora et al. (2017)](#b2) is a strong text representation baseline that takes a weighted sum of pretrained word vectors (we use fasttext embeddings described above), then computes the first principal component of the document embedding matrix and subtracts out each document embedding's projection to the first principal component.
We used a held-out validation set to choose a from the range [1.0e-5, 1.0e-3] spaced evenly on a log scale. The word probability p(w) was estimated on the training set only. When computing term-frequency values for SIF, we used scikit-learn's TfidfVectorizer with the same parameters as enumerated in the preceding section. sublinear_tf, binary, use_idf, smooth_idf were all set to False. Since SIF is a sum of pretrained fasttext vectors, the resulting dimensionality is 300. ELMo (Peters et al., 2018) provides contextualized representations of tokens in a document. It can provide paragraph or document embeddings by averaging each token's representation for all 3 LSTM layers. We used the 768-dimensional pretrained ELMo model in AllenNLP (Gardner et al., 2018).
+ELMo ELMo
6. Citeomatic The most relevant baseline is Citeomatic (Bhagavatula et al., 2018), which is an academic paper representation model that is trained on the citation graph via sampled triplets. Citeomatic representations are an L2 normalized weighted sum of title and abstract embeddings, which are trained on the citation graph with dynamic negative sampling. Citeomatic embeddings are 75-dimensional. 7. SGC Since our algorithm is trained on data from the citation graph, we also compare to a state-of-the-art graph representation learning model: SGC (Simple Graph Convolution) [(Wu et al., 2019a)](#b50), which is a graph convolution network. An alternative comparison would have been GraphSAGE [(Hamilton et al., 2017b)](#b18), but SGC (with no learning) outperformed an unsupervised variant of GraphSAGE on the Reddit dataset [16]. Note that SGC with no learning boils down to graph propagation on node features (in our case nodes are academic documents). Following Hamilton et al. (2017a), we used SIF features as node representations, and applied SGC with a range of parameter k, which is the number of times the normalized adjacency is multiplied by the SIF feature matrix. Our range of k was 1 through 8 (inclusive), and was chosen with a validation set. For the node features, we chose the SIF model with a = 0.0001, as this model was observed to be a high-performing one. This baseline is also 300 dimensional.
8. SciBERT To isolate the advantage of SPECTER's citation-based fine-tuning objective, we add a controlled comparison with SciBERT [(Beltagy et al., 2019)](#b3). Following [Devlin et al. (2019)](#b11) we take the last layer hidden state corresponding to the [CLS] token as the aggregate document representation. [17] There were no other direct comparisons in [Wu et al. (2019a)](#b50) 17 We also tried the alternative of averaging all token representations, but this resulted in a slight performance decrease compared with the [CLS] pooled token. 9. Sentence BERT Sentence BERT [(Reimers and Gurevych, 2019)](#b40) is a general-domain pretrained model aimed at embedding sentences. The authors fine-tuned BERT using a triplet loss, where positive sentences were from the same document section as the seed sentence, and distractor sentences came from other document sections. The model is designed to encode sentences as opposed to paragraphs, so we embed the title and each sentence in the abstract separately, sum the embeddings, and L2 normalize the result to produce a final 768-dimensional paper embedding. [18] During hyperparameter optimization we chose how to compute TF and IDF values weights by taking the following non-redundant combinations of scikit-learn's TfidfVectorizer [(Pedregosa et al., 2011)](#b36) parameters: sublinear_tf, binary, use_idf, smooth_idf. There were a total of 9 parameter combinations. The IDF values were estimated on the training set. The other parameters were set as follows: min_df=3, max_df=0.75, strip_accents='ascii', stop_words='english', norm=None, lowercase=True. For training of fasttext, we used all default parameters with the exception of setting dimension to 300 and minCount was set to 25 due to the large corpus.
We used the 'bert-base-wikipedia-sections-mean-tokens' model released by the authors: https://github.com/UKPLab/sentence-transformers
t-SNE visualization of paper embeddings and their corresponding MAG topics.
+Table 1: Results on the SCIDOCS evaluation suite consisting of 7 tasks.
+Table 2: Ablations: Numbers are averages of metrics for each evaluation task: CLS: classification, USR: User activity, CITE: Citation prediction, REC: Recommendation, Avg.: average over all tasks & metrics.
+Training signal | CLS USR CITE REC All |
SPECTER | 84.2 88.4 91.5 36.9 80.0 |
SciBERT fine-tune on co-view | 83.0 84.2 84.1 36.4 76.0 |
SciBERT fine-tune on co-read | 82.3 85.4 86.7 36.3 77.1 |
SciBERT fine-tune on co-citation | 82.9 84.3 85.2 36.6 76.4 |
SciBERT fine-tune on multitask | 83.3 86.1 88.2 36.0 78.0 |
+Table 3: Comparison with task-specific fine-tuning.
+ We also experimented with additional fields such as venues and authors but did not find any empirical advantage in using those (see §6). See §7 for a discussion of using the full text of the paper as input.5 It is also possible to encode title and abstracts individually and then concatenate or combine them to get the final embedding. However, in our experiments this resulted in sub-optimal performance.
+ https://www.nlm.nih.gov/mesh/meshhome.html 8 https://academic.microsoft.com/
+ Learning rate linear warmup followed by linear decay. 11 https://github.com/allenai/specter
+
+
+
+
+
Acknowledgements
We thank Kyle Lo, Daniel King and Oren Etzioni for helpful research discussions, Russel Reas for setting up the public API, Field Cady for help in initial data collection and the anonymous reviewers (especially Reviewer 1) for comments and suggestions. This work was supported in part by NSF Convergence Accelerator award 1936940, ONR grant N00014-18-1-2193, and the University of Washington WRF/Cable Professorship.
+
+
+
+
+
+
+
+
+ Estimating position bias without intrusive interventions
+
+ K Anant
+
+
+ Ivan Agarwal
+
+
+ Xuanhui Zaitsev
+
+
+ Wang
+
+
+ Yen Cheng
+
+
+ Marc Li
+
+
+ Thorsten Najork
+
+
+ Joachims
+
+
+
+ WSDM
+
+
+
+
+ Anant K. Agarwal, Ivan Zaitsev, Xuanhui Wang, Cheng Yen Li, Marc Najork, and Thorsten Joachims. 2019. Estimating position bias without intrusive interventions. In WSDM.
+
+
+
+
+ Construction of the literature graph in semantic scholar
+
+ Waleed Ammar
+
+
+ Dirk Groeneveld
+
+
+ Chandra Bhagavatula
+
+
+ Iz Beltagy
+
+
+ Miles Crawford
+
+
+ Doug Downey
+
+
+ Jason Dunkelberger
+
+
+ Ahmed Elgohary
+
+
+ Sergey Feldman
+
+
+ Vu Ha
+
+
+ Rodney Kinney
+
+
+ Sebastian Kohlmeier
+
+
+ Kyle Lo
+
+
+ Tyler C Murray
+
+
+ Hsu-Han
+
+
+ Matthew E Ooi
+
+
+ Joanna Peters
+
+
+ Sam Power
+
+
+ Lucy Lu Skjonsberg
+
+
+ Christopher Wang
+
+
+ Zheng Wilhelm
+
+
+ Madeleine Yuan
+
+
+ Oren Van Zuylen
+
+
+ Etzioni
+
+
+
+ NAACL-HLT
+
+
+
+
+ Waleed Ammar, Dirk Groeneveld, Chandra Bhagavatula, Iz Beltagy, Miles Crawford, Doug Downey, Jason Dunkelberger, Ahmed Elgohary, Sergey Feldman, Vu Ha, Rodney Kinney, Sebastian Kohlmeier, Kyle Lo, Tyler C. Murray, Hsu-Han Ooi, Matthew E. Peters, Joanna Power, Sam Skjonsberg, Lucy Lu Wang, Christopher Wilhelm, Zheng Yuan, Madeleine van Zuylen, and Oren Etzioni. 2018. Construction of the literature graph in semantic scholar. In NAACL-HLT.
+
+
+
+
+ A simple but tough-to-beat baseline for sentence embeddings
+
+ Sanjeev Arora
+
+
+ Yingyu Liang
+
+
+ Tengyu Ma
+
+
+
+ ICLR
+
+
+
+
+ Sanjeev Arora, Yingyu Liang, and Tengyu Ma. 2017. A simple but tough-to-beat baseline for sentence embeddings. In ICLR.
+
+
+
+
+ SciBERT: A Pretrained Language Model for Scientific Text
+
+ Iz Beltagy
+
+
+ Kyle Lo
+
+
+ Arman Cohan
+
+
+
+ EMNLP
+
+
+
+
+ Iz Beltagy, Kyle Lo, and Arman Cohan. 2019. SciBERT: A Pretrained Language Model for Scientific Text. In EMNLP.
+
+
+
+
+ Content-Based Citation Recommendation
+
+ Chandra Bhagavatula
+
+
+ Sergey Feldman
+
+
+ Russell Power
+
+
+ Waleed Ammar
+
+ NAACL-HLT
+
+
+
+
+ Chandra Bhagavatula, Sergey Feldman, Russell Power, and Waleed Ammar. 2018. Content-Based Citation Recommendation. In NAACL-HLT.
+
+
+
+
+ Enriching word vectors with subword information
+
+ Piotr Bojanowski
+
+
+ Edouard Grave
+
+
+ Armand Joulin
+
+
+ Tomas Mikolov
+
+ 10.1162/tacl_a_00051
+
+
+
+
+ Piotr Bojanowski, Edouard Grave, Armand Joulin, and Tomas Mikolov. 2017. Enriching word vectors with subword information. TACL.
+
+
+
+
+
+
+ Joan Bruna
+
+
+ Wojciech Zaremba
+
+
+ Arthur Szlam
+
+
+ Yann Lecun
+
+
+
+
+
+ Joan Bruna, Wojciech Zaremba, Arthur Szlam, and Yann LeCun. 2014. Spectral networks and locally connected networks on graphs. ICLR.
+
+
+
+
+ Improving textual network embedding with global attention via optimal transport
+
+ Liqun Chen
+
+
+ Guoyin Wang
+
+
+ Chenyang Tao
+
+
+ Dinghan Shen
+
+
+ Pengyu Cheng
+
+
+ Xinyuan Zhang
+
+
+ Wenlin Wang
+
+
+ Yizhe Zhang
+
+
+ Lawrence Carin
+
+
+
+ ACL
+
+
+
+
+ Liqun Chen, Guoyin Wang, Chenyang Tao, Dinghan Shen, Pengyu Cheng, Xinyuan Zhang, Wenlin Wang, Yizhe Zhang, and Lawrence Carin. 2019. Improving textual network embedding with global attention via optimal transport. In ACL.
+
+
+
+
+ Research Paper Recommender Systems on Big Scholarly Data
+
+ Maria Tsung Teng Chen
+
+
+ Lee
+
+
+
+ Knowledge Management and Acquisition for Intelligent Systems
+
+
+
+
+ Tsung Teng Chen and Maria Lee. 2018. Research Paper Recommender Systems on Big Scholarly Data. In Knowledge Management and Acquisition for Intelligent Systems.
+
+
+
+
+ The structural and content aspects of abstracts versus bodies of full text journal articles are different
+
+ K Cohen
+
+
+ Helen L Johnson
+
+
+ Karin M Verspoor
+
+
+ Christophe Roeder
+
+
+ Lawrence Hunter
+
+
+
+ BMC Bioinformatics
+
+ 11
+
+
+
+
+ K. Bretonnel Cohen, Helen L. Johnson, Karin M. Verspoor, Christophe Roeder, and Lawrence Hunter. 2010. The structural and content aspects of abstracts versus bodies of full text journal articles are different. BMC Bioinformatics, 11:492-492.
+
+
+
+
+ Supervised Learning of Universal Sentence Representations from Natural Language Inference Data
+
+ Alexis Conneau
+
+
+ Douwe Kiela
+
+
+ Holger Schwenk
+
+
+ Loïc Barrault
+
+
+ Antoine Bordes
+
+ 10.18653/v1/D17-1070
+
+
+ EMNLP
+
+
+
+
+ Alexis Conneau, Douwe Kiela, Holger Schwenk, Loïc Barrault, and Antoine Bordes. 2017. Supervised Learning of Universal Sentence Representations from Natural Language Inference Data. In EMNLP.
+
+
+
+
+ BERT: Pre-training of deep bidirectional transformers for language understanding
+
+ Jacob Devlin
+
+
+ Ming-Wei Chang
+
+
+ Kenton Lee
+
+
+ Kristina Toutanova
+
+
+
+ NAACL-HLT
+
+
+
+
+ Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of deep bidirectional transformers for language understanding. In NAACL-HLT.
+
+
+
+
+ A Density-based Algorithm for Discovering Clusters in Large Spatial Databases with Noise
+
+ Martin Ester
+
+
+ Hans-Peter Kriegel
+
+
+ Jörg Sander
+
+
+ Xiaowei Xu
+
+
+
+ KDD
+
+
+
+
+ Martin Ester, Hans-Peter Kriegel, Jörg Sander, Xiaowei Xu, et al. 1996. A Density-based Algorithm for Discovering Clusters in Large Spatial Databases with Noise. In KDD.
+
+
+
+
+ Quantifying Sex Bias in Clinical Studies at Scale With Automated Data Extraction
+
+ Sergey Feldman
+
+
+ Waleed Ammar
+
+
+ Kyle Lo
+
+
+ Elly Trepman
+
+
+ Madeleine Van Zuylen
+
+
+ Oren Etzioni
+
+ 10.1001/jamanetworkopen.2019.6700
+
+
+ JAMA
+
+
+
+
+ Sergey Feldman, Waleed Ammar, Kyle Lo, Elly Trepman, Madeleine van Zuylen, and Oren Etzioni. 2019. Quantifying Sex Bias in Clinical Studies at Scale With Automated Data Extraction. JAMA.
+
+
+
+
+ Doc2sent2vec: A novel two-phase approach for learning document representation
+
+ J Ganesh
+
+
+ Manish Gupta
+
+
+ Vijay K Varma
+
+
+
+ SIGIR
+
+
+
+
+ J Ganesh, Manish Gupta, and Vijay K. Varma. 2016. Doc2sent2vec: A novel two-phase approach for learning document representation. In SIGIR.
+
+
+
+
+ AllenNLP: A Deep Semantic Natural Language Processing Platform
+
+ Matt Gardner
+
+
+ Joel Grus
+
+
+ Mark Neumann
+
+
+ Oyvind Tafjord
+
+
+ Pradeep Dasigi
+
+
+ Nelson F Liu
+
+
+ Matthew Peters
+
+
+ Michael Schmitz
+
+
+ Luke Zettlemoyer
+
+ 10.18653/v1/W18-2501
+
+
+ Proceedings of Workshop for NLP Open Source Software
+ Workshop for NLP Open Source Software
+
+
+
+
+ NLP-OSS
+
+
+ Matt Gardner, Joel Grus, Mark Neumann, Oyvind Tafjord, Pradeep Dasigi, Nelson F. Liu, Matthew Peters, Michael Schmitz, and Luke Zettlemoyer. 2018. AllenNLP: A Deep Semantic Natural Language Processing Platform. In Proceedings of Workshop for NLP Open Source Software (NLP-OSS).
+
+
+
+
+ Neural Vector Spaces for Unsupervised Information Retrieval
+
+ Christophe Van Gysel
+
+
+ Maarten De Rijke
+
+
+ Evangelos Kanoulas
+
+
+
+ ACM Trans. Inf. Syst
+
+
+
+
+ Christophe Van Gysel, Maarten de Rijke, and Evangelos Kanoulas. 2017. Neural Vector Spaces for Unsupervised Information Retrieval. ACM Trans. Inf. Syst.
+
+
+
+
+ Inductive Representation Learning on Large Graphs
+
+ Will Hamilton
+
+
+ Zhitao Ying
+
+
+ Jure Leskovec
+
+
+
+ NIPS
+
+
+
+
+ Will Hamilton, Zhitao Ying, and Jure Leskovec. 2017a. Inductive Representation Learning on Large Graphs. In NIPS.
+
+
+
+
+ Inductive representation learning on large graphs
+
+ William L Hamilton
+
+
+ Zhitao Ying
+
+
+ Jure Leskovec
+
+
+
+ NIPS
+
+
+
+
+ William L. Hamilton, Zhitao Ying, and Jure Leskovec. 2017b. Inductive representation learning on large graphs. In NIPS.
+
+
+
+
+ Explaining away syntactic structure in semantic document representations
+
+ Erik Holmer
+
+
+ Andreas Marfurt
+
+ abs/1806.01620
+
+
+ ArXiv
+
+
+
+
+ Erik Holmer and Andreas Marfurt. 2018. Explaining away syntactic structure in semantic document representations. ArXiv, abs/1806.01620.
+
+
+
+
+ Universal Language Model Fine-tuning for Text Classification
+
+ Jeremy Howard
+
+
+ Sebastian Ruder
+
+ 10.18653/v1/P18-1031
+
+
+ ACL
+
+
+
+
+ Jeremy Howard and Sebastian Ruder. 2018. Universal Language Model Fine-tuning for Text Classification. In ACL.
+
+
+
+
+ A context-aware citation recommendation model with bert and graph convolutional networks
+
+ Chanwoo Jeong
+
+
+ Sion Jang
+
+
+ Hyuna Shin
+
+
+ Lucy Eunjeong
+
+
+ Sungchul Park
+
+
+ Choi
+
+ abs/1903.06464
+
+
+ ArXiv
+
+
+
+
+ Chanwoo Jeong, Sion Jang, Hyuna Shin, Eunjeong Lucy Park, and Sungchul Choi. 2019. A context-aware citation recommendation model with bert and graph convolutional networks. ArXiv, abs/1903.06464.
+
+
+
+
+ A Scalable Hybrid Research Paper Recommender System for Microsoft Academic
+
+ Anshul Kanakia
+
+
+ Zhihong Shen
+
+
+ Darrin Eide
+
+
+ Kuansan Wang
+
+
+
+ WWW
+
+
+
+
+ Anshul Kanakia, Zhihong Shen, Darrin Eide, and Kuansan Wang. 2019. A Scalable Hybrid Research Paper Recommender System for Microsoft Academic. In WWW.
+
+
+
+
+ Adam: A Method for Stochastic Optimization
+
+ P Diederik
+
+
+ Jimmy Kingma
+
+
+ Ba
+
+ abs/1412.6980
+
+
+ ArXiv
+
+
+
+
+ Diederik P. Kingma and Jimmy Ba. 2014. Adam: A Method for Stochastic Optimization. ArXiv, abs/1412.6980.
+
+
+
+
+ Semi-supervised classification with graph convolutional networks
+
+ N Thomas
+
+
+ Max Kipf
+
+
+ Welling
+
+
+
+
+
+ Thomas N Kipf and Max Welling. 2017. Semi-supervised classification with graph convolutional networks. ICLR.
+
+
+
+
+ Raquel Urtasun, and Sanja Fidler. 2015. Skip-thought vectors
+
+ Ryan Kiros
+
+
+ Yukun Zhu
+
+
+ Ruslan Salakhutdinov
+
+
+ Richard S Zemel
+
+
+ Antonio Torralba
+
+
+
+ NIPS
+
+
+ Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel, Antonio Torralba, Raquel Urtasun, and Sanja Fidler. 2015. Skip-thought vectors. In NIPS.
+
+
+
+
+ Identifying a better measure of relatedness for mapping science
+
+ Richard Klavans
+
+
+ Kevin W Boyack
+
+
+
+ Journal of the Association for Information Science and Technology
+
+ 57
+
+
+
+
+ Richard Klavans and Kevin W. Boyack. 2006. Identifying a better measure of relatedness for mapping science. Journal of the Association for Information Science and Technology, 57:251-263.
+
+
+
+
+ An empirical evaluation of doc2vec with practical insights into document embedding generation
+
+ Han Jey
+
+
+ Timothy Lau
+
+
+ Baldwin
+
+
+
+ Rep4NLP@ACL
+
+
+
+
+ Jey Han Lau and Timothy Baldwin. 2016. An empirical evaluation of doc2vec with practical insights into document embedding generation. In Rep4NLP@ACL.
+
+
+
+
+ Distributed Representations of Sentences and Documents
+
+ Quoc Le
+
+
+ Tomas Mikolov
+
+
+
+ ICML
+
+
+
+
+ Quoc Le and Tomas Mikolov. 2014. Distributed Representations of Sentences and Documents. In ICML.
+
+
+
+
+ Is searching full text more effective than searching abstracts?
+
+ Jimmy J Lin
+
+
+
+ BMC Bioinformatics
+
+ 10
+
+
+
+
+ Jimmy J. Lin. 2008. Is searching full text more effective than searching abstracts? BMC Bioinformatics, 10:46-46.
+
+
+
+
+ Bulletin of the Medical Library Association
+
+ Carolyn E Lipscomb
+
+
+
+
+
+ Medical Subject Headings (MeSH)
+ Carolyn E Lipscomb. 2000. Medical Subject Headings (MeSH). Bulletin of the Medical Library Association.
+
+
+
+
+ Unsupervised Document Embedding with CNNs
+
+ Chundi Liu
+
+
+ Shunan Zhao
+
+
+ Maksims Volkovs
+
+ abs/1711.04168v3
+
+
+ ArXiv
+
+
+
+
+ Chundi Liu, Shunan Zhao, and Maksims Volkovs. 2018. Unsupervised Document Embedding with CNNs. ArXiv, abs/1711.04168v3.
+
+
+
+
+ A Model of Extended Paragraph Vector for Document Categorization and Trend Analysis
+
+ Pengfei Liu
+
+
+ King Keung Wu
+
+
+ Helen M Meng
+
+
+
+
+
+ IJCNN
+ Pengfei Liu, King Keung Wu, and Helen M. Meng. 2017. A Model of Extended Paragraph Vector for Document Categorization and Trend Analysis. IJCNN.
+
+
+
+
+
+
+ Yinhan Liu
+
+
+ Myle Ott
+
+
+ Naman Goyal
+
+
+ Jingfei Du
+
+
+ Mandar S Joshi
+
+
+ Danqi Chen
+
+
+ Omer Levy
+
+
+ Mike Lewis
+
+
+ Luke S Zettlemoyer
+
+
+ Veselin Stoyanov
+
+ abs/1907.11692
+
+
+ RoBERTa: A Robustly Optimized BERT Pretraining Approach. ArXiv
+
+
+
+
+ Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar S. Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke S. Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. ArXiv, abs/1907.11692.
+
+
+
+
+ Accelerating t-SNE Using Tree-based Algorithms
+
+ Laurens Van Der Maaten
+
+
+
+ Journal of Machine Learning Research
+
+
+
+
+ Laurens van der Maaten. 2014. Accelerating t-SNE Using Tree-based Algorithms. Journal of Machine Learning Research.
+
+
+
+
+ DisSent: Learning Sentence Representations from Explicit Discourse Relations
+
+ Allen Nie
+
+
+ Erin Bennett
+
+
+ Noah Goodman
+
+ 10.18653/v1/P19-1442
+
+
+ ACL
+
+
+
+
+ Allen Nie, Erin Bennett, and Noah Goodman. 2019. DisSent: Learning Sentence Representations from Explicit Discourse Relations. In ACL.
+
+
+
+
+ Scikit-learn: Machine learning in Python
+
+ F Pedregosa
+
+
+ G Varoquaux
+
+
+ A Gramfort
+
+
+ V Michel
+
+
+ B Thirion
+
+
+ O Grisel
+
+
+ M Blondel
+
+
+ P Prettenhofer
+
+
+ R Weiss
+
+
+ V Dubourg
+
+
+ J Vanderplas
+
+
+ A Passos
+
+
+ D Cournapeau
+
+
+ M Brucher
+
+
+ M Perrot
+
+
+ E Duchesnay
+
+
+
+ Journal of Machine Learning Research
+
+ 12
+
+
+
+
+ F. Pedregosa, G. Varoquaux, A. Gramfort, V. Michel, B. Thirion, O. Grisel, M. Blondel, P. Prettenhofer, R. Weiss, V. Dubourg, J. Vanderplas, A. Passos, D. Cournapeau, M. Brucher, M. Perrot, and E. Duchesnay. 2011. Scikit-learn: Machine learning in Python. Journal of Machine Learning Research, 12:2825-2830.
+
+
+
+
+
+ Matthew E Peters
+
+
+ Mark Neumann
+
+
+ Mohit Iyyer
+
+
+ Matt Gardner
+
+
+ Christopher Clark
+
+
+ Kenton Lee
+
+
+ Luke Zettlemoyer
+
+ Deep Contextualized Word Representations
+
+
+
+
+ Matthew E. Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, and Luke Zettlemoyer. 2018. Deep Contextualized Word Representations.
+
+
+
+
+ Improving language understanding by generative pre-training
+
+ Alec Radford
+
+
+ Karthik Narasimhan
+
+
+
+
+
+ arXiv
+ Tim Salimans, and Ilya Sutskever
+ Alec Radford, Karthik Narasimhan, Tim Salimans, and Ilya Sutskever. 2018. Improving language understanding by generative pre-training. arXiv.
+
+
+
+
+ Software Framework for Topic Modelling with Large Corpora
+
+ Petr Radimřehůřek
+
+
+ Sojka
+
+
+
+ LREC
+
+
+
+
+ Radim Řehůřek and Petr Sojka. 2010. Software Framework for Topic Modelling with Large Corpora. In LREC.
+
+
+
+
+ Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks
+
+ Nils Reimers
+
+
+ Iryna Gurevych
+
+
+
+ EMNLP
+
+
+
+
+ Nils Reimers and Iryna Gurevych. 2019. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. In EMNLP.
+
+
+
+
+ V-measure: A Conditional Entropy-based External Cluster Evaluation Measure
+
+ Andrew Rosenberg
+
+
+ Julia Hirschberg
+
+
+
+ EMNLP
+
+
+
+
+ Andrew Rosenberg and Julia Hirschberg. 2007. V-measure: A Conditional Entropy-based External Cluster Evaluation Measure. In EMNLP.
+
+
+
+
+ Collaborative filtering recommender systems
+
+ Ben Schafer
+
+
+ Dan Frankowski
+
+
+ Jon Herlocker
+
+
+ Shilad Sen
+
+
+
+ The adaptive web
+
+ Springer
+
+
+
+ J Ben Schafer, Dan Frankowski, Jon Herlocker, and Shilad Sen. 2007. Collaborative filtering recommender systems. In The adaptive web. Springer.
+
+
+
+
+
+ J Martijn
+
+
+ Marc Schuemie
+
+
+ Weeber
+
+
+ J A Bob
+
+
+ Erik M Schijvenaars
+
+
+ C Van Mulligen
+
+
+ Rob Christiaan Van Der Eijk
+
+
+ Barend Jelier
+
+
+ Jan A Mons
+
+
+ Kors
+
+
+
+ Distribution of information in biomedical abstracts and full-text publications
+
+
+ 20
+
+
+
+ Martijn J. Schuemie, Marc Weeber, Bob J. A. Schijvenaars, Erik M. van Mulligen, C. Christiaan van der Eijk, Rob Jelier, Barend Mons, and Jan A. Kors. 2004. Distribution of information in biomedical abstracts and full-text publications. Bioinformatics, 20(16):2597-604.
+
+
+
+
+ Improved semantic-aware network embedding with fine-grained word alignment
+
+ Dinghan Shen
+
+
+ Xinyuan Zhang
+
+
+ Ricardo Henao
+
+
+ Lawrence Carin
+
+
+
+ EMNLP
+
+
+
+
+ Dinghan Shen, Xinyuan Zhang, Ricardo Henao, and Lawrence Carin. 2018. Improved semantic-aware network embedding with fine-grained word alignment. In EMNLP.
+
+
+
+
+ An Overview of Microsoft Academic Service (MAS) and Applications
+
+ Arnab Sinha
+
+
+ Zhihong Shen
+
+
+ Yang Song
+
+
+ Hao Ma
+
+
+ Darrin Eide
+
+
+ Bo-June Paul Hsu
+
+
+ Kuansan Wang
+
+
+
+ WWW
+
+
+
+
+ Arnab Sinha, Zhihong Shen, Yang Song, Hao Ma, Darrin Eide, Bo-June Paul Hsu, and Kuansan Wang. 2015. An Overview of Microsoft Academic Service (MAS) and Applications. In WWW.
+
+
+
+
+ Cane: Context-aware network embedding for relation modeling
+
+ Cunchao Tu
+
+
+ Han Liu
+
+
+ Zhiyuan Liu
+
+
+ Maosong Sun
+
+
+
+ ACL
+
+
+
+
+ Cunchao Tu, Han Liu, Zhiyuan Liu, and Maosong Sun. 2017. Cane: Context-aware network embedding for relation modeling. In ACL.
+
+
+
+
+ Attention Is All You Need
+
+ Ashish Vaswani
+
+
+ Noam Shazeer
+
+
+ Niki Parmar
+
+
+ Jakob Uszkoreit
+
+
+ Llion Jones
+
+
+ Aidan N Gomez
+
+
+ Lukasz Kaiser
+
+
+ Illia Polosukhin
+
+
+
+ NIPS
+
+
+
+
+ Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. In NIPS.
+
+
+
+
+ Improving textual network learning with variational homophilic embeddings
+
+ Wenlin Wang
+
+
+ Chenyang Tao
+
+
+ Zhe Gan
+
+
+ Guoyin Wang
+
+
+ Liqun Chen
+
+
+ Xinyuan Zhang
+
+
+ Ruiyi Zhang
+
+
+ Qian Yang
+
+
+ Ricardo Henao
+
+
+ Lawrence Carin
+
+
+
+ Advances in Neural Information Processing Systems
+
+
+
+
+
+ Wenlin Wang, Chenyang Tao, Zhe Gan, Guoyin Wang, Liqun Chen, Xinyuan Zhang, Ruiyi Zhang, Qian Yang, Ricardo Henao, and Lawrence Carin. 2019. Improving textual network learning with variational homophilic embeddings. In Advances in Neural Information Processing Systems, pages 2074-2085.
+
+
+
+
+ A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference
+
+ Adina Williams
+
+
+ Nikita Nangia
+
+
+ Samuel Bowman
+
+ 10.18653/v1/N18-1101
+ NAACL-HLT
+
+
+
+
+ Adina Williams, Nikita Nangia, and Samuel Bowman. 2018. A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference. In NAACL-HLT.
+
+
+
+
+ Simplifying graph convolutional networks
+
+ Felix Wu
+
+
+ H Amauri
+
+
+ Tianyi Souza
+
+
+ Christopher Zhang
+
+
+ Tao Fifty
+
+
+ Kilian Q Yu
+
+
+ Weinberger
+
+
+
+ ICML
+
+
+
+
+ Felix Wu, Amauri H. Souza, Tianyi Zhang, Christopher Fifty, Tao Yu, and Kilian Q. Weinberger. 2019a. Simplifying graph convolutional networks. In ICML.
+
+
+
+
+ Word Mover's Embedding: From Word2Vec to Document Embedding
+
+ Lingfei Wu
+
+
+ Ian En-Hsu Yen
+
+
+ Kun Xu
+
+
+ Fangli Xu
+
+
+ Avinash Balakrishnan
+
+
+ Pin-Yu Chen
+
+
+ Pradeep Ravikumar
+
+
+ Michael J Witbrock
+
+
+
+ EMNLP
+
+
+
+
+ Lingfei Wu, Ian En-Hsu Yen, Kun Xu, Fangli Xu, Avinash Balakrishnan, Pin-Yu Chen, Pradeep Ravikumar, and Michael J Witbrock. 2018. Word Mover's Embedding: From Word2Vec to Document Embedding. In EMNLP.
+
+
+
+
+ Google's neural machine translation system: Bridging the gap between human and machine translation
+
+ Yonghui Wu
+
+
+ Mike Schuster
+
+
+ Zhifeng Chen
+
+
+ V Quoc
+
+
+ Mohammad Le
+
+
+ Wolfgang Norouzi
+
+
+ Maxim Macherey
+
+
+ Yuan Krikun
+
+
+ Qin Cao
+
+
+ Klaus Gao
+
+
+ Macherey
+
+ abs/1609.08144
+
+
+ ArXiv
+
+
+
+
+ Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. 2016. Google's neural machine translation system: Bridging the gap between human and machine translation. ArXiv, abs/1609.08144.
+
+
+
+
+
+
+ Zonghan Wu
+
+
+ Shirui Pan
+
+
+ Fengwen Chen
+
+
+ Guodong Long
+
+
+ Chengqi Zhang
+
+
+ Philip S Yu
+
+ abs/1901.00596
+
+
+ A Comprehensive Survey on Graph Neural Networks. ArXiv
+
+
+
+
+ Zonghan Wu, Shirui Pan, Fengwen Chen, Guodong Long, Chengqi Zhang, and Philip S Yu. 2019b. A Comprehensive Survey on Graph Neural Networks. ArXiv, abs/1901.00596.
+
+
+
+
+ Xlnet: Generalized autoregressive pretraining for language understanding
+
+ Zhilin Yang
+
+
+ Zihang Dai
+
+
+ Yiming Yang
+
+
+ Jaime G Carbonell
+
+
+ Ruslan Salakhutdinov
+
+
+ V Quoc
+
+
+ Le
+
+ abs/1906.08237
+
+
+ ArXiv
+
+
+
+
+ Zhilin Yang, Zihang Dai, Yiming Yang, Jaime G. Car- bonell, Ruslan Salakhutdinov, and Quoc V. Le. 2019. Xlnet: Generalized autoregressive pretraining for language understanding. ArXiv, abs/1906.08237.
+
+
+
+
+ From neural re-ranking to neural ranking: Learning a sparse representation for inverted indexing
+
+ Hamed Zamani
+
+
+ Mostafa Dehghani
+
+
+ W Bruce Croft
+
+
+ Erik G
+
+
+
+ CIKM
+
+
+
+
+ Learned-Miller, and Jaap Kamps
+ Hamed Zamani, Mostafa Dehghani, W. Bruce Croft, Erik G. Learned-Miller, and Jaap Kamps. 2018. From neural re-ranking to neural ranking: Learn- ing a sparse representation for inverted indexing. In CIKM.
+
+
+
+
+ Diffusion maps for textual network embedding
+
+ Xinyuan Zhang
+
+
+ Yitong Li
+
+
+ Dinghan Shen
+
+
+ Lawrence Carin
+
+
+
+
+
+ In NeurIPS
+ Xinyuan Zhang, Yitong Li, Dinghan Shen, and Lawrence Carin. 2018. Diffusion maps for textual network embedding. In NeurIPS.
+
+
+
+
+
+
+
diff --git a/s2orc-doc2json/tests/pdf/N18-3011.json b/s2orc-doc2json/tests/pdf/N18-3011.json
new file mode 100644
index 0000000000000000000000000000000000000000..7c658b3f0e4a617091d23f4e416db3c706590d29
--- /dev/null
+++ b/s2orc-doc2json/tests/pdf/N18-3011.json
@@ -0,0 +1,2147 @@
+{
+ "paper_id": "N18-3011",
+ "header": {
+ "generated_with": "S2ORC 1.0.0",
+ "date_generated": "2021-02-12T10:00:43.166943Z"
+ },
+ "title": "Construction of the Literature Graph in Semantic Scholar",
+ "authors": [
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": "",
+ "affiliation": {},
+ "email": "waleeda@allenai.org"
+ },
+ {
+ "first": "Dirk",
+ "middle": [],
+ "last": "Groeneveld",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Iz",
+ "middle": [],
+ "last": "Beltagy",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Miles",
+ "middle": [],
+ "last": "Crawford",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Doug",
+ "middle": [],
+ "last": "Downey",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Jason",
+ "middle": [],
+ "last": "Dunkelberger",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Ahmed",
+ "middle": [],
+ "last": "Elgohary",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Sergey",
+ "middle": [],
+ "last": "Feldman",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Vu",
+ "middle": [],
+ "last": "Ha",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Rodney",
+ "middle": [],
+ "last": "Kinney",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Sebastian",
+ "middle": [],
+ "last": "Kohlmeier",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Kyle",
+ "middle": [],
+ "last": "Lo",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Tyler",
+ "middle": [],
+ "last": "Murray",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Hsu-Han",
+ "middle": [],
+ "last": "Ooi",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Matthew",
+ "middle": [],
+ "last": "Peters",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Joanna",
+ "middle": [],
+ "last": "Power",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Sam",
+ "middle": [],
+ "last": "Skjonsberg",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Lucy",
+ "middle": [
+ "Lu"
+ ],
+ "last": "Wang",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Chris",
+ "middle": [],
+ "last": "Wilhelm",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Zheng",
+ "middle": [],
+ "last": "Yuan",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Madeleine",
+ "middle": [],
+ "last": "Van Zuylen",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ },
+ {
+ "first": "Oren",
+ "middle": [],
+ "last": "Etzioni",
+ "suffix": "",
+ "affiliation": {},
+ "email": ""
+ }
+ ],
+ "year": "",
+ "abstract": "We describe a deployed scalable system for organizing published scientific literature into a heterogeneous graph to facilitate algorithmic manipulation and discovery. The resulting literature graph consists of more than 280M nodes, representing papers, authors, entities and various interactions between them (e.g., authorships, citations, entity mentions). We reduce literature graph construction into familiar NLP tasks (e.g., entity extraction and linking), point out research challenges due to differences from standard formulations of these tasks, and report empirical results for each task. The methods described in this paper are used to enable semantic features in www.semanticscholar.org. Due to space constraints, we opted not to discuss our relation extraction models in this draft.",
+ "pdf_parse": {
+ "paper_id": "N18-3011",
+ "_pdf_hash": "",
+ "abstract": [
+ {
+ "text": "We describe a deployed scalable system for organizing published scientific literature into a heterogeneous graph to facilitate algorithmic manipulation and discovery. The resulting literature graph consists of more than 280M nodes, representing papers, authors, entities and various interactions between them (e.g., authorships, citations, entity mentions). We reduce literature graph construction into familiar NLP tasks (e.g., entity extraction and linking), point out research challenges due to differences from standard formulations of these tasks, and report empirical results for each task. The methods described in this paper are used to enable semantic features in www.semanticscholar.org. Due to space constraints, we opted not to discuss our relation extraction models in this draft.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Abstract",
+ "sec_num": null
+ }
+ ],
+ "body_text": [
+ {
+ "text": "The goal of this work is to facilitate algorithmic discovery in the scientific literature. Despite notable advances in scientific search engines, data mining and digital libraries (e.g., Wu et al., 2014) , researchers remain unable to answer simple questions such as:",
+ "cite_spans": [
+ {
+ "start": 187,
+ "end": 203,
+ "text": "Wu et al., 2014)",
+ "ref_id": "BIBREF25"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "What is the percentage of female subjects in depression clinical trials?",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "Which of my co-authors published one or more papers on coreference resolution?",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "Which papers discuss the effects of Ranibizumab on the Retina?",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "In this paper, we focus on the problem of extracting structured data from scientific documents, which can later be used in natural language interfaces (e.g., Iyer et al., 2017) or to improve ranking of results in academic search (e.g., Xiong et al., Figure 1 : Part of the literature graph. 2017). We describe methods used in a scalable deployed production system for extracting structured information from scientific documents into the literature graph (see Fig. 1 ). The literature graph is a directed property graph which summarizes key information in the literature and can be used to answer the queries mentioned earlier as well as more complex queries. For example, in order to compute the Erd\u0151s number of an author X, the graph can be queried to find the number of nodes on the shortest undirected path between author X and Paul Erd\u0151s such that all edges on the path are labeled \"authored\".",
+ "cite_spans": [
+ {
+ "start": 158,
+ "end": 176,
+ "text": "Iyer et al., 2017)",
+ "ref_id": "BIBREF12"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 250,
+ "end": 258,
+ "text": "Figure 1",
+ "ref_id": null
+ },
+ {
+ "start": 459,
+ "end": 465,
+ "text": "Fig. 1",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "We reduce literature graph construction into familiar NLP tasks such as sequence labeling, entity linking and relation extraction, and address some of the impractical assumptions commonly made in the standard formulations of these tasks. For example, most research on named entity recognition tasks report results on large labeled datasets such as CoNLL-2003 and ACE-2005 (e.g., Lample et al., 2016 , and assume that entity types in the test set match those labeled in the training set (including work on domain adaptation, e.g., Daum\u00e9, 2007) . These assumptions, while useful for developing and benchmarking new methods, are unrealistic for many domains and applications. The paper also serves as an overview of the approach we adopt at www.semanticscholar.org in a step towards more intelligent academic search engines (Etzioni, 2011) .",
+ "cite_spans": [
+ {
+ "start": 348,
+ "end": 358,
+ "text": "CoNLL-2003",
+ "ref_id": null
+ },
+ {
+ "start": 359,
+ "end": 371,
+ "text": "and ACE-2005",
+ "ref_id": null
+ },
+ {
+ "start": 372,
+ "end": 398,
+ "text": "(e.g., Lample et al., 2016",
+ "ref_id": null
+ },
+ {
+ "start": 530,
+ "end": 542,
+ "text": "Daum\u00e9, 2007)",
+ "ref_id": "BIBREF6"
+ },
+ {
+ "start": 821,
+ "end": 836,
+ "text": "(Etzioni, 2011)",
+ "ref_id": "BIBREF8"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "In the next section, we start by describing our symbolic representation of the literature. Then, we discuss how we extract metadata associated with a paper such as authors and references, then how we extract the entities mentioned in paper text. Before we conclude, we briefly describe other research challenges we are actively working on in order to improve the quality of the literature graph.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Introduction",
+ "sec_num": "1"
+ },
+ {
+ "text": "The literature graph is a property graph with directed edges. Unlike Resource Description Framework (RDF) graphs, nodes and edges in property graphs have an internal structure which is more suitable for representing complex data types such as papers and entities. In this section, we describe the attributes associated with nodes and edges of different types in the literature graph.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Structure of The Literature Graph",
+ "sec_num": "2"
+ },
+ {
+ "text": "Papers. We obtain metadata and PDF files of papers via partnerships with publishers (e.g., Springer, Nature), catalogs (e.g., DBLP, MED-LINE), pre-publishing services (e.g., arXiv, bioRxive), as well as web-crawling. Paper nodes are associated with a set of attributes such as 'title', 'abstract', 'full text', 'venues' and 'publication year'. While some of the paper sources provide these attributes as metadata, it is often necessary to extract them from the paper PDF (details in \u00a73). We deterministically remove duplicate papers based on string similarity of their metadata, resulting in 37M unique paper nodes. Papers in the literature graph cover a variety of scientific disciplines, including computer science, molecular biology, microbiology and neuroscience.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Node Types",
+ "sec_num": "2.1"
+ },
+ {
+ "text": "Authors. Each node of this type represents a unique author, with attributes such as 'first name' and 'last name'. The literature graph has 12M nodes of this type.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Node Types",
+ "sec_num": "2.1"
+ },
+ {
+ "text": "Entities. Each node of this type represents a unique scientific concept discussed in the literature, with attributes such as 'canonical name', 'aliases' and 'description'. Our literature graph has 0.4M nodes of this type. We describe how we populate entity nodes in \u00a74.3.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Node Types",
+ "sec_num": "2.1"
+ },
+ {
+ "text": "Entity mentions. Each node of this type represents a textual reference of an entity in one of the papers, with attributes such as 'mention text', 'context', and 'confidence'. We describe how we populate the 237M mentions in the literature graph in \u00a74.1.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Node Types",
+ "sec_num": "2.1"
+ },
+ {
+ "text": "Citations. We instantiate a directed citation edge from paper nodes p 1 ! p 2 for each p 2 referenced in p 1 . Citation edges have attributes such as 'from paper id', 'to paper id' and 'contexts' (the textual contexts where p 2 is referenced in p 1 ). While some of the paper sources provide these attributes as metadata, it is often necessary to extract them from the paper PDF as detailed in \u00a73.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Edge Types",
+ "sec_num": "2.2"
+ },
+ {
+ "text": "Authorship. We instantiate a directed authorship edge between an author node and a paper node a ! p for each author of that paper.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Edge Types",
+ "sec_num": "2.2"
+ },
+ {
+ "text": "Entity linking edges. We instantiate a directed edge from an extracted entity mention node to the entity it refers to.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Edge Types",
+ "sec_num": "2.2"
+ },
+ {
+ "text": "Mention-mention relations. We instantiate a directed edge between a pair of mentions in the same sentential context if the textual relation extraction model predicts one of a predefined list of relation types between them in a sentential context. 1 We encode a symmetric relation between m 1 and m 2 as two directed edges m 1 ! m 2 and m 2 ! m 1 .",
+ "cite_spans": [
+ {
+ "start": 247,
+ "end": 248,
+ "text": "1",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Edge Types",
+ "sec_num": "2.2"
+ },
+ {
+ "text": "Entity-entity relations. While mentionmention edges represent relations between mentions in a particular context, entity-entity edges represent relations between abstract entities. These relations may be imported from an existing knowledge base (KB) or inferred from other edges in the graph.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Edge Types",
+ "sec_num": "2.2"
+ },
+ {
+ "text": "In the previous section, we described the overall structure of the literature graph. Next, we discuss how we populate paper nodes, author nodes, authorship edges, and citation edges.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Although some publishers provide sufficient metadata about their papers, many papers are provided with incomplete metadata. Also, papers obtained via web-crawling are not associated with any metadata. To fill in this gap, we built the Sci-enceParse system to predict structured data from the raw PDFs using recurrent neural networks (RNNs). 2 For each paper, the system extracts the paper title, list of authors, and list of references; each reference consists of a title, a list of authors, a venue, and a year.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Preparing the input layer. We split each PDF into individual pages, and feed each page to Apache's PDFBox library 3 to convert it into a sequence of tokens, where each token has features, e.g., 'text', 'font size', 'space width', 'position on the page'.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "We normalize the token-level features before feeding them as inputs to the model. For each of the 'font size' and 'space width' features, we compute three normalized values (with respect to current page, current document, and the whole training corpus), each value ranging between -0.5 to +0.5. The token's 'position on the page' is given in XY coordinate points. We scale the values linearly to range from . 0:5; 0:5/ at the top-left corner of the page to .0:5; 0:5/ at the bottom-right corner.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "In order to capture case information, we add seven numeric features to the input representation of each token: whether the first/second letter is uppercase/lowercase, the fraction of uppercase/lowercase letters and the fraction of digits.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "To help the model make correct predictions for metadata which tend to appear at the beginning (e.g., titles and authors) or at the end of papers (e.g., references), we provide the current page number as two discrete variables (relative to the beginning and end of the PDF file) with values 0, 1 and 2+. These features are repeated for each token on the same page.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "For the k-th token in the sequence, we compute the input representation i k by concatenating the numeric features, an embedding of the 'font size', and the word embedding of the lowercased token. Word embeddings are initialized with GloVe (Pennington et al., 2014) .",
+ "cite_spans": [
+ {
+ "start": 239,
+ "end": 264,
+ "text": "(Pennington et al., 2014)",
+ "ref_id": "BIBREF19"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Model. The input token representations are passed through one fully-connected layer and then ",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "g ! k D LSTM.Wi k ; g ! k 1 /; g k D OEg ! k I g k ; h ! k D LSTM.g k ; h ! k 1 /; h k D OEh ! k I g k",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "where W is a weight matrix, g k and h k are defined similarly to g ! k and h ! k but process token sequences in the opposite direction.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Following Collobert et al. 2011, we feed the output of the second layer h k into a dense layer to predict unnormalized label weights for each token and learn label bigram feature weights (often described as a conditional random field layer when used in neural architectures) to account for dependencies between labels.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Training. The ScienceParse system is trained on a snapshot of the data at PubMed Central. It consists of 1.4M PDFs and their associated metadata, which specify the correct titles, authors, and bibliographies. We use a heuristic labeling process that finds the strings from the metadata in the tokenized PDFs to produce labeled tokens. This labeling process succeeds for 76% of the documents. The remaining documents are not used in the training process. During training, we only use pages which have at least one token with a label that is not \"none\".",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Decoding. At test time, we use Viterbi decoding to find the most likely global sequence, with no further constraints. To get the title, we use the longest continuous sequence of tokens with the \"title\" label. Since there can be multiple authors, we use all continuous sequences of tokens with the \"author\" label as authors, but require that all authors of a paper are mentioned on the same page. If the author labels are predicted in multiple pages, we use the one with the largest number of authors.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "Results. We run our final tests on a held-out set from PubMed Central, consisting of about 54K documents. The results are detailed in Table 1 . We use a conservative evaluation where an instance is correct if it exactly matches the gold annotation, with no credit for partial matching.",
+ "cite_spans": [],
+ "ref_spans": [
+ {
+ "start": 134,
+ "end": 141,
+ "text": "Table 1",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "To give an example for the type of errors our model makes, consider the paper (Wang et al., 2013) titled \"Clinical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a systematic review and metaanalysis.\" The title we extract for this paper omits the first part \"Clinical review:\". This is likely to be a result of the pattern \"Foo: Bar Baz\" appearing in many training examples with only \"Bar Baz\" labeled as the title.",
+ "cite_spans": [
+ {
+ "start": 78,
+ "end": 97,
+ "text": "(Wang et al., 2013)",
+ "ref_id": "BIBREF23"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Extracting Metadata",
+ "sec_num": "3"
+ },
+ {
+ "text": "In the previous section, we described how we populate the backbone of the literature graph, i.e., paper nodes, author nodes and citation edges. Next, we discuss how we populate mentions and entities in the literature graph using entity extraction and linking on the paper text. In order to focus on more salient entities in a given paper, we only use the title and abstract.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction and Linking",
+ "sec_num": "4"
+ },
+ {
+ "text": "We experiment with three approaches for entity extraction and linking: I. Statistical: uses one or more statistical models for predicting mention spans, then uses another statistical model to link mentions to candidate entities in a KB.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Approaches",
+ "sec_num": "4.1"
+ },
+ {
+ "text": "II. Hybrid: defines a small number of handengineered, deterministic rules for string-based matching of the input text to candidate entities in the KB, then uses a statistical model to disambiguate the mentions. 4 III. Off-the-shelf: uses existing libraries, namely (Ferragina and Scaiella, 2010, TagMe) 5 and (Demner-Fushman et al., 2017, MetaMap Lite) 6 , with minimal post-processing to extract and link entities to the KB.",
+ "cite_spans": [
+ {
+ "start": 211,
+ "end": 212,
+ "text": "4",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Approaches",
+ "sec_num": "4.1"
+ },
+ {
+ "text": "We also experimented with a \"pure\" rules-based approach which disambiguates deterministically but the hybrid approach consistently gave better results.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Approaches",
+ "sec_num": "4.1"
+ },
+ {
+ "text": "5 The TagMe APIs are described at https://sobigdata. d4science.org/web/tagme/tagme-help 6 We use v3. Table 2 : Document-level evaluation of three approaches in two scientific areas: computer science (CS) and biomedical (Bio).",
+ "cite_spans": [
+ {
+ "start": 88,
+ "end": 89,
+ "text": "6",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 101,
+ "end": 108,
+ "text": "Table 2",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Approaches",
+ "sec_num": "4.1"
+ },
+ {
+ "text": "We evaluate the performance of each approach in two broad scientific areas: computer science (CS) and biomedical research (Bio). For each unique (paper ID, entity ID) pair predicted by one of the approaches, we ask human annotators to label each mention extracted for this entity in the paper. We use CrowdFlower to manage human annotations and only include instances where three or more annotators agree on the label. If one or more of the entity mentions in that paper is judged to be correct, the pair (paper ID, entity ID) counts as one correct instance. Otherwise, it counts as an incorrect instance. We report 'yield' in lieu of 'recall' due to the difficulty of doing a scalable comprehensive annotation. Table 2 shows the results based on 500 papers using v1.1.2 of our entity extraction and linking components. In both domains, the statistical approach gives the highest precision and the lowest yield. The hybrid approach consistently gives the highest yield, but sacrifices precision. The TagMe off-the-shelf library used for the CS domain gives surprisingly good results, with precision within 1 point from the statistical models. However, the MetaMap Lite off-the-shelf library we used for the biomedical domain suffered a huge loss in precision. Our error analysis showed that each of the approaches is able to predict entities not predicted by the other approaches so we decided to pool their outputs in our deployed system, which gives significantly higher yield than any individual approach while maintaining reasonably high precision.",
+ "cite_spans": [],
+ "ref_spans": [
+ {
+ "start": 712,
+ "end": 719,
+ "text": "Table 2",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Approaches",
+ "sec_num": "4.1"
+ },
+ {
+ "text": "Given the token sequence t 1 ; : : : ; t N in a sentence, we need to identify spans which correspond to entity mentions. We use the BILOU scheme to encode labels at the token level. Unlike most formulations of named entity recognition problems (NER), we do not identify the entity type (e.g., protein, drug, chemical, disease) for each mention since the output mentions are further grounded in a KB with further information about the entity (including its type), using an entity linking module.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "Model. First, we construct the token embedding x k D OEc k I w k for each token t k in the input sequence, where c k is a character-based representation computed using a convolutional neural network (CNN) with filter of size 3 characters, and w k are learned word embeddings initialized with the GloVe embeddings (Pennington et al., 2014) .",
+ "cite_spans": [
+ {
+ "start": 313,
+ "end": 338,
+ "text": "(Pennington et al., 2014)",
+ "ref_id": "BIBREF19"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "We also compute context-sensitive word embeddings, denoted as lm k D OElm ! k I lm k , by concatenating the projected outputs of forward and backward recurrent neural network language models (RNN-LM) at position k. The language model (LM) for each direction is trained independently and consists of a single layer long short-term memory (LSTM) network followed by a linear project layer. While training the LM parameters, lm ! k is used to predict t kC1 and lm k is used to predict t k 1 . We fix the LM parameters during training of the entity extraction model. See and for more details.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "Given the x k and lm k embeddings for each token k 2 f1; : : : ; N g, we use a two-layer bidirectional LSTM to encode the sequence with x k and lm k feeding into the first and second layer, respectively. That is,",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "g ! k D LSTM.x k ; g ! k 1 /; g k D OEg ! k I g k ; h ! k D LSTM.OEg k I lm k ; h ! k 1 /; h k D OEh ! k I h k ;",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "where g k and h k are defined similarly to g ! k and h ! k but process token sequences in the opposite direction. Similar to the model described in \u00a73, we feed the output of the second LSTM into a dense layer to predict unnormalized label weights for each token and learn label bigram feature weights to account for dependencies between labels.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "Results. We use the standard data splits of the SemEval-2017 Task 10 on entity (and relation) extraction from scientific papers (Augenstein et al., 2017) . Table 3 compares three variants of our entity extraction model. The first line omits the LM embeddings lm k , while the second line is the full model (including LM embeddings) showing a large improvement of 4.2 F1 points. The third line shows that creating an ensemble of 15 models further improves the results by 1.1 F1 points.",
+ "cite_spans": [
+ {
+ "start": 128,
+ "end": 153,
+ "text": "(Augenstein et al., 2017)",
+ "ref_id": "BIBREF1"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 156,
+ "end": 163,
+ "text": "Table 3",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "Model instances. In the deployed system, we use three instances of the entity extraction model Description F1 Without LM 49.9",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "With LM 54.1 Avg. of 15 models with LM 55.2 Table 3 : Results of the entity extraction model on the development set of SemEval-2017 task 10. with a similar architecture, but trained on different datasets. Two instances are trained on the BC5CDR (Li et al., 2016) and the CHEMDNER datasets (Krallinger et al., 2015) to extract key entity mentions in the biomedical domain such as diseases, drugs and chemical compounds. The third instance is trained on mention labels induced from Wikipedia articles in the computer science domain.",
+ "cite_spans": [
+ {
+ "start": 245,
+ "end": 262,
+ "text": "(Li et al., 2016)",
+ "ref_id": "BIBREF16"
+ },
+ {
+ "start": 289,
+ "end": 314,
+ "text": "(Krallinger et al., 2015)",
+ "ref_id": "BIBREF14"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 44,
+ "end": 51,
+ "text": "Table 3",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "The output of all model instances are pooled together and combined with the rule-based entity extraction module, then fed into the entity linking model (described below).",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Extraction Models",
+ "sec_num": "4.2"
+ },
+ {
+ "text": "In this section, we describe the construction of entity nodes and entity-entity edges. Unlike other knowledge extraction systems such as the Never-Ending Language Learner (NELL) 7 and OpenIE 4, 8 we use existing knowledge bases (KBs) of entities to reduce the burden of identifying coherent concepts. Grounding the entity mentions in a manually-curated KB also increases user confidence in automated predictions. We use two KBs: UMLS: The UMLS metathesaurus integrates information about concepts in specialized ontologies in several biomedical domains, and is funded by the U.S. National Library of Medicine. DBpedia: DBpedia provides access to structured information in Wikipedia. Rather than including all Wikipedia pages, we used a short list of Wikipedia categories about CS and included all pages up to depth four in their trees in order to exclude irrelevant entities, e.g., \"Lord of the Rings\" in DBpedia.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Knowledge Bases",
+ "sec_num": "4.3"
+ },
+ {
+ "text": "Given a text span s identified by the entity extraction model in \u00a74.2 (or with heuristics) and a reference KB, the goal of the entity linking model is to associate the span with the entity it refers to. A span and its surrounding words are collectively referred to as a mention. We first identify a set of candidate entities that a given mention may refer to. Then, we rank the candidate entities based on a score computed using a neural model trained on labeled data.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "For example, given the string \". . . database of facts, an ILP system will . . . \", the entity extraction model identifies the span \"ILP\" as a possible entity and the entity linking model associates it with \"Inductive_Logic_Programming\" as the referent entity (from among other candidates like \"Integer_Linear_Programming\" or \"Instruction-level_Parallelism\").",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "Datasets. We used two datasets: i) a biomedical dataset formed by combining MSH (Jimeno-Yepes et al., 2011) and BC5CDR (Li et al., 2016) with UMLS as the reference KB, and ii) a CS dataset we curated using Wikipedia articles about CS concepts with DBpedia as the reference KB.",
+ "cite_spans": [
+ {
+ "start": 119,
+ "end": 136,
+ "text": "(Li et al., 2016)",
+ "ref_id": "BIBREF16"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "Candidate selection. In a preprocessing step, we build an index which maps any token used in a labeled mention or an entity name in the KB to associated entity IDs, along with the frequency this token is associated with that entity. This is similar to the index used in previous entity linking systems (e.g., Bhagavatula et al., 2015) to estimate the probability that a given mention refers to an entity. At train and test time, we use this index to find candidate entities for a given mention by looking up the tokens in the mention. This method also serves as our baseline in Table 4 by selecting the entity with the highest frequency for a given mention.",
+ "cite_spans": [
+ {
+ "start": 309,
+ "end": 334,
+ "text": "Bhagavatula et al., 2015)",
+ "ref_id": "BIBREF3"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 578,
+ "end": 585,
+ "text": "Table 4",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "Scoring candidates. Given a mention (m) and a candidate entity (e), the neural model constructs a vector encoding of the mention and the entity. We encode the mention and entity using the functions f and g, respectively, as follows:",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "f.m/ D OEv m.name I avg.v m.lc ; v m.rc /;",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "g.e/ D OEv e.name I v e.def ; where m.surface, m.lc and m.rc are the mention's surface form, left and right contexts, and e.name and e.def are the candidate entity's name and definition, respectively. v text is a bag-of-words sum encoder for text. We use the same encoder for the mention surface form and the candidate name, and another encoder for the mention contexts and entity definition.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "Additionally, we include numerical features to estimate the confidence of a candidate entity based on the statistics collected in the index described Table 4 : The Bag of Concepts F1 score of the baseline and neural model on the two curated datasets.",
+ "cite_spans": [],
+ "ref_spans": [
+ {
+ "start": 150,
+ "end": 157,
+ "text": "Table 4",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "earlier. We compute two scores based on the word overlap of (i) mention's context and candidate's definition and (ii) mention's surface span and the candidate entity's name. Finally, we feed the concatenation of the cosine similarity between f.m/ and g.e/ and the intersection-based scores into an affine transformation followed by a sigmoid nonlinearity to compute the final score for the pair (m, e).",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "Results. We use the Bag of Concepts F1 metric (Ling et al., 2015) for comparison. Table 4 compares the performance of the most-frequent-entity baseline and our neural model described above.",
+ "cite_spans": [
+ {
+ "start": 46,
+ "end": 65,
+ "text": "(Ling et al., 2015)",
+ "ref_id": "BIBREF17"
+ }
+ ],
+ "ref_spans": [
+ {
+ "start": 82,
+ "end": 89,
+ "text": "Table 4",
+ "ref_id": null
+ }
+ ],
+ "eq_spans": [],
+ "section": "Entity Linking Models",
+ "sec_num": "4.4"
+ },
+ {
+ "text": "In the previous sections, we discussed how we construct the main components of the literature graph. In this section, we briefly describe several other related challenges we are actively working on.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "Author disambiguation. Despite initiatives to have global author IDs ORCID and ResearcherID, most publishers provide author information as names (e.g., arXiv). However, author names cannot be used as a unique identifier since several people often share the same name. Moreover, different venues and sources use different conventions in reporting the author names, e.g., \"first initial, last name\" vs. \"last name, first name\". Inspired by Culotta et al. (2007) , we train a supervised binary classifier for merging pairs of author instances and use it to incrementally create author clusters. We only consider merging two author instances if they have the same last name and share the first initial. If the first name is spelled out (rather than abbreviated) in both author instances, we also require that the first name matches.",
+ "cite_spans": [
+ {
+ "start": 438,
+ "end": 459,
+ "text": "Culotta et al. (2007)",
+ "ref_id": "BIBREF5"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "Ontology matching. Popular concepts are often represented in multiple KBs. For example, the concept of \"artificial neural networks\" is represented as entity ID D016571 in the MESH ontology, and represented as page ID '21523' in DBpedia. Ontology matching is the problem of identifying semantically-equivalent entities across KBs or ontologies. 9 Limited KB coverage. The convenience of grounding entities in a hand-curated KB comes at the cost of limited coverage. Introduction of new concepts and relations in the scientific literature occurs at a faster pace than KB curation, resulting in a large gap in KB coverage of scientific concepts. In order to close this gap, we need to develop models which can predict textual relations as well as detailed concept descriptions in scientific papers. For the same reasons, we also need to augment the relations imported from the KB with relations extracted from text. Our approach to address both entity and relation coverage is based on distant supervision (Mintz et al., 2009) . In short, we train two models for identifying entity definitions and relations expressed in natural language in scientific documents, and automatically generate labeled data for training these models using known definitions and relations in the KB.",
+ "cite_spans": [
+ {
+ "start": 344,
+ "end": 345,
+ "text": "9",
+ "ref_id": null
+ },
+ {
+ "start": 1003,
+ "end": 1023,
+ "text": "(Mintz et al., 2009)",
+ "ref_id": "BIBREF18"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "We note that the literature graph currently lacks coverage for important entity types (e.g., affiliations) and domains (e.g., physics). Covering affiliations requires small modifications to the metadata extraction model followed by an algorithm for matching author names with their affiliations. In order to cover additional scientific domains, more agreements need to be signed with publishers.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "Figure and table extraction. Non-textual components such as charts, diagrams and tables provide key information in many scientific documents, but the lack of large labeled datasets has impeded the development of data-driven methods for scientific figure extraction. In Siegel et al. (2018) , we induced high-quality training labels for the task of figure extraction in a large number of scientific documents, with no human intervention. To accomplish this we leveraged the auxiliary data provided in two large web collections of scientific documents (arXiv and PubMed) to locate figures and their associated captions in the rasterized PDF. We use the resulting dataset to train a deep neural network for end-to-end figure detection, yielding a model that can be more easily extended to new domains compared to previous work.",
+ "cite_spans": [
+ {
+ "start": 269,
+ "end": 289,
+ "text": "Siegel et al. (2018)",
+ "ref_id": "BIBREF21"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "Understanding and predicting citations. The citation edges in the literature graph provide a wealth of information (e.g., at what rate a paper",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "Variants of this problem are also known as deduplication or record linkage.",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "is being cited and whether it is accelerating), and opens the door for further research to better understand and predict citations. For example, in order to allow users to better understand what impact a paper had and effectively navigate its citations, we experimented with methods for classifying a citation as important or incidental, as well as more finegrained classes (Valenzuela et al., 2015) . The citation information also enables us to develop models for estimating the potential of a paper or an author. In Weihs and Etzioni (2017), we predict citationbased metrics such as an author's h-index and the citation rate of a paper in the future. Also related is the problem of predicting which papers should be cited in a given draft (Bhagavatula et al., 2018) , which can help improve the quality of a paper draft before it is submitted for peer review, or used to supplement the list of references after a paper is published.",
+ "cite_spans": [
+ {
+ "start": 374,
+ "end": 399,
+ "text": "(Valenzuela et al., 2015)",
+ "ref_id": "BIBREF22"
+ },
+ {
+ "start": 741,
+ "end": 767,
+ "text": "(Bhagavatula et al., 2018)",
+ "ref_id": "BIBREF2"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Other Research Problems",
+ "sec_num": "5"
+ },
+ {
+ "text": "In this paper, we discuss the construction of a graph, providing a symbolic representation of the scientific literature. We describe deployed models for identifying authors, references and entities in the paper text, and provide experimental results to evaluate the performance of each model. Three research directions follow from this work and other similar projects, e.g., Hahn-Powell et al. (2017) ; Wu et al. (2014) : i) improving quality and enriching content of the literature graph (e.g., ontology matching and knowledge base population). ii) aggregating domain-specific extractions across many papers to enable a better understanding of the literature as a whole (e.g., identifying demographic biases in clinical trial participants and summarizing empirical results on important tasks). iii) exploring the literature via natural language interfaces.",
+ "cite_spans": [
+ {
+ "start": 375,
+ "end": 400,
+ "text": "Hahn-Powell et al. (2017)",
+ "ref_id": "BIBREF10"
+ },
+ {
+ "start": 403,
+ "end": 419,
+ "text": "Wu et al. (2014)",
+ "ref_id": "BIBREF25"
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusion and Future Work",
+ "sec_num": "6"
+ },
+ {
+ "text": "In order to help future research efforts, we make the following resources publicly available: metadata for over 20 million papers, 10 meaningful citations dataset, 11 models for figure and table extraction, 12 models for predicting citations in a paper draft 13 and models for extracting paper metadata, 14 among other resources. 15 ",
+ "cite_spans": [
+ {
+ "start": 330,
+ "end": 332,
+ "text": "15",
+ "ref_id": null
+ }
+ ],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "Conclusion and Future Work",
+ "sec_num": "6"
+ },
+ {
+ "text": "The ScienceParse libraries can be found at http:// allenai.org/software/.3 https://pdfbox.apache.org",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ },
+ {
+ "text": "http://rtw.ml.cmu.edu/rtw/ 8 https://github.com/allenai/ openie-standalone",
+ "cite_spans": [],
+ "ref_spans": [],
+ "eq_spans": [],
+ "section": "",
+ "sec_num": null
+ }
+ ],
+ "back_matter": [],
+ "bib_entries": {
+ "BIBREF0": {
+ "ref_id": "b0",
+ "title": "The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction",
+ "authors": [
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ },
+ {
+ "first": "Matthew",
+ "middle": [
+ "E"
+ ],
+ "last": "Peters",
+ "suffix": ""
+ },
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": ""
+ },
+ {
+ "first": "Russell",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ACL workshop (SemEval)",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Waleed Ammar, Matthew E. Peters, Chandra Bhagavat- ula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).",
+ "links": null
+ },
+ "BIBREF1": {
+ "ref_id": "b1",
+ "title": "Semeval 2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications",
+ "authors": [
+ {
+ "first": "Isabelle",
+ "middle": [],
+ "last": "Augenstein",
+ "suffix": ""
+ },
+ {
+ "first": "Mrinal",
+ "middle": [],
+ "last": "Das",
+ "suffix": ""
+ },
+ {
+ "first": "Sebastian",
+ "middle": [],
+ "last": "Riedel",
+ "suffix": ""
+ },
+ {
+ "first": "Lakshmi",
+ "middle": [],
+ "last": "Vikraman",
+ "suffix": ""
+ },
+ {
+ "first": "Andrew",
+ "middle": [
+ "D"
+ ],
+ "last": "Mccallum",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval 2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).",
+ "links": null
+ },
+ "BIBREF2": {
+ "ref_id": "b2",
+ "title": "Content-based citation recommendation",
+ "authors": [
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": ""
+ },
+ {
+ "first": "Sergey",
+ "middle": [],
+ "last": "Feldman",
+ "suffix": ""
+ },
+ {
+ "first": "Russell",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ },
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "NAACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Chandra Bhagavatula, Sergey Feldman, Russell Power, and Waleed Ammar. 2018. Content-based citation recommendation. In NAACL.",
+ "links": null
+ },
+ "BIBREF3": {
+ "ref_id": "b3",
+ "title": "TabEL: entity linking in web tables. In ISWC",
+ "authors": [
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": ""
+ },
+ {
+ "first": "Thanapon",
+ "middle": [],
+ "last": "Noraset",
+ "suffix": ""
+ },
+ {
+ "first": "Doug",
+ "middle": [],
+ "last": "Downey",
+ "suffix": ""
+ }
+ ],
+ "year": 2015,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Chandra Bhagavatula, Thanapon Noraset, and Doug Downey. 2015. TabEL: entity linking in web tables. In ISWC.",
+ "links": null
+ },
+ "BIBREF4": {
+ "ref_id": "b4",
+ "title": "Natural language processing (almost) from scratch",
+ "authors": [
+ {
+ "first": "Ronan",
+ "middle": [],
+ "last": "Collobert",
+ "suffix": ""
+ },
+ {
+ "first": "Jason",
+ "middle": [],
+ "last": "Weston",
+ "suffix": ""
+ },
+ {
+ "first": "L\u00e9on",
+ "middle": [],
+ "last": "Bottou",
+ "suffix": ""
+ },
+ {
+ "first": "Michael",
+ "middle": [],
+ "last": "Karlen",
+ "suffix": ""
+ },
+ {
+ "first": "Koray",
+ "middle": [],
+ "last": "Kavukcuoglu",
+ "suffix": ""
+ },
+ {
+ "first": "Pavel",
+ "middle": [
+ "P"
+ ],
+ "last": "Kuksa",
+ "suffix": ""
+ }
+ ],
+ "year": 2011,
+ "venue": "JMLR",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Ronan Collobert, Jason Weston, L\u00e9on Bottou, Michael Karlen, Koray Kavukcuoglu, and Pavel P. Kuksa. 2011. Natural language processing (almost) from scratch. In JMLR.",
+ "links": null
+ },
+ "BIBREF5": {
+ "ref_id": "b5",
+ "title": "Author disambiguation using error-driven machine learning with a ranking loss function",
+ "authors": [
+ {
+ "first": "Aron",
+ "middle": [],
+ "last": "Culotta",
+ "suffix": ""
+ },
+ {
+ "first": "Pallika",
+ "middle": [],
+ "last": "Kanani",
+ "suffix": ""
+ },
+ {
+ "first": "Robert",
+ "middle": [],
+ "last": "Hall",
+ "suffix": ""
+ },
+ {
+ "first": "Michael",
+ "middle": [],
+ "last": "Wick",
+ "suffix": ""
+ },
+ {
+ "first": "Andrew",
+ "middle": [
+ "D"
+ ],
+ "last": "Mccallum",
+ "suffix": ""
+ }
+ ],
+ "year": 2007,
+ "venue": "IIWeb Workshop",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Aron Culotta, Pallika Kanani, Robert Hall, Michael Wick, and Andrew D. McCallum. 2007. Author disambiguation using error-driven machine learning with a ranking loss function. In IIWeb Workshop.",
+ "links": null
+ },
+ "BIBREF6": {
+ "ref_id": "b6",
+ "title": "Frustratingly easy domain adaptation",
+ "authors": [
+ {
+ "first": "Hal",
+ "middle": [],
+ "last": "Daum\u00e9",
+ "suffix": ""
+ }
+ ],
+ "year": 2007,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Hal Daum\u00e9. 2007. Frustratingly easy domain adapta- tion. In ACL.",
+ "links": null
+ },
+ "BIBREF7": {
+ "ref_id": "b7",
+ "title": "MetaMap Lite: an evaluation of a new Java implementation of MetaMap",
+ "authors": [
+ {
+ "first": "Dina",
+ "middle": [],
+ "last": "Demner-Fushman",
+ "suffix": ""
+ },
+ {
+ "first": "Willie",
+ "middle": [
+ "J"
+ ],
+ "last": "Rogers",
+ "suffix": ""
+ },
+ {
+ "first": "Alan",
+ "middle": [
+ "R"
+ ],
+ "last": "Aronson",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "JAMIA",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Dina Demner-Fushman, Willie J. Rogers, and Alan R. Aronson. 2017. MetaMap Lite: an evaluation of a new Java implementation of MetaMap. In JAMIA.",
+ "links": null
+ },
+ "BIBREF8": {
+ "ref_id": "b8",
+ "title": "Search needs a shake-up",
+ "authors": [
+ {
+ "first": "Oren",
+ "middle": [
+ "Etzioni"
+ ],
+ "last": "",
+ "suffix": ""
+ }
+ ],
+ "year": 2011,
+ "venue": "Nature",
+ "volume": "476",
+ "issue": "",
+ "pages": "25--31",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Oren Etzioni. 2011. Search needs a shake-up. Nature 476 7358:25-6.",
+ "links": null
+ },
+ "BIBREF9": {
+ "ref_id": "b9",
+ "title": "TAGME: on-the-fly annotation of short text fragments (by wikipedia entities)",
+ "authors": [
+ {
+ "first": "Paolo",
+ "middle": [],
+ "last": "Ferragina",
+ "suffix": ""
+ },
+ {
+ "first": "Ugo",
+ "middle": [],
+ "last": "Scaiella",
+ "suffix": ""
+ }
+ ],
+ "year": 2010,
+ "venue": "CIKM",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Paolo Ferragina and Ugo Scaiella. 2010. TAGME: on-the-fly annotation of short text fragments (by wikipedia entities). In CIKM.",
+ "links": null
+ },
+ "BIBREF10": {
+ "ref_id": "b10",
+ "title": "Swanson linking revisited: Accelerating literature-based discovery across domains using a conceptual influence graph",
+ "authors": [
+ {
+ "first": "Gus",
+ "middle": [],
+ "last": "Hahn-Powell",
+ "suffix": ""
+ },
+ {
+ "first": "Marco",
+ "middle": [
+ "Antonio"
+ ],
+ "last": "Valenzuela-Escarcega",
+ "suffix": ""
+ },
+ {
+ "first": "Mihai",
+ "middle": [],
+ "last": "Surdeanu",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Gus Hahn-Powell, Marco Antonio Valenzuela- Escarcega, and Mihai Surdeanu. 2017. Swanson linking revisited: Accelerating literature-based dis- covery across domains using a conceptual influence graph. In ACL.",
+ "links": null
+ },
+ "BIBREF11": {
+ "ref_id": "b11",
+ "title": "Long short-term memory",
+ "authors": [
+ {
+ "first": "Sepp",
+ "middle": [],
+ "last": "Hochreiter",
+ "suffix": ""
+ },
+ {
+ "first": "J\u00fcrgen",
+ "middle": [],
+ "last": "Schmidhuber",
+ "suffix": ""
+ }
+ ],
+ "year": 1997,
+ "venue": "Neural computation",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation .",
+ "links": null
+ },
+ "BIBREF12": {
+ "ref_id": "b12",
+ "title": "Learning a neural semantic parser from user feedback",
+ "authors": [
+ {
+ "first": "Srinivasan",
+ "middle": [],
+ "last": "Iyer",
+ "suffix": ""
+ },
+ {
+ "first": "Ioannis",
+ "middle": [],
+ "last": "Konstas",
+ "suffix": ""
+ },
+ {
+ "first": "Alvin",
+ "middle": [],
+ "last": "Cheung",
+ "suffix": ""
+ },
+ {
+ "first": "Jayant",
+ "middle": [],
+ "last": "Krishnamurthy",
+ "suffix": ""
+ },
+ {
+ "first": "Luke",
+ "middle": [
+ "S"
+ ],
+ "last": "Zettlemoyer",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Srinivasan Iyer, Ioannis Konstas, Alvin Cheung, Jayant Krishnamurthy, and Luke S. Zettlemoyer. 2017. Learning a neural semantic parser from user feed- back. In ACL.",
+ "links": null
+ },
+ "BIBREF13": {
+ "ref_id": "b13",
+ "title": "Exploiting mesh indexing in medline to generate a data set for word sense disambiguation",
+ "authors": [
+ {
+ "first": "J",
+ "middle": [],
+ "last": "Antonio",
+ "suffix": ""
+ },
+ {
+ "first": "Bridget",
+ "middle": [
+ "T"
+ ],
+ "last": "Jimeno-Yepes",
+ "suffix": ""
+ },
+ {
+ "first": "Alan",
+ "middle": [
+ "R"
+ ],
+ "last": "Mcinnes",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Aronson",
+ "suffix": ""
+ }
+ ],
+ "year": 2011,
+ "venue": "BMC bioinformatics",
+ "volume": "12",
+ "issue": "1",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Antonio J. Jimeno-Yepes, Bridget T. McInnes, and Alan R. Aronson. 2011. Exploiting mesh indexing in medline to generate a data set for word sense dis- ambiguation. BMC bioinformatics 12(1):223.",
+ "links": null
+ },
+ "BIBREF14": {
+ "ref_id": "b14",
+ "title": "CHEMDNER: The drugs and chemical names extraction challenge",
+ "authors": [
+ {
+ "first": "Martin",
+ "middle": [],
+ "last": "Krallinger",
+ "suffix": ""
+ },
+ {
+ "first": "Florian",
+ "middle": [],
+ "last": "Leitner",
+ "suffix": ""
+ },
+ {
+ "first": "Obdulia",
+ "middle": [],
+ "last": "Rabal",
+ "suffix": ""
+ },
+ {
+ "first": "Miguel",
+ "middle": [],
+ "last": "Vazquez",
+ "suffix": ""
+ }
+ ],
+ "year": 2015,
+ "venue": "In J. Cheminformatics",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Martin Krallinger, Florian Leitner, Obdulia Rabal, Miguel Vazquez, Julen Oyarzabal, and Alfonso Va- lencia. 2015. CHEMDNER: The drugs and chemi- cal names extraction challenge. In J. Cheminformat- ics.",
+ "links": null
+ },
+ "BIBREF15": {
+ "ref_id": "b15",
+ "title": "Neural architectures for named entity recognition",
+ "authors": [
+ {
+ "first": "Guillaume",
+ "middle": [],
+ "last": "Lample",
+ "suffix": ""
+ },
+ {
+ "first": "Miguel",
+ "middle": [],
+ "last": "Ballesteros",
+ "suffix": ""
+ },
+ {
+ "first": "K",
+ "middle": [],
+ "last": "Sandeep",
+ "suffix": ""
+ },
+ {
+ "first": "Kazuya",
+ "middle": [],
+ "last": "Subramanian",
+ "suffix": ""
+ },
+ {
+ "first": "Chris",
+ "middle": [],
+ "last": "Kawakami",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Dyer",
+ "suffix": ""
+ }
+ ],
+ "year": 2016,
+ "venue": "HLT-NAACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Guillaume Lample, Miguel Ballesteros, Sandeep K Subramanian, Kazuya Kawakami, and Chris Dyer. 2016. Neural architectures for named entity recog- nition. In HLT-NAACL.",
+ "links": null
+ },
+ "BIBREF16": {
+ "ref_id": "b16",
+ "title": "Biocreative v cdr task corpus: a resource for chemical disease relation extraction. Database : the journal of biological databases and curation",
+ "authors": [
+ {
+ "first": "Jiao",
+ "middle": [],
+ "last": "Li",
+ "suffix": ""
+ },
+ {
+ "first": "Yueping",
+ "middle": [],
+ "last": "Sun",
+ "suffix": ""
+ },
+ {
+ "first": "Robin",
+ "middle": [
+ "J"
+ ],
+ "last": "Johnson",
+ "suffix": ""
+ },
+ {
+ "first": "Daniela",
+ "middle": [],
+ "last": "Sciaky",
+ "suffix": ""
+ },
+ {
+ "first": "Chih-Hsuan",
+ "middle": [],
+ "last": "Wei",
+ "suffix": ""
+ },
+ {
+ "first": "Robert",
+ "middle": [],
+ "last": "Leaman",
+ "suffix": ""
+ },
+ {
+ "first": "Allan",
+ "middle": [
+ "Peter"
+ ],
+ "last": "Davis",
+ "suffix": ""
+ },
+ {
+ "first": "Carolyn",
+ "middle": [
+ "J"
+ ],
+ "last": "Mattingly",
+ "suffix": ""
+ },
+ {
+ "first": "Thomas",
+ "middle": [
+ "C"
+ ],
+ "last": "Wiegers",
+ "suffix": ""
+ },
+ {
+ "first": "Zhiyong",
+ "middle": [],
+ "last": "Lu",
+ "suffix": ""
+ }
+ ],
+ "year": 2016,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Jiao Li, Yueping Sun, Robin J. Johnson, Daniela Sci- aky, Chih-Hsuan Wei, Robert Leaman, Allan Peter Davis, Carolyn J. Mattingly, Thomas C. Wiegers, and Zhiyong Lu. 2016. Biocreative v cdr task cor- pus: a resource for chemical disease relation extrac- tion. Database : the journal of biological databases and curation 2016.",
+ "links": null
+ },
+ "BIBREF17": {
+ "ref_id": "b17",
+ "title": "Design challenges for entity linking",
+ "authors": [
+ {
+ "first": "Xiao",
+ "middle": [],
+ "last": "Ling",
+ "suffix": ""
+ },
+ {
+ "first": "Sameer",
+ "middle": [],
+ "last": "Singh",
+ "suffix": ""
+ },
+ {
+ "first": "Daniel",
+ "middle": [
+ "S"
+ ],
+ "last": "Weld",
+ "suffix": ""
+ }
+ ],
+ "year": 2015,
+ "venue": "Transactions of the Association for Computational Linguistics",
+ "volume": "3",
+ "issue": "",
+ "pages": "315--328",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Xiao Ling, Sameer Singh, and Daniel S. Weld. 2015. Design challenges for entity linking. Transactions of the Association for Computational Linguistics 3:315-328.",
+ "links": null
+ },
+ "BIBREF18": {
+ "ref_id": "b18",
+ "title": "Distant supervision for relation extraction without labeled data",
+ "authors": [
+ {
+ "first": "Mike",
+ "middle": [],
+ "last": "Mintz",
+ "suffix": ""
+ },
+ {
+ "first": "Steven",
+ "middle": [],
+ "last": "Bills",
+ "suffix": ""
+ }
+ ],
+ "year": 2009,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Mike Mintz, Steven Bills, Rion Snow, and Daniel Ju- rafsky. 2009. Distant supervision for relation extrac- tion without labeled data. In ACL.",
+ "links": null
+ },
+ "BIBREF19": {
+ "ref_id": "b19",
+ "title": "GloVe: Global vectors for word representation",
+ "authors": [
+ {
+ "first": "Jeffrey",
+ "middle": [],
+ "last": "Pennington",
+ "suffix": ""
+ },
+ {
+ "first": "Richard",
+ "middle": [],
+ "last": "Socher",
+ "suffix": ""
+ },
+ {
+ "first": "Christopher",
+ "middle": [
+ "D"
+ ],
+ "last": "Manning",
+ "suffix": ""
+ }
+ ],
+ "year": 2014,
+ "venue": "EMNLP",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global vectors for word rep- resentation. In EMNLP.",
+ "links": null
+ },
+ "BIBREF20": {
+ "ref_id": "b20",
+ "title": "Semi-supervised sequence tagging with bidirectional language models",
+ "authors": [
+ {
+ "first": "Matthew",
+ "middle": [
+ "E"
+ ],
+ "last": "Peters",
+ "suffix": ""
+ },
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ },
+ {
+ "first": "Chandra",
+ "middle": [],
+ "last": "Bhagavatula",
+ "suffix": ""
+ },
+ {
+ "first": "Russell",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "ACL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Matthew E. Peters, Waleed Ammar, Chandra Bhagavat- ula, and Russell Power. 2017. Semi-supervised se- quence tagging with bidirectional language models. In ACL.",
+ "links": null
+ },
+ "BIBREF21": {
+ "ref_id": "b21",
+ "title": "Extracting scientific figures with distantly supervised neural networks",
+ "authors": [
+ {
+ "first": "Noah",
+ "middle": [],
+ "last": "Siegel",
+ "suffix": ""
+ },
+ {
+ "first": "Nicholas",
+ "middle": [],
+ "last": "Lourie",
+ "suffix": ""
+ },
+ {
+ "first": "Russell",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ },
+ {
+ "first": "Waleed",
+ "middle": [],
+ "last": "Ammar",
+ "suffix": ""
+ }
+ ],
+ "year": 2018,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Noah Siegel, Nicholas Lourie, Russell Power, and Waleed Ammar. 2018. Extracting scientific figures with distantly supervised neural networks. In JCDL.",
+ "links": null
+ },
+ "BIBREF22": {
+ "ref_id": "b22",
+ "title": "Identifying meaningful citations",
+ "authors": [
+ {
+ "first": "Marco",
+ "middle": [],
+ "last": "Valenzuela",
+ "suffix": ""
+ },
+ {
+ "first": "Vu",
+ "middle": [],
+ "last": "Ha",
+ "suffix": ""
+ },
+ {
+ "first": "Oren",
+ "middle": [],
+ "last": "Etzioni",
+ "suffix": ""
+ }
+ ],
+ "year": 2015,
+ "venue": "AAAI Workshop (Scholarly Big Data)",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Marco Valenzuela, Vu Ha, and Oren Etzioni. 2015. Identifying meaningful citations. In AAAI Workshop (Scholarly Big Data).",
+ "links": null
+ },
+ "BIBREF23": {
+ "ref_id": "b23",
+ "title": "Clinical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a systematic review and meta-analysis",
+ "authors": [
+ {
+ "first": "Xiang",
+ "middle": [],
+ "last": "Wang",
+ "suffix": ""
+ },
+ {
+ "first": "Yan",
+ "middle": [],
+ "last": "Dong",
+ "suffix": ""
+ },
+ {
+ "first": "Yi-Ming",
+ "middle": [],
+ "last": "Xiang Qian Qi",
+ "suffix": ""
+ },
+ {
+ "first": "Cheng-Guang",
+ "middle": [],
+ "last": "Li",
+ "suffix": ""
+ },
+ {
+ "first": "Lijun",
+ "middle": [],
+ "last": "Huang",
+ "suffix": ""
+ },
+ {
+ "first": "",
+ "middle": [],
+ "last": "Hou",
+ "suffix": ""
+ }
+ ],
+ "year": 2013,
+ "venue": "Critical care",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Xiang Wang, Yan Dong, Xiang qian Qi, Yi-Ming Li, Cheng-Guang Huang, and Lijun Hou. 2013. Clin- ical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a system- atic review and meta-analysis. In Critical care.",
+ "links": null
+ },
+ "BIBREF24": {
+ "ref_id": "b24",
+ "title": "Learning to predict citation-based impact measures",
+ "authors": [
+ {
+ "first": "Luca",
+ "middle": [],
+ "last": "Weihs",
+ "suffix": ""
+ },
+ {
+ "first": "Oren",
+ "middle": [],
+ "last": "Etzioni",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "JCDL",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Luca Weihs and Oren Etzioni. 2017. Learning to pre- dict citation-based impact measures. In JCDL.",
+ "links": null
+ },
+ "BIBREF25": {
+ "ref_id": "b25",
+ "title": "CiteSeerX: AI in a digital library search engine",
+ "authors": [
+ {
+ "first": "Jian",
+ "middle": [],
+ "last": "Wu",
+ "suffix": ""
+ },
+ {
+ "first": "Kyle",
+ "middle": [],
+ "last": "Williams",
+ "suffix": ""
+ },
+ {
+ "first": "Hung-Hsuan",
+ "middle": [],
+ "last": "Chen",
+ "suffix": ""
+ },
+ {
+ "first": "Madian",
+ "middle": [],
+ "last": "Khabsa",
+ "suffix": ""
+ },
+ {
+ "first": "Cornelia",
+ "middle": [],
+ "last": "Caragea",
+ "suffix": ""
+ },
+ {
+ "first": "Alexander",
+ "middle": [],
+ "last": "Ororbia",
+ "suffix": ""
+ },
+ {
+ "first": "Douglas",
+ "middle": [],
+ "last": "Jordan",
+ "suffix": ""
+ },
+ {
+ "first": "C. Lee",
+ "middle": [],
+ "last": "Giles",
+ "suffix": ""
+ }
+ ],
+ "year": 2014,
+ "venue": "AAAI",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Jian Wu, Kyle Williams, Hung-Hsuan Chen, Madian Khabsa, Cornelia Caragea, Alexander Ororbia, Dou- glas Jordan, and C. Lee Giles. 2014. CiteSeerX: AI in a digital library search engine. In AAAI.",
+ "links": null
+ },
+ "BIBREF26": {
+ "ref_id": "b26",
+ "title": "Explicit semantic ranking for academic search via knowledge graph embedding",
+ "authors": [
+ {
+ "first": "Chenyan",
+ "middle": [],
+ "last": "Xiong",
+ "suffix": ""
+ },
+ {
+ "first": "Russell",
+ "middle": [],
+ "last": "Power",
+ "suffix": ""
+ },
+ {
+ "first": "Jamie",
+ "middle": [],
+ "last": "Callan",
+ "suffix": ""
+ }
+ ],
+ "year": 2017,
+ "venue": "",
+ "volume": "",
+ "issue": "",
+ "pages": "",
+ "other_ids": {},
+ "num": null,
+ "urls": [],
+ "raw_text": "Chenyan Xiong, Russell Power, and Jamie Callan. 2017. Explicit semantic ranking for academic search via knowledge graph embedding. In WWW.",
+ "links": null
+ }
+ },
+ "ref_entries": {}
+ }
+}
\ No newline at end of file
diff --git a/s2orc-doc2json/tests/pdf/N18-3011.tei.xml b/s2orc-doc2json/tests/pdf/N18-3011.tei.xml
new file mode 100644
index 0000000000000000000000000000000000000000..c92c05ab7743fab976ca6ab6e5acc025afc09b18
--- /dev/null
+++ b/s2orc-doc2json/tests/pdf/N18-3011.tei.xml
@@ -0,0 +1,830 @@
+
+
+
+
+
+ Construction of the Literature Graph in Semantic Scholar
+
+
+
+
+
+
+
+
+
+ Waleed Ammar
+ waleeda@allenai.org
+
+
+ Dirk Groeneveld
+
+
+ Chandra Bhagavatula
+
+
+ Iz Beltagy
+
+
+ Miles Crawford
+
+
+ Doug Downey
+
+
+ Jason Dunkelberger
+
+
+ Ahmed Elgohary
+
+
+ Sergey Feldman
+
+
+ Vu Ha
+
+
+ Rodney Kinney
+
+
+ Sebastian Kohlmeier
+
+
+ Kyle Lo
+
+
+ Tyler Murray
+
+
+ Hsu-Han Ooi
+
+
+ Matthew Peters
+
+
+ Joanna Power
+
+
+ Sam Skjonsberg
+
+
+ Lucy Lu Wang
+
+
+ Chris Wilhelm
+
+
+ Zheng Yuan
+
+
+ Madeleine Van Zuylen
+
+
+ Oren Etzioni
+
+
+
+ Allen Institute for Artificial Intelligence
+
+ 98103
+ Seattle
+ WA
+ USA
+
+
+
+
+
+ Northwestern University
+
+ 60208
+ Evanston
+ IL
+ USA
+
+
+
+
+
+ Introduction
+
+
+ Construction of the Literature Graph in Semantic Scholar
+
+
+
+
+
+
+
+
+
+
+
+
+ GROBID - A machine learning software for extracting information from scholarly documents
+
+
+
+
+
+
+ We describe a deployed scalable system for organizing published scientific literature into a heterogeneous graph to facilitate algorithmic manipulation and discovery. The resulting literature graph consists of more than 280M nodes, representing papers, authors, entities and various interactions between them (e.g., authorships, citations, entity mentions). We reduce literature graph construction into familiar NLP tasks (e.g., entity extraction and linking), point out research challenges due to differences from standard formulations of these tasks, and report empirical results for each task. The methods described in this paper are used to enable semantic features in www.semanticscholar.org. Due to space constraints, we opted not to discuss our relation extraction models in this draft.
+
+
+
+
+
+Introduction
The goal of this work is to facilitate algorithmic discovery in the scientific literature. Despite notable advances in scientific search engines, data mining and digital libraries (e.g., [Wu et al., 2014)](#b25), researchers remain unable to answer simple questions such as:
What is the percentage of female subjects in depression clinical trials?
Which of my co-authors published one or more papers on coreference resolution?
Which papers discuss the effects of Ranibizumab on the Retina?
In this paper, we focus on the problem of extracting structured data from scientific documents, which can later be used in natural language interfaces (e.g., [Iyer et al., 2017)](#b12) or to improve ranking of results in academic search (e.g., Xiong et al., 2017). [Figure 1]: Part of the literature graph. We describe methods used in a scalable deployed production system for extracting structured information from scientific documents into the literature graph (see [Fig. 1]). The literature graph is a directed property graph which summarizes key information in the literature and can be used to answer the queries mentioned earlier as well as more complex queries. For example, in order to compute the Erdős number of an author X, the graph can be queried to find the number of nodes on the shortest undirected path between author X and Paul Erdős such that all edges on the path are labeled "authored".
We reduce literature graph construction into familiar NLP tasks such as sequence labeling, entity linking and relation extraction, and address some of the impractical assumptions commonly made in the standard formulations of these tasks. For example, most research on named entity recognition tasks report results on large labeled datasets such as CoNLL-2003 and ACE-2005 (e.g., Lample et al., 2016), and assume that entity types in the test set match those labeled in the training set (including work on domain adaptation, e.g., [Daumé, 2007)](#b6). These assumptions, while useful for developing and benchmarking new methods, are unrealistic for many domains and applications. The paper also serves as an overview of the approach we adopt at www.semanticscholar.org in a step towards more intelligent academic search engines [(Etzioni, 2011)](#b8).
In the next section, we start by describing our symbolic representation of the literature. Then, we discuss how we extract metadata associated with a paper such as authors and references, then how we extract the entities mentioned in paper text. Before we conclude, we briefly describe other research challenges we are actively working on in order to improve the quality of the literature graph.
+Structure of The Literature Graph
The literature graph is a property graph with directed edges. Unlike Resource Description Framework (RDF) graphs, nodes and edges in property graphs have an internal structure which is more suitable for representing complex data types such as papers and entities. In this section, we describe the attributes associated with nodes and edges of different types in the literature graph.
+Node Types
Papers. We obtain metadata and PDF files of papers via partnerships with publishers (e.g., Springer, Nature), catalogs (e.g., DBLP, MEDLINE), pre-publishing services (e.g., arXiv, bioRxiv), as well as web-crawling. Paper nodes are associated with a set of attributes such as 'title', 'abstract', 'full text', 'venues' and 'publication year'. While some of the paper sources provide these attributes as metadata, it is often necessary to extract them from the paper PDF (details in §3). We deterministically remove duplicate papers based on string similarity of their metadata, resulting in 37M unique paper nodes. Papers in the literature graph cover a variety of scientific disciplines, including computer science, molecular biology, microbiology and neuroscience.
Authors. Each node of this type represents a unique author, with attributes such as 'first name' and 'last name'. The literature graph has 12M nodes of this type.
Entities. Each node of this type represents a unique scientific concept discussed in the literature, with attributes such as 'canonical name', 'aliases' and 'description'. Our literature graph has 0.4M nodes of this type. We describe how we populate entity nodes in §4.3.
Entity mentions. Each node of this type represents a textual reference of an entity in one of the papers, with attributes such as 'mention text', 'context', and 'confidence'. We describe how we populate the 237M mentions in the literature graph in §4.1.
+Edge Types
Citations. We instantiate a directed citation edge from paper nodes $p_1 \rightarrow p_2$ for each $p_2$ referenced in $p_1$. Citation edges have attributes such as 'from paper id', 'to paper id' and 'contexts' (the textual contexts where $p_2$ is referenced in $p_1$). While some of the paper sources provide these attributes as metadata, it is often necessary to extract them from the paper PDF as detailed in §3.
Authorship. We instantiate a directed authorship edge between an author node and a paper node $a \rightarrow p$ for each author of that paper.
Entity linking edges. We instantiate a directed edge from an extracted entity mention node to the entity it refers to.
Mention-mention relations. We instantiate a directed edge between a pair of mentions in the same sentential context if the textual relation extraction model predicts one of a predefined list of relation types between them in a sentential context. [1] We encode a symmetric relation between $m_1$ and $m_2$ as two directed edges $m_1 \rightarrow m_2$ and $m_2 \rightarrow m_1$.
Entity-entity relations. While mention-mention edges represent relations between mentions in a particular context, entity-entity edges represent relations between abstract entities. These relations may be imported from an existing knowledge base (KB) or inferred from other edges in the graph.
+Extracting Metadata
In the previous section, we described the overall structure of the literature graph. Next, we discuss how we populate paper nodes, author nodes, authorship edges, and citation edges.
Although some publishers provide sufficient metadata about their papers, many papers are provided with incomplete metadata. Also, papers obtained via web-crawling are not associated with any metadata. To fill in this gap, we built the ScienceParse system to predict structured data from the raw PDFs using recurrent neural networks (RNNs). [2] For each paper, the system extracts the paper title, list of authors, and list of references; each reference consists of a title, a list of authors, a venue, and a year.
Preparing the input layer. We split each PDF into individual pages, and feed each page to Apache's PDFBox library 3 to convert it into a sequence of tokens, where each token has features, e.g., 'text', 'font size', 'space width', 'position on the page'.
We normalize the token-level features before feeding them as inputs to the model. For each of the 'font size' and 'space width' features, we compute three normalized values (with respect to current page, current document, and the whole training corpus), each value ranging between -0.5 to +0.5. The token's 'position on the page' is given in XY coordinate points. We scale the values linearly to range from $(-0.5, -0.5)$ at the top-left corner of the page to $(0.5, 0.5)$ at the bottom-right corner.
In order to capture case information, we add seven numeric features to the input representation of each token: whether the first/second letter is uppercase/lowercase, the fraction of uppercase/lowercase letters and the fraction of digits.
To help the model make correct predictions for metadata which tend to appear at the beginning (e.g., titles and authors) or at the end of papers (e.g., references), we provide the current page number as two discrete variables (relative to the beginning and end of the PDF file) with values 0, 1 and 2+. These features are repeated for each token on the same page.
For the $k$-th token in the sequence, we compute the input representation $i_k$ by concatenating the numeric features, an embedding of the 'font size', and the word embedding of the lowercased token. Word embeddings are initialized with GloVe [(Pennington et al., 2014)](#b19).
Model. The input token representations are passed through one fully-connected layer and then
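$\overrightarrow{g}_k = \mathrm{LSTM}(W i_k, \overrightarrow{g}_{k-1}), \quad g_k = [\overrightarrow{g}_k; \overleftarrow{g}_k], \quad \overrightarrow{h}_k = \mathrm{LSTM}(g_k, \overrightarrow{h}_{k-1}), \quad h_k = [\overrightarrow{h}_k; \overleftarrow{h}_k],$ where $W$ is a weight matrix, $\overleftarrow{g}_k$ and $\overleftarrow{h}_k$ are defined similarly to $\overrightarrow{g}_k$ and $\overrightarrow{h}_k$ but process token sequences in the opposite direction.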
Following Collobert et al. [2011], we feed the output of the second layer $h_k$ into a dense layer to predict unnormalized label weights for each token and learn label bigram feature weights (often described as a conditional random field layer when used in neural architectures) to account for dependencies between labels.
Training. The ScienceParse system is trained on a snapshot of the data at PubMed Central. It consists of 1.4M PDFs and their associated metadata, which specify the correct titles, authors, and bibliographies. We use a heuristic labeling process that finds the strings from the metadata in the tokenized PDFs to produce labeled tokens. This labeling process succeeds for 76% of the documents. The remaining documents are not used in the training process. During training, we only use pages which have at least one token with a label that is not "none".
Decoding. At test time, we use Viterbi decoding to find the most likely global sequence, with no further constraints. To get the title, we use the longest continuous sequence of tokens with the "title" label. Since there can be multiple authors, we use all continuous sequences of tokens with the "author" label as authors, but require that all authors of a paper are mentioned on the same page. If the author labels are predicted in multiple pages, we use the one with the largest number of authors.
Results. We run our final tests on a held-out set from PubMed Central, consisting of about 54K documents. The results are detailed in [Table 1]. We use a conservative evaluation where an instance is correct if it exactly matches the gold annotation, with no credit for partial matching.
To give an example for the type of errors our model makes, consider the paper [(Wang et al., 2013)](#b23) titled "Clinical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a systematic review and meta-analysis." The title we extract for this paper omits the first part "Clinical review:". This is likely to be a result of the pattern "Foo: Bar Baz" appearing in many training examples with only "Bar Baz" labeled as the title.
+Entity Extraction and Linking
In the previous section, we described how we populate the backbone of the literature graph, i.e., paper nodes, author nodes and citation edges. Next, we discuss how we populate mentions and entities in the literature graph using entity extraction and linking on the paper text. In order to focus on more salient entities in a given paper, we only use the title and abstract.
+Approaches
We experiment with three approaches for entity extraction and linking: I. Statistical: uses one or more statistical models for predicting mention spans, then uses another statistical model to link mentions to candidate entities in a KB.
II. Hybrid: defines a small number of hand-engineered, deterministic rules for string-based matching of the input text to candidate entities in the KB, then uses a statistical model to disambiguate the mentions. [4] III. Off-the-shelf: uses existing libraries, namely (Ferragina and Scaiella, 2010, TagMe) [5] and (Demner-Fushman et al., 2017, MetaMap Lite) [6], with minimal post-processing to extract and link entities to the KB.
We also experimented with a "pure" rules-based approach which disambiguates deterministically but the hybrid approach consistently gave better results.
5 The TagMe APIs are described at https://sobigdata.d4science.org/web/tagme/tagme-help [6] We use v3. [Table 2]: Document-level evaluation of three approaches in two scientific areas: computer science (CS) and biomedical (Bio).
We evaluate the performance of each approach in two broad scientific areas: computer science (CS) and biomedical research (Bio). For each unique (paper ID, entity ID) pair predicted by one of the approaches, we ask human annotators to label each mention extracted for this entity in the paper. We use CrowdFlower to manage human annotations and only include instances where three or more annotators agree on the label. If one or more of the entity mentions in that paper is judged to be correct, the pair (paper ID, entity ID) counts as one correct instance. Otherwise, it counts as an incorrect instance. We report 'yield' in lieu of 'recall' due to the difficulty of doing a scalable comprehensive annotation. [Table 2] shows the results based on 500 papers using v1.1.2 of our entity extraction and linking components. In both domains, the statistical approach gives the highest precision and the lowest yield. The hybrid approach consistently gives the highest yield, but sacrifices precision. The TagMe off-the-shelf library used for the CS domain gives surprisingly good results, with precision within 1 point from the statistical models. However, the MetaMap Lite off-the-shelf library we used for the biomedical domain suffered a huge loss in precision. Our error analysis showed that each of the approaches is able to predict entities not predicted by the other approaches so we decided to pool their outputs in our deployed system, which gives significantly higher yield than any individual approach while maintaining reasonably high precision.
+Entity Extraction Models
Given the token sequence $t_1, \ldots, t_N$ in a sentence, we need to identify spans which correspond to entity mentions. We use the BILOU scheme to encode labels at the token level. Unlike most formulations of named entity recognition problems (NER), we do not identify the entity type (e.g., protein, drug, chemical, disease) for each mention since the output mentions are further grounded in a KB with further information about the entity (including its type), using an entity linking module.
Model. First, we construct the token embedding $x_k = [c_k; w_k]$ for each token $t_k$ in the input sequence, where $c_k$ is a character-based representation computed using a convolutional neural network (CNN) with filter of size 3 characters, and $w_k$ are learned word embeddings initialized with the GloVe embeddings [(Pennington et al., 2014)](#b19).
We also compute context-sensitive word embeddings, denoted as $\mathrm{lm}_k = [\overrightarrow{\mathrm{lm}}_k; \overleftarrow{\mathrm{lm}}_k]$, by concatenating the projected outputs of forward and backward recurrent neural network language models (RNN-LM) at position $k$. The language model (LM) for each direction is trained independently and consists of a single layer long short-term memory (LSTM) network followed by a linear projection layer. While training the LM parameters, $\overrightarrow{\mathrm{lm}}_k$ is used to predict $t_{k+1}$ and $\overleftarrow{\mathrm{lm}}_k$ is used to predict $t_{k-1}$. We fix the LM parameters during training of the entity extraction model. See Peters et al. (2017) and Ammar et al. (2017) for more details.
Given the $x_k$ and $\mathrm{lm}_k$ embeddings for each token $k \in \{1, \ldots, N\}$, we use a two-layer bidirectional LSTM to encode the sequence with $x_k$ and $\mathrm{lm}_k$ feeding into the first and second layer, respectively. That is,
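$\overrightarrow{g}_k = \mathrm{LSTM}(x_k, \overrightarrow{g}_{k-1}), \quad g_k = [\overrightarrow{g}_k; \overleftarrow{g}_k], \quad \overrightarrow{h}_k = \mathrm{LSTM}([g_k; \mathrm{lm}_k], \overrightarrow{h}_{k-1}), \quad h_k = [\overrightarrow{h}_k; \overleftarrow{h}_k],$ where $\overleftarrow{g}_k$ and $\overleftarrow{h}_k$ are defined similarly to $\overrightarrow{g}_k$ and $\overrightarrow{h}_k$ but process token sequences in the opposite direction. Similar to the model described in §3, we feed the output of the second LSTM into a dense layer to predict unnormalized label weights for each token and learn label bigram feature weights to account for dependencies between labels.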
Results. We use the standard data splits of the SemEval-2017 Task 10 on entity (and relation) extraction from scientific papers [(Augenstein et al., 2017)](#b1). [Table 3] compares three variants of our entity extraction model. The first line omits the LM embeddings lm k , while the second line is the full model (including LM embeddings) showing a large improvement of 4.2 F1 points. The third line shows that creating an ensemble of 15 models further improves the results by 1.1 F1 points.
Model instances. In the deployed system, we use three instances of the entity extraction model with a similar architecture, but trained on different datasets. Two instances are trained on the BC5CDR [(Li et al., 2016)](#b16) and the CHEMDNER datasets [(Krallinger et al., 2015)](#b14) to extract key entity mentions in the biomedical domain such as diseases, drugs and chemical compounds. The third instance is trained on mention labels induced from Wikipedia articles in the computer science domain.
[Table 3]: Results of the entity extraction model on the development set of SemEval-2017 task 10. Description / F1: Without LM, 49.9; With LM, 54.1; Avg. of 15 models with LM, 55.2.
The output of all model instances are pooled together and combined with the rule-based entity extraction module, then fed into the entity linking model (described below).
+Knowledge Bases
In this section, we describe the construction of entity nodes and entity-entity edges. Unlike other knowledge extraction systems such as the Never-Ending Language Learner (NELL) 7 and OpenIE 4, 8 we use existing knowledge bases (KBs) of entities to reduce the burden of identifying coherent concepts. Grounding the entity mentions in a manually-curated KB also increases user confidence in automated predictions. We use two KBs: UMLS: The UMLS metathesaurus integrates information about concepts in specialized ontologies in several biomedical domains, and is funded by the U.S. National Library of Medicine. DBpedia: DBpedia provides access to structured information in Wikipedia. Rather than including all Wikipedia pages, we used a short list of Wikipedia categories about CS and included all pages up to depth four in their trees in order to exclude irrelevant entities, e.g., "Lord of the Rings" in DBpedia.
+Entity Linking Models
Given a text span s identified by the entity extraction model in §4.2 (or with heuristics) and a reference KB, the goal of the entity linking model is to associate the span with the entity it refers to. A span and its surrounding words are collectively referred to as a mention. We first identify a set of candidate entities that a given mention may refer to. Then, we rank the candidate entities based on a score computed using a neural model trained on labeled data.
For example, given the string ". . . database of facts, an ILP system will . . . ", the entity extraction model identifies the span "ILP" as a possible entity and the entity linking model associates it with "Inductive_Logic_Programming" as the referent entity (from among other candidates like "Integer_Linear_Programming" or "Instruction-level_Parallelism").
Datasets. We used two datasets: i) a biomedical dataset formed by combining MSH (Jimeno-Yepes et al., 2011) and BC5CDR [(Li et al., 2016)](#b16) with UMLS as the reference KB, and ii) a CS dataset we curated using Wikipedia articles about CS concepts with DBpedia as the reference KB.
Candidate selection. In a preprocessing step, we build an index which maps any token used in a labeled mention or an entity name in the KB to associated entity IDs, along with the frequency this token is associated with that entity. This is similar to the index used in previous entity linking systems (e.g., [Bhagavatula et al., 2015)](#b3) to estimate the probability that a given mention refers to an entity. At train and test time, we use this index to find candidate entities for a given mention by looking up the tokens in the mention. This method also serves as our baseline in [Table 4] by selecting the entity with the highest frequency for a given mention.
Scoring candidates. Given a mention (m) and a candidate entity (e), the neural model constructs a vector encoding of the mention and the entity. We encode the mention and entity using the functions f and g, respectively, as follows:
$f(m) = [v_{m.\text{name}}; \mathrm{avg}(v_{m.\text{lc}}, v_{m.\text{rc}})], \quad g(e) = [v_{e.\text{name}}; v_{e.\text{def}}],$ where m.surface, m.lc and m.rc are the mention's surface form, left and right contexts, and e.name and e.def are the candidate entity's name and definition, respectively. $v_{\text{text}}$ is a bag-of-words sum encoder for text. We use the same encoder for the mention surface form and the candidate name, and another encoder for the mention contexts and entity definition.
Additionally, we include numerical features to estimate the confidence of a candidate entity based on the statistics collected in the index described earlier. We compute two scores based on the word overlap of (i) mention's context and candidate's definition and (ii) mention's surface span and the candidate entity's name. Finally, we feed the concatenation of the cosine similarity between $f(m)$ and $g(e)$ and the intersection-based scores into an affine transformation followed by a sigmoid nonlinearity to compute the final score for the pair (m, e).
[Table 4]: The Bag of Concepts F1 score of the baseline and neural model on the two curated datasets.
Results. We use the Bag of Concepts F1 metric [(Ling et al., 2015)](#b17) for comparison. [Table 4] compares the performance of the most-frequent-entity baseline and our neural model described above.
+Other Research Problems
In the previous sections, we discussed how we construct the main components of the literature graph. In this section, we briefly describe several other related challenges we are actively working on.
Author disambiguation. Despite initiatives to have global author IDs ORCID and ResearcherID, most publishers provide author information as names (e.g., arXiv). However, author names cannot be used as a unique identifier since several people often share the same name. Moreover, different venues and sources use different conventions in reporting the author names, e.g., "first initial, last name" vs. "last name, first name". Inspired by [Culotta et al. (2007)](#b5), we train a supervised binary classifier for merging pairs of author instances and use it to incrementally create author clusters. We only consider merging two author instances if they have the same last name and share the first initial. If the first name is spelled out (rather than abbreviated) in both author instances, we also require that the first name matches.
Ontology matching. Popular concepts are often represented in multiple KBs. For example, the concept of "artificial neural networks" is represented as entity ID D016571 in the MESH ontology, and represented as page ID '21523' in DBpedia. Ontology matching is the problem of identifying semantically-equivalent entities across KBs or ontologies. [9] Limited KB coverage. The convenience of grounding entities in a hand-curated KB comes at the cost of limited coverage. Introduction of new concepts and relations in the scientific literature occurs at a faster pace than KB curation, resulting in a large gap in KB coverage of scientific concepts. In order to close this gap, we need to develop models which can predict textual relations as well as detailed concept descriptions in scientific papers. For the same reasons, we also need to augment the relations imported from the KB with relations extracted from text. Our approach to address both entity and relation coverage is based on distant supervision [(Mintz et al., 2009)](#b18). In short, we train two models for identifying entity definitions and relations expressed in natural language in scientific documents, and automatically generate labeled data for training these models using known definitions and relations in the KB.
We note that the literature graph currently lacks coverage for important entity types (e.g., affiliations) and domains (e.g., physics). Covering affiliations requires small modifications to the metadata extraction model followed by an algorithm for matching author names with their affiliations. In order to cover additional scientific domains, more agreements need to be signed with publishers.
Figure and table extraction. Non-textual components such as charts, diagrams and tables provide key information in many scientific documents, but the lack of large labeled datasets has impeded the development of data-driven methods for scientific figure extraction. In [Siegel et al. (2018)](#b21), we induced high-quality training labels for the task of figure extraction in a large number of scientific documents, with no human intervention. To accomplish this we leveraged the auxiliary data provided in two large web collections of scientific documents (arXiv and PubMed) to locate figures and their associated captions in the rasterized PDF. We use the resulting dataset to train a deep neural network for end-to-end figure detection, yielding a model that can be more easily extended to new domains compared to previous work.
[9] Variants of this problem are also known as deduplication or record linkage.
Understanding and predicting citations. The citation edges in the literature graph provide a wealth of information (e.g., at what rate a paper is being cited and whether it is accelerating), and open the door for further research to better understand and predict citations. For example, in order to allow users to better understand what impact a paper had and effectively navigate its citations, we experimented with methods for classifying a citation as important or incidental, as well as more fine-grained classes [(Valenzuela et al., 2015)](#b22). The citation information also enables us to develop models for estimating the potential of a paper or an author. In Weihs and Etzioni (2017), we predict citation-based metrics such as an author's h-index and the citation rate of a paper in the future. Also related is the problem of predicting which papers should be cited in a given draft [(Bhagavatula et al., 2018)](#b2), which can help improve the quality of a paper draft before it is submitted for peer review, or used to supplement the list of references after a paper is published.
+Conclusion and Future Work
In this paper, we discuss the construction of a graph, providing a symbolic representation of the scientific literature. We describe deployed models for identifying authors, references and entities in the paper text, and provide experimental results to evaluate the performance of each model. Three research directions follow from this work and other similar projects, e.g., [Hahn-Powell et al. (2017)](#b10); [Wu et al. (2014)](#b25): i) improving quality and enriching content of the literature graph (e.g., ontology matching and knowledge base population). ii) aggregating domain-specific extractions across many papers to enable a better understanding of the literature as a whole (e.g., identifying demographic biases in clinical trial participants and summarizing empirical results on important tasks). iii) exploring the literature via natural language interfaces.
In order to help future research efforts, we make the following resources publicly available: metadata for over 20 million papers, [10] a meaningful citations dataset, [11] models for figure and table extraction, [12] models for predicting citations in a paper draft, [13] and models for extracting paper metadata, [14] among other resources. [15]
The ScienceParse libraries can be found at http://allenai.org/software/. 3 https://pdfbox.apache.org
+ http://rtw.ml.cmu.edu/rtw/ 8 https://github.com/allenai/openie-standalone
+
+
+
+
+
+
+
+
+ The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction
+
+ Waleed Ammar
+
+
+ Matthew E Peters
+
+
+ Chandra Bhagavatula
+
+
+ Russell Power
+
+
+
+ ACL workshop (SemEval)
+
+
+
+
+ Waleed Ammar, Matthew E. Peters, Chandra Bhagavatula, and Russell Power. 2017. The ai2 system at semeval-2017 task 10 (scienceie): semi-supervised end-to-end entity and relation extraction. In ACL workshop (SemEval).
+
+
+
+
+
+ Isabelle Augenstein
+
+
+ Mrinal Das
+
+
+ Sebastian Riedel
+
+
+ Lakshmi Vikraman
+
+
+ Andrew D Mccallum
+
+ Semeval 2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications
+
+
+
+
+ ACL workshop (SemEval)
+ Isabelle Augenstein, Mrinal Das, Sebastian Riedel, Lakshmi Vikraman, and Andrew D. McCallum. 2017. Semeval 2017 task 10 (scienceie): Extracting keyphrases and relations from scientific publications. In ACL workshop (SemEval).
+
+
+
+
+ Content-based citation recommendation
+
+ Chandra Bhagavatula
+
+
+ Sergey Feldman
+
+
+ Russell Power
+
+
+ Waleed Ammar
+
+
+
+ NAACL
+
+
+
+
+ Chandra Bhagavatula, Sergey Feldman, Russell Power, and Waleed Ammar. 2018. Content-based citation recommendation. In NAACL.
+
+
+
+
+
+ Chandra Bhagavatula
+
+
+ Thanapon Noraset
+
+
+ Doug Downey
+
+ TabEL: entity linking in web tables. In ISWC
+
+
+
+
+ Chandra Bhagavatula, Thanapon Noraset, and Doug Downey. 2015. TabEL: entity linking in web tables. In ISWC.
+
+
+
+
+ Natural language processing (almost) from scratch
+
+ Ronan Collobert
+
+
+ Jason Weston
+
+
+ Léon Bottou
+
+
+ Michael Karlen
+
+
+ Koray Kavukcuoglu
+
+
+ Pavel P Kuksa
+
+
+
+ JMLR
+
+
+
+
+ Ronan Collobert, Jason Weston, Léon Bottou, Michael Karlen, Koray Kavukcuoglu, and Pavel P. Kuksa. 2011. Natural language processing (almost) from scratch. In JMLR.
+
+
+
+
+ Author disambiguation using error-driven machine learning with a ranking loss function
+
+ Aron Culotta
+
+
+ Pallika Kanani
+
+
+ Robert Hall
+
+
+ Michael Wick
+
+
+ Andrew D Mccallum
+
+
+
+ IIWeb Workshop
+
+
+
+
+ Aron Culotta, Pallika Kanani, Robert Hall, Michael Wick, and Andrew D. McCallum. 2007. Author disambiguation using error-driven machine learning with a ranking loss function. In IIWeb Workshop.
+
+
+
+
+ Frustratingly easy domain adaptation
+
+ Hal Daumé
+
+
+
+ ACL
+
+
+
+
+ Hal Daumé. 2007. Frustratingly easy domain adaptation. In ACL.
+
+
+
+
+ MetaMap Lite: an evaluation of a new Java implementation of MetaMap
+
+ Dina Demner-Fushman
+
+
+ Willie J Rogers
+
+
+ Alan R Aronson
+
+
+
+ JAMIA
+
+
+
+
+ Dina Demner-Fushman, Willie J. Rogers, and Alan R. Aronson. 2017. MetaMap Lite: an evaluation of a new Java implementation of MetaMap. In JAMIA.
+
+
+
+
+ Search needs a shake-up
+
+ Oren Etzioni
+
+
+
+ Nature
+
+ 476
+
+
+
+
+ Oren Etzioni. 2011. Search needs a shake-up. Nature 476 7358:25-6.
+
+
+
+
+ TAGME: on-the-fly annotation of short text fragments (by wikipedia entities)
+
+ Paolo Ferragina
+
+
+ Ugo Scaiella
+
+
+
+ CIKM
+
+
+
+
+ Paolo Ferragina and Ugo Scaiella. 2010. TAGME: on-the-fly annotation of short text fragments (by wikipedia entities). In CIKM.
+
+
+
+
+ Swanson linking revisited: Accelerating literature-based discovery across domains using a conceptual influence graph
+
+ Gus Hahn-Powell
+
+
+ Marco Antonio Valenzuela-Escarcega
+
+
+ Mihai Surdeanu
+
+
+
+ ACL
+
+
+
+
+ Gus Hahn-Powell, Marco Antonio Valenzuela-Escarcega, and Mihai Surdeanu. 2017. Swanson linking revisited: Accelerating literature-based discovery across domains using a conceptual influence graph. In ACL.
+
+
+
+
+ Long short-term memory
+
+ Sepp Hochreiter
+
+
+ Jürgen Schmidhuber
+
+
+
+ Neural computation
+
+
+
+
+ Sepp Hochreiter and Jürgen Schmidhuber. 1997. Long short-term memory. Neural computation.
+
+
+
+
+ Learning a neural semantic parser from user feedback
+
+ Srinivasan Iyer
+
+
+ Ioannis Konstas
+
+
+ Alvin Cheung
+
+
+ Jayant Krishnamurthy
+
+
+ Luke S Zettlemoyer
+
+
+
+ ACL
+
+
+
+
+ Srinivasan Iyer, Ioannis Konstas, Alvin Cheung, Jayant Krishnamurthy, and Luke S. Zettlemoyer. 2017. Learning a neural semantic parser from user feedback. In ACL.
+
+
+
+
+ Exploiting mesh indexing in medline to generate a data set for word sense disambiguation
+
+ J Antonio
+
+
+ Bridget T Jimeno-Yepes
+
+
+ Alan R Mcinnes
+
+
+ Aronson
+
+
+
+ BMC bioinformatics
+
+ 12
+ 1
+ 223
+
+
+
+ Antonio J. Jimeno-Yepes, Bridget T. McInnes, and Alan R. Aronson. 2011. Exploiting mesh indexing in medline to generate a data set for word sense disambiguation. BMC bioinformatics 12(1):223.
+
+
+
+
+ CHEMDNER: The drugs and chemical names extraction challenge
+
+ Martin Krallinger
+
+
+ Florian Leitner
+
+
+ Obdulia Rabal
+
+
+ Miguel Vazquez
+
+
+
+ In J. Cheminformatics
+
+
+
+
+ Julen Oyarzabal, and Alfonso Valencia
+ Martin Krallinger, Florian Leitner, Obdulia Rabal, Miguel Vazquez, Julen Oyarzabal, and Alfonso Valencia. 2015. CHEMDNER: The drugs and chemical names extraction challenge. In J. Cheminformatics.
+
+
+
+
+ Neural architectures for named entity recognition
+
+ Guillaume Lample
+
+
+ Miguel Ballesteros
+
+
+ K Sandeep
+
+
+ Kazuya Subramanian
+
+
+ Chris Kawakami
+
+
+ Dyer
+
+
+
+ HLT-NAACL
+
+
+
+
+ Guillaume Lample, Miguel Ballesteros, Sandeep K Subramanian, Kazuya Kawakami, and Chris Dyer. 2016. Neural architectures for named entity recognition. In HLT-NAACL.
+
+
+
+
+ Biocreative v cdr task corpus: a resource for chemical disease relation extraction. Database : the journal of biological databases and curation
+
+ Jiao Li
+
+
+ Yueping Sun
+
+
+ Robin J Johnson
+
+
+ Daniela Sciaky
+
+
+ Chih-Hsuan Wei
+
+
+ Robert Leaman
+
+
+ Allan Peter Davis
+
+
+ Carolyn J Mattingly
+
+
+ Thomas C Wiegers
+
+
+ Zhiyong Lu
+
+
+
+
+
+ Jiao Li, Yueping Sun, Robin J. Johnson, Daniela Sciaky, Chih-Hsuan Wei, Robert Leaman, Allan Peter Davis, Carolyn J. Mattingly, Thomas C. Wiegers, and Zhiyong Lu. 2016. Biocreative v cdr task corpus: a resource for chemical disease relation extraction. Database : the journal of biological databases and curation 2016.
+
+
+
+
+ Design challenges for entity linking
+
+ Xiao Ling
+
+
+ Sameer Singh
+
+
+ Daniel S Weld
+
+
+
+ Transactions of the Association for Computational Linguistics
+
+ 3
+
+
+
+
+ Xiao Ling, Sameer Singh, and Daniel S. Weld. 2015. Design challenges for entity linking. Transactions of the Association for Computational Linguistics 3:315-328.
+
+
+
+
+ Distant supervision for relation extraction without labeled data
+
+ Mike Mintz
+
+
+ Steven Bills
+
+
+
+ ACL
+
+
+
+
+ Rion Snow, and Daniel Jurafsky
+ Mike Mintz, Steven Bills, Rion Snow, and Daniel Jurafsky. 2009. Distant supervision for relation extraction without labeled data. In ACL.
+
+
+
+
+ GloVe: Global vectors for word representation
+
+ Jeffrey Pennington
+
+
+ Richard Socher
+
+
+ Christopher D Manning
+
+
+
+ EMNLP
+
+
+
+
+ Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global vectors for word representation. In EMNLP.
+
+
+
+
+ Semi-supervised sequence tagging with bidirectional language models
+
+ Matthew E Peters
+
+
+ Waleed Ammar
+
+
+ Chandra Bhagavatula
+
+
+ Russell Power
+
+
+
+ ACL
+
+
+
+
+ Matthew E. Peters, Waleed Ammar, Chandra Bhagavatula, and Russell Power. 2017. Semi-supervised sequence tagging with bidirectional language models. In ACL.
+
+
+
+
+ Extracting scientific figures with distantly supervised neural networks
+
+ Noah Siegel
+
+
+ Nicholas Lourie
+
+
+ Russell Power
+
+
+ Waleed Ammar
+
+
+
+
+
+ In JCDL
+ Noah Siegel, Nicholas Lourie, Russell Power, and Waleed Ammar. 2018. Extracting scientific figures with distantly supervised neural networks. In JCDL.
+
+
+
+
+ Identifying meaningful citations
+
+ Marco Valenzuela
+
+
+ Vu Ha
+
+
+ Oren Etzioni
+
+
+
+ AAAI Workshop (Scholarly Big Data)
+
+
+
+
+ Marco Valenzuela, Vu Ha, and Oren Etzioni. 2015. Identifying meaningful citations. In AAAI Workshop (Scholarly Big Data).
+
+
+
+
+ Clinical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a systematic review and meta-analysis
+
+ Xiang Wang
+
+
+ Yan Dong
+
+
+ Yi-Ming Xiang Qian Qi
+
+
+ Cheng-Guang Li
+
+
+ Lijun Huang
+
+
+ Hou
+
+
+
+ Critical care
+
+
+
+
+ Xiang Wang, Yan Dong, Xiang qian Qi, Yi-Ming Li, Cheng-Guang Huang, and Lijun Hou. 2013. Clinical review: Efficacy of antimicrobial-impregnated catheters in external ventricular drainage -a systematic review and meta-analysis. In Critical care.
+
+
+
+
+ Learning to predict citation-based impact measures
+
+ Luca Weihs
+
+
+ Oren Etzioni
+
+
+
+ JCDL
+
+
+
+
+ Luca Weihs and Oren Etzioni. 2017. Learning to predict citation-based impact measures. In JCDL.
+
+
+
+
+ CiteSeerX: AI in a digital library search engine
+
+ Jian Wu
+
+
+ Kyle Williams
+
+
+ Hung-Hsuan Chen
+
+
+ Madian Khabsa
+
+
+ Cornelia Caragea
+
+
+ Alexander Ororbia
+
+
+ Douglas Jordan
+
+
+ C. Lee Giles
+
+
+
+ AAAI
+
+
+
+
+ Jian Wu, Kyle Williams, Hung-Hsuan Chen, Madian Khabsa, Cornelia Caragea, Alexander Ororbia, Douglas Jordan, and C. Lee Giles. 2014. CiteSeerX: AI in a digital library search engine. In AAAI.
+
+
+
+
+ Explicit semantic ranking for academic search via knowledge graph embedding
+
+ Chenyan Xiong
+
+
+ Russell Power
+
+
+ Jamie Callan
+
+
+
+ WWW
+
+
+ Chenyan Xiong, Russell Power, and Jamie Callan. 2017. Explicit semantic ranking for academic search via knowledge graph embedding. In WWW.
+
+
+
+
+
+
+
diff --git a/s2orc-doc2json/tests/s2orc/20190928/10002293.json b/s2orc-doc2json/tests/s2orc/20190928/10002293.json
new file mode 100644
index 0000000000000000000000000000000000000000..2c520e66208b6b51911a8d6ac5130ada413a7d08
--- /dev/null
+++ b/s2orc-doc2json/tests/s2orc/20190928/10002293.json
@@ -0,0 +1 @@
+{"paper_id": "10002293", "metadata": {"title": "Antimicrobial Photodynamic Therapy against Endodontic Enterococcus faecalis and Candida albicans Mono and Mixed Biofilms in the Presence of Photosensitizers: A Comparative Study with Classical Endodontic Irrigants", "authors": [{"first": "Patr\u00edcia", "middle": [], "last": "Diogo", "suffix": ""}, {"first": "Chantal", "middle": [], "last": "Fernandes", "suffix": ""}, {"first": "Francisco", "middle": [], "last": "Caramelo", "suffix": ""}, {"first": "Marta", "middle": [], "last": "Mota", "suffix": ""}, {"first": "Isabel", "middle": ["M."], "last": "Miranda", "suffix": ""}, {"first": "M.", "middle": ["A.", "F."], "last": "Faustino", "suffix": ""}, {"first": "M.", "middle": ["G.", "P.", "M.", "S."], "last": "Neves", "suffix": ""}, {"first": "Marciana", "middle": ["P."], "last": "Uliana", "suffix": ""}, {"first": "Kleber", "middle": ["T."], "last": "de Oliveira", "suffix": ""}, {"first": "Jo\u00e3o", "middle": ["M."], "last": "Santos", "suffix": ""}, {"first": "Teresa", "middle": [], "last": "Gon\u00e7alves", "suffix": ""}], "abstract": "Endodontic biofilms eradication from the infected root canal system remains as the primary focus in endodontic field. In this study, it was assessed the efficacy of antimicrobial Photodynamic Therapy (aPDT) with the Zn(II)chlorin e6 methyl ester (Zn(II)e6Me) activated by red light against monospecies and mixed biofilms of Enterococcus faecalis and Candida albicans. The results were compared with the ones obtained with Rose Bengal (RB), Toluidine Blue-O (TBO), the synthetic tetracationic porphyrin (TMPyP) as well as classical endodontic irrigants (3% NaOCl, 17% EDTA and 2% CHX). The antimicrobial efficacy of aPDT toward monospecies and mixed biofilms was quantified resorting to safranin red method. The changes of biofilm organization and of cellular ultrastructure were evaluated through several microscopy techniques (light, laser confocal and transmission electron microscopy). 
Zn(II)e6Me once activated with light for 60 or 90 s was able to remove around 60% of the biofilm's biomass. It was more efficient than TBO and RB and showed similar efficiency to TMPyP and classical irrigants, CHX and EDTA. As desirable in a PS, Zn(II)e6Me in the dark showed smaller activity than TMPyP. Only NaOCl revealed higher efficiency, with 70-90% of the biofilm's biomass removal. The organization of biofilms and the normal microbial cell ultrastructure were extensively damaged by the presence of Zn(II)e6Me. aPDT with Zn(II)e6Me showed to be an efficient antimicrobial strategy deserving further studies leading to a future clinical usage in endodontic disinfection.", "year": "2017", "arxiv_id": null, "acl_id": null, "pmc_id": "PMC5371592", "pubmed_id": "28424663", "doi": "10.3389/fmicb.2017.00498", "venue": "Frontiers in microbiology", "journal": "Frontiers in microbiology"}, "s2_pdf_hash": "5f6e186429d76704a651eb1d0d7d37c0e906cc65", "grobid_parse": {"abstract": [{"text": "Endodontic biofilms eradication from the infected root canal system remains as the primary focus in endodontic field. In this study, it was assessed the efficacy of antimicrobial Photodynamic Therapy (aPDT) with the Zn(II)chlorin e6 methyl ester (Zn(II)e 6 Me) activated by red light against monospecies and mixed biofilms of Enterococcus faecalis and Candida albicans. The results were compared with the ones obtained with Rose Bengal (RB), Toluidine Blue-O (TBO), the synthetic tetracationic porphyrin (TMPyP) as well as classical endodontic irrigants (3% NaOCl, 17% EDTA and 2% CHX). The antimicrobial efficacy of aPDT toward monospecies and mixed biofilms was quantified resorting to safranin red method. The changes of biofilm organization and of cellular ultrastructure were evaluated through several microscopy techniques (light, laser confocal and transmission electron microscopy). Zn(II)e 6 Me once activated with light for 60 or 90 s was able to remove around 60% of the biofilm's biomass. 
It was more efficient than TBO and RB and showed similar efficiency to TMPyP and classical irrigants, CHX and EDTA. As desirable in a PS, Zn(II)e 6 Me in the dark showed smaller activity than TMPyP. Only NaOCl revealed higher efficiency, with 70-90% of the biofilm's biomass removal. The organization of biofilms and the normal microbial cell ultrastructure were extensively damaged by the presence of Zn(II)e 6 Me. aPDT with Zn(II)e 6 Me showed to be an efficient antimicrobial strategy deserving further studies leading to a future clinical usage in endodontic disinfection.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Abstract"}], "body_text": [{"text": "Apical periodontitis is an inflammatory reaction of periradicular tissues caused by a microbial infection in the root canal system (Siqueira et al., 2000; Nair, 2006) . Microbial biofilms are considered the major cause for primary and secondary root canal infection and the success of endodontic treatment relies on the effective eradication of such biofilms (Nair, 2006) . Conventionally, this is accomplished by chemo-mechanical disruption with instruments and antimicrobial chemicals used topically inside root canals. However, current treatment strategies are insufficient to reduce microrganisms inside root canals below detection limits before permanent root filling. This is mandatory to achieve optimal healing conditions for the periapical tissues (Sj\u00f6gren et al., 1997) . Therefore, advanced disinfection approaches are required to effectively eradicate biofilms and increase the endodontic treatment success rate.It is widely accepted that the main reason for endodontic treatment failure is the insufficient root canal microrganisms eradication (Siqueira et al., 2000) . As residual species are not reachable to the host's immune system, propagation and re-colonization is highly possible, endorsing microbial spread inside root canal system, which leads to endodontic infections. 
In 1965, apical periodontitis was recognized as a microbial mediated infection, which was later reinforced by ultrastructural microscopic techniques, revealing bacteria organized as biofilms within the infected root canals (Nair, 1987) . Moreover, histopathological studies have also contributed to the concept that apical periodontitis is indeed a microbial biofilmmediated disease (Carr et al., 2009; Ricucci et al., 2009; Ricucci and Siqueira, 2010) .As described by Donlan and Costerton (2002) , a biofilm is a microbially derived sessile community characterized by cells that are irreversibly attached to a substratum or interface or to each other, are imbedded in a matrix of extracellular polymeric substances that they have produced, and exhibit an altered phenotype with respect to growth rate and gene transfer. In addition, it is accepted that microbial cells comprising the biofilm are more resistant than the planktonic counterparts (Donlan and Costerton, 2002) and multispecies or mixed biofilms are more resistant to drugs than monomicrobial biofilms (Costerton et al., 1999) . As such, polymicrobial biofilms diseases are associated with worse clinical outcomes than monomicrobial infections for decades (McKenzie, 2008) . Although the endodontic biofilm is constituted by multiple microrganisms (Tan et al., 2015) , most of in vitro studies have been made in monospecies biofilms of bacteria or combined with C. albicans (Sabino et al., 2015) .The problem of endodontic biofilms eradication from the infected root canal system remains as the primary focus in endodontic field. In recent years, photodynamic therapy (PDT) has been applied with success in several types of cancers (Ochsner, 1997; Gomes et al., 2015) , age-related macular degeneration (Kawczyk-Krupka et al., 2015) and also in the photoinactivation of several microrganisms (Almeida et al., 2014) , called antimicrobial photodynamic therapy (aPDT). 
In the endodontic field, aPDT has emerged as an optional extra to classical irrigation solutions in root canal asepsis (Bonsor et al., 2006a,b) such as sodium hypochlorite (NaOCl), chlorhexidine gluconate (CHX) and ethylenediamine tetraacetic acid (EDTA). The NaOCl solution is the most widely used in endodontic treatment (Siqueira et al., 2007; Mohammadi, 2008; Vaziri et al., 2012; Wang et al., 2015) albeit with some degree of toxicity. To avoid this toxicity, other root canal asepsis approaches with lower or insignificant toxicity should be implemented. Therefore, aPDT has emerged with promising experimental results, anticipating a possible new era in endodontic disinfection (Siddiqui et al., 2013; Chrepa et al., 2014) .Photodynamic therapy involves the combination of a nontoxic photosensitizer (PS) with a harmless visible light source in the presence of oxygen. After being excited by light, the PS releases its energy or electrons to molecular oxygen producing highly reactive oxygen species (ROS) such as singlet oxygen ( 1 O 2 ), which induce microrganism's injury and death, ideally with no host cell damage. Also, it has been indicated as bearing a strong potential in the fight against antimicrobial resistance (Hamblin and Hasan, 2004; Tavares et al., 2010; Costa et al., 2011) . aPDT has also been studied as an auspicious approach to eradicate oral pathogenic microrganisms that cause, not only endodontic diseases, but also periodontitis, peri-implantitis, caries lesions and mucositis (Diogo et al., 2015) .In this study, we analyzed the aPDT efficacy against monospecies and mixed biofilms of E. faecalis and C. albicans using the following PSs: toluidine blue (TBO), rose bengal (RB), a synthetic porphyrin 5,10,15,20-tetrakis(1-methylpyridinium-4-yl)porphyrin (TMPyP) and Zn(II)chlorin e 6 methyl ester (Zn(II)e 6 Me) obtained from chlorophyll a (Figure 1) . 
The antimicrobial results obtained by aPDT approach were compared with the ones achieved with three endodontic classical irrigants 3% NaOCl, 2% CHX, and 17% EDTA toward in vitro biofilms.", "cite_spans": [{"start": 131, "end": 154, "text": "(Siqueira et al., 2000;", "latex": null, "ref_id": "BIBREF53"}, {"start": 155, "end": 166, "text": "Nair, 2006)", "latex": null, "ref_id": "BIBREF38"}, {"start": 359, "end": 371, "text": "(Nair, 2006)", "latex": null, "ref_id": "BIBREF38"}, {"start": 757, "end": 779, "text": "(Sj\u00f6gren et al., 1997)", "latex": null, "ref_id": "BIBREF54"}, {"start": 1057, "end": 1080, "text": "(Siqueira et al., 2000)", "latex": null, "ref_id": "BIBREF53"}, {"start": 1516, "end": 1528, "text": "(Nair, 1987)", "latex": null, "ref_id": "BIBREF37"}, {"start": 1676, "end": 1695, "text": "(Carr et al., 2009;", "latex": null, "ref_id": "BIBREF8"}, {"start": 1696, "end": 1717, "text": "Ricucci et al., 2009;", "latex": null, "ref_id": "BIBREF44"}, {"start": 1718, "end": 1745, "text": "Ricucci and Siqueira, 2010)", "latex": null, "ref_id": "BIBREF43"}, {"start": 1763, "end": 1790, "text": "Donlan and Costerton (2002)", "latex": null, "ref_id": "BIBREF19"}, {"start": 2239, "end": 2267, "text": "(Donlan and Costerton, 2002)", "latex": null, "ref_id": "BIBREF19"}, {"start": 2359, "end": 2383, "text": "(Costerton et al., 1999)", "latex": null, "ref_id": "BIBREF15"}, {"start": 2513, "end": 2529, "text": "(McKenzie, 2008)", "latex": null, "ref_id": "BIBREF32"}, {"start": 2605, "end": 2623, "text": "(Tan et al., 2015)", "latex": null, "ref_id": "BIBREF55"}, {"start": 2731, "end": 2752, "text": "(Sabino et al., 2015)", "latex": null, "ref_id": "BIBREF46"}, {"start": 2989, "end": 3004, "text": "(Ochsner, 1997;", "latex": null, "ref_id": "BIBREF39"}, {"start": 3005, "end": 3024, "text": "Gomes et al., 2015)", "latex": null, "ref_id": "BIBREF23"}, {"start": 3060, "end": 3089, "text": "(Kawczyk-Krupka et al., 2015)", "latex": null, "ref_id": "BIBREF29"}, {"start": 
3149, "end": 3171, "text": "(Almeida et al., 2014)", "latex": null, "ref_id": "BIBREF0"}, {"start": 3343, "end": 3358, "text": "(Bonsor et al.,", "latex": null, "ref_id": null}, {"start": 3547, "end": 3570, "text": "(Siqueira et al., 2007;", "latex": null, "ref_id": "BIBREF52"}, {"start": 3571, "end": 3587, "text": "Mohammadi, 2008;", "latex": null, "ref_id": "BIBREF35"}, {"start": 3588, "end": 3608, "text": "Vaziri et al., 2012;", "latex": null, "ref_id": "BIBREF59"}, {"start": 3609, "end": 3627, "text": "Wang et al., 2015)", "latex": null, "ref_id": "BIBREF61"}, {"start": 3909, "end": 3932, "text": "(Siddiqui et al., 2013;", "latex": null, "ref_id": "BIBREF51"}, {"start": 3933, "end": 3953, "text": "Chrepa et al., 2014)", "latex": null, "ref_id": "BIBREF12"}, {"start": 4455, "end": 4480, "text": "(Hamblin and Hasan, 2004;", "latex": null, "ref_id": "BIBREF25"}, {"start": 4481, "end": 4502, "text": "Tavares et al., 2010;", "latex": null, "ref_id": "BIBREF56"}, {"start": 4503, "end": 4522, "text": "Costa et al., 2011)", "latex": null, "ref_id": "BIBREF14"}, {"start": 4734, "end": 4754, "text": "(Diogo et al., 2015)", "latex": null, "ref_id": "BIBREF18"}], "ref_spans": [{"start": 5098, "end": 5108, "text": "(Figure 1)", "latex": null, "ref_id": "FIGREF0"}], "eq_spans": [], "section": null}, {"text": "The strain of C. albicans (YP0037) used in this study was obtained from the Pathogenic Yeast Collection of FMUC, University of Coimbra. E. faecalis (ATCC29212) was purchased from the American Type Culture Collection (ATCC). Microrganisms were stored at \u221280 \u2022 C in 25% glycerol. When needed, pre-cultures were prepared by defrozen microbial cells in appropriate media, brain-heart infusion (BHI) for E. faecalis and YPD (0.5% yeast extract, 1% bacto-peptone, and 2% glucose) for C. albicans. For C. albicans growth it was used YPD broth. E. faecalis growth and biofilm formation was obtained in BHI liquid medium (Difco, Detroit, MI, USA). C. 
albicans biofilms and mixed biofilms consisting of C. albicans with E. faecalis were obtained in RPMI 1640 (Roswell Park Memorial Institute) medium (R8755, SigmaAldrich R ).", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "For E. faecalis in vitro biofilm formation, bacterial cells were previously grown in 4 mL of BHI overnight, at 37 \u2022 C. These cells were harvested by centrifugation (Biofuge Fresco, Heraeus, UK), at 16,000 g during 5 min at 4 \u2022 C, and washed twice in sterile BHI. A bacterial cell suspension with a density of 1.5 \u00d7 10 8 cells/mL (0.5 McF of McFarland scale) was obtained; 200 \u00b5L of this suspension was pipetted to each well of sterile 96-well polystyrene microtiter plates (Nunc F, Nalgene, Denmark). These plates were covered and sealed with parafilm, and incubated during 48 h at 37 \u2022 C without agitation. For the preparation of C. albicans biofilm, a loopful of cells from the solid stock cultures was used to inoculate 20 mL of YPD and incubated overnight in an orbital shaker (120 rpm) at 30 \u2022 C. The cells were harvested by centrifugation (16,000 g for 5 min at 4 \u2022 C), and washed twice with phosphate buffered saline (PBS). The final pellet was resuspended in pre-warmed RPMI-1640 at 37 \u2022 C. The resulting cell suspension was diluted in RPMI to obtain a final suspension with a cell density of 1.0 \u00d7 10 6 cells/mL. This was used to prepare C. albicans biofilms in 96-well polystyrene microtiter plate (Nunc F, Nalgene, Denmark). For that 200 \u00b5L of C. albicans suspension was pipetted to the plate wells. After sealing with parafilm, they were left to incubate during 48 h at 37 \u2022 C, without agitation.For the mixed biofilm of E. faecalis and C. albicans, the two microbial species were pre-grown overnight and prepared as described for the monospecies biofilms except that they were resuspended in 37 \u2022 C pre-warmed RPMI-1640 (R8755, SigmaAldrich R ). 
The two microbe suspensions at a concentration of 1.0 \u00d7 10 6 cells/mL were mixed in pre-warmed RPMI-1640 in 1:1 ratio and incubated at 37 \u2022 C, during 48 h to allow biofilm formation.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "The classical irrigants tested were 3% NaOCl, 17% EDTA, and 2% CHX (CanalPro TM -endodontic irrigating solutions, Coltene). The biofilms were exposed to the irrigants for 60 and 90 s. A longer period of irrigation (30 min) was also tested because some authors defend that a continuous irrigation and time are important factors for the efficacy of classical irrigating solutions (Bystrom and Sundqvist, 1985; Haapasalo et al., 2010) . After each period, the supernatants were removed and the chemical reactions were stopped using adequate inhibitors: 200 \u00b5L of sodium thiosulfate (S7026, Sigma-Aldrich) was added to the NaOCl treated group; 3% Tween 80 (T2575, Sigma-Aldrich) was used to neutralize CHX. Finally, 200 \u00b5L sterile distilled water was applied to dilute the 17% EDTA. Controls were made without the irrigants, in which the stop solutions were added, proving that these stop solutions, especially 3% Tween 80, did not interfered with the biofilm biomass quantification.", "cite_spans": [{"start": 378, "end": 407, "text": "(Bystrom and Sundqvist, 1985;", "latex": null, "ref_id": "BIBREF7"}, {"start": 408, "end": 431, "text": "Haapasalo et al., 2010)", "latex": null, "ref_id": "BIBREF24"}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "In the aPDT experiments all the PSs tested (TBO, RB, TMPyP, and Zn(II)e 6 Me) were used at the same concentration (0.1 mg/mL). This concentration was chosen based on market formulation FotoSan agent R , in which the active substance is TBO at 0.1 mg/mL. This formulation is available in the dentistry market with a light source device (FotoSan R : 630, CMS Dental A/S, Glyngore, Roslev, Denmark) (Rios et al., 2011) . 
The cationic porphyrin TMPyP and the modified chlorophyll, Zn(II)e 6 Me, were synthetized and isolated according to the literature Menezes et al., 2014) . Their 1 H NMR and UV-vis spectra were consistent with literature data and their purity was confirmed by thin layer chromatography and 1 H NMR (data not shown). TBO and RB used were purchased from Sigma Aldrich (T3260 and 330000-1G, respectively). Stock solutions (10 mg/mL) of each porphyrinic derivative (TMPyP and Zn(II)e 6 Me) were prepared in dimethyl sulfoxide (DMSO). For biological assays, the stock solutions of photosensitizers were diluted to the final concentrations in PBS.The irradiations of the PSs in the aPDT experiments were performed in the presence of adequate light emitting diode (LED) source setup to comply with the 96-well plates used in this study (Figure 2) . The LED sources were built at request by the Telecommunications Institute -Informatics, Electronics and Telecommunications Engineering Department of the University of Aveiro, Portugal. RB was irradiated with a green LED with a wavelength peak centered at 557 nm, made with gallium phosphide pure (GaP), an output of 62.5 mW, continuous waves, density power of 42 mW.cm \u22122 , energy fluence of 3780 J.cm \u22122, voltage of 2.5 V. TBO, TMPyP, and Zn(II)e 6 Me were irradiated with a red LED device with a wavelength peak centered at 627 nm, a gallium arsenide phosphide on gallium phosphide (GaAsP/GaP), with an output power of 75 mW, continuous waves, density power of 35 mW.cm \u22122 , energy fluence of 3150 J.cm \u22122 and a voltage of 2.5 V.The experimental methodology included a pre-incubation period of the biofilms with the PSs, in total absence of light, for 15 min, to allow the entrance of PSs into the cells (Diogo et al., 2015) . After that, light activation was performed for 60 or 90 s. 
Also, in each assay it were included controls in which the PSs were not added, to study the impact of irradiation in the biofilms.", "cite_spans": [{"start": 396, "end": 415, "text": "(Rios et al., 2011)", "latex": null, "ref_id": "BIBREF45"}, {"start": 549, "end": 570, "text": "Menezes et al., 2014)", "latex": null, "ref_id": "BIBREF33"}, {"start": 2165, "end": 2185, "text": "(Diogo et al., 2015)", "latex": null, "ref_id": "BIBREF18"}], "ref_spans": [{"start": 1246, "end": 1256, "text": "(Figure 2)", "latex": null, "ref_id": "FIGREF1"}], "eq_spans": [], "section": null}, {"text": "The biofilm biomass was quantified using the safranin red (SR) assay (Kueng et al., 1989) . After each experiment, 200 \u00b5L of methanol was added to each well of the 96-well plate. After 15 min, the content of each well was aspirated and let to dry. After drying, 0.1% SR solution was added and incubated for 20 min. The resulting solution was removed with a Pasteur pipette and two washes were made with distilled water. Two hundred microliter of acetic acid 33% (v/v) was added and 20 min after the absorbance was measured at 590 nm on a microplate reader (SpectraMAX Gemini XM, Molecular Devices, USA). The results were expressed as a percentage of biofilm removal when compared with the biomass quantified before irradiation or before exposure to the irrigants.", "cite_spans": [{"start": 69, "end": 89, "text": "(Kueng et al., 1989)", "latex": null, "ref_id": "BIBREF30"}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "E. faecalis and C. albicans were grown and prepared as described above. One mL of the final suspensions was added to sterile 12-well polystyrene microtiter plates with glass coverslips (CBAD00120RA1#1.5, ThermoScientific-Menzel) coated with poly-D-lysine (Sigma-Aldrich R , P1149). After seeding, the entire microtiter plate was covered and sealed with Parafilm R and incubated for 48 h at 37 \u2022 C without agitation. 
For confocal fluorescence microscopy, fresh cultures of biofilms were used. E. faecalis was stained with Syto 13 Green Fluorescent Nucleic Acid Stain (ThermoFisher Scientific R ). C. albicans in monospecies biofilm was probed with polyclonal primary antibody Acris Antibodies Gmbh R RGTX40096 with anti-rabbit secondary antibody Alexa Fluor R 594 (Invitrogen R , RA21207). Images were obtained with a Carl Zeiss Cell Observer Spinning Disk with Alpha Plan-Apochromat objective, at a magnification of 100\u00d7. For light microscopy, it was used an Olympus BX-40 microscope at 400\u00d7 total magnification. Images were recorded on an Olympus C-200 digital camera.For transmission electronic microscopy (TEM), samples of 48 h-biofilms were fixed with 2.5% glutaraldehyde in 0.5 M sodium cacodylate buffer (pH 7.2) for 2 h. Post-fixation was performed using 1% osmium tetroxide for 1 h. The samples were then rinsed with the same buffer, and dehydrated in a graded ethanol series (30 to 100%). Then, they were impregnated and embedded in Epoxy resin (Fluka Analytical). Ultrathin sections (\u223c70 nm) were mounted on copper grids (300 mesh) and stained with uranyl acetate 2% (15 min) and 0.2% lead citrate (10 min). Observations were carried out on a FEI-Tecnai G2 Spirit Bio Twin transmission electron microscope at 100 kV.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "Data were analyzed using Prism (version 5) software (GraphPad Software, Inc., La Jolla, CA, USA). 
Statistical differences between groups were assessed with the independent samples student's t-test or Mann-Whitney test and a significance level of 0.05 was assumed.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "Before initiating the comparative study of the efficacy of aPDT and classical irrigants in the clearance of biofilms, it was important to verify if the PSs selected, TBO, RB, TMPyP, and Zn(II)e 6 Me, had the ability to disturb the biofilms in the dark (i.e., in the absence of light activation) at the same concentration used in the aPDT studies. It was clearly desirable that the PSs had zero or very low activity in total absence of light indicating that aPDT efficacy resulted strictly from the ROS generated by PS light activation. The results obtained from the biofilm biomass analysis, using the SR assay, showed that upon 15 min of exposing the biofilms to the different PSs, in the dark, there was a decrease of the biofilms biomass in values ranging from 5.7 to 16.6% (Table 1) . Following a pre-incubation period in the dark with the PSs, each preparation was irradiated with the appropriate LED light. Thus, TBO, TMPyP, and Zn(II)e 6 Me were irradiated with a wavelength of 627 nm while 557 nm was used for RB. Three periods of irradiation were tested, 60, 90 s and 30 min. Since there were no differences between the 90 s and the 30 min periods, this longer period was abandoned (data not shown). Also, the controls of the impact of light irradiation in the biofilms during the 60 or 90 s of irradiation showed no damage of the biofilm, as assessed by the SR assay and by microscopic observation of the biofilm morphology (results not shown).The results summarized in Figure 3 (upper left panel) showed that Zn(II)e 6 Me, is more effective in the removal of E. faecalis biofilm than the other PSs used, in both irradiation periods (60 and 90 s) (P = 0.0079). Similar reduction values of E. 
faecalis biofilm were obtained using TMPyP and RB (Figure 3 ; upper left panel). For C. albicans biofilm, Zn(II)e 6 Me and TMPyP (1) The quantification of biofilm biomass was obtained with the SR assay and results are expressed as a percentage of biofilm removal when compared to the control biofilm biomass (n = 3).FIGURE 3 | Biofilm biomass removal upon antimicrobial photodynamic therapy (PDT) and treatment with classical irrigants. Monospecies biofilms and mixed biofilms of E. faecalis and of C. albicans were treated by a PDT using several photosensitizers (left panels) and classical irrigants (right panels). The quantification of biofilm biomass was performed with the safranin red (SR) assay. The percentage of biofilm biomass loss was calculated in relation to untreated biofilm. Differences were analyzed by Student's t-test using Prism software and considered significant at P-values of < 0.05. * P < 0.05; * * P < 0.01; * * * P < 0.001; * * * * P < 0.0001.Frontiers in Microbiology | www.frontiersin.orghad similar efficacies in decreasing biofilm biomass upon 90 s of irradiation. However, a shorter period of irradiation, 60 s, revealed significant differences between the efficacies of both dyes in the capacity to remove biofilm biomass (Figure 3 ; middle left panel). Otherwise, Zn(II)e 6 Me was much more effective than the other PSs used, TBO (P = 0.0317), and RB (P = 0.0079) after 90 s of irradiation and at the same concentration.The mixed biofilm seems to be less susceptible to aPDT than the monospecies biofilm, especially when using TBO as PS, (P = 0.0013), (Figure 3; lower panel) . In this mixed community of E. faecalis and C. albicans, Zn(II)e 6 Me demonstrated to be the most efficient PS removing 58.98% of biofilm biomass (P < 0.001). When compared to classical irrigants, Zn(II)e 6 Me was not as effective as NaOCl, the treatment that causes the higher damage, regardless of the type of biofilm (Figure 3 ; right panels). 
In fact, Zn(II)e 6 Me (with an irradiation period of 90 s) was more effective in removing E. faecalis biofilm than EDTA or CHX (Table 2 and Figure 3 ; upper panel). Zn(II)e 6 Me reveal the same effect of CHX or EDTA treatment toward C. albicans biofilms (90 s) and toward mixed biofilms, either with 60 s or with 90 s of irradiation (Table 2 and Figure 3 ; middle and lower panels).", "cite_spans": [], "ref_spans": [{"start": 777, "end": 786, "text": "(Table 1)", "latex": null, "ref_id": "TABREF0"}, {"start": 1480, "end": 1488, "text": "Figure 3", "latex": null, "ref_id": null}, {"start": 1752, "end": 1761, "text": "(Figure 3", "latex": null, "ref_id": null}, {"start": 2958, "end": 2967, "text": "(Figure 3", "latex": null, "ref_id": null}, {"start": 3289, "end": 3312, "text": "(Figure 3; lower panel)", "latex": null, "ref_id": null}, {"start": 3634, "end": 3643, "text": "(Figure 3", "latex": null, "ref_id": null}, {"start": 3801, "end": 3809, "text": "Figure 3", "latex": null, "ref_id": null}, {"start": 4007, "end": 4015, "text": "Figure 3", "latex": null, "ref_id": null}], "eq_spans": [], "section": null}, {"text": "The study of biofilms morphology was performed after 48 h of biofilm maturation, because 48 h-and 72 h-biofilms had similar morphologies. The changes observed in the biofilm organization developed for 48 h when treated with Zn(II)e 6 Me and NaOCl (the classical irrigant with the best outcome) were compared with the untreated biofilms (control).Zn(II)e 6 Me eliminated most of the E. faecalis (Figures 4A,B ) but C. albicans preparations retained some hyphae and yeast cells (Figures 4D,E) . Otherwise, NaOCl eliminated all the cells adhered to the glass slide, either in E. faecalis or C. albicans biofilms (Figures 4C,F) .Using light microscopy, it was observed that while NaOCl lead to an almost complete loss of living cells (Figure 5C ), aPDT with Zn(II)e 6 Me resulted in a mixed biofilm with less E. faecalis cells and less C. 
albicans hypha, with a predominance of pear shaped cells (Figure 5B ), when compared with the morphology of the untreated mixed biofilm (Figure 5A ).", "cite_spans": [], "ref_spans": [{"start": 394, "end": 407, "text": "(Figures 4A,B", "latex": null, "ref_id": "FIGREF2"}, {"start": 476, "end": 490, "text": "(Figures 4D,E)", "latex": null, "ref_id": "FIGREF2"}, {"start": 609, "end": 623, "text": "(Figures 4C,F)", "latex": null, "ref_id": "FIGREF2"}, {"start": 730, "end": 740, "text": "(Figure 5C", "latex": null, "ref_id": "FIGREF3"}, {"start": 892, "end": 902, "text": "(Figure 5B", "latex": null, "ref_id": "FIGREF3"}, {"start": 971, "end": 981, "text": "(Figure 5A", "latex": null, "ref_id": "FIGREF3"}], "eq_spans": [], "section": null}, {"text": "The different morphological aspects observed in the fluorescence confocal microscopy lead us to study the ultrastructural changes using TEM. After several attempts, it was realized that the remainings of the biofilms (either monospecies or mixed biofilm) treated with NaOCl were so drastically damaged that no signs of cells were observed in the epoxy resin blocks sections (data not shown). The ultrastructural modification of bacterial and fungal cells were studied in biofilms exposed to Zn(II)e 6 Me with an activation period of 90 s. In E. faecalis monospecies biofilm it was observed the existence of cell (1) Differences were analyzed by Student's t-test using Prism software and considered significant at P-values of < 0.05. * P < 0.05; * * P < 0.01; * * * P < 0.001, ns: no significant difference.wall \"ghosts\", i.e., bacterial cell wall forming a structure with typical morphology of E. faecalis, without its intracellular content ( Figures 6A,D-F) . The complexity of the cellular ultrastructure of C. albicans, a eukaryote, allowed the observation of induced modifications. 
Most of the yeast cells showed an atypical irregular cell wall thickness and the cytoplasmic membrane integrity was lost, with cell membrane invaginations ( Figure 6G ), caused by 90 s of Zn(II)e 6 Me-aPDT treatment.The cell membrane was damaged and the cell wall surface was rougher ( Figure 6H ) than in control cells ( Figure 6B ). Abnormal intracellular membrane arrangements probably corresponding to endoplasmic reticulum (ER) whorls ( Figure 6I) were also observed. Some C. albicans cells exhibited big vacuoles with electrodense materials (Figure 6J) . A general view of the mixed biofilms showed E. faecalis cell wall structures devoid of the intracellular content and irregular C. albicans cell walls ( Figure 6K) . In fact, in mixed biofilms microbial cells ultrastructural modifications were similar to those observed in monospecies biofilms, including invaginations of the cell membrane found in C. albicans cells (Figure 6L) . Additionally, in mixed biofilms, C. albicans cells showed persistent extracellular vesicles, at the surface of the cell wall, with different sizes and shapes ( Figure 6M) . In Figure 6N it is also possible to observe several ultrastructural features of mixed biofilms treated with Zn(II)e 6 Me by PDT: in the extracellular matrix, besides the spread of electrodense materials typical of a biofilm matrix, fragments of membranes or of fibrous materials were also observed, which were not observed in E. faecalis-C. albicans mixed biofilms untreated ( Figure 6C) ; also, the cytoplasm of E. 
faecalis showed electrodense agglomerates and some fungal cells showed a twisted irregular shape.", "cite_spans": [], "ref_spans": [{"start": 943, "end": 958, "text": "Figures 6A,D-F)", "latex": null, "ref_id": "FIGREF4"}, {"start": 1243, "end": 1252, "text": "Figure 6G", "latex": null, "ref_id": "FIGREF4"}, {"start": 1372, "end": 1381, "text": "Figure 6H", "latex": null, "ref_id": "FIGREF4"}, {"start": 1408, "end": 1417, "text": "Figure 6B", "latex": null, "ref_id": "FIGREF4"}, {"start": 1528, "end": 1538, "text": "Figure 6I)", "latex": null, "ref_id": "FIGREF4"}, {"start": 1633, "end": 1644, "text": "(Figure 6J)", "latex": null, "ref_id": "FIGREF4"}, {"start": 1799, "end": 1809, "text": "Figure 6K)", "latex": null, "ref_id": "FIGREF4"}, {"start": 2013, "end": 2024, "text": "(Figure 6L)", "latex": null, "ref_id": "FIGREF4"}, {"start": 2187, "end": 2197, "text": "Figure 6M)", "latex": null, "ref_id": "FIGREF4"}, {"start": 2203, "end": 2212, "text": "Figure 6N", "latex": null, "ref_id": "FIGREF4"}, {"start": 2577, "end": 2587, "text": "Figure 6C)", "latex": null, "ref_id": "FIGREF4"}], "eq_spans": [], "section": null}, {"text": "The aim of this work was to compare the efficacy of Zn(II)e 6 Me to disturb in vitro models of endodontic biofilms comparatively with three other PSs, TBO, RB, and TMPyP, and also with endodontic classical irrigants. For this, monospecies biofilms of E. faecalis and of C. albicans were used, together with a mixed biofilm model with both microrganisms. The main conclusion is that Zn(II)e 6 Me had a better antimicrobial efficacy than the clinically used PSs, TBO, and RB. Although the efficacy of Zn(II)e 6 Me and TMPyP is similar, one of the main advantage of using Zn(II)e 6 Me is its availability from natural sources, associated to a lower toxicity in the total absence of light. It also presented the same antimicrobial potential than the clinically used classical irrigants, CHX and EDTA. 
It is worth mentioning that TBO (Seal et al., 2002; Bergmans et al., 2008; Rios et al., 2011) is available in the market under the name of Fotosan R agent (Gambarini et al., 2011; Rios et al., 2011) and RB has been widely studied (Shrestha et al., 2012 (Shrestha et al., , 2014 Persadmehr et al., 2014) . As expected, 3% NaOCl had the best final outcomes. In fact, NaOCl at different concentrations, is considered an excellent irrigant solution in endodontics (Jeansonne and White, 1994; Siqueira et al., 2007; Mohammadi, 2008) , nevertheless it also displays high toxicity levels toward the host tissues (Estrela et al., 2002; \u00d6n\u00e7a\u01e7 et al., 2003; Trevino et al., 2011; Wang et al., 2015) .The antimicrobial effect of aPDT is dependent both on the cellular localization of the PS, which may be determined by its physicochemical properties (Castano et al., 2004) and on the diffusion of singlet oxygen that should be sufficient to inactivate microrganisms structures and biomolecules. There have been several reports on the use of aPDT to kill both yeast and bacteria, however, fungi are much more complex targets than bacteria. Nevertheless, similarities with mammalian cells should be considered and this may indicate the use of cationic PSs, rather than their anionic counterparts, since the latter exhibit facile uptake by mammalian cells (Bonnett, 1995) . The biochemical and functional effects of photosensitization include peroxidation of lipids, resulting in cell membranes disruption, lysosomes and mitochondria lysis and consequently autophagy (Schuck et al., 2014) . The phenothiaziniums, such as TBO and MB, are known to target plasma membrane of yeast and bacteria (de Melo et al., 2013; Baltazar et al., 2015) ; TBO was described as increasing cell wall permeability (Wainwright et al., 1997) , whereas MB produces bacterial DNA damage (Menezes et al., 1990) .The use of cholorophylls in endodontic root canal treatment was previously described (Mohammadi et al., 2013) . 
There are evidencies showing that clorophyll present in green tea can be used in endodontic root canal treatment due to its antibacterial effects (Horiba et al., 1991) . In this work, we describe Zn(II)e 6 Me, obtained from the natural chlorophyll a, as a encouraging PS candidate displaying consistent antimicrobial outcomes. The ultrastructural study of microbial cells upon aPDT demonstrated that using Zn(II)e 6 Me as PS, results in the irreversible damage of E. faecalis cells (mono and dualspecies biofilms), displaying 'cell ghosts' , empty of its cellular content but with almost intact cell walls. The presence of these inactive \"ghost\" cells was corroborated by the biomass loss assessed by SR assay. Before, it was described that E. faecalis elimination with aPDT resulted in bleb formations suggestive of damage of membrane components (L\u00f3pez-Jim\u00e9nez et al., 2015) , shrunken, bacterial cell diameter reduction, rough and fractured appearance of the bacterial cells (Cheng et al., 2012) . It was also described the presence of bacterial cell membrane shriveling and alterations including loss of cocci or bacilli shape, grooves on the cell surface and draining of the intracellular components (Garcez et al., 2013) . According to our observations, E. faecalis cell wall destruction was sporadic and not a massive one, which appear to indicate that the induced damage was directed to proteins and/or lipids of the cytoplasmic membrane, resulting in the leakage of cellular contents, as described by others (Girotti, 2001) . It is also known that the extension of biochemical changes induced by aPDT is dependent on the PS nature and on the irradiation period (Dai et al., 2009) . In this study the aPDT proceeded during a short period (60 or 90 s), which can justify the punctual cell wall destruction in E. faecalis.In C. albicans cells in monospecies biofilms and in C. 
albicans cells in mixed biofilms, it was noticed several changes in the cellular organization, with a cytoplasmic membrane disruption, vacuoles morphology and organelles damage including signals of autophagy (e.g., ER whorls, and organelles inclusion in vacuoles) as described by others (Prates et al., 2011; Schuck et al., 2014) . The intracellular damage induced by aPDT with Zn(II)e 6 Me is probably dependent on the entry of this PS, since this feature is crucial for aPDT efficacy and outcomes (Hamblin and Hasan, 2004; Baltazar et al., 2015) . This lead us to speculate that the preincubation period of biofilms with PSs during 15 min in total absence of light, before the short irradiation period (60 and 90 s) most certainly contributed for the interaction between the PS and the cell. This would lead to intracellular PS distribution (due to its hydrophobic nature), impacting in the genesis of the intracellular damage observed. The questions raised by these observations highlight the importance of future further studies to unravel the intracellular distribution of Zn(II)e 6 Me. 
The Zn(II)e 6 Me antimicrobial potential, that we showed by quantification of biofilm biomass loss and by a microscopic study of the biofilm morphology and of the cellular ultrastructure, leads to the importance of defining the mechanism by which this modified chlorophyll affects the endodontic biofilms.Based on this, further research will be mandatory to improve the antimicrobial efficacy of aPDT in the root canal system, such as the ones recently published (Tennert et al., 2015; Cieplik et al., 2016) using human tooth models, ultimately leading to an optimization of light delivery and new PS formulations.", "cite_spans": [{"start": 829, "end": 848, "text": "(Seal et al., 2002;", "latex": null, "ref_id": "BIBREF48"}, {"start": 849, "end": 871, "text": "Bergmans et al., 2008;", "latex": null, "ref_id": "BIBREF2"}, {"start": 872, "end": 890, "text": "Rios et al., 2011)", "latex": null, "ref_id": "BIBREF45"}, {"start": 952, "end": 976, "text": "(Gambarini et al., 2011;", "latex": null, "ref_id": "BIBREF21"}, {"start": 977, "end": 995, "text": "Rios et al., 2011)", "latex": null, "ref_id": "BIBREF45"}, {"start": 1027, "end": 1049, "text": "(Shrestha et al., 2012", "latex": null, "ref_id": "BIBREF49"}, {"start": 1050, "end": 1074, "text": "(Shrestha et al., , 2014", "latex": null, "ref_id": "BIBREF50"}, {"start": 1075, "end": 1099, "text": "Persadmehr et al., 2014)", "latex": null, "ref_id": "BIBREF41"}, {"start": 1257, "end": 1284, "text": "(Jeansonne and White, 1994;", "latex": null, "ref_id": "BIBREF27"}, {"start": 1285, "end": 1307, "text": "Siqueira et al., 2007;", "latex": null, "ref_id": "BIBREF52"}, {"start": 1308, "end": 1324, "text": "Mohammadi, 2008)", "latex": null, "ref_id": "BIBREF35"}, {"start": 1425, "end": 1444, "text": "\u00d6n\u00e7a\u01e7 et al., 2003;", "latex": null, "ref_id": "BIBREF40"}, {"start": 1445, "end": 1466, "text": "Trevino et al., 2011;", "latex": null, "ref_id": "BIBREF58"}, {"start": 1467, "end": 1485, "text": "Wang et al., 2015)", 
"latex": null, "ref_id": "BIBREF61"}, {"start": 1636, "end": 1658, "text": "(Castano et al., 2004)", "latex": null, "ref_id": "BIBREF10"}, {"start": 2139, "end": 2154, "text": "(Bonnett, 1995)", "latex": null, "ref_id": "BIBREF3"}, {"start": 2350, "end": 2371, "text": "(Schuck et al., 2014)", "latex": null, "ref_id": "BIBREF47"}, {"start": 2474, "end": 2496, "text": "(de Melo et al., 2013;", "latex": null, "ref_id": "BIBREF17"}, {"start": 2497, "end": 2519, "text": "Baltazar et al., 2015)", "latex": null, "ref_id": "BIBREF1"}, {"start": 2577, "end": 2602, "text": "(Wainwright et al., 1997)", "latex": null, "ref_id": "BIBREF60"}, {"start": 2646, "end": 2668, "text": "(Menezes et al., 1990)", "latex": null, "ref_id": "BIBREF34"}, {"start": 2755, "end": 2779, "text": "(Mohammadi et al., 2013)", "latex": null, "ref_id": "BIBREF36"}, {"start": 2928, "end": 2949, "text": "(Horiba et al., 1991)", "latex": null, "ref_id": "BIBREF26"}, {"start": 3629, "end": 3657, "text": "(L\u00f3pez-Jim\u00e9nez et al., 2015)", "latex": null, "ref_id": "BIBREF31"}, {"start": 3759, "end": 3779, "text": "(Cheng et al., 2012)", "latex": null, "ref_id": "BIBREF11"}, {"start": 3986, "end": 4007, "text": "(Garcez et al., 2013)", "latex": null, "ref_id": "BIBREF22"}, {"start": 4298, "end": 4313, "text": "(Girotti, 2001)", "latex": null, "ref_id": "BIBREF23"}, {"start": 4451, "end": 4469, "text": "(Dai et al., 2009)", "latex": null, "ref_id": "BIBREF16"}, {"start": 4951, "end": 4972, "text": "(Prates et al., 2011;", "latex": null, "ref_id": "BIBREF42"}, {"start": 4973, "end": 4993, "text": "Schuck et al., 2014)", "latex": null, "ref_id": "BIBREF47"}, {"start": 5163, "end": 5188, "text": "(Hamblin and Hasan, 2004;", "latex": null, "ref_id": "BIBREF25"}, {"start": 5189, "end": 5211, "text": "Baltazar et al., 2015)", "latex": null, "ref_id": "BIBREF1"}, {"start": 6219, "end": 6241, "text": "(Tennert et al., 2015;", "latex": null, "ref_id": "BIBREF57"}, {"start": 6242, "end": 6263, "text": "Cieplik 
et al., 2016)", "latex": null, "ref_id": "BIBREF13"}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "PD, IM, JS, FC, and TG were responsible for the conception and design of the study, and for the analysis and interpretation of data; PD, CF, FC, and MM did most of the lab work and analysis of data; PD and TG did most of the manuscript writings; MF, MN, MU, and KdO extracted, modified and analyzed two of the PDT compounds; all the authors contributed equally to the revision of the manuscript and approved the final version to be submitted. ", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}], "ref_entries": {"FIGREF0": {"text": "FIGURE 1 | Chemical structures of the photosensitizers tested.", "latex": null, "type": "figure"}, "FIGREF1": {"text": "FIGURE 2 | Model of the experimental light emitting diode (LED) sources [green light diode (GaP); 557 nm of wavelength].", "latex": null, "type": "figure"}, "FIGREF2": {"text": "FIGURE 4 | Effect of aPDT with Zn(II)e 6 Me as photosensitizer compared with the classical irrigant NaOCl in the morphology of monospecies biofilms. Endodontic in vitro 48 h-biofilms of E. faecalis and C. albicans were obtained and treated as described under Materials and Methods. (A-C) E. faecalis was stained with Syto 13 Green Fluorescent Nucleic Acid Stain. (D-F) C. albicans was probed with polyclonal primary antibody and with anti-rabbit secondary antibody Alexafluor R 594. Representative images of biofilms untreated (A,D), treated with Zn(II)e 6 Me as PS activated for 90 s (B,E), and treated with NaOCl (C,F), were obtained with a Carl Zeiss Cell Observer Spinning Disk with Alpha Plan-Apochromat objective (100\u00d7).", "latex": null, "type": "figure"}, "FIGREF3": {"text": "FIGURE 5 | Effect of PDT with Zn(II)e 6 Me as photosensitizer compared with the classical irrigant NaOCl in the morphology of mixed biofilm. 
The images (A) untreated controls, (B) Zn(II)e 6 Me activated during 90 s, and (C) with NaOCl, were obtained with using an Olympus BX-40 microscope at 1000\u00d7 total magnification. Images were recorded at different time periods on an Olympus C-200 digital camera. Bars: 20 \u00b5m.", "latex": null, "type": "figure"}, "FIGREF4": {"text": "FIGURE 6 | Ultrastructural modification of microbial cells upon antimicrobial photodynamic therapy (aPDT) with Zn(II)e 6 Me as photosensitizer. 48 hbiofilms of E. faecalis, C. albicans and mixed biofilms with both microrganisms untreated (A-C) and treated with aPDT with Zn(II)e 6 Me as photosensitizer (E. faecalis biofilm: D-F), (C. albicans biofilm: G-J), (mixed biofilms: K-N). Solid arrows indicate peculiar intracellular membrane arrangements probably corresponding to endoplasmic reticulum whorls and open arrows depicted large vacuoles with electrodense materials. Bars, 1000 nm, except for E and I, 500 nm.", "latex": null, "type": "figure"}, "FIGREF5": {"text": "This work was partially supported by FEDER funds through the Operational Programme Competitiveness Factors -COMPETE and national funds by Foundation for Science and Technology (FCT) under the strategic project UID/NEU/04539/2013; by QOPNA research Unit of University of Aveiro and FCT/MEC (FCT UID/QUI/00062/2013) co- financed by the FEDER, (PT2020) Partnership Agreement and also the Portuguese NMR Network; and by FAPESP (S\u00e3o Paulo Research Foundation -Brazil) 2013/07276-1 and 2015/21110-4 for the semi-synthesis of Zn(II)e6Me. 
CF and IM are recipients of postdoctoral fellowships from FCT (respectively, SFRH/BPD/63733/2009 and SFRH/BPD/113285/2015).", "latex": null, "type": "figure"}, "TABREF0": {"text": "Photosensitizers (PSs) effect in biofilm biomass (1) in the total absence of light during an incubation period of 15 min.", "latex": null, "type": "table"}, "TABREF1": {"text": "Statistical analysis (1) of the efficiency of Zn(II)e 6 Me against microbial biofilms in comparison with classical irrigants in the clearance of E. faecalis, C. albicans, and mixed biofilms.", "latex": null, "type": "table"}}, "bib_entries": {"BIBREF0": {"ref_id": "b0", "title": "Photodynamic inactivation of multidrug-resistant bacteria in hospital wastewaters: influence of residual antibiotics", "authors": [{"first": "J", "middle": [], "last": "Almeida", "suffix": ""}, {"first": "J", "middle": ["P"], "last": "Tom\u00e9", "suffix": ""}, {"first": "M", "middle": ["G"], "last": "Neves", "suffix": ""}, {"first": "A", "middle": ["C"], "last": "Tom\u00e9", "suffix": ""}, {"first": "J", "middle": ["A"], "last": "Cavaleiro", "suffix": ""}, {"first": "\u00c2", "middle": [], "last": "Cunha", "suffix": ""}], "year": 2014, "venue": "Photochem. Photobiol. Sci", "volume": "13", "issn": "", "pages": "626--633", "other_ids": {"doi": ["10.1039/c3pp50195g"]}, "links": "25540194"}, "BIBREF1": {"ref_id": "b1", "title": "Antimicrobial photodynamic therapy: an effective alternative approach to control fungal infections", "authors": [{"first": "L", "middle": ["M"], "last": "Baltazar", "suffix": ""}, {"first": "A", "middle": [], "last": "Ray", "suffix": ""}, {"first": "D", "middle": ["A"], "last": "Santos", "suffix": ""}, {"first": "P", "middle": ["S"], "last": "Cisalpino", "suffix": ""}, {"first": "A", "middle": ["J"], "last": "Friedman", "suffix": ""}, {"first": "J", "middle": ["D"], "last": "Nosanchuk", "suffix": ""}], "year": 2015, "venue": "Front. 
Microbiol", "volume": "6", "issn": "", "pages": "", "other_ids": {"doi": ["10.3389/fmicb.2015.00202"]}, "links": "1613877"}, "BIBREF2": {"ref_id": "b2", "title": "Effect of photo-activated disinfection on endodontic pathogens ex vivo", "authors": [{"first": "L", "middle": [], "last": "Bergmans", "suffix": ""}, {"first": "P", "middle": [], "last": "Moisiadis", "suffix": ""}, {"first": "B", "middle": [], "last": "Huybrethts", "suffix": ""}, {"first": "B", "middle": [], "last": "Van Meerbeek", "suffix": ""}, {"first": "M", "middle": [], "last": "Quirynen", "suffix": ""}, {"first": "P", "middle": [], "last": "Lambrechts", "suffix": ""}], "year": 2008, "venue": "Int. Endod. J", "volume": "41", "issn": "", "pages": "227--239", "other_ids": {"doi": ["10.1111/j.1365-2591.2007.01344.x"]}, "links": null}, "BIBREF3": {"ref_id": "b3", "title": "Photosensitizers of the porphyrin and phthalocyanine series for photodynamic therapy", "authors": [{"first": "R", "middle": [], "last": "Bonnett", "suffix": ""}], "year": 1995, "venue": "Chem. Soc. 
Rev", "volume": "24", "issn": "", "pages": "", "other_ids": {"doi": ["10.1039/cs9952400019"]}, "links": null}, "BIBREF4": {"ref_id": "b4", "title": "Microbiological evaluation of photo-activated disinfection in endodontics (an in vivo study)", "authors": [{"first": "S", "middle": ["J"], "last": "Bonsor", "suffix": ""}, {"first": "R", "middle": [], "last": "Nichol", "suffix": ""}, {"first": "T", "middle": ["M"], "last": "Reid", "suffix": ""}, {"first": "G", "middle": ["J"], "last": "Pearson", "suffix": ""}], "year": 2006, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}, "links": "23440285"}, "BIBREF6": {"ref_id": "b6", "title": "An alternative regimen for root canal disinfection", "authors": [{"first": "S", "middle": ["J"], "last": "Bonsor", "suffix": ""}, {"first": "R", "middle": [], "last": "Nichol", "suffix": ""}, {"first": "T", "middle": ["M S"], "last": "Reid", "suffix": ""}, {"first": "G", "middle": ["J"], "last": "Pearson", "suffix": ""}], "year": 2006, "venue": "Br. Dent. J", "volume": "201", "issn": "", "pages": "101--105", "other_ids": {}, "links": "25610723"}, "BIBREF7": {"ref_id": "b7", "title": "The antibacterial action of sodium hypochlorite and EDTA in 60 cases of endodontic therapy", "authors": [{"first": "A", "middle": [], "last": "Bystrom", "suffix": ""}, {"first": "G", "middle": [], "last": "Sundqvist", "suffix": ""}], "year": 1985, "venue": "Int. Endod. 
J", "volume": "18", "issn": "", "pages": "35--40", "other_ids": {"doi": ["10.1111/j.1365-2591.1985.tb00416.x"]}, "links": null}, "BIBREF8": {"ref_id": "b8", "title": "Ultrastructural examination of failed molar retreatment with secondary apical periodontitis: an examination of endodontic biofilms in an endodontic retreatment failure", "authors": [{"first": "G", "middle": ["B"], "last": "Carr", "suffix": ""}, {"first": "R", "middle": ["S"], "last": "Schwarts", "suffix": ""}, {"first": "C", "middle": [], "last": "Shchaudinn", "suffix": ""}, {"first": "A", "middle": [], "last": "Gorur", "suffix": ""}, {"first": "J", "middle": ["W"], "last": "Costerton", "suffix": ""}], "year": 2009, "venue": "J. Endod", "volume": "35", "issn": "", "pages": "1303--1309", "other_ids": {"doi": ["10.1016/j.joen.2009.05.035"]}, "links": "44670765"}, "BIBREF9": {"ref_id": "b9", "title": "Functional cationic nanomagnet-Porphyrin hybrids for the photoinactivation of microorganisms", "authors": [{"first": "C", "middle": ["M"], "last": "Carvalho", "suffix": ""}, {"first": "E", "middle": [], "last": "Alves", "suffix": ""}, {"first": "L", "middle": [], "last": "Costa", "suffix": ""}, {"first": "J", "middle": ["P"], "last": "Tom\u00e9", "suffix": ""}, {"first": "M", "middle": ["A"], "last": "Faustino", "suffix": ""}, {"first": "M", "middle": ["G"], "last": "Neves", "suffix": ""}], "year": 2010, "venue": "ACS Nano", "volume": "4", "issn": "", "pages": "7133--7140", "other_ids": {}, "links": "9190603"}, "BIBREF10": {"ref_id": "b10", "title": "Mechanisms in photodynamic therapy: part one-Photosensitizers, photochemistry and cellular localization", "authors": [{"first": "A", "middle": ["P"], "last": "Castano", "suffix": ""}, {"first": "T", "middle": ["N"], "last": "Demidova", "suffix": ""}, {"first": "M", "middle": ["R"], "last": "Hamblin", "suffix": ""}], "year": 2004, "venue": "Photodiagnosis Photodyn. 
Ther", "volume": "1", "issn": "", "pages": "7--11", "other_ids": {"doi": ["10.1016/S1572-1000(05"]}, "links": "32020592"}, "BIBREF11": {"ref_id": "b11", "title": "Evaluation of the bactericidal effect of Nd:YAG, Er:YAG, Er,Cr:YSGG laser radiation, and antimicrobial photodynamic therapy (aPDT) in experimentally infected root canals", "authors": [{"first": "X", "middle": [], "last": "Cheng", "suffix": ""}, {"first": "S", "middle": [], "last": "Guan", "suffix": ""}, {"first": "H", "middle": [], "last": "Lu", "suffix": ""}, {"first": "C", "middle": [], "last": "Zhao", "suffix": ""}, {"first": "X", "middle": [], "last": "Chen", "suffix": ""}, {"first": "N", "middle": [], "last": "Li", "suffix": ""}], "year": 2012, "venue": "Lasers Surg. Med", "volume": "44", "issn": "", "pages": "824--831", "other_ids": {"doi": ["10.1002/lsm.22092"]}, "links": "2069235"}, "BIBREF12": {"ref_id": "b12", "title": "The Effect of photodynamic therapy in root canal disinfection: a systematic review", "authors": [{"first": "V", "middle": [], "last": "Chrepa", "suffix": ""}, {"first": "G", "middle": ["A"], "last": "Kotsakis", "suffix": ""}, {"first": "T", "middle": ["C"], "last": "Pagonis", "suffix": ""}, {"first": "K", "middle": ["M"], "last": "Hargreaves", "suffix": ""}], "year": 2014, "venue": "J. 
Endod", "volume": "40", "issn": "", "pages": "891--898", "other_ids": {"doi": ["10.1016/j.joen.2014.03.005"]}, "links": "42278858"}, "BIBREF13": {"ref_id": "b13", "title": "Photodynamic inactivation of root canal bacteria by light activation through human dental hard and simulated surrounding tissue", "authors": [{"first": "F", "middle": [], "last": "Cieplik", "suffix": ""}, {"first": "A", "middle": [], "last": "Pummer", "suffix": ""}, {"first": "C", "middle": [], "last": "Leibl", "suffix": ""}, {"first": "J", "middle": [], "last": "Regensburger", "suffix": ""}, {"first": "G", "middle": [], "last": "Schmalz", "suffix": ""}, {"first": "W", "middle": [], "last": "Buchalla", "suffix": ""}], "year": 2016, "venue": "Front. Microbiol", "volume": "7", "issn": "", "pages": "", "other_ids": {"doi": ["10.3389/fmicb.2016.00929"]}, "links": "17923488"}, "BIBREF14": {"ref_id": "b14", "title": "Evaluation of resistance development and viability recovery by a non-enveloped virus after repeated cycles of aPDT", "authors": [{"first": "L", "middle": [], "last": "Costa", "suffix": ""}, {"first": "J", "middle": ["P C"], "last": "Tom\u00e9", "suffix": ""}, {"first": "M", "middle": ["G P M S"], "last": "Neves", "suffix": ""}, {"first": "A", "middle": ["C"], "last": "Tom\u00e9", "suffix": ""}, {"first": "J", "middle": ["A S"], "last": "Cavaleiro", "suffix": ""}, {"first": "M", "middle": ["A F"], "last": "Faustino", "suffix": ""}], "year": 2011, "venue": "Antivir. 
Res", "volume": "91", "issn": "", "pages": "278--282", "other_ids": {"doi": ["10.1016/j.antiviral.2011.06.007"]}, "links": "5801968"}, "BIBREF15": {"ref_id": "b15", "title": "Bacterial biofilms: a common cause of persistent infections", "authors": [{"first": "J", "middle": ["W"], "last": "Costerton", "suffix": ""}, {"first": "P", "middle": ["S"], "last": "Stewart", "suffix": ""}, {"first": "E", "middle": ["P"], "last": "Greenberg", "suffix": ""}], "year": 1999, "venue": "Science", "volume": "284", "issn": "", "pages": "1318--1322", "other_ids": {}, "links": "27364291"}, "BIBREF16": {"ref_id": "b16", "title": "Photodynamic therapy for localized infections-state of the art", "authors": [{"first": "T", "middle": [], "last": "Dai", "suffix": ""}, {"first": "Y", "middle": ["Y"], "last": "Huang", "suffix": ""}, {"first": "M", "middle": ["R"], "last": "Hamblin", "suffix": ""}], "year": 2009, "venue": "Photodiagnosis Photodyn. Ther", "volume": "6", "issn": "", "pages": "170--188", "other_ids": {"doi": ["10.1016/j.pdpdt.2009.10.008"]}, "links": "46028762"}, "BIBREF17": {"ref_id": "b17", "title": "Photodynamic inactivation of biofilm: taking a lightly colored approach to stubborn infection", "authors": [{"first": "W", "middle": ["C"], "last": "De Melo", "suffix": ""}, {"first": "P", "middle": [], "last": "Avci", "suffix": ""}, {"first": "M", "middle": ["N"], "last": "De Oliveira", "suffix": ""}, {"first": "A", "middle": [], "last": "Gupta", "suffix": ""}, {"first": "D", "middle": [], "last": "Vecchio", "suffix": ""}, {"first": "M", "middle": [], "last": "Sadasivam", "suffix": ""}], "year": 2013, "venue": "Expert Rev. Anti Infect. 
Ther", "volume": "11", "issn": "", "pages": "669--693", "other_ids": {"doi": ["10.1586/14787210.2013.811861"]}, "links": "13413088"}, "BIBREF18": {"ref_id": "b18", "title": "Photodynamic antimicrobial chemotherapy for root canal system asepsis: a narrative literature review", "authors": [{"first": "P", "middle": [], "last": "Diogo", "suffix": ""}, {"first": "T", "middle": [], "last": "Gon\u00e7alves", "suffix": ""}, {"first": "P", "middle": [], "last": "Palma", "suffix": ""}, {"first": "J", "middle": ["M"], "last": "Santos", "suffix": ""}], "year": 2015, "venue": "Int. J. Dent", "volume": "2015", "issn": "", "pages": "1--26", "other_ids": {"doi": ["10.1155/2015/269205"]}, "links": "15189479"}, "BIBREF19": {"ref_id": "b19", "title": "Biofilms: survival mechanisms of clinically relevant microorganisms", "authors": [{"first": "R", "middle": ["M"], "last": "Donlan", "suffix": ""}, {"first": "J", "middle": ["W"], "last": "Costerton", "suffix": ""}], "year": 2002, "venue": "Clin. Microbiol. Rev", "volume": "15", "issn": "", "pages": "167--193", "other_ids": {"doi": ["10.1128/CMR.15.2.167-193.2002"]}, "links": null}, "BIBREF20": {"ref_id": "b20", "title": "Mechanism of action of sodium hypochlorite", "authors": [{"first": "C", "middle": [], "last": "Estrela", "suffix": ""}, {"first": "E", "middle": ["L"], "last": "Barbin", "suffix": ""}, {"first": "M", "middle": ["A"], "last": "Marchesan", "suffix": ""}, {"first": "J", "middle": ["D"], "last": "P\u00e9cora", "suffix": ""}], "year": 2002, "venue": "Braz. Dent. 
J", "volume": "13", "issn": "", "pages": "113--117", "other_ids": {"doi": ["10.1590/S0103-64402002000200007"]}, "links": null}, "BIBREF21": {"ref_id": "b21", "title": "In vitro evaluation of the cytotoxicity of FotoSanTM light-activated disinfection on human fibroblasts", "authors": [{"first": "G", "middle": [], "last": "Gambarini", "suffix": ""}, {"first": "G", "middle": [], "last": "Plotino", "suffix": ""}, {"first": "N", "middle": ["M"], "last": "Grande", "suffix": ""}, {"first": "G", "middle": [], "last": "Nocca", "suffix": ""}, {"first": "A", "middle": [], "last": "Lupi", "suffix": ""}, {"first": "B", "middle": [], "last": "Giardina", "suffix": ""}], "year": 2011, "venue": "Med. Sci. Monit", "volume": "17", "issn": "", "pages": "21--25", "other_ids": {}, "links": "10158819"}, "BIBREF22": {"ref_id": "b22", "title": "Effects of photodynamic therapy on Grampositive and Gram-negative bacterial biofilms by bioluminescence imaging and scanning electron microscopic analysis", "authors": [{"first": "A", "middle": ["S"], "last": "Garcez", "suffix": ""}, {"first": "S", "middle": ["C"], "last": "N\u00fa\u00f1ez", "suffix": ""}, {"first": "", "middle": [], "last": "Azambuja", "suffix": ""}, {"first": "", "middle": [], "last": "Jr", "suffix": ""}, {"first": "E", "middle": ["R"], "last": "Fregnani", "suffix": ""}, {"first": "H", "middle": ["M"], "last": "Rodriguez", "suffix": ""}, {"first": "M", "middle": ["R"], "last": "Hamblin", "suffix": ""}], "year": 2013, "venue": "Photomed. 
Laser Surg", "volume": "31", "issn": "", "pages": "519--525", "other_ids": {"doi": ["10.1089/pho.2012.3341"]}, "links": "19617437"}, "BIBREF23": {"ref_id": "b23", "title": "Photosensitized oxidation of membrane lipids: reaction pathways, cytotoxic effects, and cytoprotective mechanisms", "authors": [{"first": "A", "middle": ["W"], "last": "Girotti", "suffix": ""}, {"first": "A", "middle": ["T P C"], "last": "Gomes", "suffix": ""}, {"first": "M", "middle": ["A F"], "last": "Faustino", "suffix": ""}, {"first": "M", "middle": ["G P M"], "last": "Neves", "suffix": ""}, {"first": "V", "middle": ["F"], "last": "Ferreira", "suffix": ""}, {"first": "A", "middle": [], "last": "Juarranz", "suffix": ""}, {"first": "J", "middle": ["A S"], "last": "Cavaleiro", "suffix": ""}], "year": 2001, "venue": "J. Photochem. Photobiol. B", "volume": "63", "issn": "", "pages": "33496--33502", "other_ids": {"doi": ["10.1016/S1011-1344(01)00207"]}, "links": "5674458"}, "BIBREF24": {"ref_id": "b24", "title": "Irrigation in Endodontics", "authors": [{"first": "M", "middle": [], "last": "Haapasalo", "suffix": ""}, {"first": "Y", "middle": [], "last": "Shen", "suffix": ""}, {"first": "W", "middle": [], "last": "Qian", "suffix": ""}, {"first": "Y", "middle": [], "last": "Gao", "suffix": ""}], "year": 2010, "venue": "Dent. Clin. North. Am", "volume": "54", "issn": "", "pages": "291--312", "other_ids": {"doi": ["10.1016/j.cden.2009.12.001"]}, "links": "34624771"}, "BIBREF25": {"ref_id": "b25", "title": "Photodynamic therapy: a new antimicrobial approach to infectious disease?", "authors": [{"first": "M", "middle": ["R"], "last": "Hamblin", "suffix": ""}, {"first": "T", "middle": [], "last": "Hasan", "suffix": ""}], "year": 2004, "venue": "Photochem. Photobiol. 
Sci", "volume": "3", "issn": "", "pages": "436--450", "other_ids": {"doi": ["10.1039/b311900a"]}, "links": "637778"}, "BIBREF26": {"ref_id": "b26", "title": "A pilot study of Japanese green tea as a medicament: antibacterial and bactericidal effects", "authors": [{"first": "N", "middle": [], "last": "Horiba", "suffix": ""}, {"first": "Y", "middle": [], "last": "Maekawa", "suffix": ""}, {"first": "T", "middle": [], "last": "Matsumoto", "suffix": ""}, {"first": "H", "middle": [], "last": "Nakamura", "suffix": ""}], "year": 1991, "venue": "J. Endod", "volume": "17", "issn": "", "pages": "81743--81750", "other_ids": {"doi": ["10.1016/S0099-2399(06"]}, "links": null}, "BIBREF27": {"ref_id": "b27", "title": "A Comparison of 2", "authors": [{"first": "M", "middle": ["J"], "last": "Jeansonne", "suffix": ""}, {"first": "R", "middle": ["R"], "last": "White", "suffix": ""}], "year": 1994, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}, "links": null}, "BIBREF28": {"ref_id": "b28", "title": "Chlorhexidine Gluconate and 5,25 % Sodium Hypochlorite as antimicrobial endodontic irrigants", "authors": [], "year": null, "venue": "J. Endod", "volume": "20", "issn": "", "pages": "276--278", "other_ids": {"doi": ["10.1016/S0099-2399(06)80815-0"]}, "links": null}, "BIBREF29": {"ref_id": "b29", "title": "Vascular-targeted photodynamic therapy in the treatment of neovascular age-related macular degeneration: clinical perspectives", "authors": [{"first": "A", "middle": [], "last": "Kawczyk-Krupka", "suffix": ""}, {"first": "A", "middle": ["M"], "last": "Bugaj", "suffix": ""}, {"first": "M", "middle": [], "last": "Potempa", "suffix": ""}, {"first": "K", "middle": [], "last": "Wasilewska", "suffix": ""}, {"first": "W", "middle": [], "last": "Latos", "suffix": ""}, {"first": "A", "middle": [], "last": "Siero\u00f1", "suffix": ""}], "year": 2015, "venue": "Photodiagnosis Photodyn. 
Ther", "volume": "12", "issn": "", "pages": "161--175", "other_ids": {"doi": ["10.1016/j.pdpdt.2015.03.007"]}, "links": "223171"}, "BIBREF30": {"ref_id": "b30", "title": "Quantification of cells cultured on 96-well plates", "authors": [{"first": "W", "middle": [], "last": "Kueng", "suffix": ""}, {"first": "E", "middle": [], "last": "Silber", "suffix": ""}, {"first": "U", "middle": [], "last": "Eppenberger", "suffix": ""}], "year": 1989, "venue": "Anal. Biochem", "volume": "182", "issn": "", "pages": "16--19", "other_ids": {"doi": ["10.1016/0003-2697(89)90710-0"]}, "links": "7089162"}, "BIBREF31": {"ref_id": "b31", "title": "Effects of photodynamic therapy on Enterococcus faecalis biofilms", "authors": [{"first": "L", "middle": [], "last": "L\u00f3pez-Jim\u00e9nez", "suffix": ""}, {"first": "E", "middle": [], "last": "Fust\u00e9", "suffix": ""}, {"first": "B", "middle": [], "last": "Mart\u00ednez-Garriga", "suffix": ""}, {"first": "J", "middle": [], "last": "Arnabat-Dom\u00ednguez", "suffix": ""}, {"first": "T", "middle": [], "last": "Vinuesa", "suffix": ""}, {"first": "M", "middle": [], "last": "Vi\u00f1as", "suffix": ""}], "year": 2015, "venue": "Lasers Med. Sci", "volume": "30", "issn": "", "pages": "1519--1526", "other_ids": {}, "links": "17352331"}, "BIBREF32": {"ref_id": "b32", "title": "Case mortality in polymicrobial bloodstream infections", "authors": [{"first": "F", "middle": ["E"], "last": "Mckenzie", "suffix": ""}], "year": 2008, "venue": "J. Clin. 
Epidemiol", "volume": "59", "issn": "", "pages": "760--761", "other_ids": {"doi": ["10.1016/j.jclinepi.2005.12.009"]}, "links": "13791608"}, "BIBREF33": {"ref_id": "b33", "title": "Synthesis of new chlorine6 trimethyl and protoporphyrin IX dimethyl ester derivatives and their photophysical and electrochemical characterizations", "authors": [{"first": "J", "middle": ["C"], "last": "Menezes", "suffix": ""}, {"first": "M", "middle": ["A"], "last": "Faustino", "suffix": ""}, {"first": "", "middle": [], "last": "De", "suffix": ""}, {"first": "K", "middle": ["T"], "last": "Oliveira", "suffix": ""}, {"first": "M", "middle": ["P"], "last": "Uliana", "suffix": ""}, {"first": "V", "middle": ["F"], "last": "Ferreira", "suffix": ""}, {"first": "S", "middle": [], "last": "Hackbarth", "suffix": ""}], "year": 2014, "venue": "Chemistry", "volume": "20", "issn": "", "pages": "13644--13655", "other_ids": {}, "links": "9099630"}, "BIBREF34": {"ref_id": "b34", "title": "Photodynamic action of methylene blue: repair and mutation in Escherichia coli", "authors": [{"first": "S", "middle": [], "last": "Menezes", "suffix": ""}, {"first": "M", "middle": ["A"], "last": "Capella", "suffix": ""}, {"first": "L", "middle": ["R"], "last": "Caldas", "suffix": ""}], "year": 1990, "venue": "J. Photochem. Photobiol. B", "volume": "5", "issn": "", "pages": "505--517", "other_ids": {"doi": ["10.1016/1011-1344(90)85062-2"]}, "links": "23609299"}, "BIBREF35": {"ref_id": "b35", "title": "Sodium hypochlorite in endodontics: an update review", "authors": [{"first": "Z", "middle": [], "last": "Mohammadi", "suffix": ""}], "year": 2008, "venue": "Int. Dent. 
J", "volume": "58", "issn": "", "pages": "329--341", "other_ids": {"doi": ["10.1111/j.1875-595X.2008.tb00354.x"]}, "links": "3898648"}, "BIBREF36": {"ref_id": "b36", "title": "Microbial biofilms in endodontic infections: an update review", "authors": [{"first": "Z", "middle": [], "last": "Mohammadi", "suffix": ""}, {"first": "F", "middle": [], "last": "Palazzi", "suffix": ""}, {"first": "L", "middle": [], "last": "Giardino", "suffix": ""}, {"first": "S", "middle": [], "last": "Shalavi", "suffix": ""}], "year": 2013, "venue": "Biomed. J", "volume": "36", "issn": "", "pages": "59--70", "other_ids": {}, "links": "35407718"}, "BIBREF37": {"ref_id": "b37", "title": "Light and electron microscopic studies of root canal flora and periapical lesions", "authors": [{"first": "P", "middle": ["N"], "last": "Nair", "suffix": ""}], "year": 1987, "venue": "J. Endod", "volume": "13", "issn": "", "pages": "29--39", "other_ids": {"doi": ["10.1016/S0099-2399(87)80089-4"]}, "links": null}, "BIBREF38": {"ref_id": "b38", "title": "On the causes of persistent apical periodontitis-a review", "authors": [{"first": "P", "middle": ["N"], "last": "Nair", "suffix": ""}], "year": 2006, "venue": "Int. Endod. J", "volume": "39", "issn": "", "pages": "249--281", "other_ids": {"doi": ["10.1111/j.1365-2591.2006.01099.x"]}, "links": null}, "BIBREF39": {"ref_id": "b39", "title": "Photophysical and photobiological processes in the photodynamic therapy of tumours", "authors": [{"first": "M", "middle": [], "last": "Ochsner", "suffix": ""}], "year": 1997, "venue": "J. Photochem. Photobiol. 
B", "volume": "39", "issn": "", "pages": "1--18", "other_ids": {"doi": ["10.1016/S1011-1344(96)07428-3"]}, "links": "5606728"}, "BIBREF40": {"ref_id": "b40", "title": "Comparison of antibacterial and toxic effects of various root canal irrigants", "authors": [{"first": "O", "middle": [], "last": "\u00d6n\u00e7a\u01e7", "suffix": ""}, {"first": "M", "middle": [], "last": "Hosgor", "suffix": ""}, {"first": "S", "middle": [], "last": "Hilmiiolu", "suffix": ""}, {"first": "O", "middle": [], "last": "Zekioglu", "suffix": ""}, {"first": "C", "middle": [], "last": "Eronat", "suffix": ""}, {"first": "D", "middle": [], "last": "Burhano\u00f0lu", "suffix": ""}], "year": 2003, "venue": "Int. Endo. J", "volume": "36", "issn": "", "pages": "423--432", "other_ids": {"doi": ["10.1046/j.1365-2591.2003.00673.x"]}, "links": null}, "BIBREF41": {"ref_id": "b41", "title": "Bioactive chitosan nanoparticles and photodynamic therapy inhibit collagen degradation in vitro", "authors": [{"first": "A", "middle": [], "last": "Persadmehr", "suffix": ""}, {"first": "C", "middle": ["D"], "last": "Torneck", "suffix": ""}, {"first": "D", "middle": ["G"], "last": "Cvitkovitch", "suffix": ""}, {"first": "V", "middle": [], "last": "Pinto", "suffix": ""}, {"first": "I", "middle": [], "last": "Talior", "suffix": ""}, {"first": "M", "middle": [], "last": "Kazembe", "suffix": ""}], "year": 2014, "venue": "J. 
Endod", "volume": "40", "issn": "", "pages": "703--709", "other_ids": {"doi": ["10.1016/j.joen.2013.11.004"]}, "links": "25228199"}, "BIBREF42": {"ref_id": "b42", "title": "Influence of multidrug efflux systems on methylene blue-mediated photodynamic inactivation of Candida albicans", "authors": [{"first": "R", "middle": ["A"], "last": "Prates", "suffix": ""}, {"first": "I", "middle": ["T"], "last": "Kato", "suffix": ""}, {"first": "M", "middle": ["S"], "last": "Ribeiro", "suffix": ""}, {"first": "G", "middle": ["P"], "last": "Tegos", "suffix": ""}, {"first": "M", "middle": ["R"], "last": "Hamblin", "suffix": ""}], "year": 2011, "venue": "J. Antimicrob. Chemother", "volume": "66", "issn": "", "pages": "1525--1532", "other_ids": {"doi": ["10.1093/jac/dkr160"]}, "links": "10822974"}, "BIBREF43": {"ref_id": "b43", "title": "Biofilms and apical periodontitis: study of prevalence and association with clinical and histopathologic findings", "authors": [{"first": "D", "middle": [], "last": "Ricucci", "suffix": ""}, {"first": "J", "middle": ["F"], "last": "Siqueira", "suffix": ""}], "year": 2010, "venue": "J. Endod", "volume": "36", "issn": "", "pages": "1277--1288", "other_ids": {"doi": ["10.1016/j.joen.2010.04.007"]}, "links": "25245397"}, "BIBREF44": {"ref_id": "b44", "title": "Histologic investigation of root canal-treated teeth with apical periodontitis: a retrospective study from twenty-four patients", "authors": [{"first": "D", "middle": [], "last": "Ricucci", "suffix": ""}, {"first": "J", "middle": ["F"], "last": "Siqueira", "suffix": ""}, {"first": "A", "middle": ["L"], "last": "Bate", "suffix": ""}, {"first": "Pitt", "middle": [], "last": "Ford", "suffix": ""}, {"first": "T", "middle": ["R"], "last": "", "suffix": ""}], "year": 2009, "venue": "J. 
Endod", "volume": "35", "issn": "", "pages": "493--502", "other_ids": {"doi": ["10.1016/j.joen.2008.12.014"]}, "links": "6412603"}, "BIBREF45": {"ref_id": "b45", "title": "Evaluation of photodynamic therapy using a light-emitting diode lamp against Enterococcus faecalis in extracted human teeth", "authors": [{"first": "A", "middle": [], "last": "Rios", "suffix": ""}, {"first": "J", "middle": [], "last": "He", "suffix": ""}, {"first": "G", "middle": ["N"], "last": "Glickman", "suffix": ""}, {"first": "R", "middle": [], "last": "Spears", "suffix": ""}, {"first": "E", "middle": ["D"], "last": "Schneiderman", "suffix": ""}, {"first": "A", "middle": ["L"], "last": "Honeyman", "suffix": ""}], "year": 2011, "venue": "J. Endod", "volume": "37", "issn": "", "pages": "856--859", "other_ids": {"doi": ["10.1016/j.joen.2011.03.014"]}, "links": "44964984"}, "BIBREF46": {"ref_id": "b46", "title": "Real-time evaluation of two light delivery systems for photodynamic disinfection of Candida albicans biofilm in curved root canals", "authors": [{"first": "C", "middle": ["P"], "last": "Sabino", "suffix": ""}, {"first": "A", "middle": ["S"], "last": "Garcez", "suffix": ""}, {"first": "S", "middle": ["C"], "last": "N\u00fa\u00f1ez", "suffix": ""}, {"first": "M", "middle": ["S"], "last": "Ribeiro", "suffix": ""}, {"first": "M", "middle": ["R"], "last": "Hamblin", "suffix": ""}], "year": 2015, "venue": "Lasers Med. Sci", "volume": "30", "issn": "", "pages": "1657--1665", "other_ids": {"doi": ["10.1007/s10103-014-1629-x"]}, "links": "20839632"}, "BIBREF47": {"ref_id": "b47", "title": "ER-phagy mediates selective degradation of endoplasmic reticulum independently of the core autophagy machinery", "authors": [{"first": "S", "middle": [], "last": "Schuck", "suffix": ""}, {"first": "C", "middle": ["M"], "last": "Gallagher", "suffix": ""}, {"first": "P", "middle": [], "last": "Walter", "suffix": ""}], "year": 2014, "venue": "J. 
Cell Sci", "volume": "127", "issn": "", "pages": "4078--4088", "other_ids": {"doi": ["10.1242/jcs.154716"]}, "links": "1393705"}, "BIBREF48": {"ref_id": "b48", "title": "An in vitro comparison of the bactericidal efficacy of lethal photosensitization or sodium hyphochlorite irrigation on Streptococcus intermedius biofilms in root canals", "authors": [{"first": "G", "middle": ["J"], "last": "Seal", "suffix": ""}, {"first": "Y", "middle": ["L"], "last": "Ng", "suffix": ""}, {"first": "D", "middle": [], "last": "Spratt", "suffix": ""}, {"first": "M", "middle": [], "last": "Bhatti", "suffix": ""}, {"first": "K", "middle": [], "last": "Gulabivala", "suffix": ""}], "year": 2002, "venue": "Int. Endod. J", "volume": "35", "issn": "", "pages": "268--274", "other_ids": {"doi": ["10.1046/j.1365-2591.2002.00477.x"]}, "links": "41151575"}, "BIBREF49": {"ref_id": "b49", "title": "Characterization of a conjugate between rose bengal and chitosan for targeted antibiofilm and tissue stabilization effects as a potential treatment of infected dentin", "authors": [{"first": "A", "middle": [], "last": "Shrestha", "suffix": ""}, {"first": "M", "middle": ["R"], "last": "Hamblin", "suffix": ""}, {"first": "A", "middle": [], "last": "Kishen", "suffix": ""}], "year": 2012, "venue": "Antimicrob. 
Agents Chemother", "volume": "56", "issn": "", "pages": "4876--4884", "other_ids": {"doi": ["10.1128/AAC.00810-12"]}, "links": "39885602"}, "BIBREF50": {"ref_id": "b50", "title": "Photoactivated rose bengal functionalized chitosan nanoparticles produce antibacterial/biofilm activity and stabilize dentin-collagen", "authors": [{"first": "A", "middle": [], "last": "Shrestha", "suffix": ""}, {"first": "M", "middle": ["R"], "last": "Hamblin", "suffix": ""}, {"first": "A", "middle": [], "last": "Kishen", "suffix": ""}], "year": 2014, "venue": "Nanomedicine", "volume": "10", "issn": "", "pages": "491--501", "other_ids": {"doi": ["10.1016/j.nano.2013.10.010"]}, "links": "1837150"}, "BIBREF51": {"ref_id": "b51", "title": "Bactericidal efficacy of photodynamic therapy against Enterococcus faecalis in infected root canals: a systematic literature review", "authors": [{"first": "S", "middle": ["H"], "last": "Siddiqui", "suffix": ""}, {"first": "K", "middle": ["H"], "last": "Awan", "suffix": ""}, {"first": "F", "middle": [], "last": "Javed", "suffix": ""}], "year": 2013, "venue": "Photodiagnosis Photodyn. Ther", "volume": "10", "issn": "", "pages": "632--643", "other_ids": {"doi": ["10.1016/j.pdpdt.2013.07.006"]}, "links": "2318213"}, "BIBREF52": {"ref_id": "b52", "title": "Effects of chemomechanical preparation with 2.5% sodium hypochlorite and intracanal medication with calcium hydroxide on cultivable bacteria in infected root canals", "authors": [{"first": "J", "middle": ["F"], "last": "Siqueira", "suffix": ""}, {"first": "T", "middle": [], "last": "Guimar\u00e3es-Pinto", "suffix": ""}, {"first": "I", "middle": ["N"], "last": "R\u00f4\u00e7as", "suffix": ""}], "year": 2007, "venue": "J. 
Endod", "volume": "33", "issn": "", "pages": "800--805", "other_ids": {"doi": ["10.1016/j.joen.2006.11.023"]}, "links": "15655263"}, "BIBREF53": {"ref_id": "b53", "title": "Chemomechanical reduction of the bacterial population in the root canal after instrumentation and irrigation with 1%, 2.5% and 5.25% sodium hypochlorite", "authors": [{"first": "J", "middle": ["F"], "last": "Siqueira", "suffix": ""}, {"first": "I", "middle": ["N"], "last": "R\u00f4\u00e7as", "suffix": ""}, {"first": "A", "middle": [], "last": "Favieri", "suffix": ""}, {"first": "K", "middle": ["C"], "last": "Lima", "suffix": ""}], "year": 2000, "venue": "J. Endod", "volume": "26", "issn": "", "pages": "331--334", "other_ids": {"doi": ["10.1097/00004770-200006000-00006"]}, "links": "11437409"}, "BIBREF54": {"ref_id": "b54", "title": "Influence of infection at the time of root filling on the outcome of endodontic treatment of teeth with apical periodontitis", "authors": [{"first": "U", "middle": [], "last": "Sj\u00f6gren", "suffix": ""}, {"first": "D", "middle": [], "last": "Figdor", "suffix": ""}, {"first": "S", "middle": [], "last": "Persson", "suffix": ""}, {"first": "G", "middle": [], "last": "Sundqvist", "suffix": ""}], "year": 1997, "venue": "Int. Endod. J", "volume": "30", "issn": "", "pages": "297--306", "other_ids": {"doi": ["10.1111/j.1365-2591.1997.tb00714.x"]}, "links": "28498587"}, "BIBREF55": {"ref_id": "b55", "title": "Rapid method for the detection of root canal bacteria in endodontic therapy", "authors": [{"first": "K", "middle": ["S"], "last": "Tan", "suffix": ""}, {"first": "V", "middle": ["S"], "last": "Yu", "suffix": ""}, {"first": "S", "middle": ["Y"], "last": "Quah", "suffix": ""}, {"first": "G", "middle": [], "last": "Bergenholtz", "suffix": ""}], "year": 2015, "venue": "J. 
Endod", "volume": "41", "issn": "", "pages": "447--450", "other_ids": {"doi": ["10.1016/j.joen.2014.11.025"]}, "links": "2562777"}, "BIBREF56": {"ref_id": "b56", "title": "Antimicrobial photodynamic therapy: study of bacterial recovery viability and potential development of resistance after treatment", "authors": [{"first": "A", "middle": [], "last": "Tavares", "suffix": ""}, {"first": "C", "middle": ["M B"], "last": "Carvalho", "suffix": ""}, {"first": "M", "middle": ["A"], "last": "Faustino", "suffix": ""}, {"first": "M", "middle": ["G P M S"], "last": "Neves", "suffix": ""}, {"first": "J", "middle": ["P C"], "last": "Tom\u00e9", "suffix": ""}, {"first": "A", "middle": ["C"], "last": "Tom\u00e9", "suffix": ""}], "year": 2010, "venue": "Mar. Drugs", "volume": "8", "issn": "", "pages": "91--105", "other_ids": {"doi": ["10.3390/md8010091"]}, "links": "7851710"}, "BIBREF57": {"ref_id": "b57", "title": "Ultrasonic activation and chemical modification of photosensitizers enhances the effects of photodynamic therapy against Enterococcus faecalis root-canal isolates", "authors": [{"first": "C", "middle": [], "last": "Tennert", "suffix": ""}, {"first": "A", "middle": ["M"], "last": "Drews", "suffix": ""}, {"first": "V", "middle": [], "last": "Walther", "suffix": ""}, {"first": "M", "middle": ["J"], "last": "Altenburger", "suffix": ""}, {"first": "L", "middle": [], "last": "Karygianni", "suffix": ""}, {"first": "K", "middle": ["T"], "last": "Wrbas", "suffix": ""}], "year": 2015, "venue": "Photodiagnosis Photodyn. 
Ther", "volume": "12", "issn": "", "pages": "244--251", "other_ids": {"doi": ["10.1016/j.pdpdt.2015.02.002"]}, "links": "22626017"}, "BIBREF58": {"ref_id": "b58", "title": "Effect of irrigants on the survival of human stem cells of the apical papilla in a platelet-rich plasma scaffold in human root tips", "authors": [{"first": "E", "middle": ["G"], "last": "Trevino", "suffix": ""}, {"first": "A", "middle": ["N"], "last": "Patwardhan", "suffix": ""}, {"first": "M", "middle": ["A"], "last": "Henry", "suffix": ""}, {"first": "G", "middle": [], "last": "Perry", "suffix": ""}, {"first": "N", "middle": [], "last": "Dybdal-Hargreaves", "suffix": ""}, {"first": "K", "middle": ["M"], "last": "Hargreaves", "suffix": ""}], "year": 2011, "venue": "J. Endod", "volume": "37", "issn": "", "pages": "1109--1115", "other_ids": {"doi": ["10.1016/j.joen.2011.05.013"]}, "links": "23707645"}, "BIBREF59": {"ref_id": "b59", "title": "Comparison of the bactericidal efficacy of photodynamic therapy, 2.5% sodium hypochlorite, and 2% chlorhexidine against Enterococcus faecalis in root canals; an in vitro study", "authors": [{"first": "S", "middle": [], "last": "Vaziri", "suffix": ""}, {"first": "A", "middle": [], "last": "Kangarlou", "suffix": ""}, {"first": "R", "middle": [], "last": "Shahbazi", "suffix": ""}, {"first": "A", "middle": [], "last": "Nazari Nasab", "suffix": ""}, {"first": "M", "middle": [], "last": "Naseri", "suffix": ""}], "year": 2012, "venue": "Dent. Res. 
J", "volume": "9", "issn": "", "pages": "613--618", "other_ids": {"doi": ["10.4103/1735-3327.104882"]}, "links": "7003242"}, "BIBREF60": {"ref_id": "b60", "title": "A study of photobactericidal activity in the phenothiazinium series", "authors": [{"first": "M", "middle": [], "last": "Wainwright", "suffix": ""}, {"first": "D", "middle": ["A"], "last": "Phoenix", "suffix": ""}, {"first": "J", "middle": [], "last": "Marland", "suffix": ""}, {"first": "D", "middle": ["R"], "last": "Wareing", "suffix": ""}, {"first": "F", "middle": ["J"], "last": "Bolton", "suffix": ""}], "year": 1997, "venue": "FEMS Immunol. Med. Microbiol", "volume": "19", "issn": "", "pages": "75--80", "other_ids": {"doi": ["10.1111/j.1574-695X.1997.tb01074.x"]}, "links": "20699719"}, "BIBREF61": {"ref_id": "b61", "title": "Minimizing concentration of sodium hypochlorite in root canal irrigation by combination of ultrasonic irrigation with photodynamic treatment", "authors": [{"first": "Y", "middle": [], "last": "Wang", "suffix": ""}, {"first": "S", "middle": [], "last": "Xiao", "suffix": ""}, {"first": "D", "middle": [], "last": "Ma", "suffix": ""}, {"first": "X", "middle": [], "last": "Huang", "suffix": ""}, {"first": "Z", "middle": [], "last": "Cai", "suffix": ""}], "year": 2015, "venue": "Photochem. Photobiol", "volume": "91", "issn": "", "pages": "937--941", "other_ids": {"doi": ["10.1111/php.12459"]}, "links": "24100299"}}}, "latex_parse": null}
diff --git a/s2orc-doc2json/tests/s2orc/20190928/10042064.json b/s2orc-doc2json/tests/s2orc/20190928/10042064.json
new file mode 100644
index 0000000000000000000000000000000000000000..fcd616fa6ccedb3783a5fed94a8aa52bffdd1b55
--- /dev/null
+++ b/s2orc-doc2json/tests/s2orc/20190928/10042064.json
@@ -0,0 +1 @@
+{"paper_id": "10042064", "metadata": {"title": "Unionisation and Foreign Direct Investment: Challenging Conventional Wisdom?", "authors": [{"first": "Dermot", "middle": [], "last": "Leahy", "suffix": ""}, {"first": "Catia", "middle": [], "last": "Montagna", "suffix": ""}], "abstract": "This paper investigates the effects of different degrees of wage setting centralisation on the incentive of a MNE to locate in a host country, and on the host country's welfare. Decentralised and centralised wage bargaining are considered. The nature of product market competition between the MNE and domestic firms proves crucial to results which cast doubt on some of the conventional wisdom on FDI. In particular, we show that: (i) it is not always welfare improving to attract inward FDI, and (ii) the MNE may prefer centralised to decentralised wage setting regimes.", "year": "2000", "arxiv_id": null, "acl_id": null, "pmc_id": null, "pubmed_id": null, "doi": "10.1111/1468-0297.00522", "venue": "The Economic Journal", "journal": "The Economic Journal"}, "s2_pdf_hash": "1feed22bdbcab3f53deb97b294ee034a4fa1601a", "grobid_parse": {"abstract": [{"text": "Abstract:This paper investigates the effects of different degrees of wage setting centralisation on (1) the incentive of a MNE to locate in a host country, (2) the optimal level of investment it decides to commit to its foreign operation, and (3) the host country's welfare. Decentralised and centralised wage bargaining are considered. The nature of product market competition between the MNE and domestic firms affects results which cast doubt on some of the conventional wisdom on FDI. 
In particular, we show that: (i) it is not always welfare improving to attract inward FDI, and (ii) the MNE may prefer centralised to decentralised wage setting regimes.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Abstract"}], "body_text": [{"text": "The last few decades have witnessed a substantial growth of foreign direct investment (FDI). As a result, increasing attention has been devoted in policy debates to the welfare consequences of FDI for the host country and to the factors affecting multinationals' choice of location. A consensus seems to have emerged around some commonly accepted views which form a \"conventional wisdom\" on FDI. Three of these conventional views are: (1) Inward FDI is welfare improving for a host country, regardless of the type of labour market institutions; (2) Multinational enterprises (MNEs) prefer decentralised firm level wage bargaining processes to centralised ones; (3) Governments should subsidise inward FDI, in particular in the presence of unionised labour markets.The first of these conventional wisdoms concerns the welfare effects of FDI for the host country. Although different interest groups within an economy may hold conflicting views as to the desirability of inward FDI, it is probably appropriate to state that on balance governments see it as welfare improving 1 . Thus, increasingly, \"international competitiveness\" is not only meant to reflect the ability of a country to compete on international goods markets, but also its attractiveness to foreign MNEs.The second conventional wisdom relates to the determinants of multinationals' choice of location, amongst which labour market characteristics and institutions are considered major factors. According to the emerging consensus, MNEs prefer flexible nonunionised labour markets and, when unionisation is in place, decentralised firm level wage bargaining processes over centralised ones. 
Despite ample empirical evidence (UN Investment Report, 1997) suggesting that labour costs are just one of the many considerations behind MNEs' location decisions, this view rests on the assumption that they are crucial in determining countries' ability to compete for FDI: by increasing labour costs, unionisation is detrimental to a country's attractiveness to MNEs. This type of argument has often been used in relatively highly unionised industrialised countries to endorse legislation aimed at limiting unions' power and deregulating the bargaining 1 Typical benefits associated with inward FDI are increased employment, technological externalities, emergence of new and/or more dynamic sectors, procompetitive effects on industry structure. Amongst its costs is the excessive competitive pressure on home firms which may force them out of business rather than stimulate their efficiency. Furthermore, as Zhao (1995) and Lahiri and Ono (1997) point out, the possible emergence of cross-hauling in FDI, may induce an \"export of jobs\" thus outweighing the creation of new jobs by the inward FDI.", "cite_spans": [{"start": 1702, "end": 1715, "text": "Report, 1997)", "latex": null, "ref_id": null}, {"start": 2564, "end": 2575, "text": "Zhao (1995)", "latex": null, "ref_id": "BIBREF23"}, {"start": 2580, "end": 2601, "text": "Lahiri and Ono (1997)", "latex": null, "ref_id": "BIBREF18"}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "process.The third conventional wisdom is almost a corollary of the other two. The desirability of inward FDI suggests the need for subsidising it, in particular when unionised labour markets are likely to make a host country less attractive to foreign MNEs. 
It is in fact common practice for national governments to subsidise and to compete for inward FDI 2 .This paper attempts to evaluate the conventional wisdom outlined above in the context of unionised labour markets.The interaction between labour market unionisation and FDI has received surprisingly little attention in the theoretical literature where the two have mostly been studied separately. Imperfect competition and positive profits create incentives for strategic trade policy to attract and extract rents from MNEs, as for instance in Brander and Spencer (1987) , Bond and Samuelson (1989) and Janeba (1996) . The effects of trade on labour markets have been typically studied within the theory of distortions (e.g. Bhagwati and Srinivasan, 1983) or the political economy framework (Dinoupoulos, 1983) .More recently, Brecher and Van Long (1989) analyse the impact of protection on employment and welfare in an open economy with a central union. Brander and Spencer (1988) and Santoni (1996) analyse the effect of unionisation and trade policies in a NashCournot duopoly where a unionised home firm competes against a foreign firm. Huzinga (1993) and Naylor (1998) examine the effects of market integration within a unionised sector. Very little work has been done on the relationship between labour market unionisation and FDI. Notable exception is Zhao (1995 Zhao ( , 1998 where cross-hauling FDI is generated between two countries with imperfectly competitive product markets and unionised labour markets. 
Naylor and Santoni (1998) analyse the effects of union power and degree of substitutability between products on FDI.Within the unionisation-FDI literature, our paper is the first of which we are aware to: (1) examine the effects of different degrees of union centralisation on FDI and welfare, (2) study the role of different modes of product market interaction between MNEs and domestic firms, (3) analyse the optimal policy towards FDI when unions extract rents from wage it faces 3 . The focus of our analysis may be especially relevant to Europe where labour markets are generally highly unionised but where the degree of centralisation varies greatly across countries (Freeman and Katz, 1995) and where countries are characterised by different patterns of inwards FDI (Barrel and Pain, 1997) .We consider two alternative wage setting regimes. The first is decentralised, with trade unions bargaining individually with each firm. The second is a centralised regime in which one single wage is set for an industry or group of industries. In the latter case, the wage setting process represents a link between the MNE and the domestic firms, even when direct product market competition between them is absent.Our analysis shows that the results crucially depend on the nature of product market interaction between the MNE and host country's firms. We consider two situations. In the first there is no product market interaction, and the MNE will not compete with home firms either in the domestic or in any other market. This allows us to isolate the effects of wage determination on the host country's welfare and on the MNE's decisions. We subsequently allow for product market interaction, which could occur in either the domestic or in an export market. This is not an academic distinction but one which may be seen as reflecting the motives behind a MNE's decision to locate in a host country. 
When the latter is seen by the MNE as an export base, the absence of direct competition with domestic firms is a plausible assumption. This is more likely the smaller is the economic size of the host country relative to the MNE's export market. A typical example of this is Ireland: its high share of inward FDI, which could not be ascribed to the extent of its domestic market, has mainly occurred in sectors which did not have a significant presence of indigenous firms. If, however, the MNE is attracted by domestic sales, it is likely (particularly in relatively large industrialised countries) that it will face competition from the host country's firms .The model is developed in Section 2. Section 3 and 4 look at the alternative product market interaction cases, carry out welfare comparisons across different regimes and discuss the implications for policy towards FDI. Section 5 endogenises the level of capital investment of the MNE and analyses the government's optimal policy. Section 6 concludes the paper.", "cite_spans": [{"start": 800, "end": 829, "text": "in Brander and Spencer (1987)", "latex": null, "ref_id": "BIBREF5"}, {"start": 832, "end": 857, "text": "Bond and Samuelson (1989)", "latex": null, "ref_id": "BIBREF4"}, {"start": 862, "end": 875, "text": "Janeba (1996)", "latex": null, "ref_id": "BIBREF17"}, {"start": 984, "end": 1014, "text": "Bhagwati and Srinivasan, 1983)", "latex": null, "ref_id": "BIBREF2"}, {"start": 1050, "end": 1069, "text": "(Dinoupoulos, 1983)", "latex": null, "ref_id": null}, {"start": 1086, "end": 1113, "text": "Brecher and Van Long (1989)", "latex": null, "ref_id": "BIBREF7"}, {"start": 1214, "end": 1240, "text": "Brander and Spencer (1988)", "latex": null, "ref_id": "BIBREF6"}, {"start": 1245, "end": 1259, "text": "Santoni (1996)", "latex": null, "ref_id": "BIBREF21"}, {"start": 1408, "end": 1414, "text": "(1993)", "latex": null, "ref_id": null}, {"start": 1419, "end": 1432, "text": "Naylor (1998)", "latex": null, "ref_id": 
"BIBREF19"}, {"start": 1618, "end": 1628, "text": "Zhao (1995", "latex": null, "ref_id": "BIBREF23"}, {"start": 1629, "end": 1642, "text": "Zhao ( , 1998", "latex": null, "ref_id": "BIBREF24"}, {"start": 1777, "end": 1802, "text": "Naylor and Santoni (1998)", "latex": null, "ref_id": "BIBREF20"}, {"start": 2450, "end": 2474, "text": "(Freeman and Katz, 1995)", "latex": null, "ref_id": null}, {"start": 2550, "end": 2573, "text": "(Barrel and Pain, 1997)", "latex": null, "ref_id": "BIBREF0"}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "Consider a model with a multinational which may set up in a host country where N symmetric domestic firms are already located. All firms (including the MNE) are unionised.We model the interaction between agents as a three-stage game. In stage one, the MNE decides whether to locate in the host country. In stage two, unions choose the wage at which labour will be supplied to firms 4 . We consider two alternative wage setting regimes. The first is decentralised with the unions bargaining individually with each firm. In the second, the wage bargaining is concluded centrally and one wage is set for all firms in the country. In stage three, firms choose their output levels. Two types of product market competition are analysed: (1) the home firms and the MNE do not compete in the product market and the MNE exports all of its output; (2) domestic firms and the MNE all compete in the same market which may or may not be the home market itself.In the absence of direct product market interaction between the MNE and domestic firms, the latter may sell in the home market or be exporters. If the MNE locates it will hire labour at a wage * w , set in stage two by the unions.For a firm to prosper in a foreign environment, it will generally possess some firm specific advantages (Dunning, 1988) . 
We assume this firm-specific advantage to consist of 6 a higher labour productivity, resulting from technical superiority and/or higher capital intensity 6 . The MNE's unit labour input requirement is negatively related to its capital investment (K) which we initially take to be exogenous. The profit function of the multinational is then:F ) K , y ( L w ) D , y ( R * * * * * \u2212 \u2212 = \u03c0 (1)where * L is the labour employed in its host country's operation and F is a fixed cost which may include the cost of capital.The typical home firm's profit function is:i i i i i i L w ) D , q ( R \u2212 = \u03c0 (2) where i i q L = is labour employed.The firm-specific unions' utility functions for the the multinational and the typical home firm are respectively:* * * L ) w w ( U \u2212 =(3)andi i i L ) w w ( U \u2212 = (4)where w is the constant wage paid to non-unionised workers 7 .The national welfare function is given by: where by assumption\u2211 + + + = n i i i * ) U ( U CS W \u03c0(5)* y L \u2264 i q i L =1.In stage two the unions choose wages to maximise total labour rents. We assume a right-to-manage model where firms retain discretion over employment decisions .First assume that wages are set on a decentralised basis by unions which are monopolists at the firm level. In this case the particular wage in the multinational sector will be m * w w = . The first-order condition for the union in the multinational is:0 L w L ) w w ( w U * m * m m * = + \u2202 \u2202 \u2212 = \u2202 \u2202 (8)and that for a typical domestic firm's union is :0 L w L ) w w ( w U i i i i i i = + \u2202 \u2202 \u2212 = \u2202 \u2202 (9)Instead, with centralised bargaining, a single wage is set to maximise the sum of the individual unions' utilities: V=U * +NU. 
This implies the following first order condition:8 0 w ) w w ( dw dV c c c = + \u2202 \u2202 \u2212 = \u039b \u039b (10)where * c w w w = = is the centralised wage and* i L NL + \u2261 \u039b .In stage one of the game, the MNE will locate if its profit * \u03c0 is greater than a reservation profit \u03c0 which is assumed to be exogenous and to reflect the profit opportunities of locating elsewhere. The exogeneity of \u03c0 implies that the host country is small in the market for FDI.", "cite_spans": [{"start": 1281, "end": 1296, "text": "(Dunning, 1988)", "latex": null, "ref_id": "BIBREF13"}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "In the absence of product market interaction between the multinational and domestic firms, the MNE is assumed to export all its output.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "With decentralised bargaining, the equilibrium values of wages are implicitly givenby (8) and (9).To obtain some clearer results we will often adopt specific functional forms. In particular we shall impose the following assumptions: (A.1) Linear demands:i i i * * q s 1 a p , y s 1 a p \u2212 = \u2212 =(11)where i s represents a typical home firm's market size, and * s is similarly defined for the multinational. Symmetry among domestic firms implies that s= is \u2200 i (but s may differ from * s ). (A.2) The MNEs' labour input requirement is \u03b1(K)= * L /y, with \u2032 \u03b1 ( ) K <0.Given A.1 and A.2 in (8) and (9) we obtain:w w a m = + \uf8eb \uf8ed \uf8ec \uf8f6 \uf8f8 \uf8f7 1 2 \u03b1(12)9 and ( )w w a = + 1 2 (13)where w is the symmetric domestic firms' wage. 
Given that \u03b1 \u2264 1, it is obvious that m w \u2265 w: the higher relative labour productivity of the multinational enables its union to extract a higher wage than that extracted from domestic firms 8 .", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "In the centralised bargaining case, the equilibrium wage is implicitly given by (10).With specific functional forms A.1 and A.2, the centralised wage isw w a c = + + + \uf8ee \uf8f0 \uf8ef \uf8f9 \uf8fb \uf8fa 1 2 2 ( ) \u03b1 \u03b1 \u0393 \u0393 (14)where \u0393\u2261 Ns/ * s is an inverse measure of the MNE's market size relative to the total of the market sizes of domestic firms and reflects the relative labour market importance of the multinational sector from the point of view of the unions. Equation (14) ", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "It is commonly argued that FDI is welfare improving for the receiving country. To assess this view we shall evaluate the \"welfare value\" of the MNE to the host country, defined as the ceteris paribus difference between the levels of welfare achieved with and without the MNE.The conventional wisdom holds without market interaction, when the \"welfare value\" of FDI is positive regardless of the type of wage setting. It is easy to show that0 W W a j > \u2212for j=d,c where the superscript j refers to the wage setting regime and the subscript a refers to the absence of the MNE 10 . The welfare value, however, is lower under centralised than under decentralised wage setting, because rent extraction from the MNE is lower with wage centralisation and there are negative spill-over effects of the higher centralised wage on domestic firms' profitability. It follows that the actual welfare level is higher under decentralised than under centralised wage setting (i.e.c d W W > ).counterparts (e.g. Driffield, 1996) . 9 Unions face a trade-off between wage and employment. 
A high w c will damage employment not only in the MNE but also in the domestic sector. To protect employment, the unions may choose to reduce the wage. This would not occur in the decentralised case where the domestic sector is not a concern for the MNE's union. ", "cite_spans": [{"start": 994, "end": 1010, "text": "Driffield, 1996)", "latex": null, "ref_id": "BIBREF12"}, {"start": 1013, "end": 1014, "text": "9", "latex": null, "ref_id": null}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "The commonly held view that MNEs prefer firm-level wage setting processes to centralised ones is not supported by our analysis in the absence of direct product market competition with domestic firms, given that c m w w > implies higher MNE's profits under centralised wage setting. Hence, a divergence of interests emerges between the MNE and the host country's government, with centralised wage setting making entry more attractive to the MNE but leading to lower welfare levels.The policy implication of this analysis is that when the MNE and the domestic firms do not directly compete with each other in the product market, it is desirable for a host country's government to attract inward FDI. In this case, the government will be willing to pay a lump-sum location subsidy up to the \"welfare value\" of the MNE. Clearly, the subsidy, which would be higher under decentralised wage setting, would only be required if\u03c0 \u03c0 \u2264 * .", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "We will now allow for product market competition between the multinational and the domestic firms. Assuming that firms produce an identical commodity, the industry's ", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "In the decentralised bargaining case, unions play Nash so when a typical home union i chooses its wage i w it takes as given m w and the wages set by all other domestic unions ( i w \u2212 ). 
Symmetry implies that all domestic firms will face the same equilibrium wage (i.e. w w i = , \u2200i). Thus, using (8) and (9), we obtain the reaction functions: 12 ) K , w ( w ) K ,", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "With market interaction, when looking at the effects of inward FDI on the host country's welfare, it is important to distinguish between the cases in which firms do and do not sell in the domestic market.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "When firms are exporting only, the \"welfare value\" of inward FDI is negative under both decentralised and centralised wage settings, at least for ,c) . This is because the introduction of a new (foreign) firm into the export sector of the economy has a negative effect on the terms of trade 13 . At low N and high \u03b1 the welfare costs of a MNE entering the export sector are greater under 11 In the linear case i t is straightforward to show that an increase in K commits the MNE to a higher output and reduces its rivals' output. The effect on q operates directly, via product market competition, and indirectly via the wage setting process. The first effect is negative and will dominate the second which is positive. 12 As in the no market interaction case, dw c /dK will be negative for sufficiently large N and small \u03b1.. 13 Of course, if the MNE was already exporting into the foreign market, FDI would not introduce a new competitor and the welfare losses to the host country would be lower than here. The exact effect of this on welfare would depend on the cost competitiveness of the MNE's operation outside the home country. 
We ignore this possibility here because our concern is with cross comparisons between centralised and which case the much higher centralised wage has serious negative consequences for the profitability of domestic firms.N>1 (i.e., 0 W W j a j < \u2212 for j=dHaving considered the welfare values ( , it is straightforward to show that the MNE prefers to locate in the host country under centralised rather than under decentralised wage setting (except for \u03b1 very close to unity), because in this case the higher centralised wage hurts its host country's rivals more than itself. Hence, for sufficiently large \u03b1 and N a congruence of interest applies between the MNE and the government, with both preferring the centralised wage setting regime.", "cite_spans": [{"start": 146, "end": 149, "text": ",c)", "latex": null, "ref_id": null}, {"start": 388, "end": 390, "text": "11", "latex": null, "ref_id": null}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "When firms are selling on the home market, the procompetitive effect of the extra firm benefits consumer surplus 14 . Nevertheless, the possibility of welfare gains is normally restricted to the decentralised wage setting case, with the \"welfare value\" of inward FDI being typically negative under wage centralisation, particularly for low N and low \u03b1. This is because the negative effects of the MNE on home firms via the centralised wage outweighs any gains to consumers.Turning now from welfare values to welfare levels, we can show thatc d W W > .With domestic sales, the \"tax-like\" effect of the higher centralised wage always results in welfare being higher under decentralised than under centralised wage setting. 
Thus, in contrast to the case in which firms export only, a conflict of interest between the MNE and the host country government would normally arise.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "It is worth noting that the centralised wage setting process considered so far assumes that the centralised union sets a decentralised wage settings.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "single wage for all firms. However, the existence of labour productivity gaps between firms, may encourage unions to cooperatively set different wages for different firms. Given the assumed symmetry of domestic firms, this would imply the setting of a single wage for the domestic firms and a different wage for the MNE. With no market interaction, this case collapses into the decentralised wage setting regime. With market interaction, the centralised union will exploit the higher productivity of the foreign firm by setting a MNEspecific wage which exceeds that paid by the host country's firms. This higher wage implies that the welfare value of the MNE would typically be positive when firms compete in the domestic market. This is in sharp contrast to the standard non-discriminatory centralised wage setting regime discussed above. Inward FDI would still typically be welfare reducing when firms compete in an export market (at least for sufficiently large \u03b1). 
The losses, however, would be smaller than in the decentralised and standard centralised regimes.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "\"centralised firm specific wage setting\" case under market interaction, and (ii) when its productivity advantage is extremely small, the MNE will prefer a centralised wage setting regime (with and without market interaction), thus contradicting one of the conventional wisdoms mentioned in the introduction.With respect to the welfare value of inward FDI, we find that direct product market competition makes welfare losses from inward FDI more likely, because the MNE will capture market shares from the indigenous firms, thus reducing their profits. This will have a direct adverse effect on welfare, not compensated by the entrant's profits which are repatriated. The fall in welfare will often be larger under wage centralisation because of the additional externality on domestic firms generated by the common industry wide-wage.The overall policy implication of these results is that there are circumstances in which the government would prefer to prevent entry, unless it could raise a location tax at least equal to the negative \"welfare value\" of FDI.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "encourage inward FDI is its supposedly positive employment effects to the host country.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "Although we do not explicitly take account of employment effects in our welfare function in (5), it still may be interesting to consider the effect of the entry of the foreign MNE on the host country's employment 15 . It is straightforward to show that without market interaction, inward FDI increases employment regardless of the bargaining regime. 
Note however that, due to the negative externality that the MNE's entry has on home firms under centralised wage setting, ( d \u039b - d a \u039b )>( c \u039b - c a \u039b ),", "cite_spans": [{"start": 213, "end": 215, "text": "15", "latex": null, "ref_id": null}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "So far we have assumed an exogenous level of the MNE's capital investment in the host country. An important dimension of FDI, however, concerns the level of capital the MNE decides to commit to its foreign operation. As is clear from the analysis carried out so far, the amount of capital invested, by determining the cost advantage of the MNE over domestic firms, affects the unions optimal policy. In this section we endogenise the MNE's optimal investment decision and we analyse how it is affected by the different bargaining regimes. In order to isolate this issue, in the first instance we shall not model the location decision of the MNE 16 . This will subsequently be added to the model. Furthermore, we shall allow for the host country's government to be policy active towards the MNE by optimally choosing a capital subsidy. Our aim is to analyse how the government's optimal policy is affected by the nature of the wage setting process.In order to address the issues outlined above, we now modify the structure of the game discussed in the previous sections. We shall model the interaction between agents as a four stage game. In stage one, the government chooses the capital subsidy \u03c3. In stage two, the multinational chooses to invest a level capital K. In stage three, the unions set the wage under either centralised or decentralised bargaining and in stage four firms make their output decisions. As before, we shall consider two types of product market competition:15 Of course, the welfare function could be easily modified to give a positive weight to employment. 
16 This situation can be thought of as one in which either the MNE has already made its location decision market interaction and no-market interaction. Hence, the last two stages of the game correspond to the last two stages analysed in the previous sections. Since these have already been discussed, in what follows we shall confine our attention to stage one and two.", "cite_spans": [{"start": 1583, "end": 1585, "text": "16", "latex": null, "ref_id": null}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "In stage two of the game the MNE chooses the level of capital investment in the host country by maximising the following profit function with respect to KK ) K ( L w ) D , y ( R * * * * * \u03c3 \u03a8 \u03c0 + \u2212 \u2212 =(17)Compared to the profit function in (1), equation (17) contains an additional term reflecting the total subsidy received by the MNE (\u03c3K). Also, the fixed cost F has been replaced bythe total cost of capital ) K ( \u03a8 . The first order condition is: 0 dK dw L )} ( ) y , K ( L w { dK d * * * K * * = \u2212 \u2212 \u2212 \u2212 = \u03c8 \u03c3 \u03c0 (18) where \u03c8\u2261 d K dK \u03a8 ( )is the marginal cost of capital. The term in chain brackets represents the non-strategic effect of capital investment. If wages were exogenous, this term would be set equal to zero and the firm would choose the cost minimising level of K. Instead, the MNE must now take account of the strategic effect of its investment on the union wage, given by the last term of the right-hand-side of equation (18). The sign of this term will differ depending on the nature of the wage setting process.When wages are set at the firm level,dw dK dw dK m * = will be positive if \u2202 \u2202 \u2202 2 0 U w K m * > ,which we could take to be the \"normal\" case. In the linear case, it is easy to see that dw dK m >0. In that case, it therefore follows from this that MNE will strategically underinvest in K to keep the wages down. 
As far as we know this is the first time that this strategic incentive to under-engage in FDI has been isolated: it is not just high wages that deter FDI, or where its profit constraint is not binding. . However, as was previously discussed, even in the linear case this condition may not hold: a higher K will increase the potential for rent extraction, but the resulting higher wage will damage employment opportunities not only in the MNE but also in the less efficient domestic sector. Therefore circumstances may arise where unions choose to reduce the wage to increase employment. Hence, under centralised bargaining, even in the linear case, the MNE may over or underinvest in capital, depending on the effect of its investment on the wage.In the first stage of the game, the government chooses the capital subsidy to maximise domestic welfare. The total welfare function now includes the total subsidy bill and is given by:K ) U ( U CS W n i i i * \u03c3 \u03c0 \u2212 + + + = \u2211 (19)With decentralised unions, we obtain the following optimal subsidy:K ) K , w ( L ) w w ( d dK K m * m d NI \u2202 \u2202 \u03c3 \u03c3 \u2212 + \u2212 =(20)where the subscript \"NI\" and the superscript \"d\" indicate \"no market interaction\" and \"decentralised wage setting\" respectively. The two terms on the right-hand-side of (20) represent the two rent extraction motives for the government subsidy/tax policy. The first is a direct effect through taxing the multinational firm's capital and is negative. The second is indirect and stems from the unionisation of the labour market (this term clearly equalsK ) w , K ( U m * \u2202 \u2202). This term occurs because the government wishes to raise employment in the multinational sector when there are rents to be extracted there. If this is negative, so that an increase in K reduces employment, this effect works towards a tax. 
It is straightforward to show that in the linear case this must hold.In the centralised case, the optimal subsidy will be given by:dK dw 1 dw dq ) q ( p N M Nq K ) K , w ( L ) w w ( d dK K c c c * c c NI \uf8fe \uf8fd \uf8fc \uf8f3 \uf8f2 \uf8f1 + \u2032 \u2212 \u2212 + \u2212 = \u2202 \u2202 \u03c3 \u03c3 (21)Clearly the first two terms on the right-hand-side of (21) have the same interpretation as in equation (20) Equations (20) and (21) \u03c3 . With centralised wage setting, the negative spill-over effect on domestic firms' profitability of a higher subsidy-induced capital investment would be taken into account by the government.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "When competing directly with the host country's firms on the product market, the MNE's optimal choice of capital investment in stage two of the game will be determined by the following first order condition: The optimal subsidy with decentralised wage setting is given by0 dK dD R dK dw L )} ( L w { dKdK ) CS ( d R dK dy dK dq ) 1 N ( N dK dq ) w w ( N dK dw w ) K , w , w ( L K ) K , w , w ( L ) w w ( d / dK K D m * m * m d I + \uf8fa \uf8fb \uf8f9 \uf8ef \uf8f0 \uf8ee + \u2212 + \u2212 + \uf8f4 \uf8fe \uf8f4 \uf8fd \uf8fc \uf8f4 \uf8f3 \uf8f4 \uf8f2 \uf8f1 + \u2212 + \u2212 = \u2202 \u2202 \u2202 \u2202 \u03c3 \u03c3 (23)where the superscript \"I\" indicates \"product-market-interaction\". As in the no-marketinteraction case, the first term of (23) captures the incentive to tax K and reflects the direct cost to the government of subsidisation. This term, therefore, works against a subsidy. The second term, within chain brackets, reflects the impact of a capital increase on the MNE's level of employment. 
It consists of a direct effect at constant wages (present even without market interaction) and an indirect effect which works through the rivals' wages; the former may be positive or negative, depending on which of the productivity or output effects of a capital increase dominates; the indirect effect will typically be negative, thus working towards a tax. The third term captures the effects of a policy induced change in K on the domestic sector's employment and union rents. This can be written as: This overall effect is also likely to be negative. The fourth term in (23) is new and captures changes in domestic firms' profitability resulting from changes in K. This term is likely to be negative and therefore works towards a tax. If the multinational sells its product on the domestic market, the last term of equation (23) will exist and reflect the effect of the capital subsidy on consumers' surplus. This term is typically positive.\uf8fa \uf8fb \uf8f9 \uf8ef \uf8f0 \uf8ee \u2212 \u2202 \u2202 \u2212 + \u2202 \u2202 \u2212 = \u2212 dK dw Q dK dw w Q ) w w ( K Q ) w w ( dKThe optimal subsidy under wage centralisation will be given by:dK ) CS ( d R dK dy dK dq ) 1 N ( N dK dq ) w w ( N K ) K , w ( L ) w w ( d / dK K D c * c c I + \uf8fa \uf8fb \uf8f9 \uf8ef \uf8f0 \uf8ee + \u2212 + \u2212 + \u2212 + \u2212 = \u2202 \u2202 \u03c3 \u03c3 (24)The first two terms of (24) have the same interpretation as the first two terms on the righthand-side of (23). The third can now be written as: Clearly, when the constraint is binding, the capital subsidy plays a role analogous to the location subsidy examined in Section 5. The government will want to encourage the MNE's only if its \"welfare value\" is positive. 
As we have seen in the previous section, circumstances may exist in which this welfare value may not be positive.\uf8fa \uf8fb \uf8f9 \uf8ef \uf8f0 \uf8ee \u2212 \u2202 \u2202 \u2212 = \u2212 dK dw Q K Q ) w w ( dK dQ ) w w (In this section we have endogenized the MNE's capital choice and explored the optimal policy towards foreign capital investment. The strategic effect of the MNE's investment on the wage is crucial in determining both the MNE's investment decision and the government's optimal policy. In the decentralised wage-setting case, a higher level of K raises the wage faced by the MNE ( w * ). This is also the normal case under centralised wage setting without market interaction. However under centralised wage setting and market interaction this is often reversed.It follows from the above that when wages are set at the firm level, the MNE will strategically underinvest in capital in order to keep the wage down. This will be reversed in those centralised wage setting situations in which the wage is negatively related to the level of capital invested.With respect to the optimal policy, the no-market interaction subsidy under wage centralisation contains a term that is not present when wages are set at the firm level. This extra term, due to the negative distortionary effects of a higher centralised wage, works towards a tax when the wage increases in capital. Things are less clear in the market interaction case as the centralised wage may fall in the level of K. In general however, these results confirm that the host country's government may not find it optimal to encourage a higher level of FDI via capital subsidisation.Thus, in the paper we have looked at the two dimension of FDI: location and the amount of capital committed. 
We have found that in both cases the government will often wish to discourage FDI, in particular in the presence of negative externalities generated by the MNE towards domestic firms through either the goods or the unionised labour market.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "This paper focuses on the nature of product market interaction between a MNE and a host country's firms in examining the effects of different degrees of wage setting centralisation on (1) the incentive of a MNE to locate in the host country, (2) the optimal level of investment it decides to commit to its foreign operation, and (3) the host country's welfare.Our results suggest that there may be circumstances in which it will not be desirable for a government to encourage inward FDI. We find that negative welfare effects of FDI occur when there is direct product market competition between the MNE and the host country's firms. In this case the foreign MNE will effectively capture market shares from indigenous firms thus reducing their profits. This effect is shown to be particularly strong in the presence of centralised wage setting processes. Hence, we highlight a channel through which inward FDI can have adverse effects on the host country's welfare. Clearly, in real world situations the net change in welfare would depend on many other factors (e.g.technical spill-overs from foreign firms). However, we believe that our findings, which cast at least some doubts on the general validity of the commonly held view that FDI is always welfare improving, are intuitive and are consistent with some of the empirical evidence on FDI (see Caves, 1996 , for a discussion of the effects of FDI on market structure and indigenous firms profitability). 
Therefore, our analysis points to the need for empirical research to disentangle the relative importance of this and of the other factors which determine the welfare effects of FDI on the receiving country.When the capital investment of the MNE is endogenous, the wage setting regime affects the amount of capital invested and the potential for rent extraction. We show that the MNE will tend to underinvest relative to the cost-minimising level in order to limit the rent extraction of the union, if the negotiated wage is positively related to capital investment.We also show that, contrary to what frequently argued, in the absence of taxes/subsidies, the MNE will be less likely to locate in the host country under a decentralised than under a centralised wage setting regime, despite the fact that the latter will typically yield higher wages.Many natural extensions suggest themselves, such as technological spill-overs fromMNEs to domestic firms, experimenting with different degrees of bargaining power of the unions, and examining cases in which the MNE can negotiate special union arrangements", "cite_spans": [{"start": 1348, "end": 1359, "text": "Caves, 1996", "latex": null, "ref_id": "BIBREF9"}], "ref_spans": [], "eq_spans": [], "section": null}], "ref_entries": {"FIGREF0": {"text": "7 where consumer surplus (CS) is zero when firms only export and is positive when firms sell to home consumers. The stage-three first-order conditions for the choice of output of the multinational and the typical home firm are respectively given by", "latex": null, "type": "figure"}, "FIGREF1": {"text": ", the greater the relative market share of the MNE the higher is the centralised wage. Furthermore, it is easy to show that c w lies strictly between the two decentralised wages ( m w and i w ), as illustrated in Figure 1 below. 
Thus, the centralised wage is always lower than the decentralised wage paid by the multinational, because even in the absence of product market interaction, centralisation generates a labour market link between the MNE and the domestic firms. Taking this link into account, the unions choose to limit rent extraction from the MNE, in order to maintain employment in the less efficient domestic sector. This also explains the seemingly paradoxical impact of a higher capital investment of the MNE on the centralised wage. Although it raises the potential for rent extraction from 10 the MNE, a higher K does not always result in a higher centralised wage.", "latex": null, "type": "figure"}, "FIGREF2": {"text": "Figure 1: The centralised wage without product market interaction", "latex": null, "type": "figure"}, "FIGREF4": {"text": "inverse demand function is given by p=p(Q+y)", "latex": null, "type": "figure"}, "FIGREF5": {"text": "an equilibrium in wages. The analysis of the product market interaction case is unwieldy for general functional forms. In order to develop the intuition we focus on the linear demand function P=a-b(Q+y) (assumption A.3). A.2 and A.3 imply positively sloped wage reaction functions. It is straightforward to show in the linear case that,", "latex": null, "type": "figure"}, "FIGREF6": {"text": "Figure 2 plots the locus of the multinational's marginal cost as a function of the marginal cost of domestic firms (MM) and the locus of the home firms' marginal cost as a function of the MNE's (HH).", "latex": null, "type": "figure"}, "FIGREF7": {"text": "Figure 2: Effect of an increase in K on the marginal costs of the MNE and of a typical home country's firm.", "latex": null, "type": "figure"}, "FIGREF8": {"text": "), we turn to the discussion of welfare levels in order to determine the government's preferred bargaining regime, in the presence of FDI. 
We find that c d W W < (for sufficiently large \u03b1 and N).and the higher centralised wage would act like an export tax: it yields rents to domestic residents (the unions) and it reduces output through an increase in the export price, moving the economy towards the monopoly output level. Despite the fact that", "latex": null, "type": "figure"}, "FIGREF9": {"text": "(j=d,c) are total employment with and without the MNE respectively. With market interaction, the negative externality on home firms of inward FDI will reduce the gains in terms of employment: ( d \u039b -a \u039b )<0 for sufficiently small \u03b1 and large N, and ( c \u039b -a \u039b )<0 unless \u03b1 is very close to unity.", "latex": null, "type": "figure"}, "FIGREF11": {"text": "above, though the second of these terms may not have the same sign. The last term captures two additional interrelated effects that arise in the centralised bargaining case. The first is an effect on consumer surplus which arises when M\u2264N of the domestic firms sell at home. The second gives the home producer surplus net of union rent 17 and captures the link between the MNE and the domestic firms generated by the centralised wage. If dw dK c >0 this final term is negative and works against a subsidy. This is because the subsidy, by increasing K and thus c w , reduces total surplus (consumer surplus, plus home producers surplus, plus union rents) in the domestic sector.", "latex": null, "type": "figure"}, "FIGREF12": {"text": "on rivals' output of an increase in capital investment through both the product output. If this term is positive 18 , the multinational over-invests in capital in order tothis is positive, as was shown to happen in the linear case, the multinational will under-invest in capital in order to keep the union wage down. When the wage is centralised,the linear case. 
Thus, with centralised wage bargaining the MNE may under or over invest.", "latex": null, "type": "figure"}, "FIGREF13": {"text": "first two terms on the right hand side represent the direct and indirect effects (working through m w ). The last 21 term represents the impact of K on w and thus on home producer surplus net of union rents.", "latex": null, "type": "figure"}, "FIGREF14": {"text": "have similar interpretation as those in (23). In particular, the last one captures the effect of changes in the centralised wage of domestic firms' producer surplus net of changes in the domestic sector's union rents. An increase in c w leads to a transfer of rents from the domestic firms to the unions. However, the fall in firms' rents exceeds the gain in unions' rents. The final two terms in (24) have the same interpretation as those in (23). From equations (23) and (24) we are less likely to getequations (20) and (21). One of the reasons for this is that we are more likely to get 0 dK dw c < which implies that a subsidy has positive benefits on both producers' and consumers' surplus. In general, a positive capital subsidy is less likely with market interaction than without. The principal reason for this is the negative effect of a higher K on the MNE's domestic rivals' profitability. If the MNE's location decision is subject to a binding profit constraint ( )j=d,c; k=NI,I). The extra term (\u03bbK)>0 will work towards a subsidy.", "latex": null, "type": "figure"}, "TABREF0": {"text": "For simplicity, in this case we shall also rule out competition between domestic firms. Thus, with decentralised wage setting there is no link between the home firms and the MNE. However, when bargaining is centralised, the wage setting process represents such a link. 
With direct product market", "latex": null, "type": "table"}}, "bib_entries": {"BIBREF0": {"ref_id": "b0", "title": "The growth of foreign direct investment in Europe", "authors": [{"first": "R", "middle": [], "last": "Barrel", "suffix": ""}, {"first": "N", "middle": [], "last": "Pain", "suffix": ""}], "year": 1997, "venue": "National Institute Economic Review", "volume": "160", "issn": "", "pages": "63--75", "other_ids": {}, "links": null}, "BIBREF1": {"ref_id": "b1", "title": "FDI and trade: the Irish host-country experience", "authors": [{"first": "F", "middle": [], "last": "Barry", "suffix": ""}, {"first": "J", "middle": [], "last": "Bradley", "suffix": ""}], "year": 1997, "venue": "Economic Journal", "volume": "107", "issn": "", "pages": "1798--811", "other_ids": {}, "links": "154766783"}, "BIBREF2": {"ref_id": "b2", "title": "Lectures on Trade Theory", "authors": [{"first": "J", "middle": ["N"], "last": "Bhagwati", "suffix": ""}, {"first": "T", "middle": ["N"], "last": "Srinivasan", "suffix": ""}], "year": 1983, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}, "links": null}, "BIBREF3": {"ref_id": "b3", "title": "Investment Incentives as Tariff Substitutes: A Comprehensive Measure of Protection", "authors": [{"first": "E", "middle": [], "last": "Bond", "suffix": ""}, {"first": "S", "middle": [], "last": "Guisinger", "suffix": ""}], "year": 1985, "venue": "Review of Economics and Statistics", "volume": "67", "issn": "", "pages": "91--97", "other_ids": {}, "links": "153996816"}, "BIBREF4": {"ref_id": "b4", "title": "Bargaining with Commitment, Choice of Techniques, and Foreign Direct Investment", "authors": [{"first": "E", "middle": [], "last": "Bond", "suffix": ""}, {"first": "L", "middle": [], "last": "Samuelson", "suffix": ""}], "year": 1989, "venue": "Journal of International Economics", "volume": "26", "issn": "", "pages": "77--97", "other_ids": {}, "links": null}, "BIBREF5": {"ref_id": "b5", "title": "Foreign Direct Investment with 
Unemployment and Endogenous Taxes and Tariffs", "authors": [{"first": "B", "middle": ["J"], "last": "Brander", "suffix": ""}, {"first": "J", "middle": [], "last": "Spencer", "suffix": ""}], "year": 1987, "venue": "Journal of International Economics", "volume": "22", "issn": "", "pages": "257--279", "other_ids": {}, "links": "153824590"}, "BIBREF6": {"ref_id": "b6", "title": "Unionised Oligopoly and International Trade Policy", "authors": [{"first": "B", "middle": ["J"], "last": "Brander", "suffix": ""}, {"first": "J", "middle": [], "last": "Spencer", "suffix": ""}], "year": 1988, "venue": "Journal of International Economics", "volume": "24", "issn": "", "pages": "217--234", "other_ids": {}, "links": null}, "BIBREF7": {"ref_id": "b7", "title": "Trade Unions in an Open Economy: A General Equilibrium Analysis", "authors": [{"first": "R", "middle": ["A"], "last": "Brecher", "suffix": ""}, {"first": "Van", "middle": [], "last": "Long", "suffix": ""}, {"first": "N", "middle": [], "last": "", "suffix": ""}], "year": 1989, "venue": "Economic Record", "volume": "65", "issn": "", "pages": "234--239", "other_ids": {}, "links": "154186204"}, "BIBREF8": {"ref_id": "b8", "title": "Strategic Direct Investment under Unionised Oligopoly", "authors": [{"first": "Vannini", "middle": [], "last": "Bughin", "suffix": ""}], "year": 1995, "venue": "International Journal of Industrial Organisation", "volume": "13", "issn": "", "pages": "127--145", "other_ids": {}, "links": null}, "BIBREF9": {"ref_id": "b9", "title": "Multinational Enterprise and Economic Analysis", "authors": [{"first": "R", "middle": ["E"], "last": "Caves", "suffix": ""}], "year": 1996, "venue": "CUP", "volume": "", "issn": "", "pages": "", "other_ids": {}, "links": null}, "BIBREF10": {"ref_id": "b10", "title": "Characterising Relative Performance: The Productivity Advantage of Foreign Owned Firms in the U.K", "authors": [{"first": "S", "middle": ["W"], "last": "Davies", "suffix": ""}, {"first": "B", "middle": ["R"], 
"last": "Lyons", "suffix": ""}], "year": 1992, "venue": "Oxford Economic Papers", "volume": "43", "issn": "", "pages": "584--595", "other_ids": {}, "links": "166228891"}, "BIBREF11": {"ref_id": "b11", "title": "Import Competition, International Factor Mobility and Lobbying Responses: The Shumpeterian Industry Case", "authors": [{"first": "E", "middle": [], "last": "Dinopoulos", "suffix": ""}], "year": 1983, "venue": "Journal of International Economics", "volume": "14", "issn": "", "pages": "395--410", "other_ids": {}, "links": "153441415"}, "BIBREF12": {"ref_id": "b12", "title": "Global Competition and the Labour Market", "authors": [{"first": "N", "middle": ["L"], "last": "Driffield", "suffix": ""}], "year": 1996, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}, "links": "153299035"}, "BIBREF13": {"ref_id": "b13", "title": "The Eclectic Paradigm of International Production: A Restatement of some Possible Extentions", "authors": [{"first": "J", "middle": ["H"], "last": "Dunning", "suffix": ""}], "year": 1988, "venue": "Journal of International Business Studies", "volume": "19", "issn": "", "pages": "1--29", "other_ids": {}, "links": null}, "BIBREF14": {"ref_id": "b14", "title": "Differences and Changes in Wage Structures", "authors": [{"first": "R", "middle": ["B"], "last": "Freeman", "suffix": ""}, {"first": "L", "middle": ["F"], "last": "Katz", "suffix": ""}], "year": null, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}, "links": "152405667"}, "BIBREF15": {"ref_id": "b15", "title": "Competition for Foreign Direct Investment", "authors": [{"first": "P", "middle": [], "last": "Haaparanta", "suffix": ""}], "year": 1996, "venue": "Journal of Public Economics", "volume": "63", "issn": "", "pages": "141--153", "other_ids": {}, "links": "154459768"}, "BIBREF16": {"ref_id": "b16", "title": "International Market Integration and Union Bargaining", "authors": [{"first": "H", "middle": [], "last": "Huizinga", "suffix": ""}], "year": 
1993, "venue": "Scandinavian Journal of Economics", "volume": "95", "issn": "", "pages": "249--255", "other_ids": {}, "links": null}, "BIBREF17": {"ref_id": "b17", "title": "Foreign Direct Investment under Oligopoly: Profit Shifting or Profit Capturing?", "authors": [{"first": "E", "middle": [], "last": "Janeba", "suffix": ""}], "year": 1996, "venue": "Journal of Public Economics", "volume": "60", "issn": "", "pages": "423--445", "other_ids": {}, "links": "153901127"}, "BIBREF18": {"ref_id": "b18", "title": "Tax Policy on Foreign Direct Investment in the Presence of Cross-Hauling", "authors": [{"first": "S", "middle": [], "last": "Lahiri", "suffix": ""}, {"first": "Y", "middle": [], "last": "Ono", "suffix": ""}], "year": 1997, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}, "links": null}, "BIBREF19": {"ref_id": "b19", "title": "International Trade and Economic Integration when Labour Markets are Generally Unionised", "authors": [{"first": "R", "middle": [], "last": "Naylor", "suffix": ""}], "year": 1998, "venue": "European Economic Review", "volume": "42", "issn": "", "pages": "1251--1267", "other_ids": {}, "links": "58890733"}, "BIBREF20": {"ref_id": "b20", "title": "Wage Bargaining and Foreign Direct Investment", "authors": [{"first": "R", "middle": [], "last": "Naylor", "suffix": ""}, {"first": "M", "middle": [], "last": "Santoni", "suffix": ""}], "year": 1998, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}, "links": null}, "BIBREF21": {"ref_id": "b21", "title": "Union-Oligopoly Sequential Bargaining: Trade and Industrial Policies", "authors": [{"first": "M", "middle": [], "last": "Santoni", "suffix": ""}], "year": 1996, "venue": "Oxford Economic Papers", "volume": "48", "issn": "", "pages": "640--663", "other_ids": {}, "links": "168105577"}, "BIBREF22": {"ref_id": "b22", "title": "World Investment Report: Transnational Corporation, Market Structure and Competition Policy", "authors": [], "year": 1997, "venue": "", 
"volume": "", "issn": "", "pages": "", "other_ids": {}, "links": null}, "BIBREF23": {"ref_id": "b23", "title": "Cross-hauling Direct Foreign Investment and Unionised Oligopoly", "authors": [{"first": "L", "middle": [], "last": "Zhao", "suffix": ""}], "year": 1995, "venue": "European Economic Review", "volume": "39", "issn": "", "pages": "1237--1253", "other_ids": {}, "links": null}, "BIBREF24": {"ref_id": "b24", "title": "The Impact of Foreign Direct Investment on Wages and Employment", "authors": [{"first": "L", "middle": [], "last": "Zhao", "suffix": ""}], "year": 1998, "venue": "", "volume": "50", "issn": "", "pages": "284--301", "other_ids": {}, "links": null}}}, "latex_parse": null}
diff --git a/s2orc-doc2json/tests/s2orc/20190928/10045593.json b/s2orc-doc2json/tests/s2orc/20190928/10045593.json
new file mode 100644
index 0000000000000000000000000000000000000000..abcb0147e44272f0c03c77f6de462b3b8c65dec5
--- /dev/null
+++ b/s2orc-doc2json/tests/s2orc/20190928/10045593.json
@@ -0,0 +1 @@
+{"paper_id": "10045593", "metadata": {"title": "Postmortem muscle protein degradation in humans as a tool for PMI delimitation", "authors": [{"first": "Stefan", "middle": [], "last": "Pittner", "suffix": ""}, {"first": "Bianca", "middle": [], "last": "Ehrenfellner", "suffix": ""}, {"first": "Fabio", "middle": ["C."], "last": "Monticelli", "suffix": ""}, {"first": "Angela", "middle": [], "last": "Zissler", "suffix": ""}, {"first": "Alexandra", "middle": ["M."], "last": "S\u00e4nger", "suffix": ""}, {"first": "Walter", "middle": [], "last": "Stoiber", "suffix": ""}, {"first": "Peter", "middle": [], "last": "Steinbacher", "suffix": ""}], "abstract": "Forensic estimation of time since death relies on diverse approaches, including measurement and comparison of environmental and body core temperature and analysis of insect colonization on a dead body. However, most of the applied methods have practical limitations or provide insufficient results under certain circumstances. Thus, new methods that can easily be implemented into forensic routine work are required to deliver more and discrete information about the postmortem interval (PMI). Following a previous work on skeletal muscle degradation in the porcine model, we analyzed human postmortem skeletal muscle samples of 40 forensic cases by Western blotting and casein zymography. Our results demonstrate predictable protein degradation processes in human muscle that are distinctly associated with temperature and the PMI. We provide information on promising degradation markers for certain periods of time postmortem, which can be useful tools for time since death delimitation. 
In addition, we discuss external influencing factors such as age, body mass index, sex, and cause of death that need to be considered in future routine application of the method in humans.", "year": "2016", "arxiv_id": null, "acl_id": null, "pmc_id": "PMC5055573", "pubmed_id": "26951243", "doi": "10.1007/s00414-016-1349-9", "venue": "International journal of legal medicine", "journal": "International journal of legal medicine"}, "s2_pdf_hash": "31fd7ddfda2e62c7870b0645590c7a1359c4e162", "grobid_parse": {"abstract": [{"text": "Abstract Forensic estimation of time since death relies on diverse approaches, including measurement and comparison of environmental and body core temperature and analysis of insect colonization on a dead body. However, most of the applied methods have practical limitations or provide insufficient results under certain circumstances. Thus, new methods that can easily be implemented into forensic routine work are required to deliver more and discrete information about the postmortem interval (PMI). Following a previous work on skeletal muscle degradation in the porcine model, we analyzed human postmortem skeletal muscle samples of 40 forensic cases by Western blotting and casein zymography. Our results demonstrate predictable protein degradation processes in human muscle that are distinctly associated with temperature and the PMI. We provide information on promising degradation markers for certain periods of time postmortem, which can be useful tools for time since death delimitation. 
In addition, we discuss external influencing factors such as age, body mass index, sex, and cause of death that need to be considered in future routine application of the method in humans.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Abstract"}], "body_text": [{"text": "One of the most important questions at a crime scene and in forensic laboratories is BWhen did the person die?^A most precise estimate of the time since death is one of the central aspects of forensic science. This is necessary to gain crucial information about the circumstances of death and is, in criminal cases, essential for the confirmation or invalidation of alibis and ultimately for the solution of a crime. Since decades, forensic scientists have investigated changes in postmortem body composition to characterize different phases of decomposition within the postmortem interval (PMI) [1] .The earliest alterations after death (i.e., within the first 24 h postmortem, hpm) include the development and regression of in rigor mortis (stiffening and relaxation of skeletal muscles), the progression of livor mortis (the settling of blood products in lower parts of the body), and algor mortis (the adjustment of the body to environmental temperature) [2] . Further approaches to characterize changes within the early postmortem phase include pharmacological excitability of the eye and electrical stimulation of certain muscles [3] [4] [5] . Although these methods are applied to delimitate the PMI in everyday forensic work, there are still great inaccuracies and limitations in many cases.After several days or even weeks postmortem, it is especially forensic entomology (the analyses of life stages of cadaver-feeding insects) that has shown to be able deliver valuable information about the time since death [6] . However, this method is highly dependent upon the local fauna, the (seasonal) weather conditions, and exposure of the body. 
Other, more elaborate methods for investigation of the later postmortem phases are colorimetric measurement of tooth pulp [7] and analysis of abdominal and superficial microbial mats [8, 9] .A most prominent lack of reliable methods exists especially for PMI determination in the intermediate phase (i.e., between 24 h and approximately 7 days postmortem). Various studies using animal models have aimed to characterize certain changes in this phase. Especially, degeneration processes in soft tissue (such as brain, heart, lung, liver, kidney, and muscle) have been found to occur within this period of time [10] . However, despite that a large number of studies were able to identify such changes, only very few appear to be practicable or accurate enough to become established for everyday forensic work [1] .Degradation processes of soft tissues have already been described on behalf of other aspects, such as the development of tenderness in stored meat [11] [12] [13] . Especially, the analysis of the disintegration of structural muscle proteins using biochemical techniques such as Western blotting or zymography revealed distinct changes in protein band appearance within a certain period of time postmortem [14] . Interestingly, such alterations were reported among proteins in different species of domestic animals in similar fashion (all above), allowing to hypothesize that these patterns also occur in human muscle tissue.There are several reasons why skeletal muscle is a promising candidate tissue for use in PMI delimitation. Muscle is the most abundant tissue of the human body and is easily accessible while being relatively well protected by the skin. A large number of its proteins are very well characterized, and numerous antibodies for the identification of such proteins are commercially available. Compared to internal organs and nervous tissue, skeletal muscle has a greater delay in postmortem changes but still decomposes faster than cartilage and bone [10] . 
Together, this makes muscle tissue well suited for routine analysis in the forensic lab and a promising candidate for the analysis of changes postmortem.In a previous study, we have described degradation processes of several skeletal muscle proteins and enzymes in the pig model [14] . Some of these proteins degraded in a regular and predictably time-dependent fashion, thus making them promising candidates for PMI delimitation also in humans. This was tested in the present work, using samples of human skeletal muscle from 40 forensic cases in Salzburg/Austria. Western blot analysis was employed to examine the degradation behavior of cardiac troponin T, desmin, and tropomyosin. Casein zymography served to determine the inactive-active transitions of calpain 1 and calpain 2. To make additional allowance for temperature, the most important physical factor influencing degradation processes, the data are expressed in accumulated degree-days (ADDs; mean ambient temperature \u00d7 PMI). As there is still no research available on the relationship between ADD and muscle protein degradation, this study aims to investigate specifically this aspect, together with other possible influencing factors, such as age, sex, body weight, and cause of death, to provide a basis for future routine application of the method in humans.", "cite_spans": [{"start": 596, "end": 599, "text": "[1]", "latex": null, "ref_id": "BIBREF0"}, {"start": 959, "end": 962, "text": "[2]", "latex": null, "ref_id": "BIBREF1"}, {"start": 1136, "end": 1139, "text": "[3]", "latex": null, "ref_id": "BIBREF2"}, {"start": 1140, "end": 1143, "text": "[4]", "latex": null, "ref_id": "BIBREF3"}, {"start": 1144, "end": 1147, "text": "[5]", "latex": null, "ref_id": "BIBREF4"}, {"start": 1520, "end": 1523, "text": "[6]", "latex": null, "ref_id": "BIBREF5"}, {"start": 1772, "end": 1775, "text": "[7]", "latex": null, "ref_id": "BIBREF6"}, {"start": 1833, "end": 1836, "text": "[8,", "latex": null, "ref_id": "BIBREF7"}, 
{"start": 1837, "end": 1839, "text": "9]", "latex": null, "ref_id": "BIBREF8"}, {"start": 2259, "end": 2263, "text": "[10]", "latex": null, "ref_id": "BIBREF9"}, {"start": 2457, "end": 2460, "text": "[1]", "latex": null, "ref_id": "BIBREF0"}, {"start": 2609, "end": 2613, "text": "[11]", "latex": null, "ref_id": "BIBREF10"}, {"start": 2614, "end": 2618, "text": "[12]", "latex": null, "ref_id": "BIBREF11"}, {"start": 2619, "end": 2623, "text": "[13]", "latex": null, "ref_id": "BIBREF12"}, {"start": 2867, "end": 2871, "text": "[14]", "latex": null, "ref_id": "BIBREF13"}, {"start": 3632, "end": 3636, "text": "[10]", "latex": null, "ref_id": "BIBREF9"}, {"start": 3918, "end": 3922, "text": "[14]", "latex": null, "ref_id": "BIBREF13"}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "A total of 40 forensic cases were analyzed in this work (23 female and 17 male). To attain minimal inaccuracy, the work was confined to cases with well-known times since death and comprehensible ambient temperature profiles. The subjects were of ages between 2 and 90 years (mean 69.1 \u00b1 19.5 years), with a body mass index between 10.7 and 50.4 (mean 26.2 \u00b1 6.1). Causes of death were diverse and included 27 cases of internal malfunction and organ failure, 10 cases of external trauma, 1 case of intoxication, and 2 unknown causes of death. From a legal point of view, 26 subjects died naturally, 8 in accidents, 2 were suicides, 2 homicides, and 2 were unclear cases. PMI, the time between death and sampling, ranged between 4.0 and 92.8 h postmortem (mean 37.7 \u00b1 27.8 hpm). Twenty eight of the cases were cooled to 4\u00b0C according to Central European standards, with a mean cooling time of 39.1 \u00b1 26.3 h for these cases (Table 1) .", "cite_spans": [], "ref_spans": [{"start": 921, "end": 930, "text": "(Table 1)", "latex": null, "ref_id": "TABREF0"}], "eq_spans": [], "section": null}, {"text": "ADDs are defined as the product of time and ambient temperature. 
ADDs have already been successfully employed in animal models and humans to predict postmortem change in corpse morphology [15] , DNA degradation [16] , and insect colonization [17] . Analysis of postmortem tissue degradation in forensic cases frequently suffers from the limitation that there is no control of environmental conditions. Given this precondition, the use of ADD offers a valuable approach to ). Values of ambient temperature (T a ) were derived from police records or from the nearest meteorological monitoring station. Cooling times were documented at the Forensic Medicine Department of the University of Salzburg. Information on postmortem intervals (PMIs) and ranges of variation were taken from police or medical records. PMI range is used as a measure of PMI value accurateness. ADD values were calculated as follows: PMI \u00d7 T a account for both of the most important influencing factors of postmortem degradation processes, time and temperature, in a standardized manner. For this purpose, the PMI of each case, as measured in days, was multiplied with the respective environmental temperature (in\u00b0C). Known phases of different temperature within the PMI (e.g., times of cooling to 4\u00b0C) were separately calculated and the results added to obtain the final ADD value. For dead bodies discovered outside, information on environmental temperature was taken from police records if available or was estimated using the data of the nearest meteorological station. An ambient temperature of 0\u00b0C was considered the lower threshold to prevent negative values. 
ADD calculation for all 40 cases resulted in a mean of 10.4 \u00b1 7.7 with a minimum of 2.6\u00b0d and a maximum of 36.0\u00b0d.", "cite_spans": [{"start": 188, "end": 192, "text": "[15]", "latex": null, "ref_id": "BIBREF14"}, {"start": 211, "end": 215, "text": "[16]", "latex": null, "ref_id": "BIBREF15"}, {"start": 242, "end": 246, "text": "[17]", "latex": null, "ref_id": "BIBREF16"}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "In all cases, a piece of muscle tissue with a size of approximately 2 \u00d7 2 \u00d7 2 cm was removed from the lateral thigh muscle (Musculus vastus lateralis) in a depth of 2 cm. Excised muscle samples were sectioned to smaller pieces of approximately 100 mg, snap frozen, and stored in liquid nitrogen. Frozen tissue was homogenized by cryogenic grinding and sonication. For casein zymography, an extraction buffer containing 50 mM Tris, 5 mM EDTA, and 10 mM 3-mercaptopropane-1, 2-diol was used. For SDS-PAGE and Western blotting, RIPA buffer with a protease inhibitor cocktail was used as extraction buffer. The homogenate was centrifuged at 1000\u00d7g for 6 min, and the supernatant was removed and stored until further use. Protein concentration was measured using Pierce BCA Assay Kit.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "SDS-PAGE was performed according to Laemmli [18] . Tenpercent polyacrylamide resolving gels (acrylamide:N,N\u2032-bismethylene acrylamide = 37.5:1, 0.1 % SDS, 0.05 % TEMED, 0.05 % APS, and 375 mM Tris-HCl, pH 8.8) were used for the detection of tropomyosin, desmin, and cTnT. Five-percent polyacrylamide gels (acrylamide:N,N\u2032-bis-methylene acrylamide = 37.5:1, 0.1 % SDS, 0.125 % TEMED, 0.075 % APS, and 125 mM Tris-HCl, pH 6.8) were used as stacking gels. Thirty microgram of total protein was diluted in 15 \u03bcl Aqua bidest and 5 \u03bcl sample buffer (40 % glycerine, 10 % mercaptoethanol, 0.04 % bromphenol blue, and 250 mM TrisHCl, pH 6.75). 
Samples were then denatured at 90\u00b0C for 5 min and inserted into the gel wells. Electrophoresis was run at a constant voltage of 150 V until the dye front reached the bottom of the gel (approximately 1.5 h). The running buffer contained 25 mM Tris, 195 mM glycine, 100 mM EDTA, and 0.1 % SDS. Proteins were transferred onto polyvinylidene fluoride (PVDF) membranes in transfer buffer (192 mM glycine, 20 % methanol, and 25 mM Tris) at a constant current of 250 mA for 75 min. All blots were blocked for 1 h in TTBS (150 mM NaCl, 0.05 % Tween, and 25 mM Tris, pH 7.5) including 1 % dried milk as a blocking agent. The following primary antisera were used: mouse monoclonal anti-tropomyosin, mouse monoclonal anti-desmin, and mouse monoclonal anti-cardiac troponin T. HRP-conjugated polyclonal goat antimouse was applied as secondary antibody. All antibodies were diluted in a 1 % dried milk solution in TTBS and applied for 1 h. After each antibody application, the membranes were extensively washed and rinsed in TTBS. Antibody binding was visualized by application of chemiluminescence substrate and photographed using a digital gel documentation system.", "cite_spans": [{"start": 44, "end": 48, "text": "[18]", "latex": null, "ref_id": "BIBREF17"}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "Casein zymography was performed in accordance to the method of Raser [19] with slight modifications. Of polyacrylamide gels (acrylamide:N,N\u2032-bis-methylene acrylamide = 37.5:1, 0.1 % TEMED, 0.05 % APS, and 375 mM Tris-HCl, pH 8.8), 12.5 % were copolymerized with 0.1 % casein and used as resolving gels. Five-percent polyacrylamide gels (acrylamide:N,N\u2032-bis-methylene acrylamide = 37.5:1, 0.125 % TEMED, 0.075 % APS, and 330 mM Tris-HCl, pH 6.8) were used as stacking gels. Two-part sample supernatants were mixed with one-part sample buffer (25 % glycerol, 0.1 % bromphenol blue, and 62.5 mM Tris-HCl, pH 6.8) and onepart A. bidest. 
The running buffer contained 25 mM Tris, 192 mM glycine, and 1 mM EDTA. After a 15-min prerun at 75 V and 4\u00b0C, the samples were inserted into the gel wells and electrophoresis was run for approximately 7 h at 150 V and 4\u00b0C.The gels were briefly rinsed with A. bidest, transferred to incubation buffer (0.1 % 3-mercapto-1,2-propanediol and 50 mM Tris-HCl, pH 7.5) with 4 mM CaCl 2 , and incubated overnight (12-18 h at room temperature). The gels were stained in Coomassie dye (0.1 % Coomassie Brilliant Blue R250, 50 % methanol, and 10 % acidic acid) for 1 h, destained in the dye solvent, and photographed using a digital gel documentation system.", "cite_spans": [{"start": 69, "end": 73, "text": "[19]", "latex": null, "ref_id": "BIBREF18"}], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "Protein band intensities were measured using the gel analysis tools of ImageJ software (v.1.48 NIH, National Institutes of Health, USA); histograms of the tonal distribution of the images were plotted and areas underneath the graphs were measured according to the program's standard protocol. All signals <1 % of the respective dominant band were considered background (i.e., not regarded a protein band). This provided the basis for obtaining binary information on the presence (1) or absence (0) of proteins, the target variable of the study. All data pairs consisting of information on the presence of an individual protein (1/0) and the respective ADD value were then analyzed for bivariate correlations. Spearman's \u03c1 and p values were determined to test whether protein presence is random within the ADD range investigated (Table 2) . Significant correlations were further described using logistic regressions. This allows relating the presence probability of a protein band to ADD, thus enabling to determine the ADD at which a specific degradation product can be expected to be present in a significant amount of cases (e.g., 95 %). 
All statistical analyses were performed using SPSS 22 (IBM, USA).", "cite_spans": [], "ref_spans": [{"start": 828, "end": 837, "text": "(Table 2)", "latex": null, "ref_id": "TABREF1"}], "eq_spans": [], "section": null}, {"text": "All samples analyzed exhibited a characteristic tropomyosin double band at approximately 36 to 38 kDa, depicting two isoforms of tropomyosin (Fig. 1) . There was no appearance of tropomyosin degradation products or lack of a native band detected in any of the samples.Similarly, analysis of cTnT revealed a band doublet between 40 and 50 kDa in all investigated cases. However, in 37 of the 40 cases, an additional band, most probably representing a cTnT degradation product (cTnT dp1), was detected at approximately 38 kDa. The small number of cases without that particular extra band prevented a statistical correlation with ADDs. In addition, a second degradation product (cTnT dp2) with a molecular weight of about 33 kDa was found below the cTnT dp1 band in 22 cases. The correlation between the presence of the cTnT dp2 band and ADD is highly significant (Spearman's \u03c1 = 0.552, p \u2264 0.001). Logistic regression analysis reveals that cTnT dp2 is more likely to be present than absent from 9.4\u00b0d onward (inflection point of curve) and significantly present from 28.2\u00b0d onward (>95 % likelihood of band presence; Fig. 2) .Desmin Western blots showed a band triplet between 45 and 55 kDa in all but one cases. The only exception was the sample of a 77-year-old male with a PMI of 3.7\u00b0d which rendered only a single desmin band. Many of the samples showed additional bands representing desmin degradation products, desmin dp1 at about 38 kDa in 24 cases, desmin dp2 at 35 kDa in 21 cases, and desmin dp3 at approximately 32 kDa in 8 cases (Fig. 1) . 
Although these degradation products seemed to appear consecutively (in no sample, dp2 was found in the absence of dp1 or dp3 in the absence dp1 and dp2), there was no statistical correlation found between the presence of desmin dp2 and dp3 and the ADD. By contrast, desmin dp1 was found to be significantly correlated with ADD (\u03c1 = 0.500, p \u2264 0.01). This degradation product was significantly present from 28.1\u00b0d onward (in 50 % of the cases at 6.7\u00b0d).A native PAGE on casein-copolymerized gels with subsequent zymography was performed to obtain insight into the postmortem activity of the calpain system. Once activated in incubation buffer, two bands were detected in all samples. The first column depicts the number of cases with a certain degradation product (dp). Given that two groups (absent vs present dp) provide large enough sample size, the possible correlation with the ADD was determined using Spearman's \u03c1 (second column H1, there is a correlation between ADD value and the presence (1) or absence (0) of degradation products, and H0, presence of degradation products is random within ADD values). Logistic regression curves were determined for all degradation products with significant correlation to the ADD (p \u2264 0.01). The third column represents the predicted presence probabilities (P) of 50 and 95 % of the logistic regression These bands are known to represent calpain 1 and calpain 2 in a native state [19] . An additional band (here termed calpain dp1) localized between the bands of calpain 1 and 2 could be detected in 14 cases. This band is described to represent an autolyzed and hence activated form of calpain 1 [20] . Its presence could be highly significantly correlated with the time since death (\u03c1 = 0.491, p \u2264 0.01). Regression analysis reveals that this degradation product is present in half of the cases and in 95 % of the cases at 15.0 and 39.6\u00b0d, respectively. 
No activated form of calpain 2 was detected in any of the 40 cases.", "cite_spans": [{"start": 2974, "end": 2978, "text": "[19]", "latex": null, "ref_id": "BIBREF18"}, {"start": 3191, "end": 3195, "text": "[20]", "latex": null, "ref_id": "BIBREF19"}], "ref_spans": [{"start": 141, "end": 149, "text": "(Fig. 1)", "latex": null, "ref_id": "FIGREF0"}, {"start": 1115, "end": 1122, "text": "Fig. 2)", "latex": null, "ref_id": null}, {"start": 1539, "end": 1547, "text": "(Fig. 1)", "latex": null, "ref_id": "FIGREF0"}], "eq_spans": [], "section": null}, {"text": "Apart from the effects of temperature, there is little information on the influencing factors of protein degradation processes in human muscle. Aiming to reduce this deficit, we investigated the possible influence of the factors age, BMI, cause of death, and sex on postmortem protein decomposition.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "For this purpose, we performed, in addition to analysis of the total group of cases, a statistical analysis in an age-corrected group, excluding individuals below 18 and above 80 years. This was done to evaluate whether the potential presence of developmental isoforms of muscle proteins in young persons or changes related to severe sarcopenia in very old persons influences the correlation between protein degradation data and the ADD. Correlation coefficients reveal that the degradation of all investigated proteins correlates stronger with the ADD in the age-corrected group than in the total group (Table 2) . This includes that the second degradation product of desmin (desmin dp2), which does not correlate with ADD in the total group, shows a significant level of correlation when tested against the age-corrected group (Spearman's \u03c1 = 0.567, p \u2264 0.01). In addition to correlation analysis, logistic regression models were fitted for each degradation product in the age-corrected group. 
Regression curves showed distinctly steeper slopes in all cases, indicating smaller time windows for the appearance of degradation products (Table 2 and Fig. 2 ). Thus, cTnT dp2 and desmin dp1 become significantly present (P > 95 %) at about half of the ADD value compared to the total group (12.6 vs 28.2\u00b0d for cTnT dp1 and 15.4 vs 28.1\u00b0d for desmin dp1). Desmin dp2 and calpain dp1 exceed 95 % probability of presence at 23.3 and 21.3\u00b0d, respectively.", "cite_spans": [], "ref_spans": [{"start": 604, "end": 614, "text": "(Table 2)", "latex": null, "ref_id": "TABREF1"}, {"start": 1137, "end": 1145, "text": "(Table 2", "latex": null, "ref_id": "TABREF1"}, {"start": 1150, "end": 1156, "text": "Fig. 2", "latex": null, "ref_id": null}], "eq_spans": [], "section": null}, {"text": "In a second evaluation, we examined groups that were corrected on the basis of body mass index, with similar results as obtained with age-corrected groups. Exclusion of all cases with a BMI higher than 30 (value widely considered as the limit of obesity) and below 19 (considered as underweight or even cachectic) results in increased correlation coefficients compared to the total group for cTnT dp2 and calpain 1 dp1. This was confirmed by logistic regression, with curves becoming steeper in both cases. The presence probability of cTnT dp2 exceeds 95 % at 12.4\u00b0d, compared to 28.2\u00b0d when using all samples, and calpain dp1 reaches 95 % probability of presence at 21.2\u00b0d compared to 39.6\u00b0d. By contrast, ", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "Attempts to identify cause of death as influencing factor turned out to be complex. The 40 subjects examined had over 20 different causes of death diagnosed, thus making it virtually impossible to cluster them into groups, large enough to allow sensible comparison. 
When causes of death are classified into the four categories, Binternal^(27 cases), Btrauma( 10), Bintoxication^(1), and unknown (2), evidently, only the categories internal and trauma are large enough for attempting further analysis. Correlating these causes of death with age expectedly revealed a highly significant relationship. Internal cases correlate positively with age (Spearman's \u03c1 = 0.502, p \u2264 0.001), and trauma cases negatively (\u03c1 = \u22120.494, p \u2264 0.001), making it impossible to determine cause of death effects in an uninfluenced manner.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": null}, {"text": "Cases were assigned accordingly to analyze sex as a further possible influencing factor. No major influence of sex on the correlation with ADD was detected for troponin (female Spearman's \u03c1 = 0.566, male \u03c1 = 0.561) and calpain 1 degradation products (female Spearman's \u03c1 = 0.513, male \u03c1 = 0.561). By contrast, the correlations for the presence of both degradation products of desmin were clearly increased in the female group (desmin dp1 \u03c1 = 0.662, desmin dp2 \u03c1 = 0.407) but decreased in the male group (desmin dp1 \u03c1 = 0.361, desmin dp2 \u03c1 = 0.195), both compared to the total group. Fig. 2 Logistic regression curves of significantly ADD-correlated degradation products cTnT dp2 (a), calpain 1 dp (b), desmin dp1 (c), and desmin dp2 (d) represent presence probability development. Regression curves are plotted within the ADD range from 2.6 to 36.0. Solid lines stand for the regression of the total group (40 samples), whereas broken lines represent age-corrected (dashes) and BMI-corrected (dots) groups. With increasing ADD, the presence probability of all degradation products rises. 
Especially, the age-corrected regressions are steep in all cases and exceed the P = 0.95 confidence limit (dotted horizontal line) at lower ADD compared to the total group", "cite_spans": [], "ref_spans": [{"start": 583, "end": 589, "text": "Fig. 2", "latex": null, "ref_id": null}], "eq_spans": [], "section": null}, {"text": "The results of this study clearly demonstrate the capability of muscle protein analyses to serve as a novel method for the delimitation of the time since death in humans. Degradation processes of skeletal muscle are identified to exhibit a discrete dependence upon accumulated degree-days, as a measure of the postmortem interval. Specifically, we were able to identify particular proteins (desmin, cTnT, and calpain 1) and their degradation products that can be used as markers for specific time intervals in the postmortem decomposition of a human body.The desmin degradation products of humans are similar to those found in meat science studies, analyzing molecular composition of muscle proteins in various domestic animals such as cattle [21] , pigs [22] , lamb [23] , and chicken [24] . These split products of desmin appeared regularly from 1 to 2 days postmortem onward, and their development was shown to be promoted by incubation of the muscle tissue with calpain 1 and calpain 2 [25, 26] . Similar desmin degradation products were also detected in our previous study of postmortem muscle alteration in pigs [14] . In this work, we monitored desmin degradation products over a period of 10 days. Comparison to the present human data indicates a later appearance of some of these split products in the porcine muscle. This accounts, for example, for the 38-kDa desmin degradation product (desmin dp1), which is significantly present from 28.1\u00b0d onward in humans but did not arise before 56.5\u00b0d in pigs. 
Whether this divergence results from interspecific differences in the protein itself, or as a consequence of the different physical Bstorage\u0109 onditions (e.g., regarding humidity), remains to be determined. Interspecific variation in the degradation velocity of the desmin protein may be an explanatory factor. However, information on desmin degradation time is limited and suggests that it occurs rather slowly. In cattle muscle, a complete degradation of the native desmin band has not been found before 112\u00b0d [12] . Because the maximum ADD of the present cases in far below this value, the present work does not yield further improvement in this respect. A high degree of similarity with our previous study in pigs [14] is also noted for the results on the decomposition of cTnT. This relates specifically to the observations that degradation products in pigs and humans are the same in size and that their presence is clearly associated with the time since death. The finding that the appearance of cTnT dp1 does not correlate with the ADD is most probably due to the small number of cases (3 out of 40), in which this degradation product was not detected (Table 2) . However, the observation that two of the three cases without that band had a rather short ADD of 2.6 and 3.1\u00b0d might provide a hint that cTnT dp1 emerges soon after death. The third case, a person with an ADD of 15.5\u00b0d and a disproportionately high BMI of 50.4, interestingly presented a band for cTnT dp2 (indicating advanced cTnT degradation) while lacking a band for cTnT dp1.In human muscle, particularly, cTnT dp2 correlates significantly with the ADD. Exhibiting the highest correlation coefficient of all protein derivatives analyzed, cTnT dp2 presents itself as a most valuable decomposition marker in PMI analysis. Similar to desmin, the cTnT degradation products appeared earlier in humans than in pigs (e.g., cTnT dp2 at 28.2 and 125.7\u00b0d, respectively). 
Temperature-and PMI-dependent degradation kinetics of cTnT have recently been found in explanted human heart tissue [27] . Note that cardiac troponin T is not limited to cardiac muscle but also occurs in skeletal muscle tissue [25] .As a further result of the present work, it was demonstrated that some proteins remain unaffected by degradation processes throughout the observed period. This accounts for tropomyosin which was present as a double band in all analyzed cases. This is in agreement with our findings on porcine muscle protein degradation in the first 10 days postmortem [14] . However, as tropomyosin is also known to be a substrate for calpain proteolysis [28] , it is a candidate for split product occurrence in more advanced stages of muscle decomposition stages.A particular focus in the present work was placed on the patterns of calpain activation. Enzymes of the calpain family are known to play a major role in muscle protein degradation [29] . They are responsible for proteolytic degradation of a variety of proteins, including desmin, tropomyosin, and cTnT [30] . The presence of the autolyzed and hence activated form of calpain 1 in the investigated human muscle samples may therefore be a direct cause for the observed degradation patterns of desmin and cTnT in these samples. By contrast, there was no autolyzed form of calpain 2 until 36\u00b0d. This is in agreement with our previous data from porcine muscle, demonstrating that activation of calpain 2 is substantially delayed compared to calpain 1 (100.6 vs 18.9\u00b0d) [14] . It may be hypothesized that this delay is evoked by the different Ca 2+ sensitivity of the two isoforms. It is much higher for calpain 1 than for calpain 2 [29] . The present data demonstrate that the activation of calpain 1 itself is also dependent upon PMI and temperature and significantly occurs at 39.6\u00b0d. 
This complies with results from Western blot analyses in bovine, ovine, and porcine muscles, showing that the 80-kDa subunit of calpain 1 almost entirely degraded into an active 76-kDa subunit within the first 7 days postmortem, while the 80-kDa subunit of calpain 2 remained un-degraded in this period [14, 23, 31] . Thus, the activity profiles of these enzymes also characterize specific postmortem phases, making them promising candidates for PMI delimitation.In addition to identifying indicator proteins for future use in time-precise PMI determination, the findings of this study contribute to extend the present understanding of postmortem muscle decomposition generally and of how they are influenced by environmental and demographic factors. Comprehensive knowledge of these aspects is crucial for the development of a protein degradation-based method for PMI analysis and essential to define excluding factors of the method and to determine the scope of the estimation.The most important external element influencing body decomposition is temperature. Showing clear correlations between the degradation of skeletal muscle proteins and ADD, the present data confirm the usefulness of ADD as a reference measure in PMI delimitation. In addition, the data provide evidence that age and body mass are also important factors that influence postmortem muscle protein degradation. Exclusion of extreme cases (i.e., very old or very young and obese or underweight/cachectic, respectively) definitely improves the predictive precision of the proposed method (Table 1 ). This is also clearly evident from the steeper slopes of the curves rendered by the regression models for the ageand BMI-corrected groups. 
Considerations as to the reasons of the age and BMI effects drive attention to a variety of aspects.Regarding age, the differences in muscle protein degradation may be attributed to either the presence of developmental isoforms in young or often pronounced muscle wasting (i.e., sarcopenia) in the very old subjects [32] . Interestingly, an analysis of the postmortem rise in vitreous potassium levels also demonstrated that subjects below 18 and over 80 years substantially deviate from the age groups in between [33] .It is generally accepted that body mass influences the progression of algor mortis because loss of temperature occurs more slowly in bodies with high masses than in cachectic ones. Thus, the BMI is applied as an important correction factor in this regard [1] . The influence of body mass on protein degradation observed in the present work may, on the one hand, be a direct consequence of the above described differences in body cooling rates. On the other hand, it may also be associated with the finding that subjects with very high and very low BMI have sarcopenic symptoms and abnormal protein metabolism rates [32, 34] . A similar BMI-dependent influence has been demonstrated for RNA integrity in muscle tissue after death, which is significantly lower in samples from persons with a BMI > 25 [35] .In contrast to age and body mass, sex was found to exert no major effects on postmortem protein degradation. The only exception is the ADD correlation of the two degradation products of desmin, which is clearly lower in males than in females. The underlying mechanism of this difference remains unclear, as well as to whether there is any relation to the fact that desmin is one of only a few muscle proteins with higher expression levels in men than in women [36] .In summary, the present study demonstrates that analysis of muscle protein degradation processes has a high potential for being a useful tool in forensic PMI delimitation. 
We identify candidate proteins with degradation properties to make them most suitable for delimiting certain periods of time postmortem, even under the heterogeneous conditions that are inevitably encountered when examining non-standardized human subjects. This offers a chance to establish a new method for the delimitation of the time since death. The proposed approach provides two important advantages: (i) Muscle tissue is highly abundant in human bodies; it is easily accessible during forensic examination and contains a vast set of proteins, each with the capability to act as a marker molecule for different stages postmortem. (ii) Protein analysis by gel electrophoresis and Western blotting is an easy method, established in almost every biochemical laboratory, and delivers discrete information about degradation processes. Several proteins can be simultaneously tested within 24 h, and a huge variety of antisera for muscle proteins is commercially available. Thus, this method has the potential to support and eventually substitute complicated or less accurate existing approaches. However, additional research is still needed to amplify the group of marker proteins. So far, we present two lower limits for ADD intervals with 95 % probability in an overall approach, as well as four lower 95 % limits considering demographic restrictions for this method. Also, it is still necessary to fine-tune the correction for the bias introduced by factors such as age and body mass. 
Moreover, specifically, longitudinal studies allowing repeated sampling of individual subjects through an extended period of time postmortem would offer the chance to obtain valuable additional information about the exact point of time of certain degradation phenomena.", "cite_spans": [{"start": 743, "end": 747, "text": "[21]", "latex": null, "ref_id": "BIBREF20"}, {"start": 755, "end": 759, "text": "[22]", "latex": null, "ref_id": "BIBREF21"}, {"start": 767, "end": 771, "text": "[23]", "latex": null, "ref_id": "BIBREF22"}, {"start": 786, "end": 790, "text": "[24]", "latex": null, "ref_id": "BIBREF23"}, {"start": 990, "end": 994, "text": "[25,", "latex": null, "ref_id": "BIBREF24"}, {"start": 995, "end": 998, "text": "26]", "latex": null, "ref_id": "BIBREF25"}, {"start": 1118, "end": 1122, "text": "[14]", "latex": null, "ref_id": "BIBREF13"}, {"start": 2023, "end": 2027, "text": "[12]", "latex": null, "ref_id": "BIBREF11"}, {"start": 2229, "end": 2233, "text": "[14]", "latex": null, "ref_id": "BIBREF13"}, {"start": 3564, "end": 3568, "text": "[27]", "latex": null, "ref_id": "BIBREF26"}, {"start": 3675, "end": 3679, "text": "[25]", "latex": null, "ref_id": "BIBREF24"}, {"start": 4033, "end": 4037, "text": "[14]", "latex": null, "ref_id": "BIBREF13"}, {"start": 4120, "end": 4124, "text": "[28]", "latex": null, "ref_id": "BIBREF27"}, {"start": 4409, "end": 4413, "text": "[29]", "latex": null, "ref_id": "BIBREF28"}, {"start": 4531, "end": 4535, "text": "[30]", "latex": null, "ref_id": "BIBREF29"}, {"start": 4993, "end": 4997, "text": "[14]", "latex": null, "ref_id": "BIBREF13"}, {"start": 5156, "end": 5160, "text": "[29]", "latex": null, "ref_id": "BIBREF28"}, {"start": 5614, "end": 5618, "text": "[14,", "latex": null, "ref_id": "BIBREF13"}, {"start": 5619, "end": 5622, "text": "23,", "latex": null, "ref_id": "BIBREF22"}, {"start": 5623, "end": 5626, "text": "31]", "latex": null, "ref_id": "BIBREF30"}, {"start": 7336, "end": 7340, "text": "[32]", "latex": null, 
"ref_id": "BIBREF31"}, {"start": 7534, "end": 7538, "text": "[33]", "latex": null, "ref_id": "BIBREF32"}, {"start": 7795, "end": 7798, "text": "[1]", "latex": null, "ref_id": "BIBREF0"}, {"start": 8155, "end": 8159, "text": "[32,", "latex": null, "ref_id": "BIBREF31"}, {"start": 8160, "end": 8163, "text": "34]", "latex": null, "ref_id": "BIBREF33"}, {"start": 8339, "end": 8343, "text": "[35]", "latex": null, "ref_id": "BIBREF34"}, {"start": 8805, "end": 8809, "text": "[36]", "latex": null, "ref_id": "BIBREF35"}], "ref_spans": [{"start": 2671, "end": 2680, "text": "(Table 2)", "latex": null, "ref_id": "TABREF1"}, {"start": 6870, "end": 6878, "text": "(Table 1", "latex": null, "ref_id": "TABREF0"}], "eq_spans": [], "section": null}], "ref_entries": {"FIGREF0": {"text": "Fig. 1 Degradation behavior of tropomyosin, cTnT, desmin, and calpain of four individual cases with varying ADD. Western blot (a-c) and zymography (d) analyses of muscle protein degradation. Tropomyosin bands (a) remain stable independent of ADD and no degradation", "latex": null, "type": "figure"}, "TABREF0": {"text": "Summary of data collected and calculated for all 40 forensic cases (23 female, 17 male) in the present study", "latex": null, "type": "table"}, "TABREF1": {"text": "The results of protein degradation analyses within the total group of cases, as well as-when meaningful-age-and BMI-restricted groups", "latex": null, "type": "table"}}, "bib_entries": {"BIBREF0": {"ref_id": "b0", "title": "Estimation of the time since death", "authors": [{"first": "C", "middle": [], "last": "Henssge", "suffix": ""}, {"first": "B", "middle": [], "last": "Madea", "suffix": ""}], "year": 2007, "venue": "Forensic Sci Int", "volume": "165", "issn": "", "pages": "182--184", "other_ids": {"doi": ["10.1016/j.forsciint.2006.05.017"]}, "links": "42680165"}, "BIBREF1": {"ref_id": "b1", "title": "An overview of methods used for estimation of time since death", "authors": [{"first": "A", "middle": [], "last": "Mathur", 
"suffix": ""}, {"first": "Y", "middle": ["K"], "last": "Agrawal", "suffix": ""}], "year": 2011, "venue": "Aust J Forensic Sci", "volume": "43", "issn": "", "pages": "275--285", "other_ids": {"doi": ["10.1080/00450618.2011.568970"]}, "links": "73119687"}, "BIBREF2": {"ref_id": "b2", "title": "Importance of supravitality in forensic medicine", "authors": [{"first": "B", "middle": [], "last": "Madea", "suffix": ""}], "year": 1994, "venue": "Forensic Sci Int", "volume": "69", "issn": "", "pages": "221--241", "other_ids": {}, "links": "42002356"}, "BIBREF3": {"ref_id": "b3", "title": "Experiences with a compound method for estimating the time since death. II. Integration of non-temperature-based methods", "authors": [{"first": "C", "middle": [], "last": "Henssge", "suffix": ""}, {"first": "L", "middle": [], "last": "Althaus", "suffix": ""}, {"first": "J", "middle": [], "last": "Bolt", "suffix": ""}], "year": 2000, "venue": "Int J Legal Med", "volume": "113", "issn": "", "pages": "320--331", "other_ids": {}, "links": null}, "BIBREF4": {"ref_id": "b4", "title": "Experiences with a compound method for estimating the time since death. I. 
Rectal temperature nomogram for time since death", "authors": [{"first": "C", "middle": [], "last": "Henssge", "suffix": ""}, {"first": "L", "middle": [], "last": "Althaus", "suffix": ""}, {"first": "J", "middle": [], "last": "Bolt", "suffix": ""}], "year": 2000, "venue": "Int J Legal Med", "volume": "113", "issn": "", "pages": "303--319", "other_ids": {}, "links": null}, "BIBREF5": {"ref_id": "b5", "title": "Forensic entomology: applications and limitations", "authors": [{"first": "J", "middle": [], "last": "Amendt", "suffix": ""}, {"first": "C", "middle": ["S"], "last": "Richards", "suffix": ""}, {"first": "C", "middle": ["P"], "last": "Campobasso", "suffix": ""}], "year": 2011, "venue": "Forensic Sci Med Pathol", "volume": "7", "issn": "", "pages": "379--392", "other_ids": {"doi": ["10.1007/s12024-010-9209-2"]}, "links": "43714395"}, "BIBREF6": {"ref_id": "b6", "title": "Estimating postmortem interval using RNA degradation and morphological changes in tooth pulp", "authors": [{"first": "S", "middle": ["T"], "last": "Young", "suffix": ""}, {"first": "J", "middle": ["D"], "last": "Wells", "suffix": ""}, {"first": "G", "middle": ["R"], "last": "Hobbs", "suffix": ""}, {"first": "C", "middle": ["P"], "last": "Bishop", "suffix": ""}], "year": 2013, "venue": "Forensic Sci Int", "volume": "229", "issn": "", "pages": "163--164", "other_ids": {}, "links": "8369571"}, "BIBREF7": {"ref_id": "b7", "title": "A microbial clock provides an accurate estimate of the postmortem interval in a mouse model system", "authors": [{"first": "J", "middle": ["L"], "last": "Metcalf", "suffix": ""}, {"first": "L", "middle": [], "last": "Wegener Parfrey", "suffix": ""}, {"first": "A", "middle": [], "last": "Gonzalez", "suffix": ""}], "year": 2013, "venue": "", "volume": "2", "issn": "", "pages": "", "other_ids": {"doi": ["10.7554/eLife.01104"]}, "links": "13121602"}, "BIBREF8": {"ref_id": "b8", "title": "Estimating time since death from postmortem human gut microbial communities", "authors": 
[{"first": "K", "middle": ["A"], "last": "Hauther", "suffix": ""}, {"first": "K", "middle": ["L"], "last": "Cobaugh", "suffix": ""}, {"first": "L", "middle": ["M"], "last": "Jantz", "suffix": ""}], "year": 2015, "venue": "J Forensic Sci", "volume": "60", "issn": "", "pages": "1234--1240", "other_ids": {"doi": ["10.1111/1556-4029.12828"]}, "links": "28321113"}, "BIBREF9": {"ref_id": "b9", "title": "Decomposition chemistry of human remains: a new methodology for determining the postmortem interval", "authors": [{"first": "A", "middle": ["A"], "last": "Vass", "suffix": ""}, {"first": "S-A", "middle": [], "last": "Barshick", "suffix": ""}, {"first": "G", "middle": [], "last": "Sega", "suffix": ""}], "year": 2002, "venue": "J Forensic Sci", "volume": "47", "issn": "", "pages": "542--553", "other_ids": {}, "links": "19621832"}, "BIBREF10": {"ref_id": "b10", "title": "Is Z-disk degradation responsible for postmortem tenderization?", "authors": [{"first": "R", "middle": ["G"], "last": "Taylor", "suffix": ""}, {"first": "G", "middle": ["H"], "last": "Geesink", "suffix": ""}, {"first": "V", "middle": ["F"], "last": "Thompson", "suffix": ""}], "year": 1995, "venue": "J Anim Sci", "volume": "73", "issn": "", "pages": "1351--1367", "other_ids": {}, "links": "8585905"}, "BIBREF11": {"ref_id": "b11", "title": "Proteolysis of specific muscle structural proteins by mu-calpain at low pH and temperature is similar to degradation in postmortem bovine muscle", "authors": [{"first": "E", "middle": [], "last": "Huff-Lonergan", "suffix": ""}, {"first": "T", "middle": [], "last": "Mitsuhashi", "suffix": ""}, {"first": "D", "middle": ["D"], "last": "Beekman", "suffix": ""}], "year": 1996, "venue": "J Anim Sci", "volume": "74", "issn": "", "pages": "993--1008", "other_ids": {}, "links": "3488657"}, "BIBREF12": {"ref_id": "b12", "title": "Sodium dodecyl sulfate-polyacrylamide gel electrophoresis and western blotting comparisons of purified myofibrils and whole muscle preparations for 
evaluating titin and nebulin in postmortem bovine muscle", "authors": [{"first": "E", "middle": [], "last": "Huff-Lonergan", "suffix": ""}, {"first": "T", "middle": [], "last": "Mitsuhashi", "suffix": ""}, {"first": "F", "middle": ["C"], "last": "Parrish", "suffix": ""}, {"first": "R", "middle": ["M"], "last": "Robson", "suffix": ""}], "year": 1996, "venue": "J Anim Sci", "volume": "74", "issn": "", "pages": "779--785", "other_ids": {}, "links": "35478934"}, "BIBREF13": {"ref_id": "b13", "title": "Postmortem degradation of skeletal muscle proteins: a novel approach to determine the time since death", "authors": [{"first": "S", "middle": [], "last": "Pittner", "suffix": ""}, {"first": "F", "middle": ["C"], "last": "Monticelli", "suffix": ""}, {"first": "A", "middle": [], "last": "Pfisterer", "suffix": ""}], "year": 2015, "venue": "Int J Legal Med", "volume": "", "issn": "", "pages": "1--11", "other_ids": {"doi": ["10.1007/s00414-015-1210-6"]}, "links": "9502893"}, "BIBREF14": {"ref_id": "b14", "title": "Using accumulated degree-days to estimate the postmortem interval from decomposed human remains", "authors": [{"first": "M", "middle": ["S"], "last": "Megyesi", "suffix": ""}, {"first": "S", "middle": ["P"], "last": "Nawrocki", "suffix": ""}, {"first": "N", "middle": ["H"], "last": "Haskell", "suffix": ""}], "year": 2005, "venue": "J Forensic Sci", "volume": "50", "issn": "", "pages": "618--626", "other_ids": {}, "links": "30301335"}, "BIBREF15": {"ref_id": "b15", "title": "Using accumulated degree-days to estimate postmortem interval from the DNA yield of porcine skeletal muscle", "authors": [{"first": "B", "middle": [], "last": "Larkin", "suffix": ""}, {"first": "S", "middle": [], "last": "Iaschi", "suffix": ""}, {"first": "I", "middle": [], "last": "Dadour", "suffix": ""}, {"first": "G", "middle": ["K"], "last": "Tay", "suffix": ""}], "year": 2009, "venue": "Forensic Sci Med Pathol", "volume": "6", "issn": "", "pages": "83--92", "other_ids": {}, "links": 
"23592455"}, "BIBREF16": {"ref_id": "b16", "title": "The effect of temperature on development of Sarconesia chlorogaster, a blowfly of forensic importance", "authors": [{"first": "M", "middle": ["C"], "last": "Lecheta", "suffix": ""}, {"first": "P", "middle": ["J"], "last": "Thyssen", "suffix": ""}, {"first": "M", "middle": ["O"], "last": "Moura", "suffix": ""}], "year": 2015, "venue": "Forensic Sci Med Pathol", "volume": "11", "issn": "", "pages": "538--543", "other_ids": {"doi": ["10.1007/s12024-015-9727-z"]}, "links": "5646679"}, "BIBREF17": {"ref_id": "b17", "title": "Cleavage of structural proteins during the assembly of the head of bacteriophage T4", "authors": [{"first": "U", "middle": ["K"], "last": "Laemmli", "suffix": ""}], "year": 1970, "venue": "Nature", "volume": "227", "issn": "", "pages": "680--685", "other_ids": {"doi": ["10.1038/227680a0"]}, "links": "3105149"}, "BIBREF18": {"ref_id": "b18", "title": "Casein zymography: a method to study \u03bc-calpain, m-calpain, and their inhibitory agents", "authors": [{"first": "K", "middle": ["J"], "last": "Raser", "suffix": ""}, {"first": "A", "middle": [], "last": "Posner", "suffix": ""}, {"first": "Kkw", "middle": [], "last": "Wang", "suffix": ""}], "year": 1995, "venue": "Arch Biochem Biophys", "volume": "319", "issn": "", "pages": "211--216", "other_ids": {"doi": ["10.1006/abbi.1995.1284"]}, "links": "8087616"}, "BIBREF19": {"ref_id": "b19", "title": "The effect of temperature on the activity of \u03bc-and m-calpain and calpastatin during post-mortem storage of porcine longissimus muscle", "authors": [{"first": "L", "middle": [], "last": "Pomponio", "suffix": ""}, {"first": "P", "middle": [], "last": "Ertbjerg", "suffix": ""}], "year": 2012, "venue": "Meat Sci", "volume": "91", "issn": "", "pages": "50--55", "other_ids": {"doi": ["10.1016/j.meatsci.2011.12.005"]}, "links": "44655143"}, "BIBREF20": {"ref_id": "b20", "title": "Oxidative environments decrease tenderization of beef steaks through inactivation 
of mu-calpain", "authors": [{"first": "L", "middle": ["J"], "last": "Rowe", "suffix": ""}, {"first": "K", "middle": ["R"], "last": "Maddock", "suffix": ""}, {"first": "S", "middle": ["M"], "last": "Lonergan", "suffix": ""}, {"first": "E", "middle": [], "last": "Huff-Lonergan", "suffix": ""}], "year": 2004, "venue": "J Anim Sci", "volume": "82", "issn": "", "pages": "3254--3266", "other_ids": {}, "links": "4028850"}, "BIBREF21": {"ref_id": "b21", "title": "Contribution of postmortem changes of integrin, desmin and \u03bc-calpain to variation in water holding capacity of pork", "authors": [{"first": "W", "middle": ["G"], "last": "Zhang", "suffix": ""}, {"first": "S", "middle": ["M"], "last": "Lonergan", "suffix": ""}, {"first": "M", "middle": ["A"], "last": "Gardner", "suffix": ""}, {"first": "E", "middle": [], "last": "Huff-Lonergan", "suffix": ""}], "year": 2006, "venue": "Meat Sci", "volume": "74", "issn": "", "pages": "578--585", "other_ids": {"doi": ["10.1016/j.meatsci.2006.05.008"]}, "links": "3483666"}, "BIBREF22": {"ref_id": "b22", "title": "Postmortem proteolysis and calpain/calpastatin activity in callipyge and normal lamb biceps femoris during extended postmortem storage", "authors": [{"first": "G", "middle": ["H"], "last": "Geesink", "suffix": ""}, {"first": "M", "middle": [], "last": "Koohmaraie", "suffix": ""}], "year": 1999, "venue": "J Anim Sci", "volume": "77", "issn": "", "pages": "1490--1501", "other_ids": {}, "links": "9969394"}, "BIBREF23": {"ref_id": "b23", "title": "Quantitative determination of titin and nebulin in poultry meat by SDS-PAGE with an internal standard", "authors": [{"first": "J", "middle": [], "last": "Tomaszewska-Gras", "suffix": ""}, {"first": "J", "middle": [], "last": "Kijowski", "suffix": ""}, {"first": "Fjg", "middle": [], "last": "Schreurs", "suffix": ""}], "year": 2002, "venue": "Meat Sci", "volume": "62", "issn": "", "pages": "228--233", "other_ids": {}, "links": "36139253"}, "BIBREF24": {"ref_id": "b24", "title": 
"Cardiac troponin T composition in normal and regenerating human skeletal muscle", "authors": [{"first": "G", "middle": ["S"], "last": "Bodor", "suffix": ""}, {"first": "L", "middle": [], "last": "Survant", "suffix": ""}, {"first": "E", "middle": ["M"], "last": "Voss", "suffix": ""}], "year": 1997, "venue": "Clin Chem", "volume": "43", "issn": "", "pages": "476--484", "other_ids": {}, "links": "506213"}, "BIBREF25": {"ref_id": "b25", "title": "Effect of postmortem storage on mu-calpain and m-calpain in ovine skeletal muscle", "authors": [{"first": "E", "middle": [], "last": "Veiseth", "suffix": ""}, {"first": "S", "middle": ["D"], "last": "Shackelford", "suffix": ""}, {"first": "T", "middle": ["L"], "last": "Wheeler", "suffix": ""}, {"first": "M", "middle": [], "last": "Koohmaraie", "suffix": ""}], "year": 2001, "venue": "J Anim Sci", "volume": "79", "issn": "", "pages": "1502--1508", "other_ids": {}, "links": "1816539"}, "BIBREF26": {"ref_id": "b26", "title": "Temperature-dependent postmortem changes in human cardiac troponin-T (cTnT): an approach in estimation of time since death", "authors": [{"first": "S", "middle": [], "last": "Kumar", "suffix": ""}, {"first": "W", "middle": [], "last": "Ali", "suffix": ""}, {"first": "U", "middle": ["S"], "last": "Singh", "suffix": ""}], "year": 2015, "venue": "J Forensic Sci", "volume": "", "issn": "", "pages": "", "other_ids": {}, "links": "1350530"}, "BIBREF27": {"ref_id": "b27", "title": "Immunological detection of m-and \u03bc-calpains in the skeletal muscle of Marchigiana cattle", "authors": [{"first": "E", "middle": [], "last": "Varricchio", "suffix": ""}, {"first": "M", "middle": ["G"], "last": "Russolillo", "suffix": ""}, {"first": "L", "middle": [], "last": "Maruccio", "suffix": ""}], "year": 2013, "venue": "Eur J Histochem", "volume": "57", "issn": "", "pages": "", "other_ids": {"doi": ["10.4081/ejh.2013.e2"]}, "links": "17420514"}, "BIBREF28": {"ref_id": "b28", "title": "The calpain system", "authors": [{"first": 
"D", "middle": ["E"], "last": "Goll", "suffix": ""}, {"first": "V", "middle": ["F"], "last": "Thompson", "suffix": ""}, {"first": "H", "middle": [], "last": "Li", "suffix": ""}], "year": 2003, "venue": "Physiol Rev", "volume": "83", "issn": "", "pages": "731--801", "other_ids": {"doi": ["10.1152/physrev.00029.2002"]}, "links": "37422100"}, "BIBREF29": {"ref_id": "b29", "title": "Biochemistry of postmortem muscle-lessons on mechanisms of meat tenderization", "authors": [{"first": "E", "middle": [], "last": "Huff Lonergan", "suffix": ""}, {"first": "W", "middle": [], "last": "Zhang", "suffix": ""}, {"first": "S", "middle": ["M"], "last": "Lonergan", "suffix": ""}], "year": 2010, "venue": "Meat Sci", "volume": "86", "issn": "", "pages": "184--195", "other_ids": {"doi": ["10.1016/j.meatsci.2010.05.004"]}, "links": "13514465"}, "BIBREF30": {"ref_id": "b30", "title": "Changes in the calpains and calpastatin during postmortem storage of bovine muscle", "authors": [{"first": "M", "middle": ["L"], "last": "Boehm", "suffix": ""}, {"first": "T", "middle": ["L"], "last": "Kendall", "suffix": ""}, {"first": "V", "middle": ["F"], "last": "Thompson", "suffix": ""}, {"first": "D", "middle": ["E"], "last": "Goll", "suffix": ""}], "year": 1998, "venue": "J Anim Sci", "volume": "76", "issn": "", "pages": "2415--2434", "other_ids": {}, "links": "3001356"}, "BIBREF31": {"ref_id": "b31", "title": "Skeletal muscle wasting in cachexia and sarcopenia: molecular pathophysiology and impact of exercise training", "authors": [{"first": "T", "middle": ["S"], "last": "Bowen", "suffix": ""}, {"first": "G", "middle": [], "last": "Schuler", "suffix": ""}, {"first": "V", "middle": [], "last": "Adams", "suffix": ""}], "year": 2015, "venue": "J Cachexia Sarcopenia Muscle", "volume": "6", "issn": "", "pages": "197--207", "other_ids": {"doi": ["10.1002/jcsm.12043"]}, "links": "1067743"}, "BIBREF32": {"ref_id": "b32", "title": "A new model for the estimation of time of death from vitreous potassium 
levels corrected for age and temperature", "authors": [{"first": "B", "middle": [], "last": "Zilg", "suffix": ""}, {"first": "S", "middle": [], "last": "Bernard", "suffix": ""}, {"first": "K", "middle": [], "last": "Alkass", "suffix": ""}], "year": 2015, "venue": "Forensic Sci Int", "volume": "254", "issn": "", "pages": "158--166", "other_ids": {}, "links": "39011591"}, "BIBREF33": {"ref_id": "b33", "title": "Abnormal protein turnover and anabolic resistance to exercise in sarcopenic obesity", "authors": [{"first": "M", "middle": ["I"], "last": "Nilsson", "suffix": ""}, {"first": "J", "middle": ["P"], "last": "Dobson", "suffix": ""}, {"first": "N", "middle": ["P"], "last": "Greene", "suffix": ""}], "year": 2013, "venue": "FASEB J", "volume": "27", "issn": "", "pages": "3905--3916", "other_ids": {"doi": ["10.1096/fj.12-224006"]}, "links": "7368325"}, "BIBREF34": {"ref_id": "b34", "title": "RNA integrity in post-mortem samples: influencing parameters and implications on RT-qPCR assays", "authors": [{"first": "A", "middle": [], "last": "Koppelkamm", "suffix": ""}, {"first": "B", "middle": [], "last": "Vennemann", "suffix": ""}, {"first": "S", "middle": [], "last": "Lutz-Bonengel", "suffix": ""}], "year": 2011, "venue": "Int J Legal Med", "volume": "125", "issn": "", "pages": "573--580", "other_ids": {"doi": ["10.1007/s00414-011-0578-1"]}, "links": "3038907"}, "BIBREF35": {"ref_id": "b35", "title": "Heart protein expression related to age and sex in mice and humans", "authors": [{"first": "M", "middle": [], "last": "Diedrich", "suffix": ""}, {"first": "J", "middle": [], "last": "Tadic", "suffix": ""}, {"first": "L", "middle": [], "last": "Mao", "suffix": ""}], "year": 2007, "venue": "Int J Mol Med", "volume": "20", "issn": "", "pages": "865--874", "other_ids": {}, "links": "24681175"}}}, "latex_parse": null}
diff --git a/s2orc-doc2json/tests/s2orc/20200705/18980380.json b/s2orc-doc2json/tests/s2orc/20200705/18980380.json
new file mode 100644
index 0000000000000000000000000000000000000000..36f80b132b3c67fbca00f051d17287d7acbc4106
--- /dev/null
+++ b/s2orc-doc2json/tests/s2orc/20200705/18980380.json
@@ -0,0 +1 @@
+{"paper_id": "18980380", "_pdf_hash": "ffe93b67a395cc51d6dc4c5f438a6bbc08a3f31a", "abstract": [{"section": "Abstract", "text": "This technical note studies Markov decision processes under parameter uncertainty. We adapt the distributionally robust optimization framework, assume that the uncertain parameters are random variables following an unknown distribution, and seek the strategy which maximizes the expected performance under the most adversarial distribution. In particular, we generalize a previous study [1] which concentrates on distribution sets with very special structure to a considerably more generic class of distribution sets, and show that the optimal strategy can be obtained efficiently under mild technical conditions. This significantly extends the applicability of distributionally robust MDPs by incorporating probabilistic information of uncertainty in a more flexible way.", "cite_spans": [], "ref_spans": []}, {"section": "Abstract", "text": "Index Terms-Distributional robustness, Markov decision processes, parameter uncertainty.", "cite_spans": [], "ref_spans": []}], "body_text": [{"section": "", "text": ". Illustration of the confidence sets.", "cite_spans": [], "ref_spans": []}, {"section": "", "text": "optimizing variable and \u03be is the unknown parameter, distributionally robust optimization solves max x\u2208X [inf \u03bc\u2208C E \u03be\u223c\u03bc u(x, \u03be)], where C is an a priori known set of distributions.", "cite_spans": [], "ref_spans": []}, {"section": "", "text": "We highlight our contributions by comparing with [1] . In [1] the state-wise ambiguity set is restricted to the following form:C s = {\u03bc s |\u03bc s (O i s ) \u2265 \u03b1 i s \u2200 i = 1, . . . , n s }, where \u03b1 i s \u2264 \u03b1 j s and O i s is a proper set of uncertain parameters with a \"nested-set\" structure, i.e., satisfying O i s \u2286 O j s , for all i < j [see Fig. 1(a) ]. 
This setup can effectively model distributions with a single mode (such as a Gaussian distribution), but less so when modeling multi-mode distributions such as a mixture Gaussian distribution. Moreover, other probabilistic information such as mean, variance etc. cannot be incorporated. Thus, in this technical note, we extend the distributionally robust MDP approach to handle ambiguity sets with more general structures. In particular, we consider a class of ambiguity sets, first proposed in [18] as a unifying framework for modeling and solving distributionally robust single-stage optimization problems, and embed them into the distributionally robust MDPs setup. These ambiguity sets are considerably more general: they are characterized by a class of O i s which can either be nested or disjoint [as shown in Fig. 1(b) ], and moreover, additional linear constraints are allowed to define the ambiguity set, which can be used to incorporate probabilistic information such as mean, covariance or other variation measures. We show that, under this more general class of ambiguity sets, the resulting distributionally robust MDPs remain tractable under mild technical conditions, and often outperform previous methods thanks to the fact that it can model uncertainty in a more flexible way.", "cite_spans": [{"start": 49, "end": 52, "text": "[1]", "ref_id": "BIBREF0"}, {"start": 58, "end": 61, "text": "[1]", "ref_id": "BIBREF0"}, {"start": 845, "end": 849, "text": "[18]", "ref_id": "BIBREF17"}], "ref_spans": [{"start": 337, "end": 346, "text": "Fig. 1(a)", "ref_id": "FIGREF2"}, {"start": 1166, "end": 1175, "text": "Fig. 1(b)", "ref_id": "FIGREF2"}]}, {"section": "II. PRELIMINARIES", "text": "Throughout the technical note, we use capital letters to denote matrices, and bold face letters to denote column vectors. We use e i (m) to denote the ith elementary vector of length m, and use R n + to denote the nonnegative orthant of R n . 
If C is the set of joint probability distributions of three random vectors a, b, and c, then (a,b) C denotes the set of marginal distributions of (a, b). We use \u2295 to represent mixture distribution: given two probability distributions F 1 , F 2 and a Bernoulli random variable x which takes value 1 w.p. p, xF 1 \u2295 (1 \u2212 x)F 2 is a random variable such that it follows distribution F 1 w.p. p, and follows F 2 w.p. 1 \u2212 p. We use N (m, \u03c3 2 ) to represent a Gaussian distribution with mean m and variance \u03c3 2 .", "cite_spans": [], "ref_spans": []}, {"section": "II. PRELIMINARIES", "text": "A (finite) Markov Decision Process (MDP) is defined as a 6-tuple T, \u03b3, S, A, p, r . Here, T is the (possibly infinite) decision horizon; 0018-9286 \u00a9 2015 IEEE. Personal use is permitted, but republication/redistribution requires IEEE permission.", "cite_spans": [], "ref_spans": []}, {"section": "II. PRELIMINARIES", "text": "See http://www.ieee.org/publications_standards/publications/rights/index.html for more information.", "cite_spans": [], "ref_spans": []}, {"section": "II. PRELIMINARIES", "text": "\u03b3 \u2208 (0, 1] is the discount factor; S is the state set and A s is the action set of state s \u2208 S, both assumed to be finite. The parameter p and r are the transition probability and the expected reward, respectively. That is, for s \u2208 S and a \u2208 A s , r(s, a) is the expected reward and p(s |s, a) is the probability that the next state is s . Following [2] , we denote the set of all history-dependent randomized strategies by \u03a0 HR . We use subscript s to denote the value associated with the state s: e.g., r s denotes the vector form of the rewards associated with the state s, and \u03c0 s is the (randomized) action chosen at state s for strategy \u03c0.", "cite_spans": [{"start": 350, "end": 353, "text": "[2]", "ref_id": "BIBREF1"}], "ref_spans": []}, {"section": "II. 
PRELIMINARIES", "text": "The elements in the vector p s are listed in the following way: the transition probabilities of the same action are arranged in the same block, and inside each block they are listed according to the order of the next state. We use s to denote the (random) state following s, and \u0394(s) to denote the probability simplex on A s . We use to represent Cartesian product, e.g., p = s\u2208S p s . For a given strategy \u03c0 \u2208 \u03a0 HR , we denote the expected (discounted) total-reward under parameters pair (p, r) as u(\u03c0, p, r)", "cite_spans": [], "ref_spans": []}, {"section": "II. PRELIMINARIES", "text": "A Distributionally Ambiguous MDP (DAMDP) is defined as a tuple T, \u03b3, S, A,C S , where the transition probability p and the expected reward r are unknown. Instead, they are assumed to obey a joint distribution \u03bc 0 (also unknown) that belongs to a known ambiguity set", "cite_spans": [], "ref_spans": []}, {"section": "II. PRELIMINARIES", "text": "While the DAMDP framework can be very general, mostC S result in formulations that are computationally intractable (e.g., [1] , [19] ). Hence, we make the following requirement ofC S such that the parameters among different states are independent.", "cite_spans": [{"start": 122, "end": 125, "text": "[1]", "ref_id": "BIBREF0"}, {"start": 128, "end": 132, "text": "[19]", "ref_id": "BIBREF18"}], "ref_spans": []}, {"section": "II. PRELIMINARIES", "text": "Assumption 1: The ambiguity setC S has the following property:", "cite_spans": [], "ref_spans": []}, {"section": "II. PRELIMINARIES", "text": "where \"state-wise ambiguity set\"C s is a set of distributions of parameters of state s. By the definition ofC S , the state-wise property applies to C S as well. This property is the same as the concept of \"s-rectangularity\" in [16] , and is essential for reducing DAMDP to robust MDP in Lemma 1. 
In addition, [20] showed that the robust MDP with coupled uncertainty sets is computationally challenging, which implies solving DAMDP with nonrectangular ambiguity sets is even harder.", "cite_spans": [{"start": 228, "end": 232, "text": "[16]", "ref_id": "BIBREF15"}, {"start": 310, "end": 314, "text": "[20]", "ref_id": "BIBREF19"}], "ref_spans": []}, {"section": "II. PRELIMINARIES", "text": "We now discuss the admissible state-wise ambiguity set. Our formulation of the state-wise ambiguity set follows the unifying framework of [18] . In specific, given s \u2208 S, the state-wise ambiguity set is representable with the following standard form:", "cite_spans": [{"start": 138, "end": 142, "text": "[18]", "ref_id": "BIBREF17"}], "ref_spans": []}, {"section": "II. PRELIMINARIES", "text": "are the lower and upper bounds of the probability that parameters belong to a confidence set. Thus, each confidence set O i s provides an estimation of the uncertain parameters pair (p s , r s ,\u0169 s ) subject to a different confidence level. Ambiguity setsC s contain prescribed conic representable confidence sets and mean values residing on an affine manifold, which is rich enough to encompass and extend several ambiguity sets considered in recent literature (e.g., [1] , [19] , [21] ). The set of joint distribution of (p s , r s ) is hence C s \u0394 = (ps ,rs)C s . Notice that a classical technique called \"lifting\" is used here: We introduce an auxiliary random vector\u0169, so that some non-linear relationship can be modeled linearly. For example, a constraint on the variance can be modeled using this standard form (see [22, Example 2] ), which is otherwise impossible without the auxiliary variable. This lifting technique thus allows us to model a rich variety of structural information about the marginal distribution of (p, r) in a unified manner. 
Note when the ambiguity set only contains the support of random variables, i.e.,", "cite_spans": [{"start": 469, "end": 472, "text": "[1]", "ref_id": "BIBREF0"}, {"start": 475, "end": 479, "text": "[19]", "ref_id": "BIBREF18"}, {"start": 482, "end": 486, "text": "[21]", "ref_id": "BIBREF21"}, {"start": 823, "end": 827, "text": "[22,", "ref_id": "BIBREF22"}], "ref_spans": []}, {"section": "II. PRELIMINARIES", "text": "where the a-priori information of unknown parameters is that they belong to an uncertainty set.", "cite_spans": [], "ref_spans": []}, {"section": "II. PRELIMINARIES", "text": "Assumptions 2 to 4 are standard requirements for the confidence sets, proposed in [18] . The first one asserts the relationship between different confidence sets.", "cite_spans": [{"start": 82, "end": 86, "text": "[18]", "ref_id": "BIBREF17"}], "ref_spans": []}, {"section": "II. PRELIMINARIES", "text": "The nesting condition is illustrated in Fig. 1(b) . Next, for any s \u2208 S we require thatC s satisfies the following regularity condition.", "cite_spans": [], "ref_spans": [{"start": 40, "end": 49, "text": "Fig. 1(b)", "ref_id": "FIGREF2"}]}, {"section": "Assumption 3 (Regularity Conditions forC s ):", "text": "1) The confidence set O ns s is bounded and has probability one, that is, ", "cite_spans": [], "ref_spans": []}, {"section": "Assumption 3 (Regularity Conditions forC s ):", "text": "s are proper cones (i.e., a closed, convex and pointed cone with nonempty interior).", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "This section focuses on DAMDP with a finite number of decision stages. We show that a strategy defined through backward induction, which we call S-robust strategy, is distributionally robust. We further show such a strategy is solvable in polynomial time under mild technical conditions. 
This generalizes results in [1] to a significantly more general class of ambiguity sets.", "cite_spans": [{"start": 316, "end": 319, "text": "[1]", "ref_id": "BIBREF0"}], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Similar to [10] , we assume that when a state is visited multiple times, each time it can take a different parameter realization (nonstationary model). This assumption is justified mainly because the stationary model is generally intractable and a lower-bound of it is given by the non-stationary model. Therefore, multiple visits to a state can be treated as visiting different states. By introducing dummy states as in [1, Assumption 2.2], for finite horizon DAMDP we make the following assumption without loss of generality. This will simplify our exposition.", "cite_spans": [{"start": 11, "end": 15, "text": "[10]", "ref_id": "BIBREF9"}], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Assumption 5: 1) Each state belongs to only one stage.", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "2) The terminal reward equals zero.", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "3) The first stage only contains one state s ini .", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Using the condition 1 of Assumption 5, we partition S according to the stage each state belongs to. That is, we let S t be the set of states belong to tth stage.", "cite_spans": [], "ref_spans": []}, {"section": "III. 
FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "For \u03c0 \u2208 \u03a0 HR and \u03bc \u2208 C S , we denote the expected performance of a DAMDP as w \u03c0, \u03bc, (s ini ) \u0394 = E (p,r)\u223c\u03bc {u(\u03c0, p, r)} = u(\u03c0, p, r)d\u03bc(p, r).", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "In words, each strategy is evaluated by its expected performance under the (respective) most adversarial distribution of the uncertain parameters, and a distributionally robust strategy is the optimal strategy according to this metric. The main focus of this section is deriving approaches to solve the distributionally robust strategy. To this end, we need the following definition.", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Definition 2: Given a DAMDP T, \u03b3, S, A,C S , we define the Srobust strategy as follows", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "3) A strategy\u03c0 * is a Srobust strategy if \u2200 s \u2208 S, and every history h that ends at s, we have\u03c0 * s , conditioned on history h, is a S-robust action.", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "The definition requires that the strategy must be robust w.r.t. each sub-problem, and hence the name \"S-robust.\" The following theorem shows any S-robust strategy \u03c0 * is distributionally robust, and is the main result of this technical note.", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Theorem 1: Let T < \u221e. Under Assumptions 1, 2, 4, and 5, if \u03c0 * is a S-robust strategy, then 1) \u03c0 * is a distributionally robust strategy with respect to C S . 2) There exists \u03bc * \u2208 C s such that (\u03c0 * , \u03bc * ) is a saddle point. 
That is", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Proof: We first state a Lemma from [1, Lemma 3.2] without proof.", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Lemma 1: Under Assumption 1, fix \u03c0 \u2208 \u03a0 HR and \u03bc \u2208 C S , denote p = E \u03bc (p) and r = E \u03bc (r). We have w(\u03c0, \u03bc, (s ini )) = u(\u03c0, p, r).", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Lemma 1 means for any strategy, the expected performance under an admissible distribution \u03bc only depends on the expected value of parameters under \u03bc. Thus, the distributionally robust MDPs reduce to robust MDPs. Next we characterize the set of expected value of the parameters.", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Lemma 2: For s \u2208 S and \u03c0 s \u2208 \u0394(s), we define the set Z s = {E \u03bcs (p s , r s )|\u03bc s \u2208 C s }. Then set Z s is convex and compact.", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Proof: First, we show that, for s \u2208 S and \u03c0 s \u2208 \u0394(s), the set defined asZ s = {E \u03bcs (p s , r s ,\u0169 s )|\u03bc s \u2208C s } is convex and compact. The convexity can be easily shown, which is omitted due to space constraints (see [22] for details). To show the compactness, notice thatC s is weakly closed (i.e., closed w.r.t. to the weak topology) since the feasible set of each of constraint is weakly closed which implies their intersection is also weakly closed. Thus,Z s is closed since it is the image ofC s under expectation (which is a continuous function). This impliesZ s is compact since O ns s is bounded and henceZ s is bounded. 
Finally, since Z s is the projection onto the first two coordinates of set Z s , its convexity and compactness thus follow.", "cite_spans": [{"start": 218, "end": 222, "text": "[22]", "ref_id": "BIBREF22"}], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Lemma 2 implies that, for s \u2208 S and \u03c0 s \u2208 \u0394(s), there exists (p * s , r * s ) \u2208 Z s that satisfies inf (ps,rs)\u2208Zs u(\u03c0 s , p s , r s ) = u(\u03c0 s , p * s , r * s ). Since saddle point of the minimax objective exists for robust MDPs (e.g., [10] , [11] ), we can complete the proof of part 2) following a similar procedure as the last portion of proof for [1, Theorem 3.1]. We omit the details due to space constraint (see [22] for details). Part 1) then follows part 2) immediately.", "cite_spans": [{"start": 235, "end": 239, "text": "[10]", "ref_id": "BIBREF9"}, {"start": 242, "end": 246, "text": "[11]", "ref_id": "BIBREF10"}, {"start": 417, "end": 421, "text": "[22]", "ref_id": "BIBREF22"}], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "We now investigate the computational aspect of finding the S-robust action.", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Theorem 2: Under Assumption 2, 3, 4, and 5, for s \u2208 S t where t < T , the S-robust action is the optimal solution of the following optimization problem (termed Srobust problem hereafter): Proof: The proof essentially follows from [18] and duality of convex optimization [23] , and can be found in the longer version [22] of this technical note.", "cite_spans": [{"start": 230, "end": 234, "text": "[18]", "ref_id": "BIBREF17"}, {"start": 270, "end": 274, "text": "[23]", "ref_id": "BIBREF23"}, {"start": 316, "end": 320, "text": "[22]", "ref_id": "BIBREF22"}], "ref_spans": []}, {"section": "III. 
FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "Thus, since for s \u2208 S t , \u0394(s) is compact, we can solve the S-robust action in polynomial time if all K i s are \"easy\" cones such as linear, conic quadratic or semidefinite cones. Moreover, using Theorem 1, by backward induction, we can obtain the S-robust strategy efficiently.", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "By virtue of the lifting technique [18, Theorem 5], we show below several widely used ambiguity sets are indeed special cases ofC s defined in (1) . We further derive their corresponding S-robust problems. See [22] for additional examples (variance and expected Huber loss function). ", "cite_spans": [{"start": 143, "end": 146, "text": "(1)", "ref_id": "BIBREF0"}, {"start": 210, "end": 214, "text": "[22]", "ref_id": "BIBREF22"}], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "This example can also be treated via \"classical\" robust optimization by virtue of Lemma 1.", "cite_spans": [], "ref_spans": []}, {"section": "III. FINITE HORIZON DISTRIBUTIONALLY ROBUST MDPS", "text": "The finite horizon DAMDP can be easily extended to discountedreward infinite horizon setup. We can generalize the notion of S-robust strategy, which turns to be distributionally robust in both stationary and non-stationary models. This extension is similar to [1] and can be found in [22] .", "cite_spans": [{"start": 260, "end": 263, "text": "[1]", "ref_id": "BIBREF0"}, {"start": 284, "end": 288, "text": "[22]", "ref_id": "BIBREF22"}], "ref_spans": []}, {"section": "IV. SIMULATION", "text": "In this section, we study two synthetic numerical examples: a machine replacement problem and a path planning problem. In the machine replacement problem, the reward parameters are uncertain; whereas in the path planning problem, the transition probabilities are uncertain. 
All results were generated on desktop with Intel Core i5-3570 CPU of 3.40 GHz clock speed and 8 GB RAM. The S-robust problems are solved in Matlab using the CVX package [24] .", "cite_spans": [{"start": 443, "end": 447, "text": "[24]", "ref_id": "BIBREF24"}], "ref_spans": []}, {"section": "A. Reward Uncertainty in the Machine Replacement Problem", "text": "We consider a machine replacement problem similar to the one in [12] . Consider the repair cost incurred by a factory that holds a large number of machines, given that each of these machines is modeled with a same underlying MDP for which rewards are subject to uncertainty.", "cite_spans": [{"start": 64, "end": 68, "text": "[12]", "ref_id": "BIBREF11"}], "ref_spans": []}, {"section": "1) Machine Replacement as a MDP With Gaussian Rewards:", "text": "We first consider a machine replacement problem with 50 states, 2 actions (\"repair\" and \"not repair\") for each state, deterministic transitions, a discount factor of 0.8, and uncertain rewards following Gaussian distributions independently [see Fig. 2(a) ]: For the first 48 states, the \"repair\" action has a cost N (130, 1) . The 49th and 50th states of the machine's life are designed to be risky: not repairing at state 50 incurs a highly uncertain cost N (100, 800), while repairing at both states is a more secure but still uncertain option with a cost N (130, 10) . The detailed implementation is as follows: We use the mean value of uncertain rewards to compute the nominal strategy. For both robust and distributionally robust strategy, we construct confidence sets usin\u011d m \u00b1 3\u03c3 for the first 49 states, andm \u00b1 4\u03c3 for state 50 wherem and\u03c3 2 are mean and variance estimated from samples (see [22] for details), as it is more risky and thus hard to estimate. In addition, we construct an extra confidence set (centered at the mean) with 60%-70% confidence level (i.e., \u03b1 1 50 = 0.6, \u03b1 1 50 = 0.7) for distributionally robust strategy. 
The optimal paths followed by three strategies are shown in Fig. 2(a) .", "cite_spans": [{"start": 899, "end": 903, "text": "[22]", "ref_id": "BIBREF22"}], "ref_spans": [{"start": 245, "end": 254, "text": "Fig. 2(a)", "ref_id": "FIGREF3"}, {"start": 309, "end": 324, "text": "cost N (130, 1)", "ref_id": "FIGREF1"}, {"start": 553, "end": 569, "text": "cost N (130, 10)", "ref_id": "FIGREF1"}, {"start": 1201, "end": 1210, "text": "Fig. 2(a)", "ref_id": "FIGREF3"}]}, {"section": "1) Machine Replacement as a MDP With Gaussian Rewards:", "text": "The performance of the strategies obtained by using the nominal, the robust and the distributionally robust approaches is presented in Fig. 3 . The corresponding average total discounted rewards and computational times are shown in Table I . The nominal strategy results in the highest average total discounted rewards. This is well expected as we are using the exact mean value of the reward as the nominal Fig. 2 . Two instances of a machine replacement problem. Fig. 2(a) shows Gaussian uncertainty in the rewards, while Fig. 2(b) shows mixed Gaussian uncertainty in the rewards. parameter. However, the nominal strategy is highly risky: it cannot prevent bad performance (e.g., \u22120.025) from happening, which is undesirable. While the nominal strategy, blind to any form of risk, finds no advantage in ever repairing, the robust strategy ends up following a highly conservative policy (repairing the machine at state 49 to avoid state 50). In contrast, the distributionally robust optimal strategy makes use of more distributional information and handles the risk efficiently by waiting until state 50 and then repair the machine. Therefore, this strategy beats the nominal and robust strategies in that it strikes a good tradeoff between high mean reward and low variance over 10,000 different trials. These results coincide with what one would typically expect from the three solution concepts. Fig. 4 . 
Illustration of the confidence sets for two distributionally robust strategies.", "cite_spans": [], "ref_spans": [{"start": 135, "end": 141, "text": "Fig. 3", "ref_id": "FIGREF1"}, {"start": 232, "end": 239, "text": "Table I", "ref_id": "TABREF0"}, {"start": 408, "end": 414, "text": "Fig. 2", "ref_id": "FIGREF3"}, {"start": 465, "end": 474, "text": "Fig. 2(a)", "ref_id": "FIGREF3"}, {"start": 524, "end": 533, "text": "Fig. 2(b)", "ref_id": "FIGREF3"}, {"start": 1400, "end": 1406, "text": "Fig. 4", "ref_id": "FIGREF0"}]}, {"section": "2) Machine Replacement as a MDP With Mixed Gaussian Rewards:", "text": "The second experiment has a similar setup as the previous one, except that not repairing at the 50th state has a reward which follows a mixed Gaussian distribution [see Fig. 2(b) ]. This experiment illustrates the effect of the two different nested-set structures shown in Fig. 1 . In specific, we apply the two different distributionally robust approaches (proposed in [1] and this technical note respectively), and show that our method outperforms. The detailed implementation is as follows: For the robust and two distributionally robust strategies, we construct uncertainty set corresponding to 99% probability support of the rewards for the first 49 states, and 99.9% for the 50th state that is more risky, using estimated mean and variance (see [22] for details). For the first distributionally robust strategy proposed in [1] , we construct two additional nested confidence sets O 1 50 and O 2 50 [see Fig. 4(a) ], which w.p. 40%-50% and 60%-70% respectively the uncertain rewards belong to. In contrast, for the second distributionally robust strategy proposed in this technical note, we construct two disjoint confidence sets O 1 50 and O 2 50 [see Fig. 4 (b)] with 70%-80% and 0%-10% confidence level, respectively. 
Specifically, we select these two intervals around the peaks of the two Gaussian elements [i.e., N (100, 10) and N (140, 2)] to better model this mixed distribution. The optimal paths followed for the three strategies are shown in Fig. 2(b) .", "cite_spans": [{"start": 370, "end": 373, "text": "[1]", "ref_id": "BIBREF0"}, {"start": 751, "end": 755, "text": "[22]", "ref_id": "BIBREF22"}, {"start": 829, "end": 832, "text": "[1]", "ref_id": "BIBREF0"}], "ref_spans": [{"start": 169, "end": 178, "text": "Fig. 2(b)", "ref_id": "FIGREF3"}, {"start": 273, "end": 279, "text": "Fig. 1", "ref_id": "FIGREF2"}, {"start": 909, "end": 918, "text": "Fig. 4(a)", "ref_id": "FIGREF0"}, {"start": 1158, "end": 1164, "text": "Fig. 4", "ref_id": "FIGREF0"}, {"start": 1457, "end": 1466, "text": "Fig. 2(b)", "ref_id": "FIGREF3"}]}, {"section": "2) Machine Replacement as a MDP With Mixed Gaussian Rewards:", "text": "The performance of the three strategies obtained is presented in Fig. 5 . The corresponding average total discounted rewards and computational times are shown in Table II . As expected, the robust strategy ends up following a highly conservative policy repairing the machine at state 49 to avoid state 50. The first distributionally robust strategy, not modeling the mixture Gaussian distribution well, finds it advantageous to repair at the 50th state. In contrast, capable of capturing the distribution information in a more flexible way, the second distributionally robust strategy better models the uncertainty and finds not repairing the machine at state 50 is optimal. The performance comparison clearly shows the second distributionally robust strategy is more desirable, which highlights the distributionally robust approach with general structure of confidence sets can be beneficial in practice.", "cite_spans": [], "ref_spans": [{"start": 65, "end": 71, "text": "Fig. 
5", "ref_id": "FIGREF5"}, {"start": 162, "end": 170, "text": "Table II", "ref_id": "TABREF0"}]}, {"section": "2) Machine Replacement as a MDP With Mixed Gaussian Rewards:", "text": "We remark that, in practice, one can obtain the modality structure of uncertain parameters in a data-driven way by applying clustering algorithms to an initial primitive data set. For example, one may check the histogram of historical observations. If the data concentrates on several distinct and disjoint bins, our multi-model DAMDP approach can be applied. Moreover, we note that networked control systems (NCSs) have recently emerged as a topic of significant interest in the control community. A typical application of NCSs is in modern [25] and [26] proposed a novel two-layer structure to solve the setpoints compensation problem for industrial processes under network-based environment.", "cite_spans": [{"start": 542, "end": 546, "text": "[25]", "ref_id": "BIBREF25"}, {"start": 551, "end": 555, "text": "[26]", "ref_id": "BIBREF26"}], "ref_spans": []}, {"section": "B. Transition Uncertainty in the Path Planning Problem", "text": "We now consider a path planning problem, similar to the one presented in [1] : an agent wants to exit a 4 \u00d7 21 maze [shown in Fig. 6(a) ] using the least possible time. Starting from the upper-left corner, the agent can move up, down, left and right, but can only exit the grid at the lower-right corner. Here, a white box stands for a normal place where the agent needs one time unit to pass through. A shaded box represents a \"shaky\" place: if an agent reaches a \"shaky\" place, then he may risk jumping to the starting point (\"reboot\"). The true transition probability of the jump follows a distribution", "cite_spans": [{"start": 73, "end": 76, "text": "[1]", "ref_id": "BIBREF0"}], "ref_spans": [{"start": 126, "end": 135, "text": "Fig. 6(a)", "ref_id": "FIGREF6"}]}, {"section": "B. 
Transition Uncertainty in the Path Planning Problem", "text": "The four approaches are implemented as follows: The nominal approach neglects this random jump. The robust approach takes a worst-case analysis, i.e., it assumes that with 30%, the whole probability support of transition, the agent will jump to the spot with the highest costto-go. The first distributionally robust approach takes into account an additional information by using two nested confidence sets: the jump probability parameter belonging to 9%-11% is of a confidence 1 \u2212 \u03bb. The second distributionally robust approach, which is proposed in this technical note, incorporates more information. In specific, we construct an extra confidence interval disjoint with the above 9%-11% interval. It states that the chance of jumping with probability 20% is \u03bb.", "cite_spans": [], "ref_spans": []}, {"section": "B. Transition Uncertainty in the Path Planning Problem", "text": "The performance of strategies of the nominal, the robust and the two distributionally robust approaches is shown in Fig. 6(b) , where the error bars show the standard error of the expected time to exit. The CPU times of computing optimal policies for four strategies are 0.461, 549, 642, and 654 seconds, respectively. The second distributionally robust approach achieves the best performance over virtually the whole spectrum of \u03bb. This is well expected, since additional probabilistic Fig. 6(a) illustrates the maze for the path plawnning problem. Fig. 6(b) shows the performance comparisons between nominal, robust and two distributionally robust strategies over 3,000 runs of the path planning problem.", "cite_spans": [], "ref_spans": [{"start": 116, "end": 125, "text": "Fig. 6(b)", "ref_id": "FIGREF6"}, {"start": 487, "end": 496, "text": "Fig. 6(a)", "ref_id": "FIGREF6"}, {"start": 550, "end": 559, "text": "Fig. 6(b)", "ref_id": "FIGREF6"}]}, {"section": "B. 
Transition Uncertainty in the Path Planning Problem", "text": "information is available to and incorporated by the second distributionally robust approach which considers ambiguity sets with more general structures.", "cite_spans": [], "ref_spans": []}, {"section": "V. CONCLUSION", "text": "In this technical note, we considered Markov decision problems with uncertainty. Specifically, we generalized the distributionally robust approach proposed in [1] to incorporate more general ambiguity sets proposed in [18] to model a-priori probabilistic information of the uncertain parameters. We proposed a way to compute the distributionally robust strategy through a Bellman type backward induction. We showed that the strategy, which achieves maximum expected utility under the worst admissible distributions of uncertain parameters, can be solved in polynomial time under some mild technical conditions. We believe that many important problems that are usually addressed using standard MDP models could be revisited and better resolved using the proposed models when parameter uncertainty exists, as this formulation naturally enables the decision maker to account for more general parameter uncertainty.", "cite_spans": [{"start": 159, "end": 162, "text": "[1]", "ref_id": "BIBREF0"}, {"start": 218, "end": 222, "text": "[18]", "ref_id": "BIBREF17"}], "ref_spans": []}], "bib_entries": {"BIBREF0": {"title": "Distributionally robust Markov decision processes", "authors": [{"first": "H", "middle": [], "last": "Xu", "suffix": ""}, {"first": "S", "middle": [], "last": "Mannor", "suffix": ""}], "year": 2012, "venue": "Math. Oper. 
Res", "link": "7229756"}, "BIBREF1": {"title": "Markov Decision Processes: Discrete Stochastic Dynamic Programming", "authors": [{"first": "M", "middle": ["L"], "last": "Puterman", "suffix": ""}], "year": 2014, "venue": "", "link": "57464058"}, "BIBREF2": {"title": "Neuro-dynamic programming (optimization and neural computation series, 3)", "authors": [{"first": "D", "middle": ["P"], "last": "Bertsekas", "suffix": ""}, {"first": "J", "middle": ["N"], "last": "Tsitsiklis", "suffix": ""}], "year": 1996, "venue": "Athena Scientific", "link": null}, "BIBREF3": {"title": "Reinforcement Learning: An Introduction", "authors": [{"first": "A", "middle": ["G"], "last": "Barto", "suffix": ""}], "year": 1998, "venue": "", "link": "9166388"}, "BIBREF4": {"title": "Bias and variance approximation in value function estimates", "authors": [{"first": "S", "middle": [], "last": "Mannor", "suffix": ""}, {"first": "D", "middle": [], "last": "Simester", "suffix": ""}, {"first": "P", "middle": [], "last": "Sun", "suffix": ""}, {"first": "J", "middle": ["N"], "last": "Tsitsiklis", "suffix": ""}], "year": 2007, "venue": "Manag. Sci", "link": "10603007"}, "BIBREF5": {"title": "Convex programming with set-inclusive constraints and applications to inexact linear programming", "authors": [{"first": "A", "middle": ["L"], "last": "Soyster", "suffix": ""}], "year": 1973, "venue": "Oper. Res", "link": null}, "BIBREF6": {"title": "Robust solutions of uncertain linear programs", "authors": [{"first": "A", "middle": [], "last": "Ben-Tal", "suffix": ""}, {"first": "A", "middle": [], "last": "Nemirovski", "suffix": ""}], "year": 1999, "venue": "Oper. Res. Lett", "link": "2474018"}, "BIBREF7": {"title": "The price of robustness", "authors": [{"first": "D", "middle": ["P"], "last": "Bertsimas", "suffix": ""}, {"first": "M", "middle": [], "last": "Sim", "suffix": ""}], "year": 2004, "venue": "Oper. 
Res", "link": "8946639"}, "BIBREF8": {"title": "Robust Optimization", "authors": [{"first": "A", "middle": [], "last": "Ben-Tal", "suffix": ""}, {"first": "L", "middle": ["El"], "last": "Ghaoui", "suffix": ""}, {"first": "A", "middle": [], "last": "Nemirovski", "suffix": ""}], "year": 2009, "venue": "", "link": null}, "BIBREF9": {"title": "Robust control of Markov decision processes with uncertain transition matrices", "authors": [{"first": "A", "middle": [], "last": "Nilim", "suffix": ""}, {"first": "L", "middle": ["El"], "last": "Ghaoui", "suffix": ""}], "year": 2005, "venue": "Oper. Res", "link": "1537485"}, "BIBREF10": {"title": "Robust dynamic programming", "authors": [{"first": "G", "middle": ["N"], "last": "Iyengar", "suffix": ""}], "year": 2005, "venue": "Math. Oper. Res", "link": "710328"}, "BIBREF11": {"title": "Percentile optimization for Markov decision processes with parameter uncertainty", "authors": [{"first": "E", "middle": [], "last": "Delage", "suffix": ""}, {"first": "S", "middle": [], "last": "Mannor", "suffix": ""}], "year": 2010, "venue": "Oper. Res", "link": "10308849"}, "BIBREF12": {"title": "Markov decision processes with imprecise transition probabilities", "authors": [{"first": "C", "middle": ["C"], "last": "White", "suffix": ""}, {"first": "H", "middle": ["K"], "last": "Eldeib", "suffix": ""}], "year": 1994, "venue": "Oper. Res", "link": "207242061"}, "BIBREF13": {"title": "Solving uncertain Markov decision problems", "authors": [{"first": "A", "middle": [], "last": "Bagnell", "suffix": ""}, {"first": "A", "middle": ["Y"], "last": "Ng", "suffix": ""}, {"first": "J", "middle": [], "last": "Schneider", "suffix": ""}], "year": 2001, "venue": "", "link": "59762877"}, "BIBREF14": {"title": "Learning under ambiguity", "authors": [{"first": "L", "middle": ["G"], "last": "Epstein", "suffix": ""}, {"first": "M", "middle": [], "last": "Schneider", "suffix": ""}], "year": 2007, "venue": "Rev. Econ. 
Studies", "link": "15546892"}, "BIBREF15": {"title": "Robust Markov decision processes", "authors": [{"first": "W", "middle": [], "last": "Wiesemann", "suffix": ""}, {"first": "D", "middle": [], "last": "Kuhn", "suffix": ""}, {"first": "B", "middle": [], "last": "Rustem", "suffix": ""}], "year": 2013, "venue": "Math. Oper. Res", "link": "6103434"}, "BIBREF16": {"title": "The robustness-performance tradeoff in Markov decision processes", "authors": [{"first": "H", "middle": [], "last": "Xu", "suffix": ""}, {"first": "S", "middle": [], "last": "Mannor", "suffix": ""}, {"first": "B", "middle": [], "last": "Sch\u00f6lkopf", "suffix": ""}, {"first": "J", "middle": [], "last": "Platt", "suffix": ""}, {"first": "T", "middle": [], "last": "Hofmann", "suffix": ""}], "year": 2006, "venue": "Proc. NIPS", "link": "63859912"}, "BIBREF17": {"title": "Distributionally robust convex optimization", "authors": [{"first": "W", "middle": [], "last": "Wiesemann", "suffix": ""}, {"first": "D", "middle": [], "last": "Kuhn", "suffix": ""}, {"first": "M", "middle": [], "last": "Sim", "suffix": ""}], "year": 2014, "venue": "Oper. Res", "link": "16625241"}, "BIBREF18": {"title": "Distributionally robust optimization under moment uncertainty with application to data-driven problems", "authors": [{"first": "E", "middle": [], "last": "Delage", "suffix": ""}, {"first": "Y", "middle": [], "last": "Ye", "suffix": ""}], "year": 2010, "venue": "Oper. Res", "link": null}, "BIBREF19": {"title": "Lightning does not strike twice: Robust MDPs with coupled uncertainty", "authors": [{"first": "S", "middle": [], "last": "Mannor", "suffix": ""}, {"first": "O", "middle": [], "last": "Mebel", "suffix": ""}, {"first": "H", "middle": [], "last": "Xu", "suffix": ""}], "year": 2012, "venue": "Proc. 29th Int. Conf. 
Machine Learning (ICML'12)", "link": "486400"}, "BIBREF20": {"title": "Available", "authors": [], "year": "", "venue": "", "link": null}, "BIBREF21": {"title": "Distributionally robust joint chance constraints with second-order moment information", "authors": [{"first": "S", "middle": [], "last": "Zymler", "suffix": ""}, {"first": "D", "middle": [], "last": "Kuhn", "suffix": ""}, {"first": "B", "middle": [], "last": "Rustem", "suffix": ""}], "year": 2013, "venue": "Math. Programm", "link": "11547182"}, "BIBREF22": {"title": "Distributionally robust counterpart in Markov decision processes", "authors": [{"first": "P", "middle": [], "last": "Yu", "suffix": ""}, {"first": "H", "middle": [], "last": "Xu", "suffix": ""}], "year": 2015, "venue": "", "link": "18980380"}, "BIBREF23": {"title": "Convex Optimization", "authors": [{"first": "S", "middle": [], "last": "Boyd", "suffix": ""}, {"first": "L", "middle": [], "last": "Vandenberghe", "suffix": ""}], "year": 2004, "venue": "", "link": "37925315"}, "BIBREF24": {"title": "CVX: Matlab Software for Disciplined Convex Programming, Version 2.1", "authors": [{"first": "M", "middle": [], "last": "Grant", "suffix": ""}, {"first": "S", "middle": [], "last": "Boyd", "suffix": ""}], "year": 2014, "venue": "", "link": null}, "BIBREF25": {"title": "A combined adaptive neural network and nonlinear model predictive control for multirate networked industrial process control", "authors": [{"first": "T", "middle": [], "last": "Wang", "suffix": ""}, {"first": "H", "middle": [], "last": "Gao", "suffix": ""}, {"first": "J", "middle": [], "last": "Qiu", "suffix": ""}], "year": 2016, "venue": "IEEE Trans. Neural Netw. Learn. 
Syst", "link": "18576331"}, "BIBREF26": {"title": "Networked multirate output feedback control for setpoints compensation and its application to rougher flotation process", "authors": [{"first": "F", "middle": [], "last": "Liu", "suffix": ""}, {"first": "H", "middle": [], "last": "Gao", "suffix": ""}, {"first": "J", "middle": [], "last": "Qiu", "suffix": ""}, {"first": "S", "middle": [], "last": "Yin", "suffix": ""}, {"first": "J", "middle": [], "last": "Fan", "suffix": ""}, {"first": "T", "middle": [], "last": "Chai", "suffix": ""}], "year": 2014, "venue": "IEEE Trans. Ind. Electron", "link": "24341930"}}, "ref_entries": {"FIGREF0": {"text": "The condition 1 of Assumption 3 ensures the confidence set with largest index, O ns s , contains the support of the joint unknown parameters pair (p s , r s ,\u0169 s ). The second condition stipulates that there is a probability distribution \u03bc s (p s , r s ,\u0169 s ) \u2208C s that satisfies the probability bounds in (1) as strict inequalities whenever the corresponding probability interval [\u03b1 i s , \u03b1 i s ] is non-degenerate. For each individual O i s , we make the following assumption to ensure tractability.Assumption For s \u2208 S, i \u2208 I s , each nonempty and convex confidence set O i s is defined as", "type": "figure"}, "FIGREF1": {"text": "Here, K i s * represents the cone dual to K i s ; set A(i) \u0394 = {i} \u222a {i \u2208 I s : O i s O i s };\u1e7d t+1 is the vector form of\u1e7d t+1 (s ) for all s \u2208 S t+1 ; and V s \u0394 = [e 1 (|A s |)\u1e7d t+1 , . . . , e |As| (|A s |)\u1e7d t+1 ] .", "type": "figure"}, "FIGREF2": {"text": "Mean Absolute Deviation): Assume that E rs\u223c\u03bcs (rs) [|r s \u2212 m|] \u2264 f for m, f \u2208 R |As| . 
[18] shows thatC s , which involves the auxiliary random vector\u0169 s \u2208 R |As| , can be expressed asC s = {\u03bc s (r s ,\u0169 s )|E\u0169 s \u223c\u03bcs [\u0169 s ] = f , \u03bc s (\u0169 s \u2265 r s \u2212 m,\u0169 s \u2265 m \u2212 r s ) = 1}. Note that \u03bc s (r s ) \u2208 rsC s . In this case Problem (3) can be rewritten as minimize w,\u03c0s ,\u03ba,\u03bd w subject to \u03ba \u2212 f \u03bd \u2264 w \u03ba + p s\u1e7cs \u03c0 s + m \u03c0 s \u2265 0 \u03c0 s \u2208 \u0394(s), \u03bd \u2265 0.", "type": "figure"}, "FIGREF3": {"text": "Mean): Assume that we only know a noisy empirical estimator of the exact mean of p s . That is, given G \u2208 R M \u00d7(|As|\u00d7|s|) , f \u2208 R M and p s \u223c \u03bc s (p s ), GE ps\u223c\u03bcs(ps) [p s ] K f , where K is a proper cone. [18] shows thatC s , which involves the auxiliary random vector\u0169 s \u2208 R M , can be expressed asC s = {\u03bc s (p s , u s )|E\u0169 s \u223c\u03bcs [\u0169 s ] = f , \u03bc s (Gp s K\u0169s ) = 1}. Note that \u03bc s (p s ) \u2208 psC s . Problem (3) now takes the form minimize w,\u03c0s ,\u03ba,\u03bd", "type": "figure"}, "FIGREF4": {"text": "Performance comparisons between nominal, robust, and distributionally robust strategies on 10,000 runs of the machine replacement problem with Gaussian rewards (The bottom figure focuses on the interval [ \u22120.0045, \u22120.001]).", "type": "figure"}, "FIGREF5": {"text": "Performance comparisons between robust and two distributionally robust strategies on 10,000 runs of the machine replacement problem with mixed Gaussian rewards.", "type": "figure"}, "FIGREF6": {"text": "Fig. 6. Fig. 6(a) illustrates the maze for the path plawnning problem. Fig. 
6(b) shows the performance comparisons between nominal, robust and two distributionally robust strategies over 3,000 runs of the path planning problem.", "type": "figure"}, "TABREF0": {"text": "TOTAL DISCOUNTED REWARDS AND COMPUTATIONAL TIMES OF NOMINAL, ROBUST, AND DISTRIBUTIONALLY ROBUST STRATEGIES IN MACHINE REPLACEMENT PROBLEM WITH GAUSSIAN REWARDS", "type": "table"}, "TABREF1": {"text": "TOTAL DISCOUNTED REWARDS AND COMPUTATIONAL TIMES OF ROBUST AND TWO DISTRIBUTIONALLY ROBUST STRATEGIES IN MACHINE REPLACEMENT PROBLEM WITH MIXED GAUSSIAN REWARDS industrial systems, in which the components are often connected over network media. Our multi-model DAMDP approach might be extended for network-based performance tracking control of complex industrial processes, where recent work", "type": "table"}}}
diff --git a/s2orc-doc2json/tests/s2orc/20200705/18980463.json b/s2orc-doc2json/tests/s2orc/20200705/18980463.json
new file mode 100644
index 0000000000000000000000000000000000000000..0fb1536e30167e4163043968fa303e8aa36d7d47
--- /dev/null
+++ b/s2orc-doc2json/tests/s2orc/20200705/18980463.json
@@ -0,0 +1 @@
+{"paper_id": "18980463", "_pdf_hash": "ac3a8d3545ca7c673ddbfbf2f503e40f6317eb6f", "abstract": [{"section": "Abstract", "text": "Although development of the adult Drosophila compound eye is very well understood, little is known about development of photoreceptors (PRs) in the simple larval eye. We show here that the larval eye is composed of 12 PRs, four of which express blue-sensitive rhodopsin5 (rh5) while the other eight contain green-sensitive rh6. This is similar to the 30:70 ratio of adult blue and green R8 cells. However, the stochastic choice of adult color PRs and the bistable loop of the warts and melted tumor suppressor genes that unambiguously specify rh5 and rh6 in R8 PRs are not involved in specification of larval PRs. Instead, primary PR precursors signal via EGFR to surrounding tissue to develop as secondary precursors, which will become Rh6-expressing PRs. EGFR signaling is required for the survival of the Rh6 subtype. Primary precursors give rise to the Rh5 subtype. Furthermore, the combinatorial action of the transcription factors Spalt, Seven-up, and Orthodenticle specifies the two PR subtypes. Therefore, even though the larval PRs and adult R8 PRs express the same rhodopsins (rh5 and rh6), they use very distinct mechanisms for their specification.", "cite_spans": [], "ref_spans": []}, {"section": "Abstract", "text": "[Keywords: Drosophila; visual system development; photoreceptor specification; transcription factor interaction; EGFR signaling] Supplemental material is available at http://www.genesdev.org. In spite of the morphological and developmental differences between vertebrate and invertebrate eyes, their basic function to translate light information from the environment to the brain is maintained. In Drosophila, the adult compound eye has been studied in great detail. It consists of \u223c800 individual ommatidia. Each ommatidium contains eight photoreceptor cells (PRs): six outer PRs (R1-R6) and two inner PRs (R7 and R8). 
Different PRs are sensitive to different wavelengths of light, depending on the rhodopsin gene (rh) they express. Outer PRs are involved in motion detection and contain Rh1, a broad-spectrum photopigment. R7 and R8 each expresses a distinct rh with restricted absorption spectra-rh3, rh4, rh5, and rh6. The type of rh expressed in inner PRs defines two major types of ommatidia: The pale (p) ommatidia have R7 that contain UV-sensitive Rh3 with the corresponding R8 expressing blue Rh5, whereas in yellow (y) ommatidia, R7 expresses UV-sensitive Rh4 and R8 expresses green Rh6.", "cite_spans": [], "ref_spans": []}, {"section": "Abstract", "text": "Recently, substantial progress has been achieved in understanding the molecular basis of how different subtypes of PRs are specified (Wernet and Desplan 2004; Mikeladze-Dvali et al. 2005a ). Initially, R7 and R8 express the transcription factor spalt (sal) that is required to specify them as inner PRs and distinguish them from outer PR identity (Mollereau et al. 2001) . Then, the expression in R7 of the gene prospero (pros), which encodes a homeodomain transcription factor, further distinguishes R7 from R8 by repressing R8 rhs, rh5, and rh6 .", "cite_spans": [{"start": 133, "end": 158, "text": "(Wernet and Desplan 2004;", "ref_id": "BIBREF55"}, {"start": 159, "end": 187, "text": "Mikeladze-Dvali et al. 2005a", "ref_id": "BIBREF32"}, {"start": 347, "end": 370, "text": "(Mollereau et al. 2001)", "ref_id": "BIBREF36"}], "ref_spans": []}, {"section": "Abstract", "text": "The generation of the two types of ommatidia, yellow and pale, includes several steps. First, the stochastic expression of the transcription factor Spineless (Ss) in a subset of R7 cells specifies yellow ommatidia. Ss is required cell autonomously in yR7 for rh4 expression and, further, cell nonautonomously for the underlying R8 cell to acquire y fate and turn on rh6 expression (Wernet et al. 2006) . 
The coordination between R7 and R8 rhodopsins requires a signal from pR7 that induces the pR8 fate. In sevenless mutants that lack R7, rh5 expression is lost while rh6 is expanded to almost all R8 (Papatsenko et al. 1997; Chou et al. 1999) . The y versus p choice in R8 is then reinforced by a bistable loop of regulation between the tumor suppressor gene warts (wts) and the growth regulator melted (melt) (Mikeladze-Dvali et al. 2005b) : wts is required for rh6 expression, whereas melt is essential for rh5 expression. wts and melt repress each other transcriptionally, thereby ensuring that a robust decision to express either rh5 or rh6 is made. The ho- ", "cite_spans": [{"start": 381, "end": 401, "text": "(Wernet et al. 2006)", "ref_id": "BIBREF56"}, {"start": 601, "end": 625, "text": "(Papatsenko et al. 1997;", "ref_id": "BIBREF42"}, {"start": 626, "end": 643, "text": "Chou et al. 1999)", "ref_id": "BIBREF6"}, {"start": 811, "end": 841, "text": "(Mikeladze-Dvali et al. 2005b)", "ref_id": "BIBREF33"}], "ref_spans": []}], "body_text": [{"section": "", "text": "In spite of the morphological and developmental differences between vertebrate and invertebrate eyes, their basic function to translate light information from the environment to the brain is maintained. In Drosophila, the adult compound eye has been studied in great detail. It consists of \u223c800 individual ommatidia. Each ommatidium contains eight photoreceptor cells (PRs): six outer PRs (R1-R6) and two inner PRs (R7 and R8) . Different PRs are sensitive to different wavelengths of light, depending on the rhodopsin gene (rh) they express. Outer PRs are involved in motion detection and contain Rh1, a broad-spectrum photopigment. R7 and R8 each expresses a distinct rh with restricted absorption spectra-rh3, rh4, rh5, and rh6. 
The type of rh expressed in inner PRs defines two major types of ommatidia: The pale (p) ommatidia have R7 that contain UV-sensitive Rh3 with the corresponding R8 expressing blue Rh5, whereas in yellow (y) ommatidia, R7 expresses UV-sensitive Rh4 and R8 expresses green Rh6.", "cite_spans": [], "ref_spans": []}, {"section": "", "text": "Recently, substantial progress has been achieved in understanding the molecular basis of how different subtypes of PRs are specified (Wernet and Desplan 2004; Mikeladze-Dvali et al. 2005a ). Initially, R7 and R8 express the transcription factor spalt (sal) that is required to specify them as inner PRs and distinguish them from outer PR identity (Mollereau et al. 2001) . Then, the expression in R7 of the gene prospero (pros), which encodes a homeodomain transcription factor, further distinguishes R7 from R8 by repressing R8 rhs, rh5, and rh6 .", "cite_spans": [{"start": 133, "end": 158, "text": "(Wernet and Desplan 2004;", "ref_id": "BIBREF55"}, {"start": 159, "end": 187, "text": "Mikeladze-Dvali et al. 2005a", "ref_id": "BIBREF32"}, {"start": 347, "end": 370, "text": "(Mollereau et al. 2001)", "ref_id": "BIBREF36"}], "ref_spans": []}, {"section": "", "text": "The generation of the two types of ommatidia, yellow and pale, includes several steps. First, the stochastic expression of the transcription factor Spineless (Ss) in a subset of R7 cells specifies yellow ommatidia. Ss is required cell autonomously in yR7 for rh4 expression and, further, cell nonautonomously for the underlying R8 cell to acquire y fate and turn on rh6 expression (Wernet et al. 2006) . The coordination between R7 and R8 rhodopsins requires a signal from pR7 that induces the pR8 fate. In sevenless mutants that lack R7, rh5 expression is lost while rh6 is expanded to almost all R8 (Papatsenko et al. 1997; Chou et al. 1999) . 
The y versus p choice in R8 is then reinforced by a bistable loop of regulation between the tumor suppressor gene warts (wts) and the growth regulator melted (melt) (Mikeladze-Dvali et al. 2005b) : wts is required for rh6 expression, whereas melt is essential for rh5 expression. wts and melt repress each other transcriptionally, thereby ensuring that a robust decision to express either rh5 or rh6 is made. The ho-meoprotein encoded by orthodenticle (otd) is required in both p R7 and R8 to activate rh3 and rh5 through direct binding to their promoter. Otd has only a permissive role: It is expressed in all PRs and its overexpression is not able to induce rh3 or rh5 expression in y ommatidia. Furthermore, the y ommatidial fate does not expand in otd mutants (Tahayato et al. 2003) .", "cite_spans": [{"start": 381, "end": 401, "text": "(Wernet et al. 2006)", "ref_id": "BIBREF56"}, {"start": 601, "end": 625, "text": "(Papatsenko et al. 1997;", "ref_id": "BIBREF42"}, {"start": 626, "end": 643, "text": "Chou et al. 1999)", "ref_id": "BIBREF6"}, {"start": 811, "end": 841, "text": "(Mikeladze-Dvali et al. 2005b)", "ref_id": "BIBREF33"}, {"start": 1410, "end": 1432, "text": "(Tahayato et al. 2003)", "ref_id": "BIBREF51"}], "ref_spans": []}, {"section": "", "text": "Although much is known about the development of the adult visual system, little is known about the development and function of the larval visual system. PRs of the larval eye (Bolwig Organ [BO] ) extend their axonal projections to the larval pacemaker neurons, which control the larval circadian rhythm (Malpel et al. 2002; Hassan et al. 2005; Moncalvo and Campos 2005) . Visual input via the larval eye is crucial for the entrainment of the molecular clock (Malpel et al. 2004; Mazzoni et al. 2005) . Furthermore, both larval PRs and pacemaker neurons control larval photophobic behavior (Mazzoni et al. 2005) . Very simple in structure, the larval eye contains \u223c12-14 PRs (Green et al. 1993) . 
In contrast to adult ommatidia, BO lacks accessory cells, such as pigment cells or cone cells.", "cite_spans": [{"start": 303, "end": 323, "text": "(Malpel et al. 2002;", "ref_id": "BIBREF29"}, {"start": 324, "end": 343, "text": "Hassan et al. 2005;", "ref_id": "BIBREF18"}, {"start": 344, "end": 369, "text": "Moncalvo and Campos 2005)", "ref_id": "BIBREF37"}, {"start": 458, "end": 478, "text": "(Malpel et al. 2004;", "ref_id": "BIBREF30"}, {"start": 479, "end": 499, "text": "Mazzoni et al. 2005)", "ref_id": "BIBREF31"}, {"start": 589, "end": 610, "text": "(Mazzoni et al. 2005)", "ref_id": "BIBREF31"}, {"start": 674, "end": 693, "text": "(Green et al. 1993)", "ref_id": "BIBREF15"}], "ref_spans": []}, {"section": "", "text": "BO precursor cells develop in the optic placode adjacent to the optic lobe primordium (Green et al. 1993 ). Development of larval PR precursor cells proceeds in a two-step process. First, primary precursors (also called BO founder cells) are specified; they express and require the proneural gene atonal (ato) and the retinal patterning genes sine oculis (so) and eyes absent (eya) as well as hedgehog (hh) signaling (Schmucker et al. 1994; Suzuki and Saigo 2000) . Then, primary precursors signal to the surrounding tissue by expressing the TGF\u2423 homolog spitz (spi), which activates EGF receptor (EGFR) and recruits adjacent cells to develop as secondary precursors (Daniel et al. 1999; Suzuki and Saigo 2000) . tailless (tll), which encodes an orphan nuclear receptor, opposes EGFR signaling in the surrounding optic lobe primordium, preventing adjacent cells from developing as PRs (Daniel et al. 1999) . Subsequently, as immature PRs detach from the optic lobe primordium and start to differentiate, they extend their axons and remain in contact with the optic lobe (Schmucker et al. 1992 (Schmucker et al. , 1997 Dumstrei et al. 2002) . 
Two distinct PR subtypes can be distinguished in the larval eye: Four PRs contain blue-sensitive Rh5 while eight contain green-sensitive Rh6. This is strongly reminiscent of adult R8 PRs, which express Rh5 in 30% of ommatidia and Rh6 in the remaining 70%. Thus, similar molecular mechanisms could act to specify the two distinct subtypes in larval PRs and in adult R8 cells.", "cite_spans": [{"start": 86, "end": 104, "text": "(Green et al. 1993", "ref_id": "BIBREF15"}, {"start": 417, "end": 440, "text": "(Schmucker et al. 1994;", "ref_id": "BIBREF47"}, {"start": 441, "end": 463, "text": "Suzuki and Saigo 2000)", "ref_id": "BIBREF50"}, {"start": 667, "end": 687, "text": "(Daniel et al. 1999;", "ref_id": "BIBREF9"}, {"start": 688, "end": 710, "text": "Suzuki and Saigo 2000)", "ref_id": "BIBREF50"}, {"start": 885, "end": 905, "text": "(Daniel et al. 1999)", "ref_id": "BIBREF9"}, {"start": 1070, "end": 1092, "text": "(Schmucker et al. 1992", "ref_id": "BIBREF46"}, {"start": 1093, "end": 1117, "text": "(Schmucker et al. , 1997", "ref_id": "BIBREF48"}, {"start": 1118, "end": 1139, "text": "Dumstrei et al. 2002)", "ref_id": "BIBREF12"}], "ref_spans": []}, {"section": "", "text": "Here we describe the genetic mechanisms underlying the specification of the two larval PR subtypes. We show that primary precursors develop independently of EGFR and give rise to the Rh5 subtype whereas secondary precursors give rise to Rh6-subtype PRs. EGFR signaling is required for the survival of secondary precursors of the Rh6 subtype. The combinatorial action of transcription factors Seven-up (Svp), Sal, and Otd is required to distinguish the two subtypes. The Rh5 subtype requires sal and otd, while the Rh6 subtype requires seven-up (svp). EGFR signaling, otd, svp, and sal are also core components of PR development in the adult retina. However, they play very different roles in the two systems. 
Thus, even though adult R8 and larval PRs share the same rhodopsin fates, the genetic pathways that control their expression are surprisingly different.", "cite_spans": [], "ref_spans": []}, {"section": "Results", "text": "", "cite_spans": [], "ref_spans": []}, {"section": "Embryonic development of the larval eye and initiation of rhodopsin expression", "text": "We followed BO PR development from specification of precursors until the end of larval life using anatomical and molecular markers. Larval PRs develop from a group of cells located at the ventral tip of the optic placode, adjacent to the progenitors of the optic lobe primordium (Green et al. 1993) . The earliest precursor cells express the proneural gene ato in a highly dynamic manner during embryonic stages 10-12 ( Fig. 1A ; Daniel et al. 1999; Suzuki and Saigo 2000) (we obtained comparable results using either the ato-Gal4 driver or an anti-Ato antibody; data not shown). Expression of ato in PR precursors decreases during stage 12 until no expression is found after embryonic stage 13. During embryonic stage 12, BO precursors start to express the neuronal marker Elav as well as Kr\u00fcppel (Kr) and Fasciclin II (FasII) (Schmucker et al. 1992; Daniel et al. 1999) , which were used as molecular markers for immature PRs throughout embryogenesis (Figs. 1B, 3A, below). During optic lobe invagination, larval PRs remain connected with the optic lobe primordium by the Bolwig Nerve (BN), which extends while the distance between the PRs and optic lobe primordium gradually increases (Schmucker et al. 1992) . By stage 15, all PRs are largely separated from the optic lobe primordium and BO is positioned at the anterior part of the embryonic head (Fig. 1B) , where it becomes associated with the head skeleton at embryonic stage 17 (Green et al. 1993) .", "cite_spans": [{"start": 279, "end": 298, "text": "(Green et al. 1993)", "ref_id": "BIBREF15"}, {"start": 430, "end": 449, "text": "Daniel et al. 
1999;", "ref_id": "BIBREF9"}, {"start": 450, "end": 472, "text": "Suzuki and Saigo 2000)", "ref_id": "BIBREF50"}, {"start": 828, "end": 851, "text": "(Schmucker et al. 1992;", "ref_id": "BIBREF46"}, {"start": 852, "end": 871, "text": "Daniel et al. 1999)", "ref_id": "BIBREF9"}, {"start": 1188, "end": 1211, "text": "(Schmucker et al. 1992)", "ref_id": "BIBREF46"}, {"start": 1437, "end": 1456, "text": "(Green et al. 1993)", "ref_id": "BIBREF15"}], "ref_spans": [{"start": 420, "end": 427, "text": "Fig. 1A", "ref_id": "FIGREF0"}, {"start": 1352, "end": 1361, "text": "(Fig. 1B)", "ref_id": "FIGREF0"}]}, {"section": "Embryonic development of the larval eye and initiation of rhodopsin expression", "text": "Larval PRs, whose number varies from eight to 16, start to express rh5 and rh6 by the end of embryogenesis (stage 16/17) and maintain rh expression throughout larval life (Fig. 1C,D) . Three or four BO PRs express Rh5 and the remaining eight to 10 express Rh6 (Fig. 1C,D) . We verified that rh1, rh3, and rh4, which had been previously reported to be expressed in BO (Pollock and Benzer 1988) were not expressed in the larval eye (data not shown). The PR-specific gene chaoptic (chp) becomes expressed in all larval PRs at about the same developmental stage (Fig. 6D, below) . By the end of embryogenesis, PRs are packed tightly and do not exhibit any obvious signs of further cellular differentiation. However, during early larval life, their morphology changes dramatically; cells become loosely packed and build up arborizationlike protrusions (Fig. 1D, arrow) . While Ato expression quickly disappears, the expression of Chp, Kr, Elav, Rh5, and Rh6 is maintained throughout larval life.", "cite_spans": [{"start": 367, "end": 392, "text": "(Pollock and Benzer 1988)", "ref_id": "BIBREF43"}], "ref_spans": [{"start": 171, "end": 182, "text": "(Fig. 1C,D)", "ref_id": "FIGREF0"}, {"start": 260, "end": 271, "text": "(Fig. 
1C,D)", "ref_id": "FIGREF0"}, {"start": 558, "end": 574, "text": "(Fig. 6D, below)", "ref_id": "FIGREF5"}, {"start": 847, "end": 863, "text": "(Fig. 1D, arrow)", "ref_id": "FIGREF0"}]}, {"section": "The Wts/Melt pathway is not involved in the choice of Rh5 versus Rh6", "text": "The expression of Rh5 and Rh6 in larval PRs is strongly reminiscent of the adult R8 PRs. Therefore, we tested whether wts and melt, which form a bistable loop of regulation required for the robust specification of Rh5 or Rh6 fate in adult R8, are also required in larval PRs. However we could not detect expression of wts or melt at any time during embryogenesis, as visualized using wts and melt enhancer trap lines. We also could not detect early maternal expression in embryos (data not shown). To test whether wts and melt are required for the subtype specification in the larval eye, we analyzed Rh5 and Rh6 expression in wts and melt mutant larvae. However, the expression of neither Rh5 nor Rh6 is affected ( Fig. 2A,B) . To manipulate the early phase of PR precursor specification and development, we made use of a sine oculis-Gal4 (so-Gal4), which starts to be expressed in the optic lobe placode at embryonic stage 10 and remains expressed in the optic lobe and all larval PRs throughout embryogenesis and larval life (Supplementary Fig. 1A,C) . Interestingly, the ectopic expression of UAS-wts or UAS-melt under the control of so-Gal4 did also not affect the expression Rh5 and Rh6 (Fig. 2C,D) . Furthermore the total number of larval PRs and the ratio of Rh5 PRs versus Rh6 PRs remain unaltered in both wts and melt gain of function (GOF) as well as in loss of function (LOF) (Fig. 2E) . Thus, in contrast to the adult retina, the specification of the Rh5 and Rh6 subtypes does not depend on wts and melt.", "cite_spans": [], "ref_spans": [{"start": 716, "end": 726, "text": "Fig. 2A,B)", "ref_id": "FIGREF1"}, {"start": 1043, "end": 1053, "text": "Fig. 
1A,C)", "ref_id": "FIGREF0"}, {"start": 1193, "end": 1204, "text": "(Fig. 2C,D)", "ref_id": "FIGREF1"}, {"start": 1388, "end": 1397, "text": "(Fig. 2E)", "ref_id": "FIGREF1"}]}, {"section": "The Wts/Melt pathway is not involved in the choice of Rh5 versus Rh6", "text": "Egfr signaling and tll action orchestrate the development of the Rh6 subtype As in the adult eye, recruitment of BO PR precursor cells requires activation of the EGFR pathway. Primary BO precursors produce Spi, which is required in neighboring cells to develop as secondary precursors. In embryos mutant for spi, the immature larval eye only consists of three or four cells (Daniel et al. 1999 ). Therefore, we tested whether EGFR signaling is involved in the specification of larval PR subtypes and whether there is a correlation between primary precursors and rh5-expressing PRs or between secondary and rh6-expressing PRs. Because EGFR signaling has multiple earlier functions in the embryo, we misexpressed a dominant-negative form of EGFR (UAS-EGFR dn ) under the control of so-Gal4 to suppress EGFR activity in BO precursors. This results in the development of only three or four immature PRs as compared with \u223c14 cells in the wild type (Fig. 3A,B) , a phenocopy of the BO in spi mutants (Daniel et al. 1999 ). All of the remaining three or four PRs all express Rh5 (Fig. 3D) as well as the general markers Elav, FasII, Kr, and Chp (Fig. 3B,F) . They also express the Rh5-subtypespecific marker sal (Fig. 3F, see below) , but not the Rh6-subtype-specific marker svp (see below). These results show that EGFR signaling is required for the develop- ment of Rh6 PRs and strongly suggests that primary precursors develop into the Rh5 subtype while secondary precursors give rise to the Rh6 subtype.", "cite_spans": [{"start": 374, "end": 393, "text": "(Daniel et al. 1999", "ref_id": "BIBREF9"}, {"start": 994, "end": 1013, "text": "(Daniel et al. 
1999", "ref_id": "BIBREF9"}], "ref_spans": [{"start": 943, "end": 954, "text": "(Fig. 3A,B)", "ref_id": "FIGREF2"}, {"start": 1072, "end": 1081, "text": "(Fig. 3D)", "ref_id": "FIGREF2"}, {"start": 1138, "end": 1149, "text": "(Fig. 3B,F)", "ref_id": "FIGREF2"}, {"start": 1205, "end": 1225, "text": "(Fig. 3F, see below)", "ref_id": "FIGREF2"}]}, {"section": "The Wts/Melt pathway is not involved in the choice of Rh5 versus Rh6", "text": "Activation of the EGFR pathway is relayed to the nucleus through the mitogen-activated protein kinase (MAPK) cascade, which ultimately leads to the phosphorylation of nuclear effectors such as the Ets family transcription factors Pointed (Pnt) and Yan (O'Neill et al. 1994) . Both Yan and Pnt are expressed during stage 10/11 in the developing lobe primordium when larval PRs form; however their expression decreases during embryonic stage 12/13, and no expression can be detected at stage 15 (Supplementary Fig. 2 ; data not shown).", "cite_spans": [{"start": 252, "end": 273, "text": "(O'Neill et al. 1994)", "ref_id": "BIBREF41"}], "ref_spans": [{"start": 508, "end": 514, "text": "Fig. 2", "ref_id": "FIGREF1"}]}, {"section": "The Wts/Melt pathway is not involved in the choice of Rh5 versus Rh6", "text": "The orphan nuclear receptor Tll has an effect opposite to the EGFR signaling in specifying PR precursors versus optic lobe primordium. Tll is expressed in the optic lobe primordium but not in PR precursors. Removing tll function in the embryo leads to supernumerous immature PRs (Daniel et al. 1999) , suggesting that tll acts negatively on the development of secondary precursors. To test which subset is affected, we analyzed Rh5 and Rh6 expression in tll mutants at the end of embryogenesis, before tll mutants die. The number of Rh5 PRs remains largely unchanged. However, the number of Rh6 PRs is dramatically increased to 20-25 instead of the normal eight to 10 (Fig. 4D ). 
This suggests that tll inhibits adjacent cells from adapting the Rh6 cell fate.", "cite_spans": [{"start": 279, "end": 299, "text": "(Daniel et al. 1999)", "ref_id": "BIBREF9"}], "ref_spans": [{"start": 668, "end": 676, "text": "(Fig. 4D", "ref_id": "FIGREF3"}]}, {"section": "The Wts/Melt pathway is not involved in the choice of Rh5 versus Rh6", "text": "Since the lack of EGFR signaling results in a smaller number of larval PRs due to cell death (Daniel et al. 1999) , the increased Rh6 PRs might result from the survival of adjacent optic lobe primordium cells that failed to die. We therefore expressed the apoptosis inhibitor UAS-p35 in optic lobe primordium under the control of so-Gal4. This leads to a high increase in the number of Rh6 PRs to \u223c20-25, whereas the number of Rh5 PRs remains unchanged, similar to the loss of tll function (Fig. 4F ). This strongly supports the notion that EGFR signaling is required in cells surrounding the primary precursors to prevent their apoptosis induced by tll, and therefore allow their development as Rh6 PRs. In addi- Larval PR-subtype specification tion to its survival function, EGFR signaling might also be required for the acquisition of the Rh6 cell fate. To approach this question, we concomitantly misexpressed UAS-p35 and UAS-EGFR dn under the control of so-Gal4. This leads to a strong increase in the number of Rh6 PRs, to \u223c20-25 without affecting the number of Rh5 PRs (Fig. 4G) , similar to the misexpression of UAS-p35 alone.", "cite_spans": [{"start": 93, "end": 113, "text": "(Daniel et al. 1999)", "ref_id": "BIBREF9"}], "ref_spans": [{"start": 490, "end": 498, "text": "(Fig. 4F", "ref_id": "FIGREF3"}, {"start": 1076, "end": 1086, "text": "(Fig. 4G)", "ref_id": "FIGREF3"}]}, {"section": "The Wts/Melt pathway is not involved in the choice of Rh5 versus Rh6", "text": "Thus EGFR signaling appears to be essential for the survival of the Rh6 subtype. 
However the immature precursors do not seem to depend on EGFR signaling for adapting the Rh6 cell fate. It is also possible that activity remaining in the EGFR dn context is able to induce the Rh6 fate while higher EGFR activity is required for survival.", "cite_spans": [], "ref_spans": []}, {"section": "Sal and Svp are expressed in distinct PR subtypes", "text": "In order to analyze the development, specification, and differentiation of the two subtypes of larval PRs, we looked for genes expressed in either the Rh5 or the Rh6 subtype. Among the genes required for adult PR development, pros and ss are key players in inner PR specification. However, we could not detect expression of Pros or Ss in developing BO PRs at any point during embryogenesis (data not shown). However, sal and svp are expressed in a subtype-specific fashion in the larval eye. During embryonic development, three or four immature PRs express sal ( Fig. 5A,B ; comparable results were obtained using sal-Gal4 or anti-Salm antibody; data not shown). sal expression starts during stage 13/14 and is maintained throughout embryogenesis and larval life. Its expression coincides precisely with rh5 ( Fig. 5C ) and is excluded from the Rh6 subtype (Fig. 5D) . svp shows the opposite expression pattern: Its expression is initiated during stage 13/14 and is maintained throughout embryogenesis and larval life in the Rh6 subtype, whereas it is excluded from the Rh5 subtype ( Fig. 5E-H anti-Svp antibody; data not shown). Thus sal and svp are expressed in complementary subsets of larval PR subtypes. The expression of both transcription factors precedes rhodopsin expression.", "cite_spans": [], "ref_spans": [{"start": 563, "end": 572, "text": "Fig. 5A,B", "ref_id": "FIGREF4"}, {"start": 810, "end": 817, "text": "Fig. 5C", "ref_id": "FIGREF4"}, {"start": 857, "end": 866, "text": "(Fig. 5D)", "ref_id": "FIGREF4"}, {"start": 1084, "end": 1093, "text": "Fig. 
5E-H", "ref_id": "FIGREF4"}]}, {"section": "spalt is required for rh5 expression in larval PRs", "text": "The sal genes encode two zinc finger transcription factors that are specifically expressed in adult inner PRs R7 and R8, where they are required to distinguish them from outer PRs (Mollereau et al. 2001 ). Since sal is exclusively expressed in the Rh5 subtype, prior to rh5 expression, we tested whether it is required for the development of this subtype. We thus analyzed the expression of Rh5 and Rh6 in sal mutants at the end of embryogenesis, when these mutants die. No expression of Rh5 can be detected in sal mutants, even though the correct Cold (Fig. 6C) . However, the general PR markers Elav, FasII, Kr, and Chp remain expressed in all PRs, suggesting that four PRs express neither Rh5 nor Rh6 ( Fig. 6C ; data not shown). Since rh1, rh3, and rh4 expression cannot be detected, these four PRs appear to be \"empty\" and devoid of PR molecules (data not shown). PRs of the larval eye project into the region of the dendritic arborizations of the larval pacemaker neurons. Axon termini of Rh5 and Rh6 PRs are generally directly adjacent to each other (Mazzoni et al. 2005) . In sal mutants, all larval PRs project into the correct target region of the late embryonic brain (Fig. 6G) . We compared the projection termini of empty PRs with those of Rh6 PRs (\"empty\" PR termini are identified by FasII staining and the absence of Rh6 staining). At embryonic stage 17 (just before these mutants die), axonal termini of \"empty\" PRs project correctly to the target region, adjacent to Rh6 termini (Fig. 6H,I ). Therefore sal is essential in the larval eye for the proper differentiation of the Rh5 PRs but not for their early specification or for axonal targeting.", "cite_spans": [{"start": 180, "end": 202, "text": "(Mollereau et al. 2001", "ref_id": "BIBREF36"}, {"start": 1057, "end": 1078, "text": "(Mazzoni et al. 
2005)", "ref_id": "BIBREF31"}], "ref_spans": [{"start": 553, "end": 562, "text": "(Fig. 6C)", "ref_id": "FIGREF5"}, {"start": 706, "end": 713, "text": "Fig. 6C", "ref_id": "FIGREF5"}, {"start": 1179, "end": 1188, "text": "(Fig. 6G)", "ref_id": "FIGREF5"}, {"start": 1497, "end": 1508, "text": "(Fig. 6H,I", "ref_id": "FIGREF5"}]}, {"section": "seven-up is required for Rh6-subtype specification and to repress Rh5-subtype fate", "text": "In the adult eye, the orphan nuclear receptor svp is required posterior to the morphogenetic furrow for the specification of the R3/R4 and R1/R6 pairs, but not for their later differentiation (Mlodzik et al. 1990 ). In the larval eye, svp is exclusively expressed in Rh6 PRs prior to rh6 expression, suggesting that it might be involved in the development of this subtype. We analyzed the expression of Rh5 and Rh6 in svp mutant embryos at stage 17 (as for sal, svp mutants die at the end of embryogenesis). While the total number of PRs (as marked by Elav, FasII, Kr, and Chp) remains unchanged (Fig. 6B,E) in svp mutants, all PRs express Rh5 (Fig. 6B ) and no Rh6 expression is detectable. Therefore, Rh6 cells appear to have switched fate toward the Rh5 fate. Consistent with the transformation of PRs into Rh5 subtype, all PRs express sal in svp mutants (Fig. 6E ). This suggests that svp is necessary for the repression of sal in the Rh6 subtype, thus allowing expression of rh6. To test whether svp is sufficient for the Rh6 PR fate, we performed gain-offunction experiments using early (so-Gal4) or later sevenless-Gal4 (sev-Gal4) drivers. The sev-Gal4 driver starts to be expressed in all larval PRs during late embryonic stages 12/13, just after all immature PRs have formed. It remains expressed through larval life ( Supplementary Fig. 1B,D; data not shown). In larvae that express UASsvp under the control of so-Gal4, no Rh5 expression can be detected, while all BO PRs express rh6 with no change in the overall number of PRs (Fig. 6F) . 
However, if sal expression is affected, it is not completely abolished, with one to three PRs still weakly expressing Sal (data not shown), arguing that Rh6 is now expressed in the Rh5 subtype. Later expression of UAS-svp under the control of sev-Gal4 does not result in alteration of Rh expression (data not shown). Thus, svp not only acts to repress sal in the Rh6 subtype, but it also acts as an activator of Rh6 expression since Sal and Rh6 can coexist in the same cell. In contrast ectopic expression of UASsalm under the control of so-Gal4 or of sev-Gal4 leads to no change of Rh5 or Rh6 expression. Expression of Svp remains unaffected, with about eight cells still expressing Svp (data not shown). Thus svp is necessary and sufficient to induce the Rh6 fate. In contrast sal is necessary but not sufficient for the Rh5 fate.", "cite_spans": [{"start": 192, "end": 212, "text": "(Mlodzik et al. 1990", "ref_id": "BIBREF34"}], "ref_spans": [{"start": 596, "end": 607, "text": "(Fig. 6B,E)", "ref_id": "FIGREF5"}, {"start": 644, "end": 652, "text": "(Fig. 6B", "ref_id": "FIGREF5"}, {"start": 858, "end": 866, "text": "(Fig. 6E", "ref_id": "FIGREF5"}, {"start": 1328, "end": 1353, "text": "Supplementary Fig. 1B,D;", "ref_id": "FIGREF0"}, {"start": 1539, "end": 1548, "text": "(Fig. 6F)", "ref_id": "FIGREF5"}]}, {"section": "seven-up is required for Rh6-subtype specification and to repress Rh5-subtype fate", "text": "otd is required in the Rh5 subtype for rh5 expression and rh6 repression The homeodomain protein Otd is required in inner PRs of the adult retina for the activation of rh3 and rh5 in p ommatidia, whereas it is required in outer PRs to repress rh6 (Tahayato et al. 2003 ). In the adult retina, otd is expressed in all PRs. Similarly, otd is expressed in all embryonic immature PRs, and this expression is maintained during larval life. During early development, otd acts as a major component for patterning the anterior region of the embryo (Cohen and Jurgens 1990) . 
During stages 8/9, otd expression spans the entire cephalic region (Fig. 7A) and becomes subsequently restricted to more anterior regions and regions giving rise to the brain. However otd expression is excluded from large parts of the optic lobe primordium as well as from the region giving rise to larval PRs (Fig. 7B) . Starting at stage 12, however, otd expression is reinitiated in the ventral lateral part of the optic lobe primordium, the region that will give rise to the precursors of larval PRs. otd remains expressed in all PRs throughout embryogenesis and larval life (Fig. 7C-F) . otd is not required for the formation of larval PRs (nor the larval optic lobe primordium): Even though otd-null mutant embryos show severe head involution defects, the normal number of immature PRs is formed (data not shown). To investigate whether otd is involved in larval PR-subtype specification, we analyzed Rh5 and Rh6 expression in viable otd uvi mutants ( Vandendries et al. 1996) . In otd uvi , larval PRs do not express Rh5 while the total number of PRs remains the same (Fig. 7G,H ) and all PRs express Rh6 (Fig. 7G) . About four cells still express sal, but these cells now express Rh6 instead of Rh5 (Fig. 7H) . Interestingly, the number of Svp-expressing cells is not altered, with four Svp-negative PRs expressing Rh6 (data not shown). This indicates that, in the Rh5 subtype, otd acts as an inhibitor of rh6 expression and is required for Rh5 expression. As expression of otd is unaltered in sal mutants (data not shown), otd, like sal, seems to be necessary but not sufficient for Rh5 expression. otd and sal expression does not depend on each other, and since they are both required for Rh5 expression, they appear to act in parallel pathways.", "cite_spans": [{"start": 247, "end": 268, "text": "(Tahayato et al. 2003", "ref_id": "BIBREF51"}, {"start": 540, "end": 564, "text": "(Cohen and Jurgens 1990)", "ref_id": "BIBREF7"}, {"start": 1525, "end": 1549, "text": "Vandendries et al. 
1996)", "ref_id": "BIBREF54"}], "ref_spans": [{"start": 634, "end": 643, "text": "(Fig. 7A)", "ref_id": "FIGREF6"}, {"start": 877, "end": 886, "text": "(Fig. 7B)", "ref_id": "FIGREF6"}, {"start": 1146, "end": 1157, "text": "(Fig. 7C-F)", "ref_id": "FIGREF6"}, {"start": 1642, "end": 1652, "text": "(Fig. 7G,H", "ref_id": "FIGREF6"}, {"start": 1679, "end": 1688, "text": "(Fig. 7G)", "ref_id": "FIGREF6"}, {"start": 1774, "end": 1783, "text": "(Fig. 7H)", "ref_id": "FIGREF6"}]}, {"section": "Discussion", "text": "In this study, we describe the genetic mechanisms underlying the specification of BO PR subtypes. The larval eye consists of two distinct PR subtypes, three to four PRs containing blue-sensitive Rh5 and eight to 10 containing green-sensitive Rh6. Primary precursors, which give rise to the Rh5-subtype PRs, signal to the surrounding tissue to develop as secondary precursors, which become the Rh6 subtype. EGFR signaling is required for the survival of these secondary precursors. The combi- natorial action of the three transcription factors Sal, Svp, and Otd then orchestrates the differentiation of the two PR subtypes. Interestingly, even though larval PRs and the adult R8 have the same rhodopsin content, the mechanisms to establish their fates are remarkably different.", "cite_spans": [], "ref_spans": []}, {"section": "Initial specification of PR cell fates", "text": "Specification of adult PRs starts with the proliferation of undifferentiated cells anterior to the morphogenetic furrow and the recruitment of individual PRs into ommatidia posterior to the morphogenetic furrow in a tightly regulated spatiotemporal manner. R8 is first specified by ato and does not depend on EGFR signaling (Jarman et al. 1994; Freeman 1996) . Sequential recruitment of all other PRs (R2/5, R3/4, R1/6, and then R7) is dependent on EGFR signaling (Freeman 1996; Raabe 2000; Nagaraj and Banerjee 2004) . 
Similarly, in the larval eye, primary precursors express ato and are independent of EGFR signaling while secondary precursors need EGFR signaling for their development. Primary precursors develop into the Rh5 subtype while the Rh6-subtype identity corresponds to secondary precursors.", "cite_spans": [{"start": 324, "end": 344, "text": "(Jarman et al. 1994;", "ref_id": "BIBREF21"}, {"start": 345, "end": 358, "text": "Freeman 1996)", "ref_id": "BIBREF14"}, {"start": 464, "end": 478, "text": "(Freeman 1996;", "ref_id": "BIBREF14"}, {"start": 479, "end": 490, "text": "Raabe 2000;", "ref_id": "BIBREF44"}, {"start": 491, "end": 517, "text": "Nagaraj and Banerjee 2004)", "ref_id": "BIBREF38"}], "ref_spans": []}, {"section": "Promoting and repressing the Rh6 subtype", "text": "During larval PR development, EGFR signaling is required for the Rh6 but not the Rh5 subtype. tll inhibits this process by preventing the adjacent optic lobe primordium from responding to EGFR signaling (Daniel et al. 1999; Dumstrei et al. 2002; Chang et al. 2003a ). However, expression of tll is not negatively regulated by EGFR signaling (Daniel et al. 1999; Dumstrei et al. 2002; Chang et al. 2003a) . Blocking apoptosis or removing tll function both result in supernumerary Rh6 cells, indicating that cells that should have died in response to tll function become Rh6 PRs (Fig. 8A) . Secondary precursors do not appear to require maintenance of EGFR signaling to survive. During later developmental stages, all immature PRs express sevenless, another gene encoding a Receptor Tyrosine Kinase as well as its ligand, Boss (S.G. Specher and C. Desplan, unpubl.), which could act redundantly with EGFR later in development. However, we see no effect of mutating boss or sev, either on larval PR number or on Rh5/Rh6 expression (S.G. Specher and C. Desplan, unpubl.). 
It could be that late EGFR activity compensates for the loss of Sev activity, comparable with the adult R7 cells, where EGFR is sufficient to replace Sev (Freeman 1996) .", "cite_spans": [{"start": 203, "end": 223, "text": "(Daniel et al. 1999;", "ref_id": "BIBREF9"}, {"start": 224, "end": 245, "text": "Dumstrei et al. 2002;", "ref_id": "BIBREF12"}, {"start": 246, "end": 264, "text": "Chang et al. 2003a", "ref_id": "BIBREF3"}, {"start": 341, "end": 361, "text": "(Daniel et al. 1999;", "ref_id": "BIBREF9"}, {"start": 362, "end": 383, "text": "Dumstrei et al. 2002;", "ref_id": "BIBREF12"}, {"start": 384, "end": 403, "text": "Chang et al. 2003a)", "ref_id": "BIBREF3"}, {"start": 1222, "end": 1236, "text": "(Freeman 1996)", "ref_id": "BIBREF14"}], "ref_spans": [{"start": 577, "end": 586, "text": "(Fig. 8A)", "ref_id": "FIGREF7"}]}, {"section": "Promoting and repressing the Rh6 subtype", "text": "What is the function of the EGFR pathway in antagonizing tll function? The Ets transcription factors yan and pnt are both expressed during the period of secondary precursor specification. In response to EGFR signaling, Yan acts as a repressor and Pnt as an activator. Their tightly controlled activation, cross-regulation, and competition for binding sites are essential for appropriate EGFR signaling. It is difficult, however, to test their function in BO PR formation, as mutants die with strong patterning defects in the embryo. We could test the function of EGFR because we were able to inhibit its function late, specifically in the optic lobe region, by misexpressing a dominant-negative form of EGFR. This likely inhibits but might not completely abolish endogenous EGFR signaling. Thus, even though concomitantly preventing cell death and EGFR signaling restores Rh6 PRs, it is possible that basal levels of EGFR signaling are sufficient for the specification of Rh6 PR specification, but not for survival. 
The mechanism by which tll affects secondary precursor development remains elusive. tll might prevent cells from developing as secondary precursors, leading them to undergo apoptosis. Only cells that receive enough EGFR signal near primary PR precursors are rescued. Alternatively a second signal could make tll-expressing cells undergo apoptosis. Notch and hedgehog signaling have also been shown to be involved the development of larval PRs and may provide an alternative source for proper subtype specification and tll acts in the surrounding tissue to inhibit secondary precursor development. Primary precursors give rise to the Rh5 PR subtype, whereas secondary precursors give rise to the Rh6 PR subtype. In the Rh5 PR subtype, sal and otd are required for Rh5 expression, and otd further for the repression of Rh6. In the Rh6 subtype, svp is required for Rh6 expression and for the repression of sal expression. (B) The negative feedback loop of wts and melt mediates the decision of R8 to express Rh6 or Rh5. Which way the loop swings depends on an instructive signal of the overlying R7 cell. The presence of gene expression is indicated by black type and its absence is indicated with gray type. Arrows shown in black (for activation) and red (for repression) indicate an active interaction; gray arrows indicate the absence of this interaction.", "cite_spans": [], "ref_spans": []}, {"section": "Promoting and repressing the Rh6 subtype", "text": "Larval PR-subtype specification", "cite_spans": [], "ref_spans": []}, {"section": "GENES & DEVELOPMENT 2191", "text": "Cold Spring Harbor Laboratory Press on November 4, 2016 -Published by genesdev.cshlp.org Downloaded from survival of secondary precursors (Green et al. 1993; Schmucker et al. 1994; Suzuki and Saigo 2000) .", "cite_spans": [{"start": 138, "end": 157, "text": "(Green et al. 1993;", "ref_id": "BIBREF15"}, {"start": 158, "end": 180, "text": "Schmucker et al. 
1994;", "ref_id": "BIBREF47"}, {"start": 181, "end": 203, "text": "Suzuki and Saigo 2000)", "ref_id": "BIBREF50"}], "ref_spans": []}, {"section": "Network of transcription factors for PR-subtype specification", "text": "During larval eye development, sal is only required for the expression of Rh5 but not for the specification of Rh5-subtype fate. First, ectopic activation of Sal is not sufficient to induce Rh5 expression. Further, sal is still expressed in cells that have adopted the Rh6 subtype due to ectopic svp expression. In contrast, svp is required and sufficient for the Rh6 subtype where it represses the Rh5-subtype fate. Interestingly, svp is not only required for the repression of sal, but is also necessary for Rh6 expression (Fig. 8A) . In the adult retina, svp is necessary for the specification of the R3/R4 and R1/R6 pairs (Mlodzik et al. 1990; Domingos et al. 2004a,b) where it is also required for the repression of sal: R3/R4 are transformed into R7 cells in svp mutants (Domingos et al. 2004b) . However, the upstream mechanisms by which the expression of sal and svp is controlled in larval PRs remains elusive. There may be an unknown signal from primary precursors that induces secondary precursors by controlling svp expression. This signal is probably not EGFR, since EGFR dn secondary cells that are rescued from death by p35 still express svp.", "cite_spans": [{"start": 626, "end": 647, "text": "(Mlodzik et al. 1990;", "ref_id": "BIBREF34"}, {"start": 777, "end": 800, "text": "(Domingos et al. 2004b)", "ref_id": "BIBREF11"}], "ref_spans": [{"start": 525, "end": 534, "text": "(Fig. 8A)", "ref_id": "FIGREF7"}]}, {"section": "Network of transcription factors for PR-subtype specification", "text": "In the adult retina otd is required for the expression of rh3 and rh5 and for the repression of Rh6 in outer PRs. In the larval eye, is also expressed in all PRs, but it is not required for the formation of larval PRs. 
During their terminal differentiation, otd is required, only in the Rh5 subtype, for Rh5 expression and Rh6 repression (Fig. 8A) . Since otd only functions in the context of sal expression, it acts as a permissive factor for Rh regulation. It seems likely that otd and sal act in parallel in the Rh5 subtype: Otd expression is not altered in sal mutants and Sal expression is not altered in otd mutants. Further, Otd, which binds directly to the rh3, rh5, and rh6 enhancers in the adult eye, likely acts in a similar fashion in larval PRs (Tahayato et al. 2003) .", "cite_spans": [{"start": 758, "end": 780, "text": "(Tahayato et al. 2003)", "ref_id": "BIBREF51"}], "ref_spans": [{"start": 338, "end": 347, "text": "(Fig. 8A)", "ref_id": "FIGREF7"}]}, {"section": "Specification of PR subtypes: comparison between larval and adult eyes", "text": "There are interesting similarities and differences between PR-subtype specification in the larval and adult eyes. Most strikingly, the two Rhs expressed in the larval eye are R8 Rhs. The type of Rh expressed in R8 is instructed by R7 and maintained by the wts/melt bistable loop (Fig. 8B) . In contrast, in the larva, there are no additional PRs in BO to instruct the Rh5 and Rh6 PR fate and the expression of these genes does not depend on wts and melt. Since misexpression of wts or melt does not affect larval PRs, the downstream effectors of the loop must be absent or not functioning in the larval eye. Rh5 PRs, which are specified first, might be a source of an instructive signal for the Rh6-subtype fate. Alternatively, this signal might arise from non-BO cells. However, we have not yet been able to determine its identity. Finally, larval PRs are not specified or distributed stochastically in BO, as the two groups that express Rhr or Rh6 are physically distinct, presumably explaining why there is not need for the bistable loop of wts and melt. 
Therefore, even though the Rh fates of larval PRs and adult R8 are identical, they achieve their fates through very distinct mechanisms.", "cite_spans": [], "ref_spans": [{"start": 279, "end": 288, "text": "(Fig. 8B)", "ref_id": "FIGREF7"}]}, {"section": "Larval PR development: similarities to chordotonal organ development", "text": "Primary sensory precursors also induce secondary precursors in the development of the peripheral nervous system, in chordotonal and external sensory organs. After the delamination of chordotonal or sensory organ precursors (SOP), these cells signal to the overlaying ectoderm to induce delamination of secondary precursors (Okabe and Okano 1997; zur Lage and Jarman 1999; zur Lage et al. 2004 ). EGFR signaling is essential for the survival of BO secondary precursors, whereas in SOPs, it induces the delamination of secondary precursors. sal is also required to adopt the proper final cell fate both in larval PR precursors (Rh5 vs. Rh6) and embryonic SOPs (nonneuronal oenocytes vs. sensory neurons) (Elstob et al. 2001 ). However, oenocyte specification completely depends on sal, whereas larval eye primary precursors only require sal for Rh5 expression (Elstob et al. 2001 ). Thus, the two systems use EGFR signaling and Sal differently.", "cite_spans": [{"start": 323, "end": 345, "text": "(Okabe and Okano 1997;", "ref_id": "BIBREF39"}, {"start": 346, "end": 371, "text": "zur Lage and Jarman 1999;", "ref_id": "BIBREF57"}, {"start": 372, "end": 392, "text": "zur Lage et al. 2004", "ref_id": "BIBREF58"}, {"start": 702, "end": 721, "text": "(Elstob et al. 2001", "ref_id": "BIBREF13"}, {"start": 858, "end": 877, "text": "(Elstob et al. 2001", "ref_id": "BIBREF13"}], "ref_spans": []}, {"section": "Larval PR development: similarities to chordotonal organ development", "text": "Larval PR precursors do not further divide, while SOPs later undergo asymmetric cell division to produce two nonidentical daughter cells. 
This may be due to the fact that BO only contain two different subtypes, whose identity correlate with primary or secondary precursors. Further, larval PR precursors develop in a group of adjacent cells as part of a placode. Thus, classical SOP specification using Notch signaling and lateral inhibition does not seem to occur to specify PR precursors. It will of great interest to further investigate the similarities and differences in the molecular mechanisms underlying the development of these sensory organs and how they are controlled.", "cite_spans": [], "ref_spans": []}, {"section": "Materials and methods", "text": "", "cite_spans": [], "ref_spans": []}, {"section": "Drosophila strains and genetics", "text": "For wild-type comparison we used yw 122 ; yw 122 , sp/CyO, TM2/ TM6b; or heterozygous siblings of mutant alleles. For sal mutant analysis, the sal 16 and a small Deficiency-Df(2L)32FP-5, which removes salm and salr-were used, balanced over CyO, Dfd-YFP; both fly strains gave comparable results (Kuhnlein et al. 1994; Elstob et al. 2001) . For svp mutant analysis, the svp E22 was used, balanced over TM6b, Dfd-YFP (Mlodzik et al. 1990 ). For tll mutant analysis, the tll 1 and tll 149 were used, balanced over TM6b, Dfd-YFP. For sal, tll, and svp, homozygous mutants were identified by the absence of Dfd-YFP (Bloomington Stock Center). We used the viable otd uvi allele (Vandendries et al. 1996) . The following fly strains were used: so-Gal4 (Chang et al. 2003b) , sev-Gal4 (Therrien et al. 1999) , otd-Gal4 (T. Cook, pers. comm.), ato-Gal4 (Hassan et al. 2000) , svp H162 -LacZ (Elstob et al. 2001 (Kuhnlein and Schuh 1996) , UASsvp (Kramer et al. 1995) , UAS-CD8GFP, UAS-p35, UAS-pnt", "cite_spans": [{"start": 295, "end": 317, "text": "(Kuhnlein et al. 1994;", "ref_id": "BIBREF27"}, {"start": 318, "end": 337, "text": "Elstob et al. 2001)", "ref_id": "BIBREF13"}, {"start": 415, "end": 435, "text": "(Mlodzik et al. 
1990", "ref_id": "BIBREF34"}, {"start": 672, "end": 697, "text": "(Vandendries et al. 1996)", "ref_id": "BIBREF54"}, {"start": 745, "end": 765, "text": "(Chang et al. 2003b)", "ref_id": "BIBREF4"}, {"start": 777, "end": 799, "text": "(Therrien et al. 1999)", "ref_id": "BIBREF53"}, {"start": 882, "end": 901, "text": "(Elstob et al. 2001", "ref_id": "BIBREF13"}, {"start": 902, "end": 927, "text": "(Kuhnlein and Schuh 1996)", "ref_id": "BIBREF26"}, {"start": 937, "end": 957, "text": "(Kramer et al. 1995)", "ref_id": "BIBREF25"}], "ref_spans": []}, {"section": "P1", "text": ", and UAS-yan (Bloomington). Embryos were staged according to Campos-Ortega and Hartenstein (1997) .", "cite_spans": [{"start": 62, "end": 98, "text": "Campos-Ortega and Hartenstein (1997)", "ref_id": "BIBREF2"}], "ref_spans": []}, {"section": "Immunohistochemistry and preparation of embryonic and larval specimen", "text": "Embryos were dechorionated, fixed, and immunostained according to previously published protocols (Therianos et al. 1995) . Primary antibodies were rabbit anti-Rh6 1:10,000 (Tahayato et al. 2003) , mouse anti-Rh5 1:20, anti-Rh3 1:20, or anti-Rh4 1:20 (Chou et al. 1996) , mouse anti-Nrt 1:10 (Developmental Studies Hybridoma Bank [DSHB]), mouse anti-Rh1 1:20 (DSHB), mouse anti-FasII 1:10 (Lin and Goodman 1994), rat anti-Elav 1:30 (DSHB), goat anti-Ato 1:1000 (Jarman et al. 1993) , sheep anti-GFP (Biogenesis), rabbit anti-Sal 1:200 (Kuhnlein et al. 1994) , mouse anti-Svp 1:1000 (Kanai et al. 2005) , mouse antiPros 1:50 (DSHB), mouse anti-Chp 1:10 (DSHB), rat anti-Otd 1:200 (Hirth et al. 2003) , rat anti-Kr 1:300 (Kosman et al. 1998) , rab anti-Pnt P1 (Alvarez et al. 2003) , anti-Yan (Rebay and Rubin 1995) , guinea pig anti-Ss 1:500 (Kim et al. 2006) , and mouse anti-\u2424GAL 1:20 (DSHB). Secondary antibodies used for confocal microscopic analysis were Alexa-488, Alexa-555, and Alexa-647 antibodies generated in goat (Molecular probes), all at 1:300-1:500 dilution. 
Embryos were mounted in Vectashield H-1000 (Vector). For the analysis of the larval BO, the head skeleton was separated from epidermis, imaginal discs, and brain and fixed for 15 min in 4% formaldehyde/PBS. The chitinous head skeleton was then carefully opened on the dorsal and ventral midline using sharpened minutien pins (0.1-mm diameter, Fisher Scientific Tools).", "cite_spans": [{"start": 97, "end": 120, "text": "(Therianos et al. 1995)", "ref_id": "BIBREF52"}, {"start": 172, "end": 194, "text": "(Tahayato et al. 2003)", "ref_id": "BIBREF51"}, {"start": 250, "end": 268, "text": "(Chou et al. 1996)", "ref_id": "BIBREF5"}, {"start": 460, "end": 480, "text": "(Jarman et al. 1993)", "ref_id": "BIBREF20"}, {"start": 534, "end": 556, "text": "(Kuhnlein et al. 1994)", "ref_id": "BIBREF27"}, {"start": 581, "end": 600, "text": "(Kanai et al. 2005)", "ref_id": "BIBREF22"}, {"start": 678, "end": 697, "text": "(Hirth et al. 2003)", "ref_id": "BIBREF19"}, {"start": 718, "end": 738, "text": "(Kosman et al. 1998)", "ref_id": "BIBREF24"}, {"start": 757, "end": 778, "text": "(Alvarez et al. 2003)", "ref_id": "BIBREF0"}, {"start": 790, "end": 812, "text": "(Rebay and Rubin 1995)", "ref_id": "BIBREF45"}, {"start": 840, "end": 857, "text": "(Kim et al. 2006)", "ref_id": "BIBREF23"}], "ref_spans": []}, {"section": "Laser confocal microscopy and image processing", "text": "For laser confocal microscopy, a Leica TCS SP was used. Optical sections ranged from 0.2 to 1.5 \u00b5m, recorded in line average mode with a picture size of 512 \u00d7 512 pixels or 1024 \u00d7 1024 pixels. Captured images from optical sections were arranged and processed using Leica confocal Software (LCS). Complete series of optical sections were imported and processed using ImageJ. Generation of three-dimensional digital models and raw tiff stacks (stacks of optical sections) were done using AMIRA (Mercury Computer Systems) as previously described (Sprecher et al. 
2006) .", "cite_spans": [{"start": 543, "end": 565, "text": "(Sprecher et al. 2006)", "ref_id": "BIBREF49"}], "ref_spans": []}], "bib_entries": {"BIBREF0": {"title": "pannier and pointedP2 act sequentially to regulate Drosophila heart development", "authors": [{"first": "A", "middle": ["D"], "last": "Alvarez", "suffix": ""}, {"first": "W", "middle": ["Y"], "last": "Shi", "suffix": ""}, {"first": "B", "middle": ["A"], "last": "Wilson", "suffix": ""}, {"first": "J", "middle": ["B"], "last": "Skeath", "suffix": ""}], "year": 2003, "venue": "Development", "link": "3239892"}, "BIBREF1": {"title": "Frizzled regulates localization of cellfate determinants and mitotic spindle rotation during asymmetric cell division", "authors": [{"first": "Y", "middle": [], "last": "Bellaiche", "suffix": ""}, {"first": "M", "middle": [], "last": "Gho", "suffix": ""}, {"first": "J", "middle": ["A"], "last": "Kaltschmidt", "suffix": ""}, {"first": "A", "middle": ["H"], "last": "Brand", "suffix": ""}, {"first": "F", "middle": [], "last": "Schweisguth", "suffix": ""}], "year": 2001, "venue": "Nat. Cell Biol", "link": "16136920"}, "BIBREF2": {"title": "The embryonic development of Drosophila melanogaster", "authors": [{"first": "J", "middle": ["A"], "last": "Campos-Ortega", "suffix": ""}, {"first": "V", "middle": [], "last": "Hartenstein", "suffix": ""}], "year": 1997, "venue": "", "link": "21926238"}, "BIBREF3": {"title": "Antagonistic relationship between Dpp and EGFR signaling in Drosophila head patterning", "authors": [{"first": "T", "middle": [], "last": "Chang", "suffix": ""}, {"first": "D", "middle": [], "last": "Shy", "suffix": ""}, {"first": "V", "middle": [], "last": "Hartenstein", "suffix": ""}], "year": 2003, "venue": "Dev. 
Biol", "link": "6776383"}, "BIBREF4": {"title": "Development of neural lineages derived from the sine oculis positive eye field of Drosophila", "authors": [{"first": "T", "middle": [], "last": "Chang", "suffix": ""}, {"first": "A", "middle": [], "last": "Younossi-Hartenstein", "suffix": ""}, {"first": "V", "middle": [], "last": "Hartenstein", "suffix": ""}], "year": 2003, "venue": "Arthropod Struct. Dev", "link": "24243770"}, "BIBREF5": {"title": "Identification of a novel opsin reveals specific patterning of the R7 and R8 photoreceptor cells", "authors": [{"first": "W", "middle": [], "last": "Chou", "suffix": ""}, {"first": "K", "middle": [], "last": "Hall", "suffix": ""}, {"first": "D", "middle": [], "last": "Wilson", "suffix": ""}, {"first": "C", "middle": [], "last": "Wideman", "suffix": ""}, {"first": "S", "middle": [], "last": "Townson", "suffix": ""}, {"first": "L", "middle": [], "last": "Chadwell", "suffix": ""}, {"first": "S", "middle": [], "last": "Britt", "suffix": ""}], "year": 1996, "venue": "Neuron", "link": null}, "BIBREF6": {"title": "Patterning of the R7 and R8 photoreceptor cells of Drosophila: Evidence for induced and default cell-fate specification", "authors": [{"first": "W", "middle": ["H"], "last": "Chou", "suffix": ""}, {"first": "A", "middle": [], "last": "Huber", "suffix": ""}, {"first": "J", "middle": [], "last": "Bentrop", "suffix": ""}, {"first": "S", "middle": [], "last": "Schulz", "suffix": ""}, {"first": "K", "middle": [], "last": "Schwab", "suffix": ""}, {"first": "L", "middle": ["V"], "last": "Chadwell", "suffix": ""}, {"first": "R", "middle": [], "last": "Paulsen", "suffix": ""}, {"first": "S", "middle": ["G"], "last": "Britt", "suffix": ""}], "year": 1999, "venue": "Development", "link": "1653710"}, "BIBREF7": {"title": "Mediation of Drosophila head development by gap-like segmentation genes", "authors": [{"first": "S", "middle": ["M"], "last": "Cohen", "suffix": ""}, {"first": "G", "middle": [], "last": "Jurgens", "suffix": 
""}], "year": 1990, "venue": "Nature", "link": null}, "BIBREF8": {"title": "Distinction between color photoreceptor cell fates is controlled by Prospero in Drosophila", "authors": [{"first": "T", "middle": [], "last": "Cook", "suffix": ""}, {"first": "F", "middle": [], "last": "Pichaud", "suffix": ""}, {"first": "R", "middle": [], "last": "Sonneville", "suffix": ""}, {"first": "D", "middle": [], "last": "Papatsenko", "suffix": ""}, {"first": "C", "middle": [], "last": "Desplan", "suffix": ""}], "year": 2003, "venue": "Dev. Cell", "link": "2927625"}, "BIBREF9": {"title": "The control of cell fate in the embryonic visual system by atonal, tailless and EGFR signaling", "authors": [{"first": "A", "middle": [], "last": "Daniel", "suffix": ""}, {"first": "K", "middle": [], "last": "Dumstrei", "suffix": ""}, {"first": "J", "middle": ["A"], "last": "Lengyel", "suffix": ""}, {"first": "V", "middle": [], "last": "Hartenstein", "suffix": ""}], "year": 1999, "venue": "Development", "link": "14006830"}, "BIBREF10": {"title": "Regulation of R7 and R8 differentiation by the spalt genes", "authors": [{"first": "P", "middle": ["M"], "last": "Domingos", "suffix": ""}, {"first": "S", "middle": [], "last": "Brown", "suffix": ""}, {"first": "R", "middle": [], "last": "Barrio", "suffix": ""}, {"first": "K", "middle": [], "last": "Ratnakumar", "suffix": ""}, {"first": "B", "middle": ["J"], "last": "Frankfort", "suffix": ""}, {"first": "G", "middle": [], "last": "Mardon", "suffix": ""}, {"first": "H", "middle": [], "last": "Steller", "suffix": ""}, {"first": "B", "middle": [], "last": "Mollereau", "suffix": ""}], "year": 2004, "venue": "Dev. 
Biol", "link": "2738368"}, "BIBREF11": {"title": "Spalt transcription factors are required for R3/R4 specification and establishment of planar cell polarity in the Drosophila eye", "authors": [{"first": "P", "middle": ["M"], "last": "Domingos", "suffix": ""}, {"first": "M", "middle": [], "last": "Mlodzik", "suffix": ""}, {"first": "C", "middle": ["S"], "last": "Mendes", "suffix": ""}, {"first": "S", "middle": [], "last": "Brown", "suffix": ""}, {"first": "H", "middle": [], "last": "Steller", "suffix": ""}, {"first": "B", "middle": [], "last": "Mollereau", "suffix": ""}], "year": 2004, "venue": "Development", "link": "12389921"}, "BIBREF12": {"title": "Interaction between EGFR signaling and DE-cadherin during nervous system morphogenesis", "authors": [{"first": "K", "middle": [], "last": "Dumstrei", "suffix": ""}, {"first": "F", "middle": [], "last": "Wang", "suffix": ""}, {"first": "D", "middle": [], "last": "Shy", "suffix": ""}, {"first": "U", "middle": [], "last": "Tepass", "suffix": ""}, {"first": "V", "middle": [], "last": "Hartenstein", "suffix": ""}], "year": 2002, "venue": "Development", "link": "7313952"}, "BIBREF13": {"title": "spalt-dependent switching between two cell fates that are induced by the Drosophila EGF receptor", "authors": [{"first": "P", "middle": ["R"], "last": "Elstob", "suffix": ""}, {"first": "V", "middle": [], "last": "Brodu", "suffix": ""}, {"first": "A", "middle": ["P"], "last": "Gould", "suffix": ""}], "year": 2001, "venue": "Development", "link": "26064810"}, "BIBREF14": {"title": "Reiterative use of the EGF receptor triggers differentiation of all cell types in the Drosophila eye", "authors": [{"first": "M", "middle": [], "last": "Freeman", "suffix": ""}], "year": 1996, "venue": "Cell", "link": "13295688"}, "BIBREF15": {"title": "The embryonic development of the Drosophila visual system", "authors": [{"first": "P", "middle": [], "last": "Green", "suffix": ""}, {"first": "A", "middle": ["Y"], "last": "Hartenstein", "suffix": ""}, 
{"first": "V", "middle": [], "last": "Hartenstein", "suffix": ""}], "year": 1993, "venue": "Cell Tissue Res", "link": "24868933"}, "BIBREF17": {"title": "atonal regulates neurite arborization but does not act as a proneural gene in the Drosophila brain", "authors": [{"first": "H", "middle": ["Y"], "last": "Zoghbi", "suffix": ""}, {"first": "H", "middle": ["J"], "last": "Bellen", "suffix": ""}], "year": 2000, "venue": "Neuron", "link": "4980890"}, "BIBREF18": {"title": "Photic input pathways that mediate the Drosophila larval response to light and circadian rhythmicity are developmentally related but functionally distinct", "authors": [{"first": "J", "middle": [], "last": "Hassan", "suffix": ""}, {"first": "B", "middle": [], "last": "Iyengar", "suffix": ""}, {"first": "N", "middle": [], "last": "Scantlebury", "suffix": ""}, {"first": "V", "middle": ["R"], "last": "Moncalvo", "suffix": ""}, {"first": "R", "middle": [], "last": "Campos", "suffix": ""}], "year": 2005, "venue": "J. Comp. Neurol", "link": "38371869"}, "BIBREF19": {"title": "An urbilaterian origin of the tripartite brain: Developmental genetic insights from Drosophila", "authors": [{"first": "F", "middle": [], "last": "Hirth", "suffix": ""}, {"first": "L", "middle": [], "last": "Kammermeier", "suffix": ""}, {"first": "E", "middle": [], "last": "Frei", "suffix": ""}, {"first": "U", "middle": [], "last": "Walldorf", "suffix": ""}, {"first": "M", "middle": [], "last": "Noll", "suffix": ""}, {"first": "H", "middle": [], "last": "Reichert", "suffix": ""}], "year": 2003, "venue": "Development", "link": "14084538"}, "BIBREF20": {"title": "atonal is a proneural gene that directs chordotonal organ formation in the Drosophila peripheral nervous system", "authors": [{"first": "A", "middle": ["P"], "last": "Jarman", "suffix": ""}, {"first": "Y", "middle": [], "last": "Grau", "suffix": ""}, {"first": "L", "middle": ["Y"], "last": "Jan", "suffix": ""}, {"first": "Jan", "middle": [], "last": "", "suffix": ""}, {"first": 
"Y", "middle": ["N"], "last": "", "suffix": ""}], "year": 1993, "venue": "Cell", "link": "10169222"}, "BIBREF21": {"title": "Atonal is the proneural gene for Drosophila photoreceptors", "authors": [{"first": "A", "middle": ["P"], "last": "Jarman", "suffix": ""}, {"first": "E", "middle": ["H"], "last": "Grell", "suffix": ""}, {"first": "L", "middle": [], "last": "Ackerman", "suffix": ""}, {"first": "L", "middle": ["Y"], "last": "Jan", "suffix": ""}, {"first": "Jan", "middle": [], "last": "", "suffix": ""}, {"first": "Y", "middle": ["N"], "last": "", "suffix": ""}], "year": 1994, "venue": "Nature", "link": null}, "BIBREF22": {"title": "seven-up Controls switching of transcription factors that specify temporal identities of Drosophila neuroblasts", "authors": [{"first": "M", "middle": ["I"], "last": "Kanai", "suffix": ""}, {"first": "M", "middle": [], "last": "Okabe", "suffix": ""}, {"first": "Y", "middle": [], "last": "Hiromi", "suffix": ""}], "year": 2005, "venue": "Dev. Cell", "link": "22846431"}, "BIBREF23": {"title": "The bHLH-PAS protein Spineless is necessary for the diversification of dendrite morphology of Drosophila dendritic arborization neurons", "authors": [{"first": "M", "middle": ["D"], "last": "Kim", "suffix": ""}, {"first": "L", "middle": ["Y"], "last": "Jan", "suffix": ""}, {"first": "Jan", "middle": [], "last": "", "suffix": ""}, {"first": "Y", "middle": ["N"], "last": "", "suffix": ""}], "year": 2006, "venue": "Genes & Dev", "link": "31640257"}, "BIBREF24": {"title": "Rapid preparation of a panel of polyclonal antibodies to Drosophila segmentation proteins", "authors": [{"first": "D", "middle": [], "last": "Kosman", "suffix": ""}, {"first": "S", "middle": [], "last": "Small", "suffix": ""}, {"first": "J", "middle": [], "last": "Reinitz", "suffix": ""}], "year": 1998, "venue": "Dev. 
Genes Evol", "link": "12838277"}, "BIBREF25": {"title": "Cell fate control in the Drosophila retina by the orphan receptor seven-up: Its role in the decisions mediated by the ras signaling pathway", "authors": [{"first": "S", "middle": [], "last": "Kramer", "suffix": ""}, {"first": "S", "middle": ["R"], "last": "West", "suffix": ""}, {"first": "Y", "middle": [], "last": "Hiromi", "suffix": ""}], "year": 1995, "venue": "Development", "link": "12962397"}, "BIBREF26": {"title": "Dual function of the regionspecific homeotic gene spalt during Drosophila tracheal system development", "authors": [{"first": "R", "middle": ["P"], "last": "Kuhnlein", "suffix": ""}, {"first": "R", "middle": [], "last": "Schuh", "suffix": ""}], "year": 1996, "venue": "Development", "link": "5808703"}, "BIBREF27": {"title": "spalt encodes an evolutionarily conserved zinc finger protein of novel structure which provides homeotic gene function in the head and tail region of the Drosophila embryo", "authors": [{"first": "R", "middle": ["P"], "last": "Kuhnlein", "suffix": ""}, {"first": "G", "middle": [], "last": "Frommer", "suffix": ""}, {"first": "M", "middle": [], "last": "Friedrich", "suffix": ""}, {"first": "M", "middle": [], "last": "Gonzalez-Gaitan", "suffix": ""}, {"first": "A", "middle": [], "last": "Weber", "suffix": ""}, {"first": "J", "middle": ["F"], "last": "Wagner-Bernholz", "suffix": ""}, {"first": "W", "middle": ["J"], "last": "Gehring", "suffix": ""}, {"first": "H", "middle": [], "last": "Jackle", "suffix": ""}, {"first": "R", "middle": [], "last": "Schuh", "suffix": ""}], "year": 1994, "venue": "EMBO J", "link": "26116881"}, "BIBREF28": {"title": "Ectopic and increased expression of Fasciclin II alters motoneuron growth cone guidance", "authors": [{"first": "D", "middle": ["M"], "last": "Lin", "suffix": ""}, {"first": "C", "middle": ["S"], "last": "Goodman", "suffix": ""}], "year": 1994, "venue": "Neuron", "link": "11045288"}, "BIBREF29": {"title": "Larval optic nerve and adult 
extra-retinal photoreceptors sequentially associate with clock neurons during Drosophila brain development", "authors": [{"first": "S", "middle": [], "last": "Malpel", "suffix": ""}, {"first": "A", "middle": [], "last": "Klarsfeld", "suffix": ""}, {"first": "F", "middle": [], "last": "Rouyer", "suffix": ""}], "year": 2002, "venue": "Development", "link": "24568065"}, "BIBREF30": {"title": "Circadian synchronization and rhythmicity in larval photoperception-defective mutants of Drosophila", "authors": [{"first": "S", "middle": [], "last": "Malpel", "suffix": ""}, {"first": "A", "middle": [], "last": "Klarsfeld", "suffix": ""}, {"first": "F", "middle": [], "last": "Rouyer", "suffix": ""}], "year": 2004, "venue": "J. Biol. Rhythms", "link": "14067551"}, "BIBREF31": {"title": "Circadian pacemaker neurons transmit and modulate visual information to control a rapid behavioral response", "authors": [{"first": "E", "middle": ["O"], "last": "Mazzoni", "suffix": ""}, {"first": "C", "middle": [], "last": "Desplan", "suffix": ""}, {"first": "J", "middle": [], "last": "Blau", "suffix": ""}], "year": 2005, "venue": "Neuron", "link": "9568853"}, "BIBREF32": {"title": "Flipping coins in the fly retina", "authors": [{"first": "T", "middle": [], "last": "Mikeladze-Dvali", "suffix": ""}, {"first": "C", "middle": [], "last": "Desplan", "suffix": ""}, {"first": "D", "middle": [], "last": "Pistillo", "suffix": ""}], "year": 2005, "venue": "Curr. Top. Dev. 
Biol", "link": "6921311"}, "BIBREF33": {"title": "The growth regulators warts/lats and melted interact in a bistable loop to specify opposite fates in Drosophila R8 photoreceptors", "authors": [{"first": "T", "middle": [], "last": "Mikeladze-Dvali", "suffix": ""}, {"first": "M", "middle": ["F"], "last": "Wernet", "suffix": ""}, {"first": "D", "middle": [], "last": "Pistillo", "suffix": ""}, {"first": "E", "middle": ["O"], "last": "Mazzoni", "suffix": ""}, {"first": "A", "middle": ["A"], "last": "Teleman", "suffix": ""}, {"first": "Y", "middle": ["W"], "last": "Chen", "suffix": ""}, {"first": "S", "middle": [], "last": "Cohen", "suffix": ""}, {"first": "C", "middle": [], "last": "Desplan", "suffix": ""}], "year": 2005, "venue": "Cell", "link": "6632308"}, "BIBREF34": {"title": "The Drosophila seven-up gene, a member of the steroid receptor gene superfamily, controls photoreceptor cell fates", "authors": [{"first": "M", "middle": [], "last": "Mlodzik", "suffix": ""}, {"first": "Y", "middle": [], "last": "Hiromi", "suffix": ""}, {"first": "U", "middle": [], "last": "Weber", "suffix": ""}, {"first": "C", "middle": ["S"], "last": "Goodman", "suffix": ""}, {"first": "G", "middle": ["M"], "last": "Rubin", "suffix": ""}], "year": 1990, "venue": "Cell", "link": "13427369"}, "BIBREF35": {"title": "A green fluorescent protein enhancer trap screen in Drosophila photoreceptor cells", "authors": [{"first": "B", "middle": [], "last": "Mollereau", "suffix": ""}, {"first": "M", "middle": ["F"], "last": "Wernet", "suffix": ""}, {"first": "P", "middle": [], "last": "Beaufils", "suffix": ""}, {"first": "D", "middle": [], "last": "Killian", "suffix": ""}, {"first": "F", "middle": [], "last": "Pichaud", "suffix": ""}, {"first": "R", "middle": [], "last": "Kuhnlein", "suffix": ""}, {"first": "C", "middle": [], "last": "Desplan", "suffix": ""}], "year": 2000, "venue": "Mech. 
Dev", "link": "16869461"}, "BIBREF36": {"title": "Two-step process for photoreceptor formation in Drosophila", "authors": [{"first": "B", "middle": [], "last": "Mollereau", "suffix": ""}, {"first": "M", "middle": [], "last": "Dominguez", "suffix": ""}, {"first": "R", "middle": [], "last": "Webel", "suffix": ""}, {"first": "N", "middle": ["J"], "last": "Colley", "suffix": ""}, {"first": "B", "middle": [], "last": "Keung", "suffix": ""}, {"first": "J", "middle": ["F"], "last": "De Celis", "suffix": ""}, {"first": "C", "middle": [], "last": "Desplan", "suffix": ""}], "year": 2001, "venue": "Nature", "link": "4426584"}, "BIBREF37": {"title": "Genetic dissection of trophic interactions in the larval optic neuropil of Drosophila melanogaster", "authors": [{"first": "V", "middle": ["G R"], "last": "Moncalvo", "suffix": ""}, {"first": "A", "middle": ["R"], "last": "Campos", "suffix": ""}], "year": 2005, "venue": "Dev. Biol", "link": null}, "BIBREF38": {"title": "The little R cell that could", "authors": [{"first": "R", "middle": [], "last": "Nagaraj", "suffix": ""}, {"first": "U", "middle": [], "last": "Banerjee", "suffix": ""}], "year": 2004, "venue": "Int. J. Dev. 
Biol", "link": "38928656"}, "BIBREF39": {"title": "Two-step induction of chordotonal organ precursors in Drosophila embryogenesis", "authors": [{"first": "M", "middle": [], "last": "Okabe", "suffix": ""}, {"first": "H", "middle": [], "last": "Okano", "suffix": ""}], "year": 1997, "venue": "Development", "link": "86925"}, "BIBREF40": {"title": "Spitz and Wingless, emanating from distinct borders, cooperate to establish cell fate across the Engrailed domain in the Drosophila epidermis", "authors": [{"first": "L", "middle": [], "last": "O'keefe", "suffix": ""}, {"first": "S", "middle": ["T"], "last": "Dougan", "suffix": ""}, {"first": "L", "middle": [], "last": "Gabay", "suffix": ""}, {"first": "E", "middle": [], "last": "Raz", "suffix": ""}, {"first": "B", "middle": ["Z"], "last": "Shilo", "suffix": ""}, {"first": "S", "middle": [], "last": "Dinardo", "suffix": ""}], "year": 1997, "venue": "Development", "link": "6535460"}, "BIBREF41": {"title": "The activities of two Ets-related transcription factors required for Drosophila eye development are modulated by the Ras/ MAPK pathway", "authors": [{"first": "E", "middle": ["M"], "last": "O'neill", "suffix": ""}, {"first": "I", "middle": [], "last": "Rebay", "suffix": ""}, {"first": "R", "middle": [], "last": "Tjian", "suffix": ""}, {"first": "G", "middle": ["M"], "last": "Rubin", "suffix": ""}], "year": 1994, "venue": "Cell", "link": null}, "BIBREF42": {"title": "A new rhodopsin in R8 photoreceptors of Drosophila: Evidence for coordinate expression with Rh3 in R7 cells", "authors": [{"first": "D", "middle": [], "last": "Papatsenko", "suffix": ""}, {"first": "G", "middle": [], "last": "Sheng", "suffix": ""}, {"first": "C", "middle": [], "last": "Desplan", "suffix": ""}], "year": 1997, "venue": "Development", "link": "23194626"}, "BIBREF43": {"title": "Transcript localization of four opsin genes in the three visual organs of Drosophila: RH2 is ocellus specific", "authors": [{"first": "J", "middle": [], "last": "Pollock", 
"suffix": ""}, {"first": "S", "middle": [], "last": "Benzer", "suffix": ""}], "year": 1988, "venue": "Nature", "link": null}, "BIBREF44": {"title": "The sevenless signaling pathway: Variations of a common theme", "authors": [{"first": "T", "middle": [], "last": "Raabe", "suffix": ""}], "year": 2000, "venue": "Biochim. Biophys. Acta", "link": "27219420"}, "BIBREF45": {"title": "Yan functions as a general inhibitor of differentiation and is negatively regulated by activation of the Ras1/MAPK pathway", "authors": [{"first": "I", "middle": [], "last": "Rebay", "suffix": ""}, {"first": "G", "middle": ["M"], "last": "Rubin", "suffix": ""}], "year": 1995, "venue": "Cell", "link": "1888229"}, "BIBREF46": {"title": "Formation of the Drosophila larval photoreceptor organ and its neuronal differentiation require continuous Kruppel gene activity", "authors": [{"first": "D", "middle": [], "last": "Schmucker", "suffix": ""}, {"first": "H", "middle": [], "last": "Taubert", "suffix": ""}, {"first": "Jackle", "middle": [], "last": "", "suffix": ""}, {"first": "H", "middle": [], "last": "", "suffix": ""}], "year": 1992, "venue": "Neuron", "link": null}, "BIBREF47": {"title": "Chromophore-assisted laser inactivation of patched protein switches cell fate in the larval visual system of Drosophila", "authors": [{"first": "D", "middle": [], "last": "Schmucker", "suffix": ""}, {"first": "A", "middle": ["L"], "last": "Su", "suffix": ""}, {"first": "A", "middle": [], "last": "Beermann", "suffix": ""}, {"first": "H", "middle": [], "last": "Jackle", "suffix": ""}, {"first": "Jay", "middle": [], "last": "", "suffix": ""}, {"first": "D", "middle": ["G"], "last": "", "suffix": ""}], "year": 1994, "venue": "Proc. Natl. Acad. 
Sci", "link": "24976004"}, "BIBREF48": {"title": "Genetic analysis of the larval optic nerve projection in Drosophila", "authors": [{"first": "D", "middle": [], "last": "Schmucker", "suffix": ""}, {"first": "H", "middle": [], "last": "Jackle", "suffix": ""}, {"first": "U", "middle": [], "last": "Gaul", "suffix": ""}], "year": 1997, "venue": "Development", "link": "18791028"}, "BIBREF49": {"title": "The columnar gene vnd is required for tritocerebral neuromere formation during embryonic brain development of Drosophila", "authors": [{"first": "S", "middle": ["G"], "last": "Sprecher", "suffix": ""}, {"first": "R", "middle": [], "last": "Urbach", "suffix": ""}, {"first": "G", "middle": ["M"], "last": "Technau", "suffix": ""}, {"first": "F", "middle": ["M"], "last": "Rijli", "suffix": ""}, {"first": "H", "middle": [], "last": "Reichert", "suffix": ""}, {"first": "F", "middle": [], "last": "Hirth", "suffix": ""}], "year": 2006, "venue": "Development", "link": "11479978"}, "BIBREF50": {"title": "Transcriptional regulation of atonal required for Drosophila larval eye development by concerted action of eyes absent, sine oculis and hedgehog signaling independent of fused kinase and cubitus interruptus", "authors": [{"first": "T", "middle": [], "last": "Suzuki", "suffix": ""}, {"first": "K", "middle": [], "last": "Saigo", "suffix": ""}], "year": 2000, "venue": "Development", "link": "29423415"}, "BIBREF51": {"title": "Otd/ Crx, a dual regulator for the specification of ommatidia subtypes in the Drosophila retina", "authors": [{"first": "A", "middle": [], "last": "Tahayato", "suffix": ""}, {"first": "R", "middle": [], "last": "Sonneville", "suffix": ""}, {"first": "F", "middle": [], "last": "Pichaud", "suffix": ""}, {"first": "M", "middle": ["F"], "last": "Wernet", "suffix": ""}, {"first": "D", "middle": [], "last": "Papatsenko", "suffix": ""}, {"first": "P", "middle": [], "last": "Beaufils", "suffix": ""}, {"first": "T", "middle": [], "last": "Cook", "suffix": ""}, {"first": 
"C", "middle": [], "last": "Desplan", "suffix": ""}], "year": 2003, "venue": "Dev. Cell", "link": "14985756"}, "BIBREF52": {"title": "Embryonic development of the Drosophila brain: Formation of commissural and descending pathways", "authors": [{"first": "S", "middle": [], "last": "Therianos", "suffix": ""}, {"first": "S", "middle": [], "last": "Leuzinger", "suffix": ""}, {"first": "F", "middle": [], "last": "Hirth", "suffix": ""}, {"first": "C", "middle": ["S"], "last": "Goodman", "suffix": ""}, {"first": "H", "middle": [], "last": "Reichert", "suffix": ""}], "year": 1995, "venue": "Development", "link": "25263392"}, "BIBREF53": {"title": "Functional analysis of CNK in RAS signaling", "authors": [{"first": "M", "middle": [], "last": "Therrien", "suffix": ""}, {"first": "A", "middle": ["M"], "last": "Wong", "suffix": ""}, {"first": "E", "middle": [], "last": "Kwan", "suffix": ""}, {"first": "G", "middle": ["M"], "last": "Rubin", "suffix": ""}], "year": 1999, "venue": "Proc. Natl. Acad. Sci", "link": "22661344"}, "BIBREF54": {"title": "orthodenticle is required for photoreceptor cell development in the Drosophila eye", "authors": [{"first": "E", "middle": ["R"], "last": "Vandendries", "suffix": ""}, {"first": "D", "middle": [], "last": "Johnson", "suffix": ""}, {"first": "R", "middle": [], "last": "Reinke", "suffix": ""}], "year": 1996, "venue": "Dev. 
Biol", "link": "19604448"}, "BIBREF55": {"title": "Building a retinal mosaic: Cell-fate decision in the fly eye", "authors": [{"first": "M", "middle": ["F"], "last": "Wernet", "suffix": ""}, {"first": "C", "middle": [], "last": "Desplan", "suffix": ""}], "year": 2004, "venue": "Trends Cell Biol", "link": "40199283"}, "BIBREF56": {"title": "Stochastic spineless expression creates the retinal mosaic for colour vision", "authors": [{"first": "M", "middle": ["F"], "last": "Wernet", "suffix": ""}, {"first": "E", "middle": ["O"], "last": "Mazzoni", "suffix": ""}, {"first": "A", "middle": [], "last": "Celik", "suffix": ""}, {"first": "D", "middle": ["M"], "last": "Duncan", "suffix": ""}, {"first": "I", "middle": [], "last": "Duncan", "suffix": ""}, {"first": "C", "middle": [], "last": "Desplan", "suffix": ""}], "year": 2006, "venue": "Nature", "link": "1211520"}, "BIBREF57": {"title": "Antagonism of EGFR and notch signalling in the reiterative recruitment of Drosophila adult chordotonal sense organ precursors", "authors": [{"first": "P", "middle": [], "last": "Zur Lage", "suffix": ""}, {"first": "A", "middle": ["P"], "last": "Jarman", "suffix": ""}], "year": 1999, "venue": "Development", "link": "1057283"}, "BIBREF58": {"title": "EGF receptor signaling triggers recruitment of Drosophila sense organ precursors by stimulating proneural gene autoregulation", "authors": [{"first": "P", "middle": ["I"], "last": "Zur Lage", "suffix": ""}, {"first": "L", "middle": ["M"], "last": "Powell", "suffix": ""}, {"first": "D", "middle": ["R"], "last": "Prentice", "suffix": ""}, {"first": "P", "middle": [], "last": "Mclaughlin", "suffix": ""}, {"first": "A", "middle": ["P"], "last": "Jarman", "suffix": ""}], "year": 2004, "venue": "Dev. 
Cell", "link": "196706"}, "BIBREF59": {"title": "Access the most recent version at doi", "authors": [], "year": 2007, "venue": "", "link": null}, "BIBREF61": {"title": "This article cites 57 articles, 21 of which can be accessed free at", "authors": [], "year": "", "venue": "", "link": null}}, "ref_entries": {"FIGREF0": {"text": "Figure 1. Development of BO and rhodopsin expression. (A) Lateral view of the head region of an ato-Gal4, UAS-lacZ embryo stained with anti-\u2424-Gal (red) and antiNeurotactin (Nrt) (green). ato-lacZ expression is found in BO precursors (arrow) at late stage 11. (B) Dorsal view of a stage 15 embryonic head: Anti-FasII (green) staining shows the position of BO (arrow). (C) High-magnification image of the BO at late stage 17 immunolabeled with anti-Rh5 (green), anti-Rh6 (red), and anti-Elav (blue). A total of 12 PRs are observed: Eight are stained by anti-Rh6 antibody, and four are stained with anti-Rh5 antibody. (Inset) Three-dimensional reconstruction of the BO PRs. (D) High-magnification image of the BO in third instar larvae immunolabeled with anti-Rh5 (green), anti-Rh6 (red), and anti-Elav (blue). A total of 12 PRs are observed: Eight are stained by anti-Rh6 antibody, and four are stained with anti-Rh5 antibody. PRs build up arborization-like protrusions (arrow).", "type": "figure"}, "FIGREF1": {"text": "Figure 2. wts and melt are not involved in Rh5-and Rh6-subtype specification. High-magnification images of third instar mutant larva BO of melt (A), wts (B), so-Gal4/UAS-melt (C), and so-Gal4/UAS-wts (D) labeled with anti-Rh5 (green, arrow), anti-Rh6 (red), and anti-Elav (blue). (A-D) melt and wts mutants, as well as so-Gal4/UAS-wts and so-Gal4/UAS-melt, display an expression of Rh5 and Rh6 PRs comparable with wild-type larvae. (E) Comparison of the total number of PRs (black bar) and Rh6 (white bar) and Rh5 (red bar) PRs in wild-type (wt), melt LOF, wts LOF, melt GOF, and wats GOF larval eyes (error bars, SD). 
The number of total PRs and Rh5 and Rh6 PRs in all conditions are comparable with wild type.", "type": "figure"}, "FIGREF2": {"text": "Figure 3. Function of EGFR signaling in Rh6 PR development. (A) High-magnification image of stage 15 wild-type embryonic BO labeled with anti-FasII (green) and antiKr (red). (B) High-magnification image of stage 15 so-Gal4 > UAS-EGFR dn embryonic BO labeled with anti-FasII (green) and antiKr (red). Only three Kr-expressing immature PRs are found (arrows) as compared with 12 in the wild type (shown in A). (C) High-magnification image of third instar larva wild-type BO labeled with anti-Rh5 (green), anti-Rh6 (red), and anti-Elav (blue). A total of 12 PRs are observed: Eight are stained by anti-Rh6 antibody, and four are stained with anti-Rh5 antibody. (D) Highmagnification image of third instar larva so-Gal4 > UAS-EGFR dn BO labeled with anti-Rh5 (green), anti-Rh6 (red), and antiElav (blue). Only three Rh5-expressing cells are found (arrows). (E) High-magnification image of third instar larva wild-type BO labeled with anti-Chp (green, 24B10), anti-Sal (red), and anti-Elav (blue). (F) High-magnification image of third instar larva soGal4,UAS-EGFR dn BO labeled with antiChp (green, 24B10), anti-Sal (red), and antiElav (blue). All four PRs found express Sal.", "type": "figure"}, "FIGREF3": {"text": "Figure 4. Function of EGFR, tll, and apoptosis in Rh6-subtype development. (A) High-magnification image of stage 15 wild-type embryonic BO labeled with anti-FasII (green) and anti-Elav (red). (B) High-magnification image of stage 15 tll mutant embryonic BO labeled with antiFasII (green) and anti-Elav (red). The number of immature PRs is increased to \u223c20-25. (C) High-magnification image of stage 17 wild-type BO labeled with anti-Rh5 (green), anti-Rh6 (red), and anti-Elav (blue). (Inset) Threedimensional reconstruction of the BO PRs. 
A total of 12 PRs are observed: Eight are stained by anti-Rh6 antibody, and four are stained with anti-Rh5 antibody. (D) Highmagnification image of stage 17 tll mutant BO labeled with anti-Rh5 (green), anti-Rh6 (red), and anti-Elav (blue). (Inset) Three-dimensional reconstruction of the BO PRs. The number of Rh6 PRs is increased to \u223c20-25; the number of Rh5 PRs remains unchanged. (E) Highmagnification image of third instar larva wild-type BO labeled with anti-Rh5 (green), anti-Rh6 (red), and antiElav (blue). (F) High-magnification image of third instar larva so-Gal4,UAS-p35 BO labeled with anti-Rh5 (green), anti-Rh6 (red), and anti-Elav (blue). The number of Rh6 PRs is largely increased to 20-25. (G) High-magnification image of third instar larva so-Gal4,UAS-p35, UAS-EGFR dn BO labeled with anti-Rh5 (green), anti-Rh6 (red), and anti-Elav (blue). The number of Rh6 PRs is largely increased. The number of Rh6 PRs is increased to \u223c20-25; the number of Rh5 PRs remains unchanged.", "type": "figure"}, "FIGREF4": {"text": "Figure 5. Expression of sal and svp in the developing and mature larval PRs. (A) Dorsal view of a sal-Gal4,UAS-H2B-YFP stage 15 embryonic head labeled with anti-GFP (green), anti-FasII (red), and anti-Elav (blue), and GFP expression in BO (arrow). (B) Highmagnification image of the BO in A. Four cells are labeled by anti-GFP staining (arrow). (C) High-magnification image of BO in third instar larva sal-Gal4,UAS-H2B-YFP, labeled with anti-GFP (red), anti-Rh5 (green), and anti-Elav (blue). Anti-GFP labeling coincides with anti-Rh5 staining (arrow). (D) High-magnification image of BO in third instar larva sal-Gal4,UAS-H2B-YFP, labeled with anti-GFP (green), anti-Rh6 (red), and anti-Elav (blue). Anti-GFP labeling is excluded from anti-Rh6 staining (arrows). (E) Dorsal view of a svp-lacZ stage 15 embryonic head labeled with anti-FasII (green) and anti-\u2424Gal (red) expression in BO (arrows). 
(F,F\u0408) High-magnification image of the BO in E; individual optical sections show four cells are devoid of anti-\u2424Gal staining (arrows). (G) High-magnification image of BO in third instar larva svp-Gal4,UAS-H2B-YFP, labeled with anti-GFP (red), anti-Rh5 (green), and anti-Elav (blue). Anti-GFP labeling is excluded from anti-Rh5 staining (arrows). (H) High-magnification image of BO in third instar larva svp-Gal4,UAS-H2B-YFP, labeled with anti-GFP (green), anti-Rh6 (red), and anti-Elav (blue). Anti-GFP labeling coincides with anti-Rh6 staining, and is excluded for the remaining four PRs (arrows).", "type": "figure"}, "FIGREF5": {"text": "Figure 6. Function of sal and svp in Rh5-and Rh6-subtype specification. (A) Highmagnification image of stage 17 wild-type BO labeled with anti-Rh5 (green), anti-Rh6 (red), and anti-Elav (blue). (Inset) Three-dimensional reconstruction of the BO PRs. (B) High-magnification image of stage 17 svp mutant BO labeled with anti-Rh5 (green), anti-Rh6 (red), and anti-Elav (blue). A total of 12 PRs are found, all expressing Rh5. (Inset) Three-dimensional reconstruction of the BO PRs. (C) High-magnification image of stage 17 sal mutant BO labeled with anti-Rh5 (green), anti-Rh6 (red), and anti-Elav (blue). A total of 12 PRs are detected: Eight express Rh6, whereas four are devoid of anti-Rh5 or anti-Rh6 staining. (D) High-magnification image of stage 17 wildtype BO labeled with anti-Chp (green), anti-Sal (red), and anti-Elav (blue). Four cells express Sal (arrow). (E) High-magnification image of stage 17 svp mutant BO labeled with anti-Chp (green), anti-Sal (red), and anti-Elav (blue). All PRs express Sal (arrow). (F) High-magnification image of third instar larva so-Gal4,UAS-svp BO labeled with anti-Rh5 (green), anti-Rh6 (red), and anti-Elav (blue). All PRs are labeled by anti-Rh6; no anti-Rh5 staining is detected. (G) Axonal projections of larval PRs in sal mutant stage 17 embryos labeled with anti-FasII (green) and anti-Rh6 (red). 
Axonal terminations are properly formed (arrow). (H,I) High-magnification image of termini of larval PRs in sal mutant labeled with anti-FasII (green) and anti-Rh6 (red). Projections of \"empty\" PRs are devoid of Rh6 expression but labeled with the general marker FasII (cf. arrows in H,I).", "type": "figure"}, "FIGREF6": {"text": "Figure 7. Expression and function of otd in developing and mature larval PRs. (A) Lateral view of a stage 7 embryo (procephalic region) stained with anti-Nrt (green) and anti-Otd (red); the region giving rise to the optic lobe anlage and larval PRs expresses Otd (arrow). (B) Lateral view of a so-Gal4/UAS-H2B-YFP stage 10 embryonic head region stained with anti-GFP (green) and anti-Otd (red); the region giving rise to larval PRS (ventral/lateral tip) is devoid of Otd expression. (C) Dorsal view of a wild-type stage 15 embryonic head labeled with anti-FasII (green), anti-Otd (red), and anti-Otd staining in BO, indicated by arrows. (C\u0408) High-magnification image of C. All immature PRs are labeled by anti-Otd staining. (D) High-magnification image of embryonic stage 15 otd-Gal4/UAS-CD8\u03fbGFP BO labeled with anti-FasII (green) and anti-GFP (red). All immature PRs are labeled by anti-GFP staining. (E) High-magnification image of third instar larva otd-Gal4/UAS-CD8\u03fbGFP BO labeled with anti-GFP (blue) and anti-Rh6 (red). (F) High-magnification image of third instar larva otd-Gal4/UAS-CD8\u03fbGFP BO labeled with anti-GFP (blue) and anti-Rh5 (green). (G) High-magnification image of third instar larva otd vui mutant BO labeled with anti-Rh6 (red) and anti-Elav (blue). All PRs are labeled by anti-Rh6 staining. (H) Highmagnification image of third instar larva otd vui mutant BO labeled with anti-Chp (green), anti-Sal (red), and anti-Elav (blue). Four PRs are labeled by anti-Sal staining.", "type": "figure"}, "FIGREF7": {"text": "Figure 8. 
Proposed model for development and the specification of rhodopsin fates in the larval eye and comparison with the adult R8. (A) Primary precursors (1\u00b0, blue) express the TGF\u2423 homolog spi, which is required in secondary precursor cells (2\u00b0, green) for their survival. tll acts in the surrounding tissue to inhibit secondary precursor development. Primary precursors give rise to the Rh5 PR subtype, whereas secondary precursors give rise to the Rh6 PR subtype. In the Rh5 PR subtype, sal and otd are required for Rh5 expression, and otd further for the repression of Rh6. In the Rh6 subtype, svp is required for Rh6 expression and for the repression of sal expression. (B) The negative feedback loop of wts and melt mediates the decision of R8 to express Rh6 or Rh5. Which way the loop swings depends on an instructive signal of the overlying R7 cell. The presence of gene expression is indicated by black type and its absence is indicated with gray type. Arrows shown in black (for activation) and red (for repression) indicate an active interaction; gray arrows indicate the absence of this interaction.", "type": "figure"}, "TABREF0": {"text": ") (compa- rable results were obtained using svp-Gal4, svp-LacZ, or", "type": "table"}, "TABREF1": {"text": "), svp 724 -Gal4 (Kyoto Stock Center; kindly provided by J. Urban), sal-Gal4 (Mollereau et al. 2000), UAS-EGFR dn (O'Keefe et al. 1997), wts P1 , melt D1 , melt-LacZ, wts-Gal4, UAS-melt, UAS-lats (Mikeladze-Dvali et al. 2005b), UAS-H2B\u03fbYFP (anti- GFP antibody/Biogenesis recognizes the YFP antigen), r (Bel- laiche et al. 2001), UAS-salm", "type": "table"}}}
diff --git a/s2orc-doc2json/tests/s2orc/20200705/94551546.json b/s2orc-doc2json/tests/s2orc/20200705/94551546.json
new file mode 100644
index 0000000000000000000000000000000000000000..1954c4edc139a279dfa8d33ae29a4c4505e168d6
--- /dev/null
+++ b/s2orc-doc2json/tests/s2orc/20200705/94551546.json
@@ -0,0 +1 @@
+{"paper_id": "94551546", "_pdf_hash": "9bf1cb19041b8ddfca7aeccc9d2f7689c8aa1c7e", "abstract": [{"section": "Abstract", "text": "Ethanolamine (EA) or ethylenediamine (ED)-functionalized poly(glycidyl methacrylate) (PGMA), namely PGEA or PGED, has recently been used as effective gene carriers because of their low cytotoxicity and high transfection efficiency. In this study, a series of PGMA-based supramolecular polycations (PGED-Gd@PGEAs) with magnetic resonance imaging (MRI) functions were readily constructed by assembling multiple adamantine-headed star PGEA (Ad-PGEA) units with a versatile PGED-CD-Gd backbone, which possessed numerous flanking \u03b2-cyclodextrin species and Gd 3+ ions. The properties of different PGED-Gd@PGEA vectors were systematically characterized, including the plasmid DNA condensation ability, cytotoxicity, gene transfection efficiency, cellular uptake and MRI function. Such supramolecular gene vectors had lower toxicity than 'gold standard' polyethylenimine (PEI, 25 kDa). Furthermore, PGED-Gd@PGEAs exhibited significantly higher transfection efficiencies than PEI or the constituent units (PGED-CD-Gd and Ad-PGEA). The chelation of Gd 3+ ions imparted the PGED-Gd@PGEA vectors with a good MRI ability without obvious adverse effects. The present design of PGMA-based supramolecular polycations with Gd 3+ chelation would provide useful information for the development of low-toxicity and high-efficiency multifunctional gene delivery systems.", "cite_spans": [], "ref_spans": []}], "body_text": [{"section": "INTRODUCTION", "text": "Gene therapy holds potential for treating many severe diseases, such as cancer and genetic diseases. 1 Successful gene therapy depends on highefficiency gene delivery processes, in which the gene carriers have an essential role. The application of traditional viral vectors has been a challenge because of their toxicity, immunogenicity and low capability for scaling up. 
2 There has long been a scientific demand for developing non-viral gene delivery systems that can overcome the drawbacks of viral vectors. 3 Non-viral gene delivery has been advanced by the rapid development of materials science and technology. Numerous novel gene delivery systems have been proposed based on functional cationic polymers, such as polyethylenimine (PEI), [4] [5] [6] [7] [8] [9] poly(2-(dimethylamino) ethyl methacrylate), 4,10 poly(L-lysine), 11 poly(aspartic acid) 12, 13 and polyamidoamine. 14 However, these non-viral gene carriers still have shortcomings, including cytotoxicity, low transfection efficiency and lack of multifunction.", "cite_spans": [{"start": 101, "end": 102, "text": "1", "ref_id": "BIBREF0"}, {"start": 372, "end": 373, "text": "2", "ref_id": "BIBREF1"}, {"start": 511, "end": 512, "text": "3", "ref_id": "BIBREF2"}, {"start": 744, "end": 747, "text": "[4]", "ref_id": "BIBREF3"}, {"start": 748, "end": 751, "text": "[5]", "ref_id": "BIBREF4"}, {"start": 752, "end": 755, "text": "[6]", "ref_id": "BIBREF5"}, {"start": 756, "end": 759, "text": "[7]", "ref_id": "BIBREF6"}, {"start": 760, "end": 763, "text": "[8]", "ref_id": "BIBREF7"}, {"start": 764, "end": 767, "text": "[9]", "ref_id": "BIBREF8"}, {"start": 833, "end": 835, "text": "11", "ref_id": "BIBREF10"}, {"start": 856, "end": 859, "text": "12,", "ref_id": "BIBREF11"}, {"start": 860, "end": 862, "text": "13", "ref_id": "BIBREF12"}, {"start": 883, "end": 885, "text": "14", "ref_id": "BIBREF13"}], "ref_spans": []}, {"section": "INTRODUCTION", "text": "Recently, we found that ethanolamine (EA) or ethylenediamine (ED)-functionalized poly(glycidyl methacrylate) (PGMA), namely PGEA or PGED, could be used as effective gene carriers. 15, 16 They possess good gene transfection properties. To further improve the performance of PGMA-based gene carriers, several strategies have been applied such as polysaccharide introduction and target molecule binding. 
16, 17 Owing to the dynamically unable ability of supramolecular polymers, the application of supramolecular chemistry for gene delivery has been a hot research topic in the biomedical field. 18, 19 The construction of supramolecular polycations via host-guest interaction is a popular strategy for high-efficiency gene delivery systems. 20 In particular, cyclodextrins (CDs) and their derivatives have been widely utilized for constructing supramolecular gene delivery systems, mainly because of their superior biocompatibility. [21] [22] [23] With the host-guest interaction strategy, we successfully prepared one PGEA-based supramolecular delivery system by tying multiple \u03b2-cyclodextrin (\u03b2-CD)cored star PGEA polymers to an adamantine-modified linear PGEA backbone. 24 Such PGEA supramolecules markedly increased transfection efficiencies. Further improvements in functionality and the development of new preparation strategies for PGMA-based supramolecular vectors would benefit the construction of better gene delivery systems.", "cite_spans": [{"start": 180, "end": 183, "text": "15,", "ref_id": "BIBREF14"}, {"start": 184, "end": 186, "text": "16", "ref_id": "BIBREF15"}, {"start": 401, "end": 404, "text": "16,", "ref_id": "BIBREF15"}, {"start": 405, "end": 407, "text": "17", "ref_id": "BIBREF16"}, {"start": 593, "end": 596, "text": "18,", "ref_id": "BIBREF17"}, {"start": 597, "end": 599, "text": "19", "ref_id": "BIBREF18"}, {"start": 739, "end": 741, "text": "20", "ref_id": "BIBREF19"}, {"start": 931, "end": 935, "text": "[21]", "ref_id": "BIBREF20"}, {"start": 936, "end": 940, "text": "[22]", "ref_id": "BIBREF21"}, {"start": 941, "end": 945, "text": "[23]", "ref_id": "BIBREF22"}, {"start": 1171, "end": 1173, "text": "24", "ref_id": "BIBREF23"}], "ref_spans": []}, {"section": "INTRODUCTION", "text": "To construct multifunctional supramolecular vectors, a novel strategy was proposed to flexibly prepare PGMA-based supramolecular delivery systems (PGED-Gd@PGEAs) with 
magnetic resonance imaging (MRI) functionality, by assembling multiple adamantine-headed star PGEA (Ad-PGEA) units with a versatile PGED-CD-Gd backbone (Scheme 1). This backbone possessed numerous flanking \u03b2-CD species and Gd 3+ ions. MRI technology has received considerable attention because of its high spatial resolution and its applications in areas such as deep tissue imaging. 25, 26 In particular, Gd 3+ ions have been used as contrast agents because of their optimal chemical and magnetic properties. 27, 28 In this work, Gd ions were chelated by diethylenetriaminepentacetate acid (DTPA) immobilized on the PGED backbones to introduce the MRI effect into the resultant PGED-Gd@PGEA supramolecular systems. The physicochemical properties of the PGED-Gd@PGEA assemblies, including plasmid DNA (pDNA) condensation ability, cytotoxicity, gene transfection, cellular uptake and MRI functionality, were examined in detail. The present work provides a new strategy to design multifunctional supramolecular delivery systems.", "cite_spans": [{"start": 551, "end": 554, "text": "25,", "ref_id": "BIBREF24"}, {"start": 555, "end": 557, "text": "26", "ref_id": "BIBREF25"}, {"start": 677, "end": 680, "text": "27,", "ref_id": "BIBREF26"}, {"start": 681, "end": 683, "text": "28", "ref_id": "BIBREF27"}], "ref_spans": []}, {"section": "EXPERIMENTAL PROCEDURE Materials", "text": "Branched PEI (Mw~25 kDa), \u03b2-CD (99%), epichlorohydrin (EP, 99%), 1-adamantanecarboxylic acid chloride (98%), pentaerythritol (98%), 2-bromoisobutyryl bromide (98%), ethyl bromoisobutyrate (99%), glycidyl methacrylate (GMA, 98%), N,N,N\u2032,N\u2033,N\u2033-pentamethyldiethylenetriamine (99%), copper(I) bromide (99%), EA (98%), ED (98%), DTPA (98%), carbodiimide hydrochloride (98%), gadolinium(III) chloride hexahydrate (99%) and 3-(4,5-dimethylthiazol-2yl)-2,5-diphenyl tetrazolium bromide were bought from Sigma-Aldrich Chemical Co., St Louis, MO, USA. GMA was used after removal of the inhibitors. 
The plasmid pRL-CMV, encoding Renilla luciferase, (Promega Co., Cergy Pontoise, France), and the plasmid pEGFP-N1, encoding enhanced green fluorescent protein (EGFP) (BD Biosciences, San ", "cite_spans": [], "ref_spans": []}, {"section": "Ad-Br", "text": "", "cite_spans": [], "ref_spans": []}, {"section": "PGED-CD", "text": "", "cite_spans": [], "ref_spans": []}, {"section": "PGED-CD-Gd", "text": "", "cite_spans": [], "ref_spans": []}, {"section": "Ad", "text": "", "cite_spans": [], "ref_spans": []}, {"section": "Synthesis of the PGED-CD-Gd backbone", "text": "Linear PGMA was prepared via atom transfer radical polymerization (ATRP). First, 160 \u03bcl of ethyl bromoisobutyrate (1 equiv), 2.1 ml of GMA (60 equiv) and 300 \u03bcl of N,N,N\u2032,N\u2033,N\u2033-pentamethyldiethylenetriamine (2 equiv) were added to a 50 ml flask with 5 ml of dimethyl sulfoxide (DMSO). The reaction system was degassed with argon for 10 min before adding 115 mg of copper(I) bromide (1 equiv). The details of the preparation of PGMA are described in our earlier work. 29 The molar weight of PGMA was 4.6 \u00d7 10 3 g mol \u22121 polydispersity index (PDI) = 1.28. The resultant ED-functionalized PGMA (PGED) was prepared using excess ED, as reported previously. 16 As shown in Scheme 1, before the preparation of CD units containing PGED (PGED-CD), it was necessary to synthesize EP-modified CD (CD-EP), using similar procedures to those reported previously. 30 In brief, 1.5 g of \u03b2-CD was added to a 50 ml round flask with a mixture (v/v, 5/5 ml) of DMSO/i-PrOH. After the \u03b2-CD was thoroughly dissolved, 12.5 ml of 1 M NaOH aqueous solution was then added to the solution, followed by the addition of 2 ml of EP. The reaction proceeded at room temperature in a nitrogen atmosphere for 48 h. The pH of the reaction solution was adjusted to~7.0 with concentrated HCl. Excess acetone was used to precipitate the reaction mixture. 
The raw CD-EP was dissolved in a small amount of deionized (DI) water and dialyzed against DI water (4 \u00d7 5 l) with a dialysis membrane (MWCO, 1000 Da) at room temperature for 4 h, prior to lyophilization. PGED-CD was prepared subsequently. In a 50 ml flask, PGED (130 mg) and CD-EP (480 mg) were added with 7 ml of water. The molar ratio of the terminal amino groups (of PGED) and CD-EP units was 1.6:1. The reaction was conducted at 80\u00b0C for 48 h. The PGED-CD product was purified by a dialysis method and then lyophilized.", "cite_spans": [{"start": 467, "end": 469, "text": "29", "ref_id": "BIBREF28"}, {"start": 652, "end": 654, "text": "16", "ref_id": "BIBREF15"}, {"start": 849, "end": 851, "text": "30", "ref_id": "BIBREF29"}], "ref_spans": []}, {"section": "Synthesis of the PGED-CD-Gd backbone", "text": "For the preparation of Gd 3+ -chelated PGED-CD (PGED-CD-Gd), the amidation reaction was used to introduce DTPA as the chelating agent of Gd 3+ ions onto PGED-CD according to procedures described previously. 31 In brief, 8 mg of carbodiimide hydrochloride and 40 mg of DTPA were dissolved in 7 ml of water. The molar ratio of carbodiimide hydrochloride and DTPA was kept at 1:5 to avoid cross-linking between the amino groups of PGED-CD and carboxylate groups of DTPA. The solution was stirred for 4 h at room temperature before adding 100 mg of PGED-CD. The molar ratio of the remaining amino groups (of PGED-CD) and DTPA units was 1:1. The reaction proceeded for 48 h to produce PGED-CD-DTPA. The resulting solution was purified by dialyzed with a dialysis membrane (MWCO, 1000 Da) against 0.1 M NaCl for 2 days and then dialyzed against DI water for 1 day. The purified solution of PGED-CD-DTPA was transferred into a flask, and GdCl 3 (GdCl 3 \u20226H 2 O) was added. The molar ratio of DTPA and GdCl 3 (GdCl 3 \u20226H 2 O) was kept at 1:1. The mixing solution was stirred for 24 h at room temperature and then dialyzed against DI water for 24 h. 
Subsequently, the PGED-CD-Gd product was lyophilized.", "cite_spans": [{"start": 207, "end": 209, "text": "31", "ref_id": "BIBREF30"}], "ref_spans": []}, {"section": "Synthesis of Ad-PGEA", "text": "For the preparation of the Ad-PGEA guest, the adamantine-headed ATRP initiator (Ad-Br) with three initiation sites was first synthesized. Pentaerythritol (1.08 g, 7.94 mmol) was thoroughly dissolved in a 50 ml round flask containing 20 ml of anhydrous N,N-dimethylformamide. Then, 1-adamantanecarboxylic acid chloride (1 g, 5.05 mmol) and K 2 CO 3 (1.33 g, 9.64 mmol) were added. The reaction was conducted at 50\u00b0C for 24 h under magnetic stirring. The final reaction solution was centrifuged, evaporated and distillated under reduced pressure, producing Ad-OH with three hydroxyl groups. The resultant Ad-Br was prepared using the similar procedures described earlier. 29 Next, 447 mg of Ad-OH (1.5 mmol) was added to a 50 ml round flask containing 7 ml of N,Ndimethylformamide. Then, 0.75 ml of 2-bromoisobutyryl bromide (6 mmol) was dropped into the aforementioned solution under an ice bath condition and stirring for 24 h. The reaction mixture was quenched with water and extracted with CH 2 Cl 2 . The organic layer was washed with brine, dried over Na 2 SO 4 and concentrated under reduced pressure. The Ad-Br product was a yellowish syrupy-like liquid.", "cite_spans": [{"start": 670, "end": 672, "text": "29", "ref_id": "BIBREF28"}], "ref_spans": []}, {"section": "Synthesis of Ad-PGEA", "text": "The resultant Ad-PGMA polymers were synthesized under the typical conditions of ATRP. 29 In a 50 ml flask, 120 mg of Ad-Br (1 equiv), 2.3 ml of GMA (100 equiv) and 70 \u03bcl of N,N,N\u2032,N\u2033,N\u2033-pentamethyldiethylenetriamine (2.5 equiv) were added to a 50 ml round flask containing 5 ml of DMSO. The reaction system was degassed by nitrogen for 10 min before adding 22.8 mg copper(I) bromide (1 equiv) under a nitrogen atmosphere. 
The products with a polymerization time of 20 and 35 min were named Ad-PGMA1 (Mn = 5.5 \u00d7 10 3 g mol \u22121 , PDI = 1.26) and Ad-PGMA2 (Mn = 1.0 \u00d7 10 4 g mol \u22121 , PDI = 1.32), respectively.", "cite_spans": [{"start": 86, "end": 88, "text": "29", "ref_id": "BIBREF28"}], "ref_spans": []}, {"section": "Synthesis of Ad-PGEA", "text": "The resulting EA-functionalized Ad-PGMA (Ad-PGEA) was prepared using excess EA as reported earlier. 15 The crude product was purified using a dialysis membrane (MWCO 3500), followed by lyophilization.", "cite_spans": [{"start": 100, "end": 102, "text": "15", "ref_id": "BIBREF14"}], "ref_spans": []}, {"section": "Preparation of supramolecular assembly/pDNA complexes", "text": "For the preparation of PGED@PGEA and PGED-Gd@PGEA assemblies, with a 1:1 molar feed ratio of the CD/Ad units, PGED-CD or PGED-CD-Gd was assembled with Ad-PGEA. The concentration of Ad-PGEA was based on the nitrogen concentration of 20 mM. Before use, equal volumes of Ad-PGEA solution and PGED-CD or PGED-CD-Gd solution were mixed, shaken and incubated at ambient temperature for 3 h. The final nitrogen concentration for all polymer solutions was 10 mM in DI water. The polymer to DNA ratio was expressed as the molar ratio of nitrogen (N) in PGMA-based polycations to phosphate (P) in DNA, named as the N/P ratio. The average mass weight per phosphate group of DNA was assumed to be 325. 32 All PGED@PGEA/pDNA and PGED-Gd@PGEA/pDNA polyplexes at different N/P ratios were formed by mixing polymer solution and pDNA solution for 30 min before use.", "cite_spans": [{"start": 690, "end": 692, "text": "32", "ref_id": "BIBREF31"}], "ref_spans": []}, {"section": "Physicochemical characterization", "text": "1 H NMR spectra were measured on a Bruker ARX 400 MHz spectrometer using CDCl 3 (for Ad-Br and Ad-PGMA) and D 2 O (for Ad-PGEA, PGED, CD-EP, PGED-CD, PGED-CD-DTPA) as the solvents with tetramethylsilane (Me 4 Si) as an internal standard. 
GPC measurements of PGMA and Ad-PGMA were performed on a Waters GPC system, in which DMSO was used as the eluent at a low flow rate of 1.0 ml min \u22121 at 25\u00b0C and monodispersed poly (methyl methacrylate) standards were used to generate the calibration curve. Dynamic light scattering measurements of polyplexes were performed with a Zetasizer Nano ZS (Malvern Instruments, Southborough, MA, USA) equipped with a laser of wavelength 633 nm at a 173\u00b0scattering angle. Atomic force microscopy (AFM) studies were carried out with the Dimension Icon model with a Nanoscope IIIa controller (Bruker, Santa Barbara, CA, USA). The samples were imaged using the ScanAsyst mode. Image analysis was performed using Nanoscope software after removing the background slope after flattening the images. Gel electrophoresis was implemented in a Sub-Cell system (Bio-Rad Laboratories), and then, a UV transilluminator and BioDco-It imaging system (UVP Inc.) was used to record DNA bands.", "cite_spans": [], "ref_spans": []}, {"section": "Cell viability assay", "text": "C6 and Hep G2 cell lines from two common cancers were selected to evaluate the performance of the gene carriers. An 3-(4,5-dimethylthiazol-2yl)-2,5diphenyl tetrazolium bromide assay was used to evaluate the cytotoxicity of polyplexes at a series of N/P ratios in C6 and Hep G2 cells cultured in DMEM (with 10% fetal bovine serum) according to the methods described in our previous studies. 32, 33 In brief, the C6 and Hep G2 cells were cultured in 96-well plates at a density of 2 \u00d7 10 4 cells per well with DMEM (with 10% fetal bovine serum). Then, the culture medium was replaced with 100 \u03bcl of fresh culture medium. The PGEA-based and PEI complexes (6.7 \u03bcl per well containing 0.33 \u03bcg of pDNA) at various N/P ratios were added to the media. Then, the cells were incubated for 4 h, and fresh media were added to the culture for another 20 h. 
Then, 10 \u03bcl of sterile, filtered 3-(4,5-dimethylthiazol-2yl)-2,5diphenyl tetrazolium bromide solution in phosphate-buffered saline (PBS) (5 mg ml \u22121 ) was added to each well. After 4 h, the unreacted dye was removed, and the produced formazan crystals were dissolved in DMSO (100 \u03bcl per well). The OD values were measured at a wavelength of 570 nm with a Bio-Rad Model 680 Microplate Reader (UK). For each sample, the final absorbance was the average of those measured from six wells in parallel. The cell viability results were expressed as the percentage relative to that of the control.", "cite_spans": [{"start": 390, "end": 393, "text": "32,", "ref_id": "BIBREF31"}, {"start": 394, "end": 396, "text": "33", "ref_id": "BIBREF32"}], "ref_spans": []}, {"section": "In vitro transfection assay", "text": "The plasmid pRL-CMV as the reporter gene was first utilized to estimate the in vitro gene transfection of PGMA-based polycation/pDNA polyplexes in C6 and Hep G2 cell lines. Essentially, the C6 and Hep G2 cells were cultured in 24well plates at a density of 6 \u00d7 10 4 cells per well with DMEM media (with 10% fetal bovine serum). Then, the solutions of polycation/pDNA complexes with different N/P ratios (20 \u03bcl per well with 1.0 \u03bcg of pDNA) were added into the transfection media. The detailed transfection procedures are described in our earlier work. 15, 16, 32, 33 A commercial Promega kit and a luminometer (Berthold Lumat LB 9507, Berthold Technologies GmbH KG, Bad Wildbad, Germany) were used to quantify the luciferase gene expression. Gene expression results were expressed as relative light units per milligram of cell protein lysate (relative light units per mg protein). The plasmid pEGFP-N1 with EGFP gene was also utilized as the reporter gene in C6 and Hep G2 cell lines at the optimal N/P ratio of the polycations to evaluate polymer-mediated gene transfection. The transfected cells were imaged with a Leica DMI3000B fluorescence microscope. 
The percentage of the EGFP-positive cells was determined by flow cytometry (Beckman Coulter, Brea, CA, USA).", "cite_spans": [{"start": 552, "end": 555, "text": "15,", "ref_id": "BIBREF14"}, {"start": 556, "end": 559, "text": "16,", "ref_id": "BIBREF15"}, {"start": 560, "end": 563, "text": "32,", "ref_id": "BIBREF31"}, {"start": 564, "end": 566, "text": "33", "ref_id": "BIBREF32"}], "ref_spans": []}, {"section": "In vitro cellular uptake", "text": "The in vitro cellular uptake was determined by flow cytometry analysis and imaged using fluorescence microscopy. C6 and Hep G2 cells were seeded into 24-well plates at the density of 8 \u00d7 10 5 cells per well and incubated for 24 h. Then, the cells were incubated with the fresh media containing PGMA-based polycation/pDNA polyplexes for 4 h, where pDNA (pRL-CMV) were labeled with the fluorescent dye YOYO-1. 34 The cells were trypsinized, centrifuged, resuspended in PBS and then analyzed by flow cytometry (BD LSR II, BD, USA). For the fluorescence imaging, after cellular uptake, the cells were washed with PBS five times and stained with 4\u2032,6-diamidino-2-phenylindole for 10 min. The fluorescence images were acquired on a Leica DMI3000B microscope.", "cite_spans": [{"start": 408, "end": 410, "text": "34", "ref_id": "BIBREF33"}], "ref_spans": []}, {"section": "In vitro MRI", "text": "Approximately 5 \u00d7 10 6 C6 or Hep G2 cells were seeded and incubated in cell culture flasks for 24 h. Then, the media were replaced with 5 ml of fresh media containing PGED-CD-Gd or PGED-Gd@PGEA ((Gd) = 0, 5, 10 and 20 \u03bcM, respectively). After incubation at 37\u00b0C for 4 h, the cells were washed with PBS three times and then precipitated by centrifugation. The MRI experiments were performed on a 7.0-T MRI instrument (BioSpec 70/20 USR 7.0 T Bruker) with a wrist receiver coil. 
The pulse sequence was a T1 map-RATE sequence with the following parameters: TR/TE = 400, 800, 1500, 2500, 4000/7 ms: field of view: 3.5 cm 2 ; matrix: 128 \u00d7 128; number of excitations: 1; slice thickness = 1 mm; slice gap = 0 mm; field of view: 3.5 cm. PGED-Gd@PGEA or detached cells were prepared in PBS at different concentrations in 0.5 ml Eppendorf tubes. The T1 relaxivities were determined via a linear fit of the inverse relaxation time as a function of the Gd 3+ concentration.", "cite_spans": [], "ref_spans": []}, {"section": "Statistical analysis", "text": "All experiments were repeated at least three times. The data are presented as means \u00b1 s.d. Statistical significance (Po0.05) was evaluated by a t-test when two groups of samples were compared, and the statistical significance was set at Po0.05.", "cite_spans": [], "ref_spans": []}, {"section": "RESULTS AND DISCUSSION", "text": "Preparation and characterization of supramolecular polycations with Gd 3+ chelation As shown in Scheme 1, the PGMA-based supramolecular delivery systems (PGED-Gd@PGEAs) with MRI functions were prepared by assembling multiple Ad-PGEA units with a PGED-CD-Gd backbone. The PGED-CD-Gd backbone possessed numerous flanking \u03b2-CD species and Gd 3+ ions. For the preparation of the PGED-CD-Gd backbone, the starting PGMA (Mn = 4.6 \u00d7 10 3 g mol \u22121 , PDI= 1.28) with 32 repeat units was first synthesized via ATRP. Then, PGMA was functionalized with excess ED by ring-opening addition to produce PGED with abundant primary amine groups. Similar to our previous study, 16 it was confirmed that the PGED was successfully prepared from the NMR results ( Supplementary Figure S1(a1) ). To introduce \u03b2-CD units, CD-EP that possessed one epoxy ring (Supplementary Figure S1(a2) ) was immobilized onto PGED via ring-opening Figure S1(a3) ), the molar ratio of \u03b2-CD and ED units was 1:2, indicating that every PGED-CD chain contained 16 \u03b2-CD units. 
DTPA, the chelating agent of Gd 3+ ions, was subsequently reacted with the remaining primary amine groups of PGED-CD via the amidation reaction, producing PGED-CD-DTPA. Based on the 1 H NMR spectrum of PGED-CD-DTPA ( Supplementary Figure S1(a4) ), PGED-CD-DTPA contained~12 DPTA units. The resultant PGED-CD-Gd backbone was obtained by the chelation of PGED-CD-DTPA with Gd 3+ ions. According to the thermogravimetric analysis (Supplementary Figure S2) , the PGED-CD-Gd backbone contained~10 Gd 3+ ions, which was fairly consistent with the number of DPTA units.", "cite_spans": [{"start": 659, "end": 661, "text": "16", "ref_id": "BIBREF15"}], "ref_spans": [{"start": 742, "end": 769, "text": "Supplementary Figure S1(a1)", "ref_id": "FIGREF0"}, {"start": 849, "end": 862, "text": "Figure S1(a2)", "ref_id": "FIGREF0"}, {"start": 908, "end": 921, "text": "Figure S1(a3)", "ref_id": "FIGREF0"}, {"start": 1249, "end": 1277, "text": "Supplementary Figure S1(a4)", "ref_id": "FIGREF0"}, {"start": 1460, "end": 1485, "text": "(Supplementary Figure S2)", "ref_id": "FIGREF2"}]}, {"section": "RESULTS AND DISCUSSION", "text": "The synthesis of the Ad-PGEA guest with three arms was performed in a four-step procedure (Scheme 1). C(CH 2 OH) 4 was compounded with 1-adamantanecarboxylic acid chloride, and the rest of the hydroxyl groups were reacted with 2-bromoisobutyryl bromide to initiate the ATRP process of GMA, followed by ring-opening of epoxy groups of Ad-PGMA with excess EA. Every step of the preparation process of Ad-PGEA was monitored by 1 H NMR (Supplementary Figure S3 ). After the first two steps of the substitution reaction, C-CH 2 -OCO protons and CH 3 -C-Br protons from the newly formed Ad-OH and Ad-Br were observed at~3.89 and 1.95 ppm, respectively. By integrating the corresponding NMR peaks, it was estimated that Ad-Br possessed approximately three initiation sites. 
The resultant Ad-PGMA1 (Mn = 5.5 \u00d7 10 3 g mol \u22121 , PDI= 1.26) and Ad-PGMA2 (Mn = 1.0 \u00d7 10 4 g mol \u22121 , PDI = 1.32) were prepared by using Ad-Br. The Ad-PGMA species were further functionalized with excess EA to produce the corresponding Ad-PGEA1 and Ad-PGEA2 guests with different molecular weights. Similar to our previous studies, 15, 16 Ad-PGMA and Ad-PGEA were successfully prepared, as shown by the NMR results (Supplementary Figure S3 ). For the preparation of the PGED@PGEA and PGED-Gd@PGEA assemblies, two Ad-PGEA guests were complexed with PGED-CD or PGED-CD-Gd hosts in DI water by the host-guest interaction of Ad and CD species.", "cite_spans": [{"start": 113, "end": 114, "text": "4", "ref_id": "BIBREF3"}, {"start": 1100, "end": 1103, "text": "15,", "ref_id": "BIBREF14"}, {"start": 1104, "end": 1106, "text": "16", "ref_id": "BIBREF15"}], "ref_spans": [{"start": 447, "end": 456, "text": "Figure S3", "ref_id": "FIGREF3"}, {"start": 1198, "end": 1207, "text": "Figure S3", "ref_id": "FIGREF3"}]}, {"section": "RESULTS AND DISCUSSION", "text": "Characterization of polymer/pDNA nanocomplexes For non-viral polycationic gene carriers, the ability to condense pDNA into a nanoparticle is essential because nanocomplexes of carrier and pDNA are more suitable for cellular uptake. In this work, agarose gel electrophoresis, particle size and \u03b6-potential measurements, as well as AFM images, were used to confirm the ability of PGMA-based polycations to condense pDNA. The gel retardation results of polymer/pDNA complexes with increasing N/P ratios are shown in Figure 1 . All PGED@PGEA and PGED-Gd@PGEA could compact pDNA completely within the N/P ratio of 1.5, whereas the PGED-CD-Gd backbone and Ad-PGEA guests only condensed pDNA when N/P ratios reached 2.5. 
The above results indicated that the pDNAcondensing ability of PGED@PGEA and PGED-Gd@PGEA is better than that of PGED-CD-Gd and Ad-PGEA, which is probably because of the high molecular weight supramolecular structures. 35 The particle size and the \u03b6-potential of polymer/pDNA complexes with increasing N/P ratio are shown in Figure 2 . The particle sizes of all complexes decreased with increasing N/P ratios. At lower N/P ratios, loose aggregates with large size were formed owing to the few positive charges on the polymers. 36 With an increase in the N/P ratios, the diameters of all complexes decreased to 100-150 nm, which enabled the complexes to easily undergo endocytosis. It was also noted that at both low and high N/P ratios, the polycation/pDNA complexes had good stability (Supplementary Figure S4 ). In addition to particle sizes, the \u03b6-potential of polymer/pDNA complexes are another factor that affects cellular uptake. All of the complexes had positive \u03b6-potentials, and the values of the \u03b6-potential slightly increased with the N/P ratios. Owing to the positive charges, polymer/pDNA complexes had a good affinity for the negatively charged cell membrane and were able to facilitate cellular uptake. 
37 The \u03b6-potential values of PGED@PGEA and PGED-Gd@PGEA groups were higher than those of the PGED-CD-Gd backbone and Ad-PGEA guests, once again confirming that PGMA-based supramolecular polycations possessed better DNA-compacting ability.", "cite_spans": [{"start": 933, "end": 935, "text": "35", "ref_id": "BIBREF34"}, {"start": 1241, "end": 1243, "text": "36", "ref_id": "BIBREF35"}, {"start": 1932, "end": 1934, "text": "37", "ref_id": "BIBREF36"}], "ref_spans": [{"start": 513, "end": 521, "text": "Figure 1", "ref_id": "FIGREF0"}, {"start": 1039, "end": 1047, "text": "Figure 2", "ref_id": "FIGREF2"}, {"start": 1515, "end": 1524, "text": "Figure S4", "ref_id": "FIGREF4"}]}, {"section": "RESULTS AND DISCUSSION", "text": "The morphology of polymer/pDNA complexes was observed by AFM. Figure 3 shows the representative AFM images of Ad-PGEA1/ pDNA, PGED@PGEA1/pDNA and PGED-Gd@PGEA1/pDNA complexes at the N/P ratio of 15. The AFM images show that the polymers could compact pDNA to form nanoparticles with a diameter of 100 nm, which is consistent with the results of particle size measurement (Figure 2a ).", "cite_spans": [], "ref_spans": [{"start": 62, "end": 70, "text": "Figure 3", "ref_id": "FIGREF3"}, {"start": 371, "end": 381, "text": "(Figure 2a", "ref_id": "FIGREF2"}]}, {"section": "Cell viability assay", "text": "Low cytotoxicity is essential for ideal biomedical materials. To evaluate the cytotoxicity of PGMA-based polycation/pDNA complexes at different N/P ratios, an 3-(4,5-dimethylthiazol-2yl)-2,5-diphenyl tetrazolium bromide assay was performed using the C6 and Hep G2 cell lines. The relative cell viabilities of all of the groups in both cell lines decreased with increasing N/P ratios (Figure 4 ). The cytotoxicity of polycations generally increases with their molecular weight. 38 In comparison with the PGED-CD and PGED-CD-Gd backbone, PGED@PGEA and PGED-Gd@PGEA exhibited increased cytotoxicity. 
However, PGED@PGEA and PGED-Gd@PGEA showed significantly lower cytotoxicity than the gold standard, PEI (25 kDa). For example, at an N/P ratio of 30, for the PGMA-based polycation/pDNA complexes, the relative cell viability values of C6 cells were 450% and those of Hep G2 470%. In our earlier work, 3, 15, 24, 39, 40 it was found that in comparison with PEI, which mainly has secondary amine groups, the nonionic hydrophilic hydroxyl groups of PGEAs can shield the deleterious, excess charges of the cationic complexes, leading to lower cytotoxicity. In addition, no significant differences among the cytotoxicities of Ad-PGEA, PGED@PGEA and PGED-Gd@PGEA were observed. The above results indicated that the supramolecular polymers could achieve a lower cytotoxicity despite their high molecular weights.", "cite_spans": [{"start": 477, "end": 479, "text": "38", "ref_id": "BIBREF37"}, {"start": 897, "end": 899, "text": "3,", "ref_id": "BIBREF2"}, {"start": 900, "end": 903, "text": "15,", "ref_id": "BIBREF14"}, {"start": 904, "end": 907, "text": "24,", "ref_id": "BIBREF23"}, {"start": 908, "end": 911, "text": "39,", "ref_id": "BIBREF38"}, {"start": 912, "end": 914, "text": "40", "ref_id": "BIBREF39"}], "ref_spans": [{"start": 383, "end": 392, "text": "(Figure 4", "ref_id": "FIGREF4"}]}, {"section": "Gene transfection assay", "text": "The gene transfection efficiency of the polycation/pDNA complexes was first analyzed in C6 and Hep G2 cell lines using pRL-CMV as a reporter gene. The C6 and Hep G2 cell lines are documented to be difficult to transfect with synthetic vectors. 41 Figure 5 shows the gene transfection efficiencies mediated by Ad-PGEA, PGED@PGEA and PGED-Gd@PGEA at various N/P ratios in comparison with those mediated by the controls (PEI (25 kDa), PGED-CD and PGED-CD-Gd) at their respective optimal N/P ratios. In general, the transfection efficiency first increases and then decreases slightly with increasing N/P ratio. 
At lower N/P ratios, cationic polymers cannot efficiently compact DNA, and the resultant loose nanocomplexes cannot easily enter cells. By contrast at higher N/P ratios, the transfection efficiency slightly decreased, probably as a result of the increased toxicity of free cationic polymers. The difference in gene transfection efficiencies at the same N/P ratio was observed between C6 and Hep G2 cells. The transfection efficiencies of gene carriers depend on different cell types because of their different properties. 42 As expected, the transfection efficiencies of PGED@PGEA and PGED-Gd@PGEA were significantly higher than those of Ad-PGEA guests at various N/P ratios because PGED@PGEA and ", "cite_spans": [{"start": 244, "end": 246, "text": "41", "ref_id": "BIBREF40"}, {"start": 1129, "end": 1131, "text": "42", "ref_id": "BIBREF41"}], "ref_spans": []}, {"section": "Figure 5", "text": "In vitro gene transfection efficiencies of the polymer/pDNA complexes at various N/P ratios in (a) C6 and (b) Hep G2 cells in comparison with those mediated by PEI (25 kDa, at its optimal N/P ratio of 10), PGED-CD (at the optimal N/P ratio of 15 for C6 or 25 for G2 cells) and PGED-CD-Gd (at the optimal N/P ratio of 15 for C6 or 25 for Hep G2 cells) (mean \u00b1 s.d., n = 3, *Po0.05).", "cite_spans": [], "ref_spans": []}, {"section": "Figure 5", "text": "PGED-Gd@PGEA might benefit from the better DNA-compacting ability (Figure 1 ) and low toxicities (Figure 4 ) of the supramolecular vectors. Notably, the transfection efficiencies of PGED@PGEA and PGED-Gd@PGEA arising from the same Ad-PGEA guests exhibited no significant difference, indicating that the Gd 3+ ions had no obvious effects on transfection efficiencies. In C6 and Hep G2 cell lines, Ad-PGEA2 exhibited much higher transfection efficiencies than Ad-PGEA1. This indicated that the transfection efficiencies mediated by PGEA-based vectors were dependent on the molecular weight of PGEA. 
In addition, the transfection efficiencies mediated by PGED@P-GEA and PGED-Gd@PGEA at most N/P ratios were much higher than those mediated by PEI at its optimal ratio of 10.", "cite_spans": [], "ref_spans": [{"start": 66, "end": 75, "text": "(Figure 1", "ref_id": "FIGREF0"}, {"start": 97, "end": 106, "text": "(Figure 4", "ref_id": "FIGREF4"}]}, {"section": "Figure 5", "text": "As shown in Figure 5 , the differences between the transfection efficiencies of Ad-PGEA1 and PGED-Gd@PGEA1 at most N/P ratios are bigger than those of Ad-PGEA2 and PGED-Gd@PGEA2. Thus, PGED-Gd@PGEA1 was selected as the representative supramolecular vector in the following experiments. To visually confirm the gene delivery abilities of PGEA-based vectors, the plasmid pEGFP-N1, Figure 6 Representative images of EGFP expression in (a, a1, a2, a3) C6 and (b, b1, b2, b3) Hep G2 cells transfected with Ad-PGEA1, PGED@PGEA1, PGED-Gd@PGEA1 and PEI at their respective optimal N/P ratios.", "cite_spans": [], "ref_spans": []}, {"section": "Figure 5", "text": "Biomedical applications of polycations Y Zhao et al Figure 7 Fluorescent images and flow cytometry analysis plots of (a) C6 and (b) Hep G2 cells treated with Ad-PGEA1/pDNA, PGED@PGEA1/pDNA and PGED-Gd@PGEA1/pDNA polyplexes for 4 h at the respective optimal N/P ratios. For the fluorescent images, the YOYO-1-labeled pDNA is shown in green, and the 4\u2032,6-diamidino-2-phenylindole -labeled nuclei are shown in blue.", "cite_spans": [], "ref_spans": []}, {"section": "Biomedical applications of polycations", "text": "Y Zhao et al encoding GFP, was used as another reporter gene in C6 and Hep G2 cell lines. Representative images of EGFP gene expression mediated by Ad-PGEA1, PGED@PGEA1 and PGED-Gd@PGEA1 at their respective optimal N/P ratios are shown in Figure 6 . The corresponding merged images are shown in Supplementary Figure S5 . Transfection with PGED@PGEA1 or PGED-Gd@PGEA1 led to more green (EGFPpositive) Hep G2 cells than did Ad-PGEA1 or PEI. 
The transfection efficiency, as reflected by the percentage of EGFP-positive cells, was quantitatively determined using flow cytometry. The percentages of EGFP-positive cells for Ad-PGEA1, PGED@PGEA1, PGED-Gd@PGEA1 and PEI in C6 (or Hep G2) cells are 13% (or 16%), 43% (or 47%), 41% (or 44%) and 14% (or 23%), respectively, which is fairly consistent with the results for luciferase expression ( Figure 5 ). The above gene transfection assay indicated that the supramolecular structure improved the gene transfection, and Gd 3+ ions had no obvious adverse effects on the transfection process.", "cite_spans": [], "ref_spans": []}, {"section": "Cellular internalization", "text": "To investigate the intracellular fates of the supramolecular vectors, the C6 and Hep G2 cells were treated with the Ad-PGEA1/pDNA, PGED@PGEA1/pDNA and PGED-Gd@PGEA1/pDNA complexes for 4 h at their respective optimal N/P ratios (15 for C6 cells and 25 for Hep G2 cells), whereas the pDNA was labeled by YOYO1, as shown in Figure 7 . YOYO1-labeled pDNA is shown in green. The nuclei stained with 4\u2032,6-diamidino-2-phenylindole are shown in blue. Compared with the cells treated with Ad-PGEA1/pDNA, the cells treated with PGED@PGEA1/pDNA and PGED-Gd@PGEA1/pDNA showed more green aggregations. The corresponding merged images are shown in Supplementary Figure S6 . The cellular uptake of the complexes with YOYO1-labeled pDNA was quantified by flow cytometry. The percentages of YOYO1-positive cells treated with PGED@PGEA1/ pDNA and PGED-Gd@PGEA1/pDNA were almost 90%, much higher than those (65~69%) of the cells treated with Ad-PGEA1/pDNA. This result was consistent with the fluorescent images of cellular uptake, confirming that supramolecular structure enhanced cell uptake, benefiting the resultant gene transfection.", "cite_spans": [], "ref_spans": []}, {"section": "In vitro MRI", "text": "The complexes with gadolinium have been widely used as efficient T1weighted MR contrast agents. 
41, 43 PGED-Gd@PGEA was expected to act as an MRI contrast agent (relaxation times were shown in Supplementary Table S1 ). The MRI assay of PGED-Gd@PGEA1 is shown in Figure 8 . A linear curve could be obtained by plotting the inverse T1 as a function of the molar concentration of Gd 3+ ions (Figure 8a, top) . The brightness of the MR images increased with the molar concentration of Gd 3+ ions (Figure 8a, bottom) . Next, the potential to use the PGED-Gd@PGEA1 for MRI of cancer cells was explored. Both C6 and Hep G2 cells were treated for 4 h with PGED-CD-Gd and PGED-Gd@PGEA1 at different Gd 3+ ion concentrations in culture media (0, 5, 10 and 20 \u03bcM,) before imaging with the MR systems. Both the inverse T1 and MR images were presented and collected. The MR images of cancer cells treated with PGED-CD-Gd became slightly brighter with an increasing concentration of Gd 3+ ions (Supplementary Figure S7) . However, the MR images of cancer cells treated with PGED-Gd@PGEA1 evidently became brighter, and the inverse T1 increased with the concentration of Gd 3+ ions (Figure 8b ), likely arising from the higher cellular uptake of PGED-Gd@PGEA1 (Figure 7) . The above results indicated that supramolecular structure could also affect the relaxation time in vitro. PGED-Gd@PGEA could potentially be used as a contrast agent for MR imaging.", "cite_spans": [{"start": 96, "end": 99, "text": "41,", "ref_id": "BIBREF40"}, {"start": 100, "end": 102, "text": "43", "ref_id": "BIBREF42"}], "ref_spans": []}, {"section": "CONCLUSIONS", "text": "A series of PGMA-based supramolecular polycations (PGED-Gd@PGEAs) with MRI functionality were successfully constructed by assembling multiple Ad-PGEA guests with a versatile PGED-CD-Gd host, which possessed numerous flanking Gd 3+ ions. Such PGED-Gd@PGEAs had good DNA condensation abilities and low cell cytotoxicity. 
Moreover, PGED-Gd@PGEAs exhibited significantly higher transfection efficiencies than PEI (25 kDa) and the constituent units (PGED-CD-Gd and Ad-PGEA). The chelation of Gd 3+ ions imparted PGED-Gd@PGEAs with effective MRI functionality without adverse effects on gene transfection processes. These unique features could allow PGED-Gd@PGEA to become a competitive multifunctional gene delivery system. Figure 8 Linear fitting of the inverse T 1 and the T 1 -weighted MR images of (a) PGED-Gd@PGEA1 solutions and (b) C6 and Hep G2 cells treated with PGED-Gd@PGEA1 at different Gd concentrations.", "cite_spans": [], "ref_spans": []}], "bib_entries": {"BIBREF0": {"title": "Cancer statistics", "authors": [{"first": "R", "middle": [], "last": "Siegel", "suffix": ""}, {"first": "D", "middle": [], "last": "Naishadham", "suffix": ""}, {"first": "A", "middle": [], "last": "Jemal", "suffix": ""}], "year": 2012, "venue": "CA Cancer J. Clin", "link": null}, "BIBREF1": {"title": "Enzyme-responsive cell-penetrating peptide conjugated mesoporous silica quantum dot nanocarriers for controlled release of nucleus-targeted drug molecules and real-time intracellular fluorescence imaging of tumor cells", "authors": [{"first": "J", "middle": [], "last": "Li", "suffix": ""}, {"first": "F", "middle": [], "last": "Liu", "suffix": ""}, {"first": "Q", "middle": [], "last": "Shao", "suffix": ""}, {"first": "Y", "middle": [], "last": "Min", "suffix": ""}, {"first": "M", "middle": [], "last": "Costa", "suffix": ""}, {"first": "E", "middle": ["K L"], "last": "Yeow", "suffix": ""}, {"first": "B", "middle": [], "last": "Xing", "suffix": ""}], "year": 2014, "venue": "Adv. Healthc. Mater", "link": "4683185"}, "BIBREF2": {"title": "Nonviral vectors for gene delivery", "authors": [{"first": "M", "middle": ["A"], "last": "Mintzer", "suffix": ""}, {"first": "E", "middle": ["E"], "last": "Simanek", "suffix": ""}], "year": 2008, "venue": "Chem. 
Rev", "link": "82875295"}, "BIBREF3": {"title": "Polymer vectors via controlled/living radical polymerization for gene delivery", "authors": [{"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}, {"first": "W", "middle": ["T"], "last": "Yang", "suffix": ""}], "year": 2011, "venue": "Prog. Polym. Sci", "link": "98119828"}, "BIBREF4": {"title": "Gene expression, biodistribution, and pharmacoscintigraphic evaluation of chondroitin sulfate-PEI nano-constructs mediated tumor gene therapy", "authors": [{"first": "A", "middle": [], "last": "Pathak", "suffix": ""}, {"first": "P", "middle": [], "last": "Kumar", "suffix": ""}, {"first": "K", "middle": [], "last": "Chuttani", "suffix": ""}, {"first": "S", "middle": [], "last": "Jain", "suffix": ""}, {"first": "A", "middle": ["K"], "last": "Mishra", "suffix": ""}, {"first": "S", "middle": ["P"], "last": "Vyas", "suffix": ""}, {"first": "K", "middle": ["C"], "last": "Gupta", "suffix": ""}], "year": 2009, "venue": "ACS Nano", "link": "28185772"}, "BIBREF5": {"title": "Controlled Gene delivery by reduced graphene oxide \u2212 polyethylenimine nanocomposite", "authors": [{"first": "H", "middle": [], "last": "Kim", "suffix": ""}, {"first": "W", "middle": ["J"], "last": "Kim", "suffix": ""}], "year": 2014, "venue": "Small", "link": null}, "BIBREF6": {"title": "RGD targeting hyaluronic acid coating system for PEI-PBLG polycation gene carriers", "authors": [{"first": "H", "middle": ["Y"], "last": "Tian", "suffix": ""}, {"first": "L", "middle": [], "last": "Lin", "suffix": ""}, {"first": "J", "middle": [], "last": "Chen", "suffix": ""}, {"first": "X", "middle": ["S"], "last": "Chen", "suffix": ""}, {"first": "T", "middle": ["G"], "last": "Park", "suffix": ""}, {"first": "A", "middle": [], "last": "Maruyama", "suffix": ""}], "year": 2011, "venue": "J. Control. 
Release", "link": "7171208"}, "BIBREF7": {"title": "Tyroserleutide-based gene vector for suppressing VEGF expression in cancer therapy", "authors": [{"first": "H", "middle": ["Y"], "last": "Wang", "suffix": ""}, {"first": "W", "middle": ["J"], "last": "Yi", "suffix": ""}, {"first": "S", "middle": ["Y"], "last": "Qin", "suffix": ""}, {"first": "C", "middle": [], "last": "Li", "suffix": ""}, {"first": "R", "middle": ["X"], "last": "Zhuo", "suffix": ""}, {"first": "X", "middle": ["Z"], "last": "Zhang", "suffix": ""}], "year": 2012, "venue": "Biomaterials", "link": "5620574"}, "BIBREF8": {"title": "Copolymer of poly(ethylene glycol) and poly(L-lysine) grafting polyethylenimine through a reducible disulfide linkage for siRNA delivery", "authors": [{"first": "J", "middle": ["G"], "last": "Li", "suffix": ""}, {"first": "D", "middle": [], "last": "Cheng", "suffix": ""}, {"first": "T", "middle": ["H"], "last": "Yin", "suffix": ""}, {"first": "W", "middle": ["C"], "last": "Chen", "suffix": ""}, {"first": "Y", "middle": ["J"], "last": "Lin", "suffix": ""}, {"first": "J", "middle": ["F"], "last": "Chen", "suffix": ""}, {"first": "R", "middle": ["T"], "last": "Li", "suffix": ""}, {"first": "X", "middle": ["T"], "last": "Shuai", "suffix": ""}], "year": 2014, "venue": "Nanoscale", "link": "31313658"}, "BIBREF9": {"title": "Redox-cleavable star cationic PDMAEMA by arm-first approach of ATRP as a nonviral vector for gene delivery", "authors": [{"first": "F", "middle": ["Y"], "last": "Dai", "suffix": ""}, {"first": "P", "middle": [], "last": "Sun", "suffix": ""}, {"first": "Y", "middle": ["J"], "last": "Liu", "suffix": ""}, {"first": "W", "middle": ["G"], "last": "Liu", "suffix": ""}], "year": 2010, "venue": "Biomaterials", "link": "5068078"}, "BIBREF10": {"title": "A poly(L-lysine)-based hydrophilic star block copolymer as a protein nano carrier with facile encapsulation and ph-responsive release", "authors": [{"first": "Y", "middle": ["S"], "last": "Yan", "suffix": ""}, {"first": 
"D", "middle": ["X"], "last": "Wei", "suffix": ""}, {"first": "J", "middle": ["Y"], "last": "Li", "suffix": ""}, {"first": "J", "middle": ["H"], "last": "Zheng", "suffix": ""}, {"first": "G", "middle": ["G"], "last": "Shi", "suffix": ""}, {"first": "W", "middle": ["H"], "last": "Luo", "suffix": ""}, {"first": "Y", "middle": [], "last": "Pan", "suffix": ""}, {"first": "J", "middle": ["Z"], "last": "Wang", "suffix": ""}, {"first": "L", "middle": ["M"], "last": "Zhang", "suffix": ""}, {"first": "X", "middle": ["Y"], "last": "He", "suffix": ""}, {"first": "D", "middle": ["J"], "last": "Liu", "suffix": ""}], "year": 2012, "venue": "Acta. Biomater", "link": "44733737"}, "BIBREF11": {"title": "Odd-even effect of repeating amino-ethylene units in the side chain of N-substituted polyaspartamides on gene transfection profiles", "authors": [{"first": "H", "middle": [], "last": "Uchida", "suffix": ""}, {"first": "K", "middle": [], "last": "Miyata", "suffix": ""}, {"first": "M", "middle": [], "last": "Oba", "suffix": ""}, {"first": "T", "middle": [], "last": "Ishii", "suffix": ""}, {"first": "T", "middle": [], "last": "Suma", "suffix": ""}, {"first": "K", "middle": [], "last": "Itaka", "suffix": ""}, {"first": "N", "middle": [], "last": "Nishiyama", "suffix": ""}, {"first": "K", "middle": [], "last": "Kataoka", "suffix": ""}], "year": 2011, "venue": "J. Am. Chem. Soc", "link": "207069459"}, "BIBREF12": {"title": "Efficient poly(N-3-hydroxypropyl) as partamide-based carriers via ATRP for gene delivery", "authors": [{"first": "Y", "middle": [], "last": "Zhu", "suffix": ""}, {"first": "G", "middle": ["P"], "last": "Tang", "suffix": ""}, {"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}], "year": 2013, "venue": "ACS Appl. Mater. 
Interfaces", "link": "22565423"}, "BIBREF13": {"title": "Efficient transfer of genetic material into mammalian cells using starburst polyamidoamine dendrimers", "authors": [{"first": "J", "middle": ["F"], "last": "Kukowska-Latallo", "suffix": ""}, {"first": "A", "middle": ["U"], "last": "Bielinska", "suffix": ""}, {"first": "J", "middle": [], "last": "Johnson", "suffix": ""}, {"first": "R", "middle": [], "last": "Spindler", "suffix": ""}, {"first": "D", "middle": ["A"], "last": "Tomalia", "suffix": ""}, {"first": "J", "middle": ["R"], "last": "Baker", "suffix": ""}], "year": 1996, "venue": "Proc. Natl Acad. Sci. USA", "link": "40686720"}, "BIBREF14": {"title": "Well-defined poly(2-hydroxyl-3-(2-hydroxyethylamino)propyl methacrylate) vectors with low toxicity and high gene transfection efficiency", "authors": [{"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}, {"first": "M", "middle": ["Y"], "last": "Chai", "suffix": ""}, {"first": "W", "middle": ["B"], "last": "Li", "suffix": ""}, {"first": "Y", "middle": [], "last": "Ping", "suffix": ""}, {"first": "G", "middle": ["P"], "last": "Tang", "suffix": ""}, {"first": "W", "middle": ["T"], "last": "Yang", "suffix": ""}], "year": 2010, "venue": "Biomacromolecules", "link": "21073691"}, "BIBREF15": {"title": "Comparison of ethanolamine/ethylenediamine-functionalized poly(glycidylmethacrylate) for efficient gene delivery", "authors": [{"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}, {"first": "Y", "middle": [], "last": "Zhu", "suffix": ""}, {"first": "M", "middle": ["Y"], "last": "Chai", "suffix": ""}, {"first": "F", "middle": ["S"], "last": "Liu", "suffix": ""}], "year": 2011, "venue": "Acta Biomater", "link": "20402408"}, "BIBREF16": {"title": "A biocleavable pullulan-based vector via ATRP for liver cell-targeting gene delivery", "authors": [{"first": "X", "middle": ["C"], "last": "Yang", "suffix": ""}, {"first": "N", "middle": ["L"], "last": "Niu", "suffix": ""}, {"first": "N", "middle": ["N"], 
"last": "Zhao", "suffix": ""}, {"first": "C", "middle": [], "last": "Mao", "suffix": ""}, {"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}], "year": 2014, "venue": "Biomaterials", "link": "32905809"}, "BIBREF17": {"title": "The role of supramolecular chemistry in responsive vectors for gene delivery", "authors": [{"first": "C", "middle": ["M"], "last": "Lamanna", "suffix": ""}, {"first": "M", "middle": ["W"], "last": "Grinstaff", "suffix": ""}], "year": 2012, "venue": "", "link": "82732655"}, "BIBREF18": {"title": "Rational design of smart supramolecular assemblies for gene delivery, chemical challenges in the creation of artificial viruses", "authors": [{"first": "K", "middle": [], "last": "Miyata", "suffix": ""}, {"first": "N", "middle": [], "last": "Nishiyama", "suffix": ""}, {"first": "K", "middle": [], "last": "Kataoka", "suffix": ""}], "year": 2012, "venue": "Chem. Soc. Rev", "link": "29022577"}, "BIBREF19": {"title": "Cyclodextrin-based host-guest supramolecular nanoparticles for delivery: from design to applications", "authors": [{"first": "Q", "middle": ["D"], "last": "Hu", "suffix": ""}, {"first": "G", "middle": ["P"], "last": "Tang", "suffix": ""}, {"first": "P", "middle": ["K"], "last": "Chu", "suffix": ""}], "year": 2014, "venue": "Acc. Chem. Res", "link": "4805879"}, "BIBREF20": {"title": "Cyclodextrin-based gene delivery systems", "authors": [{"first": "C", "middle": ["O"], "last": "Mellet", "suffix": ""}, {"first": "J", "middle": ["M G"], "last": "Fernandez", "suffix": ""}, {"first": "J", "middle": [], "last": "Benito", "suffix": ""}], "year": 2011, "venue": "Chem. Soc. Rev", "link": "205764556"}, "BIBREF21": {"title": "Cyclodextrin-based supramolecular systems for drug delivery: recent progress and future perspective", "authors": [{"first": "J", "middle": [], "last": "Zhang", "suffix": ""}, {"first": "P", "middle": ["X"], "last": "Ma", "suffix": ""}], "year": 2013, "venue": "Adv. Drug. Deliv. 
Rev", "link": "25550302"}, "BIBREF22": {"title": "Cyclodextrins in non-viral gene delivery", "authors": [{"first": "W", "middle": ["F"], "last": "Lai", "suffix": ""}], "year": 2014, "venue": "Biomaterials", "link": "12637670"}, "BIBREF23": {"title": "Supramolecular host \u2212 guest pseudo comb conjugates composed of multiple star polycations tied tunably with a linear polycation backbone for gene transfection", "authors": [{"first": "Y", "middle": [], "last": "Hu", "suffix": ""}, {"first": "M", "middle": ["Y"], "last": "Chai", "suffix": ""}, {"first": "W", "middle": ["T"], "last": "Yang", "suffix": ""}, {"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}], "year": 2013, "venue": "Bioconjug. Chem", "link": "19457945"}, "BIBREF24": {"title": "In vivo visualization of gene expression using magnetic resonance imaging", "authors": [{"first": "A", "middle": ["Y"], "last": "Louie", "suffix": ""}, {"first": "M", "middle": ["M"], "last": "H\u00fcber", "suffix": ""}, {"first": "E", "middle": ["T"], "last": "Ahrens", "suffix": ""}, {"first": "U", "middle": [], "last": "Rothb\u00e4cher", "suffix": ""}, {"first": "R", "middle": [], "last": "Moats", "suffix": ""}, {"first": "R", "middle": ["E"], "last": "Jacobs", "suffix": ""}, {"first": "S", "middle": ["E"], "last": "Fraser", "suffix": ""}, {"first": "T", "middle": ["J"], "last": "Meade", "suffix": ""}], "year": 2000, "venue": "Nat. Biotechnol", "link": "20213897"}, "BIBREF25": {"title": "Europium(III) DOTA-tetraamide complexes as redox-active MRI sensors", "authors": [{"first": "S", "middle": ["J"], "last": "Ratnakar", "suffix": ""}, {"first": "S", "middle": [], "last": "Viswana-Than", "suffix": ""}, {"first": "Z", "middle": [], "last": "Kovacs", "suffix": ""}, {"first": "A", "middle": ["K"], "last": "Jindal", "suffix": ""}, {"first": "K", "middle": ["N"], "last": "Green", "suffix": ""}, {"first": "A", "middle": ["D"], "last": "Sherry", "suffix": ""}], "year": 2012, "venue": "J. Am. Chem. 
Soc", "link": "207078344"}, "BIBREF26": {"title": "Gadolinium(III) chelates as MRI contrast agents: structure, dynamics, and applications", "authors": [{"first": "P", "middle": [], "last": "Caravan", "suffix": ""}, {"first": "J", "middle": ["J"], "last": "Ellison", "suffix": ""}, {"first": "T", "middle": ["J"], "last": "Mcmurry", "suffix": ""}, {"first": "R", "middle": ["B"], "last": "Lauffer", "suffix": ""}], "year": 1999, "venue": "Chem. Rev", "link": "87756"}, "BIBREF27": {"title": "Gd(III)-based contrast agents for MRI", "authors": [{"first": "S", "middle": [], "last": "Aime", "suffix": ""}, {"first": "M", "middle": [], "last": "Botta", "suffix": ""}, {"first": "E", "middle": [], "last": "Terreno", "suffix": ""}], "year": 2005, "venue": "Adv. Inorg. Chem", "link": "94727149"}, "BIBREF28": {"title": "New low molecular weight polycation-based nanoparticles for effective codelivery of pDNA and drug", "authors": [{"first": "Y", "middle": [], "last": "Zhao", "suffix": ""}, {"first": "B", "middle": [], "last": "Yu", "suffix": ""}, {"first": "H", "middle": [], "last": "Hu", "suffix": ""}, {"first": "Y", "middle": [], "last": "Hu", "suffix": ""}, {"first": "N", "middle": ["N"], "last": "Zhao", "suffix": ""}, {"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}], "year": 2014, "venue": "ACS Appl. Mater. 
Interfaces", "link": "206804279"}, "BIBREF29": {"title": "Two roles of guest and crosslinked degree on hydrosoluble \u03b2-cyclodextrin polymer electrorheological fluids", "authors": [{"first": "Z", "middle": [], "last": "Gao", "suffix": ""}, {"first": "X", "middle": [], "last": "Zhao", "suffix": ""}], "year": 2004, "venue": "Polymer", "link": "94293567"}, "BIBREF30": {"title": "A polymeric fastener can easily functionalize liposome surfaceswith gadoliniu m for enhanced magnetic resonance imaging", "authors": [{"first": "C", "middle": ["E"], "last": "Smith", "suffix": ""}, {"first": "A", "middle": [], "last": "Shkumatov", "suffix": ""}, {"first": "S", "middle": ["G"], "last": "Withers", "suffix": ""}, {"first": "B", "middle": [], "last": "Yang", "suffix": ""}, {"first": "J", "middle": ["F"], "last": "Glockner", "suffix": ""}, {"first": "S", "middle": [], "last": "Misra", "suffix": ""}, {"first": "E", "middle": ["J"], "last": "Roy", "suffix": ""}, {"first": "C", "middle": ["H"], "last": "Wong", "suffix": ""}, {"first": "S", "middle": ["C"], "last": "Zimmerman", "suffix": ""}, {"first": "H", "middle": [], "last": "Kong", "suffix": ""}], "year": 2013, "venue": "ACS Nano", "link": null}, "BIBREF31": {"title": "Bioreducible POSS-cored star-shaped polycation for efficient gene delivery", "authors": [{"first": "Y", "middle": ["Y"], "last": "Yang", "suffix": ""}, {"first": "X", "middle": [], "last": "Wang", "suffix": ""}, {"first": "Y", "middle": [], "last": "Hu", "suffix": ""}, {"first": "H", "middle": [], "last": "Hu", "suffix": ""}, {"first": "D", "middle": [], "last": "Wu", "suffix": ""}, {"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}], "year": 2014, "venue": "ACS Appl. Mater. 
Interfaces", "link": "39175408"}, "BIBREF32": {"title": "Copolymers of poly(ethylene glycol), poly((2-dimethyl amino)ethyl methacrylate) and poly(2-hydroxyethyl methacrylate) from consecutive atom transfer radical polymerizations for nonviral gene delivery", "authors": [{"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}, {"first": "H", "middle": ["Z"], "last": "Li", "suffix": ""}, {"first": "J", "middle": [], "last": "Li", "suffix": ""}, {"first": "Z", "middle": ["X"], "last": "Zhang", "suffix": ""}, {"first": "E", "middle": ["T"], "last": "Kang", "suffix": ""}, {"first": "G", "middle": ["N"], "last": "Pentablock", "suffix": ""}], "year": 2008, "venue": "Biomaterials", "link": "9453297"}, "BIBREF33": {"title": "-dimethyl amino)ethyl methacrylate) by combination of ATRP and Click chemistry", "authors": [{"first": "Y", "middle": [], "last": "Zhu", "suffix": ""}, {"first": "X", "middle": [], "last": "Zheng", "suffix": ""}, {"first": "B", "middle": [], "last": "Yu", "suffix": ""}, {"first": "W", "middle": [], "last": "Yang", "suffix": ""}, {"first": "N", "middle": [], "last": "Zhao", "suffix": ""}, {"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}], "year": 2014, "venue": "ethanolamine-functionalized poly(glycidyl methacrylate), and poly", "link": null}, "BIBREF34": {"title": "Versatile types of polysaccharide-based supramolecular polycation/pDNA nanoplexes for gene delivery", "authors": [{"first": "Y", "middle": [], "last": "Hu", "suffix": ""}, {"first": "N", "middle": [], "last": "Zhao", "suffix": ""}, {"first": "B", "middle": [], "last": "Yu", "suffix": ""}, {"first": "F", "middle": [], "last": "Liu", "suffix": ""}, {"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}], "year": 2014, "venue": "Nanoscale", "link": "23821256"}, "BIBREF35": {"title": "Cationic micelles self-assembled from cholesterol-conjugated oligopeptides as an efficient gene delivery vector", "authors": [{"first": "X", "middle": ["D"], "last": "Guo", "suffix": ""}, 
{"first": "F", "middle": [], "last": "Tandiono", "suffix": ""}, {"first": "N", "middle": [], "last": "Wiradharma", "suffix": ""}, {"first": "D", "middle": [], "last": "Khor", "suffix": ""}, {"first": "C", "middle": ["G"], "last": "Tan", "suffix": ""}, {"first": "M", "middle": [], "last": "Khan", "suffix": ""}, {"first": "Y", "middle": [], "last": "Qian", "suffix": ""}, {"first": "Y", "middle": ["Y"], "last": "Yang", "suffix": ""}], "year": 2008, "venue": "Biomaterials", "link": "671947"}, "BIBREF36": {"title": "Enhanced gene transfection efficiency of PDMAEMA by incorporating hydrophobic hyperbranched polymer cores: effect of degree of branching", "authors": [{"first": "S", "middle": [], "last": "Yu", "suffix": ""}, {"first": "J", "middle": [], "last": "Chen", "suffix": ""}, {"first": "R", "middle": [], "last": "Dong", "suffix": ""}, {"first": "Y", "middle": [], "last": "Su", "suffix": ""}, {"first": "B", "middle": [], "last": "Ji", "suffix": ""}, {"first": "Y", "middle": [], "last": "Zhou", "suffix": ""}, {"first": "X", "middle": [], "last": "Zhu", "suffix": ""}, {"first": "D", "middle": [], "last": "Yan", "suffix": ""}], "year": 2012, "venue": "Polym. Chem", "link": "97385612"}, "BIBREF37": {"title": "Cyclodextrin-based supramolecular architectures: Syntheses, structures, and applications for drug and gene delivery", "authors": [{"first": "J", "middle": [], "last": "Li", "suffix": ""}, {"first": "X", "middle": ["J"], "last": "Loh", "suffix": ""}], "year": 2008, "venue": "Adv. Drug. Deliv. Rev", "link": "23285968"}, "BIBREF38": {"title": "A highly fluorescent cationic bifunctional conjugate", "authors": [{"first": "X", "middle": ["C"], "last": "Yang", "suffix": ""}, {"first": "M", "middle": ["Y"], "last": "Chai", "suffix": ""}, {"first": "Y", "middle": [], "last": "Zhu", "suffix": ""}, {"first": "W", "middle": ["T"], "last": "Yang", "suffix": ""}, {"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}], "year": 2012, "venue": "J. Mater. 
Chem", "link": "96203707"}, "BIBREF39": {"title": "Series of new \u03b2-cyclodextrin-cored starlike carriers for gene delivery", "authors": [{"first": "R", "middle": ["Q"], "last": "Li", "suffix": ""}, {"first": "Y", "middle": ["L"], "last": "Niu", "suffix": ""}, {"first": "N", "middle": ["N"], "last": "Zhao", "suffix": ""}, {"first": "B", "middle": [], "last": "Yu", "suffix": ""}, {"first": "C", "middle": [], "last": "Mao", "suffix": ""}, {"first": "F", "middle": ["J"], "last": "Xu", "suffix": ""}], "year": 2014, "venue": "ACS Appl. Mater. Interfaces", "link": "19692409"}, "BIBREF40": {"title": "HCV cDNA transfection to HepG2 cells", "authors": [{"first": "N", "middle": [], "last": "Hiramatsu", "suffix": ""}, {"first": "S", "middle": [], "last": "Dash", "suffix": ""}, {"first": "M", "middle": ["A"], "last": "Gerber", "suffix": ""}], "year": 1997, "venue": "J. Viral. Hepat", "link": "35367681"}, "BIBREF41": {"title": "Functionalization of chitosan via atom transfer radical polymerization for gene delivery", "authors": [{"first": "Y", "middle": [], "last": "Ping", "suffix": ""}, {"first": "C.-D", "middle": [], "last": "Liu", "suffix": ""}, {"first": "G.-P", "middle": [], "last": "Tang", "suffix": ""}, {"first": "J.-S", "middle": [], "last": "Li", "suffix": ""}, {"first": "J", "middle": [], "last": "Li", "suffix": ""}, {"first": "W.-T", "middle": [], "last": "Yang", "suffix": ""}, {"first": "F.-J", "middle": [], "last": "Xu", "suffix": ""}], "year": 2010, "venue": "Adv. Funct. Mater", "link": "97902332"}, "BIBREF42": {"title": "Multimodal MRI contrast agents", "authors": [{"first": "L", "middle": [], "last": "Frullano", "suffix": ""}, {"first": "T", "middle": [], "last": "Meade", "suffix": ""}], "year": 2007, "venue": "J Biol. Inorg. 
Chem", "link": "22298268"}}, "ref_entries": {"FIGREF0": {"text": "General procedures for the preparation of different supramolecule-based polyplexes.", "type": "figure"}, "FIGREF1": {"text": "Electrophoretic mobility retardation assay of pDNA in the polyplexes of (a) PGED-CD, (b) PGED-CD-Gd, (c) Ad-PGEA1, (d) Ad-PGEA2, (e) PGED@PGEA1, (f) PGED@PGEA2, (g) PGED-Gd@PGEA1 and (h) PGED-Gd@PGEA2 at various N/P ratios. Biomedical applications of polycations Y Zhao et al reactions between the epoxy group of CD-EP and the amine group of PGED, producing PGED-CD. Based on the 1 H NMR spectrum of PGED-CD (Supplementary", "type": "figure"}, "FIGREF2": {"text": "(a) Particle size and (b) \u03b6-potential of the polymer/pDNA complexes at various N/P ratios.", "type": "figure"}, "FIGREF3": {"text": "AFM images of (a) Ad-PGEA1/pDNA, (b) PGED@PGEA1/pDNA and (c) PGED-Gd@PGEA1/pDNA complexes at the N/P ratio of 15.", "type": "figure"}, "FIGREF4": {"text": "Cytotoxicity of the polymer/pDNA complexes at different N/P ratios in (a) C6 and (b) Hep G2 cells. (mean \u00b1 s.d., n = 6, *Po0.05).", "type": "figure"}, "TABREF0": {"text": "Biomedical applications of polycations Y Zhao et alJose, CA, USA), were amplified in Escherichia coli and purified according to the supplier's protocol (Qiagen GmbH, Hilden, Germany).", "type": "table"}}}
diff --git a/s2orc-doc2json/tests/s2orc/20210101/24e6b80e338a4e543de6b49cada07156c9149d22.json b/s2orc-doc2json/tests/s2orc/20210101/24e6b80e338a4e543de6b49cada07156c9149d22.json
new file mode 100644
index 0000000000000000000000000000000000000000..d53598f1f6397109be411ac0a978c0706fc7ae03
--- /dev/null
+++ b/s2orc-doc2json/tests/s2orc/20210101/24e6b80e338a4e543de6b49cada07156c9149d22.json
@@ -0,0 +1 @@
+{"paper_id": "24e6b80e338a4e543de6b49cada07156c9149d22", "metadata": {"title": "Niche Partitioning of Marine Group I Crenarchaeota in the Euphotic and Upper Mesopelagic Zones of the East China Sea", "authors": [{"first": "\u1c14", "middle": [], "last": "", "suffix": "", "affiliation": {}, "email": ""}, {"first": "Anyi", "middle": [], "last": "Hu", "suffix": "", "affiliation": {"laboratory": "Key Laboratory of Urban Environment and Health", "institution": "Chinese Academy of Sciences", "location": {"postCode": "361021", "settlement": "Xiamen", "country": "People's Republic of China"}}, "email": ""}, {"first": "Nianzhi", "middle": [], "last": "Jiao", "suffix": "", "affiliation": {"laboratory": "Key Laboratory of Urban Environment and Health", "institution": "Chinese Academy of Sciences", "location": {"postCode": "361021", "settlement": "Xiamen", "country": "People's Republic of China"}}, "email": ""}, {"first": "Rui", "middle": [], "last": "Zhang", "suffix": "", "affiliation": {"laboratory": "Key Laboratory of Urban Environment and Health", "institution": "Chinese Academy of Sciences", "location": {"postCode": "361021", "settlement": "Xiamen", "country": "People's Republic of China"}}, "email": ""}, {"first": "Zao", "middle": [], "last": "Yang", "suffix": "", "affiliation": {"laboratory": "Key Laboratory of Urban Environment and Health", "institution": "Chinese Academy of Sciences", "location": {"postCode": "361021", "settlement": "Xiamen", "country": "People's Republic of China"}}, "email": ""}], "year": "2011-11"}, "abstract": [{"text": "Marine group I Crenarchaeota (MGI) represents a ubiquitous and numerically predominant microbial population in marine environments. An understanding of the spatial dynamics of MGI and its controlling mechanisms is essential for an understanding of the role of MGI in energy and element cycling in the ocean. 
In the present study, we investigated the diversity and abundance of MGI in the East China Sea (ECS) by analysis of crenarchaeal 16S rRNA gene, the ammonia monooxygenase gene amoA, and the biotin carboxylase gene accA. Quantitative PCR analyses revealed that these genes were higher in abundance in the mesopelagic than in the euphotic zone. In addition, the crenarchaeal amoA gene was positively correlated with the copy number of the MGI 16S rRNA gene, suggesting that most of the MGI in the ECS are nitrifiers. Furthermore, the ratios of crenarchaeal accA to amoA or to MGI 16S rRNA genes increased from the euphotic to the mesopelagic zone, suggesting that the role of MGI in carbon cycling may change from the epipelagic to the mesopelagic zones. Denaturing gradient gel electrophoretic profiling of the 16S rRNA genes revealed depth partitioning in MGI community structures. Clone libraries of the crenarchaeal amoA and accA genes showed both \"shallow\" and \"deep\" groups, and their relative abundances varied in the water column. Ecotype simulation analysis revealed that MGI in the upper ocean could diverge into special ecotypes associated with depth to adapt to the light gradient across the water column. Overall, our results showed niche partitioning of the MGI population and suggested a shift in their ecological functions between the euphotic and mesopelagic zones of the ECS.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Abstract"}], "body_text": [{"text": "Microbes are the most abundant and diverse life forms in oceans and play a critical role in energy flux and element cycling (14, 28) . Determining the structure of marine microbial communities and their spatial and temporal variation is an essential step in an understanding of the role of microbes in the functioning of ecosystems (55) . The application of molecular technology has led to great advances in microbial oceanography (14) . 
On the horizontal dimension, marine microorganisms have been shown to have biogeographic distribution patterns like those of macroorganisms (39) . Depth is considered a predominant factor in structuring of microbial communities across the oceanic water column, since the major physical and chemical gradients (e.g., light, temperature, pressure, nutrients, and organic matter) are apparent on the vertical scale (12) . Numerous studies show the depth-stratified niche adaptation of marine bacteria, either at the population level (7) or at the community level (12) .", "cite_spans": [{"start": 124, "end": 128, "text": "(14,", "latex": null, "ref_id": "BIBREF14"}, {"start": 129, "end": 132, "text": "28)", "latex": null, "ref_id": "BIBREF28"}, {"start": 332, "end": 336, "text": "(55)", "latex": null, "ref_id": "BIBREF58"}, {"start": 431, "end": 435, "text": "(14)", "latex": null, "ref_id": "BIBREF14"}, {"start": 578, "end": 582, "text": "(39)", "latex": null, "ref_id": "BIBREF40"}, {"start": 850, "end": 854, "text": "(12)", "latex": null, "ref_id": "BIBREF12"}, {"start": 968, "end": 971, "text": "(7)", "latex": null, "ref_id": "BIBREF6"}, {"start": 998, "end": 1002, "text": "(12)", "latex": null, "ref_id": "BIBREF12"}], "ref_spans": [], "eq_spans": [], "section": ""}, {"text": "Marine group I Crenarchaeota (MGI), one of the major phylogenetic groups of pelagic archaea, were first discovered by using 16S rRNA gene-based molecular methods (11, 15) . A number of studies showed that MGI forms one of the most abundant populations of marine microorganisms: it is estimated that there are 1.3 \u03eb 10 28 to 2.7 \u03eb 10 28 MGI cells, representing up to 40% of the total prokaryotic biomass in the dark ocean (31, 49) . Phylogenetic and genomic studies revealed that MGI together with mesophilic Crenarchaeota in soil environments likely form a third archaeal phylum, the Thaumarchaeota (6, 51) . 
Recent observations of the crenarchaeal amoA gene, encoding ammonia monooxygenase subunit A, in marine (56) and soil (54) metagenomics and the isolation of the marine crenarchaeon \"Candidatus Nitrosopumilus maritimus\" (32) imply that most mesophilic Crenarchaeota have the ability to oxidize ammonia to nitrite, which was previously thought to be achieved only by ammonia-oxidizing bacteria (AOB) (34) . Meanwhile, a novel mechanism of autotrophic carbon fixation, the 3-hydroxypropionate/4-hydroxybutyrate pathway, was identified in the genomes of \"Candidatus Nitrosopumilus maritimus\" and \"Candidatus Cenarchaeum symbiosum\" (20, 57) . The acc gene, encoding acetyl coenzyme A (acetyl-CoA) carboxylase, one of the key enzymes in that pathway, and the amoA gene are used as phylogenetic markers to mirror the ecological function of MGI (5, 13, 25, 59, 60) .", "cite_spans": [{"start": 162, "end": 166, "text": "(11,", "latex": null, "ref_id": "BIBREF11"}, {"start": 167, "end": 170, "text": "15)", "latex": null, "ref_id": "BIBREF15"}, {"start": 421, "end": 425, "text": "(31,", "latex": null, "ref_id": "BIBREF31"}, {"start": 426, "end": 429, "text": "49)", "latex": null, "ref_id": "BIBREF51"}, {"start": 599, "end": 602, "text": "(6,", "latex": null, "ref_id": "BIBREF5"}, {"start": 603, "end": 606, "text": "51)", "latex": null, "ref_id": "BIBREF53"}, {"start": 712, "end": 716, "text": "(56)", "latex": null, "ref_id": "BIBREF59"}, {"start": 726, "end": 730, "text": "(54)", "latex": null, "ref_id": "BIBREF56"}, {"start": 827, "end": 831, "text": "(32)", "latex": null, "ref_id": "BIBREF32"}, {"start": 1006, "end": 1010, "text": "(34)", "latex": null, "ref_id": "BIBREF34"}, {"start": 1235, "end": 1239, "text": "(20,", "latex": null, "ref_id": "BIBREF20"}, {"start": 1240, "end": 1243, "text": "57)", "latex": null, "ref_id": "BIBREF60"}, {"start": 1445, "end": 1448, "text": "(5,", "latex": null, "ref_id": "BIBREF4"}, {"start": 1449, "end": 1452, "text": "13,", "latex": null, 
"ref_id": "BIBREF13"}, {"start": 1453, "end": 1456, "text": "25,", "latex": null, "ref_id": "BIBREF25"}, {"start": 1457, "end": 1460, "text": "59,", "latex": null, "ref_id": "BIBREF62"}, {"start": 1461, "end": 1464, "text": "60)", "latex": null, "ref_id": "BIBREF63"}], "ref_spans": [], "eq_spans": [], "section": ""}, {"text": "Previous investigations showed that the MGI population seems to be rather similar on the horizontal scale in the world's oceans (25, 35, 40) , except where different water masses meet (1, 16, 30) . Vertically, however, two distinct phylogenetic groups of MGI (MGI-\u2423 and MGI-\u2425), based on analyses of 16S rRNA genes, predominated in shallow and deep waters, respectively (3, 40) . Additional evidence for depth-stratified phylogeny within MGI has also been observed by using the crenarchaeal 16S rRNA gene internal transcribed spacer region (18) and the amoA (5, 21, 48) and accA (25, 60) genes. The concordant depth-stratified phylogeny suggests that MGI populations have evolved into different ecotypes to adapt to specific habitats (niches) according to different environmental factors or resources (60) .", "cite_spans": [{"start": 128, "end": 132, "text": "(25,", "latex": null, "ref_id": "BIBREF25"}, {"start": 133, "end": 136, "text": "35,", "latex": null, "ref_id": "BIBREF36"}, {"start": 137, "end": 140, "text": "40)", "latex": null, "ref_id": "BIBREF41"}, {"start": 184, "end": 187, "text": "(1,", "latex": null, "ref_id": "BIBREF0"}, {"start": 188, "end": 191, "text": "16,", "latex": null, "ref_id": "BIBREF16"}, {"start": 192, "end": 195, "text": "30)", "latex": null, "ref_id": "BIBREF30"}, {"start": 369, "end": 372, "text": "(3,", "latex": null, "ref_id": "BIBREF2"}, {"start": 373, "end": 376, "text": "40)", "latex": null, "ref_id": "BIBREF41"}, {"start": 578, "end": 582, "text": "(25,", "latex": null, "ref_id": "BIBREF25"}, {"start": 583, "end": 586, "text": "60)", "latex": null, "ref_id": "BIBREF63"}, {"start": 800, "end": 804, "text": 
"(60)", "latex": null, "ref_id": "BIBREF63"}], "ref_spans": [], "eq_spans": [], "section": ""}, {"text": "Some studies suggested that the depth-related phylogeny of MGI may be due to photoinhibition-resistant adaptations (8, 42) . Assuming this to be true, we hypothesized that (i) MGI might have a restricted distribution pattern (niche partitioning) across the water column due to the influence of light or other continuous environmental gradients and (ii) evolutionary divergence among MGI ecotypes can be observed since niche partitioning might contribute to restricted gene flow during their evolution (44a) . To test these hypotheses, we investigated the vertical distribution of the abundance and diversity of MGI in the euphotic and upper mesopelagic zones at two stations along the Kuroshio Current using multiphasic molecular methods, including denaturing gradient gel electrophoresis (DGGE), clone libraries, quantitative PCR (qPCR), and ecotype simulation analysis. Our results showed that MGI in the upper ocean diverged into special ecotypes associated with depth in order to adapt to the light gradient in the water column.", "cite_spans": [{"start": 115, "end": 118, "text": "(8,", "latex": null, "ref_id": "BIBREF8"}, {"start": 119, "end": 122, "text": "42)", "latex": null, "ref_id": "BIBREF43"}, {"start": 501, "end": 506, "text": "(44a)", "latex": null, "ref_id": null}], "ref_spans": [], "eq_spans": [], "section": ""}, {"text": "Sampling stations and environmental conditions. The East China Sea (ECS), located in the Northwest Pacific Ocean, is the largest continental shelf sea in the temperate zone. The Kuroshio Current going along the ECS shelf edge originates from the West Pacific warm pool and is characterized by high temperature and high salinity (29) (Fig. 1) . 
Two sites along the shelf slope of the ECS on the west side of the Kuroshio Current were selected for the present study: station 712 (27.44\u00b0N, 126.14\u00b0E) and station 608 (30.33\u00b0N, 128.62\u00b0E) (Fig. 1) . Station 712 is located by the main stream of the Kuroshio Current, while station 608 is situated near a branch of the Kuroshio Current. With similar water depths, both beyond 500 m, but different environmental conditions (please see below for detailed results), these two stations provided an opportunity for comparative studies of microbial dynamics along both geographic and vertical environmental dimensions.", "cite_spans": [{"start": 328, "end": 332, "text": "(29)", "latex": null, "ref_id": "BIBREF29"}], "ref_spans": [{"start": 333, "end": 341, "text": "(Fig. 1)", "latex": null, "ref_id": null}, {"start": 533, "end": 541, "text": "(Fig. 1)", "latex": null, "ref_id": null}], "eq_spans": [], "section": "MATERIALS AND METHODS"}, {"text": "Water samples were collected on board the RV Dongfanghong #2 between 5 and 12 November 2007. A SeaBird SBE 9/11 Plus conductivity-temperaturedepth (CTD) system fitted with a rosette sampler was used to measure temperature and salinity and to collect water samples. The potential density ( t ) was calculated based on the equation for the state of seawater as proposed previously (41) . The mixed-layer depths (MLDs) were estimated by using the t values. The euphotic zone depth of these two stations was derived from the monthly mean Aqua-MODIS euphotic depth [Lee] data set (34a) in the corresponding months (http://oceancolor.gsfc.nasa.gov/cgi/l3). Water samples were collected at 10 intervals in the euphotic and upper mesopelagic zones (0 to 400 m) for each station (Table 1) . Subsamples (2 to 3 liters) were prefiltered through a 20-m mesh (Millipore, Billerica, MA) and subsequently filtered onto 0.2-m-pore-size polycarbonate filters (Millipore) at a pressure of \u03fd0.03 MPa. 
The filters were immediately frozen and stored at \u03ea80\u00b0C until further analysis.", "cite_spans": [{"start": 379, "end": 383, "text": "(41)", "latex": null, "ref_id": "BIBREF42"}, {"start": 560, "end": 565, "text": "[Lee]", "latex": null, "ref_id": null}], "ref_spans": [{"start": 770, "end": 779, "text": "(Table 1)", "latex": null, "ref_id": "TABREF1"}], "eq_spans": [], "section": "MATERIALS AND METHODS"}, {"text": "Environmental DNA extraction. Microbial community DNA was extracted by using the UltraClean Soil DNA kit (MoBio, San Diego, CA) as described elsewhere previously (26) . DNA integrity and size were checked in a 0.8% agarose gel stained with SYBR green I (Molecular Probes, Eugene, OR), and the concentrations were quantified in duplicate by using a FlexStation 3 instrument (Molecular Devices, Sunnyvale, CA) with a Quant-iT dsDNA HS assay kit (Molecular Probes). A standard curve was generated by using known amounts of lambda DNA (Molecular Probes). qPCR analysis. qPCR was performed on an ABI Prism 7500 system (Applied Biosystems, Foster City, CA) with the primers listed in Table S1 in the supplemental material. Plasmids carrying the respective 16S rRNA or functional gene fragments (archaeal and MGI 16S rRNA genes, crenarchaeal amoA and accA genes, and \u2424-AOB amoA genes) as an insert were constructed (25) . The concentrations of plasmid DNAs were determined by using a Quant-iT dsDNA HS The abundances of the 16S rRNA genes of the archaea and MGI and the functional genes of the Crenarchaeota (amoA and accA) and \u2424-AOB (amoA) in all samples were measured in triplicate for each sample. For the quantification of 16S rRNA genes, a 20-l reaction mixture consisting of 10 l of SYBR GreenER-qPCR SuperMix Universal (Molecular Probes), 50 nM ROX dye, and 5 g bovine serum albumin (BSA) plus 0.4 M primers and 1 l of template (1 to 10 ng) was used. 
For the quantification of the functional genes, the following reaction mixture was used: 10 l of SYBR Premix Ex Taq (TaKaRa, Dalian, China), 50 nM ROX dye, 5 g BSA, 0.4 M primers, and 1 l template DNA of 1 to 10 ng in a final volume of 20 l. The specificity of qPCRs was confirmed by using melting curve analysis and agarose gel electrophoresis after amplification. The thermocycling parameters and efficiency of the qPCRs are described in Table S2 in the supplemental material.", "cite_spans": [{"start": 162, "end": 166, "text": "(26)", "latex": null, "ref_id": "BIBREF26"}, {"start": 908, "end": 912, "text": "(25)", "latex": null, "ref_id": "BIBREF25"}], "ref_spans": [{"start": 678, "end": 686, "text": "Table S1", "latex": null, "ref_id": "TABREF1"}, {"start": 1891, "end": 1899, "text": "Table S2", "latex": null, "ref_id": "TABREF2"}], "eq_spans": [], "section": "MATERIALS AND METHODS"}, {"text": "T-RFLP analysis of bacterial 16S rRNA genes. Bacterial community structures were analyzed by using terminal restriction fragment length polymorphism (T-RFLP) analysis for PCR amplification with primer pair 27F/926R, with primer 27F labeled by 6-carboxyfluorescein phosphoramidite at the 5\u0408 end. The PCR conditions and chemistry were described elsewhere previously (62) . Briefly, 1 to 10 ng of the extracted DNA was added as a template in a 50-l PCR mixture. Purified PCR products were digested with RsaI (TaKaRa) at 37\u00b0C for 12 h. Digested products were recovered in a final volume of 20 l of Mill-Q water using ethanol precipitation. Purified products (10 l) were mixed with 0.5 l of the internal ET ROX-900 size standard (Amersham Bioscience) and then detected by using a MegaBACE genetic analyzer (Amersham) operated in genotyping mode (62) . The T-RFLP data were exported by using MegaBACE Genetic Profiler software and were processed with T-REX software for filtering out noise, aligning terminal restriction fragments (T-RFs), and constructing a data matrix (9) . 
The obtained matrix was further imported into PAST v1.92 (23) to perform the cluster analysis with both Sorensen and Bray-Curtis algorithms. T-RFs of \u03fd50 bp or contributing \u03fd0.5% to the total fluorescence signal were excluded from the analysis. DGGE analysis of crenarchaeal 16S rRNA genes. Crenarchaeal 16S rRNA gene fragments were amplified by employing a nested PCR strategy as described previously (44) . Briefly, crenarchaeal 16S rRNA gene fragments were amplified by using primer 21F (11) and modified primer 1492R (44) in the first round and primers 771F (45) and GI_956R (42) in the second round, with primer GI_956R containing a 40-bp GC clamp. DGGE was performed by using a Bio-Rad DCode universal mutation detection system (Bio-Rad, Hercules, CA) according to the manufacturer's instructions. The PCR products of the crenarchaeal 16S rRNA gene were applied onto 8% (wt/vol) gels in 1\u03eb Tris-acetate-EDTA (TAE) buffer with a denaturing gradient of 30 to 55% denaturant (100% denaturing solution containing 40% formamide and 7 M urea). Electrophoresis was performed at a constant temperature of 60\u00b0C and at 75 V for 16 h. 
The DGGE images, stained with SYBR green I (Molecular Probes), were captured and analyzed by using GeneSnap and GeneTools software (SynGene-Synoptics, Cambridge, United Kingdom).", "cite_spans": [{"start": 364, "end": 368, "text": "(62)", "latex": null, "ref_id": "BIBREF65"}, {"start": 840, "end": 844, "text": "(62)", "latex": null, "ref_id": "BIBREF65"}, {"start": 1065, "end": 1068, "text": "(9)", "latex": null, "ref_id": "BIBREF9"}, {"start": 1128, "end": 1132, "text": "(23)", "latex": null, "ref_id": "BIBREF23"}, {"start": 1473, "end": 1477, "text": "(44)", "latex": null, "ref_id": "BIBREF45"}, {"start": 1592, "end": 1596, "text": "(44)", "latex": null, "ref_id": "BIBREF45"}, {"start": 1633, "end": 1637, "text": "(45)", "latex": null, "ref_id": "BIBREF47"}, {"start": 1650, "end": 1654, "text": "(42)", "latex": null, "ref_id": "BIBREF43"}], "ref_spans": [], "eq_spans": [], "section": "MATERIALS AND METHODS"}, {"text": "Crenarchaeal amoA and accA gene clone library analyses. Clone libraries were constructed for the crenarchaeal amoA genes targeting ammonia monooxygenase subunit A and for the crenarchaeal accA genes targeting acetyl-CoA carboxylase subunit A for the six depth zones at each station. The amoA gene fragments were amplified with Arch-amoAF/Arch-amoAR (13) , and the accA gene fragments were amplified with Cren529F/Cren981R (59), except that the nested PCR strategy was employed when few positive PCR products were obtained from the euphotic samples. Subsequently, purified PCR products were ligated into the pMD18-T vector (TaKaRa) and then transformed into competent Escherichia coli DH5\u2423 cells (TaKaRa). 
Positive clones were screened by using PCR reamplification with vector primers M-13F and M-13R and selected for sequencing by using an ABI 3730 XL sequencer (Applied Biosystems).", "cite_spans": [{"start": 349, "end": 353, "text": "(13)", "latex": null, "ref_id": "BIBREF13"}], "ref_spans": [], "eq_spans": [], "section": "MATERIALS AND METHODS"}, {"text": "Phylogenetic analysis and ecotype simulation. The crenarchaeal amoA and accA gene sequences, along with their closest relatives retrieved from GenBank, were imported into ARB (37) . The sequences were first translated and aligned by using Clustal W in ARB, and the nucleotides were then realigned according to their protein alignments. Ambiguously and incorrectly aligned positions were corrected manually by using the ARB-edit tool. The sequence base frequency filters were used to exclude ambiguous positions and columns containing gaps. The Bayesian tree was generated by using the MrBayes v. 3.1.2 program (47) and the following parameters: the general time reversal model of evolution with gamma-invariable-distributed rate, the number of chains set to 6, and the temperature set to 0.1. Five Markov chains in parallel were run with 5,000,000 generations and sampled every 100 generations (the first 7,500 to 10,000 \"burnin\" trees were excluded from the consensus tree). Tree topologies were also evaluated with the neighbor-joining and maximum parsimony methods by using PAUP*4.0 (53) .", "cite_spans": [{"start": 175, "end": 179, "text": "(37)", "latex": null, "ref_id": "BIBREF38"}, {"start": 1086, "end": 1090, "text": "(53)", "latex": null, "ref_id": "BIBREF55"}], "ref_spans": [], "eq_spans": [], "section": "MATERIALS AND METHODS"}, {"text": "Ecotype simulation analyses were performed by using AdaptML software (27) . 
Since Bayesian trees constructed with archaeal amoA or accA gene sequences contained several multifurcation nodes, maximum likelihood trees constructed by using RAxML v7.0.4 (52) were used as inputs for AdaptML analyses with default parameters.", "cite_spans": [{"start": 69, "end": 73, "text": "(27)", "latex": null, "ref_id": "BIBREF27"}], "ref_spans": [], "eq_spans": [], "section": "MATERIALS AND METHODS"}, {"text": "Statistical analysis. Operational taxonomic units (OTUs) for clone library analyses were defined by using the furthest-neighbor algorithm in DOTUR (50) and a cutoff of \u05455% as described in previous studies (4, 13, 25) . Rarefaction, the nonparametric richness estimator Chao1, the Shannon diversity index, and Simpson's index were also calculated by using DOTUR. The coverage of each clone library was calculated as coverage (C) \u03ed 1 \u03ea (n/N) \u03eb 100, where n is the number of unique clones detected in the sample and N is the total number of clones analyzed (19) .", "cite_spans": [{"start": 147, "end": 151, "text": "(50)", "latex": null, "ref_id": "BIBREF52"}, {"start": 205, "end": 208, "text": "(4,", "latex": null, "ref_id": "BIBREF3"}, {"start": 209, "end": 212, "text": "13,", "latex": null, "ref_id": "BIBREF13"}, {"start": 213, "end": 216, "text": "25)", "latex": null, "ref_id": "BIBREF25"}, {"start": 554, "end": 558, "text": "(19)", "latex": null, "ref_id": "BIBREF19"}], "ref_spans": [], "eq_spans": [], "section": "MATERIALS AND METHODS"}, {"text": "Community classification of the crenarchaeal assemblages for both the amoA and accA genes was determined by using phylogeny-based weighted UniFrac environmental clustering (36) or ecotype-abundance-based clustering analyses. Analyses of similarity (ANOSIM) were performed to verify the significance of microbial community structures from different groups or stations. 
The significance of the correlations between two distance matrices or between microbial community structures and environmental variables was tested with the Mantel test. Analyses were performed with the PAST v1.92 program (23) . Since a normality of distribution of the individual data sets was not always met, nonparametric statistical analyses were performed by using SPSS v13.0 (SPSS, Inc., Chicago, IL).", "cite_spans": [{"start": 590, "end": 594, "text": "(23)", "latex": null, "ref_id": "BIBREF23"}], "ref_spans": [], "eq_spans": [], "section": "MATERIALS AND METHODS"}, {"text": "Nucleotide sequence accession numbers. Sequences reported in this study have been deposited in the GenBank database under accession numbers GU181423 to GU181799 (crenarchaeal amoA genes), GU195200 to GU195605 (crenarchaeal accA genes), and GU195606 to GU195629 (bacterial accA genes).", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "MATERIALS AND METHODS"}, {"text": "The main hydrographic characteristics of the two stations investigated in this study are summarized in Table 1 . The temperature was significantly higher but the t was significantly lower at station 712 than at station 608, while the salinity values were not significantly different between the two stations. As indicated by the t , the MLD was slightly greater at station 712 (\u03f375 m) than at station 608 (\u03f350 m) ( Table 1) . At both stations, the temperature showed a strong decrease with increasing depth from the MLD, whereas salinity had constant values ( Table 1) . 
As seen from the MODIS euphotic zone depth data for November 2007, the euphotic depth at station 608 was around 84 m, whereas the euphotic depth at station 712 was slightly greater, at \u03f3106 m.", "cite_spans": [], "ref_spans": [{"start": 103, "end": 110, "text": "Table 1", "latex": null, "ref_id": "TABREF1"}, {"start": 415, "end": 423, "text": "Table 1)", "latex": null, "ref_id": "TABREF1"}, {"start": 560, "end": 568, "text": "Table 1)", "latex": null, "ref_id": "TABREF1"}], "eq_spans": [], "section": "RESULTS"}, {"text": "Quantification of 16S rRNA, amoA, and accA genes of pelagic Crenarchaeota. Archaeal 16S rRNA gene abundances ranged from 1.41 \u03eb 10 2 to 3.72 \u03eb 10 4 copies ng DNA \u03ea1 within the euphotic zone at the two stations (Fig. 2) . Depth profiles of MGI 16S rRNA gene abundance ranging from 3.3 \u03eb 10 1 to 1.38 \u03eb 10 4 copies ng DNA \u03ea1 were similar to those of archaeal 16S rRNA genes (Fig. 2) . The MGI 16S rRNA gene abundance was significantly correlated with the archaeal 16S rRNA gene abundance (R 2 \u03ed 0.86; P \u03fd 0.001). Furthermore, qPCR analysis showed that MGI dominated in the archaeal community below the euphotic zone, with an averaged relative abundance of 57.2% of the total.", "cite_spans": [], "ref_spans": [{"start": 210, "end": 218, "text": "(Fig. 2)", "latex": null, "ref_id": "FIGREF0"}, {"start": 372, "end": 380, "text": "(Fig. 2)", "latex": null, "ref_id": "FIGREF0"}], "eq_spans": [], "section": "RESULTS"}, {"text": "The crenarchaeal amoA gene abundance increased from near the detection limit in the surface waters to a maximum of 2.44 \u03eb 10 4 copies ng DNA \u03ea1 in the upper mesopelagic zones of both stations (Fig. 2) . A linear regression analysis indicated that the crenarchaeal amoA and MGI 16S rRNA genes were significantly correlated with each other (R 2 \u03ed 0.83; P \u03fd 0.001). 
The abundance of the amoA gene of \u2424-AOB was below the detection limits in all samples.", "cite_spans": [], "ref_spans": [{"start": 192, "end": 200, "text": "(Fig. 2)", "latex": null, "ref_id": "FIGREF0"}], "eq_spans": [], "section": "RESULTS"}, {"text": "The abundance of the crenarchaeal accA gene was almost always below the detection limit within the euphotic zones but then increased with depth to maximal values at a depth of 300 m at both stations (Fig. 2) . The abundance ratios of the crenarchaeal accA gene to the MGI 16S rRNA gene or to the crenarchaeal amoA gene increased with depth, and the peak ratios occurred in the upper mesopelagic zone (Fig. 3) . Community structure of bacterial and crenarchaeal 16S rRNA genes. The T-RFLP pattern of the bacterial communities revealed a total of 134 T-RFs at the 16S rRNA gene level. The number of T-RFs showed no depth-related pattern at both stations (data not shown). In contrast, ANOSIM showed distinct site-or water mass-specific bacterial communities between the two stations (P \u03fd 0.05). Furthermore, the cluster analysis demonstrated that the bacterial communities at station 608 were stratified in the water column, but most samples from station 712 clustered together and did not show a depthrelated pattern (Fig. 4a) . Mantel tests indicated that the bacterial communities at station 608 were strongly influenced by depth, temperature, and t (r \u03fe 0.5; P \u03fd 0.001), while this influence at station 712 was not obviously seen (P \u03fe 0.1).", "cite_spans": [], "ref_spans": [{"start": 199, "end": 207, "text": "(Fig. 2)", "latex": null, "ref_id": "FIGREF0"}, {"start": 400, "end": 408, "text": "(Fig. 3)", "latex": null, "ref_id": null}, {"start": 1016, "end": 1025, "text": "(Fig. 4a)", "latex": null, "ref_id": "FIGREF2"}], "eq_spans": [], "section": "RESULTS"}, {"text": "Crenarchaeal community structures at both stations were characterized on the basis of the 16S rRNA gene by using DGGE. 
The number of DGGE bands detected ranged from 1 to 14 per sample (see Fig. S1 in the supplemental material). There were relatively lower band numbers in the euphotic zone (\u0545100 m) than in greater depths at both stations (P \u03fd 0.05 by Mann-Whitney test) (Fig. S1 ), resulting in depth-related variations. Cluster analysis demonstrated a general depth-stratified pattern for the crenarchaeal communities at both stations ( Fig. 4b) : a deep clade (\u0546150 m) and a euphotic zone cluster (\u0545100 m). Moreover, the deep clade could be further divided into two subclades (Fig. 4b) closely related to water depth, temperature, and t (r \u03fe 0.4; P \u03fd 0.001). In contrast, homogeneous crenarchaeal communities were observed at both stations (P \u03fe 0.5 by ANOSIM).", "cite_spans": [], "ref_spans": [{"start": 189, "end": 196, "text": "Fig. S1", "latex": null, "ref_id": null}, {"start": 371, "end": 379, "text": "(Fig. S1", "latex": null, "ref_id": null}, {"start": 539, "end": 547, "text": "Fig. 4b)", "latex": null, "ref_id": "FIGREF2"}, {"start": 679, "end": 688, "text": "(Fig. 4b)", "latex": null, "ref_id": "FIGREF2"}], "eq_spans": [], "section": "RESULTS"}, {"text": "Diversity and phylogeny of crenarchaeal amoA and accA genes. To assess the evolutionary divergence and restricted distribution of MGI ecotypes along the water column, samples from six depths from each station were chosen, based on the above-described DGGE profiles, to construct clone libraries of the crenarchaeal amoA and accA genes. In total, 377 amoA and 406 accA gene sequences were obtained (Table 2) . These sequences contained 31 unique amoA OTUs and 53 unique accA OTUs based on a 5% cutoff value at the DNA level. The numbers of OTUs per sample varied between 2 and 15 for amoA and between 3 and 17 for accA ( Table 2) . The values of library coverage ranged from 83.7 to 100% for amoA and 82 to 97.4% for accA. 
The diversity of amoA varied at both stations: clone libraries below the euphotic zones (\u0546150 m) had a higher level of diversity than did those recovered from the shallower depths (\u0545100 m), as indicated by the Shannon, reciprocal Simpson's, and Chao1 diversity indices (P \u03fd 0.01 by Mann-Whitney test) ( Table 2 ). In contrast, there were no significant differences in accA diversity between the euphotic zone samples and those from greater depths (P \u03fe 0.1 by MannWhitney test) ( Table 2) . Overall, the numbers of OTUs and the Shannon and Chao1 richnesses of accA were generally higher than those of amoA, with three exceptions: at stations at depths of 608 to 400 m, 712 to 150 m, and 712 to 400 m. Phylogenetic analyses demonstrated that all sequences of both genes were affiliated with the two primary marine clusters described previously, the \"shallow group\" and the \"deep group\" (1, 25, 60) (see Fig. S2 in the supplemental material), indicating that there was a high level of phylogenetic congruence between the crenarchaeal amoA and accA genes. The shallow group for both genes contained sequences derived exclusively from epipelagic water (\u0545200 m) (Fig. S2 ), but their relative abundances decreased with increasing depth (Fig. 5) . Accordingly, the deep group for both genes appeared near the base of the euphotic zone (150 m for amoA genes and 75 m for accA genes), and the relative abundance of this group increased with depth (Fig. 5) .", "cite_spans": [], "ref_spans": [{"start": 397, "end": 406, "text": "(Table 2)", "latex": null, "ref_id": "TABREF2"}, {"start": 620, "end": 628, "text": "Table 2)", "latex": null, "ref_id": "TABREF2"}, {"start": 1025, "end": 1032, "text": "Table 2", "latex": null, "ref_id": "TABREF2"}, {"start": 1201, "end": 1209, "text": "Table 2)", "latex": null, "ref_id": "TABREF2"}, {"start": 1623, "end": 1630, "text": "Fig. S2", "latex": null, "ref_id": "FIGREF0"}, {"start": 1878, "end": 1886, "text": "(Fig. 
S2", "latex": null, "ref_id": "FIGREF0"}, {"start": 1952, "end": 1960, "text": "(Fig. 5)", "latex": null, "ref_id": null}, {"start": 2160, "end": 2168, "text": "(Fig. 5)", "latex": null, "ref_id": null}], "eq_spans": [], "section": "RESULTS"}, {"text": "Ecotype simulation and community classification of crenarchaeal amoA and accA. The UniFrac significance and P-test significance analyses indicated that there was a nonrandom clustering of the community structures of crenarchaeal amoA and accA across the sampling depths at both stations (P \u03fd 0.001). We therefore demarcated ecotypes using the AdaptML model, with the sampling depths and sites as habitat source inputs. AdaptML identified 10 amoA ecotypes and 10 accA ecotypes, both of which were clearly associated with sampling depth but not with sampling site (Fig. 6a and b) . Noticeably, some amoA and accA ecotypes (8 out of 10 ecotypes) had similar depth-related variations ( Fig. 6c and d) . For example, the relative abundances of amoA-E1/accA-E1 and amoA-E2/ accA-E2 generally decreased with increasing depth, while the abundances of amoA-E5/accA-E5 and amoA-E6/accA-E6 increased from the surface to the bottom of the euphotic zone (\u03f3100 m). On the other hand, amoA-E7/accA-E7 and amoA-E8/accA-E8 appeared near the bottom of the euphotic zone and showed an opposite trend. Both amoA-E9/accA-E9 and amoA-E10/accA-E10 thrived in the mesopelagic zone. These results supported the phylogenetic congruence between the amoA and accA genes, implying that most pelagic Crenarchaeota adapt to the changing environments along the vertical dimension of the water column.", "cite_spans": [], "ref_spans": [{"start": 562, "end": 577, "text": "(Fig. 6a and b)", "latex": null, "ref_id": "FIGREF4"}, {"start": 682, "end": 696, "text": "Fig. 
6c and d)", "latex": null, "ref_id": "FIGREF4"}], "eq_spans": [], "section": "RESULTS"}, {"text": "Both phylogeny-and ecotype-based cluster analyses demonstrated almost identical depth-related patterns for the amoA and accA genes (Fig. 7) . Generally, the community structures of both genes could be divided into three major clusters ( Fig. 7) : a euphotic zone cluster (\u0545100 m), a deep epipelagic cluster (150 m and 200 m), and a mesopelagic cluster (400 m). Mantel tests verified that there were significant correlations among these clusters (r \u03fe 0.5; P \u03fd 0.001).", "cite_spans": [], "ref_spans": [{"start": 131, "end": 139, "text": "(Fig. 7)", "latex": null, "ref_id": null}, {"start": 237, "end": 245, "text": "Fig. 7)", "latex": null, "ref_id": null}], "eq_spans": [], "section": "RESULTS"}, {"text": "Spatial distribution of MGI in the ECS. The spatial dynamics of pelagic microbial populations is a central concern in microbial oceanography (14) . Several recent studies indicate that ocean water masses play an important role in shaping the community structures and distribution patterns of plegia bacteria (2, 17, 22, 63) and archaea (1, 16, 30) . The results of the present study did verify the recognition with bacterial data from the two investigation stations with heterogeneity in hydrographic characteristics. However, our archaeal data, including those from crenarchaeal 16S rRNA gene-based DGGE analysis ( Fig. 4b and see Fig. S1 in the supplemental material) and amoA and accA gene-based clone library analyses ( Fig. 7a and b) , showed highly similar crenarchaeal communities at both investigation stations. The case of archaea is unexpected but consistent with some previous studies showing that communities of pelagic archaea from similar water depths appear to be similar in structure regardless of geographic location (25, 35, 61) . 
The observed distinct bacterial communities and similar archaeal communities between the two hydrological sites suggest different controlling mechanisms for bacteria and archaea in spatial dynamics, which is of great interest for future studies.", "cite_spans": [{"start": 141, "end": 145, "text": "(14)", "latex": null, "ref_id": "BIBREF14"}, {"start": 308, "end": 311, "text": "(2,", "latex": null, "ref_id": "BIBREF1"}, {"start": 312, "end": 315, "text": "17,", "latex": null, "ref_id": "BIBREF17"}, {"start": 316, "end": 319, "text": "22,", "latex": null, "ref_id": "BIBREF22"}, {"start": 320, "end": 323, "text": "63)", "latex": null, "ref_id": "BIBREF66"}, {"start": 336, "end": 339, "text": "(1,", "latex": null, "ref_id": "BIBREF0"}, {"start": 340, "end": 343, "text": "16,", "latex": null, "ref_id": "BIBREF16"}, {"start": 344, "end": 347, "text": "30)", "latex": null, "ref_id": "BIBREF30"}, {"start": 1035, "end": 1039, "text": "(25,", "latex": null, "ref_id": "BIBREF25"}, {"start": 1040, "end": 1043, "text": "35,", "latex": null, "ref_id": "BIBREF36"}, {"start": 1044, "end": 1047, "text": "61)", "latex": null, "ref_id": "BIBREF64"}], "ref_spans": [{"start": 616, "end": 639, "text": "Fig. 4b and see Fig. S1", "latex": null, "ref_id": "FIGREF2"}, {"start": 724, "end": 739, "text": "Fig. 7a and b)", "latex": null, "ref_id": null}], "eq_spans": [], "section": "DISCUSSION"}, {"text": "Niche partitioning of MGI in the upper ocean of the ECS. As shown by the DGGE profiles of 16S rRNA genes showing that depth-stratified MGI populations inhabited both stations, archaeal communities are less variable in geographic dimension but more variable along the water depth. The functional gene-based clone library analyses revealed that the crenarchaeal amoA and accA genes diverged into two lineages (shallow and deep groups) associated with different water layers (epipelagia versus mesopelagia) ( observed previously for other ocean regions (5, 40, 42, 60) . 
In the central California Current, members of the \"deep group\" are strongly associated with deepwater transportation but are less active than their shallow-water-adapted counterparts, providing evidence that these two independent MGI groups indeed represent shallow-and deep-water-adapted \"ecotypes\" (48) . In the present work, we applied AdaptML modeling and found 10 ecotypes for the amoA and accA genes, while only two general lineages were demarcated based on common phylogenetic analyses (Fig. 6) . Definition of ecotypes with higher resolution would provide new insights into the evolutionary mechanisms of MGI speciation. More interestingly, although the accA gene abundance was extremely low in the epipelagic zone, 8 out of 10 ecotypes of both genes had similar depthrelated distribution patterns ( Fig. 6 and 7c and d) . This finding suggested that MGI functional groups containing amoA and accA underwent a similar ecological adaptation and evolutionary history. This conclusion is further supported by the almost consistent clustering network observation based on phylogenetic branch length or ecotype abundance (Fig. 7) .", "cite_spans": [{"start": 550, "end": 553, "text": "(5,", "latex": null, "ref_id": "BIBREF4"}, {"start": 554, "end": 557, "text": "40,", "latex": null, "ref_id": "BIBREF41"}, {"start": 558, "end": 561, "text": "42,", "latex": null, "ref_id": "BIBREF43"}, {"start": 562, "end": 565, "text": "60)", "latex": null, "ref_id": "BIBREF63"}, {"start": 868, "end": 872, "text": "(48)", "latex": null, "ref_id": "BIBREF50"}], "ref_spans": [{"start": 1061, "end": 1069, "text": "(Fig. 6)", "latex": null, "ref_id": "FIGREF4"}, {"start": 1376, "end": 1396, "text": "Fig. 6 and 7c and d)", "latex": null, "ref_id": "FIGREF4"}, {"start": 1692, "end": 1700, "text": "(Fig. 
7)", "latex": null, "ref_id": null}], "eq_spans": [], "section": "DISCUSSION"}, {"text": "Although the mechanisms of the niche partitioning of MGI with depth remain unknown, light and oxygen have been proposed to play a significant role in structuring of communities of ammonia-oxidizing crenarchaea in the ocean (42, 43) . In the present study, the ecotypes of MGI in the oxygenated upper ocean of the ECS had a strongly restricted distribution with depths where the light intensity was attenuated across the water column, suggesting that light might have been a potential key factor resulting in this depth-related phylogenetic partitioning of MGI. However, it was impossible to exclude the effects of other environmental gradients existing in the water column, such as dissolved organic matter (DOM) and pressure. Further studies are needed to establish a certain relationship between environmental variables and crenarchaeal ecotypes.", "cite_spans": [{"start": 223, "end": 227, "text": "(42,", "latex": null, "ref_id": "BIBREF43"}, {"start": 228, "end": 231, "text": "43)", "latex": null, "ref_id": "BIBREF44"}], "ref_spans": [], "eq_spans": [], "section": "DISCUSSION"}, {"text": "Ecological roles of MGI in the ocean. The qPCR assays indicated an amoA-to-16S rRNA gene ratio close to 2 at both stations, which is within the reference ranges reported previously for other regions of the Pacific Ocean (5, 8, 25, 48) . Numerous studies have demonstrated that crenarchaeal amoA genes are more abundant than those of AOB in various open oceans (5, 10, 48, 58) . In agreement with the results of those studies, our data suggest that ammonia-oxidizing archaea outcompete AOB in the open region of the study sites, which might be explained by the conspicuous adaptability of MGI to low concentrations of ammonia (38) . 
It was suggested previously that meso-and bathypelagic Crenarchaeota may lack the amoA gene, given the low ratios of amoA to 16S rRNA genes, and therefore are heterotrophic (1, 10) . This conclusion remains debatable due to the less comprehensive primers used in those studies (8, 33, 60) . In this study, besides the amoA genes, we quantified the abundance of accA genes, one of the key genes in the 3-hydroxypropionate/4-hydroxybutyrate pathway of mesophilic Crenarchaeota. Our results indicated that crenarchaeal accA genes were almost absent in the euphotic zone but were more abundant below the euphotic zone. The scarcity of crenarchaeal accA genes in the euphotic zone raised the possibility that epipelagic Crenarchaeota may rely on chemolithoheterotrophy and play a minor role in dissolved inorganic carbon fixation. However, a ratio of 1 amoA gene copy to 1 accA gene copy was found previously in the metagenomic data set recovered from surface water of the Sargasso Sea (21) . One possible explanation for this discrepancy is that some accA genes of epipelagic Crenarchaeota might have been missed in our qPCR assays, since the primers commonly used were designed based on only a few available sequences (25) . In contrast, in the upper mesopelagic zone, the ratios of crenarchaeal accA to MGI 16S rRNA genes or to crenarchaeal amoA genes were close to 1, which was in agreement with our previous investigation in the South China Sea (25) . This finding suggested that ammonia oxidization may be an important energy source for autotrophic CO 2 fixation by MGI in deep waters (1, 24) . Such chemoautotrophy can reduce the respiratory consumption of DOM and further provide fresh DOM for other microbial carbon demands. An earlier study revealed that CO 2 fixation by marine Crenarchaeota can meet the substantial carbon demand of the deep-sea microbial food web (46) . 
A portion of such fresh DOM could ultimately be transformed into recalcitrant DOM through the microbial carbon pump contributing ocean carbon sequestration (28) . The increasing ratio of crenarchaeal accA to amoA or to MGI 16S rRNA genes from the euphotic zone to the mesopelagic zone suggests that MGI could play a more important role in the dark ocean. The recognition of such a function of archaea at taxonomic-and functional-group levels would shed light on the mechanisms of carbon cycling in the ocean. A comprehensive view of the archaeal community structure and its ecological functioning is to be acquired through multiple approaches, including metagenomics, proteomics, and metabolomics, in the future.", "cite_spans": [{"start": 220, "end": 223, "text": "(5,", "latex": null, "ref_id": "BIBREF4"}, {"start": 224, "end": 226, "text": "8,", "latex": null, "ref_id": "BIBREF8"}, {"start": 227, "end": 230, "text": "25,", "latex": null, "ref_id": "BIBREF25"}, {"start": 231, "end": 234, "text": "48)", "latex": null, "ref_id": "BIBREF50"}, {"start": 360, "end": 363, "text": "(5,", "latex": null, "ref_id": "BIBREF4"}, {"start": 364, "end": 367, "text": "10,", "latex": null, "ref_id": "BIBREF10"}, {"start": 368, "end": 371, "text": "48,", "latex": null, "ref_id": "BIBREF50"}, {"start": 372, "end": 375, "text": "58)", "latex": null, "ref_id": "BIBREF61"}, {"start": 625, "end": 629, "text": "(38)", "latex": null, "ref_id": "BIBREF39"}, {"start": 805, "end": 808, "text": "(1,", "latex": null, "ref_id": "BIBREF0"}, {"start": 809, "end": 812, "text": "10)", "latex": null, "ref_id": "BIBREF10"}, {"start": 909, "end": 912, "text": "(8,", "latex": null, "ref_id": "BIBREF8"}, {"start": 913, "end": 916, "text": "33,", "latex": null, "ref_id": "BIBREF33"}, {"start": 917, "end": 920, "text": "60)", "latex": null, "ref_id": "BIBREF63"}, {"start": 1613, "end": 1617, "text": "(21)", "latex": null, "ref_id": "BIBREF21"}, {"start": 1847, "end": 1851, "text": "(25)", "latex": null, "ref_id": 
"BIBREF25"}, {"start": 2077, "end": 2081, "text": "(25)", "latex": null, "ref_id": "BIBREF25"}, {"start": 2218, "end": 2221, "text": "(1,", "latex": null, "ref_id": "BIBREF0"}, {"start": 2222, "end": 2225, "text": "24)", "latex": null, "ref_id": "BIBREF24"}, {"start": 2504, "end": 2508, "text": "(46)", "latex": null, "ref_id": "BIBREF48"}, {"start": 2667, "end": 2671, "text": "(28)", "latex": null, "ref_id": "BIBREF28"}], "ref_spans": [], "eq_spans": [], "section": "DISCUSSION"}], "bib_entries": {"BIBREF0": {"ref_id": "b0", "title": "Major gradients in putatively nitrifying and non-nitrifying Archaea in the deep North Atlantic", "authors": [{"first": "H", "middle": [], "last": "Agogu\u00e9", "suffix": ""}, {"first": "M", "middle": [], "last": "Brink", "suffix": ""}, {"first": "J", "middle": [], "last": "Dinasquet", "suffix": ""}, {"first": "G", "middle": ["J"], "last": "Herndl", "suffix": ""}], "year": 2008, "venue": "Nature", "volume": "456", "issn": "", "pages": "788--791", "other_ids": {}}, "BIBREF1": {"ref_id": "b1", "title": "Water mass specificity of bacterial communities in the North Atlantic revealed by massively parallel sequencing", "authors": [{"first": "H", "middle": [], "last": "Agogu\u00e9", "suffix": ""}, {"first": "D", "middle": [], "last": "Lamy", "suffix": ""}, {"first": "P", "middle": ["R"], "last": "Neal", "suffix": ""}, {"first": "M", "middle": ["L"], "last": "Sogin", "suffix": ""}, {"first": "G", "middle": ["J"], "last": "Herndl", "suffix": ""}], "year": 2011, "venue": "Mol. 
Ecol", "volume": "20", "issn": "", "pages": "258--274", "other_ids": {}}, "BIBREF2": {"ref_id": "b2", "title": "Phylogenetic composition of Arctic Ocean archaeal assemblages and comparison with Antarctic assemblages", "authors": [{"first": "N", "middle": [], "last": "Bano", "suffix": ""}, {"first": "S", "middle": [], "last": "Ruffin", "suffix": ""}, {"first": "B", "middle": [], "last": "Ransom", "suffix": ""}, {"first": "J", "middle": ["T"], "last": "Hollibaugh", "suffix": ""}], "year": 2004, "venue": "Appl. Environ. Microbiol", "volume": "70", "issn": "", "pages": "781--789", "other_ids": {}}, "BIBREF3": {"ref_id": "b3", "title": "Diversity of ammonia-oxidizing archaea and bacteria in the sediments of a hypernutrified subtropical estuary: Bahia del Tobari", "authors": [{"first": "J", "middle": ["M"], "last": "Beman", "suffix": ""}, {"first": "C", "middle": ["A"], "last": "Francis", "suffix": ""}], "year": 2006, "venue": "Mexico. Appl. Environ. Microbiol", "volume": "72", "issn": "", "pages": "7767--7777", "other_ids": {}}, "BIBREF4": {"ref_id": "b4", "title": "Molecular and biogeochemical evidence for ammonia oxidation by marine Crenarchaeota in the Gulf of California", "authors": [{"first": "J", "middle": ["M"], "last": "Beman", "suffix": ""}, {"first": "B", "middle": ["N"], "last": "Popp", "suffix": ""}, {"first": "C", "middle": ["A"], "last": "Francis", "suffix": ""}], "year": 2008, "venue": "ISME J", "volume": "2", "issn": "", "pages": "429--441", "other_ids": {}}, "BIBREF5": {"ref_id": "b5", "title": "Mesophilic Crenarchaeota: proposal for a third archaeal phylum, the Thaumarchaeota", "authors": [{"first": "C", "middle": [], "last": "Brochier-Armanet", "suffix": ""}, {"first": "B", "middle": [], "last": "Boussau", "suffix": ""}, {"first": "S", "middle": [], "last": "Gribaldo", "suffix": ""}, {"first": "P", "middle": [], "last": "Forterre", "suffix": ""}], "year": 2008, "venue": "Nat. Rev. 
Microbiol", "volume": "6", "issn": "", "pages": "245--252", "other_ids": {}}, "BIBREF6": {"ref_id": "b6", "title": "Seasonal dynamics of SAR11 populations in the euphotic and mesopelagic zones of the northwestern Sargasso Sea", "authors": [{"first": "C", "middle": ["A"], "last": "Carlson", "suffix": ""}], "year": 2009, "venue": "ISME J", "volume": "3", "issn": "", "pages": "283--295", "other_ids": {}}, "BIBREF7": {"ref_id": "b7", "title": "Physicalbiological sources for dense algal blooms near the Changjiang River", "authors": [{"first": "C", "middle": [], "last": "Chen", "suffix": ""}, {"first": "J", "middle": [], "last": "Zhu", "suffix": ""}, {"first": "R", "middle": ["C"], "last": "Beardsley", "suffix": ""}, {"first": "P", "middle": ["J S"], "last": "Franks", "suffix": ""}], "year": 2003, "venue": "Geophys. Res. Lett", "volume": "30", "issn": "", "pages": "22--23", "other_ids": {}}, "BIBREF8": {"ref_id": "b8", "title": "Abundances of crenarchaeal amoA genes and transcripts in the Pacific Ocean", "authors": [{"first": "M", "middle": ["J"], "last": "Church", "suffix": ""}, {"first": "B", "middle": [], "last": "Wai", "suffix": ""}, {"first": "D", "middle": ["M"], "last": "Karl", "suffix": ""}, {"first": "E", "middle": ["F"], "last": "Delong", "suffix": ""}], "year": 2010, "venue": "Environ. 
Microbiol", "volume": "12", "issn": "", "pages": "679--688", "other_ids": {}}, "BIBREF9": {"ref_id": "b9", "title": "T-REX: software for the processing and analysis of T-RFLP data", "authors": [{"first": "S", "middle": [], "last": "Culman", "suffix": ""}, {"first": "R", "middle": [], "last": "Bukowski", "suffix": ""}, {"first": "H", "middle": [], "last": "Gauch", "suffix": ""}, {"first": "H", "middle": [], "last": "Cadillo-Quiroz", "suffix": ""}, {"first": "D", "middle": [], "last": "Buckley", "suffix": ""}], "year": 2009, "venue": "BMC Bioinformatics", "volume": "10", "issn": "", "pages": "", "other_ids": {}}, "BIBREF10": {"ref_id": "b10", "title": "Spatial distribution of Bacteria and Archaea and amoA gene copy numbers throughout the water column of the Eastern Mediterranean Sea", "authors": [{"first": "D", "middle": [], "last": "De Corte", "suffix": ""}, {"first": "T", "middle": [], "last": "Yokokawa", "suffix": ""}, {"first": "M", "middle": ["M"], "last": "Varela", "suffix": ""}, {"first": "H", "middle": [], "last": "Agogue", "suffix": ""}, {"first": "G", "middle": ["J"], "last": "Herndl", "suffix": ""}], "year": 2009, "venue": "ISME J", "volume": "3", "issn": "", "pages": "147--158", "other_ids": {}}, "BIBREF11": {"ref_id": "b11", "title": "Archaea in coastal marine environment", "authors": [{"first": "E", "middle": ["F"], "last": "Delong", "suffix": ""}], "year": 1992, "venue": "Proc. Natl. Acad. Sci. U. S. 
A", "volume": "89", "issn": "", "pages": "5685--5689", "other_ids": {}}, "BIBREF12": {"ref_id": "b12", "title": "Community genomics among stratified microbial assemblages in the ocean's interior", "authors": [{"first": "E", "middle": ["F"], "last": "Delong", "suffix": ""}], "year": 2006, "venue": "Science", "volume": "311", "issn": "", "pages": "496--503", "other_ids": {}}, "BIBREF13": {"ref_id": "b13", "title": "Ubiquity and diversity of ammonia-oxidizing archaea in water columns and sediments of the ocean", "authors": [{"first": "C", "middle": ["A"], "last": "Francis", "suffix": ""}, {"first": "K", "middle": ["J"], "last": "Roberts", "suffix": ""}, {"first": "J", "middle": ["M"], "last": "Beman", "suffix": ""}, {"first": "A", "middle": ["E"], "last": "Santoro", "suffix": ""}, {"first": "B", "middle": ["B"], "last": "Oakley", "suffix": ""}], "year": 2005, "venue": "Proc. Natl. Acad. Sci. U. S. A", "volume": "102", "issn": "", "pages": "14683--14688", "other_ids": {}}, "BIBREF14": {"ref_id": "b14", "title": "Microbial community structure and its functional implications", "authors": [{"first": "J", "middle": ["A"], "last": "Fuhrman", "suffix": ""}], "year": 2009, "venue": "Nature", "volume": "459", "issn": "", "pages": "193--199", "other_ids": {}}, "BIBREF15": {"ref_id": "b15", "title": "Novel major archaebacterial group from marine plankton", "authors": [{"first": "J", "middle": ["A"], "last": "Fuhrman", "suffix": ""}, {"first": "K", "middle": [], "last": "Mccallum", "suffix": ""}, {"first": "A", "middle": ["A"], "last": "Davis", "suffix": ""}], "year": 1992, "venue": "Nature", "volume": "356", "issn": "", "pages": "148--149", "other_ids": {}}, "BIBREF16": {"ref_id": "b16", "title": "Archaeal diversity and a gene for ammonia oxidation are coupled to oceanic circulation", "authors": [{"first": "P", "middle": ["E"], "last": "Galand", "suffix": ""}], "year": 2009, "venue": "Environ. 
Microbiol", "volume": "11", "issn": "", "pages": "971--980", "other_ids": {}}, "BIBREF17": {"ref_id": "b17", "title": "Hydrography shapes bacterial biogeography of the deep Arctic Ocean", "authors": [{"first": "P", "middle": ["E"], "last": "Galand", "suffix": ""}, {"first": "M", "middle": [], "last": "Potvin", "suffix": ""}, {"first": "E", "middle": ["O"], "last": "Casamayor", "suffix": ""}, {"first": "C", "middle": [], "last": "Lovejoy", "suffix": ""}], "year": 2009, "venue": "ISME J", "volume": "4", "issn": "", "pages": "564--576", "other_ids": {}}, "BIBREF18": {"ref_id": "b18", "title": "Microdiversity of uncultured marine prokaryotes: the SAR11 cluster and the marine Archaea of group I", "authors": [{"first": "J", "middle": [], "last": "Garcia-Martinez", "suffix": ""}, {"first": "F", "middle": [], "last": "Rodriguez-Valera", "suffix": ""}], "year": 2000, "venue": "Mol. Ecol", "volume": "9", "issn": "", "pages": "935--948", "other_ids": {}}, "BIBREF19": {"ref_id": "b19", "title": "The population frequencies of species and the estimation of population parameters", "authors": [{"first": "I", "middle": ["J"], "last": "Good", "suffix": ""}], "year": 1953, "venue": "Biometrika", "volume": "40", "issn": "", "pages": "237--264", "other_ids": {}}, "BIBREF20": {"ref_id": "b20", "title": "Genomic analysis of the uncultivated marine crenarchaeote Cenarchaeum symbiosum", "authors": [{"first": "S", "middle": ["J"], "last": "Hallam", "suffix": ""}], "year": 2006, "venue": "Proc. Natl. Acad. Sci. U. S. 
A", "volume": "103", "issn": "", "pages": "18296--18301", "other_ids": {}}, "BIBREF21": {"ref_id": "b21", "title": "Pathways of carbon assimilation and ammonia oxidation suggested by environmental genomic analyses of marine Crenarchaeota", "authors": [{"first": "S", "middle": ["J"], "last": "Hallam", "suffix": ""}], "year": 2006, "venue": "PLoS Biol", "volume": "4", "issn": "", "pages": "", "other_ids": {}}, "BIBREF22": {"ref_id": "b22", "title": "Water masses and biogeography of picoeukaryote assemblages in a cold hydrographically complex system", "authors": [{"first": "A", "middle": ["K"], "last": "Hamilton", "suffix": ""}, {"first": "C", "middle": [], "last": "Lovejoy", "suffix": ""}, {"first": "P", "middle": ["E"], "last": "Galand", "suffix": ""}, {"first": "R", "middle": ["G"], "last": "Ingram", "suffix": ""}], "year": 2008, "venue": "Limnol. Oceanogr", "volume": "53", "issn": "", "pages": "922--935", "other_ids": {}}, "BIBREF23": {"ref_id": "b23", "title": "PAST: paleontological statistics software package for education and data analysis", "authors": [{"first": "\u00d8", "middle": [], "last": "Hammer", "suffix": ""}, {"first": "D", "middle": ["A T"], "last": "Harper", "suffix": ""}, {"first": "P", "middle": ["D"], "last": "Ryan", "suffix": ""}], "year": 2001, "venue": "Palaeontol. Electron", "volume": "4", "issn": "", "pages": "", "other_ids": {}}, "BIBREF24": {"ref_id": "b24", "title": "The radiocarbon signature of microorganisms in the mesopelagic ocean", "authors": [{"first": "R", "middle": ["L"], "last": "Hansman", "suffix": ""}], "year": 2009, "venue": "Proc. Natl. Acad. Sci. U. S. 
A", "volume": "106", "issn": "", "pages": "6513--6518", "other_ids": {}}, "BIBREF25": {"ref_id": "b25", "title": "Community structure and function of planktonic Crenarchaeota: changes with depth in the South China Sea", "authors": [{"first": "A", "middle": [], "last": "Hu", "suffix": ""}, {"first": "N", "middle": [], "last": "Jiao", "suffix": ""}, {"first": "C", "middle": ["L"], "last": "Zhang", "suffix": ""}], "year": 2011, "venue": "Microb. Ecol", "volume": "61", "issn": "", "pages": "549--563", "other_ids": {}}, "BIBREF26": {"ref_id": "b26", "title": "Community structures of ammonia-oxidising archaea and bacteria in high-altitude lakes on the Tibetan Plateau", "authors": [{"first": "A", "middle": [], "last": "Hu", "suffix": ""}], "year": 2010, "venue": "Freshw. Biol", "volume": "55", "issn": "", "pages": "2375--2390", "other_ids": {}}, "BIBREF27": {"ref_id": "b27", "title": "Resource partitioning and sympatric differentiation among closely related bacterioplankton", "authors": [{"first": "D", "middle": ["E"], "last": "Hunt", "suffix": ""}], "year": 2008, "venue": "Science", "volume": "320", "issn": "", "pages": "1081--1085", "other_ids": {}}, "BIBREF28": {"ref_id": "b28", "title": "Microbial production of recalcitrant dissolved organic matter: long-term carbon storage in the global ocean", "authors": [{"first": "N", "middle": [], "last": "Jiao", "suffix": ""}], "year": 2010, "venue": "Nat. Rev. Microbiol", "volume": "8", "issn": "", "pages": "593--599", "other_ids": {}}, "BIBREF29": {"ref_id": "b29", "title": "Influence of hydrographic conditions on picoplankton distribution in the East China Sea", "authors": [{"first": "N", "middle": [], "last": "Jiao", "suffix": ""}, {"first": "Y", "middle": [], "last": "Yang", "suffix": ""}, {"first": "H", "middle": [], "last": "Koshikawa", "suffix": ""}, {"first": "M", "middle": [], "last": "Watanabe", "suffix": ""}], "year": 2002, "venue": "Aquat. Microb. 
Ecol", "volume": "30", "issn": "", "pages": "37--48", "other_ids": {}}, "BIBREF30": {"ref_id": "b30", "title": "Ammonia-oxidizing Archaea in the Arctic Ocean and Antarctic coastal waters", "authors": [{"first": "K", "middle": ["M"], "last": "Kalanetra", "suffix": ""}, {"first": "N", "middle": [], "last": "Bano", "suffix": ""}, {"first": "J", "middle": ["T"], "last": "Hollibaugh", "suffix": ""}], "year": 2009, "venue": "Environ. Microbiol", "volume": "11", "issn": "", "pages": "2434--2445", "other_ids": {}}, "BIBREF31": {"ref_id": "b31", "title": "Archaeal dominance in the mesopelagic zone of the Pacific Ocean", "authors": [{"first": "M", "middle": ["B"], "last": "Karner", "suffix": ""}, {"first": "E", "middle": ["F"], "last": "Delong", "suffix": ""}, {"first": "D", "middle": ["M"], "last": "Karl", "suffix": ""}], "year": 2001, "venue": "Nature", "volume": "409", "issn": "", "pages": "507--510", "other_ids": {}}, "BIBREF32": {"ref_id": "b32", "title": "Isolation of an autotrophic ammonia-oxidizing marine archaeon", "authors": [{"first": "M", "middle": [], "last": "K\u00f6nneke", "suffix": ""}], "year": 2005, "venue": "Nature", "volume": "437", "issn": "", "pages": "543--546", "other_ids": {}}, "BIBREF33": {"ref_id": "b33", "title": "Comparative metagenomic analysis of a microbial community residing at a depth of 4,000 meters at station ALOHA in the North Pacific Subtropical Gyre", "authors": [{"first": "K", "middle": ["T"], "last": "Konstantinidis", "suffix": ""}, {"first": "J", "middle": [], "last": "Braff", "suffix": ""}, {"first": "D", "middle": ["M"], "last": "Karl", "suffix": ""}, {"first": "E", "middle": ["F"], "last": "Delong", "suffix": ""}], "year": 2009, "venue": "Appl. Environ. 
Microbiol", "volume": "75", "issn": "", "pages": "5345--5355", "other_ids": {}}, "BIBREF34": {"ref_id": "b34", "title": "Ammonia-oxidizing bacteria: a model for molecular microbial ecology", "authors": [{"first": "G", "middle": ["A"], "last": "Kowalchuk", "suffix": ""}, {"first": "J", "middle": ["R"], "last": "Stephen", "suffix": ""}], "year": 2001, "venue": "Annu. Rev. Microbiol", "volume": "55", "issn": "", "pages": "485--529", "other_ids": {}}, "BIBREF35": {"ref_id": "b35", "title": "Euphotic zone depth: its derivation and implication to ocean-color remote sensing", "authors": [{"first": "Z", "middle": ["P"], "last": "Lee", "suffix": ""}], "year": 2007, "venue": "J. Geophys. Res", "volume": "112", "issn": "", "pages": "", "other_ids": {}}, "BIBREF36": {"ref_id": "b36", "title": "Community structure of Archaea in the water column above gas hydrates in the Gulf of Mexico", "authors": [{"first": "B", "middle": [], "last": "Liu", "suffix": ""}], "year": 2009, "venue": "Geomicrobiol. J", "volume": "26", "issn": "", "pages": "363--369", "other_ids": {}}, "BIBREF37": {"ref_id": "b37", "title": "UniFrac-an online tool for comparing microbial community diversity in a phylogenetic context", "authors": [{"first": "C", "middle": [], "last": "Lozupone", "suffix": ""}, {"first": "M", "middle": [], "last": "Hamady", "suffix": ""}, {"first": "R", "middle": [], "last": "Knight", "suffix": ""}], "year": 2006, "venue": "BMC Bioinformatics", "volume": "7", "issn": "", "pages": "", "other_ids": {}}, "BIBREF38": {"ref_id": "b38", "title": "ARB: a software environment for sequence data", "authors": [{"first": "W", "middle": [], "last": "Ludwig", "suffix": ""}], "year": 2004, "venue": "Nucleic Acids Res", "volume": "32", "issn": "", "pages": "1363--1371", "other_ids": {}}, "BIBREF39": {"ref_id": "b39", "title": "Ammonia oxidation kinetics determine niche separation of nitrifying Archaea and Bacteria", "authors": [{"first": "W", "middle": [], "last": "Martens-Habbena", "suffix": ""}, 
{"first": "P", "middle": ["M"], "last": "Berube", "suffix": ""}, {"first": "H", "middle": [], "last": "Urakawa", "suffix": ""}, {"first": "J", "middle": ["R"], "last": "De La Torre", "suffix": ""}, {"first": "D", "middle": ["A"], "last": "Stahl", "suffix": ""}], "year": 2009, "venue": "Nature", "volume": "461", "issn": "", "pages": "976--979", "other_ids": {}}, "BIBREF40": {"ref_id": "b40", "title": "Microbial biogeography: putting microorganisms on the map", "authors": [{"first": "J", "middle": ["B H"], "last": "Martiny", "suffix": ""}], "year": 2006, "venue": "Nat. Rev. Microbiol", "volume": "4", "issn": "", "pages": "102--112", "other_ids": {}}, "BIBREF41": {"ref_id": "b41", "title": "A few cosmopolitan phylotypes dominate planktonic archaeal assemblages in widely different oceanic provinces", "authors": [{"first": "R", "middle": [], "last": "Massana", "suffix": ""}, {"first": "E", "middle": ["F"], "last": "Delong", "suffix": ""}, {"first": "C", "middle": [], "last": "Pedros-Alio", "suffix": ""}], "year": 2000, "venue": "Appl. Environ. Microbiol", "volume": "66", "issn": "", "pages": "1777--1787", "other_ids": {}}, "BIBREF42": {"ref_id": "b42", "title": "A new high pressure equation of state for seawater", "authors": [{"first": "F", "middle": ["J"], "last": "Millero", "suffix": ""}, {"first": "C", "middle": [], "last": "Chen", "suffix": ""}, {"first": "A", "middle": [], "last": "Bradshaw", "suffix": ""}, {"first": "K", "middle": [], "last": "Schleicher", "suffix": ""}], "year": 1980, "venue": "Deep Sea Res", "volume": "27", "issn": "", "pages": "255--264", "other_ids": {}}, "BIBREF43": {"ref_id": "b43", "title": "Quantitative distribution of presumptive archaeal and bacterial nitrifiers in Monterey Bay and the North Pacific Subtropical Gyre", "authors": [{"first": "T", "middle": ["J"], "last": "Mincer", "suffix": ""}], "year": 2007, "venue": "Environ. 
Microbiol", "volume": "9", "issn": "", "pages": "1162--1175", "other_ids": {}}, "BIBREF44": {"ref_id": "b44", "title": "High diversity of ammoniaoxidizing archaea in permanent and seasonal oxygen-deficient waters of the eastern South Pacific", "authors": [{"first": "V", "middle": [], "last": "Molina", "suffix": ""}, {"first": "L", "middle": [], "last": "Belmar", "suffix": ""}, {"first": "O", "middle": [], "last": "Ulloa", "suffix": ""}], "year": 2010, "venue": "Environ. Microbiol", "volume": "12", "issn": "", "pages": "2450--2465", "other_ids": {"doi": ["10.1111/j.1462-2920.2010.02218.x"]}}, "BIBREF45": {"ref_id": "b45", "title": "The influence of soil pH on the diversity, abundance and transcriptional activity of ammonia oxidizing archaea and bacteria", "authors": [{"first": "G", "middle": ["W"], "last": "Nicol", "suffix": ""}, {"first": "S", "middle": [], "last": "Leininger", "suffix": ""}, {"first": "C", "middle": [], "last": "Schleper", "suffix": ""}, {"first": "J", "middle": ["I"], "last": "Prosser", "suffix": ""}], "year": 2008, "venue": "Environ. 
Microbiol", "volume": "10", "issn": "", "pages": "2966--2978", "other_ids": {}}, "BIBREF46": {"ref_id": "b46", "title": "Evolutionary divergence and biogeography of sympatric nichedifferentiated bacterial populations", "authors": [{"first": "B", "middle": ["B"], "last": "Oakley", "suffix": ""}, {"first": "F", "middle": [], "last": "Carbonero", "suffix": ""}, {"first": "C", "middle": ["J"], "last": "Van Der Gast", "suffix": ""}, {"first": "R", "middle": ["J"], "last": "Hawkins", "suffix": ""}, {"first": "K", "middle": ["J"], "last": "Purdy", "suffix": ""}], "year": 2010, "venue": "ISME J", "volume": "4", "issn": "", "pages": "488--497", "other_ids": {}}, "BIBREF47": {"ref_id": "b47", "title": "Diversity and abundance of Crenarchaeota in terrestrial habitats studied by 16S RNA surveys and real time PCR", "authors": [{"first": "T", "middle": [], "last": "Ochsenreiter", "suffix": ""}, {"first": "D", "middle": [], "last": "Selezi", "suffix": ""}, {"first": "A", "middle": [], "last": "Quaiser", "suffix": ""}, {"first": "L", "middle": [], "last": "Bonch-Osmolovskaya", "suffix": ""}, {"first": "C", "middle": [], "last": "Schleper", "suffix": ""}], "year": 2003, "venue": "Environ. Microbiol", "volume": "5", "issn": "", "pages": "787--797", "other_ids": {}}, "BIBREF48": {"ref_id": "b48", "title": "Major contribution of autotrophy to microbial carbon cycling in the deep North Atlantic's interior", "authors": [{"first": "T", "middle": [], "last": "Reinthaler", "suffix": ""}, {"first": "H", "middle": ["M"], "last": "Van Aken", "suffix": ""}, {"first": "G", "middle": ["J"], "last": "Herndl", "suffix": ""}], "year": 2010, "venue": "Deep Sea Res. 
II", "volume": "57", "issn": "", "pages": "1572--1580", "other_ids": {}}, "BIBREF49": {"ref_id": "b49", "title": "MrBayes 3: Bayesian phylogenetic inference under mixed models", "authors": [{"first": "F", "middle": [], "last": "Ronquist", "suffix": ""}, {"first": "J", "middle": ["P"], "last": "Huelsenbeck", "suffix": ""}], "year": 2003, "venue": "Bioinformatics", "volume": "19", "issn": "", "pages": "1572--1574", "other_ids": {}}, "BIBREF50": {"ref_id": "b50", "title": "Activity, abundance and diversity of nitrifying archaea and bacteria in the central California current", "authors": [{"first": "A", "middle": ["E"], "last": "Santoro", "suffix": ""}, {"first": "K", "middle": ["L"], "last": "Casciotti", "suffix": ""}, {"first": "C", "middle": ["A"], "last": "Francis", "suffix": ""}], "year": 2010, "venue": "Environ. Microbiol", "volume": "12", "issn": "", "pages": "1989--2006", "other_ids": {}}, "BIBREF51": {"ref_id": "b51", "title": "Latitudinal distribution of prokaryotic picoplankton populations in the Atlantic Ocean", "authors": [{"first": "M", "middle": [], "last": "Schattenhofer", "suffix": ""}], "year": 2009, "venue": "Environ. Microbiol", "volume": "11", "issn": "", "pages": "2078--2093", "other_ids": {}}, "BIBREF52": {"ref_id": "b52", "title": "Introducing DOTUR, a computer program for defining operational taxonomic units and estimating species richness", "authors": [{"first": "P", "middle": ["D"], "last": "Schloss", "suffix": ""}, {"first": "J", "middle": [], "last": "Handelsman", "suffix": ""}], "year": 2005, "venue": "Appl. Environ. 
Microbiol", "volume": "71", "issn": "", "pages": "1501--1506", "other_ids": {}}, "BIBREF53": {"ref_id": "b53", "title": "Distinct gene set in two different lineages of ammoniaoxidizing archaea supports the phylum Thaumarchaeota", "authors": [{"first": "A", "middle": [], "last": "Spang", "suffix": ""}], "year": 2010, "venue": "Trends Microbiol", "volume": "18", "issn": "", "pages": "331--340", "other_ids": {}}, "BIBREF54": {"ref_id": "b54", "title": "RAxML-III: a fast program for maximum likelihood-based inference of large phylogenetic trees", "authors": [{"first": "A", "middle": [], "last": "Stamatakis", "suffix": ""}, {"first": "T", "middle": [], "last": "Ludwig", "suffix": ""}, {"first": "H", "middle": [], "last": "Meier", "suffix": ""}], "year": 2005, "venue": "Bioinformatics", "volume": "21", "issn": "", "pages": "456--463", "other_ids": {}}, "BIBREF55": {"ref_id": "b55", "title": "PAUP*, phylogenetic analysis using parsimony (*and other methods), version 4", "authors": [{"first": "D", "middle": ["L"], "last": "Swofford", "suffix": ""}], "year": 2003, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF56": {"ref_id": "b56", "title": "Novel genes for nitrite reductase and Amo", "authors": [{"first": "A", "middle": ["H"], "last": "Treusch", "suffix": ""}], "year": 2005, "venue": "", "volume": "77", "issn": "", "pages": "", "other_ids": {}}, "BIBREF57": {"ref_id": "b57", "title": "related proteins indicate a role of uncultivated mesophilic crenarchaeota in nitrogen cycling", "authors": [], "year": null, "venue": "Environ. 
Microbiol", "volume": "7", "issn": "", "pages": "1985--1995", "other_ids": {}}, "BIBREF58": {"ref_id": "b58", "title": "Seasonality and vertical structure of microbial communities in an ocean gyre", "authors": [{"first": "A", "middle": ["H"], "last": "Treusch", "suffix": ""}], "year": 2009, "venue": "ISME J", "volume": "3", "issn": "", "pages": "1148--1163", "other_ids": {}}, "BIBREF59": {"ref_id": "b59", "title": "Environmental genome shotgun sequencing of the Sargasso Sea", "authors": [{"first": "J", "middle": ["C"], "last": "Venter", "suffix": ""}], "year": 2004, "venue": "Science", "volume": "304", "issn": "", "pages": "66--74", "other_ids": {}}, "BIBREF60": {"ref_id": "b60", "title": "Nitrosopumilus maritimus genome reveals unique mechanisms for nitrification and autotrophy in globally distributed marine crenarchaea", "authors": [{"first": "C", "middle": ["B"], "last": "Walker", "suffix": ""}], "year": 2010, "venue": "Proc. Natl. Acad. Sci. U. S. A", "volume": "107", "issn": "", "pages": "8818--8823", "other_ids": {}}, "BIBREF61": {"ref_id": "b61", "title": "Archaeal nitrification in the ocean", "authors": [{"first": "C", "middle": [], "last": "Wuchter", "suffix": ""}], "year": 2006, "venue": "Proc. Natl. Acad. Sci. U. S. A", "volume": "103", "issn": "", "pages": "12317--12322", "other_ids": {}}, "BIBREF62": {"ref_id": "b62", "title": "A first insight into the occurrence and expression of functional amoA and accA genes of autotrophic and ammonia-oxidizing bathypelagic Crenarchaeota of Tyrrhenian Sea", "authors": [{"first": "M", "middle": ["M"], "last": "Yakimov", "suffix": ""}, {"first": "V", "middle": ["L"], "last": "Conoa", "suffix": ""}, {"first": "R", "middle": [], "last": "Denaroa", "suffix": ""}], "year": 2009, "venue": "Deep Sea Res. 
II", "volume": "56", "issn": "", "pages": "748--754", "other_ids": {}}, "BIBREF63": {"ref_id": "b63", "title": "Contribution of crenarchaeal autotrophic ammonia oxidizers to the dark primary production in Tyrrhenian deep waters (Central Mediterranean Sea)", "authors": [{"first": "M", "middle": ["M"], "last": "Yakimov", "suffix": ""}], "year": 2011, "venue": "ISME J", "volume": "5", "issn": "", "pages": "945--961", "other_ids": {}}, "BIBREF64": {"ref_id": "b64", "title": "Phylogenetic diversity of planktonic archaea in the estuarine region of East China Sea", "authors": [{"first": "Y", "middle": ["H"], "last": "Zeng", "suffix": ""}, {"first": "H", "middle": ["Y"], "last": "Li", "suffix": ""}, {"first": "N", "middle": ["Z"], "last": "Jiao", "suffix": ""}], "year": 2007, "venue": "Microbiol. Res", "volume": "162", "issn": "", "pages": "26--36", "other_ids": {}}, "BIBREF65": {"ref_id": "b65", "title": "Evaluation of terminalrestriction fragment length polymorphism analysis in contrasting marine environments", "authors": [{"first": "R", "middle": [], "last": "Zhang", "suffix": ""}, {"first": "P", "middle": ["Y"], "last": "V. Thiyagarajan", "suffix": ""}, {"first": "", "middle": [], "last": "Qian", "suffix": ""}], "year": 2008, "venue": "FEMS Microbiol. Ecol", "volume": "65", "issn": "", "pages": "169--178", "other_ids": {}}, "BIBREF66": {"ref_id": "b66", "title": "Dynamics of aerobic anoxygenic phototrophic bacteria in the East China Sea", "authors": [{"first": "Y", "middle": [], "last": "Zhang", "suffix": ""}, {"first": "N", "middle": [], "last": "Jiao", "suffix": ""}], "year": 2007, "venue": "FEMS Microbiol. Ecol", "volume": "61", "issn": "", "pages": "459--469", "other_ids": {}}}, "ref_entries": {"FIGREF0": {"text": "FIG. 2. Depth profiles of abundances of archaeal and MGI 16S rRNA genes and crenarchaeal amoA and accA genes measured by using qPCR at stations 608 and 712 in the ECS. 
Bars denote 1 standard error of triplicate qPCR determinations and are not visible when they are less than the width of the data point.", "latex": null, "type": "figure"}, "FIGREF1": {"text": ": a deep epipelagic subclade (150 m and 200 m) and a mesopelagic subclade (300 m and 400 m), except for the depth of 712 to 200 m. Mantel tests indicated that the crenarchaeal communities in both stations were", "latex": null, "type": "figure"}, "FIGREF2": {"text": "FIG. 4. Clustering of the T-RFLP profiles of bacterial 16S rRNA genes (a) and clustering of the DGGE profiles of crenarchaeal 16S rRNA genes (b) in the ECS. Clustering analyses were performed based on the Sorensen algorithm, and the scale bar indicates the Sorensen distance.", "latex": null, "type": "figure"}, "FIGREF3": {"text": "FIG. 5. Depth distribution of phylogenetic community structures of crenarchaeal amoA and accA genes recovered from the ECS. The relative abundance of each phylotype named in Fig. S2 in the supplemental material was calculated and is represented in a column diagram.", "latex": null, "type": "figure"}, "FIGREF4": {"text": "FIG. 6. Ecotype simulation of crenarchaeal amoA (a) and accA (b) gene sequences recovered from the ECS and shift in relative abundances of different predicted ecotypes for amoA (c) and accA (d) across the water column of the ECS. Sampling stations and sampling depths are coded by colored squares. Predicted ecotypes are coded by colored circles at the phylogenetic nodes or the colored squares shown in panels c and d. Two more-detailed images for panels a and b are shown in Fig. S3 in the supplemental material.", "latex": null, "type": "figure"}, "TABREF0": {"text": "FIG. 1. Map of the ECS showing the approximate bottom topography and hydrographic structure and the location of the sampling stations. 
CDW, Changjiang diluted water; TWC, Taiwan Warm Current; KBCNT, Kuroshio Branch Current north of Taiwan; KBCWK, Kuroshio Branch Current west of Kyushu; TSWC, Tsushima Strait Warm Current; YSWC, Yellow Sea Warm Current; Kuroshio, Kuroshio Current. The map was created by use of Planiglobe, with information on currents taken from references 7a and 29.", "latex": null, "type": "table"}, "TABREF1": {"text": "Properties of the water bodies of two stations located in the ECS", "latex": null, "type": "table"}, "TABREF2": {"text": "Diversity indices of crenarchaeal amoA and accA clone libraries recovered from the stations investigated in the ECS", "latex": null, "type": "table"}}, "back_matter": [{"text": "We thank the captain and crew of the RV Dongfanghong #2, L. K. Hao and H. Y. Cai for assistance during sampling, J. W. Tian for providing the temperature and salinity data, and Lawrence David and Albert Wang for assistance in using the Adapt ML software. We thank John Hodgkiss for his help in polishing the English. ", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "ACKNOWLEDGMENTS"}]}
\ No newline at end of file
diff --git a/s2orc-doc2json/tests/s2orc/20210101/f2235cd28c171f9f3b6a8bcebe246159c464980c.json b/s2orc-doc2json/tests/s2orc/20210101/f2235cd28c171f9f3b6a8bcebe246159c464980c.json
new file mode 100644
index 0000000000000000000000000000000000000000..199de434b30959e18547842e22b5c398400221f6
--- /dev/null
+++ b/s2orc-doc2json/tests/s2orc/20210101/f2235cd28c171f9f3b6a8bcebe246159c464980c.json
@@ -0,0 +1 @@
+{"paper_id": "f2235cd28c171f9f3b6a8bcebe246159c464980c", "metadata": {"title": "M P RA Munich Personal RePEc Archive Territorial Rural Development in Latin America and the Caribbean: Discourses and Realities", "authors": [{"first": "Jorge", "middle": [], "last": "Mora-Alfaro", "suffix": "", "affiliation": {"laboratory": "", "institution": "Programa Regional de Maestr\u00eda en Desarrollo Rural", "location": {}}, "email": ""}], "year": "2006"}, "abstract": [], "body_text": [{"text": "La reorientaci\u00f3n ocurrida en el desarrollo socioecon\u00f3mico y en los sistemas institucionales de Am\u00e9rica Latina a partir de la d\u00e9cada de los a\u00f1os 80, genera significativas transformaciones en el medio rural de la regi\u00f3n. La centralidad asignada en las pol\u00edticas p\u00fablicas a la promoci\u00f3n de las exportaciones y a la atracci\u00f3n de inversiones externas, contribuye a conformar un din\u00e1mico sector de productores y empresarios agr\u00edcolas, agroindustriales y comerciales vinculados a los mercados internacionales. 
El substancial crecimiento de las exportaciones y las importaciones agr\u00edcolas, ocurridas en el marco de la estrategia de apertura econ\u00f3mica implantada en estos pa\u00edses, son una clara expresi\u00f3n de los resultados alcanzados con las medidas econ\u00f3micas y los est\u00edmulos a la liberalizaci\u00f3n y expansi\u00f3n del comercio internacional.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Introducci\u00f3n"}, {"text": "Por otra parte, fen\u00f3menos tales como, A) la modificaci\u00f3n en las reglas del juego mediante las cuales se regulan las relaciones entre los productores agr\u00edcolas y el estado; B) la reducci\u00f3n, el traslado al sector privado, el desmantelamiento o el deterioro de funciones p\u00fablicas esenciales de apoyo al desarrollo agr\u00edcola (investigaci\u00f3n, extensi\u00f3n, fomento productivo, financiamiento); C) el debilitamiento de las pol\u00edticas sectoriales y el \u00e9nfasis puesto en la b\u00fasqueda de los equilibrios macroecon\u00f3micos; D) as\u00ed como los d\u00e9biles mecanismos de encadenamiento entre el sector m\u00e1s din\u00e1mico de la agricultura, la agroindustria y los agronegocios, con extendidos grupos de productores familiares o con los territorios m\u00e1s deprimidos de la regi\u00f3n, generan un paisaje rural m\u00e1s complejo, con situaciones productivas y econ\u00f3micas m\u00e1s diversas y con la presencia de agudos problemas sociales. 
La diversificaci\u00f3n de las actividades agr\u00edcolas, la acentuaci\u00f3n de los procesos de diferenciaci\u00f3n social de los territorios, la multiplicaci\u00f3n de las actividades rurales no agr\u00edcolas y de la pluriactividad familiar, la intensificaci\u00f3n de los movimientos migratorios 1 Programa Regional de Maestr\u00eda en Desarrollo Rural, Universidad Nacional (UNA), Costa Rica.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Introducci\u00f3n"}, {"text": "jorpili@ice.co.cr Apartado Postal 1441-3000 Tel\u00e9fonos (506) 2611671 / (506) 8438542 internacionales y desde los espacios rurales a las \u00e1reas urbanas o de los espacios rurales deprimidos hacia otros territorios de mayor dinamismo econ\u00f3mico y la persistente pobreza e indigencia sufrida por numerosas familias rurales, llevan a la configuraci\u00f3n de un medio rural con fuertes necesidades y demandas.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Introducci\u00f3n"}, {"text": "Un estado debilitado cuyos recursos se colocan, de forma prioritaria, en el est\u00edmulo a las actividades productivas de exportaci\u00f3n, con formas de atenci\u00f3n al medio rural institucionalmente fragmentadas, organizada por medio de acciones sectoriales y sustentadas, de manera predominante, en iniciativas asistenciales y en arraigadas relaciones de clientela, enfrenta la necesidad de buscar estrategias alternativas para revertir los desequilibrios regionales, resolver las apremiantes situaciones de exclusi\u00f3n social y buscar mecanismos eficaces para promover el desarrollo territorial, la generaci\u00f3n de empleo y el bienestar en las comunidades rurales. 
Las medidas de descentralizaci\u00f3n y fortalecimiento de los gobiernos locales y los intentos dirigidos a dise\u00f1ar diversas iniciativas de desarrollo rural, encuentran en el enfoque del desarrollo territorial rural (DTR), una opci\u00f3n que pareciera responder adecuadamente a la b\u00fasqueda de una ruta alternativa para impulsar las transformaciones requeridas por el medio rural de la regi\u00f3n. El conocimiento de las exitosas experiencias de DTR llevadas a cabo en la Uni\u00f3n Europea, el desarrollo de varias iniciativas territoriales de desarrollo rural, ejecutadas con el apoyo t\u00e9cnico y financiero otorgado por distintos programas de cooperaci\u00f3n internacional y la promoci\u00f3n efectuada por parte de los principales organismos financieros internacionales presentes en la regi\u00f3n, colocan el tema de las pol\u00edticas y las estrategias de desarrollo territorial rural en un destacado lugar en la agenda de la mayor\u00eda de los gobiernos de Am\u00e9rica Latina y el Caribe.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Introducci\u00f3n"}, {"text": "La denominada \"nueva ruralidad\" y el \"desarrollo territorial rural\" pasan a ser elementos obligados en los renovados discursos acad\u00e9micos, pol\u00edticos, t\u00e9cnicos y de los organismos de cooperaci\u00f3n internacional. 
El dise\u00f1o de pol\u00edticas, programas y estrategias, as\u00ed como la aprobaci\u00f3n de legislaciones nacionales de desarrollo rural, marcan el rumbo de los esfuerzos dirigidos a afrontar, con nuevos enfoques e instrumentos, las demandas y los desaf\u00edos surgidos en un medio rural regional con significativas transformaciones estructurales.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Introducci\u00f3n"}, {"text": "Pero, \u00bfSe han creado las condiciones requeridas para impulsar procesos end\u00f3genos de desarrollo territorial rural?\u00bfSe han impulsado las transformaciones institucionales necesarias para poner en pr\u00e1ctica, de manera efectiva, las diversas iniciativas de desarrollo territorial rural?\u00bfSe est\u00e1n llevando a cabo acciones formativas o de reclutamiento de los equipos t\u00e9cnicos que lleven a cabo estos procesos?\u00bfSe han asignado los recursos financieros que permitan darle sostenibilidad a los procesos?\u00bfSe han puesto en pr\u00e1ctica pol\u00edticas de traslado de competencias y recursos a los gobiernos locales y de est\u00edmulo a la participaci\u00f3n y la autogesti\u00f3n de las comunidades en los diversos territorios rurales?\u00bf Se est\u00e1n haciendo esfuerzos de territorializaci\u00f3n de las pol\u00edticas sectoriales y los arreglos institucionales requeridos para superar la dispersi\u00f3n institucional en los territorios?\u00bfExiste la disposici\u00f3n estatal para dise\u00f1ar estrategias de largo plazo, con un enfoque ascendente y con espacios efectivos de participaci\u00f3n ciudadana en los territorios rurales?\u00bfSe est\u00e1n creando las condiciones que permitan que las iniciativas impulsadas en el marco de la cooperaci\u00f3n internacional tengan continuidad y sostenibilidad una vez finalizado el apoyo externo?", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Introducci\u00f3n"}, {"text": "Las reformas econ\u00f3micas y pol\u00edticas aplicadas a partir de la 
d\u00e9cada de los a\u00f1os 80 en Am\u00e9rica Latina y el Caribe, con diferentes grados de profundidad en las diversas naciones de esta heterog\u00e9nea regi\u00f3n, son el marco obligado para el an\u00e1lisis de las transformaciones experimentadas por el medio rural y los territorios particulares que la conforman (Gasc\u00f3, 2004; CEPAL, 2000) 2 . Entre las decisiones m\u00e1s destacados adoptadas con estos procesos sobresalen, entre otras, A) la apertura y desregulaci\u00f3n de la econom\u00eda; B) la reducci\u00f3n y privatizaci\u00f3n de funciones esenciales desempe\u00f1adas por el estado durante el per\u00edodo precedente (industrializaci\u00f3n sustitutiva de importaciones -ISI-.); C) la reducci\u00f3n relativa de las inversiones en investigaci\u00f3n y fomento agr\u00edcola y la privatizaci\u00f3n o terecerizaci\u00f3n de los servicios de extensi\u00f3n y transferencia tecnol\u00f3gica; D) la orientaci\u00f3n prioritaria de las pol\u00edticas 2 En un estudio efectuado por CEPAL en el a\u00f1o 2000 sobre el impacto de las reformas en Am\u00e9rica Latina y el Caribe en nueve pa\u00edses seleccionados para el an\u00e1lisis, se concluye en que \"los resultados de las reformas no fueron tan positivos como predec\u00edan sus partidarios ni tan negativos como tem\u00edan sus oponentes\". Las naciones fueron divididas en reformadores radicales (Argentina, Bolivia, Chile y Per\u00fa) y reformadores cautos (Brasil, Colombia, Costa Rica, Jamaica y M\u00e9xico). Los primeros son pa\u00edses con condiciones iniciales muy dif\u00edciles, por lo que decidieron llevar a cabo reformas m\u00e1s profundas. Los segundos ten\u00edan un buen desempe\u00f1o inicial y pretend\u00edan mantener ciertas fortalezas de sus econom\u00edas y sociedades, por lo que optaron por reformas m\u00e1s graduales y selectivas. 
Seg\u00fan concluye el estudio en casi todos ellos el efecto de las reformas en la productividad, el crecimiento, el empleo y la equidad son muy limitados o resultan negativos, sobre todo en este \u00faltimo aspecto\" ( Stallings y Peres, 2000:256) .", "cite_spans": [{"start": 349, "end": 362, "text": "(Gasc\u00f3, 2004;", "latex": null, "ref_id": "BIBREF16"}, {"start": 363, "end": 375, "text": "CEPAL, 2000)", "latex": null, "ref_id": null}, {"start": 1895, "end": 1925, "text": "( Stallings y Peres, 2000:256)", "latex": null, "ref_id": null}], "ref_spans": [], "eq_spans": [], "section": "Algunas condiciones compartidas por el medio rural regional"}, {"text": "econ\u00f3micas hacia la promoci\u00f3n y diversificaci\u00f3n de las exportaciones (introducci\u00f3n de la agricultura no tradicional de exportaci\u00f3n -ANTEx-); E) la sustituci\u00f3n de las pol\u00edticas redistributivas por medidas de compensaci\u00f3n y focalizaci\u00f3n del gasto social, destinadas a responder al impacto social ocasionado por la reorientaci\u00f3n en el patr\u00f3n de crecimiento econ\u00f3mico; y, F) la adopci\u00f3n de pol\u00edticas centradas en la b\u00fasqueda del equilibrio macroecon\u00f3mico, la reducci\u00f3n del gasto p\u00fablico y la eliminaci\u00f3n o traslado al sector privado de una serie de programas de bienestar social.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Algunas condiciones compartidas por el medio rural regional"}, {"text": "Las reformas, concretadas por medio de los denominados Programas de Ajuste Estructural (PAES), se constituyen en una suerte de gran proyecto de ingenier\u00eda social promovido por diversos organismos financieros internacionales (Banco Mundial, Fondo", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Algunas condiciones compartidas por el medio rural regional"}, {"text": "Monetario Internacional, Banco Interamericano de Desarrollo) (Stiglitz, 2002) . 
La f\u00f3rmula aplicada para enfrentar la situaci\u00f3n del desarrollo econ\u00f3mico y pol\u00edtico de la regi\u00f3n e impulsar el crecimiento de la econom\u00eda es extendida por la mayor\u00eda de los pa\u00edses, con independencia de las condiciones hist\u00f3ricas, pol\u00edticas, institucionales o culturales de cada naci\u00f3n en particular. La creciente integraci\u00f3n econ\u00f3mica, pol\u00edtica y cultural de la sociedad contempor\u00e1nea -en el plano global-, y la conformaci\u00f3n de nuevas coaliciones gobernantes -en el plano regional-, favorecen el impulso de la reorientaci\u00f3n en las pol\u00edticas econ\u00f3micas y en el modelo de acumulaci\u00f3n implantado en la regi\u00f3n (CEPAL, 2002; Barbosa y Neiman, 2005) .", "cite_spans": [{"start": 61, "end": 77, "text": "(Stiglitz, 2002)", "latex": null, "ref_id": "BIBREF33"}, {"start": 686, "end": 699, "text": "(CEPAL, 2002;", "latex": null, "ref_id": null}, {"start": 700, "end": 723, "text": "Barbosa y Neiman, 2005)", "latex": null, "ref_id": "BIBREF0"}], "ref_spans": [], "eq_spans": [], "section": "Algunas condiciones compartidas por el medio rural regional"}, {"text": "Este complejo entorno de mutaci\u00f3n socioecon\u00f3mica y pol\u00edtica condiciona el surgimiento de significativas transformaciones en el medio rural de Am\u00e9rica Latina y el Caribe. Las variaciones en las estructuras productivas rurales no se hacen esperar. El dinamismo cobrado por el sector externo de las econom\u00edas regionales se manifiesta, con claridad, en el revelador incremento de las actividades destinadas al mercado exterior. Como lo muestra el gr\u00e1fico 1, entre 1990 y 2003 se constata el significativo aumento de las exportaciones agr\u00edcolas. Por su parte, la apertura econ\u00f3mica contribuye, de manera determinante, a la sustituci\u00f3n de la producci\u00f3n de alimentos y materias primas destinadas a los mercados dom\u00e9sticos por bienes importados. 
Esto genera un notable incremento de las importaciones de productos agr\u00edcolas(ver gr\u00e1fico 2). Estos factores influyen, de igual manera, en los cambios experimentados en cuanto a la composici\u00f3n de la producci\u00f3n agr\u00edcola regional (ver Gr\u00e1fico 3). Como se apunta en un reciente an\u00e1lisis de FAO sobre las tendencias y desaf\u00edos de la agricultura, los montes y la pesca en Am\u00e9rica Latina y el Caribe, \"La", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Algunas condiciones compartidas por el medio rural regional"}, {"text": "Fuente: FAO, 2005", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Gr\u00e1fico 1"}, {"text": "Fuente: FAO, 2005 composici\u00f3n de la producci\u00f3n agr\u00edcola de Am\u00e9rica Latina y el Caribe ha presentado cambios altamente significativos en las \u00faltimas dos d\u00e9cadas. En los a\u00f1os ochenta el crecimiento de la producci\u00f3n frut\u00edcola hab\u00eda llevado a este grupo a ser el m\u00e1s importante en t\u00e9rminos de valor de la producci\u00f3n, desbancando a los cereales que tradicionalmente hab\u00edan constituido el grupo m\u00e1s importante dentro de la producci\u00f3n agr\u00edcola de la regi\u00f3n. El crecimiento explosivo de la soya en los \u00faltimos a\u00f1os implic\u00f3 que las oleaginosas alcanzaran la mayor participaci\u00f3n dentro del valor de la producci\u00f3n agr\u00edcola de la regi\u00f3n, ligeramente por encima de las frutas y los cereales. 
En 2003 cada uno de estos tres grupos represent\u00f3 aproximadamente el 21% del valor de la producci\u00f3n agr\u00edcola de la regi\u00f3n\" (FAO, 2005: 152) .", "cite_spans": [{"start": 8, "end": 17, "text": "FAO, 2005", "latex": null, "ref_id": "BIBREF12"}, {"start": 801, "end": 817, "text": "(FAO, 2005: 152)", "latex": null, "ref_id": null}], "ref_spans": [], "eq_spans": [], "section": "Gr\u00e1fico 2"}, {"text": "Estas importantes modificaciones advertidas en la producci\u00f3n agr\u00edcola regional, as\u00ed como en el funcionamiento del sistema estatal, acarrean consecuencias significativas en", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Gr\u00e1fico 2"}, {"text": "Fuente, FAO, 2005 numerosos grupos de familias rurales dedicadas a la producci\u00f3n de alimentos y materias primas para los mercados locales, con poca capacidad para reinsertarse en el nuevo marco econ\u00f3mico y pol\u00edtico instaurado en la regi\u00f3n. Situaci\u00f3n enfrentada, asimismo, por otros grupos de productores dedicados a estas mismas actividades agr\u00edcolas pero con mayor capacidad para reorientar o reconvertir sus explotaciones, lo que les permite una paulatina integraci\u00f3n en los procesos econ\u00f3micos. El resultado de estas transformaciones es un medio rural m\u00e1s complejo, con una clara segmentaci\u00f3n de las cadenas productivas precedentes y la sustituci\u00f3n de \u00e9stas por nuevos encadenamientos agrocomerciales y con una mayor diferenciaci\u00f3n territorial, o al interior mismo de los espacios rurales particulares, en consonancia con las mutaciones experimentadas por las actividades agr\u00edcolas y la extensi\u00f3n de los cultivos con un mayor potencial de incorporaci\u00f3n (de Janvry y Sadoulet, 2004) expansi\u00f3n de estos movimientos o su dinamismo no ha sido lo suficiente como para evitar el crecimiento del n\u00famero de pobres e indigentes presentes en el medio rural regional (Sumpsi y Mora, 2004) . 
Empleando la clasificaci\u00f3n de de Janvry y Sadoulet, por lo general estas", "cite_spans": [{"start": 8, "end": 17, "text": "FAO, 2005", "latex": null, "ref_id": "BIBREF12"}, {"start": 1159, "end": 1180, "text": "(Sumpsi y Mora, 2004)", "latex": null, "ref_id": "BIBREF36"}], "ref_spans": [], "eq_spans": [], "section": "Gr\u00e1fico 3"}, {"text": "Fuente: FAO, 2005 actividades han logrado constituirse en una fuente dinamizadora de las econom\u00edas locales \u00fanicamente en los denominados territorios o \u00e1reas rurales favorables (ARF). En los cuadros 16 y 17 si incluye la informaci\u00f3n sobre la evoluci\u00f3n del n\u00famero de personas en condici\u00f3n de pobreza en Am\u00e9rica Latina y el Caribe, en t\u00e9rminos absolutos y porcentuales, entre los a\u00f1os 1980 y 2002. Es importante resaltar en esta informaci\u00f3n el incremento de la pobreza durante este per\u00edodo en el cual pasa de 135,9 millones de personas en 1980 a 221,4 en 2002 y en la indigencia la cual pasa de 62,4 millones de personas en 1980 a 97,4 veintid\u00f3s a\u00f1os despu\u00e9s.", "cite_spans": [{"start": 8, "end": 17, "text": "FAO, 2005", "latex": null, "ref_id": "BIBREF12"}], "ref_spans": [], "eq_spans": [], "section": "Gr\u00e1fico 4"}, {"text": "Aunque los datos muestran un incremento m\u00e1s significativo de la pobreza urbana en relaci\u00f3n con el comportamiento de la pobreza rural, diversos estudios efectuados en la regi\u00f3n concluyen en que en buena parte el crecimiento de la pobreza urbana es el resultado del propias comunidades rurales, aprovechando sus propias capacidades y las oportunidades ofrecidas por las nuevas condiciones de la econom\u00eda global.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Gr\u00e1fico 4"}, {"text": "Por otra parte, es posible distinguir un conjunto de espacios rurales estancados (EREs), cuyo potencial de desarrollo no se impulsa enteramente, por no contar con los instrumentos (pol\u00edticas, proyectos, fondos, 
sistema institucional local) con los cuales es posible inducir su diversificaci\u00f3n y reconversi\u00f3n productiva. Por lo general, se encuentran en estos espacios empresas agr\u00edcolas, agroindustriales o comerciales con formas tradicionales de funcionamiento o con poca capacidad de impulsar el desarrollo territorial. En ciertos casos establecen v\u00ednculos contractuales con productores familiares quienes abastecen parte de la demanda de estas agrupaciones. En otras situaciones, ofrecen oportunidades de empleo permanente o temporal a algunos miembros de las unidades familiares locales (hombres, mujeres, j\u00f3venes), pero sin generar las condiciones requeridas para impulsar procesos sostenibles de desarrollo econ\u00f3mico y de superaci\u00f3n de la pobreza. En estos espacios la pluriactividad de las familias rurales, mediante la incorporaci\u00f3n de los integrantes del grupo familiar en actividades generadoras de ingresos en otros territorios o fuera del pa\u00eds, adquiere mucha relevancia para lograr su subsistencia. \u00e1reas urbanas, a los espacios rurales de mayor dinamismo o al exterior, es el camino seguido por los pobladores de estos territorios, sobre todo los j\u00f3venes y jefes de familia.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Gr\u00e1fico 4"}, {"text": "El an\u00e1lisis sobre las tendencias del desarrollo agr\u00edcola y del medio rural en la regi\u00f3n muestra una frecuente desvinculaci\u00f3n entre las principales orientaciones de las pol\u00edticas p\u00fablicas y las condiciones existentes en la mayor\u00eda de los espacios rurales. Esta disgregaci\u00f3n hace que, a\u00fan en muchos casos en los cuales se ha creado legislaci\u00f3n y se cuenta con pol\u00edticas o programas de desarrollo rural, con un enfoque territorial, los resultados obtenidos con las iniciativas impulsadas desde el estado no lleguen a traducirse en procesos territoriales de desarrollo rural sostenibles 5 . 
El lugar concedido a las pol\u00edticas o programas de desarrollo rural en la mayor\u00eda de los casos las sit\u00faa en una posici\u00f3n desvinculada de las pol\u00edticas econ\u00f3micas preponderantes, reduciendo su efectividad o recibiendo una colocaci\u00f3n secundaria en relaci\u00f3n con las medidas destinadas a estimular las actividades agr\u00edcolas de exportaci\u00f3n llevadas a cabo por inversionistas locales o externos. La prevaleciente organizaci\u00f3n sectorial de las instituciones p\u00fablicas vinculadas con el medio rural y la separaci\u00f3n entre las pol\u00edticas econ\u00f3micas y las pol\u00edticas, programas o estrategias de desarrollo rural, constituyen dos obst\u00e1culos significativos para la concreci\u00f3n de las propuestas de desarrollo territorial rural, 5 Por ejemplo, no cabe duda sobre la importante posici\u00f3n asignada en las pol\u00edticas p\u00fablicas de M\u00e9xico a las pol\u00edticas agrarias y de desarrollo rural. El monto de recursos asignado a estos programas, agrupados en la denominada Alianza Para el Campo, ascendi\u00f3 aproximadamente a mil millones de d\u00f3lares anuales (10 mil millones de pesos mexicanos). Esto permite impulsar un conjunto de programas, con relativa articulaci\u00f3n, para enfrentar las dif\u00edciles condiciones del medio rural en ese pa\u00eds. Sin embargo, las distancias entre los planteamientos conceptuales y pol\u00edticos, centrados en el impulso del desarrollo territorial rural, y los resultados efectivamente alcanzados, muestran los obst\u00e1culos existentes en el momento de la operacionalizaci\u00f3n del enfoque y la ejecuci\u00f3n de las iniciativas y en cuanto a la capacidad efectiva de las familias rurales y sus organizaciones para asumir la direcci\u00f3n de sus procesos de desarrollo. 
En una evaluaci\u00f3n sobre la Alianza para el Campo, efectuada en 2003, en relaci\u00f3n con este tema se concluye lo siguiente: \"La formaci\u00f3n de capacidades en la poblaci\u00f3n fueran (Sic) productivas de administraci\u00f3n y de gesti\u00f3n, fueron pr\u00e1cticamente inexistentes o apenas perceptibles. Cuatro problemas centrales se presentaron en la operaci\u00f3n del PRODEPESCA: i) insuficiente capacitaci\u00f3n de los PSPs para cumplir el cometido; ii) mecanismos de control incompletos o inoportunos, para asegurar la puesta en pr\u00e1ctica de proyectos con pertinencia econ\u00f3mica y/ o social local, situaci\u00f3n que permiti\u00f3 la clonaci\u00f3n de proyectos, independientemente de su adecuaci\u00f3n al contexto; iii) el pago por productos se expres\u00f3 en la entrega de servicios aislados, en detrimento de una integralidad en el funcionamiento y en el seguimiento del proyecto y iv) el descuido relativo de poblaciones marginadas aisladas, quienes, a pesar de los pagos diferenciados y a favor de esos grupos previstos en las Reglas de Operaci\u00f3n 2002, fueron relegadas a un segundo plano, debido al costo de oportunidad de los PPSs \" (FAO, 2003) .", "cite_spans": [{"start": 1297, "end": 1298, "text": "5", "latex": null, "ref_id": null}, {"start": 3312, "end": 3323, "text": "(FAO, 2003)", "latex": null, "ref_id": "BIBREF13"}], "ref_spans": [], "eq_spans": [], "section": "Pol\u00edticas p\u00fablicas y participaci\u00f3n ciudadana"}, {"text": "conceptualmente bien definidas pero con relevantes limitaciones en sus alcances y en su ejecuci\u00f3n 6 .", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Pol\u00edticas p\u00fablicas y participaci\u00f3n ciudadana"}, {"text": "Diversas experiencias exitosas de desarrollo territorial rural llevadas a cabo en la regi\u00f3n han contado con el respaldo de la cooperaci\u00f3n internacional. 
Este es un factor de mucha relevancia pues crea condiciones para articular las acciones sectoriales, propiciar la participaci\u00f3n efectiva de las comunidades en la toma de decisiones y formar personal t\u00e9cnico regional en los aspectos operativos que permiten la aplicaci\u00f3n del enfoque territorial del desarrollo rural, en correspondencia con las condiciones particulares de los espacios rurales en los cuales se llevan a cabo estas iniciativas (Cherret, 1999; Miranda, 2003) . Asimismo, el desarrollo de estas experiencias permite detectar con m\u00e1s claridad los obst\u00e1culos de diversa naturaleza prevalecientes en Am\u00e9rica Latina y el Caribe para el impulso efectivo de iniciativas de desarrollo territorial rural (Sumpsi, 2005) . Pero, estas beneficiosas experiencias generan algunos problemas previsibles y superables, en la medida en la cual se logre un balance adecuado entre las exigencias de corto plazo establecidas por los organismos de cooperaci\u00f3n internacional o las agencias de financiamiento que brindan el apoyo financiero y t\u00e9cnico para la ejecuci\u00f3n de estas iniciativas y las condiciones particulares de los territorios y las familias u organizaciones con las cuales se llevan a cabo estos procesos de desarrollo rural, cuyos resultados sustanciales y sostenibles se obtienen en el mediano o el largo plazo.", "cite_spans": [{"start": 594, "end": 609, "text": "(Cherret, 1999;", "latex": null, "ref_id": "BIBREF7"}, {"start": 610, "end": 624, "text": "Miranda, 2003)", "latex": null, "ref_id": "BIBREF23"}, {"start": 861, "end": 875, "text": "(Sumpsi, 2005)", "latex": null, "ref_id": "BIBREF37"}], "ref_spans": [], "eq_spans": [], "section": "Pol\u00edticas p\u00fablicas y participaci\u00f3n ciudadana"}, {"text": "Para Roberto Mart\u00ednez, \"Estos proyectos no deben concebirse como instrumentos aislados, sino consolidando conjuntos de acciones integradas. 
La falta de integraci\u00f3n se advierte con particular frecuencia en los proyectos de desarrollo rural, los cuales suelen reflejar las preferencias de los donantes o financiadores, con una precaria articulaci\u00f3n entre ellos y relaciones d\u00e9biles con los \u00e1mbitos que tienen una responsabilidad central sobre el asunto tratado. Adem\u00e1s, los proyectos que as\u00ed se gestionaron enfrentan el riesgo de quedar aislados con respecto al resto de las acciones de pol\u00edtica y del aparato administrativo. Esta gesti\u00f3n suele tender a la autosuficiencia, evitando la creaci\u00f3n de relaciones que generen dependencias e incertidumbres. Su aislamiento y sus condiciones excepcionales de operaci\u00f3n dificultan con 6 Un buen ejemplo sobre esta situaci\u00f3n lo constituye el establecimiento de una \"Estrategia de Desarrollo Rural\" en Costa Rica (PDR, 2003) , definida como una iniciativa de alcance nacional, pero relegada como un programa en una posici\u00f3n secundaria con pocas posibilidades de lograr una incidencia efectiva en el medio rural del pa\u00eds, dedicada impulsar una serie de proyectos localizados en determinados espacios locales y con grandes dificultades para concretar su enfoque conceptual de desarrollo territorial rural (V\u00e9ase: Mora, 2003) . Para Kliskberg, \"En Am\u00e9rica Latina el discurso pol\u00edtico ha tendido a reconocer crecientemente a la participaci\u00f3n. Ser\u00eda claramente antipopular enfrentar la presi\u00f3n pro participaci\u00f3n tan fuerte en la sociedad, y con argumentos tan contundentes a su favor. Sin embargo, los avances reales en cuanto a la implementaci\u00f3n efectiva de programas con altos niveles de participaci\u00f3n comunitaria son muy reducidos. Siguen predominando los programas 'llave en mano', impuestos verticalmente, donde los decisores o dise\u00f1adores, son los que saben y la comunidad desfavorecida debe acatar sus directivas, y ser sujeto pasivo de los mismos\". 
Por otra parte, de acuerdo con lo apuntado por el autor, con frecuencia en los programas se hacen fuertes apelativos a su car\u00e1cter participativo, cuando en la realidad las comunidades tan s\u00f3lo tienen una m\u00ednima intervenci\u00f3n efectiva en la toma de decisiones. Para", "cite_spans": [{"start": 825, "end": 826, "text": "6", "latex": null, "ref_id": null}, {"start": 951, "end": 962, "text": "(PDR, 2003)", "latex": null, "ref_id": null}, {"start": 1349, "end": 1360, "text": "Mora, 2003)", "latex": null, "ref_id": "BIBREF27"}], "ref_spans": [], "eq_spans": [], "section": "Pol\u00edticas p\u00fablicas y participaci\u00f3n ciudadana"}, {"text": "Kliskberg, \"El discurso dice s\u00ed a la participaci\u00f3n en la regi\u00f3n, pero los hechos con frecuencia dicen no. Asimismo la presencia de la comunidad es uno de los pocos medios probados que previene efectivamente la corrupci\u00f3n. El control social de la misma sobre la gesti\u00f3n es una gran garant\u00eda al respecto que se pierde al impedir la participaci\u00f3n\" (Kliskberg, 2001: 28) .", "cite_spans": [{"start": 345, "end": 366, "text": "(Kliskberg, 2001: 28)", "latex": null, "ref_id": null}], "ref_spans": [], "eq_spans": [], "section": "Pol\u00edticas p\u00fablicas y participaci\u00f3n ciudadana"}, {"text": "Como bien apunta el autor, existe hoy una generalizada convalidaci\u00f3n mundial sobre la superioridad en t\u00e9rminos de efectividad de la participaci\u00f3n comunitaria, sobre las formas organizativas tradicionales de corte vertical o burocr\u00e1tico. Esto es muy visible en campos como el del desarrollo rural o en el de las pol\u00edticas sociales. 
En el desarrollo de estas iniciativas, se\u00f1ala Kliskberg, se logra hacer un mejor uso de los recursos, se alcanzan con mayor efectividad sus metas y crean sostenibilidad si las comunidades a las que se desea favorecer participan desde el inicio y a lo largo de todo su desarrollo y comparten la planificaci\u00f3n, la gesti\u00f3n, el control y la evaluaci\u00f3n. Cuando prevalece el divorcio entre el discurso y la realidad esta separaci\u00f3n es claramente percibida por las comunidades quienes reaccionan ante esta situaci\u00f3n con descontento y frustraci\u00f3n. \"Se limitan as\u00ed las posibilidades de programas donde se ofrezca participaci\u00f3n genuina porque las comunidades est\u00e1n 'quemadas' al respecto por las falsas promesas. El s\u00ed pero no, est\u00e1 basado en resistencias profundas a que en definitiva realmente las comunidades pobres participen, que se disfrazan ante su ilegitimidad conceptual, pol\u00edtica, y \u00e9tica.\" (Kliskberg, 2001: 28) .", "cite_spans": [{"start": 1222, "end": 1243, "text": "(Kliskberg, 2001: 28)", "latex": null, "ref_id": null}], "ref_spans": [], "eq_spans": [], "section": "Pol\u00edticas p\u00fablicas y participaci\u00f3n ciudadana"}, {"text": "Este es uno de los principales obst\u00e1culos enfrentados en la ejecuci\u00f3n de las iniciativas de desarrollo territorial en la regi\u00f3n. 
En la medida en que prevalezcan las formas de vinculaci\u00f3n sustentadas en las relaciones de clientela o en el asistencialismo y la participaci\u00f3n de las comunidades rurales sea m\u00e1s un discurso que una realidad, prevalecer\u00e1n los limites para el impulso de estrategias o iniciativas de desarrollo rural, salvo en aquellos casos en los cuales la propia organizaci\u00f3n y movilizaci\u00f3n de los actores sociales locales permita impulsar los procesos y generar los emprendimientos a partir de su propia iniciativa, con un enfoque efectivamente ascendente.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Pol\u00edticas p\u00fablicas y participaci\u00f3n ciudadana"}, {"text": "Una de las dificultades m\u00e1s relevantes para el impulso de las iniciativas de desarrollo territorial rural en la regi\u00f3n, no siempre incorporada en los an\u00e1lisis sobre el desarrollo rural en Am\u00e9rica Latina y el Caribe, lo constituye el desmantelamiento, la tercerizaci\u00f3n o privatizaci\u00f3n y el deterioro experimentado en los sistemas de asistencia t\u00e9cnica (extensi\u00f3n, investigaci\u00f3n agr\u00edcola y transferencia), como resultado de las medidas orientadas a la disminuci\u00f3n del tama\u00f1o del estado por la v\u00eda de la contracci\u00f3n y el traslado de algunas de sus funciones esenciales al sector privado, la disminuci\u00f3n en la cantidad del personal t\u00e9cnico contratado y en la dr\u00e1stica reducci\u00f3n del gasto p\u00fablico destinado a estas labores ( FAO, 2003) .", "cite_spans": [{"start": 718, "end": 730, "text": "( FAO, 2003)", "latex": null, "ref_id": "BIBREF13"}], "ref_spans": [], "eq_spans": [], "section": "Un obst\u00e1culo b\u00e1sico: el desmantelamiento de los servicios de asistencia t\u00e9cnica y las limitaciones de acceso a ala informaci\u00f3n y el conocimiento."}, {"text": "Como resultado de estos procesos se presenta una marcada diversidad en cuanto a las modalidades mediante las cuales se prestan 
los diferentes tipos de servicios de investigaci\u00f3n, asistencia t\u00e9cnica y extensi\u00f3n agr\u00edcola o rural (Berdegu\u00e9, 2002) . Por un lado, producto de la sensible disminuci\u00f3n de los recursos asignados a estos programas, se presenta un evidente deterioro en la mayor\u00eda de los sistemas en los cuales se mantuvo su car\u00e1cter predominantemente estatal y en los cuales se continuaron considerando estos servicios como un bien p\u00fablico. Por otro lado, en aquellos casos en los cuales se desmontan los programas p\u00fablicos y se privatiza o terceriza la prestaci\u00f3n de los servicios, surgen una serie de programas dirigidos a la creaci\u00f3n de un mercado de servicios profesionales mediante el cual proveedores privados sustituyen, en parte, las funciones previamente desempe\u00f1adas por el estado. La asignaci\u00f3n de fondos p\u00fablicos o de recursos financieros externos a grupos de productores rurales para la adquisici\u00f3n de los servicios ofrecidos por empresas profesionales emergentes, muchas veces constituidas por los t\u00e9cnicos y profesionales separados de las instituciones estatales, es el camino m\u00e1s frecuente para fomentar el mercado de servicios profesionales. 
En algunos casos, como en Nicaragua, se crea un sistema mixto en el cual se combina la \"extensi\u00f3n pagada\" (Paid Extension), con el acceso gratuito a estos servicios por parte de los grupos de productores rurales de menores ingresos (Mora, 2002) .", "cite_spans": [{"start": 227, "end": 243, "text": "(Berdegu\u00e9, 2002)", "latex": null, "ref_id": "BIBREF1"}, {"start": 1499, "end": 1511, "text": "(Mora, 2002)", "latex": null, "ref_id": "BIBREF28"}], "ref_spans": [], "eq_spans": [], "section": "Un obst\u00e1culo b\u00e1sico: el desmantelamiento de los servicios de asistencia t\u00e9cnica y las limitaciones de acceso a ala informaci\u00f3n y el conocimiento."}, {"text": "Pese a la existencia de experiencias positivas en relaci\u00f3n con las nuevas modalidades de prestaci\u00f3n de servicios de apoyo al desarrollo agr\u00edcola y rural, sobre todo en algunos de los pa\u00edses del sur de la regi\u00f3n, predomina a\u00fan una tendencia a la disminuci\u00f3n de la cobertura de la poblaci\u00f3n atendida y una concentraci\u00f3n de los servicios en los grupos de productores con capacidad financiera para la adquisici\u00f3n de \u00e9stos. 
El acompa\u00f1amiento de las iniciativas de desarrollo territorial rural por lo general enfrentan el problema de la organizaci\u00f3n sectorial de", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Un obst\u00e1culo b\u00e1sico: el desmantelamiento de los servicios de asistencia t\u00e9cnica y las limitaciones de acceso a ala informaci\u00f3n y el conocimiento."}, {"text": "Fuente: FAO, 2005 los servicios o la ausencia de programas que permitan un seguimiento permanente de los procesos (Mora, 2004) .", "cite_spans": [{"start": 8, "end": 17, "text": "FAO, 2005", "latex": null, "ref_id": "BIBREF12"}, {"start": 114, "end": 126, "text": "(Mora, 2004)", "latex": null, "ref_id": "BIBREF26"}], "ref_spans": [], "eq_spans": [], "section": "Gr\u00e1fico 5"}, {"text": "La competitividad de la agricultura y de los territorios rurales, en las condiciones de la sociedad contempor\u00e1nea, descansa en gran medida en la capacidad de generar conocimientos y de impulsar procesos de investigaci\u00f3n y desarrollo (I&D), con los cuales se posibilite la innovaci\u00f3n permanente y el incremento constante en la productividad (Boisier, 2005) . En Am\u00e9rica Latina y el Caribe, a la situaci\u00f3n enfrentada con el tema particular de la investigaci\u00f3n agr\u00edcola y con los sistemas de apoyo a la agricultura y el desarrollo rural, se suma el enorme desaf\u00edo de las limitadas inversiones efectuadas, con muy contadas excepciones, como la de Brasil, en investigaci\u00f3n agr\u00edcola y en general en investigaci\u00f3n y desarrollo (Sa\u00edn, G. y Ardila, J. 2004) 7 . Los datos incluidos en el gr\u00e1fico 6 muestran que esta situaci\u00f3n lejos de revertirse, en comparaci\u00f3n con los esfuerzos efectuados entre 1990 y 2000 por otras regiones del mundo, m\u00e1s bien tiende a ahondarse. 
Los bajos porcentajes del PIB destinados a la I&D en la regi\u00f3n, tienen consecuencias muy importantes en el fomento agr\u00edcola y en el desarrollo del medio rural en la regi\u00f3n (Berdegu\u00e9 and Escobar, 2001 ).", "cite_spans": [{"start": 340, "end": 355, "text": "(Boisier, 2005)", "latex": null, "ref_id": "BIBREF3"}, {"start": 720, "end": 748, "text": "(Sa\u00edn, G. y Ardila, J. 2004)", "latex": null, "ref_id": null}, {"start": 1131, "end": 1158, "text": "(Berdegu\u00e9 and Escobar, 2001", "latex": null, "ref_id": "BIBREF2"}], "ref_spans": [], "eq_spans": [], "section": "Gr\u00e1fico 5"}, {"text": "El establecimiento o fortalecimiento de sistemas de investigaci\u00f3n y extensi\u00f3n que permitan articular los esfuerzos dispersos efectuados por distintos actores corporativos, p\u00fablicos y privados, enfocados hacia el cumplimiento de determinadas metas de desarrollo, es un paso requerido para generar procesos de I&D y promover la innovaci\u00f3n en los espacios rurales de la regi\u00f3n. El reforzamiento de las inversiones en ciencia y tecnolog\u00eda y la aplicaci\u00f3n de los recursos en funci\u00f3n de las demandas de productores y territorios, son medidas de indiscutible importancia para favorecer las pol\u00edticas y las estrategias de desarrollo territorial rural en Am\u00e9rica Latina y el Caribe. ", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Gr\u00e1fico 5"}, {"text": "Otro obst\u00e1culo significativo al impulso de los procesos de desarrollo territorial rural en la regi\u00f3n lo constituye la extendida debilidad de los gobiernos locales y la posici\u00f3n secundaria asignada a \u00e9stos en las pol\u00edticas y estrategias dise\u00f1adas para impulsar el desarrollo en la regi\u00f3n. 
En los pa\u00edses con un mayor grado de centralizaci\u00f3n pol\u00edtica la situaci\u00f3n se torna a\u00fan m\u00e1s complicada, pues la transferencia de competencias y recursos hacia los gobiernos municipales es muy restringida y el cumplimiento de sus funciones se encuentra por lo general limitada tanto por la cantidad de recursos de que disponen, como por su debilitada capacidad de gesti\u00f3n. El frecuente aislamiento de los gobiernos locales en relaci\u00f3n con los principales actores sociales locales y con los procesos de desarrollo de sus territorios a los cuales se hayan vinculados, encuentra en los temas de la gesti\u00f3n p\u00fablica local y en los recursos de que disponen para afrontar los desaf\u00edos del desarrollo territorial, dos factores condicionantes de sus limitaciones.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Actores sociales y gobiernos locales: una alianza estrat\u00e9gica"}, {"text": "Los procesos de descentralizaci\u00f3n impulsados durante las dos \u00faltimas d\u00e9cadas en la regi\u00f3n, con diferentes grados de profundizaci\u00f3n y de traslado efectivo de competencias y recursos a los gobiernos locales, ofrecen una oportunidad para el fortalecimiento de su capacidad de gesti\u00f3n, para promover el asociacionismo municipal y para colocar a estas entidades gubernamentales en posici\u00f3n de responder a las demandas del desarrollo territorial y al fortalecimiento de los procesos de democracia local. 
La formaci\u00f3n en aspectos vinculados con la planificaci\u00f3n local participativa, el ordenamiento territorial, la gesti\u00f3n ambiental, el manejo de riesgos y la apertura de espacios de participaci\u00f3n efectiva de los habitantes de los espacios rurales en el impulso de los procesos de desarrollo econ\u00f3mico local, son disposiciones de gran relevancia para fortalecer a los gobiernos municipales y contribuir as\u00ed a crear una condici\u00f3n b\u00e1sica para el impulso de los procesos de desarrollo territorial rural en la regi\u00f3n (Finot, 2005) .", "cite_spans": [{"start": 1007, "end": 1020, "text": "(Finot, 2005)", "latex": null, "ref_id": "BIBREF14"}], "ref_spans": [], "eq_spans": [], "section": "Actores sociales y gobiernos locales: una alianza estrat\u00e9gica"}, {"text": "La posici\u00f3n de los gobiernos locales en el enfoque territorial de desarrollo rural no es antojadiza. Por las competencias y las funciones que desempe\u00f1an \u00e9stos en las diversas comunidades rurales, con sus diferencias propias de las condiciones del entorno pol\u00edtico-institucional en el cual se desenvuelven, constituye organismos estatales de mucha relevancia para la continuidad y la sostenibilidad de los procesos. Al ser un componente esencial en la conformaci\u00f3n de los territorios, la proximidad con los diversos fen\u00f3menos que ocurren en \u00e9l, le ofrece la posibilidad de responder con mayor efectividad a las demandas y necesidades del desarrollo local. Sin embargo, el adecuado cumplimiento de sus competencias y funciones requiere de una vinculaci\u00f3n estrecha con los actores sociales locales y de los acuerdos que les permitan articular proyectos de desarrollo territorial con participaci\u00f3n de organizaciones sociales, empresas, dependencias p\u00fablicas locales y otros actores relevantes de acuerdo con las condiciones particulares de cada territorio. 
Como apunta Finot, \"Hoy en d\u00eda la descentralizaci\u00f3n pol\u00edtica cobra nueva importancia, ya no solamente para avanzar en ciudadan\u00eda e inclusi\u00f3n social sino tambi\u00e9n en desarrollo econ\u00f3mico. En efecto, se ha llegado a consenso en que la competitividad depende, no solamente de las empresas, de los equilibrios macroecon\u00f3micos y de pol\u00edticas nacionales activas, sino de la capacidad de cada localidad para encarar su propio desarrollo a trav\u00e9s de procesos de concertaci\u00f3n p\u00fablico-social-privada.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Actores sociales y gobiernos locales: una alianza estrat\u00e9gica"}, {"text": "Y para esto es indispensable que las localidades cuenten con m\u00e1s autonom\u00eda.\" (Finot, 2005:30) .", "cite_spans": [{"start": 77, "end": 93, "text": "(Finot, 2005:30)", "latex": null, "ref_id": null}], "ref_spans": [], "eq_spans": [], "section": "Actores sociales y gobiernos locales: una alianza estrat\u00e9gica"}, {"text": "El fortalecimiento de la autonom\u00eda local y de la capacidad de gesti\u00f3n de los gobiernos locales, el establecimiento de alianzas efectivas de los gobiernos municipales con los actores sociales locales y la apertura de espacios de participaci\u00f3n de las comunidades en los procesos de desarrollo terriotial, son decisiones sustanciales para promover iniciativas de desarrollo territorial rural en las cuales la direcci\u00f3n de los procesos recaiga en los habitantes de los territorios y no en los equipos t\u00e9cnicos con una presencia temporal o permanente en estos espacios, situaci\u00f3n que impedir\u00eda darle continuidad a las transformaciones productivas, institucionales y sociales requeridas para promover el desarrollo y el bienestar en el medio rural regional.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Actores sociales y gobiernos locales: una alianza estrat\u00e9gica"}, {"text": "El an\u00e1lisis sobre algunas 
tendencias del desarrollo pol\u00edtico y econ\u00f3mico de Am\u00e9rica Latina y el Caribe permite percibir una serie de factores limitantes para el impulso efectivo de procesos de desarrollo territorial rural en la regi\u00f3n. Algunos de ellos se ubican m\u00e1s all\u00e1 del medio rural regional, del funcionamiento de las instituciones p\u00fablicas vinculadas, de manera directa, con los procesos de desarrollo agr\u00edcola y rural y de los aspectos operativos enfrentados en la ejecuci\u00f3n de los programas espec\u00edficos de desarrollo territorial rural en la regi\u00f3n, con una incidencia notable en las posibilidades de concreci\u00f3n de estas iniciativas.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Conclusiones"}, {"text": "Otros, por el contrario, tiene que ver con trabas surgidas en la propia orientaci\u00f3n de las iniciativas y en las situaciones particulares existentes en el medio rural de la regi\u00f3n. El conocimiento de estos obst\u00e1culos y su colocaci\u00f3n como desaf\u00edos a vencer para promover procesos sostenibles de desarrollo rural, con un efectivo enfoque ascendente y generando las condiciones requeridas para que los actores sociales locales (p\u00fablicos, sociales y privados), asuman la direcci\u00f3n de los procesos de desarrollo en sus territorios, resulta imprescindible para superar las frustraciones con frecuencia enfrentadas en m\u00faltiples acciones impulsadas en la regi\u00f3n que no logran plasmar las evoluciones esperadas o la sostenibilidad imprescindible para que \u00e9stas tengan lugar en el mediano o el largo plazo.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Conclusiones"}, {"text": "Tres aspectos adquieren mucha relevancia en el impulso de los procesos de desarrollo territorial rural, como complementos indispensables a la legislaci\u00f3n establecida por algunos pa\u00edses o a las pol\u00edticas p\u00fablicas y estrategias dise\u00f1adas con el fin de responder a las 
nuevas condiciones del medio rural en la regi\u00f3n: por una parte, la elaboraci\u00f3n de arreglos institucionales que permitan pasar de la visi\u00f3n y el funcionamiento sectorial de las organizaciones estatales vinculadas con el medio rural, a una acci\u00f3n integrada en respuesta a las necesidades y demandas particulares de los diversos territorios. Por otra parte, el restablecimiento de los servicios institucionales b\u00e1sicos de investigaci\u00f3n y extensi\u00f3n rural, en correspondencia con las demandas territoriales y desarrollo de procesos de formaci\u00f3n y capacitaci\u00f3n del personal institucional con el prop\u00f3sito de crear las condiciones requeridas para la reorientaci\u00f3n de las iniciativas ejecutadas en el medio rural. Por \u00faltimo, la aplicaci\u00f3n del enfoque ascendente en los procesos de desarrollo territorial rural, de tal forma que se superen los programas o proyectos dise\u00f1ados sin tomar en cuenta las situaciones espec\u00edficas de los diferentes territorios presentes en el medio rural regional. El establecimiento de mecanismos precisos para la participaci\u00f3n efectiva de las familias, las comunidades, las organizaciones rurales y los gobiernos locales, en las distintas fases de desarrollo de las iniciativas y la conducci\u00f3n paulatina de los procesos, por parte de los actores sociales locales, son elementos sustanciales para lograr el autodesarrollo y la sostenibilidad de los procesos de desarrollo territorial rural.", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Conclusiones"}, {"text": "Los procesos de desarrollo territorial no corresponden, por lo general, con los plazos en los cuales deben ejecutarse los proyectos o los programas institucionales. Los primeros requieren de plazo que casi siempre exceden los l\u00edmites temporales establecidas a las iniciativas impulsadas por las instituciones nacionales o los organismos de cooperaci\u00f3n internacional. 
El an\u00e1lisis sobre los obst\u00e1culos enfrentados para la ejecuci\u00f3n exitosa de las iniciativas promovidas en este campo, puede contribuir a encontrar otros caminos, de m\u00e1s largo plazo, que se requiere recorrer para alcanzar las metas de desarrollo del medio rural en", "cite_spans": [], "ref_spans": [], "eq_spans": [], "section": "Conclusiones"}], "bib_entries": {"BIBREF0": {"ref_id": "b0", "title": "Acerca de la globalizaci\u00f3n en la agricultura. Territorios, empresas y desarrollo local en Am\u00e9rica Latina", "authors": [{"first": "J", "middle": [], "last": "Barbosa", "suffix": ""}, {"first": "G", "middle": [], "last": "Neiman", "suffix": ""}], "year": 2005, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF1": {"ref_id": "b1", "title": "Las reformas de los sistemas de extensi\u00f3n en Am\u00e9rica Latina a partir de la d\u00e9cada de los 80", "authors": [{"first": "J", "middle": [], "last": "Berdegu\u00e9", "suffix": ""}], "year": 2002, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF2": {"ref_id": "b2", "title": "Agricultural knowledge and information systems and poverty reduction", "authors": [{"first": "J", "middle": [], "last": "Berdegu\u00e9", "suffix": ""}, {"first": "G", "middle": [], "last": "Escobar", "suffix": ""}], "year": 2001, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF3": {"ref_id": "b3", "title": "\u00bfHay espacio para el desarrollo local en la globalizaci\u00f3n?", "authors": [{"first": "S", "middle": [], "last": "Boisier", "suffix": ""}], "year": 2005, "venue": "Revista de la CEPAL", "volume": "", "issn": "86", "pages": "47--62", "other_ids": {}}, "BIBREF4": {"ref_id": "b4", "title": "Globalizaci\u00f3n y desarrollo. 
Brasilia: Comisi\u00f3n Econ\u00f3mica para Am\u00e9rica Latina y el Caribe -CEPAL-Publicaciones de las Naciones Unidas", "authors": [], "year": 2002, "venue": "CEPAL", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF5": {"ref_id": "b5", "title": "Comisi\u00f3n Econ\u00f3mica para Am\u00e9rica Latina y el Caribe -CEPAL-. Publicaciones de las Naciones Unidas", "authors": [], "year": 2002, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF6": {"ref_id": "b6", "title": "Objetivos de Desarrollo del Milenio: una mirada desde Am\u00e9rica Latina y el Caribe", "authors": [], "year": 2005, "venue": "Comisi\u00f3n Econ\u00f3mica para Am\u00e9rica Latina y el Caribe -CEPAL-. Publicaciones de las Naciones Unidas", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF7": {"ref_id": "b7", "title": "Informe sobre la experiencia de Lempira Sur 1994-1998", "authors": [{"first": "I", "middle": [], "last": "Cherret", "suffix": ""}], "year": 1999, "venue": "Secretar\u00eda de Agricultura y Ganader\u00eda, FAO", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF8": {"ref_id": "b8", "title": "Preparado para el IV Foro tem\u00e1tico Regional de Am\u00e9rica Latina y el Caribe \"Cosechando oportunidades: Desarrollo Rural en el Siglo 21", "authors": [{"first": "A", "middle": [], "last": "De Janvry", "suffix": ""}, {"first": "E", "middle": [], "last": "Sadoulet", "suffix": ""}], "year": 2004, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF9": {"ref_id": "b9", "title": "En Cambios en el pensamiento y la pr\u00e1ctica del desarrollo rural en Centroam\u00e9rica", "authors": [{"first": "A", "middle": [], "last": "De Janvry", "suffix": ""}, {"first": "E", "middle": [], "last": "Sadolulet", "suffix": ""}], "year": 1999, "venue": "Pobreza rural y el dise\u00f1o de estrategias efectivas de desarrollo rural", "volume": "", "issn": "", "pages": "5--26", "other_ids": {}}, "BIBREF10": {"ref_id": "b10", 
"title": "En Capital social y reducci\u00f3n de la pobreza en Am\u00e9rica Latina y el caribe: en busca de un nuevo paradigma", "authors": [{"first": "J", "middle": [], "last": "Durston", "suffix": ""}], "year": 2003, "venue": "", "volume": "", "issn": "", "pages": "147--202", "other_ids": {}}, "BIBREF11": {"ref_id": "b11", "title": "Territorio y competitividad en la agroindustria en M\u00e9xico. Condiciones y propuestas de pol\u00edtica para los clusters de lim\u00f3n mexicano en Colima y la pi\u00f1a de Veracruz", "authors": [{"first": "E", "middle": [], "last": "Dussel", "suffix": ""}], "year": 2002, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF12": {"ref_id": "b12", "title": "Tendencias y desaf\u00edos en la agricultura, los montes y la pesca en Am\u00e9rica Latina y el Caribe", "authors": [{"first": "", "middle": [], "last": "Fao", "suffix": ""}], "year": 2004, "venue": "Organizaci\u00f3n de las Naciones Unidas para la Agricultura y la Alimentaci\u00f3n", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF13": {"ref_id": "b13", "title": "Informe de evaluaci\u00f3n nacional desarrollo Rural", "authors": [{"first": "", "middle": [], "last": "Fao", "suffix": ""}], "year": 2002, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF14": {"ref_id": "b14", "title": "Descentralizaci\u00f3n, transferencias territoriales y desarrollo local", "authors": [{"first": "I", "middle": [], "last": "Finot", "suffix": ""}], "year": 2005, "venue": "Revista de la CEPAL", "volume": "", "issn": "86", "pages": "29--46", "other_ids": {}}, "BIBREF15": {"ref_id": "b15", "title": "CLAHE, Ponencia presentada en el seminario \"Desarrollo con inclusi\u00f3n y equidad: sus implicancias desde lo local", "authors": [{"first": "E", "middle": [], "last": "Gallicchio", "suffix": ""}], "year": 2004, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF16": {"ref_id": "b16", "title": "\u00bfLuces? 
Y sombras de la reforma del Estado en Am\u00e9rica Latina", "authors": [{"first": "M", "middle": [], "last": "Gasc\u00f3", "suffix": ""}], "year": 2004, "venue": "Documentos de Trabajo", "volume": "", "issn": "8", "pages": "", "other_ids": {}}, "BIBREF17": {"ref_id": "b17", "title": "Equidad y protecci\u00f3n social. Desaf\u00edos de pol\u00edticas sociales en Am\u00e9rica Latina", "authors": [{"first": "C", "middle": [], "last": "Hardy", "suffix": ""}], "year": 2004, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF18": {"ref_id": "b18", "title": "Competitividad del sector agr\u00edcola y pobreza rural: el papel del gasto p\u00fablico en Am\u00e9rica Latina", "authors": [{"first": "M", "middle": [], "last": "Kj\u00f6llerstr\u00f6m", "suffix": ""}], "year": 2004, "venue": "Santiago de Chile: CEPAL", "volume": "", "issn": "155", "pages": "", "other_ids": {}}, "BIBREF19": {"ref_id": "b19", "title": "Diez falacias sobre los problemas de Am\u00e9rica Latina", "authors": [{"first": "B", "middle": [], "last": "Kliskberg", "suffix": ""}], "year": 2001, "venue": "Centro de Documentaci\u00f3n en Pol\u00edticas Sociales. 
Documentos (27)", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF20": {"ref_id": "b20", "title": "Compiladores) (1998) Agricultura, medio ambiente y pobreza rural en Am\u00e9rica Latina", "authors": [{"first": "L", "middle": [], "last": "Leca", "suffix": ""}, {"first": "R", "middle": [], "last": "Echeverr\u00eda", "suffix": ""}], "year": null, "venue": "Instituto Internacional de Investigaciones sobre Pol\u00edticas Alimentarias -IFPRI-, Banco", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF21": {"ref_id": "b21", "title": "El impacto de la desigualdad en el desarrollo humano en Am\u00e9rica Latina", "authors": [{"first": "I", "middle": [], "last": "Mac\u00edas-Aymar", "suffix": ""}], "year": 2004, "venue": "Documentos de Trabajo", "volume": "", "issn": "7", "pages": "", "other_ids": {}}, "BIBREF22": {"ref_id": "b22", "title": "reforma institucional y gesti\u00f3n del sector p\u00fablico agropecuario", "authors": [{"first": "R", "middle": [], "last": "Mart\u00ednez", "suffix": ""}], "year": 2001, "venue": "En Desarrollo de las econom\u00edas rurales", "volume": "", "issn": "", "pages": "143--182", "other_ids": {}}, "BIBREF23": {"ref_id": "b23", "title": "Capital social e institucionalidad: la experiencia del Proyecto IICAHolanda / Laderas", "authors": [{"first": "B", "middle": [], "last": "Miranda", "suffix": ""}], "year": 2003, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF24": {"ref_id": "b24", "title": "Desarrollo socioecon\u00f3mico y pobreza en Am\u00e9rica Latina y el Caribe", "authors": [{"first": "J", "middle": [], "last": "Mora", "suffix": ""}], "year": 2005, "venue": "Programa Regional de Maestr\u00eda en Desarrollo Comunitario", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF25": {"ref_id": "b25", "title": "Pol\u00edtica agraria y desarrollo rural en Costa Rica: elementos para su definici\u00f3n en el nuevo entorno internacional", "authors": [{"first": "J", "middle": [], 
"last": "Mora", "suffix": ""}], "year": 2005, "venue": "Agronom\u00eda Costarricense", "volume": "29", "issn": "1", "pages": "101--133", "other_ids": {}}, "BIBREF26": {"ref_id": "b26", "title": "Preparado para el IV Foro tem\u00e1tico Regional de Am\u00e9rica Latina y el Caribe \"Cosechando oportunidades: Desarrollo Rural en el Siglo 21", "authors": [{"first": "J", "middle": [], "last": "Mora", "suffix": ""}], "year": 2004, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF27": {"ref_id": "b27", "title": "Informe de consultor\u00eda, Instituto Interamericano de Cooperaci\u00f3n para la Agricultura", "authors": [{"first": "J", "middle": [], "last": "Mora", "suffix": ""}], "year": 2003, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF28": {"ref_id": "b28", "title": "Desarrollo rural, cambio institucional y extensi\u00f3n rural en Centroam\u00e9rica y M\u00e9xico. Instituto Interamericano de Cooperaci\u00f3n para la Agricultura (IICA), Proyecto FONTAGRO", "authors": [{"first": "J", "middle": [], "last": "Mora", "suffix": ""}], "year": 2002, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF29": {"ref_id": "b29", "title": "Communities in Globalization. The invisible Mayan Nahual", "authors": [{"first": "J", "middle": ["P"], "last": "P\u00e9rez-Sainz", "suffix": ""}, {"first": "K", "middle": [], "last": "Andrade-Eekhoff", "suffix": ""}], "year": 2003, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF30": {"ref_id": "b30", "title": "Una alianza para mejorar la productividad. 
La ciencia y la tecnolog\u00eda y el sector rural mesoamericano", "authors": [{"first": "G", "middle": [], "last": "Sa\u00edn", "suffix": ""}, {"first": "J", "middle": [], "last": "Ardila", "suffix": ""}], "year": 2005, "venue": "Tecnolog\u00eda e Innovaci\u00f3n, COMUNICA Online", "volume": "1", "issn": "1", "pages": "8--14", "other_ids": {}}, "BIBREF31": {"ref_id": "b31", "title": "Crecimiento, empleo y equidad. El impacto de las reformas econ\u00f3micas en Am\u00e9rica Latina y el Caribe", "authors": [{"first": "B", "middle": [], "last": "Stallings", "suffix": ""}, {"first": "W", "middle": [], "last": "Peres", "suffix": ""}], "year": 2000, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF32": {"ref_id": "b32", "title": "Hacia una nueva agenda para Am\u00e9rica Latina", "authors": [{"first": "J", "middle": [], "last": "Stiglitz", "suffix": ""}], "year": 2003, "venue": "Revista de la CEPAL", "volume": "80", "issn": "8", "pages": "7--40", "other_ids": {}}, "BIBREF33": {"ref_id": "b33", "title": "Globalization and its discontents", "authors": [{"first": "J", "middle": [], "last": "Stiglitz", "suffix": ""}], "year": 2002, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF34": {"ref_id": "b34", "title": "Desarrollo Territorial Rural", "authors": [{"first": "A", "middle": [], "last": "Schejtman", "suffix": ""}, {"first": "J", "middle": [], "last": "Berdegu\u00e9r", "suffix": ""}], "year": 2003, "venue": "Fondo Internacional de Desarrollo Agr\u00edcola", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF35": {"ref_id": "b35", "title": "Estrategias de inserci\u00f3n de las \u00e1reas rurales en la econom\u00eda mundial. 
Una aproximaci\u00f3n desde Andaluc\u00eda", "authors": [{"first": "R", "middle": [], "last": "Silva", "suffix": ""}], "year": 2002, "venue": "", "volume": "", "issn": "", "pages": "103--131", "other_ids": {}}, "BIBREF36": {"ref_id": "b36", "title": "Desarrollo rural: Nuevos enfoques y perspectivas", "authors": [{"first": "J", "middle": ["M"], "last": "Sumpsi", "suffix": ""}, {"first": "J", "middle": [], "last": "Mora", "suffix": ""}, {"first": "Fodepal", "middle": [], "last": "Cuadernos", "suffix": ""}, {"first": "Fodepal", "middle": [], "last": "Proyecto", "suffix": ""}, {"first": "", "middle": [], "last": "Fao/Upm/Aeci", "suffix": ""}], "year": 2004, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF37": {"ref_id": "b37", "title": "Experiencias de desarrollo territorial rural en Am\u00e9rica Latina y el Caribe. Banco Interamericano de Desarrollo", "authors": [{"first": "J", "middle": ["M"], "last": "Sumpsi", "suffix": ""}], "year": 2005, "venue": "", "volume": "", "issn": "", "pages": "", "other_ids": {}}, "BIBREF38": {"ref_id": "b38", "title": "A Decade of Investment in Research and Development", "authors": [], "year": 2004, "venue": "UIS Bulletin on Science and Technology Statistics Issue", "volume": "1", "issn": "4", "pages": "1--4", "other_ids": {}}}, "ref_entries": {"TABREF0": {"text": "Fuente: CEPAL, sobre la base de tabulaciones especiales de las encuestas de hogares de los respectivos pa\u00edses. a/ Estimaci\u00f3n correspondiente a 18 pa\u00edses de la regi\u00f3n m\u00e1s Hait\u00ed b/ Personas con ingresos inferiores a la l\u00ednea de pobreza. Incluye a las personas que se encuentran en situaci\u00f3n de indigencia c/ Personas con ingresos inferiores a la l\u00ednea de indigencia.", "latex": null, "type": "table"}, "TABREF1": {"text": "Los costos de esta falacia son muy fuertes. Por un lado se est\u00e1n desechando enormes energ\u00edas latentes en las comunidades pobres. 
Cuando se les moviliza como sucedi\u00f3 en experiencias latinoamericanas mundialmente reconocidas como Villa El Salvador en el Per\u00fa, las escuelas Educo en El Salvador, o el presupuesto municipal participativo en Porto Alegre, los resultados son sorprendentes. La comunidad multiplica los recursos escasos, sumando a ellos incontables horas de trabajo, y es generadora de continuas iniciativas innovativas.", "latex": null, "type": "table"}, "TABREF2": {"text": "Gr\u00e1fico 6 World Expenditure on R&D in US$ PPP by region, 1990 & 2000", "latex": null, "type": "table"}}, "back_matter": []}
\ No newline at end of file
diff --git a/s2orc-doc2json/tests/test_end_to_end.py b/s2orc-doc2json/tests/test_end_to_end.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0119351af68be7e45518d3fa690e5516d31dcff
--- /dev/null
+++ b/s2orc-doc2json/tests/test_end_to_end.py
@@ -0,0 +1,117 @@
+import os
+import unittest
+import shutil
+
+from doc2json.grobid2json.process_pdf import process_pdf_file
+from doc2json.tex2json.process_tex import process_tex_file
+from doc2json.jats2json.process_jats import process_jats_file
+
+TEST_PDF_INPUT_DATA = os.path.join('tests', 'pdf')
+TEST_PDF_TEMP_DATA = os.path.join('tests', 'pdf_temp')
+TEST_PDF_OUTPUT_DATA = os.path.join('tests', 'pdf_output')
+
+TEST_LATEX_INPUT_DATA = os.path.join('tests', 'latex')
+TEST_LATEX_TEMP_DIR = os.path.join('tests', 'latex_temp')
+TEST_LATEX_EXPAND_DATA = os.path.join(TEST_LATEX_TEMP_DIR, 'latex')
+TEST_LATEX_NORM_DATA = os.path.join(TEST_LATEX_TEMP_DIR, 'norm')
+TEST_LATEX_XML_DATA = os.path.join(TEST_LATEX_TEMP_DIR, 'xml')
+TEST_LATEX_LOG_DATA = os.path.join(TEST_LATEX_TEMP_DIR, 'log')
+TEST_LATEX_OUTPUT_DATA = os.path.join('tests', 'latex_output')
+
+TEST_JATS_INPUT_DATA = os.path.join('tests', 'jats')
+TEST_JATS_OUTPUT_DATA = os.path.join('tests', 'jats_output')
+
+
+class TestE2E(unittest.TestCase):
+
+    def test_pdf_e2e(self):
+        """
+        Check end2end performance (PDF -> JSON)
+        Every *.pdf input must produce a TEI temp file and a JSON output file.
+        """
+        for fname in os.listdir(TEST_PDF_INPUT_DATA):
+            if fname.endswith('.pdf'):
+                print(fname)
+                # get paper id
+                pid = '.'.join(fname.split('.')[:-1])
+                # remove output files if previously made
+                temp_file_name = os.path.join(TEST_PDF_TEMP_DATA, f'{pid}.tei.xml')
+                output_file_name = os.path.join(TEST_PDF_OUTPUT_DATA, f'{pid}.json')
+                if os.path.exists(temp_file_name):
+                    os.remove(temp_file_name)
+                if os.path.exists(output_file_name):
+                    os.remove(output_file_name)
+                # create directories
+                self.assertTrue(os.path.exists(TEST_PDF_INPUT_DATA), TEST_PDF_INPUT_DATA)
+                os.makedirs(TEST_PDF_TEMP_DATA, exist_ok=True)
+                os.makedirs(TEST_PDF_OUTPUT_DATA, exist_ok=True)
+                # process pdf
+                process_pdf_file(
+                    os.path.join(TEST_PDF_INPUT_DATA, fname),
+                    TEST_PDF_TEMP_DATA,
+                    TEST_PDF_OUTPUT_DATA
+                )
+                # check that output is there (unittest assertions survive `python -O`)
+                self.assertTrue(os.path.exists(temp_file_name), temp_file_name)
+                self.assertTrue(os.path.exists(output_file_name), output_file_name)
+
+    def test_latex_e2e(self):
+        """
+        Check end2end performance (LaTeX -> JSON)
+        Every *.gz input must leave its expanded, normalized, XML and JSON artifacts.
+        """
+        for fname in os.listdir(TEST_LATEX_INPUT_DATA):
+            if fname.endswith('.gz'):
+                print(fname)
+                # get paper id
+                pid = list(os.path.splitext(fname))[0].split('/')[-1]
+                # remove output files if previously made
+                expand_dir = os.path.join(TEST_LATEX_EXPAND_DATA, pid)
+                norm_file_name = os.path.join(TEST_LATEX_NORM_DATA, pid, f'{pid}.tex')
+                xml_file_name = os.path.join(TEST_LATEX_XML_DATA, pid, f'{pid}.xml')
+                output_file_name = os.path.join(TEST_LATEX_OUTPUT_DATA, f'{pid}.json')
+                if os.path.exists(TEST_LATEX_TEMP_DIR):
+                    shutil.rmtree(TEST_LATEX_TEMP_DIR)
+                if os.path.exists(output_file_name):
+                    os.remove(output_file_name)
+                # create directories
+                self.assertTrue(os.path.exists(TEST_LATEX_INPUT_DATA), TEST_LATEX_INPUT_DATA)
+                os.makedirs(TEST_LATEX_TEMP_DIR, exist_ok=True)
+                os.makedirs(TEST_LATEX_OUTPUT_DATA, exist_ok=True)
+                # process LaTeX
+                process_tex_file(
+                    os.path.join(TEST_LATEX_INPUT_DATA, fname),
+                    temp_dir=TEST_LATEX_TEMP_DIR,
+                    output_dir=TEST_LATEX_OUTPUT_DATA,
+                    log_dir=TEST_LATEX_LOG_DATA,
+                    keep_flag=True
+                )
+                # check that output is there
+                self.assertTrue(os.path.exists(expand_dir), expand_dir)
+                self.assertTrue(os.path.exists(norm_file_name), norm_file_name)
+                self.assertTrue(os.path.exists(xml_file_name), xml_file_name)
+                self.assertTrue(os.path.exists(output_file_name), output_file_name)
+
+    def test_jats_e2e(self):
+        """
+        Check end2end performance (JATS -> JSON)
+        """
+        for fname in os.listdir(TEST_JATS_INPUT_DATA):
+            if fname.endswith('nxml'):
+                print(fname)
+                # get PMC id
+                pid = fname.split('/')[-1].split('.')[0]
+                # remove output files if exist
+                output_file_name = os.path.join(TEST_JATS_OUTPUT_DATA, f'{pid}.json')
+                if os.path.exists(output_file_name):
+                    os.remove(output_file_name)
+                # create directories
+                self.assertTrue(os.path.exists(TEST_JATS_INPUT_DATA), TEST_JATS_INPUT_DATA)
+                os.makedirs(TEST_JATS_OUTPUT_DATA, exist_ok=True)
+                # process JATS
+                process_jats_file(
+                    os.path.join(TEST_JATS_INPUT_DATA, fname),
+                    output_dir=TEST_JATS_OUTPUT_DATA
+                )
+                # check that output is there
+                self.assertTrue(os.path.exists(output_file_name), output_file_name)
diff --git a/s2orc-doc2json/tests/test_read_write.py b/s2orc-doc2json/tests/test_read_write.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1097b55de26243828ebd95881d0ed4b88972e14
--- /dev/null
+++ b/s2orc-doc2json/tests/test_read_write.py
@@ -0,0 +1,27 @@
+import os
+import unittest
+import json
+
+from doc2json.s2orc import load_s2orc
+
+JSON_INPUT_DATA = os.path.join('tests', 'pdf', 'N18-3011.json')
+
+
+class TestS2ORC(unittest.TestCase):
+
+    def test_read_write(self):
+        """
+        Check that loading the same S2ORC JSON twice releases the same content.
+        Only the 'header' entry is required to differ between the two releases.
+        """
+        with open(JSON_INPUT_DATA, 'r') as f:
+            data = json.load(f)
+        try1 = load_s2orc(data)
+        output1 = try1.release_json("pdf")
+        try2 = load_s2orc(data)
+        output2 = try2.release_json("pdf")
+        for key, value in output2.items():
+            if key == 'header':
+                self.assertNotEqual(value, output1[key])
+            else:
+                self.assertEqual(value, output1[key], key)
diff --git a/s2orc-doc2json/tests/test_s2orc_versions.py b/s2orc-doc2json/tests/test_s2orc_versions.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8062939062ed901a7448a265e196fc6163d453b
--- /dev/null
+++ b/s2orc-doc2json/tests/test_s2orc_versions.py
@@ -0,0 +1,88 @@
+import os
+import unittest
+import json
+
+from doc2json.s2orc import load_s2orc
+
+TEST_S2ORC_INPUT_DATA = os.path.join('tests', 's2orc')
+TEST_S2ORC_CURRENT = os.path.join(TEST_S2ORC_INPUT_DATA, '20210101')
+TEST_S2ORC_2020_DATA = os.path.join(TEST_S2ORC_INPUT_DATA, '20200705')
+TEST_S2ORC_2019_DATA = os.path.join(TEST_S2ORC_INPUT_DATA, '20190928')
+
+
+class TestS2ORC(unittest.TestCase):
+
+    def test_s2orc_current(self):
+        """
+        Check loading current s2orc files
+        (every *.json file in the 20210101 test folder).
+        """
+        for fname in os.listdir(TEST_S2ORC_CURRENT):
+            if fname.endswith('.json'):
+                print(fname)
+                # get paper id
+                pid = fname.split('.')[0]
+                # load file
+                file_path = os.path.join(TEST_S2ORC_CURRENT, fname)
+                with open(file_path, 'r') as f:
+                    data = json.load(f)
+                # load into s2orc class
+                paper = load_s2orc(data)
+                self.assertEqual(pid, paper.paper_id)
+                self.assertTrue(paper.metadata == {} or paper.metadata, fname)
+                self.assertTrue(paper.abstract == [] or paper.abstract, fname)
+                self.assertTrue(paper.body_text == [] or paper.body_text, fname)
+                self.assertTrue(paper.bib_entries == {} or paper.bib_entries, fname)
+                self.assertTrue(paper.ref_entries == {} or paper.ref_entries, fname)
+                self.assertTrue(paper.as_json(), fname)
+                self.assertTrue(paper.release_json(), fname)
+
+    def test_s2orc_2020(self):
+        """
+        Check loading old s2orc from 2020/07 release
+        (every *.json file in the 20200705 test folder).
+        """
+        for fname in os.listdir(TEST_S2ORC_2020_DATA):
+            if fname.endswith('.json'):
+                print(fname)
+                # get paper id
+                pid = fname.split('.')[0]
+                # load file
+                file_path = os.path.join(TEST_S2ORC_2020_DATA, fname)
+                with open(file_path, 'r') as f:
+                    data = json.load(f)
+                # load into s2orc class
+                paper = load_s2orc(data)
+                self.assertEqual(pid, paper.paper_id)
+                self.assertTrue(paper.metadata == {} or paper.metadata, fname)
+                self.assertTrue(paper.abstract == [] or paper.abstract, fname)
+                self.assertTrue(paper.body_text == [] or paper.body_text, fname)
+                self.assertTrue(paper.bib_entries == {} or paper.bib_entries, fname)
+                self.assertTrue(paper.ref_entries == {} or paper.ref_entries, fname)
+                self.assertTrue(paper.as_json(), fname)
+                self.assertTrue(paper.release_json(), fname)
+
+    def test_s2orc_2019(self):
+        """
+        Check loading old s2orc from 2019/09 release
+        (every *.json file in the 20190928 test folder).
+        """
+        for fname in os.listdir(TEST_S2ORC_2019_DATA):
+            if fname.endswith('.json'):
+                print(fname)
+                # get paper id
+                pid = fname.split('.')[0]
+                # load file
+                file_path = os.path.join(TEST_S2ORC_2019_DATA, fname)
+                with open(file_path, 'r') as f:
+                    data = json.load(f)
+                # load into s2orc class
+                paper = load_s2orc(data)
+                self.assertEqual(pid, paper.paper_id)
+                self.assertTrue(paper.metadata == {} or paper.metadata, fname)
+                self.assertTrue(paper.abstract == [] or paper.abstract, fname)
+                self.assertTrue(paper.body_text == [] or paper.body_text, fname)
+                self.assertTrue(paper.bib_entries == {} or paper.bib_entries, fname)
+                self.assertTrue(paper.ref_entries == {} or paper.ref_entries, fname)
+                self.assertTrue(paper.as_json(), fname)
+                self.assertTrue(paper.release_json(), fname)
\ No newline at end of file
diff --git a/services/app.py b/services/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb0072f073f90eae49db91551f40d9e6579e6d41
--- /dev/null
+++ b/services/app.py
@@ -0,0 +1,94 @@
+import streamlit as st
+from io import BytesIO
+from datetime import datetime
+import json
+import requests
+import numpy as np
+import hashlib
+import base64
+import pickle
+import threading
+import re
+from MemSum.src.summarizer import MemSum
+import argparse
+
+def convert_pdf_to_json( pdf_bytes ):
+    """Post the PDF to the local parse-and-normalize service and return the "response" field of its JSON reply."""
+    reply = requests.post( "http://localhost:8061/parse-and-normalize-pdf", files = {"pdf":pdf_bytes} )
+    reply_json = reply.json()
+    return reply_json["response"]
+
+def get_arxiv_paper_bytes_fomr_url(url):
+    """Download the PDF of the first arXiv id found in url; return None when url contains no arXiv id."""
+    # the pattern accepts ids such as 1810.04805 or hep-th/9901001 (matched on the lower-cased url)
+    id_match = re.search( r"\d{4}\.\d{4,5}|[a-z-]+\/\d{7}", url.lower() )
+    if id_match is None:
+        return None
+    pdf_url = "https://arxiv.org/pdf/%s.pdf"%( id_match.group(0) )
+    return requests.get(pdf_url).content
+
+def summarize_paper(memsum, paper_info):
+    """
+    Extract summary sentences from the paper's parsed full body (at most 5 are requested from memsum).
+    Returns them ordered by document position (a tuple when non-empty, else the extractor's empty result).
+    """
+    sentence_list = [ sen["sentence_text"]
+                      for sec in paper_info["Content"]["Fullbody_Parsed"]
+                      for para in sec["section_text"]
+                      for sen in para["paragraph_text"] ]
+    summaries, positions = memsum.extract( [ sentence_list ], p_stop_thres = 0.5,
+                                           max_extracted_sentences_per_document = 5, return_sentence_position = True )
+    extracted_summary, poses = summaries[0], positions[0]
+    if len(extracted_summary) == 0:
+        return extracted_summary
+    return list(zip(*sorted(zip( extracted_summary, poses ), key = lambda x:x[1])))[0]
+
+@st.cache_resource
+def load_models():
+    """Build the MemSum summarizer from the model and vocabulary files under /app/models."""
+    model_path = "/app/models/memsum_arxiv/model.pt"
+    vocabulary_path = "/app/models/word_embedding/vocabulary_200dim.pkl"
+    return MemSum( model_path, vocabulary_path, gpu = None, max_doc_len = 500 )
+
+def _show_summary(memsum, pdf_bytes):
+    """Parse pdf_bytes, summarize the parsed paper and render the summary as a markdown bullet list."""
+    paper_info = convert_pdf_to_json( pdf_bytes )
+    # the parsing service answers with an empty payload when it could not process the PDF
+    if not paper_info or "Content" not in paper_info:
+        st.text("PDF parsing error. Is the provided file a valid PDF?")
+        return
+    extracted_summary = summarize_paper(memsum, paper_info)
+    st.markdown( "\n".join( [ "* " + sen for sen in extracted_summary] ) )
+
+def main():
+    """Streamlit page: take a paper as an arXiv URL or an uploaded PDF and display its extractive summary."""
+    memsum = load_models()
+
+    st.title('MemSum ArXiv Summarizer')
+    st.markdown("""
+ Using MemSum to extractively summarize an arXiv paper by extracting sentences from its fullbody.
+
+ Paper: https://aclanthology.org/2022.acl-long.450/ GitHub: https://github.com/nianlonggu/MemSum
+ """)
+    # Add options for input method
+    option = st.radio("Choose your method to provide the paper:",
+                      ('Provide arXiv URL', 'Directly upload a PDF'))
+    if option == 'Provide arXiv URL':
+        url = st.text_input('Enter the arXiv URL here (e.g., https://arxiv.org/abs/1810.04805):')
+        # summarization only starts once the button is pressed
+        if st.button('Summarize from URL'):
+            pdf_bytes = get_arxiv_paper_bytes_fomr_url( url )
+            if pdf_bytes is None:
+                st.text("URL parsing error. Is the URL valid?")
+            else:
+                _show_summary(memsum, pdf_bytes)
+
+    elif option == 'Directly upload a PDF':
+        uploaded_file = st.file_uploader("Choose a PDF file", type=['pdf'])
+        # nothing to do until a file has been provided
+        if uploaded_file is not None:
+            _show_summary(memsum, BytesIO(uploaded_file.getvalue()))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/services/json_schema.json b/services/json_schema.json
new file mode 100644
index 0000000000000000000000000000000000000000..97c33bed2479a6271722a488975a1c620aab6038
--- /dev/null
+++ b/services/json_schema.json
@@ -0,0 +1 @@
+{"type": "object", "properties": {"Author": {"type": "array", "items": {"type": "object", "properties": {"FamilyName": {"type": "string"}, "GivenName": {"type": "string"}}, "required": ["FamilyName", "GivenName"]}}, "Title": {"type": "string"}, "Abstract": {"type": "string"}, "Venue": {"type": "string"}, "DOI": {"type": "string"}, "URL": {"type": "string"}, "PublicationDate": {"type": "object", "properties": {"Year": {"type": "string"}, "Month": {"type": "string"}, "Day": {"type": "string"}}}, "Content": {"type": "object", "properties": {"Abstract": {"type": "string"}, "Abstract_Parsed": {"type": "array", "items": {"type": "object", "properties": {"section_id": {"type": "string"}, "section_title": {"type": "string"}, "section_text": {"type": "array", "items": {"type": "object", "properties": {"paragraph_id": {"type": "string"}, "paragraph_text": {"type": "array", "items": {"type": "object", "properties": {"sentence_id": {"type": "string"}, "sentence_text": {"type": "string"},"cite_spans":{"type":"array","items":{"type":"object","properties":{"start":{"type":"string"}, "end":{"type":"string"},"text":{"type":"string"},"ref_id":{"type":"string"}} } } }}}}}}}}}, "Fullbody": {"type": "string"}, "Fullbody_Parsed": {"type": "array", "items": {"type": "object", "properties": {"section_id": {"type": "string"}, "section_title": {"type": "string"}, "section_text": {"type": "array", "items": {"type": "object", "properties": {"paragraph_id": {"type": "string"}, "paragraph_text": {"type": "array", "items": {"type": "object", "properties": {"sentence_id": {"type": "string"}, "sentence_text": {"type": "string"}, "cite_spans":{"type":"array","items":{"type":"object","properties":{"start":{"type":"string"}, "end":{"type":"string"},"text":{"type":"string"},"ref_id":{"type":"string"}} } } }}}}}}}}}}, "required": ["Abstract", "Abstract_Parsed", "Fullbody", "Fullbody_Parsed"]}, "Reference": {"type": "array", "items": {"type": "object", "properties": {"Title": {"type": "string"}, 
"Author": {"type": "array", "items": {"type": "object", "properties": {"FamilyName": {"type": "string"}, "GivenName": {"type": "string"}}, "required": ["FamilyName", "GivenName"]}}, "PublicationDate": {"type": "object", "properties": {"Year": {"type": "string"}, "Month": {"type": "string"}, "Day": {"type": "string"}}}, "Venue": {"type": "string"}, "ReferenceText": {"type": "string"}, "PaperID": {"type": "object", "properties": {"collection": {"type": "string"}, "id_field": {"type": "string"}, "id_type": {"type": "string"}, "id_value": {"type": "string"}}}}, "required": ["Title", "Author", "PublicationDate", "Venue", "ReferenceText"]}}, "S2CID": {"type": "string"}, "PMID": {"type": "string"}, "PMCID": {"type": "string"}, "ArxivId": {"type": "string"}, "ACLId": {"type": "string"}, "MAGId": {"type": "string"}, "Abstract_in_metadata": {"type": "boolean"}, "Last_update_unixtime": {"type": "number"}, "isDuplicated": {"type": "boolean"}}, "required": ["Author", "Title", "Abstract", "Venue", "DOI", "URL", "PublicationDate", "Content","Reference"]}
\ No newline at end of file
diff --git a/services/normalization_utils.py b/services/normalization_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d55bddc68fdb6226fcb2d7baa47d0841dd71fe3f
--- /dev/null
+++ b/services/normalization_utils.py
@@ -0,0 +1,424 @@
+import re
+import time
+import numpy as np
+from jsonschema import validate
+from nltk.tokenize import sent_tokenize
+import json
+class DocumentNormalizer:
+    def __init__(self, json_schema_path ):  # loads the JSON schema and compiles the two regexes used by the parsers
+        with open(json_schema_path, "r") as schema_file:  # closes the schema file once it is parsed
+            self.json_schema = json.load(schema_file)
+        self.cit_marker_matcher = re.compile(r"(^[^A-Za-z\d]*)([0-9]+)(?=[^A-Za-z\d]*$)")  # a lone number wrapped in non-alphanumerics, e.g. "[12]"
+        self.sentence_boundary_matcher = re.compile(r"\.\s")  # a period followed by one whitespace character
+    def normalize( self, paper, requires_validation = True ):
+        """
+        Convert a parsed paper dict into the normalized document layout.
+
+        :param paper: dict holding the parsed paper (metadata fields plus "pdf_parse")
+        :param requires_validation: when True, check the result against self.json_schema
+        :return: the normalized dict, or None when validation is requested and fails
+        """
+        ##### Author #####
+        parsed_authors = self.parse_author( paper )
+        ##### Title #####
+        parsed_title = self.parse_title( paper )
+        ##### Venue #####
+        parsed_venue = self.parse_venue( paper )
+        ##### DOI #####
+        parsed_doi = self.parse_doi(paper)
+        ##### URL #####
+        parsed_url = self.parse_url(paper)
+        ##### PublicationDate #####
+        parsed_pub_date = self.parse_pub_date(paper)
+        ##### Reference (the key -> row id mapper is needed to resolve citation spans) #####
+        parsed_reference, bib_entry_key_to_row_id_mapper = self.parse_reference(paper)
+        ##### Content #####
+        parsed_content = self.parse_content(paper, bib_entry_key_to_row_id_mapper)
+        ##### Abstract (The abstract text stored in the metadata) #####
+        abstract_text = (" ".join(self.get_sentence_list_from_parsed_sections( parsed_content["Abstract_Parsed"] ))).strip()
+
+        normalized_paper = {
+            "Author":parsed_authors,
+            "Title":parsed_title,
+            "Abstract":abstract_text,
+            "Venue":parsed_venue,
+            "DOI":parsed_doi,
+            "URL":parsed_url,
+            "PublicationDate":parsed_pub_date,
+            "Content":parsed_content,
+            "Reference":parsed_reference,
+            "Last_update_unixtime":int(time.time()),
+            "Abstract_in_metadata":abstract_text != "",
+            "isDuplicated":False
+        }
+
+        ##### Additional IDs, this is only added for S2ORC dataset #####
+        normalized_paper.update( self.parse_additional_ids(paper) )
+
+        if requires_validation:
+            try:
+                validate(instance=normalized_paper, schema=self.json_schema)
+            except Exception:
+                # a failed validation is reported as None; KeyboardInterrupt / SystemExit still propagate
+                return None
+        return normalized_paper
+
+    def get_sentence_list_from_parsed_sections(self, parsed_sections ):
+        """Flatten parsed sections into one list of strings: each section title followed by that section's sentence texts."""
+        flattened = []
+        for section in parsed_sections:
+            flattened.append( str(section.get( "section_title", "" )) )
+            flattened.extend( str(sen.get("sentence_text","")) for para in section.get("section_text",[])
+                              for sen in para.get("paragraph_text", []) )
+        return flattened
+
+
+    def parse_author(self, paper ):
+        """
+        Return the paper's authors as a list of {"GivenName", "FamilyName"} dicts of strings.
+        Missing or None name parts become ""; an unreadable author list yields [].
+        """
+        try:
+            # str() has to run before replace(): a None name part would otherwise raise
+            # and the handler below would throw away every author of the paper
+            parsed_authors = [ { "GivenName":str( author.get( "first", "" ) ).replace("None",""),
+                                 "FamilyName":str( author.get( "last", "" ) ).replace("None","") }
+                               for author in paper.get("authors", [] ) ]
+        except Exception:
+            parsed_authors = []
+        return parsed_authors
+
+    def parse_title(self, paper ):
+        """Return the title as a string with "None" removed and leading "[" / trailing "]" stripped ("" on failure)."""
+        try:
+            return str(paper.get("title", "")).replace("None","").lstrip("[").rstrip("]")
+        except Exception:  # e.g. paper is not a mapping; KeyboardInterrupt / SystemExit are not swallowed
+            return ""
+
+    def parse_venue(self, paper):
+        """Return the "venue" field as a string, falling back to the "journal" field when the venue is blank."""
+        parsed_venue = ""
+        # try the fields in order of preference and stop at the first non-blank value
+        for field in ("venue", "journal"):
+            try:
+                parsed_venue = str(paper.get(field, "")).replace("None","")
+            except Exception:  # unreadable field; KeyboardInterrupt / SystemExit are not swallowed
+                parsed_venue = ""
+            if parsed_venue.strip() != "":
+                break
+        return parsed_venue
+
+    def parse_doi(self, paper):
+        """Return the "doi" field as a string with "None" removed ("" when absent or unreadable)."""
+        try:
+            return str( paper.get("doi","") ).replace("None","")
+        except Exception:  # e.g. paper is not a mapping; KeyboardInterrupt / SystemExit are not swallowed
+            return ""
+
+    def parse_url(self, paper):
+        """Return "https://doi.org/<escaped DOI>" when the paper has a DOI, else its "s2_url" field ("" on failure)."""
+        try:
+            parsed_doi = str(paper.get("doi","")).strip()
+            # "%" is escaped first so that the escapes produced afterwards are not escaped again
+            for raw, escaped in ( ("%", "%25"), ('"', "%22"), ("#", "%23"), (" ", "%20"), ("?", "%3F"), ("None", "") ):
+                parsed_doi = parsed_doi.replace( raw, escaped )
+            return "https://doi.org/" + parsed_doi if parsed_doi.strip() != "" else str(paper.get("s2_url", ""))
+        except Exception:  # KeyboardInterrupt / SystemExit are not swallowed
+            return ""
+
+
+    def parse_pub_date( self, paper ):
+        """Return {"Year": <year as a decimal string>}; the year is "" when it is missing or not an integer."""
+        try:
+            year = str(int(paper.get("year", "")))
+        except Exception:
+            # missing or non-numeric year; KeyboardInterrupt / SystemExit are not swallowed
+            year = ""
+        return {"Year":year}
+
+ def parse_para( self, para, bib_entry_key_to_row_id_mapper ):
+ """Split para["text"] into sentence dicts and give each sentence its citation spans, keeping only spans whose ref_id is a key of bib_entry_key_to_row_id_mapper; span offsets are sentence-relative and stored as strings."""
+ paragraph_text = [{ "sentence_id":str(sen_id), "sentence_text": str(sen), "cite_spans":[] }
+ for sen_id, sen in enumerate(self.sent_tok( str(para.get("text",""))) )]
+ para_cite_spans = para.get( "cite_spans", [] )
+ for cite_span in para_cite_spans:
+ start, end = cite_span["start"], cite_span["end"]  # offsets into the paragraph text
+ for sen in paragraph_text:
+ if start < len( sen["sentence_text"] ):  # the span starts inside this sentence
+ end = min( end, len( sen["sentence_text"] ) )  # clip a span that runs past the sentence end
+ sen["cite_spans"].append(
+ {
+ "start":start,
+ "end":end,
+ "text":sen["sentence_text"][start:end],
+ "ref_id":cite_span["ref_id"]
+ }
+ )
+ break
+ else:
+ start -= len( sen["sentence_text"] )  # make the offsets relative to the text after this sentence
+ end -= len( sen["sentence_text"] )
+ cleaned_paragraph_text = []
+ for sen in paragraph_text:
+ sentence_text = sen["sentence_text"]
+ cite_spans = sen["cite_spans"]
+
+ sentence_text = sentence_text.rstrip()  # spans are re-clipped below against the shortened text
+
+ cite_spans.sort( key= lambda x:x["start"] )
+
+ cleaned_cite_spans = []
+ for sen_cite_span in cite_spans:
+ if sen_cite_span["ref_id"] not in bib_entry_key_to_row_id_mapper:
+ continue
+
+ start, end = sen_cite_span["start"], sen_cite_span["end"]
+ ## make sure there is no overlap between multiple citation markers
+ if len(cleaned_cite_spans) > 0 and start < int(cleaned_cite_spans[-1]["end"]):
+ continue
+
+ if start >= len(sentence_text):
+ continue
+ end = min( end, len(sentence_text) )
+
+ sen_cite_span["start"] = str(start)
+ sen_cite_span["end"] = str(end)
+ sen_cite_span["text"] = sentence_text[start:end]
+ sen_cite_span["ref_id"] = str(bib_entry_key_to_row_id_mapper[ sen_cite_span["ref_id"] ])  # bib key -> mapped row id, as a string
+
+ cleaned_cite_spans.append( sen_cite_span )
+
+ sentence_id = str(len(cleaned_paragraph_text))
+ cleaned_paragraph_text.append(
+ {
+ "sentence_id":sentence_id,
+ "sentence_text":sentence_text,
+ "cite_spans":cleaned_cite_spans
+ }
+ )
+
+ return cleaned_paragraph_text
+
+    def parse_para_list( self, para_list, bib_entry_key_to_row_id_mapper ):
+        """
+        Group consecutive paragraphs into sections.
+
+        A new section starts at the first paragraph and whenever a paragraph carries a
+        non-empty "section" name different from the current section's title; paragraphs
+        without a section name stay in the current section. Returns [] when the grouped
+        sections contain no text at all.
+        """
+        sections = []
+
+        for para in para_list:
+            sentences = self.parse_para( para, bib_entry_key_to_row_id_mapper )
+            title = str(para.get("section",""))
+            starts_new_section = len(sections) == 0 or ( title != "" and title != sections[-1]["section_title"] )
+            if starts_new_section:
+                # section ids and paragraph ids are running positions stored as strings
+                sections.append(
+                    {
+                        "section_id":str(len(sections)),
+                        "section_title":title,
+                        "section_text":[]
+                    }
+                )
+            paragraphs = sections[-1]["section_text"]
+            paragraphs.append(
+                {
+                    "paragraph_id":str(len(paragraphs)),
+                    "paragraph_text":sentences
+                }
+            )
+
+        # sections whose titles and sentences are all blank are reported as no sections
+        if (" ".join(self.get_sentence_list_from_parsed_sections( sections ))).strip() == "":
+            return []
+
+        return sections
+
+    def parse_content( self, paper, bib_entry_key_to_row_id_mapper ):
+        """
+        Build the "Content" entry: "Abstract_Parsed" from pdf_parse["abstract"] (falling back to
+        the plain "abstract" field wrapped as one "Abstract" section) and "Fullbody_Parsed" from
+        pdf_parse["body_text"]. A part that cannot be parsed becomes []; the raw "Abstract" and
+        "Fullbody" strings are always "".
+        """
+        ### Abstract_Parsed
+        try:
+            pdf_parsed_abstract = paper.get("pdf_parse",{}).get("abstract",[])
+            if len( pdf_parsed_abstract ) == 0:
+                abstract_text = str(paper.get("abstract",""))
+                if abstract_text != "None" and abstract_text != "":
+                    pdf_parsed_abstract = [ { "section":"Abstract", "text":abstract_text } ]
+            # explicit emptiness check instead of the former assert (asserts vanish under python -O)
+            abstract_parsed = self.parse_para_list( pdf_parsed_abstract, bib_entry_key_to_row_id_mapper ) if len(pdf_parsed_abstract) > 0 else []
+        except Exception:
+            abstract_parsed = []
+
+        ### Fullbody_Parsed
+        try:
+            fullbody_parsed = self.parse_para_list( paper.get( "pdf_parse", {} ).get("body_text", []), bib_entry_key_to_row_id_mapper )
+        except Exception:
+            fullbody_parsed = []
+        return {
+            "Abstract":"",
+            "Abstract_Parsed":abstract_parsed,
+            "Fullbody":"",
+            "Fullbody_Parsed":fullbody_parsed
+        }
+
+ def parse_reference(self, paper):  # returns (list of reference dicts, dict mapping bib entry key -> index in that list)
+ try:
+ bibref_text = {}
+ body_text = paper.get("pdf_parse",{}).get("body_text", [])
+ for para in body_text:
+ for cit in para.get("cite_spans", []):
+ if isinstance(cit, dict):
+ ref_id, ref_text = cit.get("ref_id",""), cit.get("text","")
+ if ref_id != "":
+ bibref_text[ref_id] = ref_text  # a later citation of the same ref_id overwrites an earlier one
+ # reduce every collected marker to its number followed by "." ("[12]" -> "12."); other markers become ""
+ for ref_id in bibref_text:
+ ref_text = bibref_text[ref_id]
+ matched_texts = self.cit_marker_matcher.findall(ref_text)
+ if len(matched_texts) > 0:
+ ref_text = matched_texts[0][1]+"."
+ else:
+ ref_text = ""
+ bibref_text[ref_id] = ref_text
+
+ except:
+ bibref_text = {}
+ # build the reference list from pdf_parse["bib_entries"]
+ try:
+ reference = []
+ bib_entry_key_to_row_id_mapper = {}
+
+ bib_entries = paper.get("pdf_parse",{}).get("bib_entries",{})
+ bib_entry_keys = list(bib_entries.keys())
+ try:
+ bib_entry_keys.sort( key = lambda x : int(x[6:]) )  # numeric order of the part after the first 6 characters (e.g. "BIBREF36" -> 36)
+ except:
+ pass  # keys that do not fit that shape keep their original order
+
+ for bib_entry_key in bib_entry_keys:
+ try:
+ parsed_entry = self.convert_bibentry_to_metadata( bib_entries[bib_entry_key] )
+ reference_text = self.get_citation_from_paper_metadata(parsed_entry)
+ if bibref_text.get(bib_entry_key,"").strip() != "":
+ reference_text = bibref_text[bib_entry_key] + " "+ reference_text  # prefix the citation with its numeric marker
+ parsed_entry["ReferenceText"] = reference_text
+
+ bib_entry_key_to_row_id_mapper[bib_entry_key] = len(reference)
+ reference.append(parsed_entry)
+ except:
+ continue
+ except:
+ reference = []
+ bib_entry_key_to_row_id_mapper = {}
+
+ return reference, bib_entry_key_to_row_id_mapper
+
+    def parse_additional_ids(self, paper):
+        """
+        Collect the additional identifiers of the paper as strings.
+
+        Absent fields and "None" markers become "". If any field cannot be read,
+        every identifier is returned as "".
+        """
+
+        # output key -> field name in the parsed paper
+        id_fields = {
+            "S2CID":"paper_id",
+            "PMID":"pubmed_id",
+            "PMCID":"pmc_id",
+            "ArxivId":"arxiv_id",
+            "ACLId":"acl_id",
+            "MAGId":"mag_id"
+        }
+
+        try:
+            return { key:str(paper.get(field, "")).replace("None","") for key, field in id_fields.items() }
+        except Exception:
+            # all-or-nothing fallback; KeyboardInterrupt / SystemExit are not swallowed
+            return dict.fromkeys( id_fields, "" )
+
+
+    def sent_tok(self, text, min_sen_len = 10 ):
+        """Split text after every period that is followed by whitespace, then merge the pieces with merge_sens."""
+        pieces = self.sentence_boundary_matcher.split( text )
+        # split() removed the boundaries: give ". " back to every piece except the last one
+        restored = [ piece + ". " for piece in pieces[:-1] ] + pieces[-1:]
+
+        return self.merge_sens( restored, min_sen_len = min_sen_len )
+
+    def merge_sens(self, sens, min_sen_len = 10 ):
+        """Join fragments shorter than min_sen_len whitespace-separated words onto the sentence being built; return the sentences."""
+        merged = []
+        pending = None
+        for fragment in sens:
+            if pending is None:
+                pending = fragment
+            elif len(fragment.split()) >= min_sen_len:
+                # a long fragment closes the sentence built so far and starts the next one
+                merged.append( pending )
+                pending = fragment
+            else:
+                pending += fragment
+        if pending is None:
+            return merged
+        # a last sentence that is still too short is glued onto the previous one, if any
+        if merged and len( pending.split() ) < min_sen_len:
+            merged[-1] += pending
+        else:
+            merged.append( pending )
+        return merged
+
+    def convert_bibentry_to_metadata(self, bibentry):
+        """Map a bib entry onto the reference keys "Title", "Author", "Venue" and "PublicationDate"."""
+        title = bibentry["title"]  # a bib entry without "title" raises KeyError
+        authors = [ { "GivenName":author.get("first",""), "FamilyName": author.get("last", "") }
+                    for author in bibentry.get("authors",[]) ]
+
+        return {
+            "Title":title,
+            "Author":authors,
+            "Venue":bibentry.get("venue",""),
+            "PublicationDate":{"Year":str( bibentry.get("year","") )}
+        }
+
+
+    def get_citation_from_paper_metadata(self, paper_metadata ):
+        """
+        Render reference metadata as one citation line.
+
+        The line has the form: authors. “title.” venue (year).
+        The first author is written "Family, Given", later ones "Given Family";
+        more than three authors are shortened to the first one plus "et al".
+        """
+        author = paper_metadata.get("Author",[])
+        title = paper_metadata.get("Title","")
+        venue = paper_metadata.get("Venue","")
+        year = paper_metadata.get("PublicationDate",{}).get("Year","")
+
+        author_list = [ "%s, %s"%( item.get("FamilyName",""), item.get("GivenName","") ) if pos == 0
+                        else "%s %s"%( item.get("GivenName",""), item.get("FamilyName","") )
+                        for pos, item in enumerate(author) ]
+
+        if len(author_list)>3:
+            author_info = author_list[0] + " et al"
+        elif len(author_list)>1:
+            author_info = ", ".join( author_list[:-1] ) + ", and " + author_list[-1]
+        else:
+            # no author -> "", exactly one author -> that author
+            author_info = "".join( author_list )
+        author_info += "."
+
+        title_info = "“"+title.rstrip(".")+".”"
+        year_info = "(%s)"%(year) if year.strip() != "" else ""
+
+        # split() / join() collapses the extra blanks left behind by empty parts
+        citation_text = " ".join(" ".join( [author_info, title_info, venue, year_info ] ).split()) +"."
+
+        return citation_text
diff --git a/services/pdf_parsing_service.py b/services/pdf_parsing_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..be0b9d5e6164fa6824a3f553938bfc8ee40b15af
--- /dev/null
+++ b/services/pdf_parsing_service.py
@@ -0,0 +1,169 @@
+import argparse
+import json
+import requests
+from datetime import datetime
+from flask import Flask, jsonify, abort, make_response, request, Response
+from flask_cors import CORS
+
+import uuid
+import os
+import subprocess
+import threading
+import shutil
+import hashlib
+import base64
+
+from normalization_utils import DocumentNormalizer
+
+import time
+import socket
+from urllib.parse import urlparse
+
+import re
+
+
+# Make Flask application
+app = Flask(__name__)
+CORS(app)
+
+def bytes_to_base64_string(f_bytes):  # bytes -> base64 text decoded as ASCII
+    return str(base64.b64encode(f_bytes), 'ASCII')
+
+def base64_string_to_bytes(base64_string):  # base64 text -> the decoded bytes
+ return base64.b64decode(base64_string)
+
+def get_md5( file_bytes ):
+    """Return the hexadecimal MD5 digest of file_bytes."""
+    return hashlib.md5(file_bytes).hexdigest()
+
+def adjust_cite_span( cite_span, cite_span_year_matcher ):
+    """
+    Trim punctuation, brackets and blanks from both ends of a citation marker.
+
+    :param cite_span: dict with "start", "end", "text" and "ref_id"
+    :param cite_span_year_matcher: regex whose first group is the year inside "(YYYY)"
+    :return: a new span with trimmed "text" and shifted "start" / "end" (as strings),
+             or cite_span itself when it is malformed or the trimmed offsets no
+             longer satisfy start < end
+    """
+    non_cite_text_chars = ",; []()"
+    try:
+        start = int(cite_span["start"])
+        end = int(cite_span["end"])
+        orig_text = cite_span["text"]
+        # mask "(2020)" as "Y2020Y" so that the parentheses around a year are not trimmed away
+        text = cite_span_year_matcher.sub( r"Y\1Y", orig_text )
+        begin_offset = len(text) - len(text.lstrip(non_cite_text_chars))
+        end_offset = len(text.rstrip(non_cite_text_chars)) - len(text)
+        start += begin_offset
+        end += end_offset
+        # explicit check instead of the former assert, which python -O would strip
+        if start >= end:
+            return cite_span
+        return {
+            "start":str( start ),
+            "end":str( end ),
+            "text":orig_text[ begin_offset:len(text) + end_offset ],
+            "ref_id":cite_span["ref_id"]
+        }
+    except Exception:
+        return cite_span
+
+def parse_pdf_base( pdf_bytes ):
+    """
+    Convert PDF bytes into the doc2json dictionary by running process_pdf.py.
+
+    The PDF is written into a fresh uuid-named working directory, converted by the
+    script under PDF2JSON_HOME, and the first JSON file it produced is loaded.
+    The working directory is removed afterwards, whether or not parsing succeeded.
+
+    :param pdf_bytes: raw content of the PDF file
+    :return: the parsed document as a dict, or {} when any step fails
+    """
+    root_dir = "root_dir_" + str(uuid.uuid4())
+    pdf_dir = root_dir + "/pdf/"
+    temp_dir = root_dir + "/temp_dir/"
+    output_dir = root_dir + "/output_dir/"
+    try:
+        for folder in ( pdf_dir, temp_dir, output_dir ):
+            os.makedirs( folder, exist_ok = True )
+
+        pdf_name = pdf_dir + "pdf.pdf"
+        with open( pdf_name, "wb" ) as f:
+            f.write(pdf_bytes)
+
+        # the exit status is not checked: a failed run leaves output_dir empty and the lookup below raises
+        subprocess.run( [ "python", PDF2JSON_HOME+"/doc2json/grobid2json/process_pdf.py",
+                          "-i", pdf_name, "-t", temp_dir, "-o", output_dir ] )
+        print("PDF parsing done!")
+
+        json_name = [ output_dir+fname for fname in os.listdir( output_dir )][0]
+        # close the result file before the working directory is deleted
+        with open(json_name) as json_file:
+            parsed_data = json.load(json_file)
+    except Exception:
+        parsed_data = {}
+    finally:
+        try:
+            shutil.rmtree(root_dir)
+        except OSError:  # a cleanup failure no longer discards an already parsed result
+            print("warning: removing temporary folder failed!")
+    return parsed_data
+
+def convert_pdf_to_json( fbytes, count, conversion_results ):
+    """Parse fbytes and store the outcome in conversion_results[count] ({} when parsing raises)."""
+    try:
+        parsed_data = parse_pdf_base( fbytes )
+    except Exception:  # the slot is still filled; KeyboardInterrupt / SystemExit are not swallowed
+        parsed_data = {}
+    conversion_results[count] = parsed_data
+
+
+
+@app.route('/parse-pdf', methods=['POST'])
+def parse_pdf():
+    """Parse the PDF uploaded in the "pdf" form field; respond with {"response": <parsed dict, {} on failure>} and status 201."""
+    try:
+        pdf_bytes = request.files.get('pdf').read()
+        parsed_data = parse_pdf_base( pdf_bytes )
+    except Exception:  # e.g. the request carries no "pdf" file
+        parsed_data = {}
+    return {"response":parsed_data}, 201
+
+
+@app.route('/parse-and-normalize-pdf', methods=['POST'])
+def parse_and_normalize_pdf():
+    """Parse and normalize the PDF uploaded in the "pdf" form field, tidy its citation markers; respond with {"response": <dict, {} on failure>} and status 201."""
+    try:
+        pdf_bytes = request.files.get('pdf').read()
+        parsed_data = doc_normalizer.normalize( parse_pdf_base( pdf_bytes ) )
+        # normalize() returns None when schema validation fails: answer with an empty payload
+        if parsed_data is None:
+            return {"response":{}}, 201
+        # Clean the citation marker text
+        for sec in parsed_data["Content"]["Abstract_Parsed"] + parsed_data["Content"]["Fullbody_Parsed"]:
+            for para in sec["section_text"]:
+                for sen in para["paragraph_text"]:
+                    sen["cite_spans"] = [ adjust_cite_span( cite_span, cite_span_year_matcher ) for cite_span in sen["cite_spans"] ]
+    except Exception:
+        parsed_data = {}
+
+    return {"response":parsed_data}, 201
+
+
+PDF2JSON_HOME = os.getenv("PDF2JSON_HOME")  # None when the variable is unset
+doc_normalizer = DocumentNormalizer( "./json_schema.json" )  # path is relative to the working directory
+cite_span_year_matcher = re.compile( r"\((\d{4})\)" )  # raw string: "\(" and "\d" are invalid escapes in a plain literal
+
+if __name__ == '__main__':
+    # Direct-run entry point; start_service.sh serves the app with gunicorn instead.
+    parser = argparse.ArgumentParser()
+    parser.add_argument( "-flask_port", type = int, default = 8060 )
+
+    args = parser.parse_args()
+
+    print("\n\nWaiting for requests...")
+
+    # NOTE(review): debug = True enables the interactive debugger while the server listens
+    # on all interfaces (0.0.0.0); do not run this entry point on an exposed host.
+    app.run(host='0.0.0.0', port=args.flask_port, threaded = True, debug = True)
\ No newline at end of file
diff --git a/services/start_service.sh b/services/start_service.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c716af5e5ef2fbc1340fac6763b6f63676509177
--- /dev/null
+++ b/services/start_service.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Start GROBID, then the PDF parsing service, then the Streamlit front end.
+screen -dmS grobid bash -c 'cd $HOME/grobid-0.6.1 && ./gradlew run'  # GROBID runs in a detached screen session named "grobid"
+
+source activate my_env
+
+gunicorn -k gthread -w 4 --threads 16 --backlog 2048 pdf_parsing_service:app -b 0.0.0.0:8061 &  # 8061 is the port app.py posts to
+
+## wait for the pdf parsing service to be ready
+sleep 5  # fixed delay; the service is not polled for readiness
+
+streamlit run app.py --server.maxUploadSize 200 --server.port 7860