Spaces:

auto-academic
/

auto-draft

Running

auto-draft / utils /references.py

shaocongma

Re-format references. Remove ArXiv API Search.

70e35a5 over 1 year ago

12.6 kB

	# Each `paper` is a dictionary containing:
	# (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal (8) embeddings
	#
	# Generate references:
	# `References` class:
	# 1. Read a given .bib file to collect papers; use `search_paper_abstract` method to fill missing abstract.
	# 2. Given some keywords; use Semantic Scholar API to find papers.
	# 3. Generate bibtex from the selected papers. --> to_bibtex()
	# 4. Generate prompts from the selected papers: --> to_prompts()
	# A sample prompt: {"paper_id": "paper summary"}

	# todo: (1) citations & citedby of provided papers:
	# load the pre-defined papers; use S2 to find all related works
	# add all citations to `bib_papers`
	# add all citedby to `bib_papers`
	# use Semantic Scholar to find their embeddings
	# (2) separate references:
	# divide references into different groups to reduce the tokens count
	# for generating different paragraph of related works, use different set of references

	import requests
	import re
	import bibtexparser
	from scholarly import scholarly
	from scholarly import ProxyGenerator


	######################################################################################################################
	# Some basic tools
	######################################################################################################################
	def remove_newlines(serie):
	    """Flatten a text onto one line and squeeze repeated spaces.

	    Applied to the abstract of each paper to reduce the length of prompts.

	    Args:
	        serie: the text to clean.

	    Returns:
	        The text with real newlines and literal backslash-n sequences
	        replaced by a space, and every run of spaces collapsed to one space.
	    """
	    serie = serie.replace('\n', ' ')
	    serie = serie.replace('\\n', ' ')
	    # The previous replace(' ', ' ') calls swapped a space for a space and so
	    # did nothing; collapse every run of spaces left behind by the removals.
	    return re.sub(r' {2,}', ' ', serie)


	def search_paper_abstract(title):
	    """Search for a paper by title via `scholarly` and return its abstract.

	    The request is routed through a ScraperAPI proxy. The API key is read from
	    the `SCRAPER_API_KEY` environment variable; when that is unset or empty the
	    legacy built-in key is used so existing deployments keep working.

	    Args:
	        title: title of the paper to look up.

	    Returns:
	        The abstract of the first search hit, cleaned by `remove_newlines`.

	    Raises:
	        RuntimeError: if the ScraperAPI proxy cannot be set up, or if the
	            search returns no result for `title`.
	    """
	    import os

	    # NOTE(review): the fallback literal is a credential committed to the
	    # repository; it should be rotated and removed once the env var is set.
	    api_key = os.environ.get("SCRAPER_API_KEY") or "921b16f94d701308b9d9b4456ddde155"

	    pg = ProxyGenerator()
	    if not pg.ScraperAPI(api_key):
	        raise RuntimeError("ScraperAPI fails.")
	    scholarly.use_proxy(pg)

	    # Take the first hit only; a default avoids leaking a bare StopIteration
	    # to the caller when the search yields nothing.
	    found_paper = next(scholarly.search_pubs(title), None)
	    if found_paper is None:
	        raise RuntimeError(f"No search result found for paper title: {title!r}")
	    return remove_newlines(found_paper['bib']['abstract'])


	def load_papers_from_bibtex(bib_file_path):
	    """Read a .bib file and turn its entries into paper dictionaries.

	    Entries without a `title` field are skipped. When an entry has no
	    `abstract` field, the abstract is fetched with `search_paper_abstract`.

	    Args:
	        bib_file_path: path of the BibTeX file to read.

	    Returns:
	        A list of dicts with the keys `paper_id`, `title`, `link` (always an
	        empty string), `abstract`, `authors`, `year` and `journal`. The list
	        is empty when the file contains no entries.
	    """
	    # Decode as UTF-8 explicitly (the same encoding `to_bibtex` writes with)
	    # instead of relying on the platform's default encoding.
	    with open(bib_file_path, encoding="utf-8") as bibtex_file:
	        bib_database = bibtexparser.load(bibtex_file)

	    bib_papers = []
	    for bibitem in bib_database.entries:
	        title = bibitem.get("title")
	        if title is None:
	            # Without a title the entry can be neither cited nor looked up.
	            continue
	        abstract = bibitem.get("abstract")
	        if abstract is None:
	            abstract = search_paper_abstract(title)
	        bib_papers.append({
	            "paper_id": bibitem.get("ID"),
	            "title": title,
	            "link": "",
	            "abstract": abstract,
	            "authors": bibitem.get("author"),
	            "year": bibitem.get("year"),
	            "journal": bibitem.get("journal"),
	        })
	    return bib_papers

	######################################################################################################################
	# Semantic Scholar (SS) API
	######################################################################################################################
	def ss_search(keywords, limit=20, fields=None):
	    """Query the Semantic Scholar paper-search endpoint.

	    Args:
	        keywords: free-text search query; it is lower-cased before sending.
	        limit: maximum number of papers requested.
	        fields: list of paper fields to request; defaults to title, abstract,
	            venue, year, authors, tldr, embedding and externalIds.

	    Returns:
	        The decoded JSON body of the response.
	    """
	    if fields is None:
	        fields = ["title", "abstract", "venue", "year", "authors", "tldr", "embedding", "externalIds"]

	    # Let `requests` build the query string: spaces still become "+", and
	    # characters such as "&", "#" or "+" inside the keywords are now escaped
	    # instead of corrupting the URL.
	    params = {
	        "query": keywords.lower(),
	        "limit": limit,
	        "fields": ",".join(fields),
	    }
	    # headers = {"Accept": "*/*", "x-api-key": constants.S2_KEY}
	    # "*/*" is the valid accept-anything media range; a bare "/" is not.
	    headers = {"Accept": "*/*"}

	    response = requests.get(
	        "https://api.semanticscholar.org/graph/v1/paper/search",
	        params=params,
	        headers=headers,
	        timeout=30,
	    )
	    return response.json()


	def _collect_papers_ss(keyword, counts=3, tldr=False):
	    """Search Semantic Scholar for `keyword` and return the hits as paper dicts.

	    Args:
	        keyword: search query passed to `ss_search`.
	        counts: number of papers requested from the search.
	        tldr: when True, use the paper's TLDR text (if present) in place of
	            the full abstract.

	    Returns:
	        A list of dicts with the keys `paper_id`, `title`, `abstract`, `link`,
	        `authors`, `year`, `journal` and `embeddings`. Search hits without an
	        abstract are skipped. The list is empty when the search returns no data.
	    """
	    def externalIds2link(externalIds):
	        # Sample externalIds:
	        # "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
	        if not externalIds:
	            # None or an empty dictionary: there is nothing to link to.
	            return ""
	        # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
	        # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Medline > PubMedCentral)
	        dblp_id = externalIds.get('DBLP')
	        if dblp_id is not None:
	            return f"dblp.org/rec/{dblp_id}"
	        arxiv_id = externalIds.get('ArXiv')
	        if arxiv_id is not None:
	            return f"arxiv.org/abs/{arxiv_id}"
	        return ""

	    def extract_paper_id(last_name, year_str, title):
	        # Citation key = last name + year + first word of the title; fall back
	        # to the first four characters when the title starts with a non-word.
	        words = re.findall(r'^\w+', title)
	        try:
	            return last_name + year_str + words[0]
	        except IndexError:
	            return last_name + year_str + title[:4]

	    def extract_author_info(raw_authors):
	        authors = [author['name'] for author in (raw_authors or [])]
	        authors_str = " and ".join(authors)
	        try:
	            last_name = authors[0].split()[-1]
	        except IndexError:
	            # No author, or an empty author name: use a placeholder.
	            last_name = "ma"
	        return authors_str, last_name

	    def parse_search_results(search_results_ss):
	        # Turn the raw search hits into a list of paper dictionaries.
	        papers_ss = []
	        for raw_paper in search_results_ss:
	            if raw_paper["abstract"] is None:
	                continue

	            authors_str, last_name = extract_author_info(raw_paper['authors'])
	            # A missing year becomes an empty string rather than the text "None".
	            year = raw_paper.get('year')
	            year_str = "" if year is None else str(year)
	            title = raw_paper['title']

	            # some journal may contain &; replace it. e.g. journal={IEEE Power & Energy Society General Meeting}
	            # `venue` may be None; treat that like an empty venue.
	            journal = (raw_paper.get('venue') or "").replace("&", "\\&")
	            if not journal:
	                journal = "arXiv preprint"

	            paper_id = extract_paper_id(last_name, year_str, title).lower()
	            link = externalIds2link(raw_paper.get('externalIds'))

	            tldr_info = raw_paper.get('tldr')
	            if tldr and tldr_info is not None:
	                abstract = tldr_info['text']
	            else:
	                abstract = remove_newlines(raw_paper['abstract'])

	            # `embedding` may be None; keep the paper with an empty vector.
	            embedding = raw_paper.get('embedding')
	            embeddings = embedding['vector'] if embedding else []

	            papers_ss.append({
	                "paper_id": paper_id,
	                "title": title,
	                "abstract": abstract,
	                "link": link,
	                "authors": authors_str,
	                "year": year_str,
	                "journal": journal,
	                "embeddings": embeddings,
	            })
	        return papers_ss

	    raw_results = ss_search(keyword, limit=counts)
	    # A response without a "data" key (e.g. an error payload) yields no papers.
	    search_results = raw_results.get("data") if raw_results is not None else None
	    return parse_search_results(search_results or [])

	######################################################################################################################
	# References Class
	######################################################################################################################

	class References:
	    """Papers grouped by the keyword they were collected for.

	    `self.papers` maps each keyword to a list of paper dictionaries. The
	    methods below read the keys `paper_id`, `title`, `authors`, `journal`,
	    `year`, `link` and `abstract` from those dictionaries.
	    """

	    def __init__(self):
	        # todo: (1) too large bibtex may have issues on token limitations; may truncate to 5 or 10
	        # (2) google scholar didn't give a full abstract for some papers ...
	        # (3) may use langchain to support long input
	        self.papers = {}

	    def load_papers(self, bibtex, keyword):
	        """Load the papers of the .bib file `bibtex` and store them under `keyword`."""
	        self.papers[keyword] = load_papers_from_bibtex(bibtex)

	    def generate_keywords_dict(self):
	        """Return a dict mapping each stored keyword to its number of papers."""
	        return {keyword: len(papers) for keyword, papers in self.papers.items()}

	    def collect_papers(self, keywords_dict, tldr=False):
	        """
	        Search papers for every keyword and store them under that keyword.

	        keywords_dict:
	        {"machine learning": 5, "language model": 2};
	        the first is the keyword, the second is how many references are needed.
	        """
	        for key, counts in keywords_dict.items():
	            self.papers[key] = _collect_papers_ss(key, counts, tldr)

	        # Remove duplicated references # todo: remove duplicated references in tex_processing procedure.

	    def find_relevant(self, max_refs=30):
	        # todo: use embeddings to evaluate
	        pass

	    def to_bibtex(self, path_to_bibtex="ref.bib"):
	        """
	        Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.

	        The file at `path_to_bibtex` is overwritten; every entry is followed by
	        one blank line.
	        """
	        papers = self._get_papers(keyword="_all")

	        bibtex_entries = []
	        paper_ids = []
	        for paper in papers:
	            bibtex_entry = f"""@article{{{paper["paper_id"]},
	title = {{{paper["title"]}}},
	author = {{{paper["authors"]}}},
	journal={{{paper["journal"]}}},
	year = {{{paper["year"]}}},
	url = {{{paper["link"]}}}
	}}"""
	            bibtex_entries.append(bibtex_entry)
	            paper_ids.append(paper["paper_id"])

	        # Write everything with a single open() instead of re-opening the
	        # file in append mode once per entry.
	        with open(path_to_bibtex, "w", encoding="utf-8") as file:
	            file.write("".join(entry + "\n\n" for entry in bibtex_entries))
	        return paper_ids

	    def _get_papers(self, keyword="_all"):
	        """Return the papers stored under `keyword`, or all papers for "_all".

	        Raises:
	            KeyError: if `keyword` is not "_all" and was never stored.
	        """
	        if keyword == "_all":
	            return [paper for group in self.papers.values() for paper in group]
	        # Index with the argument itself; this used to look up the literal
	        # string "keyword" and so failed for every real keyword.
	        return self.papers[keyword]

	    def to_prompts(self, keyword="_all"):
	        """Return a dict mapping each paper's `paper_id` to its abstract."""
	        # `prompts`:
	        # {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"}
	        # this will be used to instruct GPT model to cite the correct bibtex entry.
	        return {paper["paper_id"]: paper["abstract"] for paper in self._get_papers(keyword)}

	    def to_json(self, keyword="_all"):
	        """Return a dict mapping each paper's `paper_id` to its paper dictionary."""
	        return {paper["paper_id"]: paper for paper in self._get_papers(keyword)}



	if __name__ == "__main__":
	    # Manual smoke test: collect papers for a few keywords, print their ids,
	    # then exercise the three export methods.

	    # r = ss_search("Deep Q-Networks")['data']
	    # print(r)
	    # papers_json = {}
	    # # for i in range(len(r)):
	    # # r[i]
	    # #
	    # # with open("Output.txt", "w") as text_file:
	    # # text_file.write("Purchase Amount: %s" % TotalAmount)
	    # embeddings = r[0]['embedding']['vector']
	    # print(embeddings)

	    refs = References()
	    keywords_dict = {
	        "Deep Q-Networks": 5,
	        "Actor-Critic Algorithms": 4,
	        "Exploration-Exploitation Trade-off": 3
	    }
	    # `collect_papers` takes only (keywords_dict, tldr); the stale
	    # `method="ss"` keyword made this call raise TypeError.
	    refs.collect_papers(keywords_dict, tldr=True)
	    for k, papers in refs.papers.items():
	        print("keyword: ", k)
	        for paper in papers:
	            print(paper["paper_id"])

	    refs.to_json()
	    refs.to_bibtex()
	    refs.to_prompts()
	# print(refs.papers)

	# todo: test load_papers
	# write test covering `references.py`. / fix this as a stable version

	# for p in refs.papers:
	# print(p["paper_id"])
	# print(len(refs.papers))
	#
	# papers_json = refs.to_json()
	# # print(papers_json)
	# with open("papers.json", "w", encoding='utf-8') as text_file:
	# text_file.write(f"{papers_json}")


	# bib = "D:\\Projects\\auto-draft\\latex_templates\\pre_refs.bib"
	# papers = load_papers_from_bibtex(bib)
	# for paper in papers:
	# print(paper)