# Each `paper` is a dictionary containing:
#       (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal (8) embeddings
#
# Generate references:
#   `References` class:
#       1. Read a given .bib file to collect papers; use the `search_paper_abstract` method to fill in missing abstracts.
#       2. Given some keywords, use the Semantic Scholar API to find papers.
#       3. Generate bibtex from the selected papers. --> to_bibtex()
#       4. Generate prompts from the selected papers. --> to_prompts()
#          A sample prompt: {"paper_id": "paper summary"}

# todo: (1) citations & citedby of provided papers:
#           load the pre-defined papers; use S2 to find all related works
#           add all citations to `bib_papers`
#           add all citedby to `bib_papers`
#           use Semantic Scholar to find their embeddings
#       (2) separate references:
#           divide references into different groups to reduce the token count
#           for generating different paragraphs of related works, use different sets of references

import requests
import re
import bibtexparser
from scholarly import scholarly
from scholarly import ProxyGenerator


######################################################################################################################
# Some basic tools
######################################################################################################################
def remove_newlines(serie):
    # This function is applied to the abstract of each paper to reduce the length of prompts.
    serie = serie.replace('\n', ' ')
    serie = serie.replace('\\n', ' ')
    serie = serie.replace('  ', ' ')
    serie = serie.replace('  ', ' ')
    return serie


def search_paper_abstract(title):
    pg = ProxyGenerator()
    success = pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155")  # todo: change this to env. var. for protection.
    if success:
        scholarly.use_proxy(pg)
        # input the title of a paper, return its abstract
        search_query = scholarly.search_pubs(title)
        found_paper = next(search_query)
    else:
        raise RuntimeError("ScraperAPI fails.")
    return remove_newlines(found_paper['bib']['abstract'])


def load_papers_from_bibtex(bib_file_path):
    with open(bib_file_path) as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    if len(bib_database.entries) == 0:
        return []
    else:
        bib_papers = []
        for bibitem in bib_database.entries:
            # Add each paper to `bib_papers`
            paper_id = bibitem.get("ID")
            title = bibitem.get("title")
            if title is None:
                continue
            journal = bibitem.get("journal")
            year = bibitem.get("year")
            author = bibitem.get("author")
            abstract = bibitem.get("abstract")
            if abstract is None:
                abstract = search_paper_abstract(title)
            result = {
                "paper_id": paper_id,
                "title": title,
                "link": "",
                "abstract": abstract,
                "authors": author,
                "year": year,
                "journal": journal
            }
            bib_papers.append(result)
        return bib_papers


######################################################################################################################
# Semantic Scholar (SS) API
######################################################################################################################
def ss_search(keywords, limit=20, fields=None):
    # spaces in the query are replaced with '+'
    if fields is None:
        fields = ["title", "abstract", "venue", "year", "authors", "tldr", "embedding", "externalIds"]
    keywords = keywords.lower()
    keywords = keywords.replace(" ", "+")
    url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={keywords}&limit={limit}&fields={",".join(fields)}'
    # headers = {"Accept": "*/*", "x-api-key": constants.S2_KEY}
    headers = {"Accept": "*/*"}

    response = requests.get(url, headers=headers, timeout=30)
    return response.json()
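

# A hypothetical usage sketch (not called anywhere in this module): `ss_search` returns the raw
# JSON response, whose "data" list is what `_collect_papers_ss` below parses into paper
# dictionaries. The keyword and printed fields here are only examples; which fields are present
# depends on the API response.
def _demo_ss_search(keyword="deep reinforcement learning"):
    response_json = ss_search(keyword, limit=2)
    for item in response_json.get("data", []):
        print(item.get("title"), item.get("year"), item.get("venue"))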
def _collect_papers_ss(keyword, counts=3, tldr=False):
    def externalIds2link(externalIds):
        # Sample externalIds:
        #   "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
        if externalIds:
            # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
            # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Medline > PubMedCentral)
            # DBLP
            dblp_id = externalIds.get('DBLP')
            if dblp_id is not None:
                dblp_link = f"dblp.org/rec/{dblp_id}"
                return dblp_link
            # arXiv
            arxiv_id = externalIds.get('ArXiv')
            if arxiv_id is not None:
                arxiv_link = f"arxiv.org/abs/{arxiv_id}"
                return arxiv_link
            return ""
        else:
            # if this is an empty dictionary, return an empty string
            return ""

    def extract_paper_id(last_name, year_str, title):
        pattern = r'^\w+'
        words = re.findall(pattern, title)
        # return last_name + year_str + title.split(' ', 1)[0]
        try:
            output = last_name + year_str + words[0]
        except IndexError:
            output = last_name + year_str + title[:4]
        return output

    def extract_author_info(raw_authors):
        authors = [author['name'] for author in raw_authors]
        authors_str = " and ".join(authors)
        try:
            last_name = authors[0].split()[-1]
        except IndexError:
            last_name = "ma"
        # pattern = r'^\w+'
        # last_name = re.findall(pattern, authors[0])
        return authors_str, last_name

    def parse_search_results(search_results_ss):
        if len(search_results_ss) == 0:
            return []

        # turn the search results into a list of paper dictionaries.
        papers_ss = []
        for raw_paper in search_results_ss:
            if raw_paper["abstract"] is None:
                continue

            authors_str, last_name = extract_author_info(raw_paper['authors'])
            year_str = str(raw_paper['year'])
            title = raw_paper['title']
            # some journals may contain &; escape it. e.g. journal={IEEE Power & Energy Society General Meeting}
            journal = raw_paper['venue'].replace("&", "\\&")
            if not journal:
                journal = "arXiv preprint"
            paper_id = extract_paper_id(last_name, year_str, title).lower()
            link = externalIds2link(raw_paper['externalIds'])
            if tldr and raw_paper['tldr'] is not None:
                abstract = raw_paper['tldr']['text']
            else:
                abstract = remove_newlines(raw_paper['abstract'])
            embeddings = raw_paper['embedding']['vector']
            result = {
                "paper_id": paper_id,
                "title": title,
                "abstract": abstract,
                "link": link,
                "authors": authors_str,
                "year": year_str,
                "journal": journal,
                "embeddings": embeddings
            }
            papers_ss.append(result)
        return papers_ss

    raw_results = ss_search(keyword, limit=counts)
    if raw_results is not None:
        search_results = raw_results.get("data")
        if search_results is None:
            search_results = []
    else:
        search_results = []
    results = parse_search_results(search_results)
    return results
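

# A minimal sketch (not wired into anything) of ranking collected papers by embedding similarity,
# assuming each paper dict carries the SPECTER-style "embeddings" vector filled in by
# `_collect_papers_ss` above. `_rank_papers_by_similarity` and `query_vector` are hypothetical
# names; `References.find_relevant` below could be built on something like this.
def _rank_papers_by_similarity(query_vector, papers):
    import math

    def cosine(u, v):
        dot = sum(a * b for a, b in zip(u, v))
        norm_u = math.sqrt(sum(a * a for a in u))
        norm_v = math.sqrt(sum(b * b for b in v))
        if norm_u == 0.0 or norm_v == 0.0:
            return 0.0
        return dot / (norm_u * norm_v)

    # score every paper that actually has an embedding, then sort by score (highest first)
    scored = [(cosine(query_vector, paper["embeddings"]), paper)
              for paper in papers if paper.get("embeddings")]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [paper for _, paper in scored]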
######################################################################################################################
# References Class
######################################################################################################################
class References:
    def __init__(self):
        # if load_papers:
        #     # todo: (1) a too large bibtex may have issues with token limitations; may truncate to 5 or 10
        #     #       (2) google scholar didn't give a full abstract for some papers ...
        #     #       (3) may use langchain to support long input
        #     self.papers = load_papers_from_bibtex(load_papers)
        # else:
        self.papers = {}

    def load_papers(self, bibtex, keyword):
        self.papers[keyword] = load_papers_from_bibtex(bibtex)

    def generate_keywords_dict(self):
        keywords_dict = {}
        for k in self.papers:
            keywords_dict[k] = len(self.papers[k])
        return keywords_dict

    def collect_papers(self, keywords_dict, tldr=False):
        """
        keywords_dict:
            {"machine learning": 5, "language model": 2};
            the key is the keyword, the value is how many references are needed.
        """
        for key, counts in keywords_dict.items():
            self.papers[key] = _collect_papers_ss(key, counts, tldr)
        # Remove duplicated references
        # todo: remove duplicated references in the tex_processing procedure.

    def find_relevant(self, max_refs=30):
        # todo: use embeddings to evaluate
        pass

    def to_bibtex(self, path_to_bibtex="ref.bib"):
        """
        Turn the saved paper list into a bibtex file "ref.bib". Return a list of all `paper_id`.
        """
        papers = self._get_papers(keyword="_all")

        # clear the bibtex file
        with open(path_to_bibtex, "w", encoding="utf-8") as file:
            file.write("")

        bibtex_entries = []
        paper_ids = []
        for paper in papers:
            bibtex_entry = f"""@article{{{paper["paper_id"]},
  title = {{{paper["title"]}}},
  author = {{{paper["authors"]}}},
  journal = {{{paper["journal"]}}},
  year = {{{paper["year"]}}},
  url = {{{paper["link"]}}}
}}"""
            bibtex_entries.append(bibtex_entry)
            paper_ids.append(paper["paper_id"])
            # Save the generated BibTeX entries to the file
            with open(path_to_bibtex, "a", encoding="utf-8") as file:
                file.write(bibtex_entry)
                file.write("\n\n")
        return paper_ids

    def _get_papers(self, keyword="_all"):
        if keyword == "_all":
            papers = []
            for k, v in self.papers.items():
                papers = papers + v
        else:
            papers = self.papers[keyword]
        return papers

    def to_prompts(self, keyword="_all"):
        # `prompts`:
        #   {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"}
        # this will be used to instruct the GPT model to cite the correct bibtex entry.
        papers = self._get_papers(keyword)
        prompts = {}
        for paper in papers:
            prompts[paper["paper_id"]] = paper["abstract"]
        return prompts

    def to_json(self, keyword="_all"):
        papers = self._get_papers(keyword)
        papers_json = {}
        for paper in papers:
            papers_json[paper["paper_id"]] = paper
        return papers_json


if __name__ == "__main__":
    # r = ss_search("Deep Q-Networks")['data']
    # print(r)
    # papers_json = {}
    # # for i in range(len(r)):
    # #     r[i]
    # #
    # # with open("Output.txt", "w") as text_file:
    # #     text_file.write("Purchase Amount: %s" % TotalAmount)
    # embeddings = r[0]['embedding']['vector']
    # print(embeddings)

    refs = References()
    keywords_dict = {
        "Deep Q-Networks": 5,
        "Actor-Critic Algorithms": 4,
        "Exploration-Exploitation Trade-off": 3
    }
    refs.collect_papers(keywords_dict, tldr=True)
    for k in refs.papers:
        papers = refs.papers[k]
        print("keyword: ", k)
        for paper in papers:
            print(paper["paper_id"])
    refs.to_json()
    refs.to_bibtex()
    refs.to_prompts()
    # print(refs.papers)

    # todo: test load_papers
    #       write tests covering `references.py`. / fix this as a stable version
    # for p in refs.papers:
    #     print(p["paper_id"])
    # print(len(refs.papers))
    #
    # papers_json = refs.to_json()
    # print(papers_json)
    # with open("papers.json", "w", encoding='utf-8') as text_file:
    #     text_file.write(f"{papers_json}")

    # bib = "D:\\Projects\\auto-draft\\latex_templates\\pre_refs.bib"
    # papers = load_papers_from_bibtex(bib)
    # for paper in papers:
    #     print(paper)
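
    # A minimal sketch (same hypothetical "papers.json" file name as above) of persisting
    # `refs.to_json()` as real JSON, rather than writing the dict's repr as the commented-out
    # code above does:
    # import json
    # with open("papers.json", "w", encoding="utf-8") as text_file:
    #     json.dump(refs.to_json(), text_file, ensure_ascii=False, indent=2)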