shaocongma committed on
Commit
d18c569
1 Parent(s): 7d01fc4
app.py CHANGED
@@ -6,7 +6,6 @@ from utils.file_operations import hash_name
 from references_generator import generate_top_k_references
 
 # todo:
-# generation.log sometimes disappears
 # 6. get logs when the procedure is not completed. *
 # 7. Own file library; more prompts
 # 8. Decide on how to generate the main part of a paper * (Langchain/AutoGPT
@@ -15,10 +14,8 @@ from references_generator import generate_top_k_references
 # 3. Check API Key GPT-4 Support.
 # 8. Re-build some components using `langchain`
 #    - in `gpt_interation`, use LLM
-# 5. From the provided .bib file, find the cited and citing papers and compute embeddings; sort the whole paper list by cosine distance; keep the top max_refs papers
 # future:
-# 4. add auto_polishing function
-# 12. Change link to more appealing color # after the website is built;
+# generation.log sometimes disappears (ignore this)
 # 1. Check if there are any duplicated citations
 # 2. Remove potential thebibliography and bibitem in .tex file
 
references_generator.py CHANGED
@@ -32,7 +32,7 @@ def generate_raw_references(title, description="",
     print(f"keywords: {keywords}\n\n")
 
     ref.collect_papers(keywords, tldr=tldr)
-    # paper_json = ref.to_json()
+    paper_json = ref.to_json()
 
     with open(save_to, "w") as f:
         json.dump(paper_json, f)
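For context, re-enabling `paper_json = ref.to_json()` is what keeps the following `json.dump(paper_json, f)` from raising a NameError. A minimal sketch of reading the saved output back, assuming a placeholder file name in place of whatever `save_to` points at:

import json

# Hypothetical sanity check of the file written by generate_raw_references;
# "papers.json" stands in for the caller's `save_to` path.
with open("papers.json", "r", encoding="utf-8") as f:
    paper_json = json.load(f)

# References.to_json() returns a dict of papers, so counting entries is enough here.
print(f"collected {len(paper_json)} papers")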
utils/gpt_interaction.py CHANGED
@@ -25,7 +25,7 @@ def get_gpt_responses(systems, prompts, model="gpt-4", temperature=0.4):
 
 
 def get_gpt_responses_test(systems, prompts, model="gpt-4", temperature=0.4, base_url=None, key=None):
-    end_point = r"/v1/chat/completions"
+    end_point = r"/v1/completions"
     if base_url is None:
         base_url = r"https://api.openai.com" + end_point
     if key is None:
@@ -45,10 +45,13 @@ def get_gpt_responses_test(systems, prompts, model="gpt-4", temperature=0.4, base_url=None, key=None):
         "message": message,
         "temperature": temperature
     }
+    print(data)
     response = requests.post(url, headers=headers, json=data)
+    print(response)
     response = response.json()
     return response['choices'][0]["message"]["content"]
 
 
 if __name__ == "__main__":
-    pass
+    rep = get_gpt_responses_test(None, "Hello!", base_url=r"https://api.openai.com/v1/completions", key="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")
+    print(rep)
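For reference, the test helper now posts a payload with a singular "message" field to /v1/completions, while the public OpenAI HTTP API's chat endpoint (/v1/chat/completions) expects a "messages" list of role/content pairs. A minimal sketch of such a request, independent of the repo's helper; the key, model, and prompt are placeholders:

import requests

API_KEY = "sk-..."  # placeholder; do not commit real keys
url = "https://api.openai.com/v1/chat/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}",
}
data = {
    "model": "gpt-4",
    # chat completions take a list of messages, not a single "message" field
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    "temperature": 0.4,
}
response = requests.post(url, headers=headers, json=data)
print(response.json()["choices"][0]["message"]["content"])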
utils/references.py CHANGED
@@ -10,13 +10,13 @@
 # A sample prompt: {"paper_id": "paper summary"}
 
 # todo: (1) citations & citedby of provided papers:
-#       load the pre-defined papers; use S2 to find all related works
-#       add all citations to `bib_papers`
-#       add all citedby to `bib_papers`
-#       use Semantic Scholar to find their embeddings
+#       load the pre-defined papers; use S2 to find all related works
+#       add all citations to `bib_papers`
+#       add all citedby to `bib_papers`
+#       use Semantic Scholar to find their embeddings
 #       (2) separate references:
-#       divide references into different groups to reduce the tokens count
-#       for generating different paragraph of related works, use different set of references
+#       divide references into different groups to reduce the tokens count
+#       for generating different paragraph of related works, use different set of references
 
 import requests
 import re
@@ -44,7 +44,7 @@ def remove_newlines(serie):
 
 def search_paper_abstract(title):
     pg = ProxyGenerator()
-    success = pg.FreeProxies() #pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155")
+    success = pg.FreeProxies()  # pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155")
     if success:
         try:
             scholarly.use_proxy(pg)
@@ -91,15 +91,17 @@ def load_papers_from_bibtex(bib_file_path):
     return bib_papers
 
 
-
 # `tokenizer`: used to count how many tokens
 tokenizer_name = tiktoken.encoding_for_model('gpt-4')
 tokenizer = tiktoken.get_encoding(tokenizer_name.name)
 
+
 def tiktoken_len(text):
     # evaluate how many tokens for the given text
     tokens = tokenizer.encode(text, disallowed_special=())
     return len(tokens)
+
+
 ######################################################################################################################
 # Semantic Scholar (SS) API
 ######################################################################################################################
@@ -218,6 +220,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
     results = parse_search_results(search_results)
     return results
 
+
 ######################################################################################################################
 # References Class
 ######################################################################################################################
@@ -225,8 +228,7 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
 class References:
     def __init__(self, title, load_papers=None, keyword="customized_refs"):
         if load_papers is not None:
-            self.papers = {}
-            self.papers[keyword] = load_papers_from_bibtex(load_papers)
+            self.papers = {keyword: load_papers_from_bibtex(load_papers)}
         else:
             self.papers = {}
         self.title = title
@@ -259,7 +261,6 @@ class References:
         # for key, counts in keywords_dict.items():
         #     self.papers[key] = _collect_papers_ss(key, counts, tldr)
 
-
     def to_bibtex(self, path_to_bibtex="ref.bib"):
         """
         Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.
@@ -267,7 +268,7 @@ class References:
         # todo:
         #   use embeddings to evaluate; keep top k relevant references in papers
         #   send (title, .bib file) to evaluate embeddings; recieve truncated papers
-        papers = self._get_papers(keyword = "_all")
+        papers = self._get_papers(keyword="_all")
 
         # clear the bibtex file
         with open(path_to_bibtex, "w", encoding="utf-8") as file:
@@ -296,7 +297,7 @@ class References:
             file.write("\n\n")
         return paper_ids
 
-    def _get_papers(self, keyword = "_all"):
+    def _get_papers(self, keyword="_all"):
         if keyword == "_all":
             papers = []
             for k, v in self.papers.items():
@@ -305,7 +306,7 @@ class References:
             papers = self.papers["keyword"]
         return papers
 
-    def to_prompts(self, keyword = "_all", max_tokens = 2048):
+    def to_prompts(self, keyword="_all", max_tokens=2048):
        # `prompts`:
        #   {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"}
        #   this will be used to instruct GPT model to cite the correct bibtex entry.
@@ -319,6 +320,7 @@ class References:
             json.dump(papers_json, f)
 
         try:
+            # Use external API to obtain the most relevant papers
            title = self.title
            client = Client("https://shaocongma-evaluate-specter-embeddings.hf.space/")
            result = client.predict(
@@ -347,7 +349,7 @@ class References:
                 break
         return prompts
 
-    def to_json(self, keyword = "_all"):
+    def to_json(self, keyword="_all"):
         papers = self._get_papers(keyword)
         papers_json = {}
         for paper in papers:
@@ -355,7 +357,6 @@ class References:
         return papers_json
 
 
-
 if __name__ == "__main__":
     # testing search results
     print("================Testing `ss_search`================")
@@ -375,7 +376,7 @@ if __name__ == "__main__":
     print("================Testing `References.collect_papers`================")
     refs.collect_papers(keywords_dict, tldr=True)
     for k in refs.papers:
-        papers = refs.papers[k] # for each keyword, there is a list of papers
+        papers = refs.papers[k]  # for each keyword, there is a list of papers
         print("keyword: ", k)
         for paper in papers:
             print(paper["paper_id"])
@@ -384,8 +385,8 @@ if __name__ == "__main__":
     refs.to_bibtex()
 
     print("================Testing `References.to_json`================")
-    papers_json = refs.to_json() # this json can be used to find the most relevant papers
-    with open("papers.json", "w", encoding='utf-8') as text_file:
+    papers_json = refs.to_json()  # this json can be used to find the most relevant papers
+    with open("papers.json", "w", encoding='utf-8') as text_file:
         text_file.write(f"{papers_json}")
 
     print("================Testing `References.to_prompts`================")