singhvaibhav924 committed on
Commit
77e70e4
·
1 Parent(s): e1bc181

Server Optimised

Browse files
Files changed (3) hide show
  1. app.py +10 -26
  2. helper.py +60 -90
  3. requirements.txt +0 -0
app.py CHANGED
@@ -4,10 +4,6 @@ from fastapi import FastAPI
4
  from fastapi.middleware.cors import CORSMiddleware
5
  from pydantic import BaseModel
6
  from typing import List
7
- import os
8
- from dotenv import load_dotenv
9
-
10
- load_dotenv()
11
 
12
  app = FastAPI()
13
  app.add_middleware(
@@ -18,10 +14,6 @@ app.add_middleware(
18
  allow_headers=["*"]
19
  )
20
 
21
- llms = None
22
- base_prompt = "You will be provided with an abstract of a scientific document and other references papers in triple quotes. Your task is to write the related work section of the document using only the provided abstracts and other references papers. Please write the related work section creating a cohesive storyline by doing a critical analysis of prior work comparing the strengths and weaknesses while also motivating the proposed approach. You are also provided a sentence plan mentioning the total number of lines and the citations to refer in different lines. You should cite all the other related documents as [#] whenever you are referring it in the related work. Do not cite abstract. Do not include any extra notes or newline characters at the end. Do not copy the abstracts of reference papers directly but compare and contrast to the main work concisely. Do not provide the output in bullet points. Do not provide references at the end. Please cite all the provided reference papers. Please follow the plan when generating sentences, especially the number of lines to generate."
23
- sentence_plan = "1. Introduction sentence\n2. Overview of relevant studies\n3. Detailed discussion on key papers\n4. Summary of related work\n"
24
-
25
  class RequestData(BaseModel):
26
  abstract: str
27
  words: int
@@ -31,35 +23,27 @@ class ResponseData(BaseModel):
31
  summary: str
32
  ids: List[str]
33
 
 
 
 
34
  @app.post("/generateLiteratureSurvey/", response_model=ResponseData)
35
  async def generate_literature_survey(request_data: RequestData):
36
- summary, ids = summarize(request_data.abstract, request_data.words, request_data.papers, llms)
37
  return {"summary": summary,
38
  "ids": ids
39
  }
40
 
41
  @app.get("/")
42
  async def root():
43
- if llms == None :
44
- return {"status": 0}
45
  return {"status": 1}
46
 
47
- @app.get("/test")
48
- async def root():
49
- if llms == None :
50
- return {"status": 0}
51
- return {"status": 1}
52
-
53
- def summarize(query, n_words, n_papers, llms) :
54
- keywords = helper.extract_keywords(llms['feature_extractor'], query)
55
- papers = helper.search_papers(llms['arxiv_agent'], keywords, n_papers*2)
56
- ranked_papers = helper.re_rank_papers(llms['ranker'], query, papers, n_papers)
57
- literature_review, ids = helper.generate_related_work(llms['summarizer'], llms['summarizer_tokenizer'], query, ranked_papers, base_prompt, sentence_plan, n_words)
58
  return literature_review, ids
59
 
60
- print("Program running")
61
- llms = helper.init_pipeline()
62
- print('Model loaded')
63
-
64
  if __name__ == '__main__':
 
65
  uvicorn.run(app)
 
4
  from fastapi.middleware.cors import CORSMiddleware
5
  from pydantic import BaseModel
6
  from typing import List
 
 
 
 
7
 
8
  app = FastAPI()
9
  app.add_middleware(
 
14
  allow_headers=["*"]
15
  )
16
 
 
 
 
 
17
  class RequestData(BaseModel):
18
  abstract: str
19
  words: int
 
23
  summary: str
24
  ids: List[str]
25
 
26
+ base_prompt = "You will be provided with an abstract of a scientific document and other references papers in triple quotes. Your task is to write the related work section of the document using only the provided abstracts and other references papers. Please write the related work section creating a cohesive storyline by doing a critical analysis of prior work comparing the strengths and weaknesses while also motivating the proposed approach. You are also provided a sentence plan mentioning the total number of lines and the citations to refer in different lines. You should cite all the other related documents as [#] whenever you are referring it in the related work. Do not cite abstract. Do not include any extra notes or newline characters at the end. Do not copy the abstracts of reference papers directly but compare and contrast to the main work concisely. Do not provide the output in bullet points. Do not provide references at the end. Please cite all the provided reference papers. Please follow the plan when generating sentences, especially the number of lines to generate."
27
+ sentence_plan = "1. Introduction sentence\n2. Overview of relevant studies\n3. Detailed discussion on key papers\n4. Summary of related work\n"
28
+
29
@app.post("/generateLiteratureSurvey/", response_model=ResponseData)
async def generate_literature_survey(request_data: RequestData):
    """Build a literature survey for the submitted abstract.

    Args:
        request_data: Request body; its ``abstract``, ``words`` and ``papers``
            fields are forwarded to ``summarize``.

    Returns:
        Dict with the generated ``summary`` text and the list of paper ``ids``.
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    # summarize is a plain synchronous function. Calling it directly inside a
    # coroutine would hold the event loop for the whole pipeline, so no other
    # request (including the "/" status check) could be served meanwhile.
    summary, ids = await run_in_threadpool(
        summarize,
        request_data.abstract,
        request_data.words,
        request_data.papers,
    )
    return {"summary": summary, "ids": ids}
35
 
36
@app.get("/")
async def root():
    """Status endpoint: always answers with a status of 1."""
    status_payload = {"status": 1}
    return status_payload
39
 
40
def summarize(query, n_words, n_papers):
    """Run the survey pipeline: keywords, search, re-rank, then generation.

    Args:
        query: Abstract the survey is written for.
        n_words: Length budget handed to the generation step.
        n_papers: Number of papers to keep after re-ranking.

    Returns:
        Tuple of (literature review text, ids) as produced by
        ``helper.generate_related_work``.
    """
    key_terms = helper.extract_keywords(query)
    # Search for twice the requested count; re-ranking narrows it to n_papers.
    candidates = helper.search_papers(key_terms, n_papers * 2)
    shortlisted = helper.re_rank_papers(query, candidates, n_papers)
    review_text, paper_ids = helper.generate_related_work(
        query,
        shortlisted,
        base_prompt,
        sentence_plan,
        n_words,
    )
    return review_text, paper_ids
46
 
 
 
 
 
47
# Script entry point: announce start-up, then serve the app with uvicorn.
if __name__ == "__main__":
    print("Program running")
    uvicorn.run(app)
helper.py CHANGED
@@ -1,79 +1,60 @@
1
- from transformers import AutoModelForCausalLM, AutoTokenizer, TokenClassificationPipeline, AutoModelForTokenClassification, pipeline
2
- from langchain_community.utilities import ArxivAPIWrapper
3
- from transformers.pipelines import AggregationStrategy
4
- from sentence_transformers import SentenceTransformer
5
- import arxiv
6
  import numpy as np
7
- import torch
 
 
 
 
 
 
 
 
8
 
9
  summarizer_model_name = "microsoft/Phi-3-mini-4k-instruct"
10
  feature_extractor_model_name = "ml6team/keyphrase-extraction-kbir-inspec"
11
  ranker_model_name = "sentence-transformers/all-MiniLM-L6-v2"
12
 
13
- class KeyphraseExtractionPipeline(TokenClassificationPipeline):
14
- def __init__(self, model, *args, **kwargs):
15
- super().__init__(
16
- model=AutoModelForTokenClassification.from_pretrained(model),
17
- tokenizer=AutoTokenizer.from_pretrained(model),
18
- *args,
19
- **kwargs
20
- )
21
-
22
- def postprocess(self, all_outputs):
23
- results = super().postprocess(
24
- all_outputs=all_outputs,
25
- aggregation_strategy=AggregationStrategy.SIMPLE,
26
- )
27
- return np.unique([result.get("word").strip() for result in results])
28
 
29
- def init_pipeline() :
30
- summarizer_model = AutoModelForCausalLM.from_pretrained(
31
- summarizer_model_name,
32
- torch_dtype=torch.float16,
33
- trust_remote_code=True
34
- )
35
- summarizer_tokenizer = AutoTokenizer.from_pretrained(summarizer_model_name)
36
-
37
- feature_extractor_model = KeyphraseExtractionPipeline(model=feature_extractor_model_name)
38
-
39
- ranker_model=SentenceTransformer(ranker_model_name)
40
-
41
- arxiv_agent = ArxivAPIWrapper(top_k_results = 5, doc_content_chars_max = None, load_max_docs = 10)
42
- return {
43
- "summarizer" : summarizer_model,
44
- "summarizer_tokenizer" : summarizer_tokenizer,
45
- "feature_extractor" : feature_extractor_model,
46
- "ranker" : ranker_model,
47
- "arxiv_agent" : arxiv_agent
48
- }
49
-
50
- def extract_keywords(model, abstract):
51
- keyphrases = model(abstract)
52
  print(keyphrases)
53
  return keyphrases
54
 
55
-
56
- def search_papers(arxiv_agent, keywords, n_papers):
57
  query = " ".join(keywords)
58
  results = arxiv_agent.get_summaries_as_docs(query)
59
- #print("arxiv ouptut ")
60
- #print(results)
61
  return results
62
 
63
- def re_rank_papers(model, query_abstract, papers, n_papers):
64
- summaries = {paper.page_content : {"Title":paper.metadata['Title']} for paper in papers}
65
- print(summaries)
66
- target_embeddings = model.encode([query_abstract])
67
- summaries_embeddings = model.encode(list(summaries.keys()))
68
 
69
- cosine_similarities = -torch.nn.functional.cosine_similarity(torch.from_numpy(target_embeddings), torch.from_numpy(summaries_embeddings))
70
- cosine_similarities = cosine_similarities.tolist()
71
-
72
- i = 0
73
- for key in summaries.keys() :
74
- summaries[key]["score"] = cosine_similarities[i]
75
- i+=1
76
- return dict(sorted(summaries.items(), key=lambda x: x[1]["score"], reverse=True))
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  def format_abstracts_as_references(papers):
79
  cite_text = ""
@@ -121,38 +102,27 @@ def generate_refs(papers) :
121
  i+=1
122
  return refs, ids
123
 
124
- def generate_related_work(model, tokenizer, query_abstract, ranked_papers, base_prompt, sentence_plan, n_words):
125
- input_text = f"Abstract: {query_abstract}\n"
126
- i = 1
127
- for key in ranked_papers.keys():
128
- input_text += f"{i+1}. {ranked_papers[key]['Title']} - {key}\n"
129
- i+=1
130
-
131
  data = f"Abstract: {query_abstract} \n {format_abstracts_as_references(ranked_papers)} \n Plan: {sentence_plan}"
132
  complete_prompt = f"{base_prompt}\n```{data}```"
133
- messages = [
134
- {"role": "system", "content": "You are a helpful AI assistant."},
135
- {"role": "user", "content": complete_prompt}]
136
 
137
- pipe = pipeline(
138
- "text-generation",
139
- model=model,
140
- tokenizer=tokenizer,
141
- )
142
-
143
- generation_args = {
144
- "max_new_tokens": n_words,
145
- "return_full_text": False,
146
- "temperature": 0.0,
147
- "do_sample": False,
148
- }
149
-
150
- output = pipe(messages, **generation_args)
151
- print(output)
152
- related_work = output[0]['generated_text']
153
  refs, ids = generate_refs(ranked_papers)
154
  related_work += refs
155
- f = open("literature review.txt", "w")
156
- f.write(related_work)
157
- f.close()
 
158
  return related_work, ids
 
1
+ import requests
 
 
 
 
2
  import numpy as np
3
+ import arxiv
4
+ from langchain.utilities import ArxivAPIWrapper
5
+ import os
6
+ from dotenv import load_dotenv
7
+
8
+ load_dotenv()
9
+
10
+ HF_API_TOKEN = os.environ.get('HF_API_TOKEN')
11
+ HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"}
12
 
13
  summarizer_model_name = "microsoft/Phi-3-mini-4k-instruct"
14
  feature_extractor_model_name = "ml6team/keyphrase-extraction-kbir-inspec"
15
  ranker_model_name = "sentence-transformers/all-MiniLM-L6-v2"
16
 
17
def hf_api_call(model_name, payload, timeout=120):
    """Send ``payload`` to the hosted Hugging Face inference endpoint of a model.

    Args:
        model_name: Hub model id, interpolated into the endpoint URL.
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout ``requests`` waits indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model_name}"
    response = requests.post(api_url, headers=HEADERS, json=payload, timeout=timeout)
    # Fail here, with the HTTP status attached, instead of handing an error
    # payload to callers that index into the result as if it were a prediction.
    response.raise_for_status()
    return response.json()
 
 
 
 
 
 
 
 
 
 
 
21
 
22
def extract_keywords(abstract):
    """Return the unique keyphrases the extraction model finds in ``abstract``.

    Args:
        abstract: Text to extract keyphrases from.

    Returns:
        NumPy array of the distinct, whitespace-stripped ``word`` values from
        the model response, in the sorted order ``np.unique`` produces.

    Raises:
        RuntimeError: If the response is not a list of predictions.
    """
    payload = {"inputs": abstract}
    result = hf_api_call(feature_extractor_model_name, payload)
    # The response is decoded JSON of any shape; a non-list (for example an
    # error object) would otherwise fail on item['word'] with a misleading
    # TypeError.
    if not isinstance(result, list):
        raise RuntimeError(f"Unexpected keyphrase extraction response: {result!r}")
    keyphrases = np.unique([item['word'].strip() for item in result])
    print(keyphrases)
    return keyphrases
28
 
29
def search_papers(keywords, n_papers):
    """Search arXiv for the space-joined keywords and return summary documents.

    Args:
        keywords: Iterable of keyword strings; joined with single spaces to
            form the search query.
        n_papers: Used as ``top_k_results``; ``load_max_docs`` is set three
            higher.

    Returns:
        Whatever ``ArxivAPIWrapper.get_summaries_as_docs`` returns for the query.
    """
    wrapper = ArxivAPIWrapper(
        top_k_results=n_papers,
        doc_content_chars_max=None,
        load_max_docs=n_papers + 3,
    )
    search_text = " ".join(keywords)
    return wrapper.get_summaries_as_docs(search_text)
34
 
35
def re_rank_papers(query_abstract, papers, n_papers):
    """Rank retrieved papers by similarity to the query abstract; keep the best.

    Args:
        query_abstract: Abstract the paper summaries are compared against.
        papers: Documents exposing ``page_content`` and ``metadata['Title']``.
        n_papers: Maximum number of papers to keep.

    Returns:
        Dict mapping each kept summary text to ``{"Title": ..., "score": ...}``,
        inserted from highest to lowest score. It holds fewer than
        ``n_papers`` entries when fewer distinct summaries were supplied, and
        is empty when ``papers`` is empty.

    Raises:
        RuntimeError: If the ranker returns fewer scores than summaries.
    """
    # Keyed by summary text, so papers with identical summaries collapse.
    titles = {paper.page_content: paper.metadata['Title'] for paper in papers}
    if not titles:
        # Nothing to rank: skip the remote call entirely.
        return {}

    texts = list(titles)
    payload = {
        "inputs": {
            "source_sentence": query_abstract,
            "sentences": texts,
        }
    }
    result = hf_api_call(ranker_model_name, payload)
    if not isinstance(result, list) or len(result) < len(texts):
        raise RuntimeError(f"Unexpected ranker response: {result!r}")

    scored = []
    for text, score in zip(texts, result):
        entry = (text, titles[text], score)
        scored.append(entry)
        print(entry)
    scored.sort(key=lambda item: item[2], reverse=True)

    # Slice rather than index over range(n_papers): the search may yield fewer
    # distinct papers than requested, which used to raise IndexError.
    top = scored[:max(n_papers, 0)]
    return {text: {"Title": title, "score": score} for text, title, score in top}
58
 
59
  def format_abstracts_as_references(papers):
60
  cite_text = ""
 
102
  i+=1
103
  return refs, ids
104
 
105
+
106
def generate_related_work(query_abstract, ranked_papers, base_prompt, sentence_plan, n_words):
    """Generate the related-work text for an abstract from its ranked papers.

    Args:
        query_abstract: Abstract of the document being written.
        ranked_papers: Papers to cite; passed to
            ``format_abstracts_as_references`` and ``generate_refs``.
        base_prompt: Instruction text placed before the fenced data block.
        sentence_plan: Plan text appended to the data block.
        n_words: Sent to the model as ``max_new_tokens`` (a token budget).

    Returns:
        Tuple of (generated text with the reference list appended, ids).

    Raises:
        RuntimeError: If the model response carries no ``generated_text``.
    """
    data = f"Abstract: {query_abstract} \n {format_abstracts_as_references(ranked_papers)} \n Plan: {sentence_plan}"
    complete_prompt = f"{base_prompt}\n```{data}```"

    payload = {
        "inputs": complete_prompt,
        "parameters": {
            "max_new_tokens": n_words,
            "temperature": 0.01,
            "do_sample": False,
            # Ask for the completion only; otherwise the endpoint may echo the
            # whole prompt back in front of the generated text.
            "return_full_text": False,
        },
    }

    result = hf_api_call(summarizer_model_name, payload)
    print(result)
    # Guard the indexing below: an error object or empty list would otherwise
    # surface as an unrelated KeyError/IndexError.
    if (
        not isinstance(result, list)
        or not result
        or not isinstance(result[0], dict)
        or 'generated_text' not in result[0]
    ):
        raise RuntimeError(f"Unexpected text generation response: {result!r}")
    related_work = result[0]['generated_text']
    refs, ids = generate_refs(ranked_papers)
    related_work += refs

    # Explicit encoding so non-ASCII characters in titles or generated text do
    # not depend on the platform's default codec.
    with open("literature review.txt", "w", encoding="utf-8") as f:
        f.write(related_work)

    return related_work, ids
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ