Spaces:
Sleeping
Sleeping
Commit
·
77e70e4
1
Parent(s):
e1bc181
Server Optimised
Browse files- app.py +10 -26
- helper.py +60 -90
- requirements.txt +0 -0
app.py
CHANGED
@@ -4,10 +4,6 @@ from fastapi import FastAPI
|
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
5 |
from pydantic import BaseModel
|
6 |
from typing import List
|
7 |
-
import os
|
8 |
-
from dotenv import load_dotenv
|
9 |
-
|
10 |
-
load_dotenv()
|
11 |
|
12 |
app = FastAPI()
|
13 |
app.add_middleware(
|
@@ -18,10 +14,6 @@ app.add_middleware(
|
|
18 |
allow_headers=["*"]
|
19 |
)
|
20 |
|
21 |
-
llms = None
|
22 |
-
base_prompt = "You will be provided with an abstract of a scientific document and other references papers in triple quotes. Your task is to write the related work section of the document using only the provided abstracts and other references papers. Please write the related work section creating a cohesive storyline by doing a critical analysis of prior work comparing the strengths and weaknesses while also motivating the proposed approach. You are also provided a sentence plan mentioning the total number of lines and the citations to refer in different lines. You should cite all the other related documents as [#] whenever you are referring it in the related work. Do not cite abstract. Do not include any extra notes or newline characters at the end. Do not copy the abstracts of reference papers directly but compare and contrast to the main work concisely. Do not provide the output in bullet points. Do not provide references at the end. Please cite all the provided reference papers. Please follow the plan when generating sentences, especially the number of lines to generate."
|
23 |
-
sentence_plan = "1. Introduction sentence\n2. Overview of relevant studies\n3. Detailed discussion on key papers\n4. Summary of related work\n"
|
24 |
-
|
25 |
class RequestData(BaseModel):
|
26 |
abstract: str
|
27 |
words: int
|
@@ -31,35 +23,27 @@ class ResponseData(BaseModel):
|
|
31 |
summary: str
|
32 |
ids: List[str]
|
33 |
|
|
|
|
|
|
|
34 |
@app.post("/generateLiteratureSurvey/", response_model=ResponseData)
|
35 |
async def generate_literature_survey(request_data: RequestData):
|
36 |
-
summary, ids = summarize(request_data.abstract, request_data.words, request_data.papers
|
37 |
return {"summary": summary,
|
38 |
"ids": ids
|
39 |
}
|
40 |
|
41 |
@app.get("/")
|
42 |
async def root():
|
43 |
-
if llms == None :
|
44 |
-
return {"status": 0}
|
45 |
return {"status": 1}
|
46 |
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
def summarize(query, n_words, n_papers, llms) :
|
54 |
-
keywords = helper.extract_keywords(llms['feature_extractor'], query)
|
55 |
-
papers = helper.search_papers(llms['arxiv_agent'], keywords, n_papers*2)
|
56 |
-
ranked_papers = helper.re_rank_papers(llms['ranker'], query, papers, n_papers)
|
57 |
-
literature_review, ids = helper.generate_related_work(llms['summarizer'], llms['summarizer_tokenizer'], query, ranked_papers, base_prompt, sentence_plan, n_words)
|
58 |
return literature_review, ids
|
59 |
|
60 |
-
print("Program running")
|
61 |
-
llms = helper.init_pipeline()
|
62 |
-
print('Model loaded')
|
63 |
-
|
64 |
if __name__ == '__main__':
|
|
|
65 |
uvicorn.run(app)
|
|
|
4 |
from fastapi.middleware.cors import CORSMiddleware
|
5 |
from pydantic import BaseModel
|
6 |
from typing import List
|
|
|
|
|
|
|
|
|
7 |
|
8 |
app = FastAPI()
|
9 |
app.add_middleware(
|
|
|
14 |
allow_headers=["*"]
|
15 |
)
|
16 |
|
|
|
|
|
|
|
|
|
17 |
class RequestData(BaseModel):
|
18 |
abstract: str
|
19 |
words: int
|
|
|
23 |
summary: str
|
24 |
ids: List[str]
|
25 |
|
26 |
+
base_prompt = "You will be provided with an abstract of a scientific document and other references papers in triple quotes. Your task is to write the related work section of the document using only the provided abstracts and other references papers. Please write the related work section creating a cohesive storyline by doing a critical analysis of prior work comparing the strengths and weaknesses while also motivating the proposed approach. You are also provided a sentence plan mentioning the total number of lines and the citations to refer in different lines. You should cite all the other related documents as [#] whenever you are referring it in the related work. Do not cite abstract. Do not include any extra notes or newline characters at the end. Do not copy the abstracts of reference papers directly but compare and contrast to the main work concisely. Do not provide the output in bullet points. Do not provide references at the end. Please cite all the provided reference papers. Please follow the plan when generating sentences, especially the number of lines to generate."
|
27 |
+
sentence_plan = "1. Introduction sentence\n2. Overview of relevant studies\n3. Detailed discussion on key papers\n4. Summary of related work\n"
|
28 |
+
|
29 |
@app.post("/generateLiteratureSurvey/", response_model=ResponseData)
async def generate_literature_survey(request_data: RequestData):
    """Build a literature survey for the abstract carried by the request.

    Reads ``abstract``, ``words`` and ``papers`` from ``request_data`` and
    forwards them to ``summarize``.

    Returns:
        A dict with the keys ``"summary"`` and ``"ids"``.
    """
    # Imported locally so this handler carries its own dependency.
    from fastapi.concurrency import run_in_threadpool

    # ``summarize`` performs synchronous HTTP calls; running it directly in
    # this coroutine would stall the event loop (and every other request)
    # until it finishes, so it is pushed onto the worker thread pool.
    summary, ids = await run_in_threadpool(
        summarize,
        request_data.abstract,
        request_data.words,
        request_data.papers,
    )
    return {"summary": summary, "ids": ids}
|
35 |
|
36 |
@app.get("/")
async def root():
    """Liveness endpoint: always answers with a status of 1."""
    status_payload = {"status": 1}
    return status_payload
|
39 |
|
40 |
+
def summarize(query, n_words, n_papers):
    """Run the survey pipeline for ``query`` and return its text and paper ids.

    Steps, all delegated to ``helper``: extract keywords from the query,
    search for ``n_papers * 2`` candidate papers, re-rank them down to
    ``n_papers``, then generate the related-work text using the module-level
    ``base_prompt`` and ``sentence_plan``.

    Returns:
        The ``(literature_review, ids)`` pair produced by
        ``helper.generate_related_work``.
    """
    key_terms = helper.extract_keywords(query)
    # Twice as many candidates as requested are fetched so that the
    # re-ranking step has something to choose from.
    candidates = helper.search_papers(key_terms, n_papers * 2)
    shortlist = helper.re_rank_papers(query, candidates, n_papers)
    review_text, paper_ids = helper.generate_related_work(
        query,
        shortlist,
        base_prompt,
        sentence_plan,
        n_words,
    )
    return review_text, paper_ids
|
46 |
|
|
|
|
|
|
|
|
|
47 |
# Script entry point: announce start-up, then hand the app to uvicorn.
if __name__ == "__main__":
    print("Program running")
    uvicorn.run(app)
|
helper.py
CHANGED
@@ -1,79 +1,60 @@
|
|
1 |
-
|
2 |
-
from langchain_community.utilities import ArxivAPIWrapper
|
3 |
-
from transformers.pipelines import AggregationStrategy
|
4 |
-
from sentence_transformers import SentenceTransformer
|
5 |
-
import arxiv
|
6 |
import numpy as np
|
7 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
summarizer_model_name = "microsoft/Phi-3-mini-4k-instruct"
|
10 |
feature_extractor_model_name = "ml6team/keyphrase-extraction-kbir-inspec"
|
11 |
ranker_model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
tokenizer=AutoTokenizer.from_pretrained(model),
|
18 |
-
*args,
|
19 |
-
**kwargs
|
20 |
-
)
|
21 |
-
|
22 |
-
def postprocess(self, all_outputs):
|
23 |
-
results = super().postprocess(
|
24 |
-
all_outputs=all_outputs,
|
25 |
-
aggregation_strategy=AggregationStrategy.SIMPLE,
|
26 |
-
)
|
27 |
-
return np.unique([result.get("word").strip() for result in results])
|
28 |
|
29 |
-
def
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
trust_remote_code=True
|
34 |
-
)
|
35 |
-
summarizer_tokenizer = AutoTokenizer.from_pretrained(summarizer_model_name)
|
36 |
-
|
37 |
-
feature_extractor_model = KeyphraseExtractionPipeline(model=feature_extractor_model_name)
|
38 |
-
|
39 |
-
ranker_model=SentenceTransformer(ranker_model_name)
|
40 |
-
|
41 |
-
arxiv_agent = ArxivAPIWrapper(top_k_results = 5, doc_content_chars_max = None, load_max_docs = 10)
|
42 |
-
return {
|
43 |
-
"summarizer" : summarizer_model,
|
44 |
-
"summarizer_tokenizer" : summarizer_tokenizer,
|
45 |
-
"feature_extractor" : feature_extractor_model,
|
46 |
-
"ranker" : ranker_model,
|
47 |
-
"arxiv_agent" : arxiv_agent
|
48 |
-
}
|
49 |
-
|
50 |
-
def extract_keywords(model, abstract):
|
51 |
-
keyphrases = model(abstract)
|
52 |
print(keyphrases)
|
53 |
return keyphrases
|
54 |
|
55 |
-
|
56 |
-
|
57 |
query = " ".join(keywords)
|
58 |
results = arxiv_agent.get_summaries_as_docs(query)
|
59 |
-
#print("arxiv ouptut ")
|
60 |
-
#print(results)
|
61 |
return results
|
62 |
|
63 |
-
def re_rank_papers(
|
64 |
-
summaries = {paper.page_content
|
65 |
-
|
66 |
-
target_embeddings = model.encode([query_abstract])
|
67 |
-
summaries_embeddings = model.encode(list(summaries.keys()))
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
def format_abstracts_as_references(papers):
|
79 |
cite_text = ""
|
@@ -121,38 +102,27 @@ def generate_refs(papers) :
|
|
121 |
i+=1
|
122 |
return refs, ids
|
123 |
|
124 |
-
|
125 |
-
|
126 |
-
i = 1
|
127 |
-
for key in ranked_papers.keys():
|
128 |
-
input_text += f"{i+1}. {ranked_papers[key]['Title']} - {key}\n"
|
129 |
-
i+=1
|
130 |
-
|
131 |
data = f"Abstract: {query_abstract} \n {format_abstracts_as_references(ranked_papers)} \n Plan: {sentence_plan}"
|
132 |
complete_prompt = f"{base_prompt}\n```{data}```"
|
133 |
-
messages = [
|
134 |
-
{"role": "system", "content": "You are a helpful AI assistant."},
|
135 |
-
{"role": "user", "content": complete_prompt}]
|
136 |
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
output = pipe(messages, **generation_args)
|
151 |
-
print(output)
|
152 |
-
related_work = output[0]['generated_text']
|
153 |
refs, ids = generate_refs(ranked_papers)
|
154 |
related_work += refs
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
158 |
return related_work, ids
|
|
|
1 |
+
import requests
|
|
|
|
|
|
|
|
|
2 |
import numpy as np
|
3 |
+
import arxiv
|
4 |
+
from langchain.utilities import ArxivAPIWrapper
|
5 |
+
import os
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
|
8 |
+
load_dotenv()
|
9 |
+
|
10 |
+
HF_API_TOKEN = os.environ.get('HF_API_TOKEN')
|
11 |
+
HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"}
|
12 |
|
13 |
summarizer_model_name = "microsoft/Phi-3-mini-4k-instruct"
|
14 |
feature_extractor_model_name = "ml6team/keyphrase-extraction-kbir-inspec"
|
15 |
ranker_model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
16 |
|
17 |
+
def hf_api_call(model_name, payload, timeout=120):
    """POST ``payload`` to the hosted inference endpoint of ``model_name``.

    Args:
        model_name: Model repository id; appended to the inference API URL.
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. The
            default is a conservative guess, not a measured value — tune it
            for the models actually in use.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model_name}"
    # ``requests`` waits forever unless a timeout is given, which would hang
    # the web request that triggered this call.
    response = requests.post(api_url, headers=HEADERS, json=payload, timeout=timeout)
    # Surface HTTP failures here instead of handing an error body to callers
    # that index the result as a list of predictions.
    response.raise_for_status()
    return response.json()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
+
def extract_keywords(abstract):
    """Extract the distinct keyphrases of ``abstract`` via the hosted model.

    Args:
        abstract: Text sent as the ``inputs`` field of the request.

    Returns:
        A NumPy array of the unique, whitespace-stripped ``word`` values
        found in the response (``np.unique`` also sorts them).

    Raises:
        RuntimeError: If the API response is not a list of results, e.g.
            when it is an error object.
    """
    result = hf_api_call(feature_extractor_model_name, {"inputs": abstract})
    # An error response is a dict; iterating it would yield its string keys
    # and fail later with an unhelpful "string indices" TypeError.
    if not isinstance(result, list):
        raise RuntimeError(f"Keyphrase extraction failed: {result!r}")
    keyphrases = np.unique([item["word"].strip() for item in result])
    print(keyphrases)
    return keyphrases
|
28 |
|
29 |
+
def search_papers(keywords, n_papers):
    """Search arXiv for the space-joined ``keywords``.

    A fresh ``ArxivAPIWrapper`` is configured per call with
    ``top_k_results=n_papers`` and ``load_max_docs=n_papers + 3``.

    Returns:
        Whatever ``get_summaries_as_docs`` yields for the joined query.
    """
    wrapper = ArxivAPIWrapper(
        top_k_results=n_papers,
        doc_content_chars_max=None,
        load_max_docs=n_papers + 3,
    )
    search_text = " ".join(keywords)
    return wrapper.get_summaries_as_docs(search_text)
|
34 |
|
35 |
+
def re_rank_papers(query_abstract, papers, n_papers):
    """Keep the papers scoring highest against ``query_abstract``.

    Args:
        query_abstract: Text sent as ``source_sentence`` to the ranker model.
        papers: Iterable of documents exposing ``page_content`` and
            ``metadata['Title']``.
        n_papers: Maximum number of papers to keep.

    Returns:
        A dict mapping each kept ``page_content`` to
        ``{"Title": ..., "score": ...}``, ordered from highest to lowest
        score. It holds fewer than ``n_papers`` entries when fewer distinct
        papers are available, and is empty when ``papers`` is empty.

    Raises:
        RuntimeError: If the ranker does not return exactly one score per
            distinct paper.
    """
    # Keyed by summary text, so papers with identical summaries collapse
    # into one entry.
    titles_by_summary = {
        paper.page_content: paper.metadata["Title"] for paper in papers
    }
    if not titles_by_summary:
        # Nothing to rank; skip the remote call altogether.
        return {}

    payload = {
        "inputs": {
            "source_sentence": query_abstract,
            "sentences": list(titles_by_summary),
        }
    }
    scores = hf_api_call(ranker_model_name, payload)
    if not isinstance(scores, list) or len(scores) != len(titles_by_summary):
        raise RuntimeError(f"Paper ranking failed: {scores!r}")

    scored = []
    for (summary, title), score in zip(titles_by_summary.items(), scores):
        entry = (summary, title, score)
        scored.append(entry)
        print(entry)
    scored.sort(key=lambda entry: entry[2], reverse=True)

    # Slicing never runs past the end, unlike indexing up to ``n_papers``
    # when the search produced fewer papers than requested.
    keep = max(n_papers, 0)
    return {
        summary: {"Title": title, "score": score}
        for summary, title, score in scored[:keep]
    }
|
58 |
|
59 |
def format_abstracts_as_references(papers):
|
60 |
cite_text = ""
|
|
|
102 |
i+=1
|
103 |
return refs, ids
|
104 |
|
105 |
+
|
106 |
+
def generate_related_work(query_abstract, ranked_papers, base_prompt, sentence_plan, n_words):
    """Generate the related-work text for ``query_abstract``.

    The prompt is ``base_prompt`` followed by a fenced block holding the
    abstract, the formatted reference abstracts and ``sentence_plan``. The
    generated text has the reference list from ``generate_refs`` appended
    and is also written to ``literature review.txt`` in the working
    directory.

    Args:
        query_abstract: Abstract of the document being written.
        ranked_papers: Papers passed to ``format_abstracts_as_references``
            and ``generate_refs``.
        base_prompt: Instruction text placed ahead of the data block.
        sentence_plan: Plan text embedded in the data block.
        n_words: Sent as ``max_new_tokens`` — note this limits tokens, not
            words.

    Returns:
        A ``(related_work, ids)`` tuple.

    Raises:
        RuntimeError: If the API response is not a non-empty list.
    """
    data = f"Abstract: {query_abstract} \n {format_abstracts_as_references(ranked_papers)} \n Plan: {sentence_plan}"
    complete_prompt = f"{base_prompt}\n```{data}```"

    payload = {
        "inputs": complete_prompt,
        "parameters": {
            "max_new_tokens": n_words,
            "temperature": 0.01,
            "do_sample": False,
            # The text-generation endpoint echoes the prompt in
            # ``generated_text`` by default; ask for the completion only so
            # the instructions do not end up inside the survey.
            "return_full_text": False,
        },
    }

    result = hf_api_call(summarizer_model_name, payload)
    print(result)
    # Error responses arrive as a dict; fail with the payload instead of a
    # bare KeyError on ``result[0]``.
    if not isinstance(result, list) or not result:
        raise RuntimeError(f"Related-work generation failed: {result!r}")
    related_work = result[0]["generated_text"]

    refs, ids = generate_refs(ranked_papers)
    related_work += refs

    # Explicit encoding: generated text may contain characters that the
    # platform default codec cannot encode.
    with open("literature review.txt", "w", encoding="utf-8") as f:
        f.write(related_work)

    return related_work, ids
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|