Spaces:
Runtime error
Runtime error
Added main app.py file and requirements.txt
Browse files- app.py +110 -0
- requirements.txt +86 -0
app.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.document_loaders import ArxivLoader
|
2 |
+
from transformers import pipeline
|
3 |
+
import gradio as gr
|
4 |
+
|
5 |
+
def strip(content):
    """Flatten *content* into a single line of text.

    The argument is converted with ``str()`` and every newline character is
    replaced by one space, so the text extracted from a paper can be
    handled as one continuous string.

    Args:
        content: Any object; it is stringified before processing.

    Returns:
        The stringified content with each ``"\\n"`` replaced by ``" "``.
    """
    # A single replace is equivalent to splitting on "\n" and re-joining
    # with spaces, without building the intermediate list.
    return str(content).replace("\n", " ")
|
13 |
+
|
14 |
+
def clip(content):
    """Trim paper text to the span between its introduction and references.

    The text is cut from the first occurrence of ``"Introduction"`` up to
    (not including) the last occurrence of ``"Reference"``.

    Args:
        content: Full paper text as a single string.

    Returns:
        * ``content[intro:reference]`` when both markers are found and the
          reference marker comes after the introduction marker;
        * ``content[intro:]`` when only a usable introduction marker exists;
        * the unchanged ``content`` when no introduction marker exists.

    A warning is printed whenever one of the markers cannot be used.
    """
    intro_at = content.find("Introduction")
    refs_at = content.rfind("Reference")

    if intro_at == -1:
        print("Warning: Paper Doesn't Have an Introduction Title, these may lead to overlap of summarization")
        return content

    # Only accept a reference marker located after the introduction; a
    # marker at or before it would make the slice empty and silently drop
    # the entire paper body.
    if refs_at > intro_at:
        return content[intro_at:refs_at]

    print("Warning: Paper Doesn't have a References Title, may lead to overlap of references in summary")
    return content[intro_at:]
|
27 |
+
|
28 |
+
|
29 |
+
def chunk(content, size=10):
    """Split paper text into groups of consecutive sentences.

    The text is first trimmed with ``clip`` and then split on ``". "``.
    Empty fragments are discarded and the remaining sentences are joined
    back together in groups of ``size``.

    Args:
        content: Paper text as a single string.
        size: Maximum number of sentences per group (default 10, the value
            that used to be hard-coded).

    Returns:
        A list of strings, each holding up to ``size`` sentences. The list
        is empty when the clipped text contains no sentences.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be at least 1")

    content = clip(content)

    # Drop blank fragments so empty input does not produce a chunk that
    # consists only of punctuation.
    sentences = [piece.strip() for piece in content.split(". ")]
    sentences = [piece for piece in sentences if piece]

    groups = []
    for start in range(0, len(sentences), size):
        text = ". ".join(sentences[start:start + size])
        # Splitting removed the separator after every sentence but the
        # last one; restore a terminator only when it is actually missing
        # so the final sentence does not end in a doubled period.
        if not text.endswith((".", "!", "?")):
            text += "."
        groups.append(text)

    return groups
|
48 |
+
|
49 |
+
|
50 |
+
_SUMMARIZATION_MODEL = "Falconsai/text_summarization"

# Loaded pipelines keyed by model name, so the model is loaded once per
# process instead of once per request.
_summarizer_cache = {}


def _get_summarizer(model_name=_SUMMARIZATION_MODEL):
    """Return the summarization pipeline for *model_name*, building it once."""
    if model_name not in _summarizer_cache:
        _summarizer_cache[model_name] = pipeline(
            "summarization", model=model_name, tokenizer=model_name
        )
    return _summarizer_cache[model_name]


def summarize(sent):
    """Summarize every text chunk and concatenate the results.

    Args:
        sent: Iterable of text chunks (strings). Empty or whitespace-only
            chunks are skipped.

    Returns:
        One string containing the summary of each chunk followed by a
        newline, in input order. Returns ``""`` when there is nothing to
        summarize, in which case the model is not loaded at all.
    """
    chunks = [text for text in sent if text and text.strip()]
    if not chunks:
        return ""

    summarizer = _get_summarizer()

    pieces = []
    for text in chunks:
        # truncation=True asks the pipeline to cut chunks that exceed the
        # model's maximum input length instead of failing on them.
        result = summarizer(
            text,
            max_length=256,
            min_length=64,
            do_sample=False,
            truncation=True,
        )
        pieces.append(result[0]['summary_text'] + "\n")

    return "".join(pieces)
|
63 |
+
|
64 |
+
def fn_one(search_query, n_docs):
    """Fetch arXiv documents for a query and list their titles.

    Args:
        search_query: Text passed to ``ArxivLoader`` as the query.
        n_docs: Maximum number of documents to load. A missing value
            (nothing selected in the UI) falls back to 1.

    Returns:
        A 3-tuple ``(dropdown, docs, n_pairs)``:
        * ``dropdown`` - a ``gr.Dropdown`` whose choices are the titles;
        * ``docs`` - the list of loaded documents;
        * ``n_pairs`` - dict mapping each title to its index in ``docs``.
    """
    n_docs = int(n_docs) if n_docs else 1

    docs = ArxivLoader(query=search_query, load_max_docs=n_docs).load()
    print(search_query, n_docs)

    titles = []
    n_pairs = {}
    # Iterate over what was actually returned: the loader may yield fewer
    # documents than requested, and indexing by range(n_docs) would then
    # raise IndexError.
    for index, doc in enumerate(docs):
        title = doc.metadata['Title']
        if title in n_pairs:
            # Keep the first hit so the dropdown has no duplicate entries.
            continue
        titles.append(title)
        n_pairs[title] = index

    return gr.Dropdown(choices=titles), docs, n_pairs
|
74 |
+
|
75 |
+
def fn_two(choice, docs, n_pairs):
    """Summarize the selected paper and format the result for display.

    Args:
        choice: Title selected in the dropdown.
        docs: Documents previously returned by ``fn_one``.
        n_pairs: Dict mapping titles to indices in ``docs``.

    Returns:
        A text block with the paper's publication date, title, authors and
        generated summary, or a short hint when no paper can be resolved
        from the arguments.
    """
    key = str(choice)
    # Guard against "Summarize" being pressed before anything was fetched
    # or selected; the lookups below would otherwise raise.
    if not docs or not n_pairs or key not in n_pairs:
        return "No paper selected. Fetch papers and choose one first."

    selected = docs[n_pairs[key]]
    metadata = selected.metadata

    content = strip(selected.page_content)
    sent = chunk(content)
    summarized = summarize(sent)

    return (
        f"Date: {metadata['Published']}\n"
        f"\n Title: {metadata['Title']}\n"
        f"\n Authors: {metadata['Authors']}\n"
        f"\n Summary: \n{summarized}"
    )
|
88 |
+
|
89 |
+
|
90 |
+
# UI layout: the first row collects the query and fetches matching papers,
# the second row lets the user pick one paper and request its summary.
with gr.Blocks() as demo:
    with gr.Row():
        paper_name = gr.Textbox(label="Enter Paper Name/ID")
        # The label is passed as a properly quoted keyword and the choices
        # are given by name; the previous call had mismatched quotes and a
        # positional argument after a keyword, which is a syntax error.
        n_docs = gr.Dropdown(
            choices=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            value=1,
            label="Number of Docs to Load",
        )
        # Per-session storage for the fetched documents and the
        # title -> index mapping produced by fn_one.
        docs = gr.State()
        n_pairs = gr.State()
        fetch_btn = gr.Button("Fetch")
    with gr.Row():
        # The label must be a plain string: paper_name is a component, not
        # the text typed by the user, so it cannot be concatenated here.
        choice = gr.Dropdown(label="Papers matching the query", interactive=True)
        submit_btn = gr.Button('Fetch & Summarize')
    result = gr.Textbox(label="Summary", visible=True)

    fetch_btn.click(
        fn=fn_one,
        inputs=[paper_name, n_docs],
        outputs=[choice, docs, n_pairs],
        api_name="fetch",
    )
    submit_btn.click(
        fn=fn_two,
        inputs=[choice, docs, n_pairs],
        outputs=[result],
        api_name="submit",
    )

if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiohttp==3.9.1
|
2 |
+
aiosignal==1.3.1
|
3 |
+
altair==5.2.0
|
4 |
+
annotated-types==0.6.0
|
5 |
+
anyio==4.1.0
|
6 |
+
arxiv==2.0.0
|
7 |
+
attrs==23.1.0
|
8 |
+
blinker==1.7.0
|
9 |
+
cachetools==5.3.2
|
10 |
+
certifi==2023.11.17
|
11 |
+
charset-normalizer==3.3.2
|
12 |
+
click==8.1.7
|
13 |
+
colorama==0.4.6
|
14 |
+
dataclasses-json==0.6.3
|
15 |
+
feedparser==6.0.10
|
16 |
+
filelock==3.13.1
|
17 |
+
frozenlist==1.4.0
|
18 |
+
fsspec==2023.12.1
|
19 |
+
gitdb==4.0.11
|
20 |
+
GitPython==3.1.40
|
21 |
+
greenlet==3.0.2
|
22 |
+
huggingface-hub==0.19.4
|
23 |
+
idna==3.6
|
24 |
+
importlib-metadata==6.11.0
|
25 |
+
Jinja2==3.1.2
|
26 |
+
jsonpatch==1.33
|
27 |
+
jsonpointer==2.4
|
28 |
+
jsonschema==4.20.0
|
29 |
+
jsonschema-specifications==2023.11.2
|
30 |
+
langchain==0.0.348
|
31 |
+
langchain-core==0.0.12
|
32 |
+
langsmith==0.0.69
|
33 |
+
markdown-it-py==3.0.0
|
34 |
+
MarkupSafe==2.1.3
|
35 |
+
marshmallow==3.20.1
|
36 |
+
mdurl==0.1.2
|
37 |
+
mpmath==1.3.0
|
38 |
+
multidict==6.0.4
|
39 |
+
mypy-extensions==1.0.0
|
40 |
+
networkx==3.2.1
|
41 |
+
numpy==1.26.2
|
42 |
+
packaging==23.2
|
43 |
+
pandas==2.1.4
|
44 |
+
Pillow==10.1.0
|
45 |
+
protobuf==4.25.1
|
46 |
+
pyarrow==14.0.1
|
47 |
+
pydantic==2.5.2
|
48 |
+
pydantic_core==2.14.5
|
49 |
+
pydeck==0.8.1b0
|
50 |
+
Pygments==2.17.2
|
51 |
+
PyMuPDF==1.23.7
|
52 |
+
PyMuPDFb==1.23.7
|
53 |
+
python-dateutil==2.8.2
|
54 |
+
pytz==2023.3.post1
|
55 |
+
PyYAML==6.0.1
|
56 |
+
referencing==0.32.0
|
57 |
+
regex==2023.10.3
|
58 |
+
requests==2.31.0
|
59 |
+
rich==13.7.0
|
60 |
+
rpds-py==0.13.2
|
61 |
+
safetensors==0.4.1
|
62 |
+
sgmllib3k==1.0.0
|
63 |
+
six==1.16.0
|
64 |
+
smmap==5.0.1
|
65 |
+
sniffio==1.3.0
|
66 |
+
SQLAlchemy==2.0.23
|
67 |
+
streamlit==1.29.0
|
68 |
+
sympy==1.12
|
69 |
+
tenacity==8.2.3
|
70 |
+
tokenizers==0.15.0
|
71 |
+
toml==0.10.2
|
72 |
+
toolz==0.12.0
|
73 |
+
torch==2.1.1
|
74 |
+
tornado==6.4
|
75 |
+
tqdm==4.66.1
|
76 |
+
transformers==4.36.0
|
77 |
+
typing-inspect==0.9.0
|
78 |
+
typing_extensions==4.9.0
|
79 |
+
tzdata==2023.3
|
80 |
+
tzlocal==5.2
|
81 |
+
urllib3==2.1.0
|
82 |
+
validators==0.22.0
|
83 |
+
watchdog==3.0.0
|
84 |
+
yarl==1.9.4
|
85 |
+
zipp==3.17.0
|
86 |
+
gradio==4.9.0
|