Spaces:

Archan
/

Arxiv-Summarizer-Gradio

Runtime error

App Files Files Community

Archan committed on Dec 13, 2023

Commit

7b91b09

•

1 Parent(s): 4323b86

Added main app.py file and requirements.txt

Browse files

Files changed (2) hide show

app.py +110 -0
requirements.txt +86 -0

app.py ADDED Viewed

	@@ -0,0 +1,110 @@

+from langchain.document_loaders import ArxivLoader
+from transformers import pipeline
+import gradio as gr
+def strip(content):
+  """Return `content` as a string with every newline replaced by a space."""
+  flattened = str(content).replace("\n", " ")
+  return flattened
+def clip(content):
+  """Trim a paper's text to the span between its introduction and references.
+
+  The text is searched for the first "Introduction" and, from that point on,
+  the last "Reference". When both are found, the slice between them is
+  returned. When only the introduction is found, everything from it onward
+  is returned. When there is no introduction, the text is returned unchanged.
+  A warning is printed whenever a heading is missing.
+  """
+  intro_at = content.find("Introduction")
+  if intro_at == -1:
+    print("Warning: Paper Doesn't Have an Introduction Title, these may lead to overlap of summarization")
+    return content
+  # Only look for the references heading after the introduction: a stray
+  # "Reference" earlier in the text (e.g. in the abstract) would otherwise
+  # produce an empty slice.
+  refs_at = content.rfind("Reference", intro_at)
+  if refs_at == -1:
+    clipped = content[intro_at:]
+    print("Warning: Paper Doesn't have a References Title, may lead to overlap of references in summary")
+    return clipped
+  return content[intro_at:refs_at]
+def chunk(content, sentences_per_chunk=10):
+  """Split a paper's text into passages of consecutive sentences.
+
+  The text is first passed through `clip`, then split on ". ". Fragments
+  that are empty or whitespace-only are dropped, so empty input yields an
+  empty list instead of a lone ". " passage.
+
+  Args:
+    content: Full text of the paper as a single string.
+    sentences_per_chunk: Number of sentences per passage (default 10).
+
+  Returns:
+    A list of passages. Sentences inside a passage are joined with ". " and
+    each passage ends with ". ".
+
+  Raises:
+    ValueError: If `sentences_per_chunk` is smaller than 1.
+  """
+  if sentences_per_chunk < 1:
+    raise ValueError("sentences_per_chunk must be at least 1")
+  content = clip(content)
+  sentences = [piece for piece in content.split(". ") if piece.strip()]
+  passages = []
+  for start in range(0, len(sentences), sentences_per_chunk):
+    passage = ". ".join(sentences[start:start + sentences_per_chunk])
+    # The last fragment of the text usually keeps its own full stop because
+    # the split separator is ". "; do not append a second one.
+    if not passage.endswith("."):
+      passage += "."
+    passages.append(passage + " ")
+  return passages
+# Lazily created summarization pipeline, shared by every call to `summarize`
+# so the model is loaded once per process instead of once per request.
+_summarizer = None
+
+
+def _get_summarizer():
+  """Return the shared summarization pipeline, creating it on first use."""
+  global _summarizer
+  if _summarizer is None:
+    model_str = "Falconsai/text_summarization"
+    tokenizer_str = "Falconsai/text_summarization"
+    _summarizer = pipeline("summarization", model=model_str, tokenizer=tokenizer_str)
+  return _summarizer
+
+
+def summarize(sent):
+  """Summarize each passage and return the summaries, one per line.
+
+  Args:
+    sent: Iterable of text passages to summarize.
+
+  Returns:
+    A string containing each passage's summary followed by a newline, in
+    input order. Empty input returns "" without loading the model.
+  """
+  if not sent:
+    return ""
+  summarizer = _get_summarizer()
+  lines = []
+  for passage in sent:
+    result = summarizer(passage, max_length=256, min_length=64, do_sample=False)
+    lines.append(result[0]['summary_text'] + "\n")
+  return "".join(lines)
+def fn_one(search_query, n_docs):
+  """Fetch papers from arXiv and prepare the paper-selection dropdown.
+
+  Args:
+    search_query: Paper name or arXiv ID typed by the user.
+    n_docs: Maximum number of papers to load; a missing value means 1.
+
+  Returns:
+    A tuple of (dropdown listing the fetched titles, the loaded documents,
+    a dict mapping each title to its index in the documents list).
+
+  Raises:
+    gr.Error: If the search returns no documents.
+  """
+  # The dropdown delivers None until a value is picked; fall back to one paper.
+  n_docs = int(n_docs) if n_docs else 1
+  docs = ArxivLoader(query=search_query, load_max_docs=n_docs).load()
+  print(search_query, n_docs)
+  if not docs:
+    raise gr.Error("No papers found for this query.")
+  # Iterate over what was actually returned: arXiv may yield fewer papers
+  # than requested, so indexing up to n_docs could run past the end.
+  titles = [doc.metadata['Title'] for doc in docs]
+  n_pairs = {title: index for index, title in enumerate(titles)}
+  return gr.Dropdown(titles), docs, n_pairs
+def fn_two(choice, docs, n_pairs):
+  """Summarize the selected paper and format the result for display.
+
+  Args:
+    choice: Title selected in the papers dropdown.
+    docs: Documents stored by the fetch step.
+    n_pairs: Dict mapping each title to its index in `docs`.
+
+  Returns:
+    A string with the paper's publication date, title, authors and the
+    generated summary.
+
+  Raises:
+    gr.Error: If no papers have been fetched or no known title is selected.
+  """
+  # Both states are empty until the fetch step has run successfully.
+  if not docs or not n_pairs or str(choice) not in n_pairs:
+    raise gr.Error("Fetch papers and select one before summarizing.")
+  ch = n_pairs[str(choice)]
+  metadata = docs[ch].metadata
+  content = strip(docs[ch].page_content)
+  summarized = summarize(chunk(content))
+  out = (
+    "Date: " + str(metadata['Published']) + "\n"
+    + "\n Title: " + metadata['Title'] + "\n"
+    + "\n Authors: " + metadata['Authors'] + "\n"
+    + "\n Summary: \n" + summarized
+  )
+  return out
+# Two-step UI: "Fetch" loads papers for the query and fills the selection
+# dropdown; "Fetch & Summarize" summarizes the chosen paper.
+with gr.Blocks() as demo:
+  with gr.Row():
+    paper_name = gr.Textbox(label="Enter Paper Name/ID")
+    # Choices are passed by keyword and the label uses matching quotes; the
+    # default of 1 means the fetch step never receives an empty selection.
+    n_docs = gr.Dropdown(choices=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], value=1,
+                         label="Number of Docs to Load")
+    # Per-session storage passed from the fetch step to the summarize step.
+    docs = gr.State()
+    n_pairs = gr.State()
+  fetch_btn = gr.Button("Fetch")
+  with gr.Row():
+    # The label is static: a Textbox component cannot be concatenated with a
+    # string while the layout is being built.
+    choice = gr.Dropdown(label="Papers", interactive=True)
+    submit_btn = gr.Button('Fetch & Summarize')
+  result = gr.Textbox(label="Summary", visible=True)
+  fetch_btn.click(fn=fn_one, inputs=[paper_name, n_docs],
+                    outputs=[choice, docs, n_pairs],
+                    api_name="fetch")
+  submit_btn.click(fn=fn_two, inputs=[choice, docs, n_pairs],
+                    outputs=[result],
+                    api_name="submit")
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,86 @@

+aiohttp==3.9.1
+aiosignal==1.3.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.1.0
+arxiv==2.0.0
+attrs==23.1.0
+blinker==1.7.0
+cachetools==5.3.2
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+dataclasses-json==0.6.3
+feedparser==6.0.10
+filelock==3.13.1
+frozenlist==1.4.0
+fsspec==2023.12.1
+gitdb==4.0.11
+GitPython==3.1.40
+greenlet==3.0.2
+huggingface-hub==0.19.4
+idna==3.6
+importlib-metadata==6.11.0
+Jinja2==3.1.2
+jsonpatch==1.33
+jsonpointer==2.4
+jsonschema==4.20.0
+jsonschema-specifications==2023.11.2
+langchain==0.0.348
+langchain-core==0.0.12
+langsmith==0.0.69
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+marshmallow==3.20.1
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.4
+mypy-extensions==1.0.0
+networkx==3.2.1
+numpy==1.26.2
+packaging==23.2
+pandas==2.1.4
+Pillow==10.1.0
+protobuf==4.25.1
+pyarrow==14.0.1
+pydantic==2.5.2
+pydantic_core==2.14.5
+pydeck==0.8.1b0
+Pygments==2.17.2
+PyMuPDF==1.23.7
+PyMuPDFb==1.23.7
+python-dateutil==2.8.2
+pytz==2023.3.post1
+PyYAML==6.0.1
+referencing==0.32.0
+regex==2023.10.3
+requests==2.31.0
+rich==13.7.0
+rpds-py==0.13.2
+safetensors==0.4.1
+sgmllib3k==1.0.0
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.0
+SQLAlchemy==2.0.23
+streamlit==1.29.0
+sympy==1.12
+tenacity==8.2.3
+tokenizers==0.15.0
+toml==0.10.2
+toolz==0.12.0
+torch==2.1.1
+tornado==6.4
+tqdm==4.66.1
+transformers==4.36.0
+typing-inspect==0.9.0
+typing_extensions==4.9.0
+tzdata==2023.3
+tzlocal==5.2
+urllib3==2.1.0
+validators==0.22.0
+watchdog==3.0.0
+yarl==1.9.4
+zipp==3.17.0
+gradio==4.9.0