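# --- Appears to be file-viewer page header text captured with the source; not part of the program ---
# nianlonggu
# update
# 8589755
# raw
# history blame contribute delete
# No virus
# 4.23 kB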
import streamlit as st
from io import BytesIO
from datetime import datetime
import json
import requests
import numpy as np
import hashlib
import base64
import pickle
import threading
import re
from MemSum.src.summarizer import MemSum
import argparse
def convert_pdf_to_json(pdf_bytes):
    """Send a PDF to the local parsing service and return the parsed paper.

    Args:
        pdf_bytes: PDF content uploaded as the ``pdf`` multipart file field.
            Callers in this file pass either raw ``bytes`` or a ``BytesIO``.

    Returns:
        The value stored under the ``"response"`` key of the service's JSON
        reply.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        KeyError: If the JSON reply has no ``"response"`` key.
    """
    reply = requests.post(
        "http://localhost:8061/parse-and-normalize-pdf",
        files={"pdf": pdf_bytes},
    )
    # Fail with a clear HTTP error instead of trying to decode an error page
    # as JSON and raising an unrelated decoding/KeyError further down.
    reply.raise_for_status()
    return reply.json()["response"]
def get_arxiv_paper_bytes_fomr_url(url):
    """Download the PDF of the arXiv paper referenced by ``url``.

    The URL is lower-cased and searched for the first arXiv identifier, either
    new-style (``1810.04805``) or old-style (``hep-th/9901001``). Any version
    suffix such as ``v2`` is not part of the match, so the unversioned PDF URL
    is requested.

    NOTE: "fomr" in the function name is a typo for "from"; the name is kept
    because callers reference it.

    Args:
        url: Text that contains an arXiv identifier, typically an abs/pdf URL.

    Returns:
        The raw response body as ``bytes``, or ``None`` when ``url`` is empty,
        contains no arXiv identifier, or the download returns a non-OK HTTP
        status.
    """
    if not url:
        return None

    arxiv_id_matcher = re.compile(r"\d{4}\.\d{4,5}|[a-z-]+\/\d{7}")
    match = arxiv_id_matcher.search(url.lower())
    if match is None:
        return None

    pdf_url = "https://arxiv.org/pdf/%s.pdf" % (match.group(0))
    response = requests.get(pdf_url)
    # A 404/5xx body is an error page, not a PDF: report it the same way as an
    # unparsable URL so the caller's existing ``None`` handling applies.
    if not response.ok:
        return None
    return response.content
def summarize_paper(memsum, paper_info):
    """Extract a short summary from a parsed paper's full-body sentences.

    Args:
        memsum: Summarizer object exposing ``extract(documents, ...)``, which
            is called with a single document (the flattened sentence list).
        paper_info: Parsed paper. Sentences are read from
            ``paper_info["Content"]["Fullbody_Parsed"]``: a list of sections,
            each with a ``"section_text"`` list of paragraphs, each with a
            ``"paragraph_text"`` list of sentences carrying ``"sentence_text"``.

    Returns:
        A tuple of the extracted sentences ordered by their position in the
        document. When the paper has no sentences an empty list is returned
        without calling the model; when the model extracts nothing, its own
        empty result for the document is returned unchanged.
    """
    sentence_list = [
        sen["sentence_text"]
        for sec in paper_info["Content"]["Fullbody_Parsed"]
        for para in sec["section_text"]
        for sen in para["paragraph_text"]
    ]
    # Nothing to summarize (e.g. no body text was parsed): skip the model call
    # rather than feeding it an empty document.
    if not sentence_list:
        return []

    extracted_summary, poses = memsum.extract(
        [sentence_list],
        p_stop_thres=0.5,
        max_extracted_sentences_per_document=5,
        return_sentence_position=True,
    )
    # Only one document was submitted, so keep the first (only) result.
    extracted_summary, poses = extracted_summary[0], poses[0]
    if len(extracted_summary) > 0:
        # Restore reading order: sort the sentences by their returned position.
        ordered = sorted(zip(extracted_summary, poses), key=lambda pair: pair[1])
        extracted_summary = tuple(sentence for sentence, _ in ordered)
    return extracted_summary
@st.cache_resource
def load_models():
    """Build the MemSum summarizer used by the app.

    Wrapped in ``st.cache_resource``; see the Streamlit documentation for the
    caching semantics.

    Returns:
        A ``MemSum`` instance constructed from the two model files below, with
        ``gpu=None`` and ``max_doc_len=500``.
    """
    # Judging by the file names: the arXiv model checkpoint and the 200-dim
    # word-embedding vocabulary (TODO confirm against the MemSum constructor).
    model_checkpoint = "/home/myuser/app/models/memsum_arxiv/model.pt"
    vocabulary_file = "/home/myuser/app/models/word_embedding/vocabulary_200dim.pkl"
    return MemSum(model_checkpoint, vocabulary_file, gpu=None, max_doc_len=500)
def _render_summary(memsum, pdf_payload):
    """Parse ``pdf_payload``, summarize it and render the result as bullets."""
    paper_info = convert_pdf_to_json(pdf_payload)
    extracted_summary = summarize_paper(memsum, paper_info)
    st.markdown("\n".join(["* " + sen for sen in extracted_summary]))


def main():
    """Render the Streamlit page and summarize the paper the user provides.

    The user either enters an arXiv URL (summarized when the button is
    pressed) or uploads a PDF (summarized as soon as a file is present). While
    no input has been submitted, a hint about the expected latency is shown.
    """
    memsum = load_models()
    st.title('MemSum ArXiv Summarizer')
    st.markdown("""
Using MemSum to extractively summarize an arXiv paper by extracting sentences from its fullbody.
Paper: https://aclanthology.org/2022.acl-long.450/ GitHub: https://github.com/nianlonggu/MemSum
""")
    wait_hint = "(This may takes upto 1 min on huggingface space, since MemSum is running on a small CPU)"

    # Let the user choose how the paper is provided.
    option = st.radio("Choose your method to provide the paper:",
                      ('Provide arXiv URL', 'Directly upload a PDF'))

    if option == 'Provide arXiv URL':
        url = st.text_input('Enter the arXiv URL here (e.g., https://arxiv.org/abs/1810.04805):')
        # Summarization only starts once the button is pressed.
        if st.button('Summarize from URL'):
            pdf_bytes = get_arxiv_paper_bytes_fomr_url(url)
            if pdf_bytes is None:
                st.text("URL parsing error. Is the URL valid?")
            else:
                _render_summary(memsum, pdf_bytes)
        else:
            st.markdown(wait_hint)
    elif option == 'Directly upload a PDF':
        uploaded_file = st.file_uploader("Choose a PDF file", type=['pdf'])
        if uploaded_file is not None:
            raw_pdf = uploaded_file.getvalue()
            # A BytesIO wrapper is never None, so test the payload itself: an
            # empty upload cannot be a valid PDF.
            if not raw_pdf:
                st.text("PDF parsing error. Is the uploaded file a valid PDF?")
            else:
                _render_summary(memsum, BytesIO(raw_pdf))
        else:
            st.markdown(wait_hint)
# Build the page only when this file is executed as the main script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()