|
import streamlit as st |
|
from io import BytesIO |
|
from datetime import datetime |
|
import json |
|
import requests |
|
import numpy as np |
|
import hashlib |
|
import base64 |
|
import pickle |
|
import threading |
|
import re |
|
from MemSum.src.summarizer import MemSum |
|
import argparse |
|
|
|
# Default endpoint of the PDF parse-and-normalize service this app talks to.
_PDF_PARSER_URL = "http://localhost:8061/parse-and-normalize-pdf"


def convert_pdf_to_json(pdf_bytes, service_url=_PDF_PARSER_URL, timeout=None):
    """Send a PDF to the parsing service and return the parsed paper.

    Args:
        pdf_bytes: PDF content, either raw ``bytes`` or a binary file-like
            object; it is uploaded as the multipart form field ``pdf``.
        service_url: URL of the parse-and-normalize endpoint. Defaults to
            the service on localhost port 8061.
        timeout: Seconds to wait for the service, forwarded to
            ``requests.post``. ``None`` (the default) waits indefinitely.

    Returns:
        The value stored under the ``"response"`` key of the JSON reply.

    Raises:
        requests.HTTPError: The service answered with a 4xx/5xx status.
        requests.RequestException: The request itself failed (connection
            error, timeout, ...).
        KeyError: The JSON reply has no ``"response"`` key.
    """
    reply = requests.post(
        service_url,
        files={"pdf": pdf_bytes},
        timeout=timeout,
    )
    # Surface a failing service as an explicit HTTP error instead of an
    # obscure JSON-decoding or missing-key error further down.
    reply.raise_for_status()
    return reply.json()["response"]
|
|
|
# Matches new-style ids (e.g. "1810.04805") and old-style ids
# (e.g. "hep-th/9901001"). Applied to a lower-cased URL, hence only [a-z-].
_ARXIV_ID_PATTERN = re.compile(r"\d{4}\.\d{4,5}|[a-z-]+\/\d{7}")


def get_arxiv_paper_bytes_fomr_url(url):
    """Download the PDF of the arXiv paper referenced by ``url``.

    The first arXiv identifier found in the lower-cased URL is used to build
    ``https://arxiv.org/pdf/<id>.pdf``, which is then fetched.

    Args:
        url: Any string containing an arXiv identifier, typically an
            ``https://arxiv.org/abs/...`` or ``https://arxiv.org/pdf/...`` URL.

    Returns:
        The response body as ``bytes``, or ``None`` when no arXiv identifier
        is found in ``url`` or arXiv answers with an HTTP error status
        (for example 404 for an identifier that does not exist).

    Raises:
        requests.RequestException: The request itself failed (connection
            error, timeout, ...).
    """
    match = _ARXIV_ID_PATTERN.search(url.lower())
    if match is None:
        return None

    pdf_url = "https://arxiv.org/pdf/%s.pdf" % match.group(0)
    # A timeout keeps the app from hanging forever on a stalled connection.
    response = requests.get(pdf_url, timeout=60)
    # Do not hand an HTML error page to the PDF parser as if it were a paper.
    if not response.ok:
        return None
    return response.content
|
|
|
def summarize_paper(memsum, paper_info):
    """Extract a short summary of a parsed paper with MemSum.

    Args:
        memsum: Summarizer object exposing ``extract(documents, ...)``, where
            each document is a list of sentence strings.
        paper_info: Parsed paper. Sentences are read from
            ``paper_info["Content"]["Fullbody_Parsed"]`` by walking each
            section's ``"section_text"`` paragraphs and each paragraph's
            ``"paragraph_text"`` sentences (``"sentence_text"``).

    Returns:
        A list of the extracted sentences (at most 5 are requested), ordered
        by the sentence positions reported by ``memsum.extract``. The list is
        empty when the paper body contains no sentences or nothing was
        extracted.
    """
    sentence_list = []
    for sec in paper_info["Content"]["Fullbody_Parsed"]:
        for para in sec["section_text"]:
            sentence_list.extend(
                sen["sentence_text"] for sen in para["paragraph_text"]
            )

    # Nothing to summarize: skip the model call for an empty document.
    if not sentence_list:
        return []

    summaries, positions = memsum.extract(
        [sentence_list],
        p_stop_thres=0.5,
        max_extracted_sentences_per_document=5,
        return_sentence_position=True,
    )
    # Only one document was passed in, so only the first result is relevant.
    # Sort by the reported position rather than keeping the returned order.
    ordered = sorted(zip(summaries[0], positions[0]), key=lambda pair: pair[1])
    return [sentence for sentence, _ in ordered]
|
|
|
@st.cache_resource
def load_models():
    """Construct the MemSum summarizer used by the app.

    The function is wrapped in ``st.cache_resource``, so Streamlit keeps the
    returned object and reuses it instead of rebuilding it on every rerun.

    Returns:
        A ``MemSum`` instance built from the arXiv model checkpoint and the
        200-dimensional vocabulary file, with ``gpu=None`` and
        ``max_doc_len=500``.
    """
    model_path = "/home/myuser/app/models/memsum_arxiv/model.pt"
    vocabulary_path = "/home/myuser/app/models/word_embedding/vocabulary_200dim.pkl"
    summarizer = MemSum(model_path, vocabulary_path, gpu=None, max_doc_len=500)
    return summarizer
|
|
|
def _show_summary(memsum, pdf_bytes):
    """Parse the PDF, summarize it and render the result as a bullet list."""
    paper_info = convert_pdf_to_json(pdf_bytes)
    extracted_summary = summarize_paper(memsum, paper_info)
    st.markdown("\n".join(["* " + sen for sen in extracted_summary]))


def main():
    """Render the Streamlit page and run the summarizer on user input.

    The user either enters an arXiv URL and presses the button, or uploads a
    PDF file. In both cases the PDF is parsed, summarized and the extracted
    sentences are shown as a Markdown bullet list. Until input is provided,
    a hint about the expected waiting time is displayed.
    """
    memsum = load_models()

    st.title('MemSum ArXiv Summarizer')
    st.markdown("""
Using MemSum to extractively summarize an arXiv paper by extracting sentences from its fullbody.

Paper: https://aclanthology.org/2022.acl-long.450/ GitHub: https://github.com/nianlonggu/MemSum
""")

    # Shown in both modes while no paper has been submitted yet.
    wait_hint = "(This may takes upto 1 min on huggingface space, since MemSum is running on a small CPU)"

    option = st.radio("Choose your method to provide the paper:",
                      ('Provide arXiv URL', 'Directly upload a PDF'))

    if option == 'Provide arXiv URL':
        url = st.text_input('Enter the arXiv URL here (e.g., https://arxiv.org/abs/1810.04805):')

        if st.button('Summarize from URL'):
            pdf_bytes = get_arxiv_paper_bytes_fomr_url(url)
            if pdf_bytes is None:
                st.text("URL parsing error. Is the URL valid?")
            else:
                _show_summary(memsum, pdf_bytes)
        else:
            st.markdown(wait_hint)

    elif option == 'Directly upload a PDF':
        uploaded_file = st.file_uploader("Choose a PDF file", type=['pdf'])

        if uploaded_file is not None:
            pdf_content = uploaded_file.getvalue()
            # A BytesIO object is never None, so check the uploaded content
            # itself: an empty upload cannot be a valid PDF.
            if not pdf_content:
                st.text("PDF parsing error. Is the uploaded file a valid PDF?")
            else:
                _show_summary(memsum, BytesIO(pdf_content))
        else:
            st.markdown(wait_hint)
|
|
|
|
|
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()