Spaces:

nianlong
/

memsum-arxiv-summarizer

Sleeping

File size: 4,233 Bytes

import streamlit as st
from io import BytesIO
from datetime import datetime
import json
import requests
import numpy as np
import hashlib
import base64
import pickle
import threading
import re
from  MemSum.src.summarizer import MemSum
import argparse

def convert_pdf_to_json(pdf_bytes):
    """Send a PDF to the local parsing service and return the parsed paper.

    The PDF is uploaded as the ``pdf`` multipart field to the
    ``parse-and-normalize-pdf`` endpoint on ``localhost:8061``; the value
    stored under the ``"response"`` key of the JSON reply is returned.

    Args:
        pdf_bytes: The PDF content, passed straight to ``requests`` as the
            uploaded file (callers in this file pass raw bytes or a
            ``BytesIO``).

    Returns:
        Whatever the service placed under ``"response"``.
    """
    endpoint = "http://localhost:8061/parse-and-normalize-pdf"
    reply = requests.post(endpoint, files={"pdf": pdf_bytes})
    payload = reply.json()
    return payload["response"]

# Matches either a new-style identifier (``1810.04805``, optionally with a
# version such as ``v2``) or an old-style one (``hep-th/9901001``). Old-style
# identifiers may carry a two-letter subject class (``math.gt/0309136``); it
# is matched but not captured so the archive name is not truncated to "gt".
_ARXIV_ID_PATTERN = re.compile(
    r"(?P<new>\d{4}\.\d{4,5}(?:v\d+)?)"
    r"|(?P<archive>[a-z-]+)(?:\.[a-z]{2})?/(?P<number>\d{7}(?:v\d+)?)"
)

# Upper bound (seconds) for the arXiv download so a stalled connection
# cannot block the app forever.
_ARXIV_DOWNLOAD_TIMEOUT_S = 60


def get_arxiv_paper_bytes_fomr_url(url):
    """Download the PDF of the arXiv paper referenced by ``url``.

    The first arXiv identifier found in the lower-cased URL is used. An
    explicit version suffix (e.g. ``v2``) is preserved so the requested
    version is the one that gets downloaded.

    Args:
        url: Any string containing an arXiv identifier, e.g.
            ``https://arxiv.org/abs/1810.04805``.

    Returns:
        The PDF content as bytes, or ``None`` when no identifier can be
        found in ``url`` or arXiv does not answer with a success status
        (for example a 404 for a non-existent paper).

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    match = _ARXIV_ID_PATTERN.search(url.lower())
    if match is None:
        return None

    if match.group("new") is not None:
        arxiv_id = match.group("new")
    else:
        # Drop the optional subject class: "math.gt/0309136" -> "math/0309136".
        arxiv_id = "%s/%s" % (match.group("archive"), match.group("number"))

    pdf_url = "https://arxiv.org/pdf/%s.pdf" % (arxiv_id)
    response = requests.get(pdf_url, timeout=_ARXIV_DOWNLOAD_TIMEOUT_S)
    if not response.ok:
        # An error page is not a PDF; report it the same way as a bad URL.
        return None
    return response.content

def summarize_paper(memsum, paper_info):
    """Extract a short summary from a parsed paper.

    Args:
        memsum: Summarizer object whose ``extract`` method is called with a
            batch containing one document (a list of sentences).
        paper_info: Parsed paper. Sentences are read from
            ``paper_info["Content"]["Fullbody_Parsed"]``, walking each
            section's ``"section_text"``, each paragraph's
            ``"paragraph_text"`` and each sentence's ``"sentence_text"``.

    Returns:
        A list of the extracted sentences, sorted by the position value
        that ``extract`` returns alongside each sentence. The list is empty
        when the paper has no sentences or nothing was extracted.
    """
    sentence_list = []
    for sec in paper_info["Content"]["Fullbody_Parsed"]:
        for para in sec["section_text"]:
            sentence_list.extend(
                sen["sentence_text"] for sen in para["paragraph_text"]
            )

    # Nothing to summarize: skip the model call entirely.
    if not sentence_list:
        return []

    summaries, positions = memsum.extract(
        [sentence_list],
        p_stop_thres=0.5,
        max_extracted_sentences_per_document=5,
        return_sentence_position=True,
    )
    # A single document was submitted, so only the first result is relevant.
    # Sort by the returned position so the summary reads in document order
    # (assuming positions index into ``sentence_list`` - TODO confirm).
    ordered = sorted(zip(summaries[0], positions[0]), key=lambda pair: pair[1])
    return [sentence for sentence, _ in ordered]
    
@st.cache_resource
def load_models():
    """Build the MemSum summarizer used by the app.

    Wrapped in ``st.cache_resource`` so the model object is created once
    and reused instead of being rebuilt on each call.

    Returns:
        A ``MemSum`` instance constructed from the arXiv model checkpoint
        and the 200-dim vocabulary file, with ``gpu=None`` and
        ``max_doc_len=500``.
    """
    model_path = "/home/myuser/app/models/memsum_arxiv/model.pt"
    vocabulary_path = "/home/myuser/app/models/word_embedding/vocabulary_200dim.pkl"
    return MemSum(model_path, vocabulary_path, gpu=None, max_doc_len=500)

def _show_summary(memsum, pdf_bytes, error_message):
    """Parse a PDF, summarize it and render the summary as a bullet list.

    If the parsing service cannot be reached or its reply does not have the
    expected structure, the failure is logged and ``error_message`` is shown
    to the user instead of a raw exception.
    """
    import logging  # local import keeps the file-level import block untouched

    try:
        paper_info = convert_pdf_to_json(pdf_bytes)
        extracted_summary = summarize_paper(memsum, paper_info)
    except (requests.RequestException, KeyError, TypeError, ValueError):
        logging.getLogger(__name__).exception("Failed to parse or summarize the PDF")
        st.text(error_message)
        return
    st.markdown("\n".join(["* " + sen for sen in extracted_summary]))


def main():
    """Render the Streamlit page and run the summarizer on the chosen paper.

    The user either enters an arXiv URL and presses the button, or uploads a
    PDF. In both cases the PDF is parsed, summarized and the extracted
    sentences are shown as a bullet list; until then a hint about the
    expected waiting time is displayed.
    """
    memsum = load_models()

    st.title('MemSum ArXiv Summarizer')
    st.markdown("""
        Using MemSum to extractively summarize an arXiv paper by extracting sentences from its fullbody. 
        
        Paper: https://aclanthology.org/2022.acl-long.450/ GitHub: https://github.com/nianlonggu/MemSum
    """)

    wait_hint = "(This may takes upto 1 min on huggingface space, since MemSum is running on a small CPU)"
    url_error = "URL parsing error. Is the URL valid?"
    pdf_error = "PDF parsing error. Is the uploaded file a valid PDF?"

    # Let the user pick how the paper is provided.
    option = st.radio("Choose your method to provide the paper:",
                      ('Provide arXiv URL', 'Directly upload a PDF'))
    if option == 'Provide arXiv URL':
        url = st.text_input('Enter the arXiv URL here (e.g., https://arxiv.org/abs/1810.04805):')
        # Summarization only starts once the button is pressed.
        if st.button('Summarize from URL'):
            pdf_bytes = get_arxiv_paper_bytes_fomr_url(url)
            if pdf_bytes is None:
                st.text(url_error)
            else:
                _show_summary(memsum, pdf_bytes, url_error)
        else:
            st.markdown(wait_hint)

    elif option == 'Directly upload a PDF':
        uploaded_file = st.file_uploader("Choose a PDF file", type=['pdf'])
        if uploaded_file is not None:
            # A BytesIO is never None, so an invalid PDF can only be detected
            # when parsing fails; _show_summary reports that case.
            pdf_bytes = BytesIO(uploaded_file.getvalue())
            _show_summary(memsum, pdf_bytes, pdf_error)
        else:
            st.markdown(wait_hint)
            
        
# Build the page only when this file is run as the main module.
if __name__ == "__main__":
    main()