"""Streamlit front end that summarizes an arXiv paper with MemSum.

The PDF is obtained either from an arXiv URL or from a direct upload, sent
to a local PDF-parsing HTTP service, and the sentences of the parsed body
are passed to MemSum for extractive summarization.
"""

import argparse
import base64
import hashlib
import json
import pickle
import re
import threading
from datetime import datetime
from io import BytesIO
from typing import Optional

import numpy as np
import requests
import streamlit as st

from MemSum.src.summarizer import MemSum

# Endpoint of the PDF parsing service this app posts PDFs to.
_PARSER_ENDPOINT = "http://localhost:8061/parse-and-normalize-pdf"

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# forever on a stalled peer, which would freeze the Streamlit session.
_DOWNLOAD_TIMEOUT = (10, 60)
_PARSER_TIMEOUT = (10, 300)

# New-style ids ("1810.04805") or old-style ids ("hep-th/9901001").
# The URL is lower-cased before matching, hence the lower-case class.
_ARXIV_ID_PATTERN = re.compile(r"\d{4}\.\d{4,5}|[a-z-]+\/\d{7}")

_MODEL_PATH = "/home/myuser/app/models/memsum_arxiv/model.pt"
_VOCABULARY_PATH = "/home/myuser/app/models/word_embedding/vocabulary_200dim.pkl"

_INTRO_MARKDOWN = """
Using MemSum to extractively summarize an arXiv paper by extracting sentences from its fullbody.
Paper: https://aclanthology.org/2022.acl-long.450/ GitHub: https://github.com/nianlonggu/MemSum
"""

_WAIT_NOTE = (
    "(This may takes upto 1 min on huggingface space, "
    "since MemSum is running on a small CPU)"
)


def convert_pdf_to_json(pdf_bytes):
    """Send a PDF to the parsing service and return the parsed paper.

    Args:
        pdf_bytes: The PDF content, as raw bytes or a binary file-like
            object; it is uploaded as the ``pdf`` multipart field.

    Returns:
        The value stored under the ``"response"`` key of the service's
        JSON reply.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status.
        KeyError: If the JSON reply has no ``"response"`` key.
        ValueError: If the reply body is not valid JSON.
    """
    response = requests.post(
        _PARSER_ENDPOINT,
        files={"pdf": pdf_bytes},
        timeout=_PARSER_TIMEOUT,
    )
    # Fail here with a clear HTTP error instead of a confusing JSON/KeyError
    # further down when the service reports a failure.
    response.raise_for_status()
    return response.json()["response"]


def get_arxiv_paper_bytes_fomr_url(url) -> Optional[bytes]:
    """Download the PDF of the arXiv paper referenced by ``url``.

    Args:
        url: Any text containing an arXiv identifier, e.g.
            ``https://arxiv.org/abs/1810.04805``.

    Returns:
        The PDF bytes, or ``None`` when no arXiv identifier is found in
        ``url`` or arXiv answers with a non-success HTTP status.

    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    match = _ARXIV_ID_PATTERN.search(url.lower())
    if match is None:
        return None

    pdf_url = "https://arxiv.org/pdf/%s.pdf" % (match.group(0))
    response = requests.get(pdf_url, timeout=_DOWNLOAD_TIMEOUT)
    if not response.ok:
        # An error page is not a PDF; report it through the same "invalid
        # URL" channel the caller already handles.
        return None
    return response.content


def summarize_paper(memsum, paper_info):
    """Extract a summary from a parsed paper with MemSum.

    Args:
        memsum: Summarizer exposing ``extract(...)``, as returned by
            ``load_models``.
        paper_info: Parsed paper; sentences are read from
            ``paper_info["Content"]["Fullbody_Parsed"]`` (sections ->
            ``"section_text"`` paragraphs -> ``"paragraph_text"`` sentences
            -> ``"sentence_text"``).

    Returns:
        The extracted sentences ordered by their position in the document
        (a tuple), or an empty sequence when nothing was extracted.
    """
    sentence_list = [
        sen["sentence_text"]
        for sec in paper_info["Content"]["Fullbody_Parsed"]
        for para in sec["section_text"]
        for sen in para["paragraph_text"]
    ]
    if not sentence_list:
        # Nothing to summarize; do not run the model on an empty document.
        return []

    summaries, positions = memsum.extract(
        [sentence_list],
        p_stop_thres=0.5,
        max_extracted_sentences_per_document=5,
        return_sentence_position=True,
    )
    # A single document was submitted, so take the first (only) result.
    extracted_summary, poses = summaries[0], positions[0]
    if len(extracted_summary) > 0:
        # The model emits sentences in selection order; restore the order
        # in which they appear in the document.
        ordered = sorted(zip(extracted_summary, poses), key=lambda x: x[1])
        extracted_summary = tuple(sentence for sentence, _ in ordered)
    return extracted_summary


@st.cache_resource
def load_models():
    """Load the MemSum summarizer once and cache it via Streamlit."""
    memsum = MemSum(
        _MODEL_PATH,
        _VOCABULARY_PATH,
        gpu=None,
        max_doc_len=500,
    )
    return memsum


def _render_summary(memsum, pdf_bytes, error_message):
    """Parse ``pdf_bytes``, summarize it, and render the result as bullets.

    Shows ``error_message`` instead when the parsing service cannot turn
    the input into a parsed paper.
    """
    try:
        paper_info = convert_pdf_to_json(pdf_bytes)
    except (requests.RequestException, KeyError, ValueError):
        st.text(error_message)
        return

    extracted_summary = summarize_paper(memsum, paper_info)
    st.markdown("\n".join(["* " + sen for sen in extracted_summary]))


def main():
    """Render the Streamlit page and dispatch on the chosen input method."""
    memsum = load_models()

    st.title('MemSum ArXiv Summarizer')
    st.markdown(_INTRO_MARKDOWN)

    option = st.radio(
        "Choose your method to provide the paper:",
        ('Provide arXiv URL', 'Directly upload a PDF'),
    )

    if option == 'Provide arXiv URL':
        url = st.text_input(
            'Enter the arXiv URL here (e.g., https://arxiv.org/abs/1810.04805):'
        )
        # Summarization only starts once the user presses the button.
        if st.button('Summarize from URL'):
            pdf_bytes = get_arxiv_paper_bytes_fomr_url(url)
            if pdf_bytes is None:
                st.text("URL parsing error. Is the URL valid?")
            else:
                _render_summary(
                    memsum,
                    pdf_bytes,
                    "URL parsing error. Is the URL valid?",
                )
        else:
            st.markdown(_WAIT_NOTE)

    elif option == 'Directly upload a PDF':
        uploaded_file = st.file_uploader("Choose a PDF file", type=['pdf'])
        if uploaded_file is not None:
            # BytesIO(...) can never be None, so an invalid upload is only
            # detectable when the parsing service rejects it.
            _render_summary(
                memsum,
                BytesIO(uploaded_file.getvalue()),
                "PDF parsing error. Is the uploaded file a valid PDF?",
            )
        else:
            st.markdown(_WAIT_NOTE)


if __name__ == "__main__":
    main()