Spaces:

Tuana
/

PDF-Summarizer

Build error

File size: 3,133 Bytes

28ec4f0
a3fdd99
2a34bac
836e16d
a3fdd99
f6cc0cb
28c1177
2a34bac
a3fdd99
9a54394
 
 
 
 
 
 
5fdc2d5
cc0fbf1
9a54394
 
58c1223
9a54394
a3fdd99
 
3a4a956
 
a3fdd99
3a4a956
 
 
 
 
 
f6cc0cb
2a34bac
 
cb95f0e
2d4dc51
67f4a7d
 
 
 
 
d42a71a
2a34bac
 
9a54394
9097656
762970d
f176fb0
 
762970d
fe7b517
c1986cc
fe7b517
 
3a4a956
2a34bac
cb95f0e
3a4a956
67f4a7d
 
2a34bac
67f4a7d

import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, Crawler
from haystack.schema import Document
import logging
import base64
from PIL import Image
import validators

@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components the app works with.

    The function is wrapped in ``st.cache``. ``SwigPyObject`` instances are
    given a constant ``None`` hash via ``hash_funcs`` (presumably because
    Streamlit cannot hash them itself -- TODO confirm), and
    ``allow_output_mutation=True`` is passed because the returned document
    store is written to by the rest of the app.

    Returns:
        tuple: ``(document_store, summarizer, preprocessor)`` -- an
        ``InMemoryDocumentStore``, a ``TransformersSummarizer`` loaded with
        ``facebook/bart-large-cnn``, and a ``PreProcessor`` configured to
        split by word into chunks of 200 while respecting sentence
        boundaries.
    """
    store = InMemoryDocumentStore()

    # Cleaning and splitting options, kept together so they are easy to tweak.
    preprocessing_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }
    text_preprocessor = PreProcessor(**preprocessing_options)

    bart_summarizer = TransformersSummarizer(model_name_or_path="facebook/bart-large-cnn")

    return store, bart_summarizer, text_preprocessor


def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with the text of ``pdf_file``.

    The upload is written to ``temp-path.pdf`` in the working directory,
    converted to text, preprocessed, and written to the module-level
    ``document_store``. Any documents already in the store are deleted first.

    Args:
        pdf_file: A binary file-like object whose ``read()`` returns the raw
            PDF bytes (the caller in this file passes a Streamlit upload).
    """
    temp_path = "temp-path.pdf"

    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # The converter opens the file by path, so the bytes must be on disk
    # before it runs. Leaving the ``with`` block closes (and therefore
    # flushes) the file; converting while it is still open can hand the
    # converter an empty or truncated PDF when the data is still buffered.
    with open(temp_path, "wb") as temp_file:
        temp_file.write(pdf_file.read())

    doc = converter.convert(file_path=temp_path, meta=None)
    preprocessed_docs = preprocessor.process(doc)
    document_store.write_documents(preprocessed_docs)
    
def summarize(content):
    """Load ``content`` into the document store and summarize everything in it.

    Args:
        content: The uploaded PDF, passed straight to
            ``pdf_to_document_store``.

    Returns:
        Whatever ``summarizer.predict`` returns when called on all stored
        documents with ``generate_single_summary=True``.
    """
    pdf_to_document_store(content)
    stored_documents = document_store.get_all_documents()
    return summarizer.predict(documents=stored_documents, generate_single_summary=True)

def set_state_if_absent(key, value):
    """Store ``value`` under ``key`` in ``st.session_state`` unless the key exists.

    An existing entry is left untouched, so values set by earlier script runs
    in the same session are not overwritten.
    """
    if key in st.session_state:
        return
    st.session_state[key] = value
        
# Make sure the "summaries" slot exists before the rendering code below reads it.
set_state_if_absent("summaries", None)

# Module-level names; pdf_to_document_store() and summarize() use them as globals.
document_store, summarizer, preprocessor = start_haystack()

st.title('TL;DR with Haystack')
image = Image.open('header-image.png')
st.image(image)

st.markdown( """
This Summarization demo uses a [Haystack TransformerSummarizer node](https://haystack.deepset.ai/pipeline_nodes/summarizer). You can upload a PDF file, which will be converted to text with the [Haystack PDFtoTextConverter](https://haystack.deepset.ai/reference/file-converters#pdftotextconverter). In this demo, we produce 1 summary for the whole file you upload. So, the TransformerSummarizer treats the whole thing as one string, which means along with the model limitations, PDFs that have a lot of unneeded text at the beginning produce poor results. For best results, upload a document that has minimal intro and tables at the top. 
""", unsafe_allow_html=True)

# Restrict the picker to PDFs: the upload is handed to PDFToTextConverter,
# which cannot process other file types.
uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"], accept_multiple_files=False)

if uploaded_file is not None:
    if st.button('Summarize Document'):
        with st.spinner("📚 &nbsp;&nbsp; Please wait while we produce a summary..."):
            try:
                st.session_state.summaries = summarize(uploaded_file)
            except Exception as e:
                # Top-level UI boundary: keep the traceback in the logs, but
                # also tell the user, and drop any summary left over from a
                # previous document so it is not shown as the new result.
                logging.exception(e)
                st.session_state.summaries = None
                st.error("Sorry, something went wrong while summarizing this document.")

if st.session_state.summaries:
    st.write('## Summary')
    for summary in st.session_state.summaries:
        st.write(summary.content)