PDF-Summarizer / app.py
Tuana's picture
revert to only pdfs
cb95f0e
import base64
import logging
import os
import tempfile

import streamlit as st
import validators
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, Crawler
from haystack.schema import Document
from PIL import Image
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components the app works with.

    The function is wrapped in ``st.cache`` (with ``SwigPyObject`` values
    hashed to ``None`` and output mutation allowed), presumably so the
    components are built once instead of on every Streamlit rerun.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple, in that order.
    """
    # Cleaning and splitting options handed to the PreProcessor: documents are
    # split by word into pieces of length 200, respecting sentence boundaries.
    preprocessing_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }

    store = InMemoryDocumentStore()
    chunker = PreProcessor(**preprocessing_options)
    bart_summarizer = TransformersSummarizer(model_name_or_path="facebook/bart-large-cnn")
    return store, bart_summarizer, chunker
def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with the text of an uploaded PDF.

    The store is emptied first, then the upload is written to disk (the
    converter is given a file path), converted to text, preprocessed and
    written to the module-level ``document_store``.

    Args:
        pdf_file: A binary file-like object whose ``read()`` returns the raw
            PDF bytes (here, the object returned by ``st.file_uploader``).
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # Use a private temporary directory rather than a fixed file name in the
    # working directory: concurrent sessions can no longer overwrite each
    # other's upload, and the file is removed once conversion is done.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = os.path.join(temp_dir, "upload.pdf")
        # Write the raw bytes directly; the file is closed (and therefore
        # flushed) when this inner block ends, before the converter reads it.
        with open(temp_path, "wb") as temp_file:
            temp_file.write(pdf_file.read())
        doc = converter.convert(file_path=temp_path, meta=None)

    preprocessed_docs = preprocessor.process(doc)
    document_store.write_documents(preprocessed_docs)
def summarize(content):
    """Load an uploaded file into the document store and summarize it.

    Args:
        content: The uploaded file object, passed straight through to
            ``pdf_to_document_store``.

    Returns:
        Whatever ``summarizer.predict`` returns for all stored documents when
        called with ``generate_single_summary=True``.
    """
    pdf_to_document_store(content)
    stored_documents = document_store.get_all_documents()
    return summarizer.predict(documents=stored_documents, generate_single_summary=True)
def set_state_if_absent(key, value):
    """Store ``value`` under ``key`` in ``st.session_state`` unless the key exists.

    An existing entry is left untouched.
    """
    if key in st.session_state:
        return
    st.session_state[key] = value
# --- Streamlit page script -------------------------------------------------
# Make sure the "summaries" slot exists in session state before it is read.
set_state_if_absent("summaries", None)

document_store, summarizer, preprocessor = start_haystack()

st.title('TL;DR with Haystack')
image = Image.open('header-image.png')
st.image(image)

st.markdown( """
This Summarization demo uses a [Haystack TransformerSummarizer node](https://haystack.deepset.ai/pipeline_nodes/summarizer). You can upload a PDF file, which will be converted to text with the [Haystack PDFtoTextConverter](https://haystack.deepset.ai/reference/file-converters#pdftotextconverter). In this demo, we produce 1 summary for the whole file you upload. So, the TransformerSummarizer treats the whole thing as one string, which means along with the model limitations, PDFs that have a lot of unneeded text at the beginning produce poor results. For best results, upload a document that has minimal intro and tables at the top.
""", unsafe_allow_html=True)

# Only PDFs can be converted, so restrict the uploader to that type instead of
# letting other files fail later in the converter.
uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"], accept_multiple_files=False)

if uploaded_file is not None:
    if st.button('Summarize Document'):
        with st.spinner("📚    Please wait while we produce a summary..."):
            try:
                st.session_state.summaries = summarize(uploaded_file)
            except Exception as e:
                # Top-level UI boundary: log the traceback, then tell the user.
                logging.exception(e)
                # Drop any summary from an earlier run so a stale result is not
                # shown as if it belonged to the document that just failed.
                st.session_state.summaries = None
                st.error("Sorry, something went wrong while summarizing this document. Please try another PDF.")

if st.session_state.summaries:
    st.write('## Summary')
    for summary in st.session_state.summaries:
        st.write(summary.content)