"""Streamlit app that summarizes an uploaded PDF with a Haystack pipeline."""

import base64
import logging
import os
import tempfile

import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
from haystack.schema import Document


# Cached so the document store, summarizer and preprocessor are built once
# and reused across Streamlit reruns instead of being recreated every time.
# The hash_funcs entry makes Streamlit skip hashing SwigPyObject instances.
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components used by the app.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple.
    """
    document_store = InMemoryDocumentStore()
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=200,
        split_respect_sentence_boundary=True,
    )
    summarizer = TransformersSummarizer(
        model_name_or_path="facebook/bart-large-cnn", batch_size=5
    )
    return document_store, summarizer, preprocessor


def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with the text of *pdf_file*.

    The upload is written to a uniquely named temporary file (the converter
    is given a path, not a stream), converted to text, preprocessed and
    written to the module-level ``document_store``.

    Args:
        pdf_file: Binary file-like object whose ``read()`` returns the raw
            PDF bytes (e.g. the object returned by ``st.file_uploader``).
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # A unique temp file avoids clashes between concurrent sessions that a
    # fixed file name in the working directory would cause. delete=False
    # lets us close the handle (flushing all bytes to disk) before the
    # converter opens the path; the file is removed explicitly below.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        temp_file.write(pdf_file.read())
        temp_path = temp_file.name

    try:
        docs = converter.convert(file_path=temp_path, meta=None)
    finally:
        # Always clean up the temporary PDF, even if conversion fails.
        os.remove(temp_path)

    preprocessed_docs = preprocessor.process(docs)
    document_store.write_documents(preprocessed_docs)


def summarize(file):
    """Index *file* and render its summary in the Streamlit page.

    Args:
        file: Uploaded PDF, passed through to ``pdf_to_document_store``.
    """
    pdf_to_document_store(file)
    summaries = summarizer.predict(
        documents=document_store.get_all_documents(),
        generate_single_summary=True,
    )
    st.write('Summary')
    for summary in summaries:
        st.write(summary.content)


document_store, summarizer, preprocessor = start_haystack()

uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)

if uploaded_file is not None:
    if st.button('Summarize Document'):
        summarize(uploaded_file)