Spaces:
Build error
Build error
File size: 1,833 Bytes
28ec4f0 a3fdd99 6e57c67 836e16d a3fdd99 f6cc0cb a3fdd99 9a54394 5fdc2d5 cc0fbf1 9a54394 56d5448 9a54394 a3fdd99 3a4a956 a3fdd99 3a4a956 f6cc0cb a7fa548 3a4a956 ed39a05 eff1d2d d42a71a 9a54394 9097656 3a4a956 a3fdd99 3a4a956 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
from haystack.schema import Document
import logging
import base64
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
def start_haystack():
    """Build the Haystack components used by the app.

    The result is wrapped in ``st.cache`` so the components are created once
    and reused by later script runs; ``allow_output_mutation=True`` lets the
    returned objects be modified after they are cached, and ``SwigPyObject``
    values are hashed as ``None``.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple.
    """
    store = InMemoryDocumentStore()

    # Cleaning and chunking options: 200-word chunks that do not cut
    # through the middle of a sentence.
    preprocessor_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }
    chunker = PreProcessor(**preprocessor_options)

    bart_summarizer = TransformersSummarizer(
        model_name_or_path="facebook/bart-large-cnn"
    )
    return store, bart_summarizer, chunker
def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with the text of a PDF.

    The module-level ``document_store`` is emptied, the uploaded PDF is
    written to a temporary file, converted to text, split by the
    module-level ``preprocessor`` and written back into ``document_store``.

    Args:
        pdf_file: A binary file-like object whose ``read()`` returns the raw
            PDF bytes (the value returned by ``st.file_uploader``).
    """
    import os
    import tempfile

    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # A unique temporary file keeps concurrent sessions from overwriting each
    # other's upload. delete=False because the converter reopens the file by
    # path after this handle has been closed.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        # The upload is already raw bytes; no base64 round trip is needed.
        temp_file.write(pdf_file.read())
        temp_path = temp_file.name

    # The handle is closed (and therefore flushed) before conversion starts,
    # so the converter always sees the complete file.
    try:
        doc = converter.convert(file_path=temp_path, meta=None)
        preprocessed_docs = preprocessor.process(doc)
        document_store.write_documents(preprocessed_docs)
    finally:
        # Do not leave uploaded PDFs behind on disk.
        os.remove(temp_path)
def summarize(file):
    """Summarize an uploaded PDF and render the result on the page.

    The file is first loaded into the module-level ``document_store`` via
    ``pdf_to_document_store``; every stored document is then passed to the
    module-level ``summarizer`` and the content of each returned summary is
    written to the Streamlit page under a 'Summary' heading line.

    Args:
        file: The uploaded PDF, passed through to ``pdf_to_document_store``.
    """
    pdf_to_document_store(file)

    stored_docs = document_store.get_all_documents()
    results = summarizer.predict(
        documents=stored_docs,
        generate_single_summary=True,
        batch_size=5,
    )

    st.write('Summary')
    for result in results:
        st.write(result.content)
# Build (or fetch from the Streamlit cache) the shared Haystack components.
document_store, summarizer, preprocessor = start_haystack()

# Page layout: a single-file uploader, then a button that triggers the summary.
uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)

# The button is only rendered once a file has been uploaded; short-circuit
# evaluation keeps st.button from being called before that.
if uploaded_file is not None and st.button('Summarize Document'):
    summarize(uploaded_file)
|