Spaces:
Build error
Build error
File size: 1,892 Bytes
28ec4f0 a3fdd99 6e57c67 836e16d a3fdd99 f6cc0cb a3fdd99 9a54394 5fdc2d5 cc0fbf1 9a54394 56d5448 9a54394 a3fdd99 3a4a956 a3fdd99 3a4a956 f6cc0cb a7fa548 3a4a956 cc5b5c1 2d4dc51 eff1d2d d42a71a 9a54394 9097656 3a4a956 a3fdd99 3a4a956 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
from haystack.schema import Document
import logging
import base64
@st.cache(
    hash_funcs={"builtins.SwigPyObject": lambda _: None},
    allow_output_mutation=True,
)
def start_haystack():
    """Create the Haystack components used by the app.

    The function is wrapped in ``st.cache``; the ``hash_funcs`` entry maps
    ``builtins.SwigPyObject`` to a hasher that always returns ``None``.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple, in that order.
    """
    store = InMemoryDocumentStore()

    # Cleaning and splitting options for the text extracted from PDFs:
    # chunks of 200 words that respect sentence boundaries.
    preprocessing_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }
    splitter = PreProcessor(**preprocessing_options)

    bart_summarizer = TransformersSummarizer(
        model_name_or_path="facebook/bart-large-cnn"
    )
    return store, bart_summarizer, splitter
def pdf_to_document_store(pdf_file):
    """Replace the document store's contents with the text of a PDF.

    The module-level ``document_store`` is emptied, the PDF is converted to
    text, run through the module-level ``preprocessor`` and the resulting
    documents are written back to the store.

    Args:
        pdf_file: Binary file-like object exposing ``read()`` that yields the
            PDF bytes (presumably the object returned by
            ``st.file_uploader`` -- confirm against callers).
    """
    import os
    import tempfile

    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])

    # Write the raw bytes to a unique temporary file. delete=False lets us
    # close the handle (flushing everything to disk) before the converter
    # opens the file by path, which also works on platforms that forbid
    # opening a file twice.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        temp_file.write(pdf_file.read())
        temp_path = temp_file.name

    try:
        doc = converter.convert(file_path=temp_path, meta=None)
        preprocessed_docs = preprocessor.process(doc)
        document_store.write_documents(preprocessed_docs)
    finally:
        # Always clean up, even when conversion or indexing fails.
        os.remove(temp_path)
def summarize(file):
    """Index the given PDF and write its summary to the Streamlit page.

    Args:
        file: The PDF file object, passed straight to
            ``pdf_to_document_store``.
    """
    pdf_to_document_store(file)

    doc_count = document_store.get_document_count()
    st.write('Number of documents', doc_count)

    indexed_docs = document_store.get_all_documents()
    results = summarizer.predict(
        documents=indexed_docs,
        generate_single_summary=True,
    )

    st.write('Summary')
    for result in results:
        st.write(result.content)
# Shared pipeline objects; the functions above read these module-level names.
document_store, summarizer, preprocessor = start_haystack()

uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False)

# The button is only rendered once a file has been provided (short-circuit),
# and the summary runs only when that button is pressed.
if uploaded_file is not None and st.button('Summarize Document'):
    summarize(uploaded_file)
|