Spaces:
Build error
Build error
File size: 1,964 Bytes
28ec4f0 a3fdd99 a7fa548 836e16d a3fdd99 9a54394 5fdc2d5 cc0fbf1 9a54394 07607d7 9a54394 a3fdd99 6c152f9 5fdc2d5 5daf335 9c1fb8f a7fa548 07607d7 a7fa548 9097656 9a54394 9097656 28ec4f0 a3fdd99 28ec4f0 bfb2bfb a4300de a7fa548 a4300de 3dfe2a3 f3a61e0 9097656 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, TfidfRetriever
from haystack.schema import Document
import logging
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components the app works with.

    The function is wrapped in ``st.cache`` with ``allow_output_mutation=True``
    and a ``hash_funcs`` entry that maps ``builtins.SwigPyObject`` to a hasher
    returning ``None``.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple, in that order.
    """
    store = InMemoryDocumentStore()

    # Cleaning and splitting options, collected in one place for readability.
    splitting_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }
    splitter = PreProcessor(**splitting_options)

    summary_model = TransformersSummarizer(model_name_or_path="google/pegasus-newsroom")
    return store, summary_model, splitter
def pdf_to_document_store(pdf_files):
    """Convert PDFs to text, preprocess them and write them to the document store.

    Each PDF is written to a temporary file, converted with
    ``PDFToTextConverter``, passed through the module-level ``preprocessor``
    and the resulting documents are written to the module-level
    ``document_store``. The store's document count is then shown in the app.

    Args:
        pdf_files: Iterable of PDFs. Each item is either raw ``bytes`` /
            ``bytearray`` or a file-like object exposing ``getvalue()``
            (assumed to be the case for Streamlit uploads) or ``read()``.
    """
    import tempfile
    from pathlib import Path

    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    documents = []

    # A private temporary directory avoids leaving "temp-path.pdf" behind in
    # the working directory and keeps separate runs from sharing one file.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = Path(tmp_dir) / "temp-path.pdf"
        for pdf in pdf_files:
            # File objects cannot be passed to write() directly; extract the
            # underlying bytes first.
            if isinstance(pdf, (bytes, bytearray)):
                payload = pdf
            elif hasattr(pdf, "getvalue"):
                payload = pdf.getvalue()
            else:
                payload = pdf.read()
            pdf_path.write_bytes(payload)

            doc = converter.convert(file_path=str(pdf_path), meta=None)[0]
            # process() is given a list and its result is treated as a list of
            # documents, so extend keeps `documents` flat for write_documents().
            documents.extend(preprocessor.process([doc]))

    document_store.write_documents(documents)
    st.write('Document count: ', document_store.get_document_count())
def summarize(files):
    """Ingest the given PDFs and display the summarizer output.

    The files are first loaded into the module-level ``document_store`` via
    ``pdf_to_document_store``. Every document in the store is then passed to
    ``summarizer.predict`` with ``generate_single_summary=False`` and the
    result is written to the Streamlit page.

    Args:
        files: The uploaded PDFs, forwarded unchanged to
            ``pdf_to_document_store``.
    """
    pdf_to_document_store(files)
    stored_documents = document_store.get_all_documents()
    st.write(
        summarizer.predict(
            documents=stored_documents,
            generate_single_summary=False,
        )
    )
# --- Streamlit page -------------------------------------------------------
# Build the Haystack components used by the rest of the page.
document_store, summarizer, preprocessor = start_haystack()

uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True)
if uploaded_files is not None:
    # Show how many files are currently selected.
    st.write(len(uploaded_files))

if st.button('Summarize Documents'):
    # Only summarize when at least one file was uploaded; this covers both
    # an empty selection and a missing (None) value.
    if uploaded_files:
        summarize(uploaded_files)
    else:
        st.warning('Please upload at least one PDF file before summarizing.')

if st.button('Calculate num of docs'):
    st.write(document_store.get_document_count())

if st.button('Clear DocumentStore'):
    document_store.delete_all_documents()