Spaces:
Build error
Build error
File size: 1,866 Bytes
28ec4f0 a3fdd99 a7fa548 836e16d a3fdd99 9a54394 2a5639d 9a54394 a3fdd99 6c152f9 4d16c37 80419e0 4d16c37 95e1f5e 9c1fb8f a7fa548 9097656 9a54394 9097656 28ec4f0 a3fdd99 28ec4f0 bfb2bfb a4300de a7fa548 a4300de 3dfe2a3 f3a61e0 9097656 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter, TfidfRetriever
from haystack.schema import Document
import logging
# hash_funcs maps "builtins.SwigPyObject" to a constant hash; presumably this
# keeps st.cache from failing on SWIG-backed objects — TODO confirm.
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components the app works with.

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple holding an
        ``InMemoryDocumentStore``, a ``TransformersSummarizer`` loaded with
        ``google/pegasus-xsum`` and a ``PreProcessor`` configured with the
        settings below.
    """
    store = InMemoryDocumentStore()

    # Cleaning and splitting options handed to the PreProcessor unchanged.
    preprocessor_settings = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 100,
        "split_respect_sentence_boundary": True,
        "split_overlap": 0,
    }
    text_preprocessor = PreProcessor(**preprocessor_settings)

    pegasus_summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-xsum")
    return store, pegasus_summarizer, text_preprocessor
def pdf_to_document_store(pdf_files):
    """Convert uploaded PDFs to Haystack documents and write them to the store.

    Each upload is first saved to a temporary file: the converter is given a
    ``file_path`` to read, while an uploaded file's ``name`` is only the
    original file name and does not point at a file on disk.

    Args:
        pdf_files: Iterable of uploaded files exposing ``name`` and
            ``getvalue()`` (the objects passed in from ``st.file_uploader``).

    Side effects:
        Writes each file name, the number of converted documents and the
        resulting document count to the Streamlit page, and adds the
        converted documents to the module-level ``document_store``.
    """
    # Local imports keep this fix self-contained within the function.
    import os
    import tempfile

    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    documents = []
    # The directory and everything written into it is removed on exit.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for pdf in pdf_files:
            st.write(pdf.name)
            # basename() drops any directory part of the client-supplied name
            # so the file cannot be written outside tmp_dir.
            safe_name = os.path.basename(pdf.name) or "upload.pdf"
            pdf_path = os.path.join(tmp_dir, safe_name)
            with open(pdf_path, "wb") as tmp_pdf:
                tmp_pdf.write(pdf.getvalue())
            # convert() returns a list; only its first document is kept.
            doc = converter.convert(file_path=pdf_path, meta=None)[0]
            documents.append(doc)
    st.write(len(documents))
    document_store.write_documents(documents)
    st.write('Document count: ', document_store.get_document_count())
def summarize(files):
    """Index the given files, then show one summary of everything in the store.

    Args:
        files: Uploaded PDF files, forwarded to ``pdf_to_document_store``.
    """
    pdf_to_document_store(files)
    # Every document currently in the store is summarized, not only the
    # ones added by this call.
    stored_documents = document_store.get_all_documents()
    result = summarizer.predict(
        documents=stored_documents,
        generate_single_summary=True,
    )
    st.write(result)
# Module-level components shared by the functions above; start_haystack() is
# wrapped in st.cache.
document_store, summarizer, preprocessor = start_haystack()

# Upload widget for one or more PDF files.
uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True)

# NOTE(review): with accept_multiple_files=True the uploader presumably returns
# a list (empty when nothing is uploaded), so this check may always be true —
# confirm against the Streamlit version in use.
if uploaded_files is not None:
    # Show how many files are currently uploaded.
    st.write(len(uploaded_files))

    # Convert and index the uploads, then display the summary.
    if st.button('Summarize Documents'):
        summarize(uploaded_files)

    # Display the number of documents currently in the store.
    if st.button('Calculate num of docs'):
        st.write(document_store.get_document_count())

    # Remove every document from the store.
    if st.button('Clear DocumentStore'):
        document_store.delete_all_documents()