File size: 2,030 Bytes
28ec4f0
a3fdd99
6e57c67
836e16d
a3fdd99
f6cc0cb
 
a3fdd99
9a54394
 
 
 
 
 
 
5fdc2d5
cc0fbf1
9a54394
 
6e57c67
9a54394
a3fdd99
 
 
 
6c152f9
5fdc2d5
f6cc0cb
 
5fdc2d5
5f91d5b
 
f6cc0cb
9c1fb8f
 
a7fa548
 
 
6e57c67
a7fa548
9097656
9a54394
9097656
28ec4f0
a3fdd99
28ec4f0
bfb2bfb
a4300de
a7fa548
a4300de
3dfe2a3
 
f3a61e0
9097656
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import base64
import logging
import os
import tempfile

import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TransformersSummarizer, PreProcessor, PDFToTextConverter
from haystack.schema import Document


@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack():
    """Create the Haystack components shared by the rest of the app.

    The function is wrapped in ``st.cache`` so that the components are built
    once and reused across Streamlit reruns instead of being rebuilt on every
    interaction. ``allow_output_mutation=True`` lets callers mutate the
    returned objects (e.g. write documents to the store).

    Returns:
        A ``(document_store, summarizer, preprocessor)`` tuple.
    """
    store = InMemoryDocumentStore()

    # Cleaning and splitting configuration: chunks of up to 200 words that
    # are not cut in the middle of a sentence.
    splitter_options = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 200,
        "split_respect_sentence_boundary": True,
    }
    splitter = PreProcessor(**splitter_options)

    pegasus = TransformersSummarizer(model_name_or_path="google/pegasus-xsum")
    return store, pegasus, splitter


def pdf_to_document_store(pdf_files):
    """Convert PDFs to text, preprocess them and write them to the store.

    Each item is spooled to a private temporary ``.pdf`` file because the
    converter is given a file path rather than a file object. The temporary
    file is fully written and closed before conversion starts, and is removed
    afterwards even if conversion fails.

    Args:
        pdf_files: Iterable of binary file-like objects exposing ``read()``
            (here, the files returned by the Streamlit uploader).

    Side effects:
        Writes the preprocessed documents to the module-level
        ``document_store`` and renders the resulting document count.
    """
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    for pdf in pdf_files:
        # delete=False so the file survives the ``with`` block and can be
        # reopened by the converter by path (also required on Windows).
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
            temp_file.write(pdf.read())
            temp_path = temp_file.name
        try:
            # The file is closed (and therefore flushed) at this point, so
            # the converter sees the complete PDF.
            doc = converter.convert(file_path=temp_path, meta=None)[0]
        finally:
            os.remove(temp_path)
        preprocessed_docs = preprocessor.process([doc])
        document_store.write_documents(preprocessed_docs)
    st.write('Document count: ', document_store.get_document_count())


def summarize(files):
    """Index the given PDFs and display a single summary of the store.

    The summary is produced from every document currently held in the
    module-level ``document_store``, not only from ``files``.

    Args:
        files: Uploaded PDF file objects, forwarded to
            ``pdf_to_document_store``.
    """
    pdf_to_document_store(files)
    stored_docs = document_store.get_all_documents()
    result = summarizer.predict(
        documents=stored_docs,
        generate_single_summary=True,
    )
    st.write(result)

# Shared components; start_haystack() is cached, so reruns reuse the same
# objects.
document_store, summarizer, preprocessor = start_haystack()

uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True)

# Truthiness check rather than ``is not None``: with
# accept_multiple_files=True the uploader yields a list, which is empty when
# nothing has been uploaded. Only offer summarization when there is at least
# one file to process.
if uploaded_files:
    st.write(len(uploaded_files))
    if st.button('Summarize Documents'):
        summarize(uploaded_files)

if st.button('Calculate num of docs'):
    st.write(document_store.get_document_count())

if st.button('Clear DocumentStore'):
    document_store.delete_all_documents()