# Streamlit page that lets the user upload PDF files and indexes their text
# into an in-memory Haystack document store.
import logging
import os
import tempfile
from pathlib import Path

import streamlit as st
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader, PreProcessor, PDFToTextConverter, TfidfRetriever

# Store that receives the preprocessed chunks of the uploaded PDFs.
document_store = InMemoryDocumentStore()

# Cleans the extracted text and splits it into ~100-word chunks that respect
# sentence boundaries, with a 3-word overlap between neighbouring chunks.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=3,
)

uploaded_files = st.file_uploader(
    label='Upload a PDF Document',
    accept_multiple_files=True,
)
logging.info(uploaded_files)


def pdf_to_document_store(pdf_files):
    """Replace the contents of ``document_store`` with the given PDFs.

    The store is emptied first, then every file is converted to text,
    preprocessed with the module-level ``preprocessor`` and written back.

    Args:
        pdf_files: Iterable of uploaded file objects as produced by
            ``st.file_uploader``. Each must expose ``name`` and
            ``getvalue()`` (returning the raw PDF bytes).

    Returns:
        None. The result is the updated ``document_store``.
    """
    document_store.delete_documents()
    converter = PDFToTextConverter(
        remove_numeric_tables=True,
        valid_languages=["en"],
    )

    documents = []
    for pdf in pdf_files:
        # Uploads live in memory; ``pdf.name`` is only the original file name
        # and does not point to a file on disk. The converter needs a real
        # path, so spool the bytes to a temporary file first.
        fd, tmp_name = tempfile.mkstemp(suffix=".pdf")
        try:
            with os.fdopen(fd, "wb") as tmp_file:
                tmp_file.write(pdf.getvalue())
            # ``convert`` returns a list of documents, so extend (not append)
            # to keep ``documents`` flat for the preprocessor. The original
            # file name is kept in the metadata because the temp path is
            # meaningless once the file has been removed.
            documents.extend(
                converter.convert(file_path=Path(tmp_name), meta={"name": pdf.name})
            )
        finally:
            os.remove(tmp_name)

    preprocessed_docs = preprocessor.process(documents)
    document_store.write_documents(preprocessed_docs)


# With ``accept_multiple_files=True`` the uploader yields a list, which is
# empty (not None) until something is uploaded, so test for truthiness.
# ``pdf_to_document_store`` already clears the store, so no separate delete
# call is needed here.
if uploaded_files:
    pdf_to_document_store(uploaded_files)