Tuana committed on
Commit
a3fdd99
1 Parent(s): eee8137

First attempt

Browse files
Files changed (2) hide show
  1. app.py +32 -0
  2. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from haystack.document_stores import InMemoryDocumentStore
3
+ from haystack.nodes import FARMReader, PreProcessor, PDFToTextConverter, TfidfRetriever
4
+ import logging
5
+
6
# Text preprocessing: clean up whitespace, empty lines and repeated
# headers/footers, then split into ~100-word passages (3 words of overlap)
# without cutting through sentences.
preprocessor = PreProcessor(
    split_by="word",
    split_length=100,
    split_overlap=3,
    split_respect_sentence_boundary=True,
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=True,
)

# In-memory store that receives the preprocessed passages.
document_store = InMemoryDocumentStore()

# Upload widget; several files may be selected at once.
uploaded_files = st.file_uploader(
    label='Upload a PDF Document',
    accept_multiple_files=True,
)
logging.info(uploaded_files)
19
+
20
def pdf_to_document_store(pdf_files):
    """Replace the document store's contents with the text of *pdf_files*.

    The store is emptied first, then every PDF is converted to text,
    run through the module-level ``preprocessor`` and written to the
    module-level ``document_store``.

    Args:
        pdf_files: Iterable of uploaded-file objects exposing ``.name``
            (the original file name) and ``.getvalue()`` (the raw bytes),
            as returned by ``st.file_uploader``.

    Returns:
        None. The result is the updated ``document_store``.
    """
    import tempfile
    from pathlib import Path

    document_store.delete_documents()
    converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
    documents = []

    # The converter reads from a path on disk, but an upload only carries a
    # file *name* plus in-memory bytes, so each upload is materialised in a
    # scratch directory before conversion.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for pdf in pdf_files:
            # Keep only the base name so an odd upload name cannot escape tmp_dir.
            pdf_path = Path(tmp_dir) / Path(pdf.name).name
            pdf_path.write_bytes(pdf.getvalue())
            converted = converter.convert(file_path=pdf_path, meta=None)
            # convert() may hand back a list of documents or a single one
            # depending on the Haystack version; flatten so the preprocessor
            # always receives a flat list.
            if isinstance(converted, list):
                documents.extend(converted)
            else:
                documents.append(converted)

    if not documents:
        # Nothing to index; the store has already been cleared above.
        return None

    preprocessed_docs = preprocessor.process(documents)
    document_store.write_documents(preprocessed_docs)
    return None
29
+
30
# With accept_multiple_files=True the uploader yields an empty list (not
# None) when nothing has been uploaded, so test for truthiness rather than
# identity with None. pdf_to_document_store() clears the store itself, so
# no separate delete call is needed here.
if uploaded_files:
    pdf_to_document_store(uploaded_files)
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ farm-haystack==1.4.0