Spaces:
Build error
Build error
opening files as temp pdfs
Browse files
app.py
CHANGED
@@ -11,7 +11,7 @@ def start_haystack():
|
|
11 |
clean_empty_lines=True,
|
12 |
clean_whitespace=True,
|
13 |
clean_header_footer=True,
|
14 |
-
split_by="
|
15 |
split_length=200,
|
16 |
split_respect_sentence_boundary=True,
|
17 |
)
|
@@ -23,12 +23,11 @@ def pdf_to_document_store(pdf_files):
|
|
23 |
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
|
24 |
documents = []
|
25 |
for pdf in pdf_files:
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
documents.append(preprocessed_doc)
|
32 |
document_store.write_documents(documents)
|
33 |
st.write('Document count: ', document_store.get_document_count())
|
34 |
|
|
|
11 |
clean_empty_lines=True,
|
12 |
clean_whitespace=True,
|
13 |
clean_header_footer=True,
|
14 |
+
split_by="word",
|
15 |
split_length=200,
|
16 |
split_respect_sentence_boundary=True,
|
17 |
)
|
|
|
23 |
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
|
24 |
documents = []
|
25 |
for pdf in pdf_files:
|
26 |
+
with open("temp-path.pdf", 'wb') as temp_file:
|
27 |
+
temp_file.write(pdf)
|
28 |
+
doc = converter.convert(file_path="temp-path.pdf", meta=None)[0]
|
29 |
+
preprocessed_doc=preprocessor.process([doc])
|
30 |
+
documents.append(preprocessed_doc)
|
|
|
31 |
document_store.write_documents(documents)
|
32 |
st.write('Document count: ', document_store.get_document_count())
|
33 |
|