Tuana committed on
Commit
5fdc2d5
1 Parent(s): cc0fbf1

opening files as temp pdfs

Browse files
Files changed (1) hide show
  1. app.py +6 -7
app.py CHANGED
@@ -11,7 +11,7 @@ def start_haystack():
11
  clean_empty_lines=True,
12
  clean_whitespace=True,
13
  clean_header_footer=True,
14
- split_by="passage",
15
  split_length=200,
16
  split_respect_sentence_boundary=True,
17
  )
@@ -23,12 +23,11 @@ def pdf_to_document_store(pdf_files):
23
  converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
24
  documents = []
25
  for pdf in pdf_files:
26
- bytes_data = pdf.read()
27
- st.write(pdf)
28
- doc = converter.convert(file_path=pdf.name, meta=None)[0]
29
- st.write(doc)
30
- preprocessed_doc=preprocessor.process([doc])
31
- documents.append(preprocessed_doc)
32
  document_store.write_documents(documents)
33
  st.write('Document count: ', document_store.get_document_count())
34
 
 
11
  clean_empty_lines=True,
12
  clean_whitespace=True,
13
  clean_header_footer=True,
14
+ split_by="word",
15
  split_length=200,
16
  split_respect_sentence_boundary=True,
17
  )
 
23
  converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
24
  documents = []
25
  for pdf in pdf_files:
26
+ with open("temp-path.pdf", 'wb') as temp_file:
27
+ temp_file.write(pdf)
28
+ doc = converter.convert(file_path="temp-path.pdf", meta=None)[0]
29
+ preprocessed_doc=preprocessor.process([doc])
30
+ documents.append(preprocessed_doc)
 
31
  document_store.write_documents(documents)
32
  st.write('Document count: ', document_store.get_document_count())
33