Tuana committed on
Commit
cc0fbf1
1 Parent(s): 652a208

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -3
app.py CHANGED
@@ -11,8 +11,8 @@ def start_haystack():
11
  clean_empty_lines=True,
12
  clean_whitespace=True,
13
  clean_header_footer=True,
14
- split_by="word",
15
- split_length=100,
16
  split_respect_sentence_boundary=True,
17
  )
18
  summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-newsroom")
@@ -23,10 +23,11 @@ def pdf_to_document_store(pdf_files):
23
  converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
24
  documents = []
25
  for pdf in pdf_files:
 
 
26
  doc = converter.convert(file_path=pdf.name, meta=None)[0]
27
  st.write(doc)
28
  preprocessed_doc=preprocessor.process([doc])
29
- st.write(preprocessed_doc)
30
  documents.append(preprocessed_doc)
31
  document_store.write_documents(documents)
32
  st.write('Document count: ', document_store.get_document_count())
 
11
  clean_empty_lines=True,
12
  clean_whitespace=True,
13
  clean_header_footer=True,
14
+ split_by="passage",
15
+ split_length=200,
16
  split_respect_sentence_boundary=True,
17
  )
18
  summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-newsroom")
 
23
  converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
24
  documents = []
25
  for pdf in pdf_files:
26
+ bytes_data = pdf.read()
27
+ st.write(pdf)
28
  doc = converter.convert(file_path=pdf.name, meta=None)[0]
29
  st.write(doc)
30
  preprocessed_doc=preprocessor.process([doc])
 
31
  documents.append(preprocessed_doc)
32
  document_store.write_documents(documents)
33
  st.write('Document count: ', document_store.get_document_count())