peter2000 committed on
Commit
9df5b3c
•
1 Parent(s): 0171405

Update scripts/process.py

Browse files
Files changed (1) hide show
  1. scripts/process.py +21 -23
scripts/process.py CHANGED
@@ -70,29 +70,27 @@ def load_document(
70
  Returns a list of type haystack.schema.Document
71
  """
72
  with st.spinner("πŸ‘‘ Uploading file"):#+file.name+"..."):
73
- try:
74
- if file_name.endswith('.pdf'):
75
- converter = PDFToTextConverter(remove_numeric_tables=True)
76
- if file_name.endswith('.txt'):
77
- converter = TextConverter()
78
- if file_name.endswith('.docx'):
79
- converter = DocxToTextConverter()
80
-
81
-
82
- documents = []
83
- #logger.info("Converting {}".format(file_name))
84
- # PDFToTextConverter, TextConverter, and DocxToTextConverter
85
- # return a list containing a single Document
86
- document = converter.convert(
87
- file_path=file_path, meta=None,
88
- encoding=encoding, id_hash_keys=id_hash_keys
89
- )[0]
90
- text = document.content
91
- documents.append(Document(content=text,
92
- meta={"name": file_name},
93
- id_hash_keys=id_hash_keys))
94
-
95
- return documents
96
 
97
 
98
  def preprocessing(document):
 
70
  Returns a list of type haystack.schema.Document
71
  """
72
  with st.spinner("πŸ‘‘ Uploading file"):#+file.name+"..."):
73
+ if file_name.endswith('.pdf'):
74
+ converter = PDFToTextConverter(remove_numeric_tables=True)
75
+ if file_name.endswith('.txt'):
76
+ converter = TextConverter()
77
+ if file_name.endswith('.docx'):
78
+ converter = DocxToTextConverter()
79
+
80
+
81
+ documents = []
82
+ #logger.info("Converting {}".format(file_name))
83
+ # PDFToTextConverter, TextConverter, and DocxToTextConverter
84
+ # return a list containing a single Document
85
+ document = converter.convert(
86
+ file_path=file_path, meta=None,
87
+ encoding=encoding, id_hash_keys=id_hash_keys
88
+ )[0]
89
+ text = document.content
90
+ documents.append(Document(content=text,
91
+ meta={"name": file_name},
92
+ id_hash_keys=id_hash_keys))
93
+ return documents
 
 
94
 
95
 
96
  def preprocessing(document):