Spaces:

peter2000
/

policy_test

Runtime error

peter2000 committed on Sep 27, 2022

Commit

c2c2862

1 Parent(s): a087ad0

Update scripts/process.py

Files changed (1) hide show

scripts/process.py CHANGED Viewed

@@ -20,20 +20,21 @@ os.environ['TOKENIZERS_PARALLELISM'] ="false"
 #    document_store.write_documents(docs)
 #pipeline = start_haystack()
 def load_document(
-                    file_path: str,
-                    encoding: Optional[str] = None,
-                    id_hash_keys: Optional[List[str]] = None,
-                  ) -> List[Document]:
     """
-    Takes docx, txt and pdf files as input and extracts text as well as the
-    filename as metadata. Image pdf will not be handled in this notebook.
     Returns a list of type haystack.schema.Document
     """
-    file_name = str.split(file_path,'/')[-1]
     if file_name.endswith('.pdf'):
         converter = PDFToTextConverter(remove_numeric_tables=True)
@@ -44,17 +45,18 @@ def load_document(
     documents = []
-    #logger.info("Converting {}".format(file_name))
-    print("Converting '{}'".format(file_name))
-    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
     document = converter.convert(
-                file_path=file_path, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
-            )[0]
     text = document.content
-    # creating the Haystack document by extracting 'content' from the returned object and passing meta information
-    documents.append(Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys))
     return documents
  def preprocessing(document,

 #    document_store.write_documents(docs)
 #pipeline = start_haystack()
 def load_document(
+    file_path: str,
+    file_name,
+    encoding: Optional[str] = None,
+    id_hash_keys: Optional[List[str]] = None,
+) -> List[Document]:
     """
+    takes docx, txt and pdf files as input and \
+    extracts text as well as the filename as metadata. \
+    Since haystack does not take care of all pdf files, \
+    pdfplumber is attached to the pipeline in case the pdf \
+    extraction fails via Haystack.
     Returns a list of type haystack.schema.Document
     """
     if file_name.endswith('.pdf'):
         converter = PDFToTextConverter(remove_numeric_tables=True)
     documents = []
+    logger.info("Converting {}".format(file_name))
+    # PDFToTextConverter, TextConverter, and DocxToTextConverter
+    # return a list containing a single Document
     document = converter.convert(
+                file_path=file_path, meta=None,
+                encoding=encoding, id_hash_keys=id_hash_keys
+                )[0]
     text = document.content
+    documents.append(Document(content=text,
+                              meta={"name": file_name},
+                              id_hash_keys=id_hash_keys))
     return documents
  def preprocessing(document,