peter2000 committed on
Commit
c2c2862
1 Parent(s): a087ad0

Update scripts/process.py

Browse files
Files changed (1) hide show
  1. scripts/process.py +20 -18
scripts/process.py CHANGED
@@ -20,20 +20,21 @@ os.environ['TOKENIZERS_PARALLELISM'] ="false"
20
  # document_store.write_documents(docs)
21
 
22
  #pipeline = start_haystack()
23
-
24
  def load_document(
25
- file_path: str,
26
- encoding: Optional[str] = None,
27
- id_hash_keys: Optional[List[str]] = None,
28
- ) -> List[Document]:
 
29
 
30
  """
31
- Takes docx, txt and pdf files as input and extracts text as well as the
32
- filename as metadata. Image pdf will not be handled in this notebook.
33
-
 
 
34
  Returns a list of type haystack.schema.Document
35
  """
36
- file_name = str.split(file_path,'/')[-1]
37
 
38
  if file_name.endswith('.pdf'):
39
  converter = PDFToTextConverter(remove_numeric_tables=True)
@@ -44,17 +45,18 @@ def load_document(
44
 
45
 
46
  documents = []
47
-
48
- #logger.info("Converting {}".format(file_name))
49
- print("Converting '{}'".format(file_name))
50
- # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
51
  document = converter.convert(
52
- file_path=file_path, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
53
- )[0]
 
54
  text = document.content
55
-
56
- # creating the Haystack document by extracting 'content' from the returned object and passing meta information
57
- documents.append(Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys))
 
58
  return documents
59
 
60
  def preprocessing(document,
 
20
  # document_store.write_documents(docs)
21
 
22
  #pipeline = start_haystack()
 
23
  def load_document(
24
+ file_path: str,
25
+ file_name,
26
+ encoding: Optional[str] = None,
27
+ id_hash_keys: Optional[List[str]] = None,
28
+ ) -> List[Document]:
29
 
30
  """
31
+ takes docx, txt and pdf files as input and \
32
+ extracts text as well as the filename as metadata. \
33
+ Since haystack does not take care of all pdf files, \
34
+ pdfplumber is attached to the pipeline in case the pdf \
35
+ extraction fails via Haystack.
36
  Returns a list of type haystack.schema.Document
37
  """
 
38
 
39
  if file_name.endswith('.pdf'):
40
  converter = PDFToTextConverter(remove_numeric_tables=True)
 
45
 
46
 
47
  documents = []
48
+ logger.info("Converting {}".format(file_name))
49
+ # PDFToTextConverter, TextConverter, and DocxToTextConverter
50
+ # return a list containing a single Document
 
51
  document = converter.convert(
52
+ file_path=file_path, meta=None,
53
+ encoding=encoding, id_hash_keys=id_hash_keys
54
+ )[0]
55
  text = document.content
56
+ documents.append(Document(content=text,
57
+ meta={"name": file_name},
58
+ id_hash_keys=id_hash_keys))
59
+
60
  return documents
61
 
62
  def preprocessing(document,