peter2000 committed on
Commit
6fa214d
1 Parent(s): b913c31

Update scripts/process.py

Browse files
Files changed (1) hide show
  1. scripts/process.py +17 -2
scripts/process.py CHANGED
@@ -1,4 +1,6 @@
1
  import streamlit as st
 
 
2
  import os
3
  from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
4
  from haystack.schema import Answer
@@ -19,10 +21,11 @@ os.environ['TOKENIZERS_PARALLELISM'] ="false"
19
  # docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
20
  # document_store.write_documents(docs)
21
 
22
- #pipeline = start_haystack()
23
  def load_document(
24
  file_path: str,
25
  file_name,
 
26
  id_hash_keys: Optional[List[str]] = None,
27
  ) -> List[Document]:
28
 
@@ -49,14 +52,26 @@ def load_document(
49
  # return a list containing a single Document
50
  document = converter.convert(
51
  file_path=file_path, meta=None,
52
- id_hash_keys=id_hash_keys
53
  )[0]
54
  text = document.content
55
  documents.append(Document(content=text,
56
  meta={"name": file_name},
57
  id_hash_keys=id_hash_keys))
58
 
 
 
 
 
 
 
 
 
 
 
 
59
  return documents
 
60
 
61
  def preprocessing(document):
62
  """
 
1
  import streamlit as st
2
+ from typing import Callable, Dict, List, Optional
3
+
4
  import os
5
  from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
6
  from haystack.schema import Answer
 
21
  # docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
22
  # document_store.write_documents(docs)
23
 
24
+
25
  def load_document(
26
  file_path: str,
27
  file_name,
28
+ encoding: Optional[str] = None,
29
  id_hash_keys: Optional[List[str]] = None,
30
  ) -> List[Document]:
31
 
 
52
  # return a list containing a single Document
53
  document = converter.convert(
54
  file_path=file_path, meta=None,
55
+ encoding=encoding, id_hash_keys=id_hash_keys
56
  )[0]
57
  text = document.content
58
  documents.append(Document(content=text,
59
  meta={"name": file_name},
60
  id_hash_keys=id_hash_keys))
61
 
62
+ '''check if text is empty and apply different pdf processor. \
63
+ This can happen with certain pdf types.'''
64
+ for i in documents:
65
+ if i.content == "":
66
+ st.write("using pdfplumber")
67
+ text = []
68
+ with pdfplumber.open(file_path) as pdf:
69
+ for page in pdf.pages:
70
+ text.append(page.extract_text())
71
+ i.content = ' '.join([page for page in text])
72
+
73
  return documents
74
+
75
 
76
  def preprocessing(document):
77
  """