peter2000 committed on
Commit
5cf4bb5
1 Parent(s): bca1994

Update scripts/process.py

Browse files
Files changed (1) hide show
  1. scripts/process.py +34 -33
scripts/process.py CHANGED
@@ -69,38 +69,39 @@ def load_document(
69
  extraction fails via Haystack.
70
  Returns a list of type haystack.schema.Document
71
  """
72
- st.write(file_name)
73
- if file_name.endswith('.pdf'):
74
- converter = PDFToTextConverter(remove_numeric_tables=True)
75
- if file_name.endswith('.txt'):
76
- converter = TextConverter()
77
- if file_name.endswith('.docx'):
78
- converter = DocxToTextConverter()
79
-
80
-
81
- documents = []
82
- logger.info("Converting {}".format(file_name))
83
- # PDFToTextConverter, TextConverter, and DocxToTextConverter
84
- # return a list containing a single Document
85
- document = converter.convert(
86
- file_path=file_path, meta=None,
87
- encoding=encoding, id_hash_keys=id_hash_keys
88
- )[0]
89
- text = document.content
90
- documents.append(Document(content=text,
91
- meta={"name": file_name},
92
- id_hash_keys=id_hash_keys))
93
-
94
- '''check if text is empty and apply different pdf processor. \
95
- This can happen whith certain pdf types.'''
96
- for i in documents:
97
- if i.content == "":
98
- st.write("using pdfplumber")
99
- text = []
100
- with pdfplumber.open(file_path) as pdf:
101
- for page in pdf.pages:
102
- text.append(page.extract_text())
103
- i.content = ' '.join([page for page in text])
 
104
 
105
  return documents
106
 
@@ -126,7 +127,7 @@ def preprocessing(document):
126
  for item in docs_processed:
127
  item.content = basic(item.content)
128
 
129
- st.write("your document has been splitted to", len(docs_processed), "paragraphs")
130
 
131
  # create dataframe of text and list of all text
132
  #df = pd.DataFrame(docs_processed)
 
69
  extraction fails via Haystack.
70
  Returns a list of type haystack.schema.Document
71
  """
72
+ with st.spinner("👑 Uploading file"):#+file.name+"..."):
73
+ try:
74
+ if file_name.endswith('.pdf'):
75
+ converter = PDFToTextConverter(remove_numeric_tables=True)
76
+ if file_name.endswith('.txt'):
77
+ converter = TextConverter()
78
+ if file_name.endswith('.docx'):
79
+ converter = DocxToTextConverter()
80
+
81
+
82
+ documents = []
83
+ #logger.info("Converting {}".format(file_name))
84
+ # PDFToTextConverter, TextConverter, and DocxToTextConverter
85
+ # return a list containing a single Document
86
+ document = converter.convert(
87
+ file_path=file_path, meta=None,
88
+ encoding=encoding, id_hash_keys=id_hash_keys
89
+ )[0]
90
+ text = document.content
91
+ documents.append(Document(content=text,
92
+ meta={"name": file_name},
93
+ id_hash_keys=id_hash_keys))
94
+
95
+ '''check if text is empty and apply different pdf processor. \
96
+ This can happen whith certain pdf types.'''
97
+ for i in documents:
98
+ if i.content == "":
99
+ st.write("using pdfplumber")
100
+ text = []
101
+ with pdfplumber.open(file_path) as pdf:
102
+ for page in pdf.pages:
103
+ text.append(page.extract_text())
104
+ i.content = ' '.join([page for page in text])
105
 
106
  return documents
107
 
 
127
  for item in docs_processed:
128
  item.content = basic(item.content)
129
 
130
+ #st.write("your document has been splitted to", len(docs_processed), "paragraphs")
131
 
132
  # create dataframe of text and list of all text
133
  #df = pd.DataFrame(docs_processed)