Spaces:
Runtime error
Runtime error
Update scripts/process.py
Browse files
- scripts/process.py +20 -18
scripts/process.py
CHANGED
@@ -20,20 +20,21 @@ os.environ['TOKENIZERS_PARALLELISM'] ="false"
|
|
20 |
# document_store.write_documents(docs)
|
21 |
|
22 |
#pipeline = start_haystack()
|
23 |
-
|
24 |
def load_document(
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
29 |
|
30 |
"""
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
34 |
Returns a list of type haystack.schema.Document
|
35 |
"""
|
36 |
-
file_name = str.split(file_path,'/')[-1]
|
37 |
|
38 |
if file_name.endswith('.pdf'):
|
39 |
converter = PDFToTextConverter(remove_numeric_tables=True)
|
@@ -44,17 +45,18 @@ def load_document(
|
|
44 |
|
45 |
|
46 |
documents = []
|
47 |
-
|
48 |
-
#
|
49 |
-
|
50 |
-
# PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
|
51 |
document = converter.convert(
|
52 |
-
file_path=file_path, meta=None,
|
53 |
-
|
|
|
54 |
text = document.content
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
58 |
return documents
|
59 |
|
60 |
def preprocessing(document,
|
|
|
20 |
# document_store.write_documents(docs)
|
21 |
|
22 |
#pipeline = start_haystack()
|
|
|
23 |
def load_document(
|
24 |
+
file_path: str,
|
25 |
+
file_name,
|
26 |
+
encoding: Optional[str] = None,
|
27 |
+
id_hash_keys: Optional[List[str]] = None,
|
28 |
+
) -> List[Document]:
|
29 |
|
30 |
"""
|
31 |
+
takes docx, txt and pdf files as input and \
|
32 |
+
extracts text as well as the filename as metadata. \
|
33 |
+
Since haystack does not take care of all pdf files, \
|
34 |
+
pdfplumber is attached to the pipeline in case the pdf \
|
35 |
+
extraction fails via Haystack.
|
36 |
Returns a list of type haystack.schema.Document
|
37 |
"""
|
|
|
38 |
|
39 |
if file_name.endswith('.pdf'):
|
40 |
converter = PDFToTextConverter(remove_numeric_tables=True)
|
|
|
45 |
|
46 |
|
47 |
documents = []
|
48 |
+
logger.info("Converting {}".format(file_name))
|
49 |
+
# PDFToTextConverter, TextConverter, and DocxToTextConverter
|
50 |
+
# return a list containing a single Document
|
|
|
51 |
document = converter.convert(
|
52 |
+
file_path=file_path, meta=None,
|
53 |
+
encoding=encoding, id_hash_keys=id_hash_keys
|
54 |
+
)[0]
|
55 |
text = document.content
|
56 |
+
documents.append(Document(content=text,
|
57 |
+
meta={"name": file_name},
|
58 |
+
id_hash_keys=id_hash_keys))
|
59 |
+
|
60 |
return documents
|
61 |
|
62 |
def preprocessing(document,
|