peter2000 committed on
Commit
91975ca
1 Parent(s): 67458c0

Create new file

Browse files
Files changed (1) hide show
  1. scripts/process.py +65 -0
scripts/process.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, List, Optional
2
+
3
+ from pathlib import Path
4
+ import re
5
+ import logging
6
+ import string
7
+ import streamlit as st
8
+ logger = logging.getLogger(__name__)
9
+
10
+ import os
11
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
12
+
13
+ from haystack.utils import convert_files_to_docs, fetch_archive_from_http
14
+ from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
15
+ from haystack.schema import Document
16
+ import pdfplumber
17
+
18
+ import pandas as pd
19
+ import tempfile
20
+ import sqlite3
21
+
22
+
23
+
24
def load_document(
    file: str,
    file_name,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Convert a docx, txt or pdf file into a list holding one haystack Document.

    The converter is picked from the extension of ``file_name.name``
    (matched case-insensitively). For pdf files whose converted text comes
    back empty, the text is extracted again with pdfplumber, because the
    haystack pdf converter does not cope with every pdf.

    Args:
        file: Path handed to the haystack converter and, for the pdf
            fallback, to ``pdfplumber.open``.
        file_name: Object exposing a ``.name`` attribute with the original
            file name. It is stored unchanged under the ``"name"`` key of the
            document metadata.
        encoding: Forwarded to the converter's ``convert`` call.
        id_hash_keys: Forwarded to the converter and to the resulting
            ``Document``.

    Returns:
        A list containing a single ``haystack.schema.Document``.

    Raises:
        ValueError: If the file extension is not .pdf, .txt or .docx.
    """
    lowered_name = file_name.name.lower()
    is_pdf = lowered_name.endswith(".pdf")

    if is_pdf:
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lowered_name.endswith(".txt"):
        converter = TextConverter()
    elif lowered_name.endswith(".docx"):
        converter = DocxToTextConverter()
    else:
        # Fail with a clear message instead of hitting an unbound `converter`.
        raise ValueError(
            f"Unsupported file type: {file_name.name!r} "
            "(expected .pdf, .txt or .docx)"
        )

    logger.info("Converting %s", file_name)
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a
    # list containing a single Document.
    converted = converter.convert(
        file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = converted.content

    # Certain pdf types yield no text through haystack; retry those with
    # pdfplumber. Whitespace-only output is treated as empty as well. The
    # fallback is limited to pdfs because pdfplumber cannot open other formats.
    if is_pdf and not (text and text.strip()):
        st.write("using pdfplumber")
        with pdfplumber.open(file) as pdf:
            # extract_text() may yield None for a page without text, which
            # would break str.join, so coerce it to an empty string.
            text = " ".join(page.extract_text() or "" for page in pdf.pages)

    # Build the Document only once the final text is known, so that anything
    # the Document derives from its content at construction time sees the
    # real text rather than the empty placeholder.
    return [
        Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys)
    ]