peter2000 committed on
Commit
0c277f0
1 Parent(s): f75d001

Update scripts/process.py

Browse files
Files changed (1) hide show
  1. scripts/process.py +74 -39
scripts/process.py CHANGED
@@ -1,65 +1,100 @@
1
- from typing import Callable, Dict, List, Optional
2
-
3
- from pathlib import Path
4
- import re
5
- import logging
6
- import string
7
  import streamlit as st
8
- logger = logging.getLogger(__name__)
9
-
10
  import os
11
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
 
 
 
 
 
 
12
 
13
- from haystack.utils import convert_files_to_docs, fetch_archive_from_http
14
- from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
15
- from haystack.schema import Document
16
- import pdfplumber
17
 
18
- import pandas as pd
19
- import tempfile
20
- import sqlite3
21
 
22
 
 
 
 
 
 
 
23
 
24
  def load_document(
25
- file: str,
26
- file_name,
27
- encoding: Optional[str] = None,
28
- id_hash_keys: Optional[List[str]] = None,
29
- ) -> List[Document]:
30
 
31
  """
32
- takes docx, txt and pdf files as input and extracts text as well as the filename as metadata. Since haystack
33
- does not take care of all pdf files, pdfplumber is attached to the pipeline in case the pdf extraction fails
34
- via Haystack.
35
  Returns a list of type haystack.schema.Document
36
  """
 
37
 
38
- if file_name.name.endswith('.pdf'):
39
  converter = PDFToTextConverter(remove_numeric_tables=True)
40
- if file_name.name.endswith('.txt'):
41
  converter = TextConverter()
42
- if file_name.name.endswith('.docx'):
43
  converter = DocxToTextConverter()
44
 
45
 
46
  documents = []
47
- logger.info("Converting {}".format(file_name))
 
 
48
  # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
49
  document = converter.convert(
50
- file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
51
  )[0]
52
  text = document.content
 
 
53
  documents.append(Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- '''check if text is empty and apply different pdf processor. This can happen whith certain pdf types.'''
56
- for i in documents:
57
- if i.content == "":
58
- st.write("using pdfplumber")
59
- text = []
60
- with pdfplumber.open(file) as pdf:
61
- for page in pdf.pages:
62
- text.append(page.extract_text())
63
- i.content = ' '.join([page for page in text])
64
 
65
- return documents
 
 
 
 
 
 
 
1
  import streamlit as st
 
 
2
  import os
3
+ from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
4
+ from haystack.schema import Answer
5
+ from haystack.document_stores import InMemoryDocumentStore
6
+ from haystack.pipelines import ExtractiveQAPipeline
7
+ from haystack.nodes import FARMReader, TfidfRetriever
8
+ import logging
9
+ from markdown import markdown
10
+ from annotated_text import annotation
11
+ from PIL import Image
12
 
13
+ os.environ['TOKENIZERS_PARALLELISM'] ="false"
 
 
 
14
 
 
 
 
15
 
16
 
17
+ #def load_and_write_data(document_store):
18
+ # doc_dir = './article_txt_got'
19
+ # docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
20
+ # document_store.write_documents(docs)
21
+
22
+ #pipeline = start_haystack()
23
 
24
def load_document(
    file_path: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Load a .pdf, .txt or .docx file and extract its text.

    Takes docx, txt and pdf files as input and extracts text as well as the
    filename as metadata. Image (scanned) pdfs are not handled here.

    Parameters
    ----------
    file_path : path to the file on disk; the extension selects the converter.
    encoding : optional text encoding, passed through to the converter.
    id_hash_keys : optional list of meta keys haystack uses to build doc ids.

    Returns
    -------
    A list containing a single haystack.schema.Document whose ``content`` is
    the extracted text and whose ``meta["name"]`` is the file name.

    Raises
    ------
    ValueError
        If the file extension is not .pdf, .txt or .docx. (The previous
        version left ``converter`` unbound and crashed with a NameError.)
    """
    # os.path.basename handles both '/' and the OS-specific separator,
    # unlike the previous str.split(file_path, '/') approach.
    file_name = os.path.basename(file_path)

    # Case-insensitive extension match so e.g. 'REPORT.PDF' also works.
    lower_name = file_name.lower()
    if lower_name.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lower_name.endswith('.txt'):
        converter = TextConverter()
    elif lower_name.endswith('.docx'):
        converter = DocxToTextConverter()
    else:
        raise ValueError("Unsupported file type: '{}'".format(file_name))

    print("Converting '{}'".format(file_name))

    # PDFToTextConverter, TextConverter, and DocxToTextConverter all return
    # a list containing a single Document.
    document = converter.convert(
        file_path=file_path, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
    )[0]

    # Re-wrap the extracted text in a fresh Document carrying the file name
    # as meta information.
    return [
        Document(
            content=document.content,
            meta={"name": file_name},
            id_hash_keys=id_hash_keys,
        )
    ]
59
+
60
def preprocessing(document,
                  split_by: Literal["sentence", "word"] = 'sentence',
                  split_length: int = 3):
    """
    Split haystack Document objects into synthetically generated paragraphs
    and apply simple cleaning.

    Parameters
    ----------
    document : iterable of haystack Document objects (e.g. the list returned
        by ``load_document``).
    split_by : unit used for splitting, either 'sentence' or 'word'.
    split_length : number of units per generated paragraph.

    Returns
    -------
    A cleaned list of haystack Document objects, one paragraph per object,
    covering ALL input documents.
    """
    # Sentence-based splitting cannot also respect sentence boundaries
    # (each chunk already is a group of sentences); word-based splitting
    # overlaps windows to preserve context across chunk borders.
    if split_by == 'sentence':
        split_respect_sentence_boundary = False
        split_overlap = 0
    else:
        split_respect_sentence_boundary = True
        split_overlap = 20

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by=split_by,
        split_length=split_length,
        split_respect_sentence_boundary=split_respect_sentence_boundary,
        split_overlap=split_overlap
    )

    # BUG FIX: the previous version rebound docs_processed on every loop
    # iteration, so only the paragraphs of the LAST input document were
    # returned — and an empty input raised NameError. Accumulate instead.
    docs_processed = []
    for doc in document:
        for item in preprocessor.process([doc]):
            # 'basic' is a simple text-cleaning helper defined elsewhere in
            # the project — TODO confirm its exact contract.
            item.content = basic(item.content)
            docs_processed.append(item)

    print("\n your document has been splitted to", len(docs_processed), "paragraphs")
    return docs_processed
99
 
 
 
 
 
 
 
 
 
 
100