peter2000 committed on
Commit
0c277f0
1 Parent(s): f75d001

Update scripts/process.py

Browse files
Files changed (1) hide show
  1. scripts/process.py +74 -39
scripts/process.py CHANGED
@@ -1,65 +1,100 @@
1
- from typing import Callable, Dict, List, Optional
2
-
3
- from pathlib import Path
4
- import re
5
- import logging
6
- import string
7
  import streamlit as st
8
- logger = logging.getLogger(__name__)
9
-
10
  import os
11
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
 
 
 
 
 
 
12
 
13
- from haystack.utils import convert_files_to_docs, fetch_archive_from_http
14
- from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
15
- from haystack.schema import Document
16
- import pdfplumber
17
 
18
- import pandas as pd
19
- import tempfile
20
- import sqlite3
21
 
22
 
 
 
 
 
 
 
23
 
24
  def load_document(
25
- file: str,
26
- file_name,
27
- encoding: Optional[str] = None,
28
- id_hash_keys: Optional[List[str]] = None,
29
- ) -> List[Document]:
30
 
31
  """
32
- takes docx, txt and pdf files as input and extracts text as well as the filename as metadata. Since haystack
33
- does not take care of all pdf files, pdfplumber is attached to the pipeline in case the pdf extraction fails
34
- via Haystack.
35
  Returns a list of type haystack.schema.Document
36
  """
 
37
 
38
- if file_name.name.endswith('.pdf'):
39
  converter = PDFToTextConverter(remove_numeric_tables=True)
40
- if file_name.name.endswith('.txt'):
41
  converter = TextConverter()
42
- if file_name.name.endswith('.docx'):
43
  converter = DocxToTextConverter()
44
 
45
 
46
  documents = []
47
- logger.info("Converting {}".format(file_name))
 
 
48
  # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
49
  document = converter.convert(
50
- file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
51
  )[0]
52
  text = document.content
 
 
53
  documents.append(Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- '''check if text is empty and apply different pdf processor. This can happen whith certain pdf types.'''
56
- for i in documents:
57
- if i.content == "":
58
- st.write("using pdfplumber")
59
- text = []
60
- with pdfplumber.open(file) as pdf:
61
- for page in pdf.pages:
62
- text.append(page.extract_text())
63
- i.content = ' '.join([page for page in text])
64
 
65
- return documents
 
 
 
 
 
 
 
1
  import streamlit as st
 
 
2
  import os
3
+ from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
4
+ from haystack.schema import Answer
5
+ from haystack.document_stores import InMemoryDocumentStore
6
+ from haystack.pipelines import ExtractiveQAPipeline
7
+ from haystack.nodes import FARMReader, TfidfRetriever
8
+ import logging
9
+ from markdown import markdown
10
+ from annotated_text import annotation
11
+ from PIL import Image
12
 
13
+ os.environ['TOKENIZERS_PARALLELISM'] ="false"
 
 
 
14
 
 
 
 
15
 
16
 
17
+ #def load_and_write_data(document_store):
18
+ # doc_dir = './article_txt_got'
19
+ # docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
20
+ # document_store.write_documents(docs)
21
+
22
+ #pipeline = start_haystack()
23
 
24
def load_document(
    file_path: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Load a .pdf, .txt or .docx file and extract its text.

    Takes docx, txt and pdf files as input and extracts text as well as the
    filename as metadata. Image (scanned) pdfs are not handled here.

    Parameters
    ----------
    file_path : path to the file on disk; the extension selects the converter.
    encoding : optional text encoding, passed through to the converter.
    id_hash_keys : optional list of meta keys haystack uses to build doc ids.

    Returns
    -------
    A list containing a single haystack.schema.Document whose ``content`` is
    the extracted text and whose ``meta["name"]`` is the file name.

    Raises
    ------
    ValueError
        If the file extension is not .pdf, .txt or .docx. (The previous
        version left ``converter`` unbound and crashed with a NameError.)
    """
    # os.path.basename handles both '/' and the OS-specific separator,
    # unlike the previous str.split(file_path, '/') approach.
    file_name = os.path.basename(file_path)

    # Case-insensitive extension match so e.g. 'REPORT.PDF' also works.
    lower_name = file_name.lower()
    if lower_name.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lower_name.endswith('.txt'):
        converter = TextConverter()
    elif lower_name.endswith('.docx'):
        converter = DocxToTextConverter()
    else:
        raise ValueError("Unsupported file type: '{}'".format(file_name))

    print("Converting '{}'".format(file_name))

    # PDFToTextConverter, TextConverter, and DocxToTextConverter all return
    # a list containing a single Document.
    document = converter.convert(
        file_path=file_path, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
    )[0]

    # Re-wrap the extracted text in a fresh Document carrying the file name
    # as meta information.
    return [
        Document(
            content=document.content,
            meta={"name": file_name},
            id_hash_keys=id_hash_keys,
        )
    ]
59
+
60
def preprocessing(document,
                  split_by: Literal["sentence", "word"] = 'sentence',
                  split_length: int = 3):
    """
    Split haystack Document objects into synthetically generated paragraphs
    and apply simple cleaning.

    Parameters
    ----------
    document : iterable of haystack Document objects (e.g. the list returned
        by ``load_document``).
    split_by : unit used for splitting, either 'sentence' or 'word'.
    split_length : number of units per generated paragraph.

    Returns
    -------
    A cleaned list of haystack Document objects, one paragraph per object,
    covering ALL input documents.
    """
    # Sentence-based splitting cannot also respect sentence boundaries
    # (each chunk already is a group of sentences); word-based splitting
    # overlaps windows to preserve context across chunk borders.
    if split_by == 'sentence':
        split_respect_sentence_boundary = False
        split_overlap = 0
    else:
        split_respect_sentence_boundary = True
        split_overlap = 20

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by=split_by,
        split_length=split_length,
        split_respect_sentence_boundary=split_respect_sentence_boundary,
        split_overlap=split_overlap
    )

    # BUG FIX: the previous version rebound docs_processed on every loop
    # iteration, so only the paragraphs of the LAST input document were
    # returned — and an empty input raised NameError. Accumulate instead.
    docs_processed = []
    for doc in document:
        for item in preprocessor.process([doc]):
            # 'basic' is a simple text-cleaning helper defined elsewhere in
            # the project — TODO confirm its exact contract.
            item.content = basic(item.content)
            docs_processed.append(item)

    print("\n your document has been splitted to", len(docs_processed), "paragraphs")
    return docs_processed
99
 
 
 
 
 
 
 
 
 
 
100