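# sdg-policy-tracing / src / preprocessing.py
# Residue from the repository web view, kept for provenance:
#   author avatar: "jonas's picture"
#   last commit: "add app.py" (f51b958)
#   page links: raw | history | blame
#   file size: 2.13 kB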
from typing import Callable, Dict, List, Optional
from pathlib import Path
import re
import logging
import string
import streamlit as st
logger = logging.getLogger(__name__)
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber
import pandas as pd
def load_document(
    file: str,
    file_name,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> "List[Document]":
    """
    Convert a docx, txt or pdf file into a haystack Document.

    The converter is chosen from the extension of ``file_name.name``
    (compared case-insensitively). If the haystack PDF converter yields no
    usable text, the PDF is re-read page by page with pdfplumber.

    Args:
        file: Path of the file to convert; passed to the haystack converter
            and, for the fallback, to ``pdfplumber.open``.
        file_name: Object exposing a ``name`` attribute holding the file
            name. It is stored unchanged under the ``"name"`` meta key.
            NOTE(review): presumably a Streamlit UploadedFile - confirm
            whether the meta should hold ``file_name.name`` instead.
        encoding: Forwarded to the haystack converter.
        id_hash_keys: Forwarded to the haystack converter and to the
            returned Document.

    Returns:
        A list containing exactly one haystack.schema.Document.

    Raises:
        ValueError: If the file name does not end in .pdf, .txt or .docx.
    """
    # Compare case-insensitively so that e.g. "REPORT.PDF" is accepted too.
    lowered_name = file_name.name.lower()
    is_pdf = lowered_name.endswith(".pdf")

    if is_pdf:
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lowered_name.endswith(".txt"):
        converter = TextConverter()
    elif lowered_name.endswith(".docx"):
        converter = DocxToTextConverter()
    else:
        # Fail with a clear message instead of an unbound `converter` below.
        raise ValueError(
            f"Unsupported file type for {file_name.name!r}: "
            "expected a .pdf, .txt or .docx file"
        )

    logger.info("Converting %s", file_name)
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a
    # list containing a single Document.
    converted = converter.convert(
        file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = converted.content

    # Some PDF types come back without any text from haystack. Only PDFs are
    # retried: pdfplumber cannot open txt/docx files, so an empty one of
    # those is returned as an empty document rather than crashing.
    if is_pdf and not (text or "").strip():
        st.write("using pdfplumber")
        with pdfplumber.open(file) as pdf:
            # extract_text() may return None for pages without a text layer.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        text = " ".join(page_texts)

    return [
        Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys)
    ]