from typing import Callable, Dict, List, Optional from pathlib import Path import re import logging import string import streamlit as st logger = logging.getLogger(__name__) import os os.environ["TOKENIZERS_PARALLELISM"] = "false" from haystack.utils import convert_files_to_docs, fetch_archive_from_http from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter from haystack.schema import Document import pdfplumber import pandas as pd def load_document( file: str, file_name, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None, ) -> List[Document]: """ takes docx, txt and pdf files as input and extracts text as well as the filename as metadata. Since haystack does not take care of all pdf files, pdfplumber is attached to the pipeline in case the pdf extraction fails via Haystack. Returns a list of type haystack.schema.Document """ if file_name.name.endswith('.pdf'): converter = PDFToTextConverter(remove_numeric_tables=True) if file_name.name.endswith('.txt'): converter = TextConverter() if file_name.name.endswith('.docx'): converter = DocxToTextConverter() documents = [] logger.info("Converting {}".format(file_name)) # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document document = converter.convert( file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys )[0] text = document.content documents.append(Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys)) '''check if text is empty and apply different pdf processor. This can happen whith certain pdf types.''' for i in documents: if i.content == "": st.write("using pdfplumber") text = [] with pdfplumber.open(file) as pdf: for page in pdf.pages: text.append(page.extract_text()) i.content = ' '.join([page for page in text]) return documents