from typing import Callable, Dict, List, Optional
from pathlib import Path
import re
import logging
import string
import os
import tempfile
import sqlite3

import streamlit as st

logger = logging.getLogger(__name__)

# Must be set before haystack (and the tokenizers it pulls in) is imported.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter
from haystack.nodes.file_converter import PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber
import pandas as pd


def _extract_pdf_text_with_pdfplumber(file_path: str) -> str:
    """Return the text of every page of the PDF at *file_path*, space-joined."""
    with pdfplumber.open(file_path) as pdf:
        # extract_text() may yield None for pages without a text layer
        # (depends on the pdfplumber version); coerce to "" so join() works.
        return ' '.join(page.extract_text() or "" for page in pdf.pages)


def load_document(
    file_path: str,
    file_name: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Convert a docx, txt or pdf file into a Haystack document.

    The converter is selected from the extension of ``file_name``
    (case-insensitive). If the Haystack PDF converter yields no usable text,
    the text is re-extracted with pdfplumber.

    Args:
        file_path: Location of the file on disk.
        file_name: Original file name; used to select the converter and
            stored in the document metadata under the key ``"name"``.
        encoding: Passed through to the converter's ``convert`` call.
        id_hash_keys: Passed through to the converter and to the resulting
            ``Document``.

    Returns:
        A list containing a single ``haystack.schema.Document``.

    Raises:
        ValueError: If ``file_name`` does not end with .pdf, .txt or .docx.
    """
    lowered_name = file_name.lower()
    is_pdf = lowered_name.endswith('.pdf')

    if is_pdf:
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lowered_name.endswith('.txt'):
        converter = TextConverter()
    elif lowered_name.endswith('.docx'):
        converter = DocxToTextConverter()
    else:
        # Previously this fell through and crashed with an UnboundLocalError.
        raise ValueError(
            "Unsupported file type for '{}': expected .pdf, .txt or .docx"
            .format(file_name)
        )

    logger.info("Converting %s", file_name)
    # PDFToTextConverter, TextConverter, and DocxToTextConverter
    # return a list containing a single Document
    converted = converter.convert(
        file_path=file_path,
        meta=None,
        encoding=encoding,
        id_hash_keys=id_hash_keys,
    )[0]
    text = converted.content

    # Certain PDF types come back empty from Haystack (or with nothing but
    # whitespace / page separators). Only PDFs can be retried with
    # pdfplumber; an empty txt/docx file is simply empty.
    if is_pdf and not (text or "").strip():
        with st.spinner("using pdfplumber"):
            text = _extract_pdf_text_with_pdfplumber(file_path)

    # Build the Document only once the final text is known, so anything
    # Haystack derives from the content at construction time (presumably the
    # document id -- confirm against the Haystack version in use) reflects
    # the real text rather than the empty first attempt.
    return [
        Document(
            content=text,
            meta={"name": file_name},
            id_hash_keys=id_hash_keys,
        )
    ]