# Source: Hugging Face Spaces, GIZ / SDSN-demo, udfPreprocess/docPreprocessing.py
# Commit 49a314a (prashant): "ver0.2 udfpreprocess update"
# Page details at capture: Space running on "CPU Upgrade"; file size 2.35 kB
from typing import Callable, Dict, List, Optional
from pathlib import Path
import re
import logging
import string
import streamlit as st
# Module-level logger named after this module; used by load_document.
logger = logging.getLogger(__name__)
import os
# Set before the haystack imports that follow, so the variable is already in
# the environment when those packages load.
# NOTE(review): presumably disables Hugging Face tokenizers parallelism to
# avoid its fork warning -- confirm this is still required.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter
from haystack.nodes.file_converter import PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber
import pandas as pd
import tempfile
import sqlite3
def load_document(
    file_path: str,
    file_name,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Convert a docx, txt or pdf file into a haystack Document.

    The text is extracted with the haystack converter that matches the
    extension of ``file_name`` (matched case-insensitively), and the file
    name is stored as metadata. Since haystack does not take care of all
    pdf files, pdfplumber is used as a fallback when the haystack
    extraction of a pdf comes back empty.

    Params
    ----------
    file_path: location of the file on disk, handed to the converter
        (and to pdfplumber for the fallback).
    file_name: name of the file; its extension selects the converter and
        it is stored under the "name" key of the Document metadata.
    encoding: forwarded unchanged to the haystack converter.
    id_hash_keys: forwarded unchanged to the converter and to the
        returned Document.

    Returns a list of type haystack.schema.Document with a single element.

    Raises ValueError if file_name does not end in .pdf, .txt or .docx.
    """
    lowered_name = file_name.lower()
    is_pdf = lowered_name.endswith('.pdf')

    if is_pdf:
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lowered_name.endswith('.txt'):
        converter = TextConverter()
    elif lowered_name.endswith('.docx'):
        converter = DocxToTextConverter()
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # `converter` further down.
        raise ValueError(
            "Unsupported file type for {!r}; expected a .pdf, .txt or "
            ".docx file".format(file_name)
        )

    logger.info("Converting %s", file_name)
    # PDFToTextConverter, TextConverter, and DocxToTextConverter
    # return a list containing a single Document
    document = converter.convert(
        file_path=file_path, meta=None,
        encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = document.content

    # Check if the text is empty and apply a different pdf processor.
    # This can happen with certain pdf types. Only pdf files are retried:
    # pdfplumber cannot open an (empty) txt or docx file.
    if is_pdf and not text:
        with st.spinner("using pdfplumber"):
            with pdfplumber.open(file_path) as pdf:
                # extract_text() can return None for a page without
                # extractable text (seen in some pdfplumber versions),
                # which str.join would reject.
                text = ' '.join(
                    page.extract_text() or "" for page in pdf.pages
                )

    # The Document is built only after the fallback so that anything
    # haystack derives from the content at construction time (such as the
    # generated id) is based on the final text, not on the empty string.
    return [Document(content=text,
                     meta={"name": file_name},
                     id_hash_keys=id_hash_keys)]