### Notebook for updating the PDF document

In [None]:
from ipynb.fs.defs.preprocess_data import store_documents
from ipynb.fs.defs.preprocess_data import load_documents
from langchain.docstore.document import Document
import pypdfium2 as pdfium
import cv2
import os
import pytesseract
from typing import List
import shutil

# Point pytesseract at a custom Tesseract binary when the TESSERACT_PATH
# environment variable is set. When it is unset, keep pytesseract's built-in
# default command instead of overwriting it with None, which would make every
# later OCR call fail.
pytesseract_path = os.environ.get("TESSERACT_PATH")
if pytesseract_path:
    pytesseract.pytesseract.tesseract_cmd = pytesseract_path


def _ocr_pdf_to_text(pdf_path: str, image_dir: str) -> str:
    """
    Render every page of one PDF to a PNG inside image_dir, run Tesseract on a binarized version of it and return the text of all pages, each followed by a newline.
    """
    page_texts = []
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        # exist_ok: a folder left over from an interrupted earlier run must not abort this run
        os.makedirs(image_dir, exist_ok=True)
        for page_number in range(len(pdf)):
            page = pdf.get_page(page_number)
            pil_image = page.render(
                # Presumably 300 DPI on top of the 72 DPI PDF base resolution
                scale=300 / 72,
                rotation=0,
                crop=(0, 0, 0, 0),
            ).to_pil()
            image_path = os.path.join(image_dir, f"image_{page_number + 1}.png")
            pil_image.save(image_path)
            img = cv2.imread(image_path)
            # Convert image to grayscale
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # Apply Otsu threshold to convert to a binary image
            threshold_img = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
            page_texts.append(pytesseract.image_to_string(threshold_img) + "\n")
            # Remove the image as it is already processed
            os.remove(image_path)
    finally:
        # Always release the file handle, also when OCR fails, so the PDF can be renamed or retried later
        pdf.close()
    return "".join(page_texts)


def update_pdf_documents() -> List[Document]:
    """
    Method for processing and updating documents based on the PDFs stored in input_data/PDF/files. Every PDF that was not processed yet is converted page by page to images, which are then transformed to text with Tesseract. For each PDF one document is created with the text of all pages. Afterwards the file is renamed with the suffix "_Tesseract_processed", so that it is skipped in future calls.

    If at least one PDF was processed, the complete list of documents is stored in input_data/PDF/documents/all_documents and the new ones in input_data/PDF/documents/new_documents. Finally all sub-folders of the temporary image folder are deleted.

    Returns:
        The documents created during this call (empty list if there was nothing to process).
    """

    # List for the documents created in this call
    documents_PDF = []
    # List for all documents, including the ones from earlier calls
    already_processed_documents = load_documents("./../input_data/PDF/documents/all_documents")

    PDF_images_path = "./../input_data/PDF/PDF_Images"
    directory_path = "./../input_data/PDF/files"

    # Go through each PDF file in the directory
    for file in os.listdir(directory_path):
        file_path = os.path.join(directory_path, file)
        file_name_without_pdf, extension = os.path.splitext(file)
        # Skip anything that is not a PDF file (hidden files, folders, ...) and PDFs handled in an earlier call
        if extension.lower() != ".pdf" or not os.path.isfile(file_path):
            continue
        if "Tesseract_processed" in file:
            continue

        complete_text = _ocr_pdf_to_text(file_path, os.path.join(PDF_images_path, file))

        # Create a document based on the whole text and metadata
        document_PDF = Document(page_content=complete_text, metadata={"source": file, "title": file_name_without_pdf})
        documents_PDF.append(document_PDF)
        already_processed_documents.append(document_PDF)

        # Change the filename, so that in future calls the PDF is not processed again.
        # Built from stem + extension so that it also works for ".PDF" and for names containing ".pdf" more than once.
        new_filename = f"{file_name_without_pdf}_Tesseract_processed{extension}"
        new_pdf_path = os.path.join(directory_path, new_filename)
        print(new_pdf_path)
        os.rename(file_path, new_pdf_path)

    # Store docs if new documents were processed
    if documents_PDF:
        # Store all documents, including the new ones
        store_documents(already_processed_documents, "./../input_data/PDF/documents/all_documents")
        # Store the new documents
        store_documents(documents_PDF, "./../input_data/PDF/documents/new_documents")

    # Delete the per-PDF folders inside the images folder. The same path as above is used on purpose:
    # a differently cased spelling would point to another folder on case-sensitive file systems.
    if os.path.isdir(PDF_images_path):
        for item in os.listdir(PDF_images_path):
            item_path = os.path.join(PDF_images_path, item)
            if os.path.isdir(item_path):
                # Use shutil.rmtree to delete the directory and all its contents
                shutil.rmtree(item_path)

    return documents_PDF

In [None]:
# Uncomment the call below when an update is needed because new, unprocessed files were added
# update_pdf_documents()