Spaces:

oceansweep
/

tldw

Running

File size: 5,732 Bytes

ed28876

# PDF_Ingestion_Lib.py
#########################################
# Library to hold functions for ingesting PDF files.#
#
####################
# Function List
#
# 1. convert_pdf_to_markdown(pdf_path)
# 2. ingest_pdf_file(file_path, title=None, author=None, keywords=None):
# 3.
#
#
####################


# Import necessary libraries
from datetime import datetime
import logging
import subprocess
import os
import shutil
import tempfile


# Import Local
from App_Function_Libraries.SQLite_DB import add_media_with_keywords

#######################################################################################################################
# Function Definitions
#

# Ingest a text file into the database with Title/Author/Keywords


# Constants
MAX_FILE_SIZE_MB = 50
CONVERSION_TIMEOUT_SECONDS = 300


def convert_pdf_to_markdown(pdf_path):
    """

    Convert a PDF file to Markdown by calling a script in another virtual environment.

    """

    logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")
    # Check if the file size exceeds the maximum allowed size
    file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
    if file_size_mb > MAX_FILE_SIZE_MB:
        raise ValueError(f"File size ({file_size_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")

    logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")
    # Path to the Python interpreter in the other virtual environment
    other_venv_python = "Helper_Scripts/marker_venv/bin/python"

    # Path to the conversion script
    converter_script = "Helper_Scripts/PDF_Converter.py"

    logging.debug("Marker: Attempting to convert PDF file to Markdown...")
    try:
        result = subprocess.run(
            [other_venv_python, converter_script, pdf_path],
            capture_output=True,
            text=True,
            timeout=CONVERSION_TIMEOUT_SECONDS
        )
        if result.returncode != 0:
            raise Exception(f"Conversion failed: {result.stderr}")
        return result.stdout
    except subprocess.TimeoutExpired:
        raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")


def process_and_ingest_pdf(file, title, author, keywords):
    if file is None:
        return "Please select a PDF file to upload."

    try:
        # Create a temporary directory
        with tempfile.TemporaryDirectory() as temp_dir:
            # Create a path for the temporary PDF file
            temp_path = os.path.join(temp_dir, "temp.pdf")

            # Copy the contents of the uploaded file to the temporary file
            shutil.copy(file.name, temp_path)

            # Call the ingest_pdf_file function with the temporary file path
            result = ingest_pdf_file(temp_path, title, author, keywords)

        return result
    except Exception as e:
        return f"Error processing PDF: {str(e)}"


def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
    try:
        # Convert PDF to Markdown
        markdown_content = convert_pdf_to_markdown(file_path)

        # If title is not provided, use the filename without extension
        if not title:
            title = os.path.splitext(os.path.basename(file_path))[0]

        # If author is not provided, set it to 'Unknown'
        if not author:
            author = 'Unknown'

        # If keywords are not provided, use a default keyword
        if not keywords:
            keywords = 'pdf_file,markdown_converted'
        else:
            keywords = f'pdf_file,markdown_converted,{keywords}'

        # Add the markdown content to the database
        add_media_with_keywords(
            url=file_path,
            title=title,
            media_type='document',
            content=markdown_content,
            keywords=keywords,
            prompt='No prompt for PDF files',
            summary='No summary for PDF files',
            transcription_model='None',
            author=author,
            ingestion_date=datetime.now().strftime('%Y-%m-%d')
        )

        return f"PDF file '{title}' converted to Markdown and ingested successfully.", file_path
    except ValueError as e:
        logging.error(f"File size error: {str(e)}")
        return f"Error: {str(e)}", file_path
    except Exception as e:
        logging.error(f"Error ingesting PDF file: {str(e)}")
        return f"Error ingesting PDF file: {str(e)}", file_path


def process_and_cleanup_pdf(file, title, author, keywords):
    if file is None:
        return "No file uploaded. Please upload a PDF file."

    temp_dir = tempfile.mkdtemp()
    temp_file_path = os.path.join(temp_dir, "temp.pdf")

    try:
        # Copy the uploaded file to a temporary location
        shutil.copy2(file.name, temp_file_path)

        # Process the file
        result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)

        return result
    except Exception as e:
        logging.error(f"Error in processing and cleanup: {str(e)}")
        return f"Error: {str(e)}"
    finally:
        # Clean up the temporary directory and its contents
        try:
            shutil.rmtree(temp_dir)
            logging.info(f"Removed temporary directory: {temp_dir}")
        except Exception as cleanup_error:
            logging.error(f"Error during cleanup: {str(cleanup_error)}")
            result += f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"


#
#
#######################################################################################################################