Spaces:

oceansweep
/

tldw

Running

tldw / App_Function_Libraries /PDF_Ingestion_Lib.py

ed28876 4 months ago

5.73 kB

	# PDF_Ingestion_Lib.py
	#########################################
	# Library to hold functions for ingesting PDF files.#
	#
	####################
	# Function List
	#
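	# 1. convert_pdf_to_markdown(pdf_path)
	# 2. process_and_ingest_pdf(file, title, author, keywords)
	# 3. ingest_pdf_file(file_path, title=None, author=None, keywords=None)
	# 4. process_and_cleanup_pdf(file, title, author, keywords)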
	#
	#
	####################


	# Import necessary libraries
	from datetime import datetime
	import logging
	import subprocess
	import os
	import shutil
	import tempfile


	# Import Local
	from App_Function_Libraries.SQLite_DB import add_media_with_keywords

	#######################################################################################################################
	# Function Definitions
	#

	# Convert PDF files to Markdown and ingest them into the database with Title/Author/Keywords


	# Constants
	# Largest PDF, in megabytes, that convert_pdf_to_markdown accepts; bigger files raise ValueError.
	MAX_FILE_SIZE_MB = 50
	# Timeout, in seconds, passed to subprocess.run for the external conversion script.
	CONVERSION_TIMEOUT_SECONDS = 300


	def convert_pdf_to_markdown(pdf_path):
	    """
	    Turn a PDF into Markdown text by running a helper script in a child process.

	    The script is started with the Python interpreter of a separate virtual
	    environment, and whatever it writes to standard output is handed back.

	    Args:
	        pdf_path: Path of the PDF file to convert.

	    Returns:
	        The captured standard output of the converter script, as text.

	    Raises:
	        ValueError: If the file is larger than MAX_FILE_SIZE_MB megabytes.
	        Exception: If the converter exits with a non-zero status, or does not
	            finish within CONVERSION_TIMEOUT_SECONDS seconds.
	    """
	    logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")

	    # Refuse oversized inputs before any conversion work is started
	    size_in_mb = os.path.getsize(pdf_path) / (1024 * 1024)
	    if size_in_mb > MAX_FILE_SIZE_MB:
	        raise ValueError(f"File size ({size_in_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")

	    logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")
	    # Interpreter of the other virtual environment, the script it runs, and the PDF to convert
	    command = [
	        "Helper_Scripts/marker_venv/bin/python",
	        "Helper_Scripts/PDF_Converter.py",
	        pdf_path,
	    ]

	    logging.debug("Marker: Attempting to convert PDF file to Markdown...")
	    try:
	        completed = subprocess.run(
	            command,
	            capture_output=True,
	            text=True,
	            timeout=CONVERSION_TIMEOUT_SECONDS,
	        )
	    except subprocess.TimeoutExpired:
	        raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")

	    # A zero exit status means the Markdown is on stdout; anything else is a failure
	    if completed.returncode == 0:
	        return completed.stdout
	    raise Exception(f"Conversion failed: {completed.stderr}")


	def process_and_ingest_pdf(file, title, author, keywords):
	    """
	    Copy an uploaded PDF into a temporary directory and ingest it.

	    Args:
	        file: Uploaded file object whose ``name`` attribute is the path of the
	            PDF on disk, or None when nothing was uploaded.
	        title: Title to store. When empty, the uploaded file's name without
	            its extension is used.
	        author: Author, passed through to ingest_pdf_file.
	        keywords: Keywords, passed through to ingest_pdf_file.

	    Returns:
	        A message string when no file was given or when copying/processing
	        raised an exception; otherwise the value returned by ingest_pdf_file,
	        which is a ``(message, file_path)`` tuple.
	        NOTE(review): the path in that tuple lies inside the temporary
	        directory, which no longer exists once this function has returned.
	    """
	    if file is None:
	        return "Please select a PDF file to upload."

	    try:
	        # The directory and the copy inside it are removed when the with-block exits
	        with tempfile.TemporaryDirectory() as temp_dir:
	            temp_path = os.path.join(temp_dir, "temp.pdf")

	            # Work on a copy so the uploaded file itself is never touched
	            shutil.copy(file.name, temp_path)

	            # The working copy is always named "temp.pdf", so the fallback title
	            # must come from the upload's own name; otherwise every untitled
	            # PDF would be stored with the title "temp".
	            if not title:
	                title = os.path.splitext(os.path.basename(file.name))[0]

	            return ingest_pdf_file(temp_path, title, author, keywords)
	    except Exception as e:
	        return f"Error processing PDF: {str(e)}"


	def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
	    """
	    Convert a PDF to Markdown and store the result in the media database.

	    Args:
	        file_path: Path of the PDF to convert; also recorded as the entry's URL.
	        title: Entry title. Falls back to the file name without its extension.
	        author: Entry author. Falls back to 'Unknown'.
	        keywords: Extra keywords, placed after the fixed
	            'pdf_file,markdown_converted' prefix.

	    Returns:
	        A ``(message, file_path)`` tuple. The message either confirms the
	        ingestion or describes the error that stopped it; exceptions derived
	        from Exception are reported this way rather than propagated.
	    """
	    try:
	        markdown_text = convert_pdf_to_markdown(file_path)

	        # Fill in whichever metadata the caller left empty
	        title = title or os.path.splitext(os.path.basename(file_path))[0]
	        author = author or 'Unknown'
	        keywords = f'pdf_file,markdown_converted,{keywords}' if keywords else 'pdf_file,markdown_converted'

	        # Everything that goes into the database row for this document
	        record = {
	            'url': file_path,
	            'title': title,
	            'media_type': 'document',
	            'content': markdown_text,
	            'keywords': keywords,
	            'prompt': 'No prompt for PDF files',
	            'summary': 'No summary for PDF files',
	            'transcription_model': 'None',
	            'author': author,
	            'ingestion_date': datetime.now().strftime('%Y-%m-%d'),
	        }
	        add_media_with_keywords(**record)

	        success_message = f"PDF file '{title}' converted to Markdown and ingested successfully."
	        return success_message, file_path
	    except ValueError as e:
	        # convert_pdf_to_markdown signals an oversized file with ValueError
	        logging.error(f"File size error: {str(e)}")
	        return f"Error: {str(e)}", file_path
	    except Exception as e:
	        failure_message = f"Error ingesting PDF file: {str(e)}"
	        logging.error(failure_message)
	        return failure_message, file_path


	def process_and_cleanup_pdf(file, title, author, keywords):
	    """
	    Ingest an uploaded PDF from a temporary copy, then remove the copy.

	    Args:
	        file: Uploaded file object whose ``name`` attribute is the path of the
	            PDF on disk, or None when nothing was uploaded.
	        title: Title to store. When empty, the uploaded file's name without
	            its extension is used.
	        author: Author, passed through to ingest_pdf_file.
	        keywords: Keywords, passed through to ingest_pdf_file.

	    Returns:
	        A message string: the ingestion result, or an "Error: ..." description
	        if copying or ingesting raised. If the temporary directory could not
	        be removed, a warning line is appended to that message.
	    """
	    if file is None:
	        return "No file uploaded. Please upload a PDF file."

	    temp_dir = tempfile.mkdtemp()
	    temp_file_path = os.path.join(temp_dir, "temp.pdf")

	    # Collected separately so the warning can be attached to whichever message
	    # is produced; modifying the message inside ``finally`` after a ``return``
	    # would never reach the caller.
	    cleanup_warning = ""

	    try:
	        # Copy the uploaded file to a temporary location
	        shutil.copy2(file.name, temp_file_path)

	        # The working copy is always named "temp.pdf", so the fallback title
	        # must come from the upload's own name; otherwise every untitled PDF
	        # would be stored with the title "temp".
	        if not title:
	            title = os.path.splitext(os.path.basename(file.name))[0]

	        # Only the message part of ingest_pdf_file's (message, path) result is reported
	        result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
	    except Exception as e:
	        logging.error(f"Error in processing and cleanup: {str(e)}")
	        result = f"Error: {str(e)}"
	    finally:
	        # Clean up the temporary directory and its contents, whatever happened above
	        try:
	            shutil.rmtree(temp_dir)
	            logging.info(f"Removed temporary directory: {temp_dir}")
	        except Exception as cleanup_error:
	            logging.error(f"Error during cleanup: {str(cleanup_error)}")
	            cleanup_warning = f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"

	    return result + cleanup_warning


	#
	#
	#######################################################################################################################