# tldw / App_Function_Libraries /PDF /PDF_Ingestion_Lib.py
# oceansweep's picture
# Upload 22 files
# e15e1c7 verified
# raw
# history blame
# 12 kB
# PDF_Ingestion_Lib.py
#########################################
# Library to hold functions for ingesting PDF files.#
#
####################
# Function List
#
# 1. extract_text_and_format_from_pdf(pdf_path)
# 2. extract_metadata_from_pdf(pdf_path)
# 3. process_and_ingest_pdf(file, title, author, keywords)
# 4. process_and_cleanup_pdf(file, title, author, keywords)
#
####################
import re
# Import necessary libraries
# Import Local
#######################################################################################################################
# Function Definitions
#
# Limits for PDF conversion.
# NOTE(review): in the part of the file visible here, both values are referenced only by the
# commented-out Marker implementation below - confirm whether anything else still uses them.
# Constants
MAX_FILE_SIZE_MB = 50  # Largest PDF (in megabytes) the Marker converter accepted before raising ValueError
CONVERSION_TIMEOUT_SECONDS = 300  # Timeout (in seconds) passed to the Marker conversion subprocess
# Marker PDF solution
# def convert_pdf_to_markdown(pdf_path):
# """
# Convert a PDF file to Markdown by calling a script in another virtual environment.
# """
#
# logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")
# # Check if the file size exceeds the maximum allowed size
# file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
# if file_size_mb > MAX_FILE_SIZE_MB:
# raise ValueError(f"File size ({file_size_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")
#
# logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")
# # Path to the Python interpreter in the other virtual environment
# other_venv_python = "Helper_Scripts/marker_venv/bin/python"
#
# # Path to the conversion script
# converter_script = "Helper_Scripts/PDF_Converter.py"
#
# logging.debug("Marker: Attempting to convert PDF file to Markdown...")
# try:
# result = subprocess.run(
# [other_venv_python, converter_script, pdf_path],
# capture_output=True,
# text=True,
# timeout=CONVERSION_TIMEOUT_SECONDS
# )
# if result.returncode != 0:
# raise Exception(f"Conversion failed: {result.stderr}")
# return result.stdout
# except subprocess.TimeoutExpired:
# raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")
#
#
# def process_and_ingest_pdf(file, title, author, keywords):
# if file is None:
# return "Please select a PDF file to upload."
#
# try:
# # Create a temporary directory
# with tempfile.TemporaryDirectory() as temp_dir:
# # Create a path for the temporary PDF file
# temp_path = os.path.join(temp_dir, "temp.pdf")
#
# # Copy the contents of the uploaded file to the temporary file
# shutil.copy(file.name, temp_path)
#
# # Call the ingest_pdf_file function with the temporary file path
# result = ingest_pdf_file(temp_path, title, author, keywords)
#
# return result
# except Exception as e:
# return f"Error processing PDF: {str(e)}"
#
#
# def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
# try:
# # Convert PDF to Markdown
# markdown_content = convert_pdf_to_markdown(file_path)
#
# # If title is not provided, use the filename without extension
# if not title:
# title = os.path.splitext(os.path.basename(file_path))[0]
#
# # If author is not provided, set it to 'Unknown'
# if not author:
# author = 'Unknown'
#
# # If keywords are not provided, use a default keyword
# if not keywords:
# keywords = 'pdf_file,markdown_converted'
# else:
# keywords = f'pdf_file,markdown_converted,{keywords}'
#
# # Add the markdown content to the database
# add_media_with_keywords(
# url=file_path,
# title=title,
# media_type='document',
# content=markdown_content,
# keywords=keywords,
# prompt='No prompt for PDF files',
# summary='No summary for PDF files',
# transcription_model='None',
# author=author,
# ingestion_date=datetime.now().strftime('%Y-%m-%d')
# )
#
# return f"PDF file '{title}' converted to Markdown and ingested successfully.", file_path
# except ValueError as e:
# logging.error(f"File size error: {str(e)}")
# return f"Error: {str(e)}", file_path
# except Exception as e:
# logging.error(f"Error ingesting PDF file: {str(e)}")
# return f"Error ingesting PDF file: {str(e)}", file_path
#
#
# def process_and_cleanup_pdf(file, title, author, keywords):
# # FIXME - Update to validate file upload/filetype is pdf....
# if file is None:
# return "No file uploaded. Please upload a PDF file."
#
# temp_dir = tempfile.mkdtemp()
# temp_file_path = os.path.join(temp_dir, "temp.pdf")
#
# try:
# # Copy the uploaded file to a temporary location
# shutil.copy2(file.name, temp_file_path)
#
# # Process the file
# result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
#
# return result
# except Exception as e:
# logging.error(f"Error in processing and cleanup: {str(e)}")
# return f"Error: {str(e)}"
# finally:
# # Clean up the temporary directory and its contents
# try:
# shutil.rmtree(temp_dir)
# logging.info(f"Removed temporary directory: {temp_dir}")
# except Exception as cleanup_error:
# logging.error(f"Error during cleanup: {str(cleanup_error)}")
# result += f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"
import logging
#
#
#######################################################################################################################
#
# Non-Marker implementation
import os
import shutil
import tempfile
from datetime import datetime
import pymupdf
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
# Bit masks for the "flags" field of a PyMuPDF text span.
# Per the PyMuPDF documentation: bit 0 = superscript, bit 1 = italic,
# bit 2 = serifed, bit 3 = monospaced, bit 4 = bold.
_SPAN_FLAG_ITALIC = 1 << 1
_SPAN_FLAG_BOLD = 1 << 4


def _span_to_markdown(span):
    """Return the text of one PyMuPDF span decorated with Markdown markup."""
    text = span["text"]
    # Whitespace-only spans carry no visible content; wrapping them would
    # emit stray markup such as "** **" or a dangling "# ".
    if not text.strip():
        return text

    font_size = span["size"]
    font_flags = span["flags"]

    # Larger fonts are treated as headings (heuristic thresholds).
    if font_size > 20:
        text = f"# {text}"
    elif font_size > 16:
        text = f"## {text}"
    elif font_size > 14:
        text = f"### {text}"

    if font_flags & _SPAN_FLAG_BOLD:
        text = f"**{text}**"
    if font_flags & _SPAN_FLAG_ITALIC:
        text = f"*{text}*"
    return text


def extract_text_and_format_from_pdf(pdf_path):
    """
    Extract text from a PDF file and convert it to Markdown, preserving formatting.

    Every page starts with a "## Page N" heading and ends with a "---"
    separator. Each text block becomes one whitespace-normalised paragraph,
    each image block becomes an "[Image]" placeholder, and spans are marked
    up as headings / bold / italic based on their font size and font flags.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The Markdown text for the whole document as a single string.

    Raises:
        Exception: Any error raised while opening or reading the PDF is
            logged and then re-raised unchanged.
    """
    try:
        chunks = []
        with pymupdf.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, 1):
                chunks.append(f"## Page {page_num}\n\n")
                for block in page.get_text("dict")["blocks"]:
                    block_type = block["type"]
                    if block_type == 0:  # Text block
                        paragraph = ""
                        for line in block["lines"]:
                            line_text = " ".join(
                                _span_to_markdown(span) for span in line["spans"]
                            ).rstrip()
                            # A trailing hyphen marks a word split across
                            # lines: drop it and join directly to the next
                            # line; otherwise separate lines with a space.
                            if line_text.endswith('-'):
                                line_text = line_text[:-1]
                            else:
                                line_text += " "
                            paragraph += line_text
                        # End of block, add paragraph
                        if paragraph:
                            # Remove extra spaces
                            paragraph = re.sub(r'\s+', ' ', paragraph).strip()
                            chunks.append(paragraph + "\n\n")
                    elif block_type == 1:  # Image block
                        chunks.append("[Image]\n\n")
                chunks.append("\n---\n\n")  # Page separator
        markdown_text = "".join(chunks)
        # Clean up hyphenated words
        markdown_text = re.sub(r'(\w+)-\s*\n(\w+)', r'\1\2', markdown_text)
        return markdown_text
    except Exception as e:
        logging.error(f"Error extracting text and formatting from PDF: {str(e)}")
        raise
def extract_metadata_from_pdf(pdf_path):
    """
    Extract metadata from a PDF file using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The document's metadata dictionary. An empty dict is returned when
        the document exposes no metadata or when the file cannot be read
        (the error is logged, not raised), so callers can always use
        ``.get`` on the result.
    """
    try:
        with pymupdf.open(pdf_path) as doc:
            # PyMuPDF documents ``metadata`` as possibly None (e.g. an
            # encrypted document); normalise so callers always get a dict.
            return doc.metadata or {}
    except Exception as e:
        logging.error(f"Error extracting metadata from PDF: {str(e)}")
        return {}
def process_and_ingest_pdf(file, title, author, keywords):
    """
    Convert an uploaded PDF to Markdown and store it in the media database.

    The upload is copied into a temporary directory, converted with
    extract_text_and_format_from_pdf, and saved via add_media_with_keywords.
    Missing title/author values are filled from the PDF metadata, falling
    back to the file name (without extension) and 'Unknown' respectively.

    Args:
        file: Uploaded file object exposing a ``name`` attribute holding the
            path of the PDF on disk, or None when nothing was uploaded.
        title: Title to store; when falsy it is derived as described above.
        author: Author to store; when falsy it is derived as described above.
        keywords: Extra comma-separated keywords appended after the default
            'pdf_file,markdown_converted' keywords; may be falsy.

    Returns:
        A human-readable status string: a success message, a prompt to
        select a file when ``file`` is None, or an error message. Errors are
        logged and reported in the return value rather than raised.
    """
    if file is None:
        return "Please select a PDF file to upload."
    try:
        # Work on a private copy so the original upload is never touched.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = os.path.join(temp_dir, "temp.pdf")
            shutil.copy(file.name, temp_path)
            # Extract text and convert to Markdown
            markdown_text = extract_text_and_format_from_pdf(temp_path)
            # Extract metadata from PDF
            metadata = extract_metadata_from_pdf(temp_path) or {}

        # PyMuPDF metadata normally contains every key with '' or None as
        # the value when a field is unset, so ``dict.get(key, default)``
        # would never reach the default; fall back on falsy values instead.
        if not title:
            title = (
                (metadata.get('title') or '').strip()
                or os.path.splitext(os.path.basename(file.name))[0]
            )
        if not author:
            author = (metadata.get('author') or '').strip() or 'Unknown'

        # If keywords are not provided, use only the default keywords
        if not keywords:
            keywords = 'pdf_file,markdown_converted'
        else:
            keywords = f'pdf_file,markdown_converted,{keywords}'

        # Add metadata-based keywords, skipping an empty or missing subject
        # so no trailing ',' or ',None' keyword is produced.
        subject = (metadata.get('subject') or '').strip()
        if subject:
            keywords += f",{subject}"

        # Add the PDF content to the database
        add_media_with_keywords(
            url=file.name,
            title=title,
            media_type='document',
            content=markdown_text,
            keywords=keywords,
            prompt='No prompt for PDF files',
            summary='No summary for PDF files',
            transcription_model='None',
            author=author,
            ingestion_date=datetime.now().strftime('%Y-%m-%d')
        )
        return f"PDF file '{title}' by {author} ingested successfully and converted to Markdown."
    except Exception as e:
        logging.error(f"Error ingesting PDF file: {str(e)}")
        return f"Error ingesting PDF file: {str(e)}"
def process_and_cleanup_pdf(file, title, author, keywords):
    """
    Entry point wrapper around process_and_ingest_pdf.

    Args:
        file: Uploaded file object, or None when nothing was uploaded.
        title: Title forwarded to process_and_ingest_pdf.
        author: Author forwarded to process_and_ingest_pdf.
        keywords: Keywords forwarded to process_and_ingest_pdf.

    Returns:
        The status string produced by process_and_ingest_pdf, a prompt to
        upload a file when ``file`` is None, or an "Error: ..." string if
        the delegated call raises.
    """
    if file is not None:
        try:
            return process_and_ingest_pdf(file, title, author, keywords)
        except Exception as err:
            logging.error(f"Error in processing and cleanup: {str(err)}")
            return f"Error: {str(err)}"
    return "No file uploaded. Please upload a PDF file."
#
# End of PDF_Ingestion_Lib.py
#######################################################################################################################