paper_qa / paper /parser.py
chansung's picture
update
928f123
raw history blame
No virus
1.77 kB
import os
import fitz
import PyPDF2
def extract_text_and_figures(pdf_path):
    """
    Extract text and embedded images from a PDF file, then delete the file.

    The text is extracted twice: once per page with PyMuPDF (fitz) and then
    once per page with PyPDF2. Both sets of results are appended to the same
    list, so the returned text list holds the PyMuPDF page texts followed by
    the PyPDF2 page texts.

    Args:
        pdf_path (str): The path to the PDF file. The file is removed from
            disk after extraction; a failed removal is reported with
            ``print`` and does not raise.

    Returns:
        tuple: A tuple containing two lists:
            * A list of extracted text blocks (one per page and per
              extraction library).
            * A list of extracted figures, each PNG-encoded as bytes.
    """
    texts = []
    figures = []

    # Use the document as a context manager so the file handle is released
    # before the file is removed below; an open handle can block deletion
    # on platforms that lock open files.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            texts.append(page.get_text("text"))  # Plain-text extraction

            for img in page.get_images():
                xref = img[0]  # Image XREF is the first item of the entry
                pix = fitz.Pixmap(doc, xref)

                # PNG can only hold grayscale or RGB data. Count the colour
                # components without the alpha channel: 4 or more means
                # CMYK, which must be converted to RGB before encoding.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                figures.append(pix.tobytes("png"))

    # Extract additional text using PyPDF2 (in case fitz didn't get everything)
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        texts.extend(page.extract_text() for page in pdf_reader.pages)

    # Best-effort cleanup of the source file; failures are reported, not raised.
    try:
        os.remove(pdf_path)
    except FileNotFoundError:
        print(f"File '{pdf_path}' not found.")
    except PermissionError:
        print(f"Unable to remove '{pdf_path}'. Check permissions.")

    return texts, figures