# src / mapping_functions.py
# bdpc's picture
# Upload 9 files
# 1ceb840
import os
from io import BytesIO
from tqdm import tqdm
import numpy as np
from typing import Callable, Dict, List # , Literal, NamedTuple, Optional, Tuple, Union
from PIL import Image as PIL_Image
from PIL.Image import Image
from datasets import logging
# Module-level logger obtained through the `datasets` logging helper.
logger = logging.get_logger(__name__)
import PyPDF2
# Upper bound on how many images the extractors collect per document.
MAX_PAGES = 50
# PDFs whose byte length exceeds this value are skipped without being parsed.
MAX_PDF_SIZE = 100000000 # 100,000,000 bytes (about 95.4 MiB)
# Minimum image width/height (pixels) used by the extractors' size filters.
MIN_WIDTH, MIN_HEIGHT = 150, 150
import pdf2image
def pdf2image_image_extraction(pdf_stream):
    """Render a PDF into PIL images using ``pdf2image.convert_from_bytes``.

    Args:
        pdf_stream: The PDF content handed straight to
            ``pdf2image.convert_from_bytes`` (callers in this module pass the
            raw bytes stored under ``example["file"]``).

    Returns:
        The list of images produced by pdf2image, or ``None`` when the
        conversion raised; the exception is logged as a warning, not re-raised.
    """
    try:
        rendered: List[Image] = pdf2image.convert_from_bytes(pdf_stream)
    except Exception as err:
        # Best-effort by design: callers treat a falsy result as "no images".
        logger.warning(f"{err}")
        return None
    return rendered
def pdf_to_pixelvalues_extractor(example, feature_extractor, inference_method):
    """Turn the images embedded in a PDF into model-ready pixel values.

    Embedded images are read with PyPDF2 first. If that yields nothing, or
    raises, the document is rendered with ``pdf2image_image_extraction``
    instead.

    Args:
        example: Mapping holding the raw PDF bytes under ``"file"`` and,
            optionally, an ``"id"`` used in diagnostic messages. It is
            modified in place.
        feature_extractor: Callable that takes a list of RGB PIL images and
            returns a mapping whose ``"pixel_values"`` entry can be indexed
            per input image.
        inference_method: Object with a ``scope`` string and a
            ``get_page_scope(items)`` method. For ``"sample-grid"`` the
            method receives the collected PIL images and its result must
            support ``.convert("RGB")``; for any other scope containing
            ``"sample"`` it receives ``reader.pages`` and its result is
            treated as a single page (presumably a sampled page -- confirm
            against the inference-method implementation).

    Returns:
        ``example`` with ``"pages"`` and ``"pixel_values"`` filled in. When
        the file is too large, cannot be parsed, or yields no usable image,
        ``"pages"`` is 0 and ``"pixel_values"`` is ``None``.
    """
    example["pages"] = 0
    example["pixel_values"] = None

    if len(example["file"]) > MAX_PDF_SIZE:
        logger.warning(f"too large file {len(example['file'])}")
        return example

    try:
        reader = PyPDF2.PdfReader(BytesIO(example["file"]))
    except Exception as e:
        logger.warning(f"read_pdf {e}")
        return example

    example["pages"] = len(reader.pages)

    scope = inference_method.scope
    is_grid = scope == "sample-grid"

    if "sample" in scope and not is_grid:
        page_iterator = [inference_method.get_page_scope(reader.pages)]
    else:
        page_iterator = reader.pages

    pixel_values = []
    try:
        for page in page_iterator:
            # Stop before touching further pages once the cap is reached.
            if len(pixel_values) >= MAX_PAGES:
                break
            for image in page.images:
                if len(pixel_values) >= MAX_PAGES:
                    break
                im = PIL_Image.open(BytesIO(image.data))
                # NOTE(review): this skips an image only when BOTH dimensions
                # are below the minimum, whereas
                # nativepdf_to_pixelvalues_extractor drops an image when
                # EITHER is. Left unchanged; confirm which one is intended.
                if im.width < MIN_WIDTH and im.height < MIN_HEIGHT:
                    continue
                # Grid mode keeps the PIL images; the grid is featurized once
                # at the end instead.
                if not is_grid:
                    im = feature_extractor([im.convert("RGB")])["pixel_values"][0]
                pixel_values.append(im)
    except Exception as e:
        # Any failure discards partial results so the fallback starts clean.
        print(f"{example.get('id')} PyPDF get_images {e}")
        pixel_values = []

    if not pixel_values:
        # At least try with another API: render the pages with pdf2image.
        try:
            images = pdf2image_image_extraction(example["file"])
        except Exception as e:
            print(f"{example.get('id')} pdf2image get_images {e}")
            images = []
        if not images:
            print(f"{example.get('id')} pdf2image has no images")
            example["pages"] = 0
            return example

        # Got lucky with pdf2image.
        example["pages"] = len(images)
        for im in images:
            if len(pixel_values) >= MAX_PAGES:
                break
            if im.width < MIN_WIDTH and im.height < MIN_HEIGHT:
                continue
            if not is_grid:
                im = feature_extractor([im.convert("RGB")])["pixel_values"][0]
            pixel_values.append(im)

        # Every rendered image may have been filtered out. Without this guard
        # the code below would index an empty list (IndexError in sample mode)
        # or build a grid from nothing. Mirrors the native extractor.
        if not pixel_values:
            print(f"{example.get('id')} pdf2image has no valid images")
            example["pages"] = 0
            return example

    if is_grid:
        grid = inference_method.get_page_scope(pixel_values)
        pixel_values = feature_extractor([grid.convert("RGB")])["pixel_values"][0]
    elif "sample" in scope:
        # Single-sample scopes keep only the first collected image.
        pixel_values = pixel_values[0]

    example["pixel_values"] = np.array(pixel_values)
    return example
def nativepdf_to_pixelvalues_extractor(example, feature_extractor, inference_method):
    """Render a PDF with pdf2image and convert the pages to pixel values.

    Args:
        example: Mapping holding the raw PDF bytes under ``"file"`` and,
            optionally, an ``"id"`` used in diagnostic messages and for the
            skip list below. It is modified in place.
        feature_extractor: Callable that takes a list of RGB PIL images and
            returns a mapping whose ``"pixel_values"`` entry can be indexed
            per input image.
        inference_method: Object with a ``scope`` string and a
            ``get_page_scope(images)`` method. For ``"sample-grid"`` its
            result must support ``.convert("RGB")``; for any other scope
            containing ``"sample"`` its result is used as the single image
            to process.

    Returns:
        ``example`` with ``"pages"`` and ``"pixel_values"`` filled in. When
        the file is too large, is on the skip list in grid mode, or yields no
        usable image, ``"pages"`` is 0 and ``"pixel_values"`` is ``None``.
    """
    # Document ids that are rejected outright in "sample-grid" mode.
    IMPOSSIBLE = (
        "6483941-Letter-to-John-Campbell.pdf",
        "7276809-Ocoee-Newspaper-Pages.pdf",
    )

    example["pages"] = 0
    example["pixel_values"] = None

    if len(example["file"]) > MAX_PDF_SIZE:
        logger.warning(f"too large file {len(example['file'])}")
        return example

    scope = inference_method.scope
    is_grid = scope == "sample-grid"

    # Reject skip-listed documents before paying for a full render; the
    # message and result are the same as when the check ran after rendering.
    if is_grid and example.get("id") in IMPOSSIBLE:
        print(f"{example.get('id')} pdf2image has no images")
        return example

    try:
        images = pdf2image_image_extraction(example["file"])
    except Exception as e:
        print(f"{example.get('id')} pdf2image get_images {e}")
        images = []

    # The extraction helper returns None on failure, hence the `or []`.
    # Images smaller than the minimum in either dimension are dropped.
    images = [
        im for im in (images or [])
        if im.width >= MIN_WIDTH and im.height >= MIN_HEIGHT
    ]
    if not images:
        print(f"{example.get('id')} pdf2image has no images")
        return example

    example["pages"] = len(images)

    if "sample" in scope and not is_grid:
        page_iterator = [inference_method.get_page_scope(images)]
    else:
        page_iterator = images

    # Cap the amount of work per document.
    selected = page_iterator[:MAX_PAGES]
    if is_grid:
        # Grid mode keeps the PIL images; the grid is featurized once below.
        pixel_values = list(selected)
    else:
        pixel_values = [
            feature_extractor([im.convert("RGB")])["pixel_values"][0]
            for im in selected
        ]

    if not pixel_values:
        print(f"{example.get('id')} pdf2image has no valid images")
        example["pages"] = 0
        return example

    if is_grid:
        grid = inference_method.get_page_scope(pixel_values)
        pixel_values = feature_extractor([grid.convert("RGB")])["pixel_values"][0]
    elif "sample" in scope:
        # Single-sample scopes hold exactly one entry; unwrap it.
        pixel_values = pixel_values[0]

    example["pixel_values"] = np.array(pixel_values)
    return example