|
import os |
|
from io import BytesIO |
|
from tqdm import tqdm |
|
import numpy as np |
|
from typing import Callable, Dict, List |
|
from PIL import Image as PIL_Image |
|
from PIL.Image import Image |
|
|
|
from datasets import logging |
|
|
|
logger = logging.get_logger(__name__) |
|
import PyPDF2 |
|
|
|
# Cap on how many images are collected for a single document.
MAX_PAGES = 50

# Largest accepted PDF payload, measured with len() on the raw file content.
MAX_PDF_SIZE = 100_000_000

# Size thresholds compared against each image's width and height when
# deciding whether an image is too small to be useful.
MIN_WIDTH = 150
MIN_HEIGHT = 150
|
import pdf2image |
|
|
|
|
|
def pdf2image_image_extraction(pdf_stream):
    """Rasterize a PDF held in memory with ``pdf2image``.

    Args:
        pdf_stream: Raw PDF content, handed unchanged to
            ``pdf2image.convert_from_bytes``.

    Returns:
        The list of images produced by ``pdf2image``, or ``None`` when the
        conversion raised. The failure is logged as a warning and never
        propagated, so callers must be prepared for a ``None`` result.
    """
    try:
        rendered = pdf2image.convert_from_bytes(pdf_stream)
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        logger.warning(f"{e}")
        return None
    return rendered
|
|
|
|
|
def pdf_to_pixelvalues_extractor(example, feature_extractor, inference_method):
    """Fill ``example`` with pixel values taken from the PDF in ``example["file"]``.

    Images embedded in the PDF are extracted with PyPDF2 first. When that
    yields nothing, or raises, the document is rasterized with
    ``pdf2image_image_extraction`` and the rendered pages are used instead.

    How the collected images are turned into ``example["pixel_values"]``
    depends on ``inference_method.scope``:

    * ``"sample-grid"``: the raw images are passed to
      ``inference_method.get_page_scope`` and the single object it returns is
      converted to RGB and run through ``feature_extractor``.
    * any other scope containing ``"sample"``: only the first collected entry
      is kept.
    * anything else: all collected entries (at most ``MAX_PAGES``) are kept.

    Args:
        example: Mapping holding the PDF content under ``"file"`` and,
            optionally, an ``"id"`` that is only used in diagnostics. It is
            modified in place.
        feature_extractor: Callable invoked with a one-element list of RGB
            images; the first item of the result's ``"pixel_values"`` entry
            is used.
        inference_method: Object exposing a ``scope`` string and a
            ``get_page_scope`` method.

    Returns:
        The same ``example`` object. ``"pixel_values"`` is a NumPy array on
        success. When the file is larger than ``MAX_PDF_SIZE``, cannot be read,
        or yields no usable image, ``"pixel_values"`` is ``None`` and
        ``"pages"`` is 0.
    """
    example["pages"] = 0
    example["pixel_values"] = None

    if len(example["file"]) > MAX_PDF_SIZE:
        logger.warning(f"too large file {len(example['file'])}")
        return example

    try:
        reader = PyPDF2.PdfReader(BytesIO(example["file"]))
        # Counting pages touches the parsed document as well, so it is kept
        # under the same guard as opening the reader.
        example["pages"] = len(reader.pages)
    except Exception as e:
        logger.warning(f"read_pdf {e}")
        return example

    scope = inference_method.scope
    as_grid = scope == "sample-grid"

    if "sample" in scope and not as_grid:
        # Single-sample scopes work on one page chosen by the inference method.
        page_iterator = [inference_method.get_page_scope(reader.pages)]
    else:
        page_iterator = reader.pages

    def encode(im):
        # The grid scope needs the raw images; every other scope stores the
        # feature-extracted version right away.
        if as_grid:
            return im
        return feature_extractor([im.convert("RGB")])["pixel_values"][0]

    pixel_values = []
    try:
        for page in page_iterator:
            # Stop before touching another page once the cap is reached, so a
            # late extraction error cannot discard what was already collected.
            if len(pixel_values) == MAX_PAGES:
                break
            for image in page.images:
                if len(pixel_values) == MAX_PAGES:
                    break
                im = PIL_Image.open(BytesIO(image.data))
                # NOTE(review): an image is skipped only when BOTH dimensions
                # are below the minimum, whereas
                # nativepdf_to_pixelvalues_extractor rejects images when
                # EITHER is. Confirm which rule is intended.
                if im.width < MIN_WIDTH and im.height < MIN_HEIGHT:
                    continue
                pixel_values.append(encode(im))
    except Exception as e:
        print(f"{example.get('id')} PyPDF get_images {e}")
        pixel_values = []

    if not pixel_values:
        # Fallback: no usable embedded image, so render the pages instead.
        # NOTE(review): for single-sample scopes this renders every page and
        # the first usable one is kept below, which is not necessarily the
        # page picked by get_page_scope above.
        try:
            images = pdf2image_image_extraction(example["file"])
        except Exception as e:
            print(f"{example.get('id')} pdf2image get_images {e}")
            images = []

        if not images:
            print(f"{example.get('id')} pdf2image has no images")
            example["pages"] = 0
            return example

        example["pages"] = len(images)
        for im in images:
            if len(pixel_values) == MAX_PAGES:
                break
            if im.width < MIN_WIDTH and im.height < MIN_HEIGHT:
                continue
            pixel_values.append(encode(im))

        if not pixel_values:
            # Every rendered page was filtered out. Bail out here rather than
            # failing on pixel_values[0] or on an empty grid further down.
            print(f"{example.get('id')} pdf2image has no valid images")
            example["pages"] = 0
            return example

    if as_grid:
        grid = inference_method.get_page_scope(pixel_values)
        pixel_values = feature_extractor([grid.convert("RGB")])["pixel_values"][0]
    elif "sample" in scope:
        pixel_values = pixel_values[0]
    example["pixel_values"] = np.array(pixel_values)
    return example
|
|
|
|
|
def nativepdf_to_pixelvalues_extractor(example, feature_extractor, inference_method):
    """Fill ``example`` with pixel values from pages rendered by pdf2image.

    The PDF in ``example["file"]`` is rasterized with
    ``pdf2image_image_extraction``. Rendered pages narrower than ``MIN_WIDTH``
    or shorter than ``MIN_HEIGHT`` are dropped, and at most ``MAX_PAGES``
    pages are used.

    How the pages are turned into ``example["pixel_values"]`` depends on
    ``inference_method.scope``:

    * ``"sample-grid"``: the raw pages are passed to
      ``inference_method.get_page_scope`` and the single object it returns is
      converted to RGB and run through ``feature_extractor``.
    * any other scope containing ``"sample"``: ``get_page_scope`` picks one
      page from the rendered pages and only that page is feature-extracted.
    * anything else: every kept page is feature-extracted.

    Args:
        example: Mapping holding the PDF content under ``"file"`` and,
            optionally, an ``"id"`` used for diagnostics and for the skip
            list. It is modified in place.
        feature_extractor: Callable invoked with a one-element list of RGB
            images; the first item of the result's ``"pixel_values"`` entry
            is used.
        inference_method: Object exposing a ``scope`` string and a
            ``get_page_scope`` method.

    Returns:
        The same ``example`` object. ``"pixel_values"`` is a NumPy array and
        ``"pages"`` the number of kept pages on success. When the document is
        skip-listed, larger than ``MAX_PDF_SIZE``, or yields no usable page,
        ``"pixel_values"`` is ``None`` and ``"pages"`` is 0.
    """
    # Documents that are never processed under the "sample-grid" scope.
    IMPOSSIBLE = (
        "6483941-Letter-to-John-Campbell.pdf",
        "7276809-Ocoee-Newspaper-Pages.pdf",
    )

    example["pages"] = 0
    example["pixel_values"] = None

    scope = inference_method.scope
    as_grid = scope == "sample-grid"

    # Reject skip-listed documents before any work is done on the payload;
    # rendering them first would be wasted effort since they are dropped anyway.
    if as_grid and example.get("id") in IMPOSSIBLE:
        print(f"{example.get('id')} pdf2image has no images")
        return example

    if len(example["file"]) > MAX_PDF_SIZE:
        logger.warning(f"too large file {len(example['file'])}")
        return example

    try:
        images = pdf2image_image_extraction(example["file"])
    except Exception as e:
        print(f"{example.get('id')} pdf2image get_images {e}")
        images = []

    # Keep only pages that reach the minimum size in both dimensions. A failed
    # rendering (None or empty result) ends up as an empty list as well.
    images = [
        im for im in (images or [])
        if im.width >= MIN_WIDTH and im.height >= MIN_HEIGHT
    ]
    if not images:
        print(f"{example.get('id')} pdf2image has no images")
        return example

    example["pages"] = len(images)

    if "sample" in scope and not as_grid:
        # Single-sample scopes work on one page chosen by the inference method.
        page_iterator = [inference_method.get_page_scope(images)]
    else:
        page_iterator = images

    selected = page_iterator[:MAX_PAGES]
    if as_grid:
        # The grid is assembled from the raw images further down.
        pixel_values = list(selected)
    else:
        pixel_values = [
            feature_extractor([im.convert("RGB")])["pixel_values"][0]
            for im in selected
        ]

    if not pixel_values:
        print(f"{example.get('id')} pdf2image has no valid images")
        example["pages"] = 0
        return example

    if as_grid:
        grid = inference_method.get_page_scope(pixel_values)
        pixel_values = feature_extractor([grid.convert("RGB")])["pixel_values"][0]
    elif "sample" in scope:
        pixel_values = pixel_values[0]
    example["pixel_values"] = np.array(pixel_values)
    return example
|
|