import os
from io import BytesIO
from typing import Callable, Dict, List, Optional  # , Literal, NamedTuple, Tuple, Union

import numpy as np
import pdf2image
import PyPDF2
from datasets import logging
from PIL import Image as PIL_Image
from PIL.Image import Image
from tqdm import tqdm

logger = logging.get_logger(__name__)

MAX_PAGES = 50
MAX_PDF_SIZE = 100000000  # almost 100MB
MIN_WIDTH, MIN_HEIGHT = 150, 150

# Documents that are skipped by `nativepdf_to_pixelvalues_extractor` when the
# scope is "sample-grid" (matched against `example["id"]`).
_IMPOSSIBLE_GRID_IDS = (
    "6483941-Letter-to-John-Campbell.pdf",
    "7276809-Ocoee-Newspaper-Pages.pdf",
)


def pdf2image_image_extraction(pdf_stream) -> Optional[List[Image]]:
    """Rasterize a PDF given as raw bytes into a list of PIL images.

    Args:
        pdf_stream: The PDF content, passed straight to
            ``pdf2image.convert_from_bytes``.

    Returns:
        The list of images produced by pdf2image, or ``None`` when the
        conversion raised. The error is logged as a warning, never re-raised.
    """
    try:
        images: List[Image] = pdf2image.convert_from_bytes(pdf_stream)
    except Exception as e:
        logger.warning(f"{e}")
        return None
    return images


def _samples_single_page(inference_method) -> bool:
    """Return True for "sample*" scopes other than "sample-grid"."""
    scope = inference_method.scope
    return "sample" in scope and scope != "sample-grid"


def _is_tiny(im) -> bool:
    """Return True when *both* dimensions are below the configured minimum.

    NOTE(review): `nativepdf_to_pixelvalues_extractor` is stricter and drops an
    image when *either* dimension is too small. The difference is preserved
    here because the intended rule cannot be determined from this file.
    """
    return im.width < MIN_WIDTH and im.height < MIN_HEIGHT


def _encode_page(im, feature_extractor, inference_method):
    """Turn one page image into the entry stored in the pixel-value list.

    For "sample-grid" the raw image is kept, because the grid is assembled
    from images and only the final grid is run through the feature extractor.
    For every other scope the image is converted to RGB and encoded now.
    """
    if inference_method.scope == "sample-grid":
        return im
    return feature_extractor([im.convert("RGB")])["pixel_values"][0]


def _rasterize_or_report(example):
    """Rasterize ``example["file"]`` with pdf2image.

    `pdf2image_image_extraction` already swallows conversion errors and
    returns ``None``, so no additional ``try`` is needed here. When nothing
    comes back, the failure is printed and ``example["pages"]`` is reset to 0.

    Returns:
        The list of page images, or a falsy value when there are none.
    """
    images = pdf2image_image_extraction(example["file"])
    if not images:
        print(f"{example.get('id')} pdf2image has no images")
        example["pages"] = 0
    return images


def _finalize_pixel_values(example, pixel_values, feature_extractor, inference_method):
    """Store the collected pixel values on ``example`` according to the scope.

    - nothing collected: report it, set ``pages`` to 0 and leave
      ``pixel_values`` untouched (the callers initialise it to ``None``).
      Without this guard ``pixel_values[0]`` raises ``IndexError`` for
      "sample" scopes.
    - "sample-grid": ``inference_method.get_page_scope`` combines the raw
      images, and the result is converted to RGB and encoded.
    - other "sample*" scopes: only the first collected entry is kept.
    - anything else: all collected entries are kept.

    Returns:
        The same ``example`` object, mutated in place.
    """
    if not pixel_values:
        print(f"{example.get('id')} pdf2image has no valid images")
        example["pages"] = 0
        return example

    scope = inference_method.scope
    if scope == "sample-grid":
        grid = inference_method.get_page_scope(pixel_values)
        pixel_values = feature_extractor([grid.convert("RGB")])["pixel_values"][0]
    elif "sample" in scope:
        pixel_values = pixel_values[0]

    example["pixel_values"] = np.array(pixel_values)
    return example


def pdf_to_pixelvalues_extractor(example: Dict, feature_extractor: Callable, inference_method):
    """Extract pixel values from the images embedded in a PDF.

    Embedded images are read with PyPDF2 first. If that yields nothing (or
    raises), the PDF is rasterized with pdf2image instead. At most
    ``MAX_PAGES`` images are collected, and images whose width and height are
    both below the minimum are skipped.

    Args:
        example: Mapping with the raw PDF under ``"file"`` and, optionally, an
            ``"id"`` used in diagnostics. It is mutated in place: ``"pages"``
            and ``"pixel_values"`` are (re)set.
        feature_extractor: Callable invoked with a list holding one RGB image;
            its result is indexed as ``result["pixel_values"][0]``.
        inference_method: Object exposing a ``scope`` string and a
            ``get_page_scope(sequence)`` method.

    Returns:
        ``example``. On any failure ``"pixel_values"`` stays ``None`` and
        ``"pages"`` is 0; otherwise ``"pixel_values"`` is a numpy array.
    """
    example["pages"] = 0
    example["pixel_values"] = None

    if len(example["file"]) > MAX_PDF_SIZE:
        logger.warning(f"too large file {len(example['file'])}")
        return example

    try:
        reader = PyPDF2.PdfReader(BytesIO(example["file"]))
    except Exception as e:
        logger.warning(f"read_pdf {e}")
        return example

    example["pages"] = len(reader.pages)

    if _samples_single_page(inference_method):
        page_iterator = [inference_method.get_page_scope(reader.pages)]
    else:
        page_iterator = reader.pages

    pixel_values = []
    try:
        reached_page_limit = False
        for page in page_iterator:
            for image in page.images:
                # Check the cap before decoding so no image beyond the limit
                # is ever opened.
                if len(pixel_values) == MAX_PAGES:
                    reached_page_limit = True
                    break
                im = PIL_Image.open(BytesIO(image.data))
                if _is_tiny(im):
                    continue
                pixel_values.append(_encode_page(im, feature_extractor, inference_method))
            if reached_page_limit:
                break
    except Exception as e:
        # Any failure discards the partial result so the fallback starts clean.
        print(f"{example.get('id')} PyPDF get_images {e}")
        pixel_values = []

    if not pixel_values:  # at least try with another API
        images = _rasterize_or_report(example)
        if not images:
            return example

        # got lucky with pdf2image
        example["pages"] = len(images)
        for im in images:
            if len(pixel_values) == MAX_PAGES:
                break
            if _is_tiny(im):
                continue
            pixel_values.append(_encode_page(im, feature_extractor, inference_method))

    return _finalize_pixel_values(example, pixel_values, feature_extractor, inference_method)


def nativepdf_to_pixelvalues_extractor(example: Dict, feature_extractor: Callable, inference_method):
    """Extract pixel values by rasterizing every PDF page with pdf2image.

    Pages with a width or height below the minimum are dropped before any
    scope handling, and at most ``MAX_PAGES`` pages are encoded.

    Args:
        example: Mapping with the raw PDF under ``"file"`` and, optionally, an
            ``"id"`` used in diagnostics and for the skip list. It is mutated
            in place: ``"pages"`` and ``"pixel_values"`` are (re)set.
        feature_extractor: Callable invoked with a list holding one RGB image;
            its result is indexed as ``result["pixel_values"][0]``.
        inference_method: Object exposing a ``scope`` string and a
            ``get_page_scope(sequence)`` method.

    Returns:
        ``example``. On any failure ``"pixel_values"`` stays ``None`` and
        ``"pages"`` is 0; otherwise ``"pixel_values"`` is a numpy array.
    """
    example["pages"] = 0
    example["pixel_values"] = None

    if len(example["file"]) > MAX_PDF_SIZE:
        logger.warning(f"too large file {len(example['file'])}")
        return example

    images = _rasterize_or_report(example)
    if not images:
        return example

    # do image checks before and after
    images = [im for im in images if im.width >= MIN_WIDTH and im.height >= MIN_HEIGHT]
    skip_grid = example.get("id") in _IMPOSSIBLE_GRID_IDS and inference_method.scope == "sample-grid"
    if not images or skip_grid:
        print(f"{example.get('id')} pdf2image has no images")
        example["pages"] = 0
        return example

    example["pages"] = len(images)

    if _samples_single_page(inference_method):
        page_iterator = [inference_method.get_page_scope(images)]
    else:
        page_iterator = images

    # Every page in the iterator is kept, so capping the slice is equivalent
    # to checking the collected count on each iteration.
    pixel_values = [
        _encode_page(im, feature_extractor, inference_method)
        for im in page_iterator[:MAX_PAGES]
    ]

    return _finalize_pixel_values(example, pixel_values, feature_extractor, inference_method)