Spaces:

vincentlo
/

document-ocr-demo

Runtime error

App Files Files Community

vincentlo committed on Apr 13

Commit

2548af8

•

1 Parent(s): 0b2749c

Upload folder using huggingface_hub

Browse files

Files changed (31) hide show

.dockerignore +1 -0
.env.template +2 -0
.gitignore +7 -0
.pytest_cache/.gitignore +2 -0
.pytest_cache/CACHEDIR.TAG +4 -0
.pytest_cache/README.md +8 -0
.pytest_cache/v/cache/lastfailed +12 -0
.pytest_cache/v/cache/nodeids +23 -0
.pytest_cache/v/cache/stepwise +1 -0
Dockerfile +30 -0
Makefile +21 -0
README.md +87 -7
compose.yaml +9 -0
core-0.1.0-py3-none-any.whl +0 -0
poetry.lock +0 -0
pyproject.toml +26 -0
src/__init__.py +0 -0
src/app.py +52 -0
src/extractor/__init__.py +1 -0
src/extractor/content.py +62 -0
src/extractor/extractor.py +35 -0
src/extractor/keypair.py +81 -0
src/utils.py +52 -0
tests/__init__.py +0 -0
tests/assets/sample.jpeg +0 -0
tests/assets/sample.pdf +0 -0
tests/conftest.py +30 -0
tests/test_content.py +25 -0
tests/test_extractor.py +44 -0
tests/test_keypair.py +29 -0
tests/test_utils.py +62 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

.env.template ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ AZURE_ENDPOINT=<YOUR_ENDPOINT>
2	+ AZURE_KEY=<YOUR_KEY>

.gitignore ADDED Viewed

	@@ -0,0 +1,7 @@

+.venv
+flagged
+__pycache__
+# pytest's cache directory must not be committed (see its own README).
+.pytest_cache
+scratch
+.env
+todo.txt
+notebooks

.pytest_cache/.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Created by pytest automatically.
2	+ *

.pytest_cache/CACHEDIR.TAG ADDED Viewed

	@@ -0,0 +1,4 @@

+Signature: 8a477f597d28d172789f06886806bc55
+# This file is a cache directory tag created by pytest.
+# For information about cache directory tags, see:
+#	https://bford.info/cachedir/spec.html

.pytest_cache/README.md ADDED Viewed

	@@ -0,0 +1,8 @@

+# pytest cache directory #
+This directory contains data from the pytest's cache plugin,
+which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
+**Do not** commit this to version control.
+See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.

.pytest_cache/v/cache/lastfailed ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "tests/test_integration.py::test_extract_image_content": true,
+  "tests/test_integration.py::test_extract_pdf_content": true,
+  "tests/test_integration.py::test_extract_image_key_pair": true,
+  "tests/test_integration.py::test_extract_pdf_key_pair": true,
+  "tests/test_extractor.py::test_extract_image_content": true,
+  "tests/test_extractor.py::test_extract_pdf_content": true,
+  "tests/test_extractor.py::test_extract_image_key_pair": true,
+  "tests/test_extractor.py::test_extract_pdf_key_pair": true,
+  "tests/units/test_content.py": true,
+  "tests/units/test_keypair.py": true
+}

.pytest_cache/v/cache/nodeids ADDED Viewed

	@@ -0,0 +1,23 @@

+[
+  "tests/test_content.py::test_extract_image_content",
+  "tests/test_content.py::test_extract_pdf_content",
+  "tests/test_extractor.py::test_extract_image_content",
+  "tests/test_extractor.py::test_extract_image_in_content_mode",
+  "tests/test_extractor.py::test_extract_image_in_keypair_mode",
+  "tests/test_extractor.py::test_extract_image_key_pair",
+  "tests/test_extractor.py::test_extract_pdf_content",
+  "tests/test_extractor.py::test_extract_pdf_in_content_mode",
+  "tests/test_extractor.py::test_extract_pdf_in_keypair_mode",
+  "tests/test_extractor.py::test_extract_pdf_key_pair",
+  "tests/test_integration.py::test_extract_image_content",
+  "tests/test_integration.py::test_extract_image_key_pair",
+  "tests/test_integration.py::test_extract_pdf_content",
+  "tests/test_integration.py::test_extract_pdf_key_pair",
+  "tests/test_keypair.py::test_extract_image_key_pair",
+  "tests/test_keypair.py::test_extract_pdf_key_pair",
+  "tests/test_utils.py::test_determine_file_type",
+  "tests/test_utils.py::test_get_PIL_image_from_path",
+  "tests/test_utils.py::test_get_bytes_from_path",
+  "tests/test_utils.py::test_get_images_from_one_image_path",
+  "tests/test_utils.py::test_get_images_from_pdf_path"
+]

.pytest_cache/v/cache/stepwise ADDED Viewed

	@@ -0,0 +1 @@


1	+ []

Dockerfile ADDED Viewed

	@@ -0,0 +1,30 @@

+# ---- Build stage: install the locked dependencies into /app/.venv ----
+FROM python:3.10 AS builder
+RUN pip install poetry==1.7.1
+ENV POETRY_NO_INTERACTION=1 \
+    POETRY_VIRTUALENVS_IN_PROJECT=1 \
+    POETRY_VIRTUALENVS_CREATE=1 \
+    POETRY_CACHE_DIR=/tmp/poetry_cache
+WORKDIR /app
+# pyproject.toml declares `core` as a path dependency on this wheel, so the
+# wheel has to be in the build stage before `poetry install` can resolve it.
+COPY pyproject.toml poetry.lock core-0.1.0-py3-none-any.whl ./
+RUN touch README.md
+# --no-root: only dependencies are installed here; the application sources are
+# copied into the runtime stage below (same flag as the Makefile `init` target).
+RUN poetry install --only main --no-root && rm -rf $POETRY_CACHE_DIR
+# ---- Runtime stage: slim image with the prebuilt virtualenv and the sources ----
+FROM python:3.10-slim AS runtime
+# PYTHONPATH is the absolute project root so that `src.*` imports resolve.
+ENV VIRTUAL_ENV=/app/.venv \
+    PATH="/app/.venv/bin:$PATH" \
+    PYTHONPATH="/app"
+COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
+WORKDIR /app
+COPY ./src ./src/
+EXPOSE 7860
+CMD ["python", "./src/app.py"]

Makefile ADDED Viewed

	@@ -0,0 +1,21 @@

+# None of these targets produce a file of the same name; declaring them phony
+# keeps them runnable even if a file called e.g. `stop` or `init` exists.
+.PHONY: update-image-and-run run-service stop run-dev run-local-public linter init
+# Remove the containers and their images, then rebuild and start in the background.
+update-image-and-run:
+	docker compose down --rmi all
+	docker compose up --detach
+# Restart the service: stop any running containers first, then start detached.
+run-service: stop
+	docker compose up --detach
+stop:
+	docker compose down
+# Run the app through the gradio CLI.
+run-dev:
+	gradio src/app.py
+# Run locally with the -s flag, which asks the app to create a public link.
+run-local-public: init
+	poetry run python src/app.py -s
+linter:
+	poetry run isort .
+init:
+	poetry install --no-root

README.md CHANGED Viewed

@@ -1,12 +1,92 @@
 ---
-title: Document Ocr Demo
-emoji: 🔥
-colorFrom: yellow
-colorTo: purple
 sdk: gradio
 sdk_version: 4.26.0
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: document-ocr-demo
+app_file: src/app.py
 sdk: gradio
 sdk_version: 4.26.0
 ---
+# Document OCR Demo
+This repository contains a web demo for extracting text content and key-value pairs from documents (images and PDFs) using Azure Document Intelligence.
+## Getting Started
+### Prerequisites
+- Git
+- Docker and Docker Compose
+### Installation
+1. **Clone the repository:**
+   ```bash
+   git clone git@github.com:vincent-lo-greenaitech/document-ocr-demo.git
+   ```
+2. **Configure environment variables:**
+   Locate the `.env.template` file in the repository, replace the placeholders with the actual Azure endpoint and key, and rename the file to `.env`.
+   For example, convert:
+   **.env.template**
+   ```plaintext
+   AZURE_ENDPOINT=<YOUR_ENDPOINT>
+   AZURE_KEY=<YOUR_KEY>
+   ```
+   To:
+   **.env**
+   ```plaintext
+   AZURE_ENDPOINT="https://123.com"
+   AZURE_KEY="abc123kljfdkkvvs"
+   ```
+3. **Start the server with Docker Compose:**
+   Ensure `docker compose` is installed and available, then run the following command:
+   ```bash
+   make run-service
+   ```
+   Or:
+   ```bash
+   docker compose up --detach
+   ```
+4. **Access the web demo:**
+   The web demo should now be accessible at `http://localhost:7860` (or the configured port).
+### Shutting Down
+To shut down the server and clean up Docker containers, use the following command:
+```bash
+make stop
+```
+Or:
+```bash
+docker compose down
+```
+## Update image
+To update the image and run the server again, use the following command:
+```bash
+make update-image-and-run
+```
+## Development
+### Managing Python Dependencies
+This project uses `poetry` for managing Python dependencies. It is recommended to install `poetry` through `pipx` for an isolated setup. Refer to the [official Poetry documentation](https://python-poetry.org/docs/) for more detailed instructions.

compose.yaml ADDED Viewed

	@@ -0,0 +1,9 @@

+# Single-service Compose file: builds the image from the Dockerfile in this
+# directory and publishes the app's port 7860 on the host.
+version: '3.8'
+services:
+  app:
+    build: .
+    ports:
+      - "7860:7860"
+    environment:
+      # Forwarded into the container; src/app.py reads both with os.environ[...].
+      # NOTE(review): presumably interpolated from the `.env` file described in
+      # the README - confirm against the Compose version in use.
+      AZURE_ENDPOINT: ${AZURE_ENDPOINT}
+      AZURE_KEY: ${AZURE_KEY}

core-0.1.0-py3-none-any.whl ADDED Viewed

Binary file (4.69 kB). View file

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,26 @@

+[tool.poetry]
+name = "document-ocr-demo"
+version = "0.1.0"
+description = ""
+authors = ["Vincent Lo <vincentlo@greenaitech.com>"]
+readme = "README.md"
+[tool.poetry.dependencies]
+python = "^3.10"
+gradio = "^4.19.2"
+azure-ai-formrecognizer = "3.3.0"
+pymupdf = "^1.23.25"
+core = { path = "./core-0.1.0-py3-none-any.whl" }
+python-dotenv = "^1.0.1"
+# Development-only tools (not installed by `poetry install --only main`).
+[tool.poetry.group.dev.dependencies]
+isort = "^5.13.2"
+pytest = "^8.0.2"
+# NOTE(review): python-dotenv is already listed under [tool.poetry.dependencies];
+# this entry looks redundant. Re-run `poetry lock` if it is removed.
+python-dotenv = "^1.0.1"
+jupyter = "^1.0.0"
+pandas = "^2.2.1"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"

src/__init__.py ADDED Viewed

File without changes

src/app.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import argparse
+import os
+from typing import Literal
+import gradio as gr
+from PIL.Image import Image
+from extractor import AzureExtractor
+from utils import get_images_from_path
+from dotenv import load_dotenv
+# Load variables from a local .env file into the process environment.
+load_dotenv()
+# Command-line options; the -s/--share flag is passed to demo.launch(share=...).
+parser = argparse.ArgumentParser()
+parser.add_argument('-s', '--share', action='store_true', help='Create a public link')
+args = parser.parse_args()
+# Both variables are required: a missing one raises KeyError while this module loads.
+endpoint = os.environ['AZURE_ENDPOINT']
+key = os.environ['AZURE_KEY']
+# One extractor instance, created at startup and used by handle_input.
+extractor = AzureExtractor(endpoint=endpoint, key=key)
+def handle_input(
+    extract_mode: Literal['Content', 'Key Pairs'], file_input: str
+) -> tuple[list[Image], dict[str, str]]:
+    try:
+        output_images = get_images_from_path(file_input)
+        mode_mapping = {'Content': 'content', 'Key Pairs': 'key_pair'}
+        result = extractor.extract(file_input, mode=mode_mapping[extract_mode])
+        status = 'Success'
+    except Exception as err:
+        status = f'Error: {err}'
+        output_images = None
+        result = None
+    return status, output_images, result
+# Input components: the uploaded file (image or PDF) and the extraction mode.
+file_input = gr.File(file_types=['image', '.pdf'])
+extract_mode = gr.Dropdown(
+    ['Content', 'Key Pairs'], value='Content', label='Text extract mode'
+)
+# Output components, in the same order as the tuple returned by handle_input.
+status_box = gr.Textbox(label='Status')
+image_gallery = gr.Gallery(label='Input Preview')
+result = gr.Json(label='result')
+# `inputs` order matches handle_input's parameters: (extract_mode, file_input).
+demo = gr.Interface(
+    fn=handle_input,
+    inputs=[extract_mode, file_input],
+    outputs=[status_box, image_gallery, result],
+)
+# Listen on all interfaces on port 7860; `share` comes from the -s/--share flag.
+demo.launch(server_name='0.0.0.0', server_port=7860, share=args.share)

src/extractor/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .extractor import AzureExtractor

src/extractor/content.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import io
+from typing import Protocol
+import PIL
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+class ContentExtractor(Protocol):
+    """
+    An interface for extracting text content from an image or a pdf
+    """
+    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
+        """Extract content from an image
+        Args:
+            image: A PIL Image
+        Returns:
+            The content of the image in a string inside a dictionary with key 'content'
+        """
+        ...
+    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
+        """Extract content from a pdf
+        Args:
+            pdf: A pdf file in bytes
+        Returns:
+            The content of the pdf in a string inside a dictionary with key 'content'
+        """
+        ...
+class AzureContentExtractor(ContentExtractor):
+    def __init__(self, endpoint: str, key: str):
+        self._client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))
+    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
+        return self._get_result_from_bytes(AzureContentExtractor._PIL_to_bytes(image))
+    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
+        return self._get_result_from_bytes(pdf)
+    def _get_result_from_bytes(self, bytes_obj: bytes) -> dict[str, str]:
+        """Obtain prediction from Azure given a bytes object"""
+        poller = self._client.begin_analyze_document("prebuilt-read", document=bytes_obj)
+        result = poller.result()
+        return {'content': result.content}
+    @staticmethod
+    def _PIL_to_bytes(image: PIL.Image.Image) -> bytes:
+        """Convert a PIL image to bytes"""
+        if image.format is None:
+            image.format = 'JPEG'
+        with io.BytesIO() as output:
+            image.save(output, format=image.format)
+            return output.getvalue()

src/extractor/extractor.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from typing import Literal
+import sys
+from pathlib import Path
+sys.path.append(str(Path(__file__).parent.parent.parent))
+from src.utils import determine_file_type, get_bytes_from_path, get_PIL_image_from_path
+from .content import AzureContentExtractor
+from .keypair import AzureKeyPairsExtractor
+class AzureExtractor:
+    def __init__(self, endpoint: str, key: str):
+        self._content_extractor = AzureContentExtractor(endpoint, key)
+        self._keypair_extractor = AzureKeyPairsExtractor(endpoint, key)
+        self._extractor = None
+    def extract(self, file_path: str, mode: Literal['content', 'key_pair']) -> dict[str, str]:
+        self._set_extractor(mode)
+        file_type = determine_file_type(file_path)
+        if file_type == 'pdf':
+            pdf_bytes = get_bytes_from_path(file_path)
+            result = self._extractor.extract_pdf(pdf_bytes)
+        elif file_type == 'image':
+            image = get_PIL_image_from_path(file_path)
+            result = self._extractor.extract_image(image)
+        return result
+    def _set_extractor(self, mode: Literal['content', 'key_pair']):
+        extractor_mapping = {
+            'content': self._content_extractor,
+            'key_pair': self._keypair_extractor,
+        }
+        self._extractor = extractor_mapping[mode]

src/extractor/keypair.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import io
+from typing import Protocol
+import PIL
+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+class KeyPairExtractor(Protocol):
+    """
+    An interface for extracting key pairs from an image or a pdf
+    """
+    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
+        """Extract key pairs from an image
+        Args:
+            image: A PIL Image
+        Returns:
+            A dictionary mapping from fields to values
+        """
+        ...
+    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
+        """Extract key pairs from a pdf
+        Args:
+            pdf: A pdf file in bytes
+        Returns:
+            A dictionary mapping from fields to values
+        """
+        ...
+class FakeExtractor(KeyPairExtractor):
+    """
+    A fake implementation of `Extractor`
+    """
+    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
+        return {'field1': 'value 1', 'field2': 'value 2'}
+class AzureKeyPairsExtractor(KeyPairExtractor):
+    """
+    An Azure implementation of `KeyPairExtractor`
+    """
+    def __init__(self, endpoint: str, key: str):
+        self._client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))
+    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
+        return self._get_result_from_bytes(AzureKeyPairsExtractor._PIL_to_bytes(image))
+    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
+        return self._get_result_from_bytes(pdf)
+    def _get_result_from_bytes(self, bytes_obj: bytes):
+        """Obtain prediction from Azure given a bytes object"""
+        poller = self._client.begin_analyze_document("prebuilt-document", document=bytes_obj)
+        result = poller.result()
+        result_dict = {}
+        for kv_pair in result.key_value_pairs:
+            if kv_pair.key and kv_pair.value:
+                result_dict[kv_pair.key.content] = kv_pair.value.content
+            else:
+                result_dict[kv_pair.key.content] = ''
+        return result_dict
+    @staticmethod
+    def _PIL_to_bytes(image: PIL.Image.Image) -> bytes:
+        """Convert a PIL image to bytes"""
+        if image.format is None:
+            image.format = 'JPEG'
+        with io.BytesIO() as output:
+            image.save(output, format=image.format)
+            return output.getvalue()

src/utils.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import io
+import os
+from typing import Literal
+import fitz
+from PIL import Image
+def determine_file_type(file_path: str) -> str:
+    import os
+    _, file_extension = os.path.splitext(file_path)
+    if file_extension.lower() in ['.pdf']:
+        return 'pdf'
+    elif file_extension.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff']:
+        return 'image'
+    else:
+        supported_types = ', '.join(['PDF', 'JPG', 'JPEG', 'PNG', 'BMP', 'GIF', 'TIFF'])
+        raise ValueError(
+            f"Unsupported file type: {file_extension}. "
+            f"Only the following file types are supported: {supported_types}"
+        )
+def get_bytes_from_path(file_path: str) -> bytes:
+    with open(file_path, 'rb') as file:
+        file_bytes = file.read()
+    return file_bytes
+def get_PIL_image_from_path(file_path: str) -> Image.Image:
+    image = Image.open(file_path)
+    return image
+def get_images_from_path(file_path: str) -> list[Image.Image]:
+    file_type = determine_file_type(file_path)
+    if file_type == 'image':
+        return [get_PIL_image_from_path(file_path)]
+    elif file_type == 'pdf':
+        doc = fitz.open(file_path)
+        images = []
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+            pix = page.get_pixmap()
+            img_data = pix.tobytes("ppm")
+            img = Image.open(io.BytesIO(img_data))
+            images.append(img)
+        doc.close()
+        return images

tests/__init__.py ADDED Viewed

File without changes

tests/assets/sample.jpeg ADDED Viewed

tests/assets/sample.pdf ADDED Viewed

Binary file (80.3 kB). View file

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from pathlib import Path
+import PIL.Image
+import pytest
+from dotenv import load_dotenv
+from PIL.Image import Image
+load_dotenv()
+asset_dir = Path(__file__).parent / 'assets'
+@pytest.fixture
+def image() -> Image:
+    return PIL.Image.open(asset_dir / 'sample.jpeg')
+@pytest.fixture
+def pdf() -> bytes:
+    with open(asset_dir / 'sample.pdf', 'rb') as file:
+        return file.read()
+@pytest.fixture
+def image_path() -> str:
+    return str(asset_dir / 'sample.jpeg')
+@pytest.fixture
+def pdf_path() -> str:
+    return str(asset_dir / 'sample.pdf')

tests/test_content.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import os
+import pytest
+from src.extractor.content import AzureContentExtractor
+@pytest.fixture
+def azure_content_extractor():
+    return AzureContentExtractor(endpoint=os.environ['AZURE_ENDPOINT'], key=os.environ['AZURE_KEY'])
+def test_extract_image_content(azure_content_extractor, image):
+    content = azure_content_extractor.extract_image(image)
+    assert isinstance(content, dict)
+    assert 'Smith, John' in content['content']
+def test_extract_pdf_content(azure_content_extractor, pdf):
+    content = azure_content_extractor.extract_pdf(pdf)
+    assert isinstance(content, dict)
+    assert 'Smith, John' in content['content']
+    assert 'Paper Submission Form' in content['content']

tests/test_extractor.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import os
+import pytest
+from src.extractor import AzureExtractor
+@pytest.fixture
+def extractor():
+    return AzureExtractor(endpoint=os.environ['AZURE_ENDPOINT'], key=os.environ['AZURE_KEY'])
+def test_extract_image_in_content_mode(extractor, image_path):
+    result = extractor.extract(image_path, mode='content')
+    assert isinstance(result, dict)
+    assert 'content' in result
+    assert len(result) == 1
+    assert 'CREDIT APPLICATION' in result['content']
+def test_extract_image_in_keypair_mode(extractor, image_path):
+    result = extractor.extract(image_path, mode='key_pair')
+    assert isinstance(result, dict)
+    assert len(result) > 1
+def test_extract_pdf_in_content_mode(extractor, pdf_path):
+    result = extractor.extract(pdf_path, mode='content')
+    assert 'content' in result
+    assert isinstance(result, dict)
+    assert len(result['content']) > 10
+    assert 'CREDIT APPLICATION' in result['content']
+    assert 'Student ID' in result['content']
+def test_extract_pdf_in_keypair_mode(extractor, pdf_path):
+    result = extractor.extract(pdf_path, mode='key_pair')
+    assert isinstance(result, dict)
+    assert result['TELEPHONE NO.'] == '(243) 555-2309'
+    assert result['Student e-mail'] == 'john.doe@example.com'

tests/test_keypair.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import os
+import pytest
+from dotenv import load_dotenv
+from src.extractor.keypair import AzureKeyPairsExtractor
+load_dotenv()
+@pytest.fixture
+def azure_key_pairs_extractor():
+    return AzureKeyPairsExtractor(
+        endpoint=os.environ['AZURE_ENDPOINT'], key=os.environ['AZURE_KEY']
+    )
+def test_extract_image_key_pair(azure_key_pairs_extractor, image):
+    key_pairs = azure_key_pairs_extractor.extract_image(image)
+    assert isinstance(key_pairs, dict)
+    assert len(key_pairs) > 1
+def test_extract_pdf_key_pair(azure_key_pairs_extractor, pdf):
+    key_pairs = azure_key_pairs_extractor.extract_pdf(pdf)
+    assert isinstance(key_pairs, dict)
+    assert len(key_pairs) > 1

tests/test_utils.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import os
+import fitz
+import pytest
+from PIL import Image
+from src.utils import (determine_file_type, get_bytes_from_path,
+                       get_images_from_path, get_PIL_image_from_path)
+IMAGE_FILE_PATH = 'test.jpg'
+PDF_FILE_PATH = 'test.pdf'
+@pytest.fixture
+def pdf_file():
+    doc = fitz.open()
+    page1 = doc.new_page()
+    page1.insert_text((72, 72), "This is a test PDF created with PyMuPDF. Page 1")
+    page2 = doc.new_page()
+    page2.insert_text((72, 72), "This is a test PDF created with PyMuPDF. Page 2")
+    doc.save(PDF_FILE_PATH)
+    doc.close()
+    yield PDF_FILE_PATH
+    os.remove(PDF_FILE_PATH)
+@pytest.fixture
+def image_file():
+    image = Image.new('RGB', (10, 10), color='red')
+    image.save(IMAGE_FILE_PATH)
+    yield IMAGE_FILE_PATH
+    os.remove(IMAGE_FILE_PATH)
+def test_determine_file_type(pdf_file, image_file):
+    assert determine_file_type(pdf_file) == 'pdf'
+    assert determine_file_type(image_file) == 'image'
+def test_get_bytes_from_path(pdf_file):
+    with open(pdf_file, 'rb') as f:
+        expected_bytes = f.read()
+    assert get_bytes_from_path(pdf_file) == expected_bytes
+def test_get_PIL_image_from_path(image_file):
+    image = get_PIL_image_from_path(image_file)
+    assert isinstance(image, Image.Image)
+def test_get_images_from_pdf_path(pdf_file):
+    images = get_images_from_path(pdf_file)
+    assert len(images) == 2
+    assert all([isinstance(obj, Image.Image) for obj in images])
+def test_get_images_from_one_image_path(image_file):
+    images = get_images_from_path(image_file)
+    assert len(images) == 1