vincentlo committed on
Commit
2548af8
1 Parent(s): 0b2749c

Upload folder using huggingface_hub

Browse files
.dockerignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
.env.template ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ AZURE_ENDPOINT=<YOUR_ENDPOINT>
2
+ AZURE_KEY=<YOUR_KEY>
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ .venv
2
+ flagged
3
+ __pycache__
4
+ scratch
5
+ .env
6
+ todo.txt
7
+ notebooks
.pytest_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Created by pytest automatically.
2
+ *
.pytest_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
.pytest_cache/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
.pytest_cache/v/cache/lastfailed ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tests/test_integration.py::test_extract_image_content": true,
3
+ "tests/test_integration.py::test_extract_pdf_content": true,
4
+ "tests/test_integration.py::test_extract_image_key_pair": true,
5
+ "tests/test_integration.py::test_extract_pdf_key_pair": true,
6
+ "tests/test_extractor.py::test_extract_image_content": true,
7
+ "tests/test_extractor.py::test_extract_pdf_content": true,
8
+ "tests/test_extractor.py::test_extract_image_key_pair": true,
9
+ "tests/test_extractor.py::test_extract_pdf_key_pair": true,
10
+ "tests/units/test_content.py": true,
11
+ "tests/units/test_keypair.py": true
12
+ }
.pytest_cache/v/cache/nodeids ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "tests/test_content.py::test_extract_image_content",
3
+ "tests/test_content.py::test_extract_pdf_content",
4
+ "tests/test_extractor.py::test_extract_image_content",
5
+ "tests/test_extractor.py::test_extract_image_in_content_mode",
6
+ "tests/test_extractor.py::test_extract_image_in_keypair_mode",
7
+ "tests/test_extractor.py::test_extract_image_key_pair",
8
+ "tests/test_extractor.py::test_extract_pdf_content",
9
+ "tests/test_extractor.py::test_extract_pdf_in_content_mode",
10
+ "tests/test_extractor.py::test_extract_pdf_in_keypair_mode",
11
+ "tests/test_extractor.py::test_extract_pdf_key_pair",
12
+ "tests/test_integration.py::test_extract_image_content",
13
+ "tests/test_integration.py::test_extract_image_key_pair",
14
+ "tests/test_integration.py::test_extract_pdf_content",
15
+ "tests/test_integration.py::test_extract_pdf_key_pair",
16
+ "tests/test_keypair.py::test_extract_image_key_pair",
17
+ "tests/test_keypair.py::test_extract_pdf_key_pair",
18
+ "tests/test_utils.py::test_determine_file_type",
19
+ "tests/test_utils.py::test_get_PIL_image_from_path",
20
+ "tests/test_utils.py::test_get_bytes_from_path",
21
+ "tests/test_utils.py::test_get_images_from_one_image_path",
22
+ "tests/test_utils.py::test_get_images_from_pdf_path"
23
+ ]
.pytest_cache/v/cache/stepwise ADDED
@@ -0,0 +1 @@
 
 
1
+ []
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ---- Build stage: install the runtime dependencies into /app/.venv ----
FROM python:3.10 AS builder

RUN pip install poetry==1.7.1

ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /app

# pyproject.toml declares `core` as a path dependency on this wheel, so the
# wheel has to sit next to the lock file when Poetry resolves the install.
COPY pyproject.toml poetry.lock core-0.1.0-py3-none-any.whl ./
RUN touch README.md
# --no-root: only dependencies are needed here (same flag as `make init`);
# the application sources are copied in the runtime stage.
RUN poetry install --only main --no-root && rm -rf $POETRY_CACHE_DIR


# ---- Runtime stage: slim image with the prepared virtualenv and sources ----
FROM python:3.10-slim AS runtime

# PYTHONPATH is absolute so `src.*` imports resolve regardless of the
# working directory.
ENV VIRTUAL_ENV=/app/.venv \
    PATH="/app/.venv/bin:$PATH" \
    PYTHONPATH="/app"

COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}

WORKDIR /app
COPY ./src ./src/

EXPOSE 7860

CMD ["python", "./src/app.py"]
Makefile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ update-image-and-run:
2
+ docker compose down --rmi all
3
+ docker compose up --detach
4
+
5
+ run-service: stop
6
+ docker compose up --detach
7
+
8
+ stop:
9
+ docker compose down
10
+
11
+ run-dev:
12
+ gradio src/app.py
13
+
14
+ run-local-public: init
15
+ poetry run python src/app.py -s
16
+
17
+ linter:
18
+ poetry run isort .
19
+
20
+ init:
21
+ poetry install --no-root
README.md CHANGED
@@ -1,12 +1,92 @@
1
  ---
2
- title: Document Ocr Demo
3
- emoji: 🔥
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: gradio
7
  sdk_version: 4.26.0
8
- app_file: app.py
9
- pinned: false
10
  ---
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: document-ocr-demo
3
+ app_file: src/app.py
 
 
4
  sdk: gradio
5
  sdk_version: 4.26.0
 
 
6
  ---
7
+ # Document OCR Demo
8
 
9
+ This repository contains a web demo for extracting key-value pairs from documents using Azure Document Intelligence.
10
+
11
+ ## Getting Started
12
+
13
+ ### Prerequisites
14
+
15
+ - Git
16
+ - Docker and Docker Compose
17
+
18
+ ### Installation
19
+
20
+ 1. **Clone the repository:**
21
+
22
+ ```bash
23
+ git clone git@github.com:vincent-lo-greenaitech/document-ocr-demo.git
24
+ ```
25
+
26
+ 2. **Configure environment variables:**
27
+
28
+ Locate the `.env.template` file in the repository, replace the placeholders with the actual Azure endpoint and key, and rename the file to `.env`.
29
+
30
+ For example, convert:
31
+
32
+ **.env.template**
33
+
34
+ ```plaintext
35
+ AZURE_ENDPOINT=<YOUR_ENDPOINT>
36
+ AZURE_KEY=<YOUR_KEY>
37
+ ```
38
+
39
+ To:
40
+
41
+ **.env**
42
+
43
+ ```plaintext
44
+ AZURE_ENDPOINT="https://123.com"
45
+ AZURE_KEY="abc123kljfdkkvvs"
46
+ ```
47
+
48
+ 3. **Start the server with Docker Compose:**
49
+
50
+ Ensure `docker compose` is installed and available, then run the following command:
51
+
52
+ ```bash
53
+ make run-service
54
+ ```
55
+
56
+ Or:
57
+
58
+ ```bash
59
+ docker compose up --detach
60
+ ```
61
+
62
+ 4. **Access the web demo:**
63
+
64
+ The web demo should now be accessible at `http://localhost:7860` (or the configured port).
65
+
66
+ ### Shutting Down
67
+
68
+ To shut down the server and clean up Docker containers, use the following command:
69
+
70
+ ```bash
71
+ make stop
72
+ ```
73
+
74
+ Or:
75
+
76
+ ```bash
77
+ docker compose down
78
+ ```
79
+
80
+ ## Update image
81
+
82
+ To update the image and run the server again, use the following command:
83
+
84
+ ```bash
85
+ make update-image-and-run
86
+ ```
87
+
88
+ ## Development
89
+
90
+ ### Managing Python Dependencies
91
+
92
+ This project uses `poetry` for managing Python dependencies. It is recommended to install `poetry` through `pipx` for an isolated setup. Refer to the [official Poetry documentation](https://python-poetry.org/docs/) for more detailed instructions.
compose.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+ services:
3
+ app:
4
+ build: .
5
+ ports:
6
+ - "7860:7860"
7
+ environment:
8
+ AZURE_ENDPOINT: ${AZURE_ENDPOINT}
9
+ AZURE_KEY: ${AZURE_KEY}
core-0.1.0-py3-none-any.whl ADDED
Binary file (4.69 kB). View file
 
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "document-ocr-demo"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Vincent Lo <vincentlo@greenaitech.com>"]
6
+ readme = "README.md"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = "^3.10"
10
+ gradio = "^4.19.2"
11
+ azure-ai-formrecognizer = "3.3.0"
12
+ pymupdf = "^1.23.25"
13
+ core = { path = "./core-0.1.0-py3-none-any.whl" }
14
+ python-dotenv = "^1.0.1"
15
+
16
+
17
+ [tool.poetry.group.dev.dependencies]
18
+ isort = "^5.13.2"
19
+ pytest = "^8.0.2"
20
+ python-dotenv = "^1.0.1"
21
+ jupyter = "^1.0.0"
22
+ pandas = "^2.2.1"
23
+
24
+ [build-system]
25
+ requires = ["poetry-core"]
26
+ build-backend = "poetry.core.masonry.api"
src/__init__.py ADDED
File without changes
src/app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from typing import Literal
4
+
5
+ import gradio as gr
6
+ from PIL.Image import Image
7
+
8
+ from extractor import AzureExtractor
9
+ from utils import get_images_from_path
10
+ from dotenv import load_dotenv
11
+
12
# Load settings from a local .env file (see the README) before reading them.
load_dotenv()

# Command-line interface: -s/--share is forwarded to Gradio's launch() below.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-s',
    '--share',
    action='store_true',
    help='Create a public link',
)
args = parser.parse_args()

# Both variables are mandatory: a missing one raises KeyError at startup.
endpoint, key = os.environ['AZURE_ENDPOINT'], os.environ['AZURE_KEY']
extractor = AzureExtractor(endpoint=endpoint, key=key)
21
+
22
+
23
+ def handle_input(
24
+ extract_mode: Literal['Content', 'Key Pairs'], file_input: str
25
+ ) -> tuple[list[Image], dict[str, str]]:
26
+ try:
27
+ output_images = get_images_from_path(file_input)
28
+ mode_mapping = {'Content': 'content', 'Key Pairs': 'key_pair'}
29
+ result = extractor.extract(file_input, mode=mode_mapping[extract_mode])
30
+ status = 'Success'
31
+ except Exception as err:
32
+ status = f'Error: {err}'
33
+ output_images = None
34
+ result = None
35
+
36
+ return status, output_images, result
37
+
38
+
39
# Input widgets: the document to analyse and the extraction mode.
file_input = gr.File(file_types=['image', '.pdf'])
extract_mode = gr.Dropdown(
    ['Content', 'Key Pairs'],
    value='Content',
    label='Text extract mode',
)

# Output widgets, listed in the order handle_input returns its values.
status_box = gr.Textbox(label='Status')
image_gallery = gr.Gallery(label='Input Preview')
result = gr.Json(label='result')

demo = gr.Interface(
    fn=handle_input,
    inputs=[extract_mode, file_input],
    outputs=[status_box, image_gallery, result],
)
# Serve on all interfaces at port 7860; --share controls the public link.
demo.launch(server_name='0.0.0.0', server_port=7860, share=args.share)
src/extractor/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .extractor import AzureExtractor
src/extractor/content.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ from typing import Protocol
3
+
4
+ import PIL
5
+ from azure.ai.formrecognizer import DocumentAnalysisClient
6
+ from azure.core.credentials import AzureKeyCredential
7
+
8
+
9
class ContentExtractor(Protocol):
    """
    An interface for extracting text content from an image or a pdf
    """

    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
        """Extract content from an image

        Args:
            image: A PIL Image

        Returns:
            The content of the image in a string inside a dictionary with key 'content'
        """
        ...

    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
        """Extract content from a pdf

        Args:
            pdf: A pdf file in bytes

        Returns:
            The content of the pdf in a string inside a dictionary with key 'content'
        """
        ...
35
+
36
+
37
class AzureContentExtractor(ContentExtractor):
    """
    An Azure implementation of `ContentExtractor` using the "prebuilt-read" model
    """

    def __init__(self, endpoint: str, key: str):
        """Create the Azure client

        Args:
            endpoint: Endpoint of the Azure resource
            key: API key of the Azure resource
        """
        self._client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))

    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
        """Extract content from a PIL image (see `ContentExtractor.extract_image`)"""
        return self._get_result_from_bytes(AzureContentExtractor._PIL_to_bytes(image))

    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
        """Extract content from pdf bytes (see `ContentExtractor.extract_pdf`)"""
        return self._get_result_from_bytes(pdf)

    def _get_result_from_bytes(self, bytes_obj: bytes) -> dict[str, str]:
        """Obtain prediction from Azure given a bytes object"""
        poller = self._client.begin_analyze_document("prebuilt-read", document=bytes_obj)
        result = poller.result()

        return {'content': result.content}

    @staticmethod
    def _PIL_to_bytes(image: PIL.Image.Image) -> bytes:
        """Convert a PIL image to bytes

        The image is encoded in its own format; images without a format fall
        back to JPEG. The caller's image object is never modified.
        """
        image_format = image.format or 'JPEG'
        if image.format is None and image.mode not in ('L', 'RGB', 'CMYK'):
            # JPEG cannot store alpha or palette modes; convert a copy so the
            # fallback encoding does not fail on e.g. RGBA images.
            image = image.convert('RGB')

        with io.BytesIO() as output:
            image.save(output, format=image_format)
            return output.getvalue()
src/extractor/extractor.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ sys.path.append(str(Path(__file__).parent.parent.parent))
7
+ from src.utils import determine_file_type, get_bytes_from_path, get_PIL_image_from_path
8
+
9
+ from .content import AzureContentExtractor
10
+ from .keypair import AzureKeyPairsExtractor
11
+
12
+
13
class AzureExtractor:
    """Route a file to the Azure extractor that matches the requested mode."""

    def __init__(self, endpoint: str, key: str):
        """Build one extractor per supported mode.

        Args:
            endpoint: Azure endpoint passed to both extractors.
            key: Azure key passed to both extractors.
        """
        self._content_extractor = AzureContentExtractor(endpoint, key)
        self._keypair_extractor = AzureKeyPairsExtractor(endpoint, key)

    def extract(self, file_path: str, mode: Literal['content', 'key_pair']) -> dict[str, str]:
        """Extract information from the file at ``file_path``.

        Args:
            file_path: Path to a pdf or image file.
            mode: ``'content'`` or ``'key_pair'``.

        Returns:
            The dictionary produced by the selected extractor.

        Raises:
            KeyError: If ``mode`` is not a supported mode.
            ValueError: If the file type is not supported.
        """
        # Keep the selection local: storing it on the instance would let
        # concurrent requests overwrite each other's choice.
        extractor = self._select_extractor(mode)
        file_type = determine_file_type(file_path)
        if file_type == 'pdf':
            return extractor.extract_pdf(get_bytes_from_path(file_path))
        if file_type == 'image':
            return extractor.extract_image(get_PIL_image_from_path(file_path))
        raise ValueError(f'Unsupported file type: {file_type}')

    def _select_extractor(self, mode: Literal['content', 'key_pair']):
        """Return the extractor registered for ``mode`` (KeyError if unknown)."""
        extractor_mapping = {
            'content': self._content_extractor,
            'key_pair': self._keypair_extractor,
        }
        return extractor_mapping[mode]
src/extractor/keypair.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ from typing import Protocol
3
+
4
+ import PIL
5
+ from azure.ai.formrecognizer import DocumentAnalysisClient
6
+ from azure.core.credentials import AzureKeyCredential
7
+
8
+
9
class KeyPairExtractor(Protocol):
    """
    An interface for extracting key pairs from an image or a pdf
    """

    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
        """Extract key pairs from an image

        Args:
            image: A PIL Image

        Returns:
            A dictionary mapping from fields to values
        """
        ...

    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
        """Extract key pairs from a pdf

        Args:
            pdf: A pdf file in bytes

        Returns:
            A dictionary mapping from fields to values
        """
        ...
35
+
36
+
37
class FakeExtractor(KeyPairExtractor):
    """
    A fake implementation of `KeyPairExtractor` that returns fixed values
    """

    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
        """Return a fixed mapping regardless of the image"""
        return {'field1': 'value 1', 'field2': 'value 2'}

    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
        """Return a fixed mapping regardless of the pdf

        Without this override the inherited protocol stub would return None.
        """
        return {'field1': 'value 1', 'field2': 'value 2'}
44
+
45
+
46
class AzureKeyPairsExtractor(KeyPairExtractor):
    """
    An Azure implementation of `KeyPairExtractor` using the "prebuilt-document" model
    """

    def __init__(self, endpoint: str, key: str):
        """Create the Azure client

        Args:
            endpoint: Endpoint of the Azure resource
            key: API key of the Azure resource
        """
        self._client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))

    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
        """Extract key pairs from a PIL image (see `KeyPairExtractor.extract_image`)"""
        return self._get_result_from_bytes(AzureKeyPairsExtractor._PIL_to_bytes(image))

    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
        """Extract key pairs from pdf bytes (see `KeyPairExtractor.extract_pdf`)"""
        return self._get_result_from_bytes(pdf)

    def _get_result_from_bytes(self, bytes_obj: bytes) -> dict[str, str]:
        """Obtain prediction from Azure given a bytes object

        Returns:
            A dictionary mapping each detected key to its value, or to an
            empty string when the key has no value. Pairs without a key are
            skipped.
        """
        poller = self._client.begin_analyze_document("prebuilt-document", document=bytes_obj)
        result = poller.result()

        result_dict = {}
        for kv_pair in result.key_value_pairs or []:
            if kv_pair.key is None:
                # Nothing to index the value under; reading key.content would fail.
                continue
            value = kv_pair.value
            result_dict[kv_pair.key.content] = value.content if value else ''
        return result_dict

    @staticmethod
    def _PIL_to_bytes(image: PIL.Image.Image) -> bytes:
        """Convert a PIL image to bytes

        The image is encoded in its own format; images without a format fall
        back to JPEG. The caller's image object is never modified.
        """
        image_format = image.format or 'JPEG'
        if image.format is None and image.mode not in ('L', 'RGB', 'CMYK'):
            # JPEG cannot store alpha or palette modes; convert a copy so the
            # fallback encoding does not fail on e.g. RGBA images.
            image = image.convert('RGB')

        with io.BytesIO() as output:
            image.save(output, format=image_format)
            return output.getvalue()
src/utils.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ from typing import Literal
4
+
5
+ import fitz
6
+ from PIL import Image
7
+
8
+
9
def determine_file_type(file_path: str) -> str:
    """Classify a file as ``'pdf'`` or ``'image'`` from its extension.

    The comparison is case-insensitive. Both ``.tif`` and ``.tiff`` are
    accepted for TIFF images.

    Args:
        file_path: Path or file name to classify; the file is not opened.

    Returns:
        ``'pdf'`` or ``'image'``.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    file_extension = os.path.splitext(file_path)[1]
    normalized = file_extension.lower()

    if normalized == '.pdf':
        return 'pdf'
    if normalized in ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff'):
        return 'image'

    supported_types = ', '.join(['PDF', 'JPG', 'JPEG', 'PNG', 'BMP', 'GIF', 'TIFF'])
    raise ValueError(
        f"Unsupported file type: {file_extension}. "
        f"Only the following file types are supported: {supported_types}"
    )
23
+
24
+
25
def get_bytes_from_path(file_path: str) -> bytes:
    """Return the complete binary content of the file at ``file_path``."""
    with open(file_path, mode='rb') as handle:
        return handle.read()
29
+
30
+
31
def get_PIL_image_from_path(file_path: str) -> Image.Image:
    """Open the file at ``file_path`` with PIL and return the image object."""
    return Image.open(file_path)
34
+
35
+
36
def get_images_from_path(file_path: str) -> list[Image.Image]:
    """Return the file at ``file_path`` as a list of PIL images.

    An image file yields a one-element list; a pdf yields one image per page,
    in page order.

    Args:
        file_path: Path to an image or pdf file.

    Returns:
        The list of PIL images.

    Raises:
        ValueError: If the file type is not supported (raised by
            ``determine_file_type``).
    """
    file_type = determine_file_type(file_path)
    if file_type == 'image':
        return [get_PIL_image_from_path(file_path)]
    elif file_type == 'pdf':
        images = []
        # The context manager closes the document even if rendering fails.
        with fitz.open(file_path) as doc:
            for page in doc:
                pix = page.get_pixmap()
                img_data = pix.tobytes("ppm")
                images.append(Image.open(io.BytesIO(img_data)))
        return images
tests/__init__.py ADDED
File without changes
tests/assets/sample.jpeg ADDED
tests/assets/sample.pdf ADDED
Binary file (80.3 kB). View file
 
tests/conftest.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import PIL.Image
4
+ import pytest
5
+ from dotenv import load_dotenv
6
+ from PIL.Image import Image
7
+
8
+ load_dotenv()
9
+ asset_dir = Path(__file__).parent / 'assets'
10
+
11
+
12
@pytest.fixture
def image() -> Image:
    """The sample JPEG asset opened as a PIL image."""
    sample_path = asset_dir / 'sample.jpeg'
    return PIL.Image.open(sample_path)
15
+
16
+
17
@pytest.fixture
def pdf() -> bytes:
    """The raw bytes of the sample pdf asset."""
    sample_path = asset_dir / 'sample.pdf'
    with open(sample_path, mode='rb') as handle:
        return handle.read()
21
+
22
+
23
@pytest.fixture
def image_path() -> str:
    """The path of the sample JPEG asset as a string."""
    sample_path = asset_dir / 'sample.jpeg'
    return str(sample_path)
26
+
27
+
28
@pytest.fixture
def pdf_path() -> str:
    """The path of the sample pdf asset as a string."""
    sample_path = asset_dir / 'sample.pdf'
    return str(sample_path)
tests/test_content.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pytest
4
+
5
+ from src.extractor.content import AzureContentExtractor
6
+
7
+
8
@pytest.fixture
def azure_content_extractor():
    """A content extractor configured from the AZURE_* environment variables."""
    azure_endpoint = os.environ['AZURE_ENDPOINT']
    azure_key = os.environ['AZURE_KEY']
    return AzureContentExtractor(endpoint=azure_endpoint, key=azure_key)
11
+
12
+
13
def test_extract_image_content(azure_content_extractor, image):
    """Image extraction returns a dict whose content holds the expected name."""
    extracted = azure_content_extractor.extract_image(image)

    assert isinstance(extracted, dict)
    assert 'Smith, John' in extracted['content']
18
+
19
+
20
def test_extract_pdf_content(azure_content_extractor, pdf):
    """Pdf extraction returns a dict whose content holds the expected strings."""
    extracted = azure_content_extractor.extract_pdf(pdf)

    assert isinstance(extracted, dict)
    text = extracted['content']
    assert 'Smith, John' in text
    assert 'Paper Submission Form' in text
tests/test_extractor.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pytest
4
+
5
+ from src.extractor import AzureExtractor
6
+
7
+
8
@pytest.fixture
def extractor():
    """An AzureExtractor configured from the AZURE_* environment variables."""
    azure_endpoint = os.environ['AZURE_ENDPOINT']
    azure_key = os.environ['AZURE_KEY']
    return AzureExtractor(endpoint=azure_endpoint, key=azure_key)
11
+
12
+
13
def test_extract_image_in_content_mode(extractor, image_path):
    """Content mode on an image yields a single 'content' entry with the title."""
    outcome = extractor.extract(image_path, mode='content')

    assert isinstance(outcome, dict)
    assert 'content' in outcome
    assert len(outcome) == 1
    assert 'CREDIT APPLICATION' in outcome['content']
20
+
21
+
22
def test_extract_image_in_keypair_mode(extractor, image_path):
    """Key-pair mode on an image yields a dict with more than one entry."""
    outcome = extractor.extract(image_path, mode='key_pair')

    assert isinstance(outcome, dict)
    assert len(outcome) > 1
27
+
28
+
29
def test_extract_pdf_in_content_mode(extractor, pdf_path):
    """Content mode on a pdf yields a 'content' entry with the expected strings."""
    outcome = extractor.extract(pdf_path, mode='content')

    assert 'content' in outcome
    assert isinstance(outcome, dict)
    text = outcome['content']
    assert len(text) > 10
    assert 'CREDIT APPLICATION' in text
    assert 'Student ID' in text
37
+
38
+
39
def test_extract_pdf_in_keypair_mode(extractor, pdf_path):
    """Key-pair mode on a pdf yields the expected phone and e-mail values."""
    outcome = extractor.extract(pdf_path, mode='key_pair')

    assert isinstance(outcome, dict)
    assert outcome['TELEPHONE NO.'] == '(243) 555-2309'
    assert outcome['Student e-mail'] == 'john.doe@example.com'
tests/test_keypair.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pytest
4
+ from dotenv import load_dotenv
5
+
6
+ from src.extractor.keypair import AzureKeyPairsExtractor
7
+
8
+ load_dotenv()
9
+
10
+
11
@pytest.fixture
def azure_key_pairs_extractor():
    """A key-pair extractor configured from the AZURE_* environment variables."""
    azure_endpoint = os.environ['AZURE_ENDPOINT']
    azure_key = os.environ['AZURE_KEY']
    return AzureKeyPairsExtractor(endpoint=azure_endpoint, key=azure_key)
16
+
17
+
18
def test_extract_image_key_pair(azure_key_pairs_extractor, image):
    """Image extraction returns a dict with more than one key pair."""
    pairs = azure_key_pairs_extractor.extract_image(image)

    assert isinstance(pairs, dict)
    assert len(pairs) > 1
23
+
24
+
25
def test_extract_pdf_key_pair(azure_key_pairs_extractor, pdf):
    """Pdf extraction returns a dict with more than one key pair."""
    pairs = azure_key_pairs_extractor.extract_pdf(pdf)

    assert isinstance(pairs, dict)
    assert len(pairs) > 1
tests/test_utils.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import fitz
4
+ import pytest
5
+ from PIL import Image
6
+
7
+ from src.utils import (determine_file_type, get_bytes_from_path,
8
+ get_images_from_path, get_PIL_image_from_path)
9
+
10
+ IMAGE_FILE_PATH = 'test.jpg'
11
+ PDF_FILE_PATH = 'test.pdf'
12
+
13
+
14
@pytest.fixture
def pdf_file(tmp_path):
    """Create a two-page pdf and yield its path.

    The file is written inside pytest's per-test temporary directory rather
    than the current working directory, so the repository is never polluted
    and no manual clean-up is needed.
    """
    pdf_path = str(tmp_path / PDF_FILE_PATH)
    doc = fitz.open()
    try:
        for page_number in (1, 2):
            page = doc.new_page()
            page.insert_text(
                (72, 72), f"This is a test PDF created with PyMuPDF. Page {page_number}"
            )
        doc.save(pdf_path)
    finally:
        # Release the document even if writing it failed.
        doc.close()
    yield pdf_path
25
+
26
+
27
@pytest.fixture
def image_file(tmp_path):
    """Create a small red JPEG and yield its path.

    The file is written inside pytest's per-test temporary directory rather
    than the current working directory, so the repository is never polluted
    and no manual clean-up is needed.
    """
    image_path = str(tmp_path / IMAGE_FILE_PATH)
    image = Image.new('RGB', (10, 10), color='red')
    image.save(image_path)
    yield image_path
33
+
34
+
35
def test_determine_file_type(pdf_file, image_file):
    """Each generated file is classified by its extension."""
    detected_pdf = determine_file_type(pdf_file)
    assert detected_pdf == 'pdf'
    detected_image = determine_file_type(image_file)
    assert detected_image == 'image'
38
+
39
+
40
def test_get_bytes_from_path(pdf_file):
    """The helper returns exactly the bytes stored in the file."""
    with open(pdf_file, mode='rb') as handle:
        reference = handle.read()

    actual = get_bytes_from_path(pdf_file)
    assert actual == reference
45
+
46
+
47
def test_get_PIL_image_from_path(image_file):
    """Opening an image path yields a PIL image instance."""
    loaded = get_PIL_image_from_path(image_file)
    assert isinstance(loaded, Image.Image)
50
+
51
+
52
def test_get_images_from_pdf_path(pdf_file):
    """A two-page pdf is rendered into two PIL images."""
    pages = get_images_from_path(pdf_file)

    assert len(pages) == 2
    assert all(isinstance(page, Image.Image) for page in pages)
57
+
58
+
59
def test_get_images_from_one_image_path(image_file):
    """A single image path yields a one-element list."""
    loaded = get_images_from_path(image_file)

    assert len(loaded) == 1