Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files- .dockerignore +1 -0
- .env.template +2 -0
- .gitignore +7 -0
- .pytest_cache/.gitignore +2 -0
- .pytest_cache/CACHEDIR.TAG +4 -0
- .pytest_cache/README.md +8 -0
- .pytest_cache/v/cache/lastfailed +12 -0
- .pytest_cache/v/cache/nodeids +23 -0
- .pytest_cache/v/cache/stepwise +1 -0
- Dockerfile +30 -0
- Makefile +21 -0
- README.md +87 -7
- compose.yaml +9 -0
- core-0.1.0-py3-none-any.whl +0 -0
- poetry.lock +0 -0
- pyproject.toml +26 -0
- src/__init__.py +0 -0
- src/app.py +52 -0
- src/extractor/__init__.py +1 -0
- src/extractor/content.py +62 -0
- src/extractor/extractor.py +35 -0
- src/extractor/keypair.py +81 -0
- src/utils.py +52 -0
- tests/__init__.py +0 -0
- tests/assets/sample.jpeg +0 -0
- tests/assets/sample.pdf +0 -0
- tests/conftest.py +30 -0
- tests/test_content.py +25 -0
- tests/test_extractor.py +44 -0
- tests/test_keypair.py +29 -0
- tests/test_utils.py +62 -0
.dockerignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__
|
.env.template
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
AZURE_ENDPOINT=<YOUR_ENDPOINT>
|
2 |
+
AZURE_KEY=<YOUR_KEY>
|
.gitignore
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.venv
|
2 |
+
flagged
|
3 |
+
__pycache__
|
4 |
+
scratch
|
5 |
+
.env
|
6 |
+
todo.txt
|
7 |
+
notebooks
|
.pytest_cache/.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
# Created by pytest automatically.
|
2 |
+
*
|
.pytest_cache/CACHEDIR.TAG
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
2 |
+
# This file is a cache directory tag created by pytest.
|
3 |
+
# For information about cache directory tags, see:
|
4 |
+
# https://bford.info/cachedir/spec.html
|
.pytest_cache/README.md
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# pytest cache directory #
|
2 |
+
|
3 |
+
This directory contains data from the pytest's cache plugin,
|
4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
5 |
+
|
6 |
+
**Do not** commit this to version control.
|
7 |
+
|
8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
.pytest_cache/v/cache/lastfailed
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"tests/test_integration.py::test_extract_image_content": true,
|
3 |
+
"tests/test_integration.py::test_extract_pdf_content": true,
|
4 |
+
"tests/test_integration.py::test_extract_image_key_pair": true,
|
5 |
+
"tests/test_integration.py::test_extract_pdf_key_pair": true,
|
6 |
+
"tests/test_extractor.py::test_extract_image_content": true,
|
7 |
+
"tests/test_extractor.py::test_extract_pdf_content": true,
|
8 |
+
"tests/test_extractor.py::test_extract_image_key_pair": true,
|
9 |
+
"tests/test_extractor.py::test_extract_pdf_key_pair": true,
|
10 |
+
"tests/units/test_content.py": true,
|
11 |
+
"tests/units/test_keypair.py": true
|
12 |
+
}
|
.pytest_cache/v/cache/nodeids
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
"tests/test_content.py::test_extract_image_content",
|
3 |
+
"tests/test_content.py::test_extract_pdf_content",
|
4 |
+
"tests/test_extractor.py::test_extract_image_content",
|
5 |
+
"tests/test_extractor.py::test_extract_image_in_content_mode",
|
6 |
+
"tests/test_extractor.py::test_extract_image_in_keypair_mode",
|
7 |
+
"tests/test_extractor.py::test_extract_image_key_pair",
|
8 |
+
"tests/test_extractor.py::test_extract_pdf_content",
|
9 |
+
"tests/test_extractor.py::test_extract_pdf_in_content_mode",
|
10 |
+
"tests/test_extractor.py::test_extract_pdf_in_keypair_mode",
|
11 |
+
"tests/test_extractor.py::test_extract_pdf_key_pair",
|
12 |
+
"tests/test_integration.py::test_extract_image_content",
|
13 |
+
"tests/test_integration.py::test_extract_image_key_pair",
|
14 |
+
"tests/test_integration.py::test_extract_pdf_content",
|
15 |
+
"tests/test_integration.py::test_extract_pdf_key_pair",
|
16 |
+
"tests/test_keypair.py::test_extract_image_key_pair",
|
17 |
+
"tests/test_keypair.py::test_extract_pdf_key_pair",
|
18 |
+
"tests/test_utils.py::test_determine_file_type",
|
19 |
+
"tests/test_utils.py::test_get_PIL_image_from_path",
|
20 |
+
"tests/test_utils.py::test_get_bytes_from_path",
|
21 |
+
"tests/test_utils.py::test_get_images_from_one_image_path",
|
22 |
+
"tests/test_utils.py::test_get_images_from_pdf_path"
|
23 |
+
]
|
.pytest_cache/v/cache/stepwise
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[]
|
Dockerfile
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ---- Build stage: install the runtime dependencies into /app/.venv ----
FROM python:3.10 AS builder

RUN pip install poetry==1.7.1

ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /app

# pyproject.toml declares `core` as a path dependency on this wheel, so the
# wheel has to be present next to pyproject.toml when `poetry install` runs.
COPY pyproject.toml poetry.lock core-0.1.0-py3-none-any.whl ./
RUN touch README.md
# Only third-party dependencies are needed here; the application sources are
# copied into the runtime stage below, so the root project is not installed.
RUN poetry install --only main --no-root && rm -rf $POETRY_CACHE_DIR


# ---- Runtime stage: slim image with the prebuilt virtualenv and the sources ----
FROM python:3.10-slim AS runtime

ENV VIRTUAL_ENV=/app/.venv \
    PATH="/app/.venv/bin:$PATH" \
    PYTHONPATH="/app"

COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}

WORKDIR /app
COPY ./src ./src/

EXPOSE 7860

CMD ["python", "./src/app.py"]
|
Makefile
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
update-image-and-run:
|
2 |
+
docker compose down --rmi all
|
3 |
+
docker compose up --detach
|
4 |
+
|
5 |
+
run-service: stop
|
6 |
+
docker compose up --detach
|
7 |
+
|
8 |
+
stop:
|
9 |
+
docker compose down
|
10 |
+
|
11 |
+
run-dev:
|
12 |
+
gradio src/app.py
|
13 |
+
|
14 |
+
run-local-public: init
|
15 |
+
poetry run python src/app.py -s
|
16 |
+
|
17 |
+
linter:
|
18 |
+
poetry run isort .
|
19 |
+
|
20 |
+
init:
|
21 |
+
poetry install --no-root
|
README.md
CHANGED
@@ -1,12 +1,92 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: purple
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.26.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
|
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: document-ocr-demo
|
3 |
+
app_file: src/app.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
sdk_version: 4.26.0
|
|
|
|
|
6 |
---
|
7 |
+
# Document OCR Demo
|
8 |
|
9 |
+
This repository contains a web demo for extracting key-value pairs from documents using Azure Document Intelligence.
|
10 |
+
|
11 |
+
## Getting Started
|
12 |
+
|
13 |
+
### Prerequisites
|
14 |
+
|
15 |
+
- Git
|
16 |
+
- Docker and Docker Compose
|
17 |
+
|
18 |
+
### Installation
|
19 |
+
|
20 |
+
1. **Clone the repository:**
|
21 |
+
|
22 |
+
```bash
|
23 |
+
git clone git@github.com:vincent-lo-greenaitech/document-ocr-demo.git
|
24 |
+
```
|
25 |
+
|
26 |
+
2. **Configure environment variables:**
|
27 |
+
|
28 |
+
Locate the `.env.template` file in the repository, replace the placeholders with the actual Azure endpoint and key, and rename the file to `.env`.
|
29 |
+
|
30 |
+
For example, convert:
|
31 |
+
|
32 |
+
**.env.template**
|
33 |
+
|
34 |
+
```plaintext
|
35 |
+
AZURE_ENDPOINT=<YOUR_ENDPOINT>
|
36 |
+
AZURE_KEY=<YOUR_KEY>
|
37 |
+
```
|
38 |
+
|
39 |
+
To:
|
40 |
+
|
41 |
+
**.env**
|
42 |
+
|
43 |
+
```plaintext
|
44 |
+
AZURE_ENDPOINT="https://123.com"
|
45 |
+
AZURE_KEY="abc123kljfdkkvvs"
|
46 |
+
```
|
47 |
+
|
48 |
+
3. **Start the server with Docker Compose:**
|
49 |
+
|
50 |
+
Ensure `docker compose` is installed and available, then run the following command:
|
51 |
+
|
52 |
+
```bash
|
53 |
+
make run-service
|
54 |
+
```
|
55 |
+
|
56 |
+
Or:
|
57 |
+
|
58 |
+
```bash
|
59 |
+
docker compose up --detach
|
60 |
+
```
|
61 |
+
|
62 |
+
4. **Access the web demo:**
|
63 |
+
|
64 |
+
The web demo should now be accessible at `http://localhost:7860` (or the configured port).
|
65 |
+
|
66 |
+
### Shutting Down
|
67 |
+
|
68 |
+
To shut down the server and clean up Docker containers, use the following command:
|
69 |
+
|
70 |
+
```bash
|
71 |
+
make stop
|
72 |
+
```
|
73 |
+
|
74 |
+
Or:
|
75 |
+
|
76 |
+
```bash
|
77 |
+
docker compose down
|
78 |
+
```
|
79 |
+
|
80 |
+
## Update image
|
81 |
+
|
82 |
+
To update the image and run the server again, use the following command:
|
83 |
+
|
84 |
+
```bash
|
85 |
+
make update-image-and-run
|
86 |
+
```
|
87 |
+
|
88 |
+
## Development
|
89 |
+
|
90 |
+
### Managing Python Dependencies
|
91 |
+
|
92 |
+
This project uses `poetry` for managing Python dependencies. It is recommended to install `poetry` through `pipx` for an isolated setup. Refer to the [official Poetry documentation](https://python-poetry.org/docs/) for more detailed instructions.
|
compose.yaml
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
version: '3.8'
|
2 |
+
services:
|
3 |
+
app:
|
4 |
+
build: .
|
5 |
+
ports:
|
6 |
+
- "7860:7860"
|
7 |
+
environment:
|
8 |
+
AZURE_ENDPOINT: ${AZURE_ENDPOINT}
|
9 |
+
AZURE_KEY: ${AZURE_KEY}
|
core-0.1.0-py3-none-any.whl
ADDED
Binary file (4.69 kB). View file
|
|
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.poetry]
|
2 |
+
name = "document-ocr-demo"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = ""
|
5 |
+
authors = ["Vincent Lo <vincentlo@greenaitech.com>"]
|
6 |
+
readme = "README.md"
|
7 |
+
|
8 |
+
[tool.poetry.dependencies]
|
9 |
+
python = "^3.10"
|
10 |
+
gradio = "^4.19.2"
|
11 |
+
azure-ai-formrecognizer = "3.3.0"
|
12 |
+
pymupdf = "^1.23.25"
|
13 |
+
core = { path = "./core-0.1.0-py3-none-any.whl" }
|
14 |
+
python-dotenv = "^1.0.1"
|
15 |
+
|
16 |
+
|
17 |
+
[tool.poetry.group.dev.dependencies]
|
18 |
+
isort = "^5.13.2"
|
19 |
+
pytest = "^8.0.2"
|
20 |
+
python-dotenv = "^1.0.1"
|
21 |
+
jupyter = "^1.0.0"
|
22 |
+
pandas = "^2.2.1"
|
23 |
+
|
24 |
+
[build-system]
|
25 |
+
requires = ["poetry-core"]
|
26 |
+
build-backend = "poetry.core.masonry.api"
|
src/__init__.py
ADDED
File without changes
|
src/app.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import os
|
3 |
+
from typing import Literal
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
from PIL.Image import Image
|
7 |
+
|
8 |
+
from extractor import AzureExtractor
|
9 |
+
from utils import get_images_from_path
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
|
12 |
+
load_dotenv()
|
13 |
+
|
14 |
+
# Command-line options: -s/--share is forwarded to `demo.launch(share=...)`.
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--share', action='store_true', help='Create a public link')
args = parser.parse_args()

# Azure credentials come from the environment (load_dotenv() is called earlier
# in this module). A missing variable raises KeyError here, at startup.
endpoint = os.environ['AZURE_ENDPOINT']
key = os.environ['AZURE_KEY']
extractor = AzureExtractor(endpoint=endpoint, key=key)
|
21 |
+
|
22 |
+
|
23 |
+
def handle_input(
    extract_mode: Literal['Content', 'Key Pairs'], file_input: str
) -> tuple[str, list[Image] | None, dict[str, str] | None]:
    """Run the selected extraction on an uploaded file for the Gradio interface.

    Args:
        extract_mode: The label selected in the "Text extract mode" dropdown.
        file_input: Path of the uploaded file (None when nothing was uploaded).

    Returns:
        A ``(status, images, result)`` tuple, in the order of the interface
        outputs. On success ``status`` is 'Success'. On failure it is
        'Error: <message>' and the other two items are None.
    """
    if file_input is None:
        # Report a missing upload explicitly instead of letting it surface as
        # an opaque TypeError message from the path helpers.
        return 'Error: no file was uploaded', None, None

    mode_mapping = {'Content': 'content', 'Key Pairs': 'key_pair'}
    try:
        output_images = get_images_from_path(file_input)
        result = extractor.extract(file_input, mode=mode_mapping[extract_mode])
        status = 'Success'
    except Exception as err:
        # UI boundary: any failure is shown in the status box, not raised.
        status = f'Error: {err}'
        output_images = None
        result = None

    return status, output_images, result
|
37 |
+
|
38 |
+
|
39 |
+
# Input components.
file_input = gr.File(file_types=['image', '.pdf'])
extract_mode = gr.Dropdown(
    ['Content', 'Key Pairs'], value='Content', label='Text extract mode'
)

# Output components; their order must match the tuple returned by handle_input.
status_box = gr.Textbox(label='Status')
image_gallery = gr.Gallery(label='Input Preview')
result = gr.Json(label='result')
demo = gr.Interface(
    fn=handle_input,
    inputs=[extract_mode, file_input],
    outputs=[status_box, image_gallery, result],
)
# Listen on all interfaces on port 7860; `share` comes from the -s/--share flag.
demo.launch(server_name='0.0.0.0', server_port=7860, share=args.share)
|
src/extractor/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .extractor import AzureExtractor
|
src/extractor/content.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
from typing import Protocol
|
3 |
+
|
4 |
+
import PIL
|
5 |
+
from azure.ai.formrecognizer import DocumentAnalysisClient
|
6 |
+
from azure.core.credentials import AzureKeyCredential
|
7 |
+
|
8 |
+
|
9 |
+
class ContentExtractor(Protocol):
    """
    An interface for extracting content from an image or a pdf
    """

    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
        """Extract content from an image

        Args:
            image: A PIL Image

        Returns:
            The content of the image in a string inside a dictionary with key 'content'
        """
        ...

    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
        """Extract content from a pdf

        Args:
            pdf: A pdf file in bytes

        Returns:
            The content of the pdf in a string inside a dictionary with key 'content'
        """
        ...
|
35 |
+
|
36 |
+
|
37 |
+
class AzureContentExtractor(ContentExtractor):
    """
    An Azure implementation of `ContentExtractor`
    """

    def __init__(self, endpoint: str, key: str):
        """Create the Azure client used for every extraction.

        Args:
            endpoint: The Azure endpoint URL
            key: The Azure API key
        """
        self._client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))

    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
        """Extract content from a PIL image; see `_get_result_from_bytes`."""
        return self._get_result_from_bytes(AzureContentExtractor._PIL_to_bytes(image))

    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
        """Extract content from a pdf given as bytes; see `_get_result_from_bytes`."""
        return self._get_result_from_bytes(pdf)

    def _get_result_from_bytes(self, bytes_obj: bytes) -> dict[str, str]:
        """Obtain prediction from Azure given a bytes object

        Runs the "prebuilt-read" model and waits for the poller to finish.

        Returns:
            A dictionary with the single key 'content'
        """
        poller = self._client.begin_analyze_document("prebuilt-read", document=bytes_obj)
        result = poller.result()

        return {'content': result.content}

    @staticmethod
    def _PIL_to_bytes(image: PIL.Image.Image) -> bytes:
        """Convert a PIL image to bytes

        The image is encoded in its own format, or as JPEG when it has none.
        The caller's image object is left untouched.
        """
        image_format = image.format or 'JPEG'
        # JPEG cannot store alpha or palette modes (e.g. RGBA, P); saving them
        # raises, so encode an RGB copy instead.
        if image_format == 'JPEG' and image.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
            image = image.convert('RGB')

        with io.BytesIO() as output:
            image.save(output, format=image_format)
            return output.getvalue()
|
src/extractor/extractor.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Literal
|
2 |
+
|
3 |
+
import sys
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
sys.path.append(str(Path(__file__).parent.parent.parent))
|
7 |
+
from src.utils import determine_file_type, get_bytes_from_path, get_PIL_image_from_path
|
8 |
+
|
9 |
+
from .content import AzureContentExtractor
|
10 |
+
from .keypair import AzureKeyPairsExtractor
|
11 |
+
|
12 |
+
|
13 |
+
class AzureExtractor:
    """Route a file to the Azure content extractor or key-pair extractor."""

    def __init__(self, endpoint: str, key: str):
        """Build both underlying extractors with the same Azure credentials."""
        self._content_extractor = AzureContentExtractor(endpoint, key)
        self._keypair_extractor = AzureKeyPairsExtractor(endpoint, key)
        self._extractor = None

    def extract(self, file_path: str, mode: Literal['content', 'key_pair']) -> dict[str, str]:
        """Extract information from the file at `file_path`.

        Args:
            file_path: Path to a pdf or image file
            mode: 'content' for the full text, 'key_pair' for key-value pairs

        Returns:
            The dictionary produced by the selected extractor

        Raises:
            KeyError: If `mode` is not one of the supported modes
            ValueError: If the file type is not supported
        """
        # Use the extractor returned for this call rather than re-reading
        # `self._extractor`, so the call does not depend on shared state.
        extractor = self._set_extractor(mode)
        file_type = determine_file_type(file_path)
        if file_type == 'pdf':
            return extractor.extract_pdf(get_bytes_from_path(file_path))
        if file_type == 'image':
            # Close the underlying image file once extraction is done.
            with get_PIL_image_from_path(file_path) as image:
                return extractor.extract_image(image)
        # Previously an unexpected file type fell through to an unbound `result`.
        raise ValueError(f'Unsupported file type: {file_type}')

    def _set_extractor(self, mode: Literal['content', 'key_pair']):
        """Select the extractor for `mode`, store it on the instance and return it."""
        extractor_mapping = {
            'content': self._content_extractor,
            'key_pair': self._keypair_extractor,
        }
        self._extractor = extractor_mapping[mode]
        return self._extractor
|
src/extractor/keypair.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
from typing import Protocol
|
3 |
+
|
4 |
+
import PIL
|
5 |
+
from azure.ai.formrecognizer import DocumentAnalysisClient
|
6 |
+
from azure.core.credentials import AzureKeyCredential
|
7 |
+
|
8 |
+
|
9 |
+
class KeyPairExtractor(Protocol):
    """
    An interface for extracting key pairs from an image or a pdf
    """

    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
        """Extract key pairs from an image

        Args:
            image: A PIL Image

        Returns:
            A dictionary mapping from fields to values
        """
        ...

    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
        """Extract key pairs from a pdf

        Args:
            pdf: A pdf file in bytes

        Returns:
            A dictionary mapping from fields to values
        """
        ...
|
35 |
+
|
36 |
+
|
37 |
+
class FakeExtractor(KeyPairExtractor):
    """
    A fake implementation of `KeyPairExtractor` that returns fixed values
    """

    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
        """Return a fixed pair of fields regardless of the image"""
        return {'field1': 'value 1', 'field2': 'value 2'}

    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
        """Return a fixed pair of fields regardless of the pdf

        Without this method the class fell back to the protocol stub, which
        returns None instead of a dictionary.
        """
        return {'field1': 'value 1', 'field2': 'value 2'}
|
44 |
+
|
45 |
+
|
46 |
+
class AzureKeyPairsExtractor(KeyPairExtractor):
    """
    An Azure implementation of `KeyPairExtractor`
    """

    def __init__(self, endpoint: str, key: str):
        """Create the Azure client used for every extraction.

        Args:
            endpoint: The Azure endpoint URL
            key: The Azure API key
        """
        self._client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))

    def extract_image(self, image: PIL.Image.Image) -> dict[str, str]:
        """Extract key pairs from a PIL image; see `_get_result_from_bytes`."""
        return self._get_result_from_bytes(AzureKeyPairsExtractor._PIL_to_bytes(image))

    def extract_pdf(self, pdf: bytes) -> dict[str, str]:
        """Extract key pairs from a pdf given as bytes; see `_get_result_from_bytes`."""
        return self._get_result_from_bytes(pdf)

    def _get_result_from_bytes(self, bytes_obj: bytes) -> dict[str, str]:
        """Obtain prediction from Azure given a bytes object

        Runs the "prebuilt-document" model and waits for the poller to finish.

        Returns:
            A dictionary mapping each key's text to its value's text. A key
            with no value maps to an empty string.
        """
        poller = self._client.begin_analyze_document("prebuilt-document", document=bytes_obj)
        result = poller.result()

        result_dict = {}
        for kv_pair in result.key_value_pairs:
            if not kv_pair.key:
                # No key to index under. Previously this case reached
                # `kv_pair.key.content` and raised AttributeError.
                continue
            result_dict[kv_pair.key.content] = kv_pair.value.content if kv_pair.value else ''
        return result_dict

    @staticmethod
    def _PIL_to_bytes(image: PIL.Image.Image) -> bytes:
        """Convert a PIL image to bytes

        The image is encoded in its own format, or as JPEG when it has none.
        The caller's image object is left untouched.
        """
        image_format = image.format or 'JPEG'
        # JPEG cannot store alpha or palette modes (e.g. RGBA, P); saving them
        # raises, so encode an RGB copy instead.
        if image_format == 'JPEG' and image.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
            image = image.convert('RGB')

        with io.BytesIO() as output:
            image.save(output, format=image_format)
            return output.getvalue()
|
src/utils.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
import os
|
3 |
+
from typing import Literal
|
4 |
+
|
5 |
+
import fitz
|
6 |
+
from PIL import Image
|
7 |
+
|
8 |
+
|
9 |
+
def determine_file_type(file_path: str) -> str:
    """Classify a path as 'pdf' or 'image' from its extension (case-insensitive).

    Args:
        file_path: Path or file name to classify; the file is not opened.

    Returns:
        'pdf' for a .pdf extension, 'image' for a supported image extension.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    file_extension = os.path.splitext(file_path)[1]
    suffix = file_extension.lower()

    if suffix == '.pdf':
        return 'pdf'
    if suffix in ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff'):
        return 'image'

    supported_types = ', '.join(['PDF', 'JPG', 'JPEG', 'PNG', 'BMP', 'GIF', 'TIFF'])
    raise ValueError(
        f"Unsupported file type: {file_extension}. "
        f"Only the following file types are supported: {supported_types}"
    )
|
23 |
+
|
24 |
+
|
25 |
+
def get_bytes_from_path(file_path: str) -> bytes:
    """Read the file at `file_path` in binary mode and return all of its bytes."""
    with open(file_path, mode='rb') as stream:
        return stream.read()
|
29 |
+
|
30 |
+
|
31 |
+
def get_PIL_image_from_path(file_path: str) -> Image.Image:
    """Open the image file at `file_path` with PIL and return the image object."""
    return Image.open(file_path)
|
34 |
+
|
35 |
+
|
36 |
+
def get_images_from_path(file_path: str) -> list[Image.Image]:
    """Return the file at `file_path` as a list of PIL images.

    Args:
        file_path: Path to an image or a pdf file.

    Returns:
        A one-element list for an image file, or one rendered image per page
        for a pdf.

    Raises:
        ValueError: If the file type is not supported (raised by
            `determine_file_type`).
    """
    file_type = determine_file_type(file_path)
    if file_type == 'image':
        return [get_PIL_image_from_path(file_path)]

    # determine_file_type returns only 'image' or 'pdf', so this is a pdf.
    images = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(file_path) as doc:
        for page in doc:
            img_data = page.get_pixmap().tobytes("ppm")
            images.append(Image.open(io.BytesIO(img_data)))
    return images
|
tests/__init__.py
ADDED
File without changes
|
tests/assets/sample.jpeg
ADDED
tests/assets/sample.pdf
ADDED
Binary file (80.3 kB). View file
|
|
tests/conftest.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
|
3 |
+
import PIL.Image
|
4 |
+
import pytest
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
from PIL.Image import Image
|
7 |
+
|
8 |
+
load_dotenv()
|
9 |
+
asset_dir = Path(__file__).parent / 'assets'
|
10 |
+
|
11 |
+
|
12 |
+
@pytest.fixture
|
13 |
+
def image() -> Image:
|
14 |
+
return PIL.Image.open(asset_dir / 'sample.jpeg')
|
15 |
+
|
16 |
+
|
17 |
+
@pytest.fixture
|
18 |
+
def pdf() -> bytes:
|
19 |
+
with open(asset_dir / 'sample.pdf', 'rb') as file:
|
20 |
+
return file.read()
|
21 |
+
|
22 |
+
|
23 |
+
@pytest.fixture
|
24 |
+
def image_path() -> str:
|
25 |
+
return str(asset_dir / 'sample.jpeg')
|
26 |
+
|
27 |
+
|
28 |
+
@pytest.fixture
|
29 |
+
def pdf_path() -> str:
|
30 |
+
return str(asset_dir / 'sample.pdf')
|
tests/test_content.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import pytest
|
4 |
+
|
5 |
+
from src.extractor.content import AzureContentExtractor
|
6 |
+
|
7 |
+
|
8 |
+
@pytest.fixture
|
9 |
+
def azure_content_extractor():
|
10 |
+
return AzureContentExtractor(endpoint=os.environ['AZURE_ENDPOINT'], key=os.environ['AZURE_KEY'])
|
11 |
+
|
12 |
+
|
13 |
+
def test_extract_image_content(azure_content_extractor, image):
|
14 |
+
content = azure_content_extractor.extract_image(image)
|
15 |
+
|
16 |
+
assert isinstance(content, dict)
|
17 |
+
assert 'Smith, John' in content['content']
|
18 |
+
|
19 |
+
|
20 |
+
def test_extract_pdf_content(azure_content_extractor, pdf):
|
21 |
+
content = azure_content_extractor.extract_pdf(pdf)
|
22 |
+
|
23 |
+
assert isinstance(content, dict)
|
24 |
+
assert 'Smith, John' in content['content']
|
25 |
+
assert 'Paper Submission Form' in content['content']
|
tests/test_extractor.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import pytest
|
4 |
+
|
5 |
+
from src.extractor import AzureExtractor
|
6 |
+
|
7 |
+
|
8 |
+
@pytest.fixture
|
9 |
+
def extractor():
|
10 |
+
return AzureExtractor(endpoint=os.environ['AZURE_ENDPOINT'], key=os.environ['AZURE_KEY'])
|
11 |
+
|
12 |
+
|
13 |
+
def test_extract_image_in_content_mode(extractor, image_path):
|
14 |
+
result = extractor.extract(image_path, mode='content')
|
15 |
+
|
16 |
+
assert isinstance(result, dict)
|
17 |
+
assert 'content' in result
|
18 |
+
assert len(result) == 1
|
19 |
+
assert 'CREDIT APPLICATION' in result['content']
|
20 |
+
|
21 |
+
|
22 |
+
def test_extract_image_in_keypair_mode(extractor, image_path):
|
23 |
+
result = extractor.extract(image_path, mode='key_pair')
|
24 |
+
|
25 |
+
assert isinstance(result, dict)
|
26 |
+
assert len(result) > 1
|
27 |
+
|
28 |
+
|
29 |
+
def test_extract_pdf_in_content_mode(extractor, pdf_path):
|
30 |
+
result = extractor.extract(pdf_path, mode='content')
|
31 |
+
|
32 |
+
assert 'content' in result
|
33 |
+
assert isinstance(result, dict)
|
34 |
+
assert len(result['content']) > 10
|
35 |
+
assert 'CREDIT APPLICATION' in result['content']
|
36 |
+
assert 'Student ID' in result['content']
|
37 |
+
|
38 |
+
|
39 |
+
def test_extract_pdf_in_keypair_mode(extractor, pdf_path):
|
40 |
+
result = extractor.extract(pdf_path, mode='key_pair')
|
41 |
+
|
42 |
+
assert isinstance(result, dict)
|
43 |
+
assert result['TELEPHONE NO.'] == '(243) 555-2309'
|
44 |
+
assert result['Student e-mail'] == 'john.doe@example.com'
|
tests/test_keypair.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import pytest
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
|
6 |
+
from src.extractor.keypair import AzureKeyPairsExtractor
|
7 |
+
|
8 |
+
load_dotenv()
|
9 |
+
|
10 |
+
|
11 |
+
@pytest.fixture
|
12 |
+
def azure_key_pairs_extractor():
|
13 |
+
return AzureKeyPairsExtractor(
|
14 |
+
endpoint=os.environ['AZURE_ENDPOINT'], key=os.environ['AZURE_KEY']
|
15 |
+
)
|
16 |
+
|
17 |
+
|
18 |
+
def test_extract_image_key_pair(azure_key_pairs_extractor, image):
|
19 |
+
key_pairs = azure_key_pairs_extractor.extract_image(image)
|
20 |
+
|
21 |
+
assert isinstance(key_pairs, dict)
|
22 |
+
assert len(key_pairs) > 1
|
23 |
+
|
24 |
+
|
25 |
+
def test_extract_pdf_key_pair(azure_key_pairs_extractor, pdf):
|
26 |
+
key_pairs = azure_key_pairs_extractor.extract_pdf(pdf)
|
27 |
+
|
28 |
+
assert isinstance(key_pairs, dict)
|
29 |
+
assert len(key_pairs) > 1
|
tests/test_utils.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import fitz
|
4 |
+
import pytest
|
5 |
+
from PIL import Image
|
6 |
+
|
7 |
+
from src.utils import (determine_file_type, get_bytes_from_path,
|
8 |
+
get_images_from_path, get_PIL_image_from_path)
|
9 |
+
|
10 |
+
IMAGE_FILE_PATH = 'test.jpg'
|
11 |
+
PDF_FILE_PATH = 'test.pdf'
|
12 |
+
|
13 |
+
|
14 |
+
@pytest.fixture
|
15 |
+
def pdf_file():
|
16 |
+
doc = fitz.open()
|
17 |
+
page1 = doc.new_page()
|
18 |
+
page1.insert_text((72, 72), "This is a test PDF created with PyMuPDF. Page 1")
|
19 |
+
page2 = doc.new_page()
|
20 |
+
page2.insert_text((72, 72), "This is a test PDF created with PyMuPDF. Page 2")
|
21 |
+
doc.save(PDF_FILE_PATH)
|
22 |
+
doc.close()
|
23 |
+
yield PDF_FILE_PATH
|
24 |
+
os.remove(PDF_FILE_PATH)
|
25 |
+
|
26 |
+
|
27 |
+
@pytest.fixture
|
28 |
+
def image_file():
|
29 |
+
image = Image.new('RGB', (10, 10), color='red')
|
30 |
+
image.save(IMAGE_FILE_PATH)
|
31 |
+
yield IMAGE_FILE_PATH
|
32 |
+
os.remove(IMAGE_FILE_PATH)
|
33 |
+
|
34 |
+
|
35 |
+
def test_determine_file_type(pdf_file, image_file):
|
36 |
+
assert determine_file_type(pdf_file) == 'pdf'
|
37 |
+
assert determine_file_type(image_file) == 'image'
|
38 |
+
|
39 |
+
|
40 |
+
def test_get_bytes_from_path(pdf_file):
|
41 |
+
with open(pdf_file, 'rb') as f:
|
42 |
+
expected_bytes = f.read()
|
43 |
+
|
44 |
+
assert get_bytes_from_path(pdf_file) == expected_bytes
|
45 |
+
|
46 |
+
|
47 |
+
def test_get_PIL_image_from_path(image_file):
|
48 |
+
image = get_PIL_image_from_path(image_file)
|
49 |
+
assert isinstance(image, Image.Image)
|
50 |
+
|
51 |
+
|
52 |
+
def test_get_images_from_pdf_path(pdf_file):
|
53 |
+
images = get_images_from_path(pdf_file)
|
54 |
+
|
55 |
+
assert len(images) == 2
|
56 |
+
assert all([isinstance(obj, Image.Image) for obj in images])
|
57 |
+
|
58 |
+
|
59 |
+
def test_get_images_from_one_image_path(image_file):
|
60 |
+
images = get_images_from_path(image_file)
|
61 |
+
|
62 |
+
assert len(images) == 1
|