Spaces:
Runtime error
Runtime error
Upload 8 files
Browse files- README.MD +267 -0
- requirements.txt +11 -0
- run_converter.py +27 -0
- run_donut.py +54 -0
- run_donut_data_generator.py +22 -0
- run_donut_test.py +8 -0
- run_donut_upload.py +8 -0
- run_ocr.py +32 -0
README.MD
ADDED
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Sparrow Data
|
2 |
+
|
3 |
+
## Description
|
4 |
+
|
5 |
+
This module implements the data structure for Sparrow ML model fine-tuning. We use a list of invoices to build a Hugging Face dataset.
|
6 |
+
|
7 |
+
## Install
|
8 |
+
|
9 |
+
1. Install
|
10 |
+
|
11 |
+
```
|
12 |
+
pip install -r requirements.txt
|
13 |
+
```
|
14 |
+
|
15 |
+
2. Install Poppler, required for pdf2image to work (macOS example)
|
16 |
+
|
17 |
+
```
|
18 |
+
brew install poppler
|
19 |
+
```
|
20 |
+
|
21 |
+
3. Mindee docTR OCR installation with dependencies
|
22 |
+
|
23 |
+
```
|
24 |
+
pip install torch torchvision torchaudio
|
25 |
+
pip install python-doctr
|
26 |
+
```
|
27 |
+
|
28 |
+
## Usage
|
29 |
+
|
30 |
+
1. Run OCR on invoices with PDF conversion to JPG
|
31 |
+
|
32 |
+
```
|
33 |
+
python run_ocr.py
|
34 |
+
```
|
35 |
+
|
36 |
+
2. Run data conversion to Sparrow format
|
37 |
+
|
38 |
+
```
|
39 |
+
python run_converter.py
|
40 |
+
```
|
41 |
+
|
42 |
+
Run Sparrow UI to annotate the documents and create key/value pairs.
|
43 |
+
|
44 |
+
3. Run data preparation task for Donut model fine-tuning. This task will create metadata. It will create Hugging Face dataset with train, validation and test splits for Donut model fine-tuning
|
45 |
+
|
46 |
+
```
|
47 |
+
python run_donut.py
|
48 |
+
```
|
49 |
+
|
50 |
+
4. Push dataset to Huggung Face Hub. You need to have Hugging Face account and Hugging Face Hub token. Read more: https://huggingface.co/docs/datasets/main/en/image_dataset
|
51 |
+
|
52 |
+
```
|
53 |
+
python run_donut_upload.py
|
54 |
+
```
|
55 |
+
|
56 |
+
5. Test dataset by using load_dataset and fetching data from Hugging Face Hub
|
57 |
+
|
58 |
+
```
|
59 |
+
python run_donut_test.py
|
60 |
+
```
|
61 |
+
|
62 |
+
## FastAPI Service
|
63 |
+
|
64 |
+
Set environment variables in **set_env_vars.sh**
|
65 |
+
|
66 |
+
1. Run
|
67 |
+
|
68 |
+
```
|
69 |
+
cd api
|
70 |
+
```
|
71 |
+
|
72 |
+
```
|
73 |
+
RUN_LOCALLY=true ./start.sh
|
74 |
+
```
|
75 |
+
|
76 |
+
2. FastAPI Swagger
|
77 |
+
|
78 |
+
```
|
79 |
+
http://127.0.0.1:8000/api/v1/sparrow-data/docs
|
80 |
+
```
|
81 |
+
|
82 |
+
**Run in Docker container**
|
83 |
+
|
84 |
+
1. Build Docker image
|
85 |
+
|
86 |
+
```
|
87 |
+
docker build --tag katanaml/sparrow-data .
|
88 |
+
```
|
89 |
+
|
90 |
+
2. Run Docker container
|
91 |
+
|
92 |
+
```
|
93 |
+
docker run -e RUN_LOCALLY=true -it --name sparrow-data -p 7860:7860 katanaml/sparrow-data:latest
|
94 |
+
```
|
95 |
+
|
96 |
+
## Endpoints
|
97 |
+
|
98 |
+
1. Info
|
99 |
+
|
100 |
+
```
|
101 |
+
curl -X 'GET' \
|
102 |
+
'https://katanaml-org-sparrow-data.hf.space/api-dataset/v1/sparrow-data/dataset_info' \
|
103 |
+
-H 'accept: application/json'
|
104 |
+
```
|
105 |
+
|
106 |
+
Replace URL with your own
|
107 |
+
|
108 |
+
2. Ground truth
|
109 |
+
|
110 |
+
```
|
111 |
+
curl -X 'GET' \
|
112 |
+
'https://katanaml-org-sparrow-data.hf.space/api-dataset/v1/sparrow-data/ground_truth' \
|
113 |
+
-H 'accept: application/json'
|
114 |
+
```
|
115 |
+
|
116 |
+
Replace URL with your own
|
117 |
+
|
118 |
+
3. OCR service
|
119 |
+
|
120 |
+
```
|
121 |
+
curl -X 'POST' \
|
122 |
+
'https://katanaml-org-sparrow-data.hf.space/api-ocr/v1/sparrow-data/ocr' \
|
123 |
+
-H 'accept: application/json' \
|
124 |
+
-H 'Content-Type: multipart/form-data' \
|
125 |
+
-F 'file=' \
|
126 |
+
-F 'image_url=https://raw.githubusercontent.com/katanaml/sparrow/main/sparrow-data/docs/input/invoices/processed/images/invoice_10.jpg' \
|
127 |
+
-F 'post_processing=false' \
|
128 |
+
-F 'sparrow_key=your_key'
|
129 |
+
```
|
130 |
+
|
131 |
+
Replace URL with your own
|
132 |
+
|
133 |
+
4. OCR statistics
|
134 |
+
|
135 |
+
```
|
136 |
+
curl -X 'GET' \
|
137 |
+
'https://katanaml-org-sparrow-data.hf.space/api-ocr/v1/sparrow-data/statistics' \
|
138 |
+
-H 'accept: application/json'
|
139 |
+
```
|
140 |
+
|
141 |
+
Replace URL with your own
|
142 |
+
|
143 |
+
## Endpoints - ChatGPT Plugin
|
144 |
+
|
145 |
+
1. Get OCR content for receipt
|
146 |
+
|
147 |
+
```
|
148 |
+
curl -X 'GET' \
|
149 |
+
'https://katanaml-org-sparrow-data.hf.space/api-chatgpt-plugin/v1/sparrow-data/receipt_by_id?receipt_id=34563&sparrow_key=your_key' \
|
150 |
+
-H 'accept: application/json'
|
151 |
+
```
|
152 |
+
|
153 |
+
Replace URL with your own
|
154 |
+
|
155 |
+
2. Post Receipt JSON content to DB
|
156 |
+
|
157 |
+
```
|
158 |
+
curl -X 'POST' \
|
159 |
+
'https://katanaml-org-sparrow-data.hf.space/api-chatgpt-plugin/v1/sparrow-data/store_receipt_db' \
|
160 |
+
-H 'accept: application/json' \
|
161 |
+
-H 'Content-Type: application/x-www-form-urlencoded' \
|
162 |
+
-d 'chatgpt_user=user&receipt_id=12345&receipt_content=%7Breceipt%7D&sparrow_key=your_key'
|
163 |
+
```
|
164 |
+
|
165 |
+
Replace URL with your own
|
166 |
+
|
167 |
+
3. Get receipt JSON from DB by ID
|
168 |
+
|
169 |
+
```
|
170 |
+
curl -X 'GET' \
|
171 |
+
'https://katanaml-org-sparrow-data.hf.space/api-chatgpt-plugin/v1/sparrow-data/receipt_db_by_id?chatgpt_user=user&receipt_id=12345&sparrow_key=your_key' \
|
172 |
+
-H 'accept: application/json'
|
173 |
+
```
|
174 |
+
|
175 |
+
Replace URL with your own
|
176 |
+
|
177 |
+
4. Delete receipt JSON from DB by ID
|
178 |
+
|
179 |
+
```
|
180 |
+
curl -X 'DELETE' \
|
181 |
+
'https://katanaml-org-sparrow-data.hf.space/api-chatgpt-plugin/v1/sparrow-data/receipt_db_by_id?chatgpt_user=user&receipt_id=13456&sparrow_key=your_key' \
|
182 |
+
-H 'accept: application/json'
|
183 |
+
```
|
184 |
+
|
185 |
+
Replace URL with your own
|
186 |
+
|
187 |
+
5. Get all IDs for receipts stored in DB
|
188 |
+
|
189 |
+
```
|
190 |
+
curl -X 'GET' \
|
191 |
+
'https://katanaml-org-sparrow-data.hf.space/api-chatgpt-plugin/v1/sparrow-data/receipt_db_ids_by_user?chatgpt_user=user&sparrow_key=your_key' \
|
192 |
+
-H 'accept: application/json'
|
193 |
+
```
|
194 |
+
|
195 |
+
Replace URL with your own
|
196 |
+
|
197 |
+
6. Get all receipts content stored in DB
|
198 |
+
|
199 |
+
```
|
200 |
+
curl -X 'GET' \
|
201 |
+
'https://katanaml-org-sparrow-data.hf.space/api-chatgpt-plugin/v1/sparrow-data/receipt_db_content_by_user?chatgpt_user=user&sparrow_key=your_key' \
|
202 |
+
-H 'accept: application/json'
|
203 |
+
```
|
204 |
+
|
205 |
+
Replace URL with your own
|
206 |
+
|
207 |
+
## CLI
|
208 |
+
|
209 |
+
Navigate to 'cli' folder and run 'chmod +x sparrowdata'. Add to system path to make it executable globally on the system.
|
210 |
+
|
211 |
+
1. OCR
|
212 |
+
|
213 |
+
```
|
214 |
+
./sparrowdata --api_url https://katanaml-org-sparrow-data.hf.space/api-ocr/v1/sparrow-data/ocr \
|
215 |
+
--file_path ../docs/models/donut/data/img/test/invoice_2.jpg \
|
216 |
+
--post_processing false \
|
217 |
+
--sparrow_key your_key
|
218 |
+
```
|
219 |
+
|
220 |
+
## Deploy to Hugging Face Spaces
|
221 |
+
|
222 |
+
1. Create new space - https://huggingface.co/spaces. Follow instructions from readme doc
|
223 |
+
|
224 |
+
2. Create huggingface_key secret in space settings
|
225 |
+
|
226 |
+
3. In config.py, replace huggingface_key variable with this line of code
|
227 |
+
|
228 |
+
```
|
229 |
+
huggingface_key: str = os.environ.get("huggingface_key")
|
230 |
+
```
|
231 |
+
|
232 |
+
4. Commit and push code to the space, follow readme instructions. Docker container will be deployed automatically. Example:
|
233 |
+
|
234 |
+
```
|
235 |
+
https://huggingface.co/spaces/katanaml-org/sparrow-data
|
236 |
+
```
|
237 |
+
|
238 |
+
5. Sparrow Data API will be accessible by URL, you can get it from space info. Example:
|
239 |
+
|
240 |
+
```
|
241 |
+
https://katanaml-org-sparrow-data.hf.space/api/v1/sparrow-data/docs
|
242 |
+
```
|
243 |
+
|
244 |
+
## MongoDB connection
|
245 |
+
|
246 |
+
If post_processing is set to True, then OCR results will be saved to MongoDB. You need to have MongoDB Atlas account and MongoDB Atlas token. Read more: https://docs.atlas.mongodb.com/configure-api-access/
|
247 |
+
|
248 |
+
1. Set environment variable for MongoDB Atlas connection, before starting FastAPI service
|
249 |
+
|
250 |
+
```
|
251 |
+
export MONGODB_URL="mongodb+srv://sparrow:<password>@<url>/?retryWrites=true&w=majority"
|
252 |
+
```
|
253 |
+
|
254 |
+
|
255 |
+
## Dataset info
|
256 |
+
|
257 |
+
- [Samples of electronic invoices](https://data.mendeley.com/datasets/tnj49gpmtz)
|
258 |
+
- [Receipts](https://www.kaggle.com/jenswalter/receipts)
|
259 |
+
- [SROIE](https://github.com/zzzDavid/ICDAR-2019-SROIE)
|
260 |
+
|
261 |
+
## Author
|
262 |
+
|
263 |
+
[Katana ML](https://katanaml.io), [Andrej Baranovskij](https://github.com/abaranovskis-redsamurai)
|
264 |
+
|
265 |
+
## License
|
266 |
+
|
267 |
+
Licensed under the Apache License, Version 2.0. Copyright 2020-2023 Katana ML, Andrej Baranovskij. [Copy of the license](https://github.com/katanaml/sparrow/blob/main/LICENSE).
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pdf2image==1.16.2
|
2 |
+
torch==1.13.1
|
3 |
+
torchvision
|
4 |
+
torchaudio
|
5 |
+
datasets==2.10.1
|
6 |
+
fastapi==0.96.0
|
7 |
+
python-doctr==0.6.0
|
8 |
+
paddleocr==2.6.1.3
|
9 |
+
paddlepaddle==2.4.2
|
10 |
+
uvicorn[standard]
|
11 |
+
rapidfuzz<3.0
|
run_converter.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tools.data_converter import DataConverter
import os
import shutil


def main():
    """Convert OCR results to Sparrow format and copy the output JSON to the Sparrow UI.

    Reads OCR output from docs/input/invoices/processed/ocr, writes converted
    files to docs/input/invoices/processed/output, then copies them to
    ../sparrow-ui/docs/json for annotation in the Sparrow UI.
    """
    # Convert to sparrow format
    data_converter = DataConverter()
    data_converter.convert_to_sparrow_format('docs/input/invoices/processed/ocr',
                                             'docs/input/invoices/processed/output')

    # define the source and destination directory
    src_dir = 'docs/input/invoices/processed/output'
    dst_dir = '../sparrow-ui/docs/json'

    # shutil.copy does not create the destination directory; make sure it exists
    os.makedirs(dst_dir, exist_ok=True)

    # Copy every regular file from source to destination. Skip subdirectories:
    # os.listdir returns them too, and shutil.copy would raise on a directory.
    for f in os.listdir(src_dir):
        src_file = os.path.join(src_dir, f)
        if os.path.isfile(src_file):
            shutil.copy(src_file, os.path.join(dst_dir, f))


if __name__ == '__main__':
    main()
|
run_donut.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tools.donut.metadata_generator import DonutMetadataGenerator
from tools.donut.dataset_generator import DonutDatasetGenerator
from pathlib import Path
import os
import shutil


def main():
    """Prepare Donut fine-tuning data.

    Copies annotated JSON files and matching images from the Sparrow UI folders,
    splits them into train/validation/test (85/10/5), generates Donut metadata
    for each split, and builds the Hugging Face dataset.
    """
    # define the source and destination directories
    src_dir_json = '../sparrow-ui/docs/json/key'
    src_dir_img = '../sparrow-ui/docs/images'
    dst_dir_json = 'docs/models/donut/data/key'
    dst_dir_img = 'docs/models/donut/data/key/img'

    # shutil.copy does not create destination directories; ensure they exist
    os.makedirs(dst_dir_json, exist_ok=True)
    os.makedirs(dst_dir_img, exist_ok=True)

    # copy JSON files from src to dst (regular files only)
    for f in os.listdir(src_dir_json):
        src_file = os.path.join(src_dir_json, f)
        if os.path.isfile(src_file):
            shutil.copy(src_file, os.path.join(dst_dir_json, f))

    # copy an image only if an annotation with the same base name exists in
    # dst_dir_json. Use splitext instead of slicing off the last 4 characters,
    # so extensions of any length (.jpg, .jpeg, .png) are handled correctly.
    for f in os.listdir(src_dir_img):
        base_name = os.path.splitext(f)[0]
        if os.path.isfile(os.path.join(dst_dir_json, base_name + '.json')):
            shutil.copy(os.path.join(src_dir_img, f),
                        os.path.join(dst_dir_img, f))

    # Convert to Donut format
    base_path = 'docs/models/donut/data'
    data_dir_path = Path(base_path).joinpath("key")
    # sort for deterministic, reproducible splits (glob order is unspecified)
    files_list = sorted(data_dir_path.glob("*.json"))

    # split files_list into 3 parts: 85% train, 10% validation, 5% test
    n = len(files_list)
    train_files_list = files_list[:int(n * 0.85)]
    print("Train set size:", len(train_files_list))
    validation_files_list = files_list[int(n * 0.85):int(n * 0.95)]
    print("Validation set size:", len(validation_files_list))
    test_files_list = files_list[int(n * 0.95):]
    print("Test set size:", len(test_files_list))

    metadata_generator = DonutMetadataGenerator()
    metadata_generator.generate(base_path, train_files_list, "train")
    metadata_generator.generate(base_path, validation_files_list, "validation")
    metadata_generator.generate(base_path, test_files_list, "test")

    # Generate dataset
    dataset_generator = DonutDatasetGenerator()
    dataset_generator.generate(base_path)


if __name__ == '__main__':
    main()
|
run_donut_data_generator.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import cv2


def main():
    """Duplicate a sample invoice image to create additional test data files.

    Copies invoice_1.jpg to invoice_250.jpg ... invoice_499.jpg in the same
    directory, for bulk-testing the Donut data pipeline.
    """
    file_name = "docs/models/donut/data/img/test/invoice_1.jpg"
    img = cv2.imread(file_name)
    # cv2.imread returns None (it does not raise) when the file is missing or
    # unreadable; fail fast with a clear error instead of a confusing imwrite one
    if img is None:
        raise FileNotFoundError(f"Could not read image: {file_name}")
    for i in range(250, 500):
        new_file_name = file_name.replace("invoice_1", f"invoice_{i}")
        cv2.imwrite(new_file_name, img)


if __name__ == '__main__':
    main()
|
run_donut_test.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tools.donut.dataset_tester import DonutDatasetTester


def main():
    """Sanity-check the published dataset by loading it from the Hugging Face Hub."""
    tester = DonutDatasetTester()
    tester.test("katanaml-org/invoices-donut-data-v1")


if __name__ == '__main__':
    main()
|
run_donut_upload.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tools.donut.dataset_uploader import DonutDatasetUploader
|
run_ocr.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tools.pdf_converter import PDFConverter
from tools.ocr_extractor import OCRExtractor
import os
import shutil


def main():
    """Convert invoice PDFs to JPG, share the images with Sparrow UI, and run OCR.

    PDFs are read from 'docs/input/invoices/Dataset with valid information',
    converted images land in docs/input/invoices/processed/images and are copied
    to ../sparrow-ui/docs/images before OCR extraction runs.
    """
    # Convert pdf to jpg
    pdf_converter = PDFConverter()
    pdf_converter.convert_to_jpg('docs/input/invoices/Dataset with valid information',
                                 'docs/input/invoices/processed/images')

    # define the source and destination directory
    src_dir = 'docs/input/invoices/processed/images'
    dst_dir = '../sparrow-ui/docs/images'

    # shutil.copy does not create the destination directory; make sure it exists
    os.makedirs(dst_dir, exist_ok=True)

    # Copy every regular file from source to destination. Skip subdirectories:
    # os.listdir returns them too, and shutil.copy would raise on a directory.
    for f in os.listdir(src_dir):
        src_file = os.path.join(src_dir, f)
        if os.path.isfile(src_file):
            shutil.copy(src_file, os.path.join(dst_dir, f))

    # OCR: db_resnet50 (detection) + crnn_vgg16_bn (recognition) model names
    ocr_extractor = OCRExtractor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
    ocr_extractor.extract('docs/input/invoices/processed', show_prediction=False)


if __name__ == '__main__':
    main()
|