ITNovaML katanaml committed on
Commit
f25b29f
0 Parent(s):

Duplicate from katanaml-org/sparrow-data

Browse files

Co-authored-by: Andrej Baranovskij <katanaml@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+
2
+ .DS_Store
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.7

WORKDIR /code

COPY requirements-fastapi.txt ./

# System packages: libGL plus the Poppler tools/headers used by the PDF
# dependencies in requirements-fastapi.txt (pdf2image, python-poppler).
# `apt-get update` and `apt-get install` share a single RUN so a cached,
# stale package index is never combined with a later install step; the index
# is removed afterwards to keep the layer small.
RUN apt-get update && apt-get install -y \
    libgl1-mesa-dev \
    poppler-utils \
    libpoppler-cpp-dev \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir --upgrade -r /code/requirements-fastapi.txt

# Run the application as an unprivileged user with uid 1000.
RUN useradd -m -u 1000 user

USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

COPY --chown=user . $HOME/app/

CMD ["uvicorn", "endpoints:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Sparrow Data
3
+ emoji: 🏃
4
+ colorFrom: pink
5
+ colorTo: gray
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ duplicated_from: katanaml-org/sparrow-data
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__init__.py ADDED
File without changes
config.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseSettings
2
+ import os
3
+
4
+
5
from typing import Optional


# Application settings.
#
# The three keys are read from the process environment when this module is
# imported. `os.environ.get` returns None for a variable that is not set, so
# the fields are annotated Optional[str] to match the values they can hold.
class Settings(BaseSettings):
    huggingface_key: Optional[str] = os.environ.get("huggingface_key")
    sparrow_key: Optional[str] = os.environ.get("sparrow_key")
    secure_key: Optional[str] = os.environ.get("secure_key")
    # Hugging Face dataset identifier used by the dataset endpoints.
    dataset_name: str = "katanaml-org/invoices-donut-data-v1"
    # Relative path of the JSON file that accumulates OCR timing statistics.
    ocr_stats_file: str = "data/ocr_stats.json"


# Single shared settings instance imported by the routers.
settings = Settings()
data/ocr_stats.json ADDED
@@ -0,0 +1 @@
 
 
1
+ [[0.0, "receipt_00001.png", "2023-05-23 10:55:43"], [19.22510600090027, "receipt_00001.png", "2023-05-23 11:11:02"], [4.0531158447265625e-06, "receipt_00001.png", "2023-05-23 11:11:10"], [3.0994415283203125e-06, "receipt_00001.png", "2023-05-23 11:11:11"], [3.0994415283203125e-06, "receipt_00001.png", "2023-05-23 11:11:12"], [2.86102294921875e-06, "receipt_00001.png", "2023-05-23 11:11:13"], [3.0994415283203125e-06, "receipt_00001.png", "2023-05-23 11:11:13"], [3.653481960296631, "receipt_00001.png", "2023-05-23 11:32:48"], [8.929341077804565, "receipt_00001.png", "2023-05-23 11:34:52"], [3.5088820457458496, "receipt_00001.png", "2023-05-23 16:32:17"], [2.863774061203003, "receipt_00001.png", "2023-05-23 16:32:56"], [4.174198150634766, "inout-20211211_001.jpg", "2023-05-23 16:38:33"], [4.616858243942261, "inout-20211211_001.jpg", "2023-05-23 16:39:28"], [4.6479880809783936, "inout-20211211_001.jpg", "2023-05-23 16:47:27"], [4.756654262542725, "inout-20211211_001.jpg", "2023-05-23 22:07:14"], [5.704661130905151, "wholefoods-20211211_005.jpg", "2023-05-23 22:16:38"], [6.363792896270752, "wholefoods-20211211_005.jpg", "2023-05-23 22:24:13"], [6.582294940948486, "cvs-20211211_009.jpg", "2023-05-23 22:43:41"], [8.032721757888794, "oldnavy-20211211_015.jpg", "2023-05-23 22:45:58"], [6.35598611831665, "ross-20211211_010.jpg", "2023-05-23 22:47:50"], [7.241703987121582, "ross-20211211_010.jpg", "2023-05-24 11:03:57"], [6.259234189987183, "ross-20211211_010.pdf", "2023-05-24 11:04:26"], [7.275213956832886, "ross-20211211_010.pdf", "2023-05-24 11:05:25"], [5.848371982574463, "invoice_10.jpg", "2023-05-24 11:06:21"], [6.028747081756592, "invoice_10.jpg", "2023-05-24 11:12:14"], [6.5253260135650635, "cvs-20211211_009.jpg", "2023-05-24 11:29:53"], [6.507750988006592, "cvs-20211211_009.jpg", "2023-05-24 12:55:14"], [2.864002227783203, "receipt_00001.png", "2023-05-24 12:55:30"], [2.9030818939208984, "receipt_00001.png", "2023-05-24 12:55:40"], [5.672614097595215, 
"wholefoods-20211211_005.jpg", "2023-05-24 12:56:13"], [5.712976932525635, "wholefoods-20211211_005.pdf", "2023-05-24 12:56:29"], [5.984729051589966, "invoice_10.jpg", "2023-05-24 13:00:23"], [7.3337507247924805, "bestbuy-20211211_006.pdf", "2023-05-24 13:01:13"], [4.676954984664917, "inout-20211211_001.jpg", "2023-05-24 21:09:53"], [3.9793169498443604, "inout-20211211_001.jpg", "2023-05-24 22:01:12"], [4.716302871704102, "inout-20211211_001.jpg", "2023-05-24 22:07:19"], [4.611649990081787, "inout-20211211_001.jpg", "2023-05-24 22:11:00"], [5.18176007270813, "inout-20211211_001.jpg", "2023-05-24 22:12:26"], [4.76771092414856, "inout-20211211_001.jpg", "2023-05-25 10:00:11"], [4.62838888168335, "inout-20211211_001.jpg", "2023-05-25 10:12:36"], [4.6390650272369385, "inout-20211211_001.jpg", "2023-05-25 10:35:31"], [4.605455160140991, "inout-20211211_001.jpg", "2023-05-25 10:36:59"], [4.541555881500244, "inout-20211211_001.jpg", "2023-05-25 10:37:41"], [4.652244806289673, "inout-20211211_001.jpg", "2023-05-25 10:38:09"], [3.947613000869751, "inout-20211211_001.jpg", "2023-05-25 10:58:54"], [4.597126245498657, "inout-20211211_001.jpg", "2023-05-25 11:00:03"], [4.6871421337127686, "inout-20211211_001.jpg", "2023-05-25 11:02:44"], [4.579195976257324, "inout-20211211_001.jpg", "2023-05-25 11:32:11"], [4.734511137008667, "inout-20211211_001.jpg", "2023-05-25 11:33:03"], [4.602473258972168, "inout-20211211_001.jpg", "2023-05-25 11:44:11"], [4.563000202178955, "inout-20211211_001.jpg", "2023-05-25 11:47:35"], [4.576035022735596, "inout-20211211_001.jpg", "2023-05-25 11:49:55"], [4.860241889953613, "inout-20211211_001.jpg", "2023-05-25 11:53:19"], [4.693282127380371, "inout-20211211_001.jpg", "2023-05-25 11:56:00"], [4.5564610958099365, "inout-20211211_001.jpg", "2023-05-25 16:02:52"], [5.022596836090088, "inout-20211211_001.jpg", "2023-05-25 16:03:47"], [4.650119781494141, "inout-20211211_001.jpg", "2023-05-25 16:27:35"], [6.16159200668335, "inout-20211211_001.jpg", 
"2023-05-30 22:15:29"], [9.421452045440674, "../docs/models/donut/data/img/test/invoice_2.jpg", "2023-06-07 21:03:34"]]
data/result.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ['YOUR GUEST NUMBER IS']
2
+ ['43']
3
+ ['IN-N-OUT BURGER LINQ']
4
+ ['320 6 2166 6301']
5
+ ['Cashier: SERJI0 SA']
6
+ ['Check : 43']
7
+ ['TRANS #: 6301']
8
+ ['1 Db1-Dbl']
9
+ ['5.25']
10
+ ['+ Onion']
11
+ ['1 Fry']
12
+ ['2.35']
13
+ ['1 Med Soft Drink']
14
+ ['2.15']
15
+ ['COUNTER-Eat In']
16
+ ['9.75']
17
+ ['TAX 8.375%']
18
+ ['.82']
19
+ ['Amount Due']
20
+ ['$10.57']
21
+ ['Tender MasterCard']
22
+ ['$10.57']
23
+ ['Change']
24
+ ['$.00']
25
+ ['CHARGE DETAIL']
26
+ ['SALE']
27
+ ['Card Type:']
28
+ ['Mastercard']
29
+ ['Account :']
30
+ ['************5562 R']
31
+ ['Auth Code:']
32
+ ['NDTQU8']
33
+ ['Trans #:']
34
+ ['6301']
35
+ ['Auth Ref :']
36
+ ['2015517078']
37
+ ['AUTH AMT :']
38
+ ['$10.57']
39
+ ['AID:']
40
+ ['A0000000041010']
41
+ ['TVR :']
42
+ ['0000008001']
43
+ ['TSI:']
44
+ ['0000']
45
+ ['App Name:']
46
+ ['Debit MasterCard']
47
+ ['ARQC:']
48
+ ['ADCF5208793B7BD6']
49
+ ['THANK YOU!']
50
+ ['Quest ions/Comments: Cal1 800-786-1 :10']
51
+ ['L1 T6']
52
+ ['9:21 PM']
53
+ ['2021-11-30']
endpoints.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from routers import dataset
4
+ from routers import ocr
5
+ from routers import chatgpt_plugin
6
+
7
# FastAPI application; the OpenAPI schema and Swagger UI are served under
# the /api/v1/sparrow-data prefix.
app = FastAPI(
    openapi_url="/api/v1/sparrow-data/openapi.json",
    docs_url="/api/v1/sparrow-data/docs",
)

# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive -- confirm this is intended for the deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)

# Mount each feature router under its own versioned prefix.
for _router, _prefix, _tag in (
    (dataset.router, "/api-dataset/v1/sparrow-data", "Dataset"),
    (ocr.router, "/api-ocr/v1/sparrow-data", "OCR"),
    (chatgpt_plugin.router, "/api-chatgpt-plugin/v1/sparrow-data", "ChatGPT Plugin"),
):
    app.include_router(_router, prefix=_prefix, tags=[_tag])


# Root endpoint: returns a fixed identification message.
@app.get("/")
async def root():
    return {"message": "Sparrow Data API"}
requirements-fastapi.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pdf2image==1.16.2
2
+ python-poppler==0.4.1
3
+ datasets==2.10.1
4
+ Pillow==9.5.0
5
+ paddlepaddle==2.4.2
6
+ paddleocr==2.6.1.3
7
+ fastapi==0.96.0
8
+ python-multipart
9
+ motor==3.1.2
10
+ pydantic==1.10.8
11
+ pycryptodome==3.18.0
12
+ uvicorn[standard]
routers/__init__.py ADDED
File without changes
routers/chatgpt_plugin.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException, Response, Form
2
+ from config import settings
3
+ import os
4
+ import motor.motor_asyncio
5
+ from routers.data_utils import get_receipt_data
6
+ from routers.data_utils import store_receipt_db_data
7
+ from routers.data_utils import get_receipt_db_data
8
+ from routers.data_utils import delete_receipt_db_data
9
+ from routers.data_utils import get_user_receipt_db_ids
10
+ from routers.data_utils import get_user_receipt_content_db
11
+ from pymongo.errors import PyMongoError
12
+ import json
13
+
14
+
15
router = APIRouter()


# Module-level MongoDB handles. They stay None unless MONGODB_URL is present
# in the environment when the startup hook runs.
client = None
db = None


@router.on_event("startup")
async def startup_event():
    """Connect to MongoDB when the MONGODB_URL environment variable is set."""
    global client, db

    mongodb_url = os.environ.get("MONGODB_URL")
    if mongodb_url is None:
        return

    client = motor.motor_asyncio.AsyncIOMotorClient(mongodb_url)
    db = client.chatgpt_plugin
    print("Connected to MongoDB from ChatGPT plugin!")
30
+
31
+
32
@router.on_event("shutdown")
async def shutdown_event():
    """Close the MongoDB client if one was opened."""
    # Test the handle itself rather than the environment: the client is only
    # assigned by the startup hook, and calling close() on None would raise.
    if client is not None:
        client.close()
37
+
38
+
39
# Fetch the stored OCR upload for `receipt_id`.
#
# Responses:
#   - {"error": ...} body when `sparrow_key` does not match the configured key
#   - 400 when MONGODB_URL is not configured
#   - 404 when no upload exists for `receipt_id`
@router.get("/receipt_by_id")
async def get_receipt_by_id(receipt_id: str, sparrow_key: str):
    if sparrow_key != settings.sparrow_key:
        return {"error": "Invalid Sparrow key."}

    if "MONGODB_URL" not in os.environ:
        # HTTPException must be raised; a returned instance is serialised as
        # an ordinary 200 response body instead of producing the error status.
        raise HTTPException(status_code=400, detail="No MongoDB URL provided.")

    result = await get_receipt_data(receipt_id, db)

    if result is None:
        raise HTTPException(status_code=404, detail=f"Receipt {receipt_id} not found")

    return result
53
+
54
+
55
# Store (upsert) processed receipt content for a user.
#
# `receipt_content` must be a JSON document passed as a string.
#
# Responses:
#   - {"error": ...} body when `sparrow_key` does not match the configured key
#   - 400 when MONGODB_URL is not configured, the content is not valid JSON,
#     or the data could not be saved
#   - empty 200 response on success
@router.post("/store_receipt_db")
async def run_store_receipt_db(chatgpt_user: str = Form(None), receipt_id: str = Form(None),
                               receipt_content: str = Form(None), sparrow_key: str = Form(None)):

    if sparrow_key != settings.sparrow_key:
        return {"error": "Invalid Sparrow key."}

    print(f"Storing receipt {receipt_id} for user {chatgpt_user}...")

    if "MONGODB_URL" not in os.environ:
        raise HTTPException(status_code=400, detail="No MongoDB URL provided.")

    try:
        json.loads(receipt_content)
    except (TypeError, json.decoder.JSONDecodeError):
        # TypeError covers a missing form field (receipt_content is None).
        raise HTTPException(status_code=400, detail="Receipt content is not valid JSON.")

    try:
        result = await store_receipt_db_data(chatgpt_user, receipt_id, receipt_content, db)
    except PyMongoError:
        raise HTTPException(status_code=400, detail="Saving data failed.")

    if result is None:
        # The storage helper returns None when its model validation fails;
        # report that as a save failure rather than a missing MongoDB URL.
        raise HTTPException(status_code=400, detail="Saving data failed.")

    return Response(status_code=200)
79
+
80
+
81
# Return the stored receipt content for (chatgpt_user, receipt_id), parsed
# from its JSON string form.
#
# Responses:
#   - {"error": ...} body when `sparrow_key` does not match the configured key
#   - 400 when MONGODB_URL is not configured
#   - 404 when the receipt does not exist
@router.get("/receipt_db_by_id")
async def get_receipt_db_by_id(chatgpt_user: str, receipt_id: str, sparrow_key: str):
    if sparrow_key != settings.sparrow_key:
        return {"error": "Invalid Sparrow key."}

    if "MONGODB_URL" not in os.environ:
        # Raise (not return) so the client actually receives a 400 status.
        raise HTTPException(status_code=400, detail="No MongoDB URL provided.")

    result = await get_receipt_db_data(chatgpt_user, receipt_id, db)

    if result is None:
        raise HTTPException(status_code=404, detail=f"Receipt {receipt_id} not found")

    return json.loads(result)
95
+
96
+
97
# Delete the stored receipt for (chatgpt_user, receipt_id).
#
# Responses:
#   - {"error": ...} body when `sparrow_key` does not match the configured key
#   - 400 when MONGODB_URL is not configured
#   - 404 when nothing was deleted
#   - empty 200 response on success
@router.delete("/receipt_db_by_id")
async def delete_receipt_db_by_id(chatgpt_user: str, receipt_id: str, sparrow_key: str):
    if sparrow_key != settings.sparrow_key:
        return {"error": "Invalid Sparrow key."}

    if "MONGODB_URL" not in os.environ:
        # Raise (not return) so the client actually receives a 400 status.
        raise HTTPException(status_code=400, detail="No MongoDB URL provided.")

    result = await delete_receipt_db_data(chatgpt_user, receipt_id, db)

    if result.deleted_count == 0:
        raise HTTPException(status_code=404, detail=f"Receipt {receipt_id} not found")

    return Response(status_code=200)
111
+
112
+
113
# List the receipt keys stored for `chatgpt_user`.
#
# Responses:
#   - {"error": ...} body when `sparrow_key` does not match the configured key
#   - 400 when MONGODB_URL is not configured
#   - 404 when the lookup helper returns None
@router.get("/receipt_db_ids_by_user")
async def get_receipt_db_ids_by_user(chatgpt_user: str, sparrow_key: str):
    if sparrow_key != settings.sparrow_key:
        return {"error": "Invalid Sparrow key."}

    if "MONGODB_URL" not in os.environ:
        # Raise (not return) so the client actually receives a 400 status.
        raise HTTPException(status_code=400, detail="No MongoDB URL provided.")

    result = await get_user_receipt_db_ids(chatgpt_user, db)

    if result is None:
        raise HTTPException(status_code=404, detail=f"User {chatgpt_user} not found")

    return result
127
+
128
+
129
# Return the content of the receipts stored for `chatgpt_user`.
#
# Responses:
#   - {"error": ...} body when `sparrow_key` does not match the configured key
#   - 400 when MONGODB_URL is not configured
@router.get("/receipt_db_content_by_user")
async def get_receipt_db_content_by_user(chatgpt_user: str, sparrow_key: str):
    if sparrow_key != settings.sparrow_key:
        return {"error": "Invalid Sparrow key."}

    if "MONGODB_URL" not in os.environ:
        # Raise (not return) so the client actually receives a 400 status.
        raise HTTPException(status_code=400, detail="No MongoDB URL provided.")

    return await get_user_receipt_content_db(chatgpt_user, db)
routers/data_utils.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import secrets
2
+ import string
3
+ from bson import ObjectId
4
+ from pydantic import BaseModel, Field, ValidationError
5
+ from typing import List
6
+ import datetime
7
+ from Crypto.Cipher import AES
8
+ from Crypto.Util.Padding import pad, unpad
9
+ from base64 import b64encode, b64decode
10
+ import base64
11
+ from pymongo.errors import DuplicateKeyError
12
+ from pymongo.errors import PyMongoError
13
+ import json
14
+ from config import settings
15
+
16
+
17
# AES key material taken from settings. Callers in this module pass
# base64.b64decode(secure_key) as the key, so it is the decoded value that
# must be 16, 24, or 32 bytes long.
secure_key = settings.secure_key


def encrypt(plain_text: str, key: bytes) -> str:
    """Encrypt `plain_text` with AES in CBC mode.

    No IV is supplied by the caller; the IV exposed by the cipher object is
    prepended to the ciphertext and the whole payload is returned as a
    base64-encoded string.
    """
    cbc = AES.new(key, AES.MODE_CBC)
    init_vector = cbc.iv
    padded = pad(plain_text.encode(), AES.block_size)
    payload = init_vector + cbc.encrypt(padded)
    return b64encode(payload).decode()
26
+
27
+
28
def decrypt(encrypted_text: str, key: bytes) -> str:
    """Reverse `encrypt`: decode base64, split off the IV, decrypt and unpad.

    The first 16 bytes of the decoded payload are used as the IV; the rest is
    the AES-CBC ciphertext.
    """
    payload = b64decode(encrypted_text)
    init_vector, ciphertext = payload[:16], payload[16:]
    cbc = AES.new(key, AES.MODE_CBC, iv=init_vector)
    plain_bytes = unpad(cbc.decrypt(ciphertext), AES.block_size)
    return plain_bytes.decode()
34
+
35
+
36
# ObjectId subclass exposing the pydantic hooks (__get_validators__ /
# __modify_schema__) so it can be used as a model field type.
class PyObjectId(ObjectId):
    @classmethod
    def __get_validators__(cls):
        # pydantic iterates this generator to collect the field validators.
        yield cls.validate

    @classmethod
    def validate(cls, v):
        # Accept anything ObjectId considers valid; reject everything else.
        if ObjectId.is_valid(v):
            return ObjectId(v)
        raise ValueError("Invalid objectid")

    @classmethod
    def __modify_schema__(cls, field_schema):
        # Advertise the field as a plain string in the generated schema.
        field_schema.update({"type": "string"})
50
+
51
+
52
# Raw OCR upload: a generated id (aliased to "_id"), the receipt key, and the
# OCR lines as a list of single-element string lists.
class ReceiptModel(BaseModel):
    id: PyObjectId = Field(default_factory=PyObjectId, alias="_id")
    receipt_key: str = Field(
        ...,
        description="The unique key for the receipt.",
    )
    content: List[List[str]] = Field(
        ...,
        description="An array of single-element arrays, each containing receipt entry.",
    )

    class Config:
        # Allow construction with either the field name or its alias.
        allow_population_by_field_name = True
        arbitrary_types_allowed = True
        json_encoders = {ObjectId: str}
        schema_extra = {
            "example": {
                "receipt_key": "RzSZ0BTnuG",
                "content": [["YOUR GUEST NUMBER IS"], ["43"], ["IN-N-OUT BURGER LINQ"]],
            },
            "title": "ReceiptModel",
            "description": "A model representing a receipt with a key and its contents.",
        }
69
+
70
+
71
# Processed receipt owned by a user: a generated id (aliased to "_id"), the
# user, the receipt key, and the receipt data as a single string.
class ReceiptDBModel(BaseModel):
    id: PyObjectId = Field(default_factory=PyObjectId, alias="_id")
    user: str = Field(
        ...,
        description="The user who uploaded the receipt.",
    )
    receipt_key: str = Field(
        ...,
        description="The unique key for the receipt.",
    )
    content: str = Field(
        ...,
        description="A string representing DB receipt data.",
    )

    class Config:
        # Allow construction with either the field name or its alias.
        allow_population_by_field_name = True
        arbitrary_types_allowed = True
        json_encoders = {ObjectId: str}
        schema_extra = {
            "example": {
                "user": "user1",
                "receipt_key": "RzSZ0BTnuG",
                "content": '{"store": "CVS Pharmacy", "location": "3300 S LAS VEGAS BLVD, LAS VEGAS, NV, 89109"}',
            },
            "title": "ReceiptProcessedModel",
            "description": "A model representing a receipt DB contents.",
        }
90
+
91
+
92
def merge_data(values):
    """Reduce each entry of `values` to a single-element list.

    For every entry, the element at ``entry[1][0]`` is extracted and wrapped
    in its own list. (Presumably each entry is an OCR line of the form
    ``[box, (text, confidence)]`` -- confirm against the OCR caller.)

    Args:
        values: Sequence of entries indexable as ``entry[1][0]``.

    Returns:
        A list of one-element lists, in input order; empty for empty input.
    """
    return [[entry[1][0]] for entry in values]
99
+
100
+
101
async def store_data(data, db):
    """Encrypt OCR output and insert it into the "uploads" collection.

    A new receipt key is generated for the upload. Returns that key on
    success, or None when the data fails ReceiptModel validation. A
    DuplicateKeyError raised by the insert propagates to the caller.
    """
    print("Storing data...")

    key = generate_key()

    try:
        receipt = ReceiptModel(receipt_key=key, content=data)
    except ValidationError as e:
        print(f"An error occurred: {e}")
        return None

    # Plain dict form of the model, with the content replaced by its
    # encrypted string representation and a creation timestamp added.
    document = receipt.dict()
    document["content"] = encrypt(str(document["content"]), base64.b64decode(secure_key))
    document["created_at"] = datetime.datetime.utcnow()

    result = await db["uploads"].insert_one(document)

    print(f"Inserted document with id: {result.inserted_id}")

    return key
128
+
129
+
130
async def get_receipt_data(key, db):
    """Fetch, remove and decrypt the upload stored under `key`.

    The upload can be read only once: the document is removed from the
    "uploads" collection as part of the lookup.

    Returns:
        The decrypted content string, or None when no upload matches `key`.
    """
    print(f"Getting receipt data for key: {key}")

    # Look up and delete in a single atomic operation so that two concurrent
    # requests for the same key cannot both read the document.
    receipt = await db["uploads"].find_one_and_delete({"receipt_key": key})
    if receipt is None:
        return None

    return decrypt(receipt['content'], base64.b64decode(secure_key))
142
+
143
+
144
async def store_receipt_db_data(chatgpt_user, receipt_id, receipt_content, db):
    """Upsert encrypted receipt content for (user, receipt key).

    Returns the update result on success, or None when the input fails
    ReceiptDBModel validation. PyMongo errors propagate to the caller.
    """
    print("Storing receipt data...")

    try:
        receipt = ReceiptDBModel(user=chatgpt_user, receipt_key=receipt_id, content=receipt_content)
    except ValidationError as e:
        print(f"An error occurred: {e}")
        return None

    validated = receipt.dict()
    encrypted_content = encrypt(str(validated["content"]), base64.b64decode(secure_key))

    # Match on (user, receipt_key); create the document if it does not exist.
    selector = {"user": chatgpt_user, "receipt_key": receipt_id}
    change = {"$set": {"content": encrypted_content}}
    result = await db["receipts"].update_one(selector, change, upsert=True)

    print(f"Inserted document with id: {result}")

    return result
171
+
172
+
173
async def get_receipt_db_data(chatgpt_user, receipt_id, db):
    """Return the decrypted content of a user's receipt, or None if absent."""
    print(f"Getting receipt data for key: {receipt_id}")

    selector = {"user": chatgpt_user, "receipt_key": receipt_id}
    receipt = await db["receipts"].find_one(selector)
    if receipt is None:
        return None

    return decrypt(receipt['content'], base64.b64decode(secure_key))
182
+
183
+
184
async def get_user_receipt_db_ids(chatgpt_user, db):
    """Return the receipt keys stored for `chatgpt_user` (at most 100)."""
    print(f"Getting user receipts ids for user: {chatgpt_user}")

    documents = await db["receipts"].find({"user": chatgpt_user}).to_list(length=100)
    if documents is None:
        return []

    return [document['receipt_key'] for document in documents]
195
+
196
+
197
async def delete_receipt_db_data(chatgpt_user, receipt_id, db):
    """Delete a user's receipt and return the delete result.

    The caller inspects `deleted_count` on the returned object to tell
    whether a document was actually removed.
    """
    print(f"Deleting receipt data for key: {receipt_id}")

    selector = {"user": chatgpt_user, "receipt_key": receipt_id}
    outcome = await db["receipts"].delete_one(selector)

    if outcome.deleted_count != 0:
        print(f"Deleted document with id: {outcome}")
    else:
        print(f"Receipt with id: {receipt_id} not found")

    return outcome
208
+
209
+
210
async def get_user_receipt_content_db(chatgpt_user, db):
    """Return the decrypted, JSON-parsed content of a user's receipts.

    At most 100 documents are read from the "receipts" collection.
    """
    print(f"Getting user receipts fields for user: {chatgpt_user}")

    documents = await db["receipts"].find({"user": chatgpt_user}).to_list(length=100)
    if documents is None:
        return []

    return [
        json.loads(decrypt(document['content'], base64.b64decode(secure_key)))
        for document in documents
    ]
222
+
223
+
224
def generate_key(length=10):
    """Return a random key of `length` ASCII letters and digits.

    Characters are drawn with the `secrets` module.
    """
    charset = string.ascii_letters + string.digits
    return ''.join(secrets.choice(charset) for _ in range(length))
routers/dataset.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+ from datasets import load_dataset
3
+ from ast import literal_eval
4
+ from pydantic import BaseModel
5
+ from typing import Dict
6
+ from io import BytesIO
7
+ from PIL import Image
8
+ import base64
9
+ from config import settings
10
+ from huggingface_hub import login
11
+
12
+
13
router = APIRouter()

# Log in to the Hugging Face Hub with the configured key. This runs once,
# when the module is imported.
# NOTE(review): the key is None when the environment variable is unset --
# confirm how login() should behave in that case.
login(settings.huggingface_key)


# Response body of the /ground_truth endpoint.
class ImageResponse(BaseModel):
    # Base64-encoded image bytes.
    image_data: str
    # Parsed ground-truth annotation for the image.
    ground_truth_data: Dict
20
+
21
def encode_pil_image(image: Image.Image) -> str:
    """Encode a PIL image as a base64 JPEG string.

    Args:
        image: The image to encode.

    Returns:
        The JPEG bytes of `image`, base64-encoded and decoded as UTF-8 text.
    """
    # JPEG cannot store alpha or palette data; convert such images to RGB
    # first instead of letting the save call fail. Modes the JPEG writer
    # already accepts are left untouched.
    if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        image = image.convert("RGB")

    buffer = BytesIO()
    image.save(buffer, format='JPEG')
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
26
+
27
+
28
# Report the configured dataset's name together with the name and row count
# of each of its splits.
@router.get("/dataset_info")
async def get_dataset_info():
    dataset = load_dataset(settings.dataset_name)

    split_summaries = [
        {"name": split_name, "number_of_rows": len(dataset[split_name])}
        for split_name in dataset.keys()
    ]

    return {
        "dataset": settings.dataset_name,
        "splits": split_summaries,
    }
46
+
47
+
48
# Return the first example of the "test" split: its image as base64 JPEG and
# the "gt_parse" section of its ground-truth annotation.
@router.get("/ground_truth", response_model=ImageResponse)
async def get_ground_truth() -> ImageResponse:
    dataset = load_dataset(settings.dataset_name)
    first_example = dataset['test'][0]

    encoded_img = encode_pil_image(first_example['image'])

    # The annotation is stored as a Python-literal string.
    annotation = literal_eval(first_example['ground_truth'])
    parsed = annotation['gt_parse']

    return ImageResponse(image_data=encoded_img, ground_truth_data=parsed)
routers/ocr.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, File, UploadFile, Form, HTTPException, status
2
+ from fastapi.responses import JSONResponse
3
+ from config import settings
4
+ from PIL import Image
5
+ import urllib.request
6
+ from io import BytesIO
7
+ import utils
8
+ import os
9
+ import time
10
+ from functools import lru_cache
11
+ from paddleocr import PaddleOCR
12
+ from pdf2image import convert_from_bytes
13
+ import io
14
+ import json
15
+ from routers.data_utils import merge_data
16
+ from routers.data_utils import store_data
17
+ import motor.motor_asyncio
18
+ from typing import Optional
19
+ from pymongo import ASCENDING
20
+ from pymongo.errors import DuplicateKeyError
21
+
22
+
23
router = APIRouter()

# Module-level MongoDB handles. They stay None unless MONGODB_URL is present
# in the environment when the startup hook runs.
client = None
db = None


async def create_unique_index(collection, *fields):
    """Create a unique ascending index over `fields` on `collection`.

    Returns whatever `collection.create_index` returns.
    """
    key_spec = [(field_name, 1) for field_name in fields]
    index_result = await collection.create_index(key_spec, unique=True)
    return index_result
32
+
33
+
34
async def create_ttl_index(db, collection_name, field, expire_after_seconds):
    """Create an ascending index on `field` with an expireAfterSeconds option.

    The index is created on `db[collection_name]` and its creation result is
    printed.
    """
    target = db[collection_name]
    index_result = await target.create_index(
        [(field, ASCENDING)],
        expireAfterSeconds=expire_after_seconds,
    )
    print(f"TTL index created or already exists: {index_result}")
40
+
41
+
42
@router.on_event("startup")
async def startup_event():
    """Connect to MongoDB and ensure the indexes the OCR flow relies on.

    Does nothing when MONGODB_URL is not set. Otherwise it creates a unique
    index on uploads.receipt_key, a unique index on receipts
    (user, receipt_key), and a TTL index on uploads.created_at of 15 minutes.
    """
    global client, db

    mongodb_url = os.environ.get("MONGODB_URL")
    if mongodb_url is None:
        return

    client = motor.motor_asyncio.AsyncIOMotorClient(mongodb_url)
    db = client.chatgpt_plugin

    unique_indexes = (
        ('uploads', ('receipt_key',)),
        ('receipts', ('user', 'receipt_key')),
    )
    for collection_name, key_fields in unique_indexes:
        index_result = await create_unique_index(db[collection_name], *key_fields)
        print(f"Unique index created or already exists: {index_result}")

    await create_ttl_index(db, 'uploads', 'created_at', 15*60)

    print("Connected to MongoDB from OCR!")
57
+
58
+
59
@router.on_event("shutdown")
async def shutdown_event():
    """Close the MongoDB client if one was opened."""
    # Test the handle itself rather than the environment: the client is only
    # assigned by the startup hook, and calling close() on None would raise.
    if client is not None:
        client.close()
64
+
65
+
66
@lru_cache(maxsize=1)
def load_ocr_model():
    """Build the English PaddleOCR model with angle classification enabled.

    The result is cached, so the model is constructed on the first call only.
    """
    return PaddleOCR(use_angle_cls=True, lang='en')
70
+
71
+
72
def invoke_ocr(doc, content_type):
    """Run OCR on an image and return the recognised entries with timing.

    Args:
        doc: Image object providing `mode`, `convert()` and
            `save(file, format=...)` (the callers pass PIL images).
        content_type: MIME type of the source; "image/png" is serialised as
            PNG, everything else as JPEG.

    Returns:
        A tuple ``(values, processing_time)`` where `values` is the output of
        `merge_data` over all recognised lines and `processing_time` is the
        elapsed wall-clock time in seconds.
    """
    worker_pid = os.getpid()
    print(f"Handling OCR request with worker PID: {worker_pid}")
    start_time = time.time()

    model = load_ocr_model()

    format_img = "PNG" if content_type == "image/png" else "JPEG"

    # JPEG cannot store alpha or palette data; convert such images to RGB so
    # the save below does not fail on them.
    if format_img == "JPEG" and doc.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        doc = doc.convert("RGB")

    # The context manager releases the buffer even when save() raises.
    with io.BytesIO() as bytes_img:
        doc.save(bytes_img, format=format_img)
        bytes_data = bytes_img.getvalue()

    result = model.ocr(bytes_data, cls=True)

    # Flatten the per-page results into one list of lines. The `or []`
    # guards tolerate a missing result or a page entry of None (some
    # PaddleOCR versions appear to report an empty page that way -- confirm
    # for the pinned version).
    lines = [line for page in (result or []) for line in (page or [])]
    values = merge_data(lines)

    end_time = time.time()
    processing_time = end_time - start_time
    print(f"OCR done, worker PID: {worker_pid}")

    return values, processing_time
104
+
105
# Content types accepted as images (for both uploads and URLs).
_IMAGE_CONTENT_TYPES = ("image/jpeg", "image/jpg", "image/png")
# Content types treated as PDF for uploaded files and for downloaded URLs.
_UPLOAD_PDF_CONTENT_TYPES = ("application/pdf",)
_URL_PDF_CONTENT_TYPES = ("application/pdf", "application/octet-stream")


def _open_document(raw_bytes, content_type):
    """Decode raw bytes into an image; non-image types are rendered as PDF (first page, 300 DPI)."""
    if content_type in _IMAGE_CONTENT_TYPES:
        return Image.open(BytesIO(raw_bytes))
    pages = convert_from_bytes(raw_bytes, 300)
    return pages[0]


async def _ocr_and_store(doc, content_type, source_name, post_processing):
    """Run OCR, log timing statistics, and optionally store the result in MongoDB."""
    result, processing_time = invoke_ocr(doc, content_type)

    utils.log_stats(settings.ocr_stats_file, [processing_time, source_name])
    print(f"Processing time OCR: {processing_time:.2f} seconds")

    if post_processing and "MONGODB_URL" in os.environ:
        print("Postprocessing...")
        try:
            # On success the OCR result is replaced by the generated key.
            result = await store_data(result, db)
        except DuplicateKeyError:
            # Raise (not return) so the client actually receives a 400 status.
            raise HTTPException(status_code=400, detail="Duplicate data.")
        print(f"Stored data with key: {result}")

    return result


# Run OCR on an uploaded file or on a document downloaded from `image_url`.
#
# JPG/PNG images and PDFs (first page only) are accepted. With
# `post_processing` enabled and MongoDB configured, the OCR result is stored
# and the response contains the generated key instead of the OCR lines.
#
# Responses:
#   - {"error": ...} body for an invalid key or an unsupported file type
#   - 400 for a non-http(s) URL, duplicate data, or a failed store
#   - 200 with the OCR result (or key) otherwise
@router.post("/ocr")
async def run_ocr(file: Optional[UploadFile] = File(None), image_url: Optional[str] = Form(None),
                  post_processing: Optional[bool] = Form(False), sparrow_key: str = Form(None)):

    if sparrow_key != settings.sparrow_key:
        return {"error": "Invalid Sparrow key."}

    result = None
    if file:
        content_type = file.content_type
        if content_type not in _IMAGE_CONTENT_TYPES + _UPLOAD_PDF_CONTENT_TYPES:
            return {"error": "Invalid file type. Only JPG/PNG images and PDF are allowed."}

        doc = _open_document(await file.read(), content_type)
        result = await _ocr_and_store(doc, content_type, file.filename, post_processing)
    elif image_url:
        # test image url: https://raw.githubusercontent.com/katanaml/sparrow/main/sparrow-data/docs/input/invoices/processed/images/invoice_10.jpg
        # test PDF: https://raw.githubusercontent.com/katanaml/sparrow/main/sparrow-data/docs/input/receipts/2021/us/bestbuy-20211211_006.pdf
        from urllib.parse import urlparse

        # urlopen also understands file:// and other schemes; the URL comes
        # from the request, so only plain web URLs are accepted.
        # NOTE(review): http(s) URLs pointing at internal hosts are still
        # fetched -- consider an allow-list if that matters for deployment.
        if urlparse(image_url).scheme not in ("http", "https"):
            raise HTTPException(status_code=400, detail="Only http and https URLs are allowed.")

        with urllib.request.urlopen(image_url) as response:
            content_type = response.info().get_content_type()
            if content_type not in _IMAGE_CONTENT_TYPES + _URL_PDF_CONTENT_TYPES:
                return {"error": "Invalid file type. Only JPG/PNG images and PDF are allowed."}

            doc = _open_document(response.read(), content_type)

        # parse file name from url
        file_name = image_url.split("/")[-1]
        result = await _ocr_and_store(doc, content_type, file_name, post_processing)
    else:
        result = {"info": "No input provided"}

    if result is None:
        raise HTTPException(status_code=400, detail="Failed to process the input.")

    return JSONResponse(status_code=status.HTTP_200_OK, content=result)
171
+
172
+
173
# Return the recorded OCR statistics, or an empty list when the statistics
# file is missing or does not contain valid JSON.
@router.get("/statistics")
async def get_statistics():
    stats_path = settings.ocr_stats_file

    if not os.path.exists(stats_path):
        return []

    with open(stats_path, 'r') as stats_file:
        try:
            return json.load(stats_file)
        except json.JSONDecodeError:
            return []
utils.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import datetime
4
+
5
+
6
def log_stats(file_path, new_data):
    """Append a timestamped record to the JSON statistics file.

    The file holds a JSON list of records. `new_data` plus the current local
    date and time (formatted ``YYYY-MM-DD HH:MM:SS``) is appended as one
    record and the whole list is written back. A missing file, or a file
    without valid JSON, is treated as an empty list.

    Args:
        file_path: Path of the JSON statistics file.
        new_data: Sequence of values for the new record; it is not modified.
    """
    # Check if the file exists, and read its content
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                content = json.load(file)
            except json.JSONDecodeError:
                content = []
    else:
        content = []

    date_time_string = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Build a new record instead of appending to the caller's list, so the
    # argument is left untouched.
    content.append([*new_data, date_time_string])

    # Write the updated content back to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(content, file)
29
+