Spaces:

woodserino
/

petrockapi

Build error

App Files Files Community

katanaml committed on Apr 14, 2023

Commit

ce1dd07

1 Parent(s): 7f8dcfd

Sparrow ML

Browse files

Files changed (14) hide show

.gitignore +2 -0
Dockerfile +20 -0
README.md +1 -1
__init__.py +0 -0
config.py +14 -0
data/donut_inference_stats.json +1 -0
data/donut_training_stats.json +1 -0
endpoints.py +21 -0
requirements-fastapi.txt +10 -0
routers/__init__.py +0 -0
routers/donut_inference.py +49 -0
routers/inference.py +81 -0
routers/training.py +29 -0
utils.py +29 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+
2	+ .DS_Store

Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+# Base image: slim Python 3.7.
+FROM python:3.7-slim
+WORKDIR /code
+# Install the Python dependencies before the application code is copied in.
+COPY requirements-fastapi.txt ./
+RUN pip install --no-cache-dir --upgrade -r /code/requirements-fastapi.txt
+# Create a user with UID 1000 and run everything after this point as that user.
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+	PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Copy the application into the user's home directory, owned by that user.
+COPY --chown=user . $HOME/app/
+# Serve the `app` object from endpoints.py on all interfaces, port 7860.
+CMD ["uvicorn", "endpoints:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Sparrow Ml
 emoji: 🌍
 colorFrom: purple
 colorTo: indigo

 ---
+title: Sparrow ML
 emoji: 🌍
 colorFrom: purple
 colorTo: indigo

__init__.py ADDED Viewed

File without changes

config.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from pydantic import BaseSettings
+import os
+class Settings(BaseSettings):
+    """Application settings: access keys, model identifiers and stats file paths."""
+    # Both keys are looked up in the process environment when this class body runs;
+    # os.environ.get yields None when the variable is not set.
+    # NOTE(review): BaseSettings presumably also maps environment variables onto fields,
+    # which would make these explicit lookups redundant - confirm for the pydantic version in use.
+    huggingface_key: str = os.environ.get("huggingface_key")
+    sparrow_key: str = os.environ.get("sparrow_key")
+    # Processor and model identifiers; both default to the same invoices Donut model.
+    processor: str = "katanaml-org/invoices-donut-model-v1"
+    model: str = "katanaml-org/invoices-donut-model-v1"
+    # Relative paths of the JSON files holding inference and training statistics.
+    inference_stats_file: str = "data/donut_inference_stats.json"
+    training_stats_file: str = "data/donut_training_stats.json"
+# Module-level settings instance, created once when this module is imported.
+settings = Settings()

data/donut_inference_stats.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ [[14.571558952331543, 21, "invoice_10.jpg", "katanaml-org/invoices-donut-model-v1", "2023-04-13 21:45:30"]]

data/donut_training_stats.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ [["2023-04-09 23:24:24", 0.1, 1260, "invoices-donut-model-v1"], ["2023-04-10 23:24:24", 0.2, 1360, "invoices-donut-model-v1"], ["2023-04-11 23:24:24", 0.85, 1750, "invoices-donut-model-v1"]]

endpoints.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from routers import inference, training
+# Serve the OpenAPI schema and the interactive docs under the /api/v1/sparrow-ml prefix.
+app = FastAPI(openapi_url="/api/v1/sparrow-ml/openapi.json", docs_url="/api/v1/sparrow-ml/docs")
+# NOTE(review): wildcard origins combined with allow_credentials=True is a very permissive
+# CORS policy - confirm this is intended before exposing the API publicly.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+    allow_credentials=True,
+)
+# Mount the routers; inference and training live under separate URL prefixes.
+app.include_router(inference.router, prefix="/api-inference/v1/sparrow-ml", tags=["Inference"])
+app.include_router(training.router, prefix="/api-training/v1/sparrow-ml", tags=["Training"])
+@app.get("/")
+async def root():
+    """Return a static message identifying the service."""
+    greeting = {"message": "Sparrow ML API"}
+    return greeting

requirements-fastapi.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+transformers
+datasets
+sentencepiece
+tensorboard
+pytorch-lightning
+Pillow
+donut-python
+fastapi==0.95.0
+uvicorn[standard]
+python-multipart

routers/__init__.py ADDED Viewed

File without changes

routers/donut_inference.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import re
+import time
+import torch
+from transformers import DonutProcessor, VisionEncoderDecoderModel
+from config import settings
+from huggingface_hub import login
+# Authenticate with the Hugging Face Hub at import time using the configured key.
+login(settings.huggingface_key)
+# Load the processor and the model when this module is imported, so that
+# process_document_donut can use them as module-level globals.
+processor = DonutProcessor.from_pretrained(settings.processor)
+model = VisionEncoderDecoderModel.from_pretrained(settings.model)
+# Run on the GPU when CUDA is available, otherwise on the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+def process_document_donut(image):
+    """Run the module-level Donut model on `image` and parse its output.
+
+    Args:
+        image: The image handed to the module-level `processor`.
+
+    Returns:
+        A tuple `(parsed, processing_time)`: `parsed` is what `processor.token2json`
+        returns for the cleaned output sequence, and `processing_time` is the elapsed
+        wall-clock time in seconds, measured up to (not including) the `token2json` call.
+    """
+    started_at = time.time()
+    tokenizer = processor.tokenizer
+    # encoder side: pixel tensor for the image
+    encoder_batch = processor(image, return_tensors="pt")
+    pixel_values = encoder_batch.pixel_values
+    # decoder side: ids of the task start prompt
+    prompt_batch = tokenizer("<s_cord-v2>", add_special_tokens=False, return_tensors="pt")
+    decoder_input_ids = prompt_batch.input_ids
+    # generation with a single beam
+    generated = model.generate(
+        pixel_values.to(device),
+        decoder_input_ids=decoder_input_ids.to(device),
+        max_length=model.decoder.config.max_position_embeddings,
+        early_stopping=True,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        use_cache=True,
+        num_beams=1,
+        bad_words_ids=[[tokenizer.unk_token_id]],
+        return_dict_in_generate=True,
+    )
+    # clean-up: drop EOS and PAD markers, then the first task start token
+    text = processor.batch_decode(generated.sequences)[0]
+    for marker in (tokenizer.eos_token, tokenizer.pad_token):
+        text = text.replace(marker, "")
+    text = re.sub(r"<.*?>", "", text, count=1).strip()
+    # the clock stops before the JSON conversion
+    elapsed = time.time() - started_at
+    return processor.token2json(text), elapsed

routers/inference.py ADDED Viewed

	@@ -0,0 +1,81 @@

+from fastapi import APIRouter, File, UploadFile, Form
+from typing import Optional
+from PIL import Image
+import urllib.request
+from io import BytesIO
+from config import settings
+import utils
+import os
+import json
+from routers.donut_inference import process_document_donut
+router = APIRouter()
+def count_values(obj):
+    """Count the leaf values inside a nested structure of dicts and lists.
+
+    Dicts contribute the counts of their values, lists the counts of their items,
+    and anything else counts as one. Empty dicts and lists therefore count as zero.
+    """
+    if isinstance(obj, dict):
+        children = obj.values()
+    elif isinstance(obj, list):
+        children = obj
+    else:
+        return 1
+    return sum(count_values(child) for child in children)
+@router.post("/inference")
+async def run_inference(file: Optional[UploadFile] = File(None), image_url: Optional[str] = Form(None),
+                        model_in_use: str = Form('donut'), sparrow_key: str = Form(None)):
+    """Run document inference on an uploaded JPG or on an image fetched from a URL.
+
+    Args:
+        file: Uploaded image; only the image/jpeg and image/jpg content types are accepted.
+        image_url: http(s) URL of an image, used only when no file is uploaded.
+        model_in_use: Model selector; only 'donut' triggers inference.
+        sparrow_key: Access key that must match the configured `settings.sparrow_key`.
+
+    Returns:
+        The inference result, or a dict with an "error" or "info" entry when the key,
+        the file type or the URL is rejected, or when no input was provided.
+    """
+    # Function-scope imports keep this block self-contained.
+    import secrets
+    from urllib.parse import urlparse
+
+    # Reject when either key is missing: a plain `!=` lets a request without a key
+    # through whenever the server-side key is not configured (None == None).
+    expected_key = settings.sparrow_key
+    if not expected_key or not sparrow_key or not secrets.compare_digest(
+            sparrow_key.encode("utf-8"), expected_key.encode("utf-8")):
+        return {"error": "Invalid Sparrow key."}
+
+    if file:
+        # Ensure the uploaded file is a JPG image
+        if file.content_type not in ["image/jpeg", "image/jpg"]:
+            return {"error": "Invalid file type. Only JPG images are allowed."}
+        image = Image.open(BytesIO(await file.read()))
+        file_name = file.filename
+    elif image_url:
+        # test image url: https://raw.githubusercontent.com/katanaml/sparrow/main/sparrow-data/docs/input/invoices/processed/images/invoice_10.jpg
+        # urlopen also understands schemes such as file://, so only web URLs are let through.
+        if urlparse(image_url).scheme not in ("http", "https"):
+            return {"error": "Invalid image URL. Only http and https URLs are allowed."}
+        with urllib.request.urlopen(image_url) as response:
+            image = Image.open(BytesIO(response.read()))
+        # parse file name from url
+        file_name = image_url.split("/")[-1]
+    else:
+        return {"info": "No input provided"}
+
+    # Shared by both input kinds: run the model, record the stats, report the timing.
+    result = []
+    processing_time = 0
+    if model_in_use == 'donut':
+        result, processing_time = process_document_donut(image)
+    utils.log_stats(settings.inference_stats_file, [processing_time, count_values(result), file_name, settings.model])
+    print(f"Processing time: {processing_time:.2f} seconds")
+    return result
+@router.get("/statistics")
+async def get_statistics():
+    """Return the parsed content of the inference stats file.
+
+    An empty list is returned when the file does not exist or does not hold valid JSON.
+    """
+    stats_path = settings.inference_stats_file
+    # Nothing recorded yet
+    if not os.path.exists(stats_path):
+        return []
+    with open(stats_path, 'r') as stats_file:
+        try:
+            return json.load(stats_file)
+        except json.JSONDecodeError:
+            return []

routers/training.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from fastapi import APIRouter
+from config import settings
+import os
+import json
+router = APIRouter()
+@router.get("/training")
+async def run_training():
+    """Return a static confirmation message; nothing else happens in this handler."""
+    response = {"message": "Sparrow ML training started"}
+    return response
+@router.get("/statistics")
+async def get_statistics():
+    """Return the parsed content of the training stats file.
+
+    An empty list is returned when the file does not exist or does not hold valid JSON.
+    """
+    stats_path = settings.training_stats_file
+    # Nothing recorded yet
+    if not os.path.exists(stats_path):
+        return []
+    with open(stats_path, 'r') as stats_file:
+        try:
+            return json.load(stats_file)
+        except json.JSONDecodeError:
+            return []

utils.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import json
+import os
+from datetime import datetime
+def log_stats(file_path, new_data):
+    """Append one timestamped record to the JSON stats file at `file_path`.
+
+    The file holds a JSON list of records. A missing file, or a file that does not
+    contain valid JSON, is treated as an empty list. The stored record is the values
+    of `new_data` followed by the current local time as "%Y-%m-%d %H:%M:%S".
+
+    Args:
+        file_path: Path of the JSON stats file to update.
+        new_data: Values making up the record; the caller's sequence is left unmodified.
+    """
+    # Open directly instead of checking for existence first, so the file cannot
+    # disappear between the check and the read.
+    try:
+        with open(file_path, 'r') as file:
+            content = json.load(file)
+    except (FileNotFoundError, json.JSONDecodeError):
+        content = []
+    # Format the current date and time as a string
+    date_time_string = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    # Build a new record rather than appending the timestamp to the caller's list
+    content.append([*new_data, date_time_string])
+    # Write the updated content back to the file
+    with open(file_path, 'w') as file:
+        json.dump(content, file)