Spaces:

salverz
/

llm-document-parser

Running

File size: 1,655 Bytes

3f7e152

# config.py
from pydantic import BaseModel
from datetime import date
from typing import List

# OCR backend selector. Accepted values: "rapid", "easy", "ocrmac", "tesseract".
OCR_MODEL = "easy"

# Location of the tessdata directory; only required when OCR_MODEL is
# "tesseract". Typical install locations per platform:
#   Linux:   "/usr/share/tesseract-ocr/4.00/tessdata"
#   Windows: "C:\\Program Files\\Tesseract-OCR\\tessdata"
#   Mac:     "/usr/local/share/tessdata" or "/opt/homebrew/share/tessdata"
TESSERACT_TESSDATA_LOCATION = "/usr/share/tesseract-ocr/4.00/tessdata"

# Model tag, presumably handed to Ollama by the caller (usage is not shown
# in this file).
OLLAMA_MODEL = "llama3:instruct"

# Instruction text for the LLM (how it is combined with the document text is
# not shown in this file). The example is kept as valid JSON that follows the
# rules stated in the first sentence: results live under a top-level
# "transactions" key and transaction_type is one of 'deposit' / 'withdrawal'.
LLM_PROMPT = """
        Extract all transactions from the following statement. Each transaction must be returned as a JSON object with the fields: transaction_date (YYYY-MM-DD), description, amount, and transaction_type ('deposit' or 'withdrawal'). All of these must be returned as a list of JSON objects under a key called 'transactions'. Here is an example:
        {
            "transactions": [
                {
                    "transaction_date": "2025-01-24",
                    "description": "Walmart",
                    "amount": 34.24,
                    "transaction_type": "withdrawal"
                }
            ]
        }
"""

# Output format selector. Accepted values: "csv", "json", "excel".
EXPORT_TYPE = "json"

# Source to process; may point at a single file or at a directory.
INPUT_PATH = ""
# Destination folder and base file name for the exported result.
OUTPUT_FOLDER = ""
OUTPUT_FILE_NAME = "output"

# Define Pydantic response models for instructor:

# One transaction row of a bank statement. Every field accepts None, and no
# field declares a default value.
# NOTE: documentation is kept in `#` comments on purpose -- pydantic copies a
# model's class docstring into the JSON schema it generates, so adding a
# docstring here would change that schema.
class BankStatementEntry(BaseModel):
    # Either a parsed date, a raw string, or None.
    transaction_date: date | None | str
    # Free-text description of the transaction.
    description: str | None
    # Transaction amount as a float.
    amount: float | None
    # Stricter alternative kept for reference; enabling it would also require
    # importing Literal, which is not among this file's imports.
    #transaction_type: Literal['deposit', 'withdrawal', None]
    # Unconstrained string (LLM_PROMPT asks for 'deposit' or 'withdrawal',
    # but nothing here enforces it).
    transaction_type: str | None

# Top-level container: the list of extracted entries under the key
# `transactions`, matching the key name requested in LLM_PROMPT.
# Documented with `#` comments rather than a docstring because pydantic
# includes a model's class docstring in its generated JSON schema.
class BankStatement(BaseModel):
    # List of entries, or None; no default value is declared.
    transactions: List[BankStatementEntry] | None

# The pydantic model that the LLM output will conform to. Exposed under a
# generic name so a different model class can be assigned here.
RESPONSE_MODEL = BankStatement