salverz's picture
Update file structure
4b3616f
# config.py
from pydantic import BaseModel
from datetime import date
from typing import List
# OCR engine to use. Supported values: "rapid", "easy", "ocrmac", "tesseract".
OCR_MODEL: str = "easy"

# Tessdata directory; must be set when OCR_MODEL is "tesseract".
# Typical locations per platform:
#   Linux:   "/usr/share/tesseract-ocr/4.00/tessdata"
#   Windows: "C:\\Program Files\\Tesseract-OCR\\tessdata"
#   Mac:     "/usr/local/share/tessdata" or "/opt/homebrew/share/tessdata"
TESSERACT_TESSDATA_LOCATION: str = "/usr/share/tesseract-ocr/4.00/tessdata"

# NOTE(review): presumably the Ollama model tag handed to the LLM client;
# confirm against the caller.
OLLAMA_MODEL: str = "llama3:instruct"
# Instruction text for the LLM. It refers to "the following statement", so the
# statement text is presumably appended by the caller (confirm at the call site).
# The example below is valid JSON and follows the instructions exactly: entries
# sit under a 'transactions' key and transaction_type uses one of the two
# allowed spellings ('deposit' / 'withdrawal').
LLM_PROMPT = """
Extract all transactions from the following statement. Each transaction must be returned as a JSON object with the fields: transaction_date (YYYY-MM-DD), description, amount, and transaction_type ('deposit' or 'withdrawal'). All of these must be returned as a list of JSON objects under a key called 'transactions'. Here is an example:
{
  "transactions": [
    {
      "transaction_date": "2025-01-24",
      "description": "Walmart",
      "amount": 34.24,
      "transaction_type": "withdrawal"
    }
  ]
}
"""
# Export format. Supported values: "csv", "json", "excel".
EXPORT_TYPE: str = "json"

# Input location; may be a single file or a directory.
INPUT_PATH: str = ""

# NOTE(review): presumably the destination folder and the base name of the
# exported file; confirm against the export code.
OUTPUT_FOLDER: str = ""
OUTPUT_FILE_NAME: str = "output"
# Define Pydantic response models for instructor:
class BankStatementEntry(BaseModel):
    """A single transaction entry of a bank statement.

    The field names mirror the ones requested in ``LLM_PROMPT``. Every field
    accepts ``None``; no default values are declared.
    """

    # May be a date, None, or a plain string; the union order is kept as
    # originally declared.
    transaction_date: date | None | str
    description: str | None
    amount: float | None
    # Typed as a plain string; a stricter
    # Literal['deposit', 'withdrawal', None] is not enforced here.
    transaction_type: str | None
class BankStatement(BaseModel):
    """Top-level response model holding the extracted transactions."""

    # Either a list of BankStatementEntry items or None.
    transactions: list[BankStatementEntry] | None
# Pydantic model that the LLM output is expected to conform to.
RESPONSE_MODEL: type[BaseModel] = BankStatement