# backend/app/categorization/categorizer_list.py
# Revision e612627: "01-Categorization-Transactions (#1)"
# Standard library imports
import os
from datetime import datetime
# Third-party library imports
import pandas as pd
# Local application/library specific imports
from app.categorization.config import CATEGORY_REFERENCE_OUTPUT_FILE
from app.categorization.categorizer import llm_list_categorizer, fuzzy_match_list_categorizer
async def categorize_list(tx_list: pd.DataFrame) -> pd.DataFrame:
    """Asynchronously categorize a list of transactions.

    Descriptions are first looked up in the reference file
    (``CATEGORY_REFERENCE_OUTPUT_FILE``) via fuzzy matching so that fewer
    descriptions have to be sent to the language model. Descriptions that
    are still uncategorized are deduplicated, sorted and passed to the
    language model. Anything left without a category afterwards is labelled
    ``'Other'``.

    NOTE(review): this function does not itself write new
    description-category pairs back to the reference file; presumably
    ``llm_list_categorizer`` takes care of that -- confirm.

    Args:
        tx_list (pd.DataFrame): The transactions to categorize. Must contain
            a 'name/description' column. The frame is modified in place.

    Returns:
        pd.DataFrame: The same DataFrame with a 'category' column filled in.
    """
    if os.path.exists(CATEGORY_REFERENCE_OUTPUT_FILE):
        # Read description-category pairs from the reference file
        description_category_pairs = pd.read_csv(
            CATEGORY_REFERENCE_OUTPUT_FILE,
            header=None,
            names=['name/description', 'category'],
        )

        # Extract only descriptions for faster matching
        descriptions = description_category_pairs['name/description'].values

        # Use fuzzy matching to find similar descriptions and assign the category
        tx_list['category'] = tx_list['name/description'].apply(
            fuzzy_match_list_categorizer,
            args=(descriptions, description_category_pairs),
        )
    elif 'category' not in tx_list.columns:
        # No reference file yet (e.g. first run): start with an empty category
        # column so the lookups below do not fail with a KeyError.
        tx_list['category'] = None

    # Keep only uncategorized transactions, deduplicate, and sort by description
    uncategorized_descriptions = (
        tx_list[tx_list['category'].isnull()]
        .drop_duplicates(subset=['name/description'])
        .sort_values(by=['name/description'])
    )

    # Ask the language model to categorize the remaining descriptions
    if not uncategorized_descriptions.empty:
        categorized_descriptions = await llm_list_categorizer(
            uncategorized_descriptions[['name/description', 'category']]
        )
        # Rebind instead of dropping in place so the object returned by the
        # helper is not mutated behind its back.
        categorized_descriptions = categorized_descriptions.dropna()

        # Update the category for uncategorized transactions based on the
        # language model results
        if not categorized_descriptions.empty:
            # Series.map requires a uniquely-indexed lookup; the model may
            # return the same description more than once, so keep the first
            # answer for each description.
            llm_categories = (
                categorized_descriptions
                .drop_duplicates(subset=['name/description'])
                .set_index('name/description')['category']
            )
            tx_list['category'] = tx_list['category'].fillna(
                tx_list['name/description'].map(llm_categories)
            )

    # Fill remaining NaN values in 'category' with 'Other'
    tx_list['category'] = tx_list['category'].fillna('Other')

    return tx_list