Spaces:
Sleeping
Sleeping
# Standard library imports
import os | |
from datetime import datetime | |
# Third-party library imports
import pandas as pd | |
# Local application/library specific imports
from app.categorization.config import CATEGORY_REFERENCE_OUTPUT_FILE | |
from app.categorization.categorizer import llm_list_categorizer, fuzzy_match_list_categorizer | |
async def categorize_list(tx_list: pd.DataFrame) -> pd.DataFrame:
    """Asynchronously categorize a list of transactions.

    Categorization happens in three passes:

    1. If ``CATEGORY_REFERENCE_OUTPUT_FILE`` exists and contains data, every
       'name/description' value is passed to ``fuzzy_match_list_categorizer`` together
       with the known descriptions and the full description-category table, which
       avoids language-model calls for descriptions that are already known.
    2. Descriptions that are still uncategorized are deduplicated, sorted, and sent to
       ``llm_list_categorizer`` in a single awaited call.
    3. Anything neither pass could label is assigned the category 'Other'.

    NOTE(review): the previous docstring stated that new description-category pairs are
    added to the reference file. This function does not write to that file itself;
    presumably ``llm_list_categorizer`` persists them -- confirm.

    Args:
        tx_list (pd.DataFrame): The transactions to categorize. Must contain a
            'name/description' column. The frame is modified in place: its 'category'
            column is created when missing, overwritten by the fuzzy-matching pass when
            the reference file is available, and otherwise filled where it is null.

    Returns:
        pd.DataFrame: The same DataFrame object, with a fully populated 'category' column.
    """
    reference_pairs = None
    if os.path.exists(CATEGORY_REFERENCE_OUTPUT_FILE):
        try:
            # The file is read without a header row: description first, category second.
            reference_pairs = pd.read_csv(
                CATEGORY_REFERENCE_OUTPUT_FILE, header=None, names=['name/description', 'category']
            )
        except pd.errors.EmptyDataError:
            # An existing but empty file holds no pairs; treat it like a missing file
            # instead of aborting the whole categorization run.
            reference_pairs = None

    if reference_pairs is not None:
        # Extract only the descriptions once, so the matcher does not redo it per row.
        known_descriptions = reference_pairs['name/description'].values

        # Use fuzzy matching to find similar descriptions and assign their category.
        tx_list['category'] = tx_list['name/description'].apply(
            fuzzy_match_list_categorizer,
            args=(known_descriptions, reference_pairs),
        )
    elif 'category' not in tx_list.columns:
        # Without a usable reference file nothing has created the column yet; the
        # null checks below would otherwise raise KeyError on the very first run.
        # None keeps the column at object dtype so string categories fit without casts.
        tx_list['category'] = None

    # Keep one row per uncategorized description, in a stable (sorted) order.
    uncategorized_descriptions = (
        tx_list[tx_list['category'].isnull()]
        .drop_duplicates(subset=['name/description'])
        .sort_values(by=['name/description'])
    )

    # Ask the language model to categorize the remaining descriptions.
    if not uncategorized_descriptions.empty:
        llm_result = await llm_list_categorizer(
            uncategorized_descriptions[['name/description', 'category']]
        )

        # Discard rows with missing values, and keep only the first answer per
        # description: Series.map requires a uniquely valued index on the lookup.
        llm_result = llm_result.dropna().drop_duplicates(subset=['name/description'])

        # Fill only the still-empty categories; fuzzy-matched values are left untouched.
        if not llm_result.empty:
            llm_categories = llm_result.set_index('name/description')['category']
            tx_list['category'] = tx_list['category'].fillna(
                tx_list['name/description'].map(llm_categories)
            )

    # Fill remaining NaN values in 'category' with 'Other'.
    tx_list['category'] = tx_list['category'].fillna('Other')

    return tx_list