# Standard library imports
import os
from datetime import datetime

# Third-party library imports
import pandas as pd

# Local application/library specific imports
from app.categorization.config import CATEGORY_REFERENCE_OUTPUT_FILE
from app.categorization.categorizer import llm_list_categorizer, fuzzy_match_list_categorizer


async def categorize_list(tx_list: pd.DataFrame) -> pd.DataFrame:
    """Asynchronously categorize a list of transactions.

    Categorization runs in up to three stages:

    1. If the reference file (``CATEGORY_REFERENCE_OUTPUT_FILE``) exists, each
       transaction description is passed to ``fuzzy_match_list_categorizer``
       together with the known description-category pairs, which minimizes
       the number of descriptions sent to the language model.
    2. Descriptions that are still uncategorized are deduplicated, sorted and
       sent to ``llm_list_categorizer``; its non-null results are mapped back
       onto every matching transaction.
    3. Anything still uncategorized is labelled ``'Other'``.

    NOTE(review): this function never writes to the reference file itself;
    persisting new description-category pairs is presumably done inside
    ``llm_list_categorizer`` -- confirm.

    Args:
        tx_list: The transactions to categorize. Must contain a
            ``'name/description'`` column. The frame is modified in place.

    Returns:
        The same DataFrame object, with a fully populated ``'category'``
        column.
    """
    if os.path.exists(CATEGORY_REFERENCE_OUTPUT_FILE):
        # Read description-category pairs from the reference file
        # (headerless two-column CSV).
        description_category_pairs = pd.read_csv(
            CATEGORY_REFERENCE_OUTPUT_FILE,
            header=None,
            names=['name/description', 'category'],
        )

        # Extract only the descriptions so the matcher gets a plain array.
        descriptions = description_category_pairs['name/description'].values

        # Use fuzzy matching to find similar descriptions and assign the category.
        tx_list['category'] = tx_list['name/description'].apply(
            fuzzy_match_list_categorizer,
            args=(descriptions, description_category_pairs),
        )
    elif 'category' not in tx_list.columns:
        # No reference file yet (e.g. first run): start with an empty category
        # column so the steps below do not raise KeyError. An existing
        # 'category' column supplied by the caller is left untouched.
        tx_list['category'] = None

    # Keep only uncategorized transactions, deduplicate, and sort by description.
    uncategorized_descriptions = (
        tx_list[tx_list['category'].isnull()]
        .drop_duplicates(subset=['name/description'])
        .sort_values(by=['name/description'])
    )

    # Ask the language model to categorize the remaining descriptions.
    if not uncategorized_descriptions.empty:
        categorized_descriptions = await llm_list_categorizer(
            uncategorized_descriptions[['name/description', 'category']]
        )
        # Drop rows the model could not categorize. The non-inplace form
        # avoids mutating whatever object the helper handed back.
        categorized_descriptions = categorized_descriptions.dropna()

        # Update the category for uncategorized transactions based on the
        # language model results.
        if not categorized_descriptions.empty:
            # Series.map needs a uniquely-indexed lookup; keep the first
            # category if the model returned a description more than once.
            category_by_description = (
                categorized_descriptions
                .drop_duplicates(subset=['name/description'])
                .set_index('name/description')['category']
            )
            tx_list['category'] = tx_list['category'].fillna(
                tx_list['name/description'].map(category_by_description)
            )

    # Fill remaining NaN values in 'category' with 'Other'.
    tx_list['category'] = tx_list['category'].fillna('Other')

    return tx_list