# Standard library imports
import os
from datetime import datetime

# Third-party library imports
import pandas as pd

# Local application/library specific imports
from app.categorization.config import CATEGORY_REFERENCE_OUTPUT_FILE
from app.categorization.categorizer import llm_list_categorizer, fuzzy_match_list_categorizer


async def categorize_list(tx_list: pd.DataFrame) -> pd.DataFrame:
    """Asynchronously categorize a list of transactions.

    Categorization happens in up to three passes:

    1. If the reference file (``CATEGORY_REFERENCE_OUTPUT_FILE``) exists, every
       description is passed to ``fuzzy_match_list_categorizer`` together with
       the known description-category pairs, to minimize language-model calls.
    2. Descriptions that are still uncategorized are de-duplicated, sorted and
       sent to ``llm_list_categorizer``.
    3. Anything the language model could not categorize is labelled ``'Other'``.

    NOTE(review): this function does not write to the reference file itself.
    If new description-category pairs are persisted, that presumably happens
    inside ``llm_list_categorizer`` -- confirm against that helper.

    Args:
        tx_list (pd.DataFrame): The transactions to categorize. Must contain a
            'name/description' column. The DataFrame is modified in place.

    Returns:
        pd.DataFrame: The same DataFrame object, with its 'category' column
        added or updated.
    """

    if os.path.exists(CATEGORY_REFERENCE_OUTPUT_FILE):
        # Read description-category pairs from the (header-less) reference file
        description_category_pairs = pd.read_csv(
            CATEGORY_REFERENCE_OUTPUT_FILE, header=None, names=['name/description', 'category']
        )

        # Extract only descriptions for faster matching
        descriptions = description_category_pairs['name/description'].values

        # Use fuzzy matching to find similar descriptions and assign the category
        tx_list['category'] = tx_list['name/description'].apply(
            fuzzy_match_list_categorizer,
            args=(descriptions, description_category_pairs),
        )
    elif 'category' not in tx_list.columns:
        # No reference file yet (e.g. first run): start from an all-null column
        # so the lookups below do not fail with a KeyError. None keeps the
        # column as object dtype, ready to receive string categories.
        tx_list['category'] = None

    # Filter out uncategorized transactions, deduplicate, and sort by description
    uncategorized_descriptions = (
        tx_list[tx_list['category'].isnull()]
        .drop_duplicates(subset=['name/description'])
        .sort_values(by=['name/description'])
    )

    # Everything is already categorized: no language-model call is needed
    if uncategorized_descriptions.empty:
        return tx_list

    # Ask the language model to categorize the remaining descriptions
    categorized_descriptions = await llm_list_categorizer(
        uncategorized_descriptions[['name/description', 'category']]
    )

    # Discard incomplete answers, and keep a single answer per description:
    # Series.map requires the lookup Series to have a unique index.
    categorized_descriptions = (
        categorized_descriptions
        .dropna()
        .drop_duplicates(subset=['name/description'])
    )

    # Update the category for uncategorized transactions based on the language model results
    if not categorized_descriptions.empty:
        llm_categories = categorized_descriptions.set_index('name/description')['category']
        tx_list['category'] = tx_list['category'].fillna(
            tx_list['name/description'].map(llm_categories)
        )

    # Fill remaining NaN values in 'category' with 'Other'
    tx_list['category'] = tx_list['category'].fillna('Other')

    return tx_list