Spaces:
Sleeping
Sleeping
Refactored jupyter rag logic and categorizer methods
Browse files
.vscode/settings.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"jupyter.notebookFileRoot": "${workspaceFolder}"
|
3 |
+
}
|
app/categorization/file_processing.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import glob
|
3 |
+
import shutil
|
4 |
+
import logging
|
5 |
+
import warnings
|
6 |
+
from typing import Optional, Union, Dict, List
|
7 |
+
from datetime import datetime
|
8 |
+
|
9 |
+
import pandas as pd
|
10 |
+
from dateparser import parse
|
11 |
+
|
12 |
+
from categorizer_list import categorize_list
|
13 |
+
from config import TEXT_OUTPUT_FILE, CATEGORY_REFERENCE_OUTPUT_FILE
|
14 |
+
|
15 |
+
# Read file and process it (e.g. categorize transactions)
async def process_file(file_path: str) -> Dict[str, Union[str, pd.DataFrame]]:
    """
    Process the input file by reading, cleaning, standardizing, and categorizing the transactions.

    Args:
        file_path (str): Path to the input file.

    Returns:
        Dict[str, Union[str, pd.DataFrame]]: Dictionary with keys
            'file_name' (str): base name of the processed file,
            'output' (pd.DataFrame): categorized transactions; empty on failure,
            'error' (str): exception text, or '' when processing succeeded.
    """

    file_name = os.path.basename(file_path)
    result = {'file_name': file_name, 'output': pd.DataFrame(), 'error': ''}
    try:
        # Read file into the standardized tx format produced by standardize_csv_file:
        # date, expense/income, category, name/description, amount
        tx_list = standardize_csv_file(file_path)

        # Categorize transactions (async call into the categorizer)
        result['output'] = await categorize_list(tx_list)
        print(f'File processed successfully: {file_name}')

    except Exception as e:
        # Deliberately swallow the exception: the caller inspects result['error'],
        # so one bad file does not abort the whole batch.
        # Use logging.error with lazy %-args instead of logging.log + f-string.
        logging.error("| File: %s | Unexpected Error: %s", file_name, e)
        print(f'ERROR processing file {file_name}: {e}')
        result['error'] = str(e)

    return result
|
44 |
+
|
45 |
+
|
46 |
+
|
47 |
+
def standardize_csv_file(file_path: str) -> pd.DataFrame:
    """
    Read and prepare the data from the input file.

    Args:
        file_path (str): Path to the input CSV file.

    Returns:
        pd.DataFrame: Transactions reindexed to the standard columns
            ['date', 'expense/income', 'category', 'name/description', 'amount'];
            'category' is newly created by the reindex and therefore empty (NaN).
    """

    tx_list = pd.read_csv(file_path, index_col=False)
    # Keep the data's origin on the frame itself for downstream diagnostics
    tx_list.attrs['file_name'] = file_path
    # Normalize headers so the reindex below matches regardless of case/padding
    tx_list.columns = tx_list.columns.str.lower().str.strip()

    # Standardize dates to YYYY/MM/DD format
    tx_list['date'] = pd.to_datetime(tx_list['date']).dt.strftime('%Y/%m/%d')

    # Reindex to the standard tx format; the category column is new and therefore empty.
    # NOTE(review): the original also wrote a 'source' column here, but this reindex
    # immediately dropped it (it is not in the column list below), so that dead store
    # has been removed. If per-row source tracking is intended, add 'source' to this
    # list AND to the matching col_list in save_results.
    tx_list = tx_list.reindex(columns=['date', 'expense/income', 'category', 'name/description', 'amount'])

    return tx_list
|
70 |
+
|
71 |
+
|
72 |
+
def save_results(results: List[Dict[str, Union[str, pd.DataFrame]]]) -> None:
    """
    Merge all per-file results and append them to the output files.

    Args:
        results (List[Dict]): One dict per processed file, as produced by
            process_file: keys 'file_name' (str), 'output' (pd.DataFrame),
            'error' (str, empty on success).

    Returns:
        None
    """

    # Partition results into successes and failures; collect error text for the summary.
    # Errors are printed to the console at the end.
    ok_files = []
    ko_files = []
    error_messages = []

    col_list = ['date', 'expense/income', 'category', 'name/description', 'amount']
    tx_list = pd.DataFrame(columns=col_list)
    for result in results:
        if not result['error']:
            ok_files.append(result['file_name'])
            result_df = result['output']
            # Force the standard column names so heterogeneous outputs line up
            result_df.columns = col_list
            tx_list = pd.concat([tx_list, result_df], ignore_index=True)
        else:
            ko_files.append(result['file_name'])
            error_messages.append(f"{result['file_name']}: {result['error']}")

    # Append merged transactions to the output file (header only on first creation)
    tx_list.to_csv(TEXT_OUTPUT_FILE, mode="a", index=False, header=not os.path.exists(TEXT_OUTPUT_FILE))

    new_ref_data = tx_list[['name/description', 'category']]
    if os.path.exists(CATEGORY_REFERENCE_OUTPUT_FILE):
        # If it exists, fold the existing master file into the new reference data
        old_ref_data = pd.read_csv(CATEGORY_REFERENCE_OUTPUT_FILE, names=['name/description', 'category'], header=0)
        new_ref_data = pd.concat([old_ref_data, new_ref_data], ignore_index=True)

    # Drop duplicates, sort, and overwrite to create the new master file
    new_ref_data.drop_duplicates(subset=['name/description']).sort_values(by=['name/description']).to_csv(CATEGORY_REFERENCE_OUTPUT_FILE, mode="w", index=False, header=True)

    # Summarize results
    print(f"\nProcessed {len(results)} files: {len(ok_files)} successful, {len(ko_files)} with errors\n")
    if len(ko_files):
        # Plain string: the original used an f-string with no placeholders (lint F541)
        print("Errors in the following files:")
        for message in error_messages:
            print(f"  {message}")
        print('\n')
|
app/{rag β transactions_rag}/categorize_transactions.ipynb
RENAMED
File without changes
|
app/{rag/transactions_2022_2023.csv β transactions_rag/transactions_2024.csv}
RENAMED
File without changes
|