import os
import time
import cProfile
import pstats
import pandas as pd
from algo import Algo
from db.db_utils import get_connection

def parse_menu_choice(raw, option_count):
    """Translate a 1-based menu entry typed by the user into a list index.

    Args:
        raw: Text the user typed.
        option_count: Number of menu entries that were displayed.

    Returns:
        The zero-based index of the chosen entry, or ``None`` when ``raw``
        is not an integer in the range ``1..option_count``.
    """
    try:
        choice = int(raw)
    except ValueError:
        return None
    # 0 and negative numbers must be rejected explicitly: used as a list
    # index they would silently wrap around to entries at the end.
    if 1 <= choice <= option_count:
        return choice - 1
    return None


def resume_offset(last_run_row):
    """Return how many leading data rows can be skipped when resuming.

    Args:
        last_run_row: Highest ``run_row`` stored for the run, or ``None``
            when nothing has been stored yet.

    Returns:
        The number of data rows to skip. Row numbers handed to the matcher
        start at 2 (see ``build_input_rows``), so a stored row ``r`` means
        ``r - 1`` data rows are done. Never negative.
    """
    if last_run_row is None:
        return 0
    return max(int(last_run_row) - 1, 0)


def build_input_rows(df_input):
    """Flatten a raw-data frame into the tuples handed to ``Algo.match_words``.

    Column headers are matched case-insensitively; the caller's frame is not
    modified. ``description``, ``donor``, ``date`` and ``weight`` are
    required (a missing one raises ``KeyError``); ``description2`` is
    optional.

    Args:
        df_input: DataFrame loaded from one raw CSV file.

    Returns:
        A list of ``(description, description2, row_number, donor, date,
        weight)`` tuples with every value except ``row_number`` converted to
        ``str``. ``row_number`` is the data row's position plus 2, i.e. its
        line in the CSV when the header is line 1.
    """
    frame = df_input.rename(columns=str.lower)

    descriptions = frame['description'].astype(str).tolist()
    # A missing description2 column is replaced by a column of None values,
    # stringified exactly like the other columns.
    descriptions2 = frame.get('description2', pd.Series([None] * len(frame))).astype(str).tolist()
    donors = frame['donor'].astype(str).tolist()
    dates = frame['date'].astype(str).tolist()
    weights = frame['weight'].astype(str).tolist()

    columns = zip(descriptions, descriptions2, donors, dates, weights)
    return [
        (desc, desc2, index + 2, donor, date, weight)
        for index, (desc, desc2, donor, date, weight) in enumerate(columns)
    ]


def prompt_organization_id(db_cursor):
    """List the organizations and keep asking until a valid one is picked.

    Args:
        db_cursor: Open DB-API cursor.

    Returns:
        The ``id`` of the selected organization.

    Raises:
        RuntimeError: If the ``organizations`` table returns no rows, since
            no valid choice could ever be made.
    """
    db_cursor.execute('SELECT id, name FROM organizations')
    organizations = db_cursor.fetchall()
    if not organizations:
        raise RuntimeError("No organizations found; cannot register run metadata")

    for position, org in enumerate(organizations, start=1):
        print(f"{position}. {org[1]}")

    while True:
        index = parse_menu_choice(input("Select an organization: "), len(organizations))
        if index is not None:
            return organizations[index][0]
        print(f"Please enter a number between 1 and {len(organizations)}")


def prompt_year():
    """Ask for the year until the user enters a whole number."""
    while True:
        try:
            return int(input("Enter the year: "))
        except ValueError:
            print("Please enter the year as a whole number")


def ensure_run_meta(db_conn, db_cursor, run_key):
    """Insert a ``run_meta`` row for ``run_key`` unless one already exists.

    When the row is missing, the organization and year are requested
    interactively and the insert is committed immediately.
    """
    db_cursor.execute('SELECT run_key FROM run_meta WHERE run_key = %s', (run_key,))
    if db_cursor.fetchone():
        return

    organization_id = prompt_organization_id(db_cursor)
    year = prompt_year()

    db_cursor.execute('INSERT INTO run_meta (run_key, organization_id, year) VALUES (%s, %s, %s)', (run_key, organization_id, year))
    db_conn.commit()


def process_file(db_conn, db_cursor, raw_file_name):
    """Run the matcher over the rows of one raw CSV that are not stored yet.

    Args:
        db_conn: Open database connection, also handed to ``Algo``.
        db_cursor: Cursor on ``db_conn`` used for the bookkeeping queries.
        raw_file_name: Name of a CSV file inside ``./raw``.
    """
    # The run key is the file name up to its first dot.
    run_key = raw_file_name.split('.')[0]
    print(f"Processing {raw_file_name}")

    ensure_run_meta(db_conn, db_cursor, run_key)

    # MAX() yields exactly one row (NULL when nothing is stored), so no
    # unread rows are left pending on the cursor.
    db_cursor.execute('SELECT MAX(run_row) FROM results WHERE run_key = %s', (run_key,))
    max_row = db_cursor.fetchone()
    last_row_num = resume_offset(max_row[0] if max_row else None)

    input_data = build_input_rows(pd.read_csv(f'./raw/{raw_file_name}'))
    num_rows = len(input_data)

    # num_rows counts data rows only; add one for the header line
    print(f"CSV has {num_rows+1} rows")
    csv_complete = last_row_num >= num_rows
    print("CSV is complete" if csv_complete else "CSV is not complete")
    if csv_complete:
        return

    print(f"Starting at row #{last_row_num + 1}")
    algo = Algo(db_conn, run_key)
    algo.match_words(input_data[last_row_num:])


def main():
    """Process every CSV in ``./raw`` except ``test.csv``, resuming partial runs."""
    db_conn = get_connection()
    try:
        db_cursor = db_conn.cursor()
        try:
            # Sorted so the files (and their prompts) come in a stable order.
            for raw_file_name in sorted(os.listdir('./raw')):
                if raw_file_name == 'test.csv' or not raw_file_name.endswith('.csv'):
                    continue
                process_file(db_conn, db_cursor, raw_file_name)
        finally:
            db_cursor.close()
    finally:
        # Release the connection even when a file fails part-way through.
        db_conn.close()


if __name__ == "__main__":
    main()