"""Driver script: loads a raw donation CSV, resumes from the last processed
row recorded in the ``results`` table for this run key, and feeds the rows
to ``Algo.match_words``."""
import os
import time
import cProfile  # kept: referenced by the commented-out profiling section below
import pstats    # kept: referenced by the commented-out profiling section below

import pandas as pd

from algo import Algo
from db.db_utils import get_connection

if __name__ == "__main__":
    db_conn = get_connection()
    db_cursor = db_conn.cursor()

    # raw_file_name = 'food-forward-2022-raw-data.csv'
    raw_file_name = 'MFB-2023-raw-data.csv'

    # Chop off the extension for the results run key.  splitext strips only
    # the final extension, unlike split('.')[0] which would truncate a base
    # name containing dots.
    result_file_name = os.path.splitext(raw_file_name)[0]
    run_key = f"{result_file_name}-{int(time.time())}"
    # override
    # run_key = 'food-forward-2022-raw-data-1719334900'

    # Find the highest CSV row already processed under this run key so an
    # interrupted run can resume where it left off.
    db_cursor.execute(
        'SELECT run_row FROM results WHERE run_key = %s ORDER BY run_row DESC',
        (run_key,),
    )
    # get the last row that was processed (None if this run key is fresh)
    last_row = db_cursor.fetchone()

    algo = Algo(db_conn, run_key)

    input_file_path = f'./raw/{raw_file_name}'
    df_input = pd.read_csv(input_file_path)

    # Each tuple is (description, csv_row_number, donor, date, weight).
    # CSV row numbers start at 2 because row 1 is the header line.
    input_data = [
        (desc, row_num, donor, date, weight)
        for row_num, (desc, donor, date, weight) in enumerate(
            zip(
                df_input['description'].astype(str).tolist(),
                df_input['donor'].astype(str).tolist(),
                df_input['date'].astype(str).tolist(),
                df_input['weight'].astype(str).tolist(),
            ),
            start=2,
        )
    ]

    # run_row is the last CSV row that was processed, so offset from there.
    # CSV row R lives at list index R - 2, hence the next unprocessed row is
    # at index R - 1.  A fresh run must start at index 0 — the previous
    # default of 1 silently skipped the first data row.
    start_index = last_row[0] - 1 if last_row else 0

    print(f"Processing {len(input_data)} rows")
    print(f"Starting at row #{start_index}")
    input_data = input_data[start_index:]

    # limit to 100 rows
    # input_data = input_data[:100]

    # profiler = cProfile.Profile()
    # profiler.enable()

    algo.match_words(input_data)

    # profiler.disable()
    # # Print profiling results
    # stats = pstats.Stats(profiler).sort_stats('cumtime')
    # stats.print_stats(20)  # Print top 20 results

    # algo.match_words([['bananas']])

    # Close the cursor before the connection (the cursor was previously leaked).
    db_cursor.close()
    db_conn.close()