import os
import time
import cProfile
import pstats
import pandas as pd
from algo import Algo
from db.db_utils import get_connection

# Folder that holds the raw donation CSV exports.
RAW_DIR = './raw'

# Fixture file that lives in RAW_DIR but must never be checked.
EXCLUDED_FILES = ('test.csv',)


def _load_input_rows(input_file_path):
    """Read one raw CSV and return its rows as a list of tuples.

    Column headers are lower-cased before lookup. The columns
    ``description``, ``donor``, ``date`` and ``weight`` are required (a
    ``KeyError`` is raised when one is missing); ``description2`` is
    optional and is filled with ``None`` (stringified to ``'None'``) when
    absent.

    Each tuple is ``(description, description2, csv_row_number, donor,
    date, weight)``. ``csv_row_number`` starts at 2 because the header is
    line 1 of the CSV file.
    """
    df_input = pd.read_csv(input_file_path)

    # Convert column headers to lowercase so lookups are case-insensitive.
    df_input.columns = df_input.columns.str.lower()

    descriptions = df_input['description'].astype(str).tolist()
    descriptions2 = df_input.get(
        'description2', pd.Series([None] * len(df_input))
    ).astype(str).tolist()
    donors = df_input['donor'].astype(str).tolist()
    dates = df_input['date'].astype(str).tolist()
    weights = df_input['weight'].astype(str).tolist()

    return [
        (desc, desc2, i + 2, donor, date, weight)
        for i, (desc, desc2, donor, date, weight) in enumerate(
            zip(descriptions, descriptions2, donors, dates, weights)
        )
    ]


def _count_processed(db_cursor, run_key):
    """Return how many rows the ``results`` table holds for *run_key*."""
    db_cursor.execute(
        'SELECT count(*) FROM results WHERE run_key = %s', (run_key,)
    )
    return db_cursor.fetchone()[0]


def _report_mismatches(db_cursor):
    """Print every raw CSV whose row count differs from its processed count.

    For each ``*.csv`` file in RAW_DIR (except those in EXCLUDED_FILES) the
    number of data rows in the file is compared with the number of rows
    stored in ``results`` under the file's run key. Only mismatches are
    printed.
    """
    raw_files = [f for f in os.listdir(RAW_DIR) if f not in EXCLUDED_FILES]

    for raw_file_name in raw_files:
        if not raw_file_name.endswith('.csv'):
            continue

        # The run key is the file name up to its first dot.
        run_key = raw_file_name.split('.')[0]

        number_processed = _count_processed(db_cursor, run_key)
        num_rows = len(_load_input_rows(f'{RAW_DIR}/{raw_file_name}'))

        if num_rows != number_processed:
            print(run_key)
            print(f"We have {num_rows} rows in csv. We processed {number_processed}")
            print("----")


if __name__ == "__main__":
    db_conn = get_connection()
    try:
        db_cursor = db_conn.cursor()
        try:
            _report_mismatches(db_cursor)
        finally:
            # Release the cursor even when a CSV read or query fails.
            db_cursor.close()
    finally:
        # Always hand the connection back; the original never closed it.
        db_conn.close()