# Spaces: Paused
import cProfile
import pstats
import time
from contextlib import closing

import pandas as pd

from algo import Algo
from db.db_utils import get_connection
def build_input_rows(df_input):
    """Flatten the raw DataFrame into the row tuples handed to the matcher.

    Args:
        df_input: DataFrame providing ``description``, ``donor``, ``date``
            and ``weight`` columns.

    Returns:
        A list of ``(description, run_row, donor, date, weight)`` tuples.
        Every field except ``run_row`` is converted to ``str``. ``run_row``
        is 2 for the first data row and increases by one per row
        (presumably the 1-based CSV line number, with the header on
        line 1 -- confirm against how ``results.run_row`` is written).
    """
    columns = (
        df_input[name].astype(str).tolist()
        for name in ('description', 'donor', 'date', 'weight')
    )
    return [
        (desc, run_row, donor, date, weight)
        for run_row, (desc, donor, date, weight) in enumerate(zip(*columns), start=2)
    ]


def resume_offset(last_row):
    """Return the list index of the first input row still to be processed.

    Args:
        last_row: The record holding the highest ``run_row`` stored for the
            current run key (``run_row`` in position 0), or ``None`` when
            no rows have been stored yet.

    Returns:
        The number of leading entries of the ``build_input_rows`` list to
        skip; never negative.
    """
    if not last_row:
        # Fresh run: begin at the very first data row. The offset used to
        # default to 1 here, which silently dropped the first row of every
        # new run.
        return 0
    # run_row is list index + 2, so the row following run_row R lives at
    # index R - 1. Clamp so an unexpected run_row below 1 cannot turn into
    # a negative (from-the-end) slice start.
    return max(last_row[0] - 1, 0)


def main():
    """Run the matcher over the configured raw CSV, skipping rows already stored.

    Builds a run key from the raw file name and the current time, looks up
    the highest ``run_row`` already stored in ``results`` for that key, and
    passes the remaining rows to ``Algo.match_words``.
    """
    # raw_file_name = 'food-forward-2022-raw-data.csv'
    raw_file_name = 'MFB-2023-raw-data.csv'

    # chop off the extension for the results run key
    result_file_name = raw_file_name.split('.')[0]
    run_key = f"{result_file_name}-{int(time.time())}"
    # override (e.g. to continue an earlier run under its existing key)
    # run_key = 'food-forward-2022-raw-data-1719334900'

    # closing() guarantees the connection is released even when reading the
    # CSV or matching fails part-way through.
    with closing(get_connection()) as db_conn:
        db_cursor = db_conn.cursor()

        # find the highest row number already stored for this run key
        db_cursor.execute(
            'SELECT run_row FROM results WHERE run_key = %s ORDER BY run_row DESC',
            (run_key,),
        )
        last_row = db_cursor.fetchone()

        algo = Algo(db_conn, run_key)

        input_file_path = f'./raw/{raw_file_name}'
        df_input = pd.read_csv(input_file_path)
        input_data = build_input_rows(df_input)

        # run_row is the last row from the CSV file that was processed,
        # so offset from there
        last_row_num = resume_offset(last_row)

        print(f"Processing {len(input_data)} rows")
        print(f"Starting at row #{last_row_num}")
        input_data = input_data[last_row_num:]

        # limit to 100 rows
        # input_data = input_data[:100]

        # profiler = cProfile.Profile()
        # profiler.enable()
        algo.match_words(input_data)
        # profiler.disable()
        # # Print profiling results
        # stats = pstats.Stats(profiler).sort_stats('cumtime')
        # stats.print_stats(20)  # Print top 20 results

        # algo.match_words([['bananas']])


if __name__ == "__main__":
    main()