File size: 3,871 Bytes
3534b4d
b1f9aab
efa7589
 
9189e38
 
 
 
 
 
b1f9aab
a216741
3534b4d
b1f9aab
3534b4d
 
c3ffd5b
 
 
1ea315f
49234db
 
ecbcfc4
 
 
3534b4d
 
 
 
f2740a4
b1f9aab
132d401
 
 
 
 
 
 
 
 
 
 
 
c3ffd5b
132d401
 
 
 
 
 
 
3534b4d
 
 
 
 
f2740a4
3534b4d
 
b1f9aab
f2740a4
 
 
 
 
 
 
 
 
 
3534b4d
 
f2740a4
 
3534b4d
4edd87e
13b3103
4edd87e
 
a291937
22bb999
dbcabfb
 
efa7589
dbcabfb
efa7589
f2740a4
 
b1f9aab
3534b4d
 
c5c929e
9189e38
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import time
import cProfile
import pstats
import pandas as pd
from algo import Algo
from db.db_utils import get_connection

if __name__ == "__main__":
    db_conn = get_connection()
    db_cursor = db_conn.cursor()
    # raw_file_name = 'food-forward-2022-raw-data.csv'
    # raw_file_name = 'MFB-2023-raw-data.csv'

    # get all files in the raw folder and iterate through them
    raw_files = os.listdir('./raw')
    # remove test.csv from raw_files
    raw_files = [f for f in raw_files if f != 'test.csv']

    # for raw_file_name in ['sharing-excess-2020-raw-data.csv', 'sharing-excess-2021-raw-data.csv', 'sharing-excess-2022-raw-data.csv', 'sharing-excess-2023-raw-data.csv']:
    # for raw_file_name in ['spoonfuls-2023-Raw-Data.csv']:
    for raw_file_name in raw_files:
        if not raw_file_name.endswith('.csv'):
            continue

        # chop off the extension for the results run key
        # result_file_name = raw_file_name.split('.')[0]
        # run_key = f"{result_file_name}-{int(time.time())}"
        run_key = raw_file_name.split('.')[0]
        print(f"Processing {raw_file_name}")


        # Check if the file is in the run_meta table
        db_cursor.execute('SELECT run_key FROM run_meta WHERE run_key = %s', (run_key,))
        run_meta_row = db_cursor.fetchone()
        if not run_meta_row:
            # prompt the user for the organization_id and year
            # the user can select from a list of organizations
            db_cursor.execute('SELECT id, name FROM organizations')
            organizations = db_cursor.fetchall()
            for i, org in enumerate(organizations):
                print(f"{i+1}. {org[1]}")
            org_choice = int(input("Select an organization: "))
            
            organization_id = organizations[org_choice-1][0]

            year = int(input("Enter the year: "))

            db_cursor.execute('INSERT INTO run_meta (run_key, organization_id, year) VALUES (%s, %s, %s)', (run_key, organization_id, year))
            db_conn.commit()

        # find the number of rows that were already processed associated with this run key
        db_cursor.execute('SELECT run_row FROM results WHERE run_key = %s ORDER BY run_row DESC', (run_key,))
        
        # get the last row that was processed
        last_row = db_cursor.fetchone()
        
        input_file_path = f'./raw/{raw_file_name}'
        df_input = pd.read_csv(input_file_path)

        # Convert column headers to lowercase
        df_input.columns = df_input.columns.str.lower()

        descriptions = df_input['description'].astype(str).tolist()
        descriptions2 = df_input.get('description2', pd.Series([None] * len(df_input))).astype(str).tolist()
        donors = df_input['donor'].astype(str).tolist()
        dates = df_input['date'].astype(str).tolist()
        weights = df_input['weight'].astype(str).tolist()

        input_data = [(desc, desc2, i + 2, donor, date, weight) for i, (desc, desc2, donor, date, weight) in enumerate(zip(descriptions, descriptions2, donors, dates, weights))]
        
        # run_row is the the last row from the CSV file that was processed, so let's offset from there
        num_rows = len(input_data)
        last_row_num = 0
        if last_row:
            last_row_num = last_row[0] - 1

        # num_rows is the total number of rows in the CSV file but we add one row for the header
        print(f"CSV has {num_rows+1} rows")
        csv_complete = last_row_num >= num_rows
        print("CSV is complete" if csv_complete else "CSV is not complete")
        if not csv_complete:
            print(f"Starting at row #{last_row_num + 1}")

            input_data = input_data[last_row_num:]

            algo = Algo(db_conn, run_key)
            algo.match_words(input_data)

        
        # algo.match_words([['bananas']])

    db_conn.close()