Spaces:
Paused
Paused
Commit
•
a216741
1
Parent(s):
efa7589
need to figure out how to make this more performant
Browse files- db/db_utils.py +17 -0
- requirements.txt +0 -1
- run.py +4 -4
db/db_utils.py
CHANGED
@@ -101,6 +101,23 @@ def get_mapping_from_db(cursor, cleaned_word):
|
|
101 |
return dict(zip(columns, row))
|
102 |
return None
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
def get_dictionary_data_from_db(cursor, dictionary_word):
|
105 |
cursor.execute('SELECT * FROM dictionary WHERE description = %s', (dictionary_word,))
|
106 |
row = cursor.fetchone()
|
|
|
101 |
return dict(zip(columns, row))
|
102 |
return None
|
103 |
|
104 |
+
def get_batch_mapping_from_db(cursor, cleaned_words):
    """Fetch mapping rows for many cleaned words in a single query.

    Parameters:
        cursor: DB-API cursor (psycopg2-style; uses ``%s`` placeholders).
        cleaned_words: sequence of cleaned-word strings to look up.

    Returns:
        dict mapping each found ``cleaned_word`` to its full row as a
        ``{column_name: value}`` dict. Words with no matching row are
        simply absent. Returns ``{}`` for empty input or no matches.
    """
    if not cleaned_words:
        return {}

    # Build one IN (...) query with a placeholder per word: a single
    # round trip instead of one query per word (the point of this
    # batch helper).
    placeholders = ', '.join(['%s'] * len(cleaned_words))
    query = f'SELECT * FROM mappings WHERE cleaned_word IN ({placeholders})'

    cursor.execute(query, tuple(cleaned_words))
    rows = cursor.fetchall()

    if rows:
        columns = [col[0] for col in cursor.description]
        # Hoist the column-index lookup out of the comprehension:
        # columns.index() is O(#columns) and was previously re-evaluated
        # for every fetched row.
        key_idx = columns.index('cleaned_word')
        return {row[key_idx]: dict(zip(columns, row)) for row in rows}

    return {}
|
120 |
+
|
121 |
def get_dictionary_data_from_db(cursor, dictionary_word):
|
122 |
cursor.execute('SELECT * FROM dictionary WHERE description = %s', (dictionary_word,))
|
123 |
row = cursor.fetchone()
|
requirements.txt
CHANGED
@@ -7,7 +7,6 @@ openai==1.34.0
|
|
7 |
pluralizer==1.2.0
|
8 |
psutil==6.0.0
|
9 |
psycopg2-binary==2.9.9
|
10 |
-
asyncpg==0.29.0
|
11 |
python-dotenv==1.0.1
|
12 |
python-Levenshtein==0.25.1
|
13 |
requests==2.32.3
|
|
|
7 |
pluralizer==1.2.0
|
8 |
psutil==6.0.0
|
9 |
psycopg2-binary==2.9.9
|
|
|
10 |
python-dotenv==1.0.1
|
11 |
python-Levenshtein==0.25.1
|
12 |
requests==2.32.3
|
run.py
CHANGED
@@ -8,15 +8,15 @@ from db.db_utils import get_connection
|
|
8 |
if __name__ == "__main__":
|
9 |
db_conn = get_connection()
|
10 |
db_cursor = db_conn.cursor()
|
11 |
-
raw_file_name = 'food-forward-2022-raw-data.csv'
|
12 |
-
|
13 |
|
14 |
# chop off the extension for the results run key
|
15 |
result_file_name = raw_file_name.split('.')[0]
|
16 |
run_key = f"{result_file_name}-{int(time.time())}"
|
17 |
|
18 |
# override
|
19 |
-
run_key = 'food-forward-2022-raw-data-1719334900'
|
20 |
|
21 |
# find the number of rows that were already processed associated with this run key
|
22 |
db_cursor.execute('SELECT run_row FROM results WHERE run_key = %s ORDER BY run_row DESC', (run_key,))
|
@@ -36,7 +36,7 @@ if __name__ == "__main__":
|
|
36 |
last_row_num = last_row[0] - 1
|
37 |
|
38 |
print(f"Processing {len(input_data)} rows")
|
39 |
-
print(f"
|
40 |
|
41 |
input_data = input_data[last_row_num:]
|
42 |
|
|
|
8 |
if __name__ == "__main__":
|
9 |
db_conn = get_connection()
|
10 |
db_cursor = db_conn.cursor()
|
11 |
+
# raw_file_name = 'food-forward-2022-raw-data.csv'
|
12 |
+
raw_file_name = 'MFB-2023-raw-data.csv'
|
13 |
|
14 |
# chop off the extension for the results run key
|
15 |
result_file_name = raw_file_name.split('.')[0]
|
16 |
run_key = f"{result_file_name}-{int(time.time())}"
|
17 |
|
18 |
# override
|
19 |
+
# run_key = 'food-forward-2022-raw-data-1719334900'
|
20 |
|
21 |
# find the number of rows that were already processed associated with this run key
|
22 |
db_cursor.execute('SELECT run_row FROM results WHERE run_key = %s ORDER BY run_row DESC', (run_key,))
|
|
|
36 |
last_row_num = last_row[0] - 1
|
37 |
|
38 |
print(f"Processing {len(input_data)} rows")
|
39 |
+
print(f"Starting at row #{last_row_num}")
|
40 |
|
41 |
input_data = input_data[last_row_num:]
|
42 |
|