beweinreich committed on
Commit
a216741
1 Parent(s): efa7589

need to figure out how to make this more performant

Browse files
Files changed (3) hide show
  1. db/db_utils.py +17 -0
  2. requirements.txt +0 -1
  3. run.py +4 -4
db/db_utils.py CHANGED
@@ -101,6 +101,23 @@ def get_mapping_from_db(cursor, cleaned_word):
101
  return dict(zip(columns, row))
102
  return None
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  def get_dictionary_data_from_db(cursor, dictionary_word):
105
  cursor.execute('SELECT * FROM dictionary WHERE description = %s', (dictionary_word,))
106
  row = cursor.fetchone()
 
101
  return dict(zip(columns, row))
102
  return None
103
 
104
def get_batch_mapping_from_db(cursor, cleaned_words):
    """Fetch the ``mappings`` rows for many cleaned words in a single query.

    Args:
        cursor: DB-API cursor that accepts ``%s`` placeholders.
        cleaned_words: Sized collection of hashable values to match against
            the ``cleaned_word`` column. Duplicates are tolerated.

    Returns:
        A dict keyed by each matched ``cleaned_word`` value; every value is a
        dict of column name -> column value for that row. Words with no
        matching row are simply absent. An empty dict is returned when the
        input is empty or nothing matched.

    Raises:
        ValueError: If the result set has no ``cleaned_word`` column.
    """
    if not cleaned_words:
        return {}

    # Duplicates cannot change the result of an IN (...) filter, so drop them
    # (keeping first-seen order) to keep the placeholder list short.
    unique_words = list(dict.fromkeys(cleaned_words))

    # One placeholder per word; the values are still bound by the driver.
    placeholders = ', '.join(['%s'] * len(unique_words))
    query = f'SELECT * FROM mappings WHERE cleaned_word IN ({placeholders})'

    cursor.execute(query, tuple(unique_words))
    rows = cursor.fetchall()
    if not rows:
        return {}

    columns = [col[0] for col in cursor.description]
    # Resolve the key column once rather than once per row.
    key_index = columns.index('cleaned_word')
    return {row[key_index]: dict(zip(columns, row)) for row in rows}
120
+
121
  def get_dictionary_data_from_db(cursor, dictionary_word):
122
  cursor.execute('SELECT * FROM dictionary WHERE description = %s', (dictionary_word,))
123
  row = cursor.fetchone()
requirements.txt CHANGED
@@ -7,7 +7,6 @@ openai==1.34.0
7
  pluralizer==1.2.0
8
  psutil==6.0.0
9
  psycopg2-binary==2.9.9
10
- asyncpg==0.29.0
11
  python-dotenv==1.0.1
12
  python-Levenshtein==0.25.1
13
  requests==2.32.3
 
7
  pluralizer==1.2.0
8
  psutil==6.0.0
9
  psycopg2-binary==2.9.9
 
10
  python-dotenv==1.0.1
11
  python-Levenshtein==0.25.1
12
  requests==2.32.3
run.py CHANGED
@@ -8,15 +8,15 @@ from db.db_utils import get_connection
8
  if __name__ == "__main__":
9
  db_conn = get_connection()
10
  db_cursor = db_conn.cursor()
11
- raw_file_name = 'food-forward-2022-raw-data.csv'
12
- # raw_file_name = 'MFB-2023-raw-data.csv'
13
 
14
  # chop off the extension for the results run key
15
  result_file_name = raw_file_name.split('.')[0]
16
  run_key = f"{result_file_name}-{int(time.time())}"
17
 
18
  # override
19
- run_key = 'food-forward-2022-raw-data-1719334900'
20
 
21
  # find the number of rows that were already processed associated with this run key
22
  db_cursor.execute('SELECT run_row FROM results WHERE run_key = %s ORDER BY run_row DESC', (run_key,))
@@ -36,7 +36,7 @@ if __name__ == "__main__":
36
  last_row_num = last_row[0] - 1
37
 
38
  print(f"Processing {len(input_data)} rows")
39
- print(f"Skipping to {last_row_num} row")
40
 
41
  input_data = input_data[last_row_num:]
42
 
 
8
  if __name__ == "__main__":
9
  db_conn = get_connection()
10
  db_cursor = db_conn.cursor()
11
+ # raw_file_name = 'food-forward-2022-raw-data.csv'
12
+ raw_file_name = 'MFB-2023-raw-data.csv'
13
 
14
  # chop off the extension for the results run key
15
  result_file_name = raw_file_name.split('.')[0]
16
  run_key = f"{result_file_name}-{int(time.time())}"
17
 
18
  # override
19
+ # run_key = 'food-forward-2022-raw-data-1719334900'
20
 
21
  # find the number of rows that were already processed associated with this run key
22
  db_cursor.execute('SELECT run_row FROM results WHERE run_key = %s ORDER BY run_row DESC', (run_key,))
 
36
  last_row_num = last_row[0] - 1
37
 
38
  print(f"Processing {len(input_data)} rows")
39
+ print(f"Starting at row #{last_row_num}")
40
 
41
  input_data = input_data[last_row_num:]
42