beweinreich committed on
Commit
22ad617
•
1 Parent(s): 32ab113

ported everything into dictionary postgres

category_mapper.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 from tqdm import tqdm
 from openai import OpenAI
 from dotenv import load_dotenv
+from db.db_utils import get_connection
 
 
 load_dotenv()
@@ -12,6 +13,9 @@ load_dotenv()
 api_key = os.getenv("OPENAI_API_KEY")
 client = OpenAI(api_key=api_key)
 
+db_conn = get_connection()
+db_cursor = db_conn.cursor()
+
 # Load your Excel file
 file_path = './dictionary/final_corrected_wweia_food_category_complete - final_corrected_wweia_food_category_complete.csv'
 spreadsheet = pd.read_csv(file_path)
@@ -48,11 +52,12 @@ def parse_response(response):
     return None
 
 # open up the current dictionary csv file
-csv_file_path = './dictionary/dictionary.csv'
-df_dictionary = pd.read_csv(csv_file_path)
+db_cursor.execute('SELECT * FROM dictionary where wweia_category is null')
+rows = db_cursor.fetchall()
 
-for index, row in tqdm(df_dictionary.iterrows(), desc="Processing input words"):
+for row in tqdm(rows, desc="Processing"):
     # Get the food item and category
+    fdc_id = row['fdc_id']
     food_item = row['description']
     category = row['food_category']
 
@@ -68,8 +73,5 @@ for index, row in tqdm(df_dictionary.iterrows(), desc="Processing input words"):
     print(f"A: '{best_category}'")
     print()
 
-    # Update the dictionary.csv file by adding the best category to the wweia_category column
     if best_category:
-        df_dictionary.loc[index, 'wweia_category'] = best_category
-
-df_dictionary.to_csv(csv_file_path, index=False)
+        db_cursor.execute('UPDATE dictionary SET wweia_category = %s WHERE fdc_id = %s', (best_category, fdc_id))
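
One caveat with the new loop: a stock psycopg2 cursor returns rows as plain tuples, so `row['fdc_id']` and `row['description']` only work if `get_connection` hands back dict-style rows, and the UPDATEs only persist once the connection commits. A minimal sketch of what that could look like (assuming psycopg2 and a hypothetical DATABASE_URL environment variable; the real db/db_utils.py may differ):

import os

import psycopg2
from psycopg2.extras import RealDictCursor

def get_connection():
    # cursor_factory=RealDictCursor makes fetchall() return dicts,
    # so row['fdc_id'] and row['description'] work as written above.
    return psycopg2.connect(os.getenv("DATABASE_URL"), cursor_factory=RealDictCursor)

# ...and after the loop finishes, persist the UPDATE statements:
# db_conn.commit()
# db_conn.close()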
 
 
chatgpt_audit.py CHANGED
@@ -83,17 +83,14 @@ def parse_response(response):
     return None
 
 
-csv_file_paths = ['./dictionary/dictionary.csv','./dictionary/additions.csv']
-dictionary = []
-for csv_file_path in csv_file_paths:
-    df_dictionary = pd.read_csv(csv_file_path)
-    _dictionary = df_dictionary['description'].astype(str).tolist()
-    stripped_dictionary = [word.strip() for word in _dictionary]
-    dictionary.extend(stripped_dictionary)
-
 db_conn = get_connection()
 db_cursor = db_conn.cursor()
 
+# Load the dictionary
+db_cursor.execute("SELECT description FROM dictionary")
+dictionary = db_cursor.fetchall()
+dictionary = [item[0] for item in dictionary]
+
 # select all mappings that have not been reviewed
 db_cursor.execute("SELECT input_word, dictionary_word, similar_words FROM mappings WHERE reviewed = 0")
 results = db_cursor.fetchall()
@@ -101,45 +98,47 @@ results = db_cursor.fetchall()
 # iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word
 
 print("Soft drink, NFS" in dictionary)
-
-csv_data = []
-for row in results:
-    input_word = row[0]
-    dictionary_word = row[1]
-    similar_words = [item.strip() for item in row[2].split('|')]
+print(dictionary)
+print("ensure dictionary works before we start")
+
+# csv_data = []
+# for row in results:
+#     input_word = row[0]
+#     dictionary_word = row[1]
+#     similar_words = [item.strip() for item in row[2].split('|')]
 
-    # find words from the dictionary list based on small levenstein distance between input_word and each word in the dictionary
-    levenshtein_words = find_close_levenshtein_words(input_word, dictionary)
-    print(f"Input: {input_word}")
-    print(f" - dictionary_word: {dictionary_word}")
-    print(f" - similar_words: {similar_words}")
-    print(f" - levenshtein_words: {levenshtein_words}")
-
-    # concatenate the similar_words and levenshtein_words
-    all_words = similar_words + levenshtein_words
-    all_words = list(set(all_words)) # remove duplicates
-    response = query_gpt(input_word, dictionary_word, all_words)
-    if response:
-        csv_data.append({
-            'input_word': input_word,
-            'original_dictionary_word': dictionary_word,
-            'new_dictionary_word': response
-        })
-        if response == dictionary_word and response in dictionary:
-            print(f" - Mapping is correct")
-            db_cursor.execute("UPDATE mappings SET reviewed = 1 WHERE input_word = ?", (input_word,))
-        else:
-            # We should update the mapping in the database
-            # We should replace dictionary_word with response
-            # We should set reviewed to 1
-            # first confirm that the response is in the dictionary
-            if response in dictionary:
-                print(f" - Updating mapping with {response}")
-                db_cursor.execute("UPDATE mappings SET dictionary_word = ?, reviewed = 1 WHERE input_word = ?", (response, input_word))
-                db_conn.commit()
-            else:
-                print(f" - Response {response} is not in the dictionary")
-
-update_csv(csv_data)
-
-db_conn.close()
+# # find words from the dictionary list based on small levenstein distance between input_word and each word in the dictionary
+#     levenshtein_words = find_close_levenshtein_words(input_word, dictionary)
+#     print(f"Input: {input_word}")
+#     print(f" - dictionary_word: {dictionary_word}")
+#     print(f" - similar_words: {similar_words}")
+#     print(f" - levenshtein_words: {levenshtein_words}")
+
+# # concatenate the similar_words and levenshtein_words
+#     all_words = similar_words + levenshtein_words
+#     all_words = list(set(all_words)) # remove duplicates
+#     response = query_gpt(input_word, dictionary_word, all_words)
+#     if response:
+#         csv_data.append({
+#             'input_word': input_word,
+#             'original_dictionary_word': dictionary_word,
+#             'new_dictionary_word': response
+#         })
+#         if response == dictionary_word and response in dictionary:
+#             print(f" - Mapping is correct")
+#             db_cursor.execute("UPDATE mappings SET reviewed = 1 WHERE input_word = ?", (input_word,))
+#         else:
+#             # We should update the mapping in the database
+#             # We should replace dictionary_word with response
+#             # We should set reviewed to 1
+#             # first confirm that the response is in the dictionary
+#             if response in dictionary:
+#                 print(f" - Updating mapping with {response}")
+#                 db_cursor.execute("UPDATE mappings SET dictionary_word = ?, reviewed = 1 WHERE input_word = ?", (response, input_word))
+#                 db_conn.commit()
+#             else:
+#                 print(f" - Response {response} is not in the dictionary")
+
+# update_csv(csv_data)
+
+# db_conn.close()
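
Note that the commented-out review loop still carries sqlite-style `?` placeholders; if it is re-enabled against Postgres, psycopg2 expects `%s` instead. A sketch of the two UPDATE calls in Postgres form (same `mappings` columns as above):

# Confirmed mapping: just mark it reviewed.
db_cursor.execute(
    "UPDATE mappings SET reviewed = 1 WHERE input_word = %s",
    (input_word,),
)

# Better mapping found: swap in the new dictionary word and mark it reviewed.
db_cursor.execute(
    "UPDATE mappings SET dictionary_word = %s, reviewed = 1 WHERE input_word = %s",
    (response, input_word),
)
db_conn.commit()
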
db/db_utils.py CHANGED
@@ -37,6 +37,19 @@ def initialize_db(conn):
         updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
     )
     ''')
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS dictionary (
+            fdc_id INTEGER PRIMARY KEY,
+            description TEXT,
+            food_category TEXT,
+            wweia_category TEXT,
+            water_content REAL,
+            dry_matter_content REAL,
+            leakage REAL,
+            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+        )
+    ''')
     conn.commit()
 
 def get_mapping_from_db(cursor, cleaned_word):
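
With dictionary.csv deleted in this commit, the new table presumably gets seeded once from the old file. A one-off backfill sketch (assuming psycopg2 and that the CSV carried fdc_id, description, and food_category columns; adjust to the real layout):

import csv

from db.db_utils import get_connection

conn = get_connection()
cur = conn.cursor()
with open('./dictionary/dictionary.csv', newline='') as f:
    for row in csv.DictReader(f):
        # ON CONFLICT skips rows already present, so the backfill is safe to re-run.
        cur.execute(
            """INSERT INTO dictionary (fdc_id, description, food_category)
               VALUES (%s, %s, %s)
               ON CONFLICT (fdc_id) DO NOTHING""",
            (int(row['fdc_id']), row['description'], row['food_category']),
        )
conn.commit()
conn.close()
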
dictionary/additions.csv DELETED
@@ -1,32 +0,0 @@
-description,food_category
-"Mixed Produce", "Heterogeneous Mixture"
-"Miscellaneous", "Heterogeneous Mixture"
-"Assorted Vegetables", "Heterogeneous Mixture"
-"Various Fruits", "Heterogeneous Mixture"
-"Assorted Produce", "Heterogeneous Mixture"
-"Mixed Vegetables", "Heterogeneous Mixture"
-"Mixed Vegetables, canned", "Heterogeneous Mixture"
-"Miscellaneous Items", "Heterogeneous Mixture"
-"Mixed Goods", "Heterogeneous Mixture"
-"Assorted Groceries", "Heterogeneous Mixture"
-"Various Groceries", "Heterogeneous Mixture"
-"Mixed Food Items", "Heterogeneous Mixture"
-"Assorted Foods", "Heterogeneous Mixture"
-"Varied Produce", "Heterogeneous Mixture"
-"Assorted Fruit and Veg", "Heterogeneous Mixture"
-"Mixed Fruits and Vegetables", "Heterogeneous Mixture"
-"Miscellaneous Produce", "Heterogeneous Mixture"
-"Assorted Consumables", "Heterogeneous Mixture"
-"Various Edibles", "Heterogeneous Mixture"
-"Mixed Edibles", "Heterogeneous Mixture"
-"Assorted Edible Items", "Heterogeneous Mixture"
-"Mixed Fresh Produce", "Heterogeneous Mixture"
-"Various Produce Items", "Heterogeneous Mixture"
-"Misc Grocery", "Heterogeneous Mixture"
-"Misc Meat", "Heterogeneous Mixture"
-"Misc Produce", "Heterogeneous Mixture"
-"Misc Vegetables", "Heterogeneous Mixture"
-"Grocery Items", "Heterogeneous Mixture"
-"Grocery", "Heterogeneous Mixture"
-"Misc Items", "Non-Food Item"
-"Non-Food Item", "Non-Food Item"
dictionary/dictionary.csv DELETED
The diff for this file is too large to render. See raw diff
 
{multi-item-experiments → old_experiments/multi-item-experiments}/classification_results2.csv RENAMED
File without changes
{multi-item-experiments → old_experiments/multi-item-experiments}/multifood2.py RENAMED
File without changes
{multi-item-experiments → old_experiments/multi-item-experiments}/multifood_viz.py RENAMED
File without changes
preseed.py → old_experiments/preseed.py RENAMED
File without changes
playground.py CHANGED
@@ -1,108 +1 @@
-import os
-import csv
-import json
-import time
-import heapq
-import pandas as pd
-from openai import OpenAI
-from dotenv import load_dotenv
-from Levenshtein import distance
-from tqdm import tqdm
-from db.db_utils import get_connection, store_mapping_to_db, get_mapping_from_db
-from ask_gpt import query_gpt
-
-# For any unreviewed mappings, we ask chatgpt to consider:
-# 1. The similar_words list
-# 2. Similar words from the dictionary based on small levenstein distance
-
-# ChatGPT should confirm that the current mapping is the best one. If not, they should provide the better mapping.
-# If its a Non-Food Item, we should confirm that
-# If it's a homogenous or hetergeneous mixture, we should confirm that
-
-load_dotenv()
-
-api_key = os.getenv("OPENAI_API_KEY")
-client = OpenAI(api_key=api_key)
-
-
-def save_to_csv(results):
-    output_file_path = f'./audits/{int(time.time())}.csv'
-    df_results = pd.DataFrame(results, columns=['input_word', 'original_dictionary_word', 'new_dictionary_word',])
-    df_results.to_csv(output_file_path, index=False)
-
-def find_close_levenshtein_words(input_word, dictionary, threshold=3):
-    # Calculate Levenshtein distances for each word in the dictionary
-    close_words = [word for word in dictionary if distance(input_word, word) <= threshold]
-    return close_words
-
-def query_gpt(food_item, dictionary_word, similar_words):
-    line_separated_words = '\n'.join(similar_words)
-
-    prompt = (
-        f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping?
-
-Generally, you should prefer the mapped word, but if you believe there is a better fit, please provide it.
-
-I will also provide a list of other similar words that you could be a better fit.
-
-If it's not a food item, return 'Non-Food Item'.
-
-You should respond in JSON format with an object that has the key `guess`, and the value is the most similar food item.
-
-The food item is: "{food_item}"
-It has been mapped to: "{dictionary_word}"
-
-Similar words:
-{line_separated_words}"""
-    )
-
-    completion = client.chat.completions.create(
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": prompt}
-        ],
-        model="gpt-3.5-turbo-1106",
-        response_format={"type": "json_object"},
-    )
-    response = completion.choices[0].message.content
-    parsed = parse_response(response)
-    print(f"Q: '{food_item}'")
-    print(f"A: '{parsed}'")
-    print()
-    return parsed
-
-# Define the function to parse the GPT response
-def parse_response(response):
-    try:
-        result = json.loads(response)
-        return result['guess']
-    except (json.JSONDecodeError, KeyError) as e:
-        print(f"Error parsing response: {response} - {e}")
-        return None
-
-
-csv_file_paths = ['./dictionary/dictionary.csv','./dictionary/additions.csv']
-dictionary = []
-for csv_file_path in csv_file_paths:
-    df_dictionary = pd.read_csv(csv_file_path)
-    _dictionary = df_dictionary['description'].astype(str).tolist()
-    dictionary.extend(_dictionary)
-
-db_conn = get_connection()
-db_cursor = db_conn.cursor()
-
-# select all mappings that have not been reviewed
-db_cursor.execute("SELECT input_word, dictionary_word, similar_words FROM mappings")
-results = db_cursor.fetchall()
-
-# iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word
-csv_data = []
-for row in results:
-    input_word = row[0]
-    print(f"input_word: {input_word}")
-    dictionary_word = row[1]
-    if dictionary_word not in dictionary:
-        db_cursor.execute("UPDATE mappings SET reviewed = 0 WHERE input_word = ?", (input_word,))
-
-
-print(csv_data)
+# Nothing here
similarity_fast.py CHANGED
@@ -10,7 +10,6 @@ from utils import generate_embedding, clean_word, cosine_similarity, calculate_c
 
 # model_name = 'sentence-transformers/all-MiniLM-L6-v2'
 model_name = 'sentence-transformers/all-mpnet-base-v2'
-csv_file_paths = ['./dictionary/dictionary.csv','./dictionary/additions.csv']
 filename = model_name.replace('/', '-')
 pickle_file_path = f'./embeddings/fast/{filename}.pkl'
 
@@ -20,11 +19,9 @@ class SimilarityFast:
         self.db_cursor = db_cursor
         self.model = SentenceTransformer(model_name)
 
-        dictionary = []
-        for csv_file_path in csv_file_paths:
-            df_dictionary = pd.read_csv(csv_file_path)
-            _dictionary = df_dictionary['description'].astype(str).tolist()
-            dictionary.extend(_dictionary)
+        self.db_cursor.execute("SELECT description FROM dictionary")
+        dictionary = self.db_cursor.fetchall()
+        dictionary = [item[0] for item in dictionary]
 
         self.dictionary_embeddings = self.load_dictionary_embeddings(dictionary)

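The same three-line fetch now appears in category_mapper.py, chatgpt_audit.py, and both similarity classes. A small helper in db/db_utils.py (hypothetical, not part of this commit) would keep the query in one place:

def get_dictionary_descriptions(cursor):
    # Flatten the single-column result set into a plain list of strings.
    cursor.execute("SELECT description FROM dictionary")
    return [row[0] for row in cursor.fetchall()]

Each call site then reduces to `dictionary = get_dictionary_descriptions(self.db_cursor)`.
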
similarity_slow.py CHANGED
@@ -10,7 +10,6 @@ from utils import generate_embedding, cosine_similarity, clean_word, calculate_c
 
 # model_name = 'sentence-transformers/all-MiniLM-L6-v2'
 model_name = 'sentence-transformers/all-mpnet-base-v2'
-csv_file_paths = ['./dictionary/dictionary.csv','./dictionary/additions.csv']
 filename = model_name.replace('/', '-')
 pickle_file_path = f'./embeddings/slow/{filename}.pkl'
 
@@ -21,12 +20,9 @@ class SimilaritySlow:
         self.db_conn = db_conn
         self.model = SentenceTransformer(model_name)
 
-        dictionary = []
-        for csv_file_path in csv_file_paths:
-            df_dictionary = pd.read_csv(csv_file_path)
-            _dictionary = df_dictionary['description'].astype(str).tolist()
-            dictionary.extend(_dictionary)
-
+        self.db_cursor.execute("SELECT description FROM dictionary")
+        dictionary = self.db_cursor.fetchall()
+        dictionary = [item[0] for item in dictionary]
         self.dictionary_embeddings = self.load_dictionary_embeddings(dictionary)
 
     def preprocess_dictionary_word(self, text):
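
Unlike SimilarityFast, this constructor is only shown storing `self.db_conn`, so the new `self.db_cursor.execute(...)` lines will raise AttributeError unless a cursor is assigned elsewhere in `__init__`. One way to make the snippet self-contained (a sketch against a standard DB-API connection):

# Derive a cursor from the stored connection instead of assuming self.db_cursor.
cursor = self.db_conn.cursor()
cursor.execute("SELECT description FROM dictionary")
dictionary = [row[0] for row in cursor.fetchall()]
self.dictionary_embeddings = self.load_dictionary_embeddings(dictionary)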