Spaces:

madebybread
/

brightly-ai

Paused

App Files Files Community

beweinreich committed on Jun 14

Commit

22ad617

•

1 Parent(s): 32ab113

ported everything into dictionary postgres

Browse files

Files changed (12) hide show

category_mapper.py +9 -7
chatgpt_audit.py +48 -49
db/db_utils.py +13 -0
dictionary/additions.csv +0 -32
dictionary/dictionary.csv +0 -0
{multi-item-experiments → old_experiments/multi-item-experiments}/classification_results2.csv +0 -0
{multi-item-experiments → old_experiments/multi-item-experiments}/multifood2.py +0 -0
{multi-item-experiments → old_experiments/multi-item-experiments}/multifood_viz.py +0 -0
preseed.py → old_experiments/preseed.py +0 -0
playground.py +1 -108
similarity_fast.py +3 -6
similarity_slow.py +3 -7

category_mapper.py CHANGED Viewed

@@ -5,6 +5,7 @@ import pandas as pd
 from tqdm import tqdm
 from openai import OpenAI
 from dotenv import load_dotenv
 load_dotenv()
@@ -12,6 +13,9 @@ load_dotenv()
 api_key = os.getenv("OPENAI_API_KEY")
 client = OpenAI(api_key=api_key)
 # Load your Excel file
 file_path = './dictionary/final_corrected_wweia_food_category_complete - final_corrected_wweia_food_category_complete.csv'
 spreadsheet = pd.read_csv(file_path)
@@ -48,11 +52,12 @@ def parse_response(response):
         return None
 # open up the current dictionary csv file
-csv_file_path = './dictionary/dictionary.csv'
-df_dictionary = pd.read_csv(csv_file_path)
-for index, row in tqdm(df_dictionary.iterrows(), desc="Processing input words"):
     # Get the food item and category
     food_item = row['description']
     category = row['food_category']
@@ -68,8 +73,5 @@ for index, row in tqdm(df_dictionary.iterrows(), desc="Processing input words"):
     print(f"A: '{best_category}'")
     print()
-    # Update the dictionary.csv file by adding the best category to the wweia_category column
     if best_category:
-        df_dictionary.loc[index, 'wweia_category'] = best_category
-    df_dictionary.to_csv(csv_file_path, index=False)

 from tqdm import tqdm
 from openai import OpenAI
 from dotenv import load_dotenv
+from db.db_utils import get_connection
 load_dotenv()
 api_key = os.getenv("OPENAI_API_KEY")
 client = OpenAI(api_key=api_key)
+db_conn = get_connection()
+db_cursor = db_conn.cursor()
 # Load your Excel file
 file_path = './dictionary/final_corrected_wweia_food_category_complete - final_corrected_wweia_food_category_complete.csv'
 spreadsheet = pd.read_csv(file_path)
         return None
 # open up the current dictionary csv file
+db_cursor.execute('SELECT * FROM dictionary where wweia_category is null')
+rows = db_cursor.fetchall()
+for row in tqdm(rows, desc="Processing"):
     # Get the food item and category
+    fdc_id = row['fdc_id']
     food_item = row['description']
     category = row['food_category']
     print(f"A: '{best_category}'")
     print()
     if best_category:
+        db_cursor.execute('UPDATE dictionary SET wweia_category = %s WHERE fdc_id = %s', (best_category, fdc_id))

chatgpt_audit.py CHANGED Viewed

@@ -83,17 +83,14 @@ def parse_response(response):
         return None
-csv_file_paths = ['./dictionary/dictionary.csv','./dictionary/additions.csv']
-dictionary = []
-for csv_file_path in csv_file_paths:
-    df_dictionary = pd.read_csv(csv_file_path)
-    _dictionary = df_dictionary['description'].astype(str).tolist()
-    stripped_dictionary = [word.strip() for word in _dictionary]
-    dictionary.extend(stripped_dictionary)
 db_conn = get_connection()
 db_cursor = db_conn.cursor()
 # select all mappings that have not been reviewed
 db_cursor.execute("SELECT input_word, dictionary_word, similar_words FROM mappings WHERE reviewed = 0")
 results = db_cursor.fetchall()
@@ -101,45 +98,47 @@ results = db_cursor.fetchall()
 # iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word
 print("Soft drink, NFS" in dictionary)
-csv_data = []
-for row in results:
-    input_word = row[0]
-    dictionary_word = row[1]
-    similar_words = [item.strip() for item in row[2].split('|')]
-    # find words from the dictionary list based on small Levenshtein distance between input_word and each word in the dictionary
-    levenshtein_words = find_close_levenshtein_words(input_word, dictionary)
-    print(f"Input: {input_word}")
-    print(f" - dictionary_word: {dictionary_word}")
-    print(f" - similar_words: {similar_words}")
-    print(f" - levenshtein_words: {levenshtein_words}")
-    # concatenate the similar_words and levenshtein_words
-    all_words = similar_words + levenshtein_words
-    all_words = list(set(all_words))  # remove duplicates
-    response = query_gpt(input_word, dictionary_word, all_words)
-    if response:
-        csv_data.append({
-            'input_word': input_word,
-            'original_dictionary_word': dictionary_word,
-            'new_dictionary_word': response
-        })
-        if response == dictionary_word and response in dictionary:
-            print(f" - Mapping is correct")
-            db_cursor.execute("UPDATE mappings SET reviewed = 1 WHERE input_word = ?", (input_word,))
-        else:
-            # We should update the mapping in the database
-            # We should replace dictionary_word with response
-            # We should set reviewed to 1
-            # first confirm that the response is in the dictionary
-            if response in dictionary:
-                print(f" - Updating mapping with {response}")
-                db_cursor.execute("UPDATE mappings SET dictionary_word = ?, reviewed = 1 WHERE input_word = ?", (response, input_word))
-                db_conn.commit()
-            else:
-                print(f" - Response {response} is not in the dictionary")
-        update_csv(csv_data)
-db_conn.close()

         return None
 db_conn = get_connection()
 db_cursor = db_conn.cursor()
+# Load the dictionary
+db_cursor.execute("SELECT description FROM dictionary")
+dictionary = db_cursor.fetchall()
+dictionary = [item[0] for item in dictionary]
 # select all mappings that have not been reviewed
 db_cursor.execute("SELECT input_word, dictionary_word, similar_words FROM mappings WHERE reviewed = 0")
 results = db_cursor.fetchall()
 # iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word
 print("Soft drink, NFS" in dictionary)
+print(dictionary)
+print("ensure dictionary works before we start")
+# csv_data = []
+# for row in results:
+#     input_word = row[0]
+#     dictionary_word = row[1]
+#     similar_words = [item.strip() for item in row[2].split('|')]
+#     # find words from the dictionary list based on small Levenshtein distance between input_word and each word in the dictionary
+#     levenshtein_words = find_close_levenshtein_words(input_word, dictionary)
+#     print(f"Input: {input_word}")
+#     print(f" - dictionary_word: {dictionary_word}")
+#     print(f" - similar_words: {similar_words}")
+#     print(f" - levenshtein_words: {levenshtein_words}")
+#     # concatenate the similar_words and levenshtein_words
+#     all_words = similar_words + levenshtein_words
+#     all_words = list(set(all_words))  # remove duplicates
+#     response = query_gpt(input_word, dictionary_word, all_words)
+#     if response:
+#         csv_data.append({
+#             'input_word': input_word,
+#             'original_dictionary_word': dictionary_word,
+#             'new_dictionary_word': response
+#         })
+#         if response == dictionary_word and response in dictionary:
+#             print(f" - Mapping is correct")
+#             db_cursor.execute("UPDATE mappings SET reviewed = 1 WHERE input_word = ?", (input_word,))
+#         else:
+#             # We should update the mapping in the database
+#             # We should replace dictionary_word with response
+#             # We should set reviewed to 1
+#             # first confirm that the response is in the dictionary
+#             if response in dictionary:
+#                 print(f" - Updating mapping with {response}")
+#                 db_cursor.execute("UPDATE mappings SET dictionary_word = ?, reviewed = 1 WHERE input_word = ?", (response, input_word))
+#                 db_conn.commit()
+#             else:
+#                 print(f" - Response {response} is not in the dictionary")
+#         update_csv(csv_data)
+# db_conn.close()

db/db_utils.py CHANGED Viewed

@@ -37,6 +37,19 @@ def initialize_db(conn):
             updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
         )
     ''')
     conn.commit()
 def get_mapping_from_db(cursor, cleaned_word):

             updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
         )
     ''')
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS dictionary (
+            fdc_id INTEGER PRIMARY KEY,
+            description TEXT,
+            food_category TEXT,
+            wweia_category TEXT,
+            water_content REAL,
+            dry_matter_content REAL,
+            leakage REAL,
+            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+        )
+    ''')
     conn.commit()
 def get_mapping_from_db(cursor, cleaned_word):

dictionary/additions.csv DELETED Viewed

@@ -1,32 +0,0 @@
-description,food_category
-"Mixed Produce", "Heterogeneous Mixture"
-"Miscellaneous", "Heterogeneous Mixture"
-"Assorted Vegetables", "Heterogeneous Mixture"
-"Various Fruits", "Heterogeneous Mixture"
-"Assorted Produce", "Heterogeneous Mixture"
-"Mixed Vegetables", "Heterogeneous Mixture"
-"Mixed Vegetables, canned", "Heterogeneous Mixture"
-"Miscellaneous Items", "Heterogeneous Mixture"
-"Mixed Goods", "Heterogeneous Mixture"
-"Assorted Groceries", "Heterogeneous Mixture"
-"Various Groceries", "Heterogeneous Mixture"
-"Mixed Food Items", "Heterogeneous Mixture"
-"Assorted Foods", "Heterogeneous Mixture"
-"Varied Produce", "Heterogeneous Mixture"
-"Assorted Fruit and Veg", "Heterogeneous Mixture"
-"Mixed Fruits and Vegetables", "Heterogeneous Mixture"
-"Miscellaneous Produce", "Heterogeneous Mixture"
-"Assorted Consumables", "Heterogeneous Mixture"
-"Various Edibles", "Heterogeneous Mixture"
-"Mixed Edibles", "Heterogeneous Mixture"
-"Assorted Edible Items", "Heterogeneous Mixture"
-"Mixed Fresh Produce", "Heterogeneous Mixture"
-"Various Produce Items", "Heterogeneous Mixture"
-"Misc Grocery", "Heterogeneous Mixture"
-"Misc Meat", "Heterogeneous Mixture"
-"Misc Produce", "Heterogeneous Mixture"
-"Misc Vegetables", "Heterogeneous Mixture"
-"Grocery Items", "Heterogeneous Mixture"
-"Grocery", "Heterogeneous Mixture"
-"Misc Items", "Non-Food Item"
-"Non-Food Item", "Non-Food Item"

dictionary/dictionary.csv DELETED Viewed

The diff for this file is too large to render. See raw diff

{multi-item-experiments → old_experiments/multi-item-experiments}/classification_results2.csv RENAMED Viewed

File without changes

{multi-item-experiments → old_experiments/multi-item-experiments}/multifood2.py RENAMED Viewed

File without changes

{multi-item-experiments → old_experiments/multi-item-experiments}/multifood_viz.py RENAMED Viewed

File without changes

preseed.py → old_experiments/preseed.py RENAMED Viewed

File without changes

playground.py CHANGED Viewed

@@ -1,108 +1 @@
-import os
-import csv
-import json
-import time
-import heapq
-import pandas as pd
-from openai import OpenAI
-from dotenv import load_dotenv
-from Levenshtein import distance
-from tqdm import tqdm
-from db.db_utils import get_connection, store_mapping_to_db, get_mapping_from_db
-from ask_gpt import query_gpt
-# For any unreviewed mappings, we ask ChatGPT to consider:
-# 1. The similar_words list
-# 2. Similar words from the dictionary based on small Levenshtein distance
-# ChatGPT should confirm that the current mapping is the best one. If not, it should provide the better mapping.
-# If it's a Non-Food Item, we should confirm that
-# If it's a homogeneous or heterogeneous mixture, we should confirm that
-load_dotenv()
-api_key = os.getenv("OPENAI_API_KEY")
-client = OpenAI(api_key=api_key)
-def save_to_csv(results):
-        output_file_path = f'./audits/{int(time.time())}.csv'
-        df_results = pd.DataFrame(results, columns=['input_word', 'original_dictionary_word', 'new_dictionary_word',])
-        df_results.to_csv(output_file_path, index=False)
-def find_close_levenshtein_words(input_word, dictionary, threshold=3):
-    # Calculate Levenshtein distances for each word in the dictionary
-    close_words = [word for word in dictionary if distance(input_word, word) <= threshold]
-    return close_words
-def query_gpt(food_item, dictionary_word, similar_words):
-    line_separated_words = '\n'.join(similar_words)
-    prompt = (
-      f"""I have a particular food item and a mapping to a USDA word. Can you confirm if the food item is most similar to the mapping?
-      Generally, you should prefer the mapped word, but if you believe there is a better fit, please provide it.
-      I will also provide a list of other similar words that you could be a better fit.
-      If it's not a food item, return 'Non-Food Item'.
-      You should respond in JSON format with an object that has the key `guess`, and the value is the most similar food item.
-      The food item is: "{food_item}"
-      It has been mapped to: "{dictionary_word}"
-      Similar words:
-      {line_separated_words}"""
-    )
-    completion = client.chat.completions.create(
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": prompt}
-        ],
-        model="gpt-3.5-turbo-1106",
-        response_format={"type": "json_object"},
-    )
-    response = completion.choices[0].message.content
-    parsed = parse_response(response)
-    print(f"Q: '{food_item}'")
-    print(f"A: '{parsed}'")
-    print()
-    return parsed
-# Define the function to parse the GPT response
-def parse_response(response):
-    try:
-        result = json.loads(response)
-        return result['guess']
-    except (json.JSONDecodeError, KeyError) as e:
-        print(f"Error parsing response: {response} - {e}")
-        return None
-csv_file_paths = ['./dictionary/dictionary.csv','./dictionary/additions.csv']
-dictionary = []
-for csv_file_path in csv_file_paths:
-    df_dictionary = pd.read_csv(csv_file_path)
-    _dictionary = df_dictionary['description'].astype(str).tolist()
-    dictionary.extend(_dictionary)
-db_conn = get_connection()
-db_cursor = db_conn.cursor()
-# select all mappings that have not been reviewed
-db_cursor.execute("SELECT input_word, dictionary_word, similar_words FROM mappings")
-results = db_cursor.fetchall()
-# iterate through each row, grab the input_word and ask chatgpt to compare it to the dictionary_word
-csv_data = []
-for row in results:
-    input_word = row[0]
-    print(f"input_word: {input_word}")
-    dictionary_word = row[1]
-    if dictionary_word not in dictionary:
-        db_cursor.execute("UPDATE mappings SET reviewed = 0 WHERE input_word = ?", (input_word,))
-print(csv_data)


1	+ # Nothing here

similarity_fast.py CHANGED Viewed

@@ -10,7 +10,6 @@ from utils import generate_embedding, clean_word, cosine_similarity, calculate_c
 # model_name = 'sentence-transformers/all-MiniLM-L6-v2'
 model_name = 'sentence-transformers/all-mpnet-base-v2'
-csv_file_paths = ['./dictionary/dictionary.csv','./dictionary/additions.csv']
 filename = model_name.replace('/', '-')
 pickle_file_path = f'./embeddings/fast/{filename}.pkl'
@@ -20,11 +19,9 @@ class SimilarityFast:
         self.db_cursor = db_cursor
         self.model = SentenceTransformer(model_name)
-        dictionary = []
-        for csv_file_path in csv_file_paths:
-            df_dictionary = pd.read_csv(csv_file_path)
-            _dictionary = df_dictionary['description'].astype(str).tolist()
-            dictionary.extend(_dictionary)
         self.dictionary_embeddings = self.load_dictionary_embeddings(dictionary)

 # model_name = 'sentence-transformers/all-MiniLM-L6-v2'
 model_name = 'sentence-transformers/all-mpnet-base-v2'
 filename = model_name.replace('/', '-')
 pickle_file_path = f'./embeddings/fast/{filename}.pkl'
         self.db_cursor = db_cursor
         self.model = SentenceTransformer(model_name)
+        self.db_cursor.execute("SELECT description FROM dictionary")
+        dictionary = self.db_cursor.fetchall()
+        dictionary = [item[0] for item in dictionary]
         self.dictionary_embeddings = self.load_dictionary_embeddings(dictionary)

similarity_slow.py CHANGED Viewed

@@ -10,7 +10,6 @@ from utils import generate_embedding, cosine_similarity, clean_word, calculate_c
 # model_name = 'sentence-transformers/all-MiniLM-L6-v2'
 model_name = 'sentence-transformers/all-mpnet-base-v2'
-csv_file_paths = ['./dictionary/dictionary.csv','./dictionary/additions.csv']
 filename = model_name.replace('/', '-')
 pickle_file_path = f'./embeddings/slow/{filename}.pkl'
@@ -21,12 +20,9 @@ class SimilaritySlow:
         self.db_conn = db_conn
         self.model = SentenceTransformer(model_name)
-        dictionary = []
-        for csv_file_path in csv_file_paths:
-            df_dictionary = pd.read_csv(csv_file_path)
-            _dictionary = df_dictionary['description'].astype(str).tolist()
-            dictionary.extend(_dictionary)
         self.dictionary_embeddings = self.load_dictionary_embeddings(dictionary)
     def preprocess_dictionary_word(self, text):

 # model_name = 'sentence-transformers/all-MiniLM-L6-v2'
 model_name = 'sentence-transformers/all-mpnet-base-v2'
 filename = model_name.replace('/', '-')
 pickle_file_path = f'./embeddings/slow/{filename}.pkl'
         self.db_conn = db_conn
         self.model = SentenceTransformer(model_name)
+        self.db_cursor.execute("SELECT description FROM dictionary")
+        dictionary = self.db_cursor.fetchall()
+        dictionary = [item[0] for item in dictionary]
         self.dictionary_embeddings = self.load_dictionary_embeddings(dictionary)
     def preprocess_dictionary_word(self, text):