beweinreich committed on
Commit b0756e7
• 1 Parent(s): 1092d9e

added in a specificity audit

audits/1720199868.csv ADDED
@@ -0,0 +1 @@
+ input_word,original_dictionary_word,new_dictionary_word
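The new CSV carries only a header row; the filename (1720199868) looks like a Unix epoch timestamp, so each audit run presumably gets its own file. The audit script added below only prints mismatches rather than writing them out; a hypothetical helper that appends rows matching this header (not part of this commit) might look like:

```python
# Hypothetical helper, not part of this commit: append one audit row to a
# timestamped CSV whose columns match audits/1720199868.csv.
import csv
import time

def append_audit_row(input_word, original_word, new_word, path=None):
    path = path or f"audits/{int(time.time())}.csv"
    with open(path, "a", newline="") as f:
        csv.writer(f).writerow([input_word, original_word, new_word])
```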
specificity_audit.py ADDED
@@ -0,0 +1,28 @@
+ import os
+ import torch
+ import logging
+ from db.db_utils import get_connection
+ from specificity_classifier import classify_text_to_specificity
+
+ logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ logging.info("Connecting to the database...")
+ db_conn = get_connection()
+ db_cursor = db_conn.cursor()
+
+ logging.info("Fetching data from the database...")
+ db_cursor.execute("SELECT input_word, cleaned_word, specificity FROM mappings WHERE specificity IS NOT NULL and is_food = true")
+ results = db_cursor.fetchall()
+
+ for row in results:
+     input_word = row[0]
+     cleaned_word = row[1]
+     specificity = row[2]
+
+     new_specificity = classify_text_to_specificity(cleaned_word)
+     if new_specificity != specificity:
+         print(f"{input_word}")
+         print(f"Old Specificity: {specificity}")
+         print(f"New Specificity: {new_specificity}")
+         # db_cursor.execute("UPDATE mappings SET specificity = %s WHERE input_word = %s", (new_specificity, input_word))
+
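For context: the script assumes db.db_utils.get_connection returns a DB-API connection, and the %s placeholders in the commented-out UPDATE suggest a psycopg2-style driver. A minimal sketch of such a helper, under that assumption (db/db_utils.py itself is not part of this commit and may differ):

```python
# Hypothetical sketch of db/db_utils.py, assuming psycopg2; the real helper
# is not shown in this commit and may read its settings differently.
import os
import psycopg2

def get_connection():
    # Assumes the connection string is provided via a DATABASE_URL env var.
    return psycopg2.connect(os.environ["DATABASE_URL"])
```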
specificity_classifier.py CHANGED
@@ -5,10 +5,6 @@ from sklearn.model_selection import train_test_split
  from db.db_utils import get_connection
  import logging
 
- # Set up logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
  # Set the device to CPU
  device = torch.device("cpu")
 
@@ -20,7 +16,7 @@ label_mapping = {}
 
  # Check if the model exists
  if os.path.exists(save_directory):
-     logger.info(f"Loading the existing model from {save_directory}...")
+     logging.info(f"Loading the existing model from {save_directory}...")
      tokenizer = BertTokenizer.from_pretrained(save_directory)
      model = BertForSequenceClassification.from_pretrained(save_directory)
      # Load the label mapping
@@ -28,34 +24,34 @@ if os.path.exists(save_directory):
      with open(os.path.join(save_directory, 'label_mapping.txt'), 'r') as f:
          label_mapping = eval(f.read())
  else:
-     logger.info("Loading BERT tokenizer and model...")
+     logging.info("Loading BERT tokenizer and model...")
      tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
      model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
 
      # Get data from database
-     logger.info("Connecting to the database...")
+     logging.info("Connecting to the database...")
      db_conn = get_connection()
      db_cursor = db_conn.cursor()
 
-     logger.info("Fetching data from the database...")
+     logging.info("Fetching data from the database...")
      db_cursor.execute("SELECT input_word, specificity FROM mappings WHERE specificity IS NOT NULL and reviewed = true and is_food = true")
      results = db_cursor.fetchall()
      training_data = [(row[0], row[1]) for row in results]
 
      texts, labels = zip(*training_data)
-     logger.info(f"Fetched {len(texts)} records from the database.")
+     logging.info(f"Fetched {len(texts)} records from the database.")
 
      # Convert labels to integers
-     logger.info("Converting labels to integers...")
+     logging.info("Converting labels to integers...")
      label_mapping = {label: idx for idx, label in enumerate(set(labels))}
      labels = [label_mapping[label] for label in labels]
 
      # Split data into training and testing sets
-     logger.info("Splitting data into training and testing sets...")
+     logging.info("Splitting data into training and testing sets...")
      X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)
 
      # Tokenize the data
-     logger.info("Tokenizing the data...")
+     logging.info("Tokenizing the data...")
      train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
      test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)
 
@@ -72,7 +68,7 @@ else:
          def __len__(self):
              return len(self.labels)
 
-     logger.info("Creating datasets...")
+     logging.info("Creating datasets...")
      train_dataset = SpecificityDataset(train_encodings, y_train)
      test_dataset = SpecificityDataset(test_encodings, y_test)
 
@@ -88,7 +84,7 @@ else:
          evaluation_strategy="epoch"
      )
 
-     logger.info("Initializing the Trainer...")
+     logging.info("Initializing the Trainer...")
      trainer = Trainer(
          model=model, # the instantiated 🤗 Transformers model to be trained
          args=training_args, # training arguments, defined above
@@ -96,15 +92,15 @@ else:
          eval_dataset=test_dataset # evaluation dataset
      )
 
-     logger.info("Starting training...")
+     logging.info("Starting training...")
      trainer.train()
 
-     logger.info("Evaluating the model...")
+     logging.info("Evaluating the model...")
      eval_result = trainer.evaluate()
-     logger.info(f"Evaluation results: {eval_result}")
+     logging.info(f"Evaluation results: {eval_result}")
 
      # Save the model and tokenizer
-     logger.info(f"Saving the model to {save_directory}...")
+     logging.info(f"Saving the model to {save_directory}...")
      model.save_pretrained(save_directory)
      tokenizer.save_pretrained(save_directory)
      # Save the label mapping
@@ -114,7 +110,7 @@ else:
  model.to(device)
 
  def classify_text_to_specificity(text):
-     logger.info(f"Classifying text: {text}")
+     logging.info(f"Classifying text: {text}")
      inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
      outputs = model(**inputs)
      logits = outputs.logits
@@ -126,5 +122,5 @@ def classify_text_to_specificity(text):
  # Example usage
  # for example_text in ["produce items", "bananas", "milk", "mixed items", "random assortment", "heterogeneous mixture"]:
  #     predicted_specificity = classify_text_to_specificity(example_text)
- #     logger.info(f"The predicted specificity for '{example_text}' is '{predicted_specificity}'")
- #     logger.info("----------")
+ #     logging.info(f"The predicted specificity for '{example_text}' is '{predicted_specificity}'")
+ #     logging.info("----------")
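A note on the logging change: the commit replaces the classifier's module-level logger (previously configured at INFO inside the module) with bare logging.* calls on the root logger, leaving configuration to whichever script imports the module. Since specificity_audit.py calls basicConfig(level=logging.WARNING), the info-level messages in both files are suppressed by default. A minimal sketch of the pattern:

```python
# Minimal sketch of the root-logger pattern this commit adopts: the entry
# point configures logging once; imported modules just call logging.info().
import logging

logging.basicConfig(level=logging.WARNING)   # set once, in the entry-point script
logging.info("suppressed at WARNING level")  # what happens to the info messages here
logging.warning("still shown")               # warnings and above get through
```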