beweinreich committed on
Commit b0756e7
• 1 Parent(s): 1092d9e

added in a specificity audit

audits/1720199868.csv ADDED
@@ -0,0 +1 @@
+ input_word,original_dictionary_word,new_dictionary_word
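The new CSV carries only a header row; the filename (1720199868) looks like a Unix epoch timestamp, so each audit run presumably gets its own file. The audit script added below only prints mismatches rather than writing them out; a hypothetical helper that appends rows matching this header (not part of this commit) might look like:

```python
# Hypothetical helper, not part of this commit: append one audit row to a
# timestamped CSV whose columns match audits/1720199868.csv.
import csv
import time

def append_audit_row(input_word, original_word, new_word, path=None):
    path = path or f"audits/{int(time.time())}.csv"
    with open(path, "a", newline="") as f:
        csv.writer(f).writerow([input_word, original_word, new_word])
```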
specificity_audit.py ADDED
@@ -0,0 +1,28 @@
+ import os
+ import torch
+ import logging
+ from db.db_utils import get_connection
+ from specificity_classifier import classify_text_to_specificity
+
+ logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ logging.info("Connecting to the database...")
+ db_conn = get_connection()
+ db_cursor = db_conn.cursor()
+
+ logging.info("Fetching data from the database...")
+ db_cursor.execute("SELECT input_word, cleaned_word, specificity FROM mappings WHERE specificity IS NOT NULL and is_food = true")
+ results = db_cursor.fetchall()
+
+ for row in results:
+     input_word = row[0]
+     cleaned_word = row[1]
+     specificity = row[2]
+
+     new_specificity = classify_text_to_specificity(cleaned_word)
+     if new_specificity != specificity:
+         print(f"{input_word}")
+         print(f"Old Specificity: {specificity}")
+         print(f"New Specificity: {new_specificity}")
+         # db_cursor.execute("UPDATE mappings SET specificity = %s WHERE input_word = %s", (new_specificity, input_word))
+
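For context: the script assumes db.db_utils.get_connection returns a DB-API connection, and the %s placeholders in the commented-out UPDATE suggest a psycopg2-style driver. A minimal sketch of such a helper, under that assumption (db/db_utils.py itself is not part of this commit and may differ):

```python
# Hypothetical sketch of db/db_utils.py, assuming psycopg2; the real helper
# is not shown in this commit and may read its settings differently.
import os
import psycopg2

def get_connection():
    # Assumes the connection string is provided via a DATABASE_URL env var.
    return psycopg2.connect(os.environ["DATABASE_URL"])
```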
specificity_classifier.py CHANGED
@@ -5,10 +5,6 @@ from sklearn.model_selection import train_test_split
  from db.db_utils import get_connection
  import logging
 
- # Set up logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
  # Set the device to CPU
  device = torch.device("cpu")
 
@@ -20,7 +16,7 @@ label_mapping = {}
 
  # Check if the model exists
  if os.path.exists(save_directory):
-     logger.info(f"Loading the existing model from {save_directory}...")
+     logging.info(f"Loading the existing model from {save_directory}...")
      tokenizer = BertTokenizer.from_pretrained(save_directory)
      model = BertForSequenceClassification.from_pretrained(save_directory)
      # Load the label mapping
@@ -28,34 +24,34 @@ if os.path.exists(save_directory):
      with open(os.path.join(save_directory, 'label_mapping.txt'), 'r') as f:
          label_mapping = eval(f.read())
  else:
-     logger.info("Loading BERT tokenizer and model...")
+     logging.info("Loading BERT tokenizer and model...")
      tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
      model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
 
      # Get data from database
-     logger.info("Connecting to the database...")
+     logging.info("Connecting to the database...")
      db_conn = get_connection()
      db_cursor = db_conn.cursor()
 
-     logger.info("Fetching data from the database...")
+     logging.info("Fetching data from the database...")
      db_cursor.execute("SELECT input_word, specificity FROM mappings WHERE specificity IS NOT NULL and reviewed = true and is_food = true")
      results = db_cursor.fetchall()
      training_data = [(row[0], row[1]) for row in results]
 
      texts, labels = zip(*training_data)
-     logger.info(f"Fetched {len(texts)} records from the database.")
+     logging.info(f"Fetched {len(texts)} records from the database.")
 
      # Convert labels to integers
-     logger.info("Converting labels to integers...")
+     logging.info("Converting labels to integers...")
      label_mapping = {label: idx for idx, label in enumerate(set(labels))}
      labels = [label_mapping[label] for label in labels]
 
      # Split data into training and testing sets
-     logger.info("Splitting data into training and testing sets...")
+     logging.info("Splitting data into training and testing sets...")
      X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)
 
      # Tokenize the data
-     logger.info("Tokenizing the data...")
+     logging.info("Tokenizing the data...")
      train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
      test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)
 
@@ -72,7 +68,7 @@ else:
          def __len__(self):
              return len(self.labels)
 
-     logger.info("Creating datasets...")
+     logging.info("Creating datasets...")
      train_dataset = SpecificityDataset(train_encodings, y_train)
      test_dataset = SpecificityDataset(test_encodings, y_test)
 
@@ -88,7 +84,7 @@ else:
          evaluation_strategy="epoch"
      )
 
-     logger.info("Initializing the Trainer...")
+     logging.info("Initializing the Trainer...")
      trainer = Trainer(
          model=model, # the instantiated 🤗 Transformers model to be trained
          args=training_args, # training arguments, defined above
@@ -96,15 +92,15 @@ else:
          eval_dataset=test_dataset # evaluation dataset
      )
 
-     logger.info("Starting training...")
+     logging.info("Starting training...")
      trainer.train()
 
-     logger.info("Evaluating the model...")
+     logging.info("Evaluating the model...")
      eval_result = trainer.evaluate()
-     logger.info(f"Evaluation results: {eval_result}")
+     logging.info(f"Evaluation results: {eval_result}")
 
      # Save the model and tokenizer
-     logger.info(f"Saving the model to {save_directory}...")
+     logging.info(f"Saving the model to {save_directory}...")
      model.save_pretrained(save_directory)
      tokenizer.save_pretrained(save_directory)
      # Save the label mapping
@@ -114,7 +110,7 @@ else:
  model.to(device)
 
  def classify_text_to_specificity(text):
-     logger.info(f"Classifying text: {text}")
+     logging.info(f"Classifying text: {text}")
      inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
      outputs = model(**inputs)
      logits = outputs.logits
@@ -126,5 +122,5 @@ def classify_text_to_specificity(text):
  # Example usage
  # for example_text in ["produce items", "bananas", "milk", "mixed items", "random assortment", "heterogeneous mixture"]:
  #     predicted_specificity = classify_text_to_specificity(example_text)
- #     logger.info(f"The predicted specificity for '{example_text}' is '{predicted_specificity}'")
- #     logger.info("----------")
+ #     logging.info(f"The predicted specificity for '{example_text}' is '{predicted_specificity}'")
+ #     logging.info("----------")
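A note on the logging change: the commit replaces the classifier's module-level logger (previously configured at INFO inside the module) with bare logging.* calls on the root logger, leaving configuration to whichever script imports the module. Since specificity_audit.py calls basicConfig(level=logging.WARNING), the info-level messages in both files are suppressed by default. A minimal sketch of the pattern:

```python
# Minimal sketch of the root-logger pattern this commit adopts: the entry
# point configures logging once; imported modules just call logging.info().
import logging

logging.basicConfig(level=logging.WARNING)   # set once, in the entry-point script
logging.info("suppressed at WARNING level")  # what happens to the info messages here
logging.warning("still shown")               # warnings and above get through
```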