Spaces:
Paused
Paused
Commit
•
b0756e7
1
Parent(s):
1092d9e
added in a specificity audit
Browse files
- audits/1720199868.csv +1 -0
- specificity_audit.py +28 -0
- specificity_classifier.py +17 -21
audits/1720199868.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
input_word,original_dictionary_word,new_dictionary_word
|
specificity_audit.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
import logging
|
4 |
+
from db.db_utils import get_connection
|
5 |
+
from specificity_classifier import classify_text_to_specificity
|
6 |
+
|
7 |
+
logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
|
8 |
+
|
9 |
+
logging.info("Connecting to the database...")
|
10 |
+
db_conn = get_connection()
|
11 |
+
db_cursor = db_conn.cursor()
|
12 |
+
|
13 |
+
logging.info("Fetching data from the database...")
|
14 |
+
db_cursor.execute("SELECT input_word, cleaned_word, specificity FROM mappings WHERE specificity IS NOT NULL and is_food = true")
|
15 |
+
results = db_cursor.fetchall()
|
16 |
+
|
17 |
+
for row in results:
|
18 |
+
input_word = row[0]
|
19 |
+
cleaned_word = row[1]
|
20 |
+
specificity = row[2]
|
21 |
+
|
22 |
+
new_specificity = classify_text_to_specificity(cleaned_word)
|
23 |
+
if new_specificity != specificity:
|
24 |
+
print(f"{input_word}")
|
25 |
+
print(f"Old Specificity: {specificity}")
|
26 |
+
print(f"New Specificity: {new_specificity}")
|
27 |
+
# db_cursor.execute("UPDATE mappings SET specificity = %s WHERE input_word = %s", (new_specificity, input_word))
|
28 |
+
|
specificity_classifier.py
CHANGED
@@ -5,10 +5,6 @@ from sklearn.model_selection import train_test_split
|
|
5 |
from db.db_utils import get_connection
|
6 |
import logging
|
7 |
|
8 |
-
# Set up logging
|
9 |
-
logging.basicConfig(level=logging.INFO)
|
10 |
-
logger = logging.getLogger(__name__)
|
11 |
-
|
12 |
# Set the device to CPU
|
13 |
device = torch.device("cpu")
|
14 |
|
@@ -20,7 +16,7 @@ label_mapping = {}
|
|
20 |
|
21 |
# Check if the model exists
|
22 |
if os.path.exists(save_directory):
|
23 |
-
|
24 |
tokenizer = BertTokenizer.from_pretrained(save_directory)
|
25 |
model = BertForSequenceClassification.from_pretrained(save_directory)
|
26 |
# Load the label mapping
|
@@ -28,34 +24,34 @@ if os.path.exists(save_directory):
|
|
28 |
with open(os.path.join(save_directory, 'label_mapping.txt'), 'r') as f:
|
29 |
label_mapping = eval(f.read())
|
30 |
else:
|
31 |
-
|
32 |
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
33 |
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
|
34 |
|
35 |
# Get data from database
|
36 |
-
|
37 |
db_conn = get_connection()
|
38 |
db_cursor = db_conn.cursor()
|
39 |
|
40 |
-
|
41 |
db_cursor.execute("SELECT input_word, specificity FROM mappings WHERE specificity IS NOT NULL and reviewed = true and is_food = true")
|
42 |
results = db_cursor.fetchall()
|
43 |
training_data = [(row[0], row[1]) for row in results]
|
44 |
|
45 |
texts, labels = zip(*training_data)
|
46 |
-
|
47 |
|
48 |
# Convert labels to integers
|
49 |
-
|
50 |
label_mapping = {label: idx for idx, label in enumerate(set(labels))}
|
51 |
labels = [label_mapping[label] for label in labels]
|
52 |
|
53 |
# Split data into training and testing sets
|
54 |
-
|
55 |
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)
|
56 |
|
57 |
# Tokenize the data
|
58 |
-
|
59 |
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
|
60 |
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)
|
61 |
|
@@ -72,7 +68,7 @@ else:
|
|
72 |
def __len__(self):
|
73 |
return len(self.labels)
|
74 |
|
75 |
-
|
76 |
train_dataset = SpecificityDataset(train_encodings, y_train)
|
77 |
test_dataset = SpecificityDataset(test_encodings, y_test)
|
78 |
|
@@ -88,7 +84,7 @@ else:
|
|
88 |
evaluation_strategy="epoch"
|
89 |
)
|
90 |
|
91 |
-
|
92 |
trainer = Trainer(
|
93 |
model=model, # the instantiated 🤗 Transformers model to be trained
|
94 |
args=training_args, # training arguments, defined above
|
@@ -96,15 +92,15 @@ else:
|
|
96 |
eval_dataset=test_dataset # evaluation dataset
|
97 |
)
|
98 |
|
99 |
-
|
100 |
trainer.train()
|
101 |
|
102 |
-
|
103 |
eval_result = trainer.evaluate()
|
104 |
-
|
105 |
|
106 |
# Save the model and tokenizer
|
107 |
-
|
108 |
model.save_pretrained(save_directory)
|
109 |
tokenizer.save_pretrained(save_directory)
|
110 |
# Save the label mapping
|
@@ -114,7 +110,7 @@ else:
|
|
114 |
model.to(device)
|
115 |
|
116 |
def classify_text_to_specificity(text):
|
117 |
-
|
118 |
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
|
119 |
outputs = model(**inputs)
|
120 |
logits = outputs.logits
|
@@ -126,5 +122,5 @@ def classify_text_to_specificity(text):
|
|
126 |
# Example usage
|
127 |
# for example_text in ["produce items", "bananas", "milk", "mixed items", "random assortment", "heterogeneous mixture"]:
|
128 |
# predicted_specificity = classify_text_to_specificity(example_text)
|
129 |
-
#
|
130 |
-
#
|
|
|
5 |
from db.db_utils import get_connection
|
6 |
import logging
|
7 |
|
|
|
|
|
|
|
|
|
8 |
# Set the device to CPU
|
9 |
device = torch.device("cpu")
|
10 |
|
|
|
16 |
|
17 |
# Check if the model exists
|
18 |
if os.path.exists(save_directory):
|
19 |
+
logging.info(f"Loading the existing model from {save_directory}...")
|
20 |
tokenizer = BertTokenizer.from_pretrained(save_directory)
|
21 |
model = BertForSequenceClassification.from_pretrained(save_directory)
|
22 |
# Load the label mapping
|
|
|
24 |
with open(os.path.join(save_directory, 'label_mapping.txt'), 'r') as f:
|
25 |
label_mapping = eval(f.read())
|
26 |
else:
|
27 |
+
logging.info("Loading BERT tokenizer and model...")
|
28 |
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
29 |
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
|
30 |
|
31 |
# Get data from database
|
32 |
+
logging.info("Connecting to the database...")
|
33 |
db_conn = get_connection()
|
34 |
db_cursor = db_conn.cursor()
|
35 |
|
36 |
+
logging.info("Fetching data from the database...")
|
37 |
db_cursor.execute("SELECT input_word, specificity FROM mappings WHERE specificity IS NOT NULL and reviewed = true and is_food = true")
|
38 |
results = db_cursor.fetchall()
|
39 |
training_data = [(row[0], row[1]) for row in results]
|
40 |
|
41 |
texts, labels = zip(*training_data)
|
42 |
+
logging.info(f"Fetched {len(texts)} records from the database.")
|
43 |
|
44 |
# Convert labels to integers
|
45 |
+
logging.info("Converting labels to integers...")
|
46 |
label_mapping = {label: idx for idx, label in enumerate(set(labels))}
|
47 |
labels = [label_mapping[label] for label in labels]
|
48 |
|
49 |
# Split data into training and testing sets
|
50 |
+
logging.info("Splitting data into training and testing sets...")
|
51 |
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)
|
52 |
|
53 |
# Tokenize the data
|
54 |
+
logging.info("Tokenizing the data...")
|
55 |
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
|
56 |
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)
|
57 |
|
|
|
68 |
def __len__(self):
|
69 |
return len(self.labels)
|
70 |
|
71 |
+
logging.info("Creating datasets...")
|
72 |
train_dataset = SpecificityDataset(train_encodings, y_train)
|
73 |
test_dataset = SpecificityDataset(test_encodings, y_test)
|
74 |
|
|
|
84 |
evaluation_strategy="epoch"
|
85 |
)
|
86 |
|
87 |
+
logging.info("Initializing the Trainer...")
|
88 |
trainer = Trainer(
|
89 |
model=model, # the instantiated 🤗 Transformers model to be trained
|
90 |
args=training_args, # training arguments, defined above
|
|
|
92 |
eval_dataset=test_dataset # evaluation dataset
|
93 |
)
|
94 |
|
95 |
+
logging.info("Starting training...")
|
96 |
trainer.train()
|
97 |
|
98 |
+
logging.info("Evaluating the model...")
|
99 |
eval_result = trainer.evaluate()
|
100 |
+
logging.info(f"Evaluation results: {eval_result}")
|
101 |
|
102 |
# Save the model and tokenizer
|
103 |
+
logging.info(f"Saving the model to {save_directory}...")
|
104 |
model.save_pretrained(save_directory)
|
105 |
tokenizer.save_pretrained(save_directory)
|
106 |
# Save the label mapping
|
|
|
110 |
model.to(device)
|
111 |
|
112 |
def classify_text_to_specificity(text):
|
113 |
+
logging.info(f"Classifying text: {text}")
|
114 |
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
|
115 |
outputs = model(**inputs)
|
116 |
logits = outputs.logits
|
|
|
122 |
# Example usage
|
123 |
# for example_text in ["produce items", "bananas", "milk", "mixed items", "random assortment", "heterogeneous mixture"]:
|
124 |
# predicted_specificity = classify_text_to_specificity(example_text)
|
125 |
+
# logging.info(f"The predicted specificity for '{example_text}' is '{predicted_specificity}'")
|
126 |
+
# logging.info("----------")
|