import pandas as pd
from ast import literal_eval
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch
import os
from sklearn.model_selection import train_test_split
import random
import re

def clean_text(text):
  # Helper to normalize a piece of text: lowercase it and collapse repeated whitespace.
  # Returns '' for non-string values (e.g. NaN) to avoid errors.
  if not isinstance(text, str):
    return ''
  text = text.lower()
  text = ' '.join(text.split())
  return text.strip()

def setup_tag_categories():
    tag_categories = {
        'cuisine': [
            'italian', 'chinese', 'mexican', 'indian', 'french', 'greek', 'thai', 
            'japanese', 'american', 'european', 'asian', 'mediterranean', 'spanish', 
            'german', 'korean', 'vietnamese', 'turkish', 'moroccan', 'lebanese'
        ],
        'course': [
            'main-dish', 'side-dishes', 'appetizers', 'desserts', 'breakfast', 
            'lunch', 'dinner', 'snacks', 'beverages', 'salads', 'soups'
        ],
        'main_ingredient': [
            'chicken', 'beef', 'pork', 'fish', 'seafood', 'vegetables', 'fruit', 
            'pasta', 'rice', 'cheese', 'chocolate', 'potato', 'lamb', 'turkey',
            'beans', 'nuts', 'eggs', 'tofu'
        ],
        'dietary': [
            'vegetarian', 'vegan', 'gluten-free', 'low-carb', 'healthy', 'low-fat', 
            'diabetic', 'dairy-free', 'keto', 'paleo', 'whole30'
        ],
        'cooking_method': [
            'oven', 'stove-top', 'no-cook', 'microwave', 'slow-cooker', 'grilling', 
            'baking', 'roasting', 'frying', 'steaming', 'braising'
        ],
        'difficulty': ['easy', 'beginner-cook', 'advanced', 'intermediate', 'quick'],
        'time': [
            '15-minutes-or-less', '30-minutes-or-less', '60-minutes-or-less', 
            '4-hours-or-less', 'weeknight'
        ],
        'occasion': [
            'holiday-event', 'christmas', 'thanksgiving', 'valentines-day', 
            'summer', 'winter', 'spring', 'fall', 'party', 'picnic'
        ]
    }
    return tag_categories

def setup_ingredient_groups():
    ingredient_groups = {
        'proteins': [
            'chicken', 'beef', 'pork', 'fish', 'salmon', 'tuna', 'shrimp', 'turkey',
            'lamb', 'bacon', 'ham', 'sausage', 'eggs', 'tofu', 'beans', 'lentils'
        ],
        'vegetables': [
            'onion', 'garlic', 'tomato', 'carrot', 'celery', 'pepper', 'mushroom',
            'spinach', 'broccoli', 'zucchini', 'potato', 'sweet potato'
        ],
        'grains_starches': [
            'rice', 'pasta', 'bread', 'flour', 'oats', 'quinoa', 'barley', 'noodles'
        ],
        'dairy': [
            'milk', 'butter', 'cheese', 'cream', 'yogurt', 'sour cream', 'cream cheese'
        ]
    }
    return ingredient_groups

def categorize_recipe_tags(recipe_tags, tag_categories):
    categorized_tags = {}
    
    # Initialize empty lists for each category
    for category_name in tag_categories.keys():
        categorized_tags[category_name] = []
    
    # Check each tag
    for tag in recipe_tags:
        tag_lower = tag.lower()
        
        # Check each category
        for category_name in tag_categories.keys():
            category_keywords = tag_categories[category_name]
            
            # Check if any keyword matches this tag
            for keyword in category_keywords:
                if keyword in tag_lower:
                    categorized_tags[category_name].append(tag)
                    break
    
    return categorized_tags
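
# Hypothetical example (not from the dataset) of the keyword matching above:
#   categorize_recipe_tags(['italian', 'main-dish', '30-minutes-or-less'], setup_tag_categories())
#   -> {'cuisine': ['italian'], 'course': ['main-dish'], 'main_ingredient': [], 'dietary': [],
#       'cooking_method': [], 'difficulty': [], 'time': ['30-minutes-or-less'], 'occasion': []}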

def extract_main_ingredients(ingredients_list, ingredient_groups):
    if not ingredients_list or not isinstance(ingredients_list, list):
        return []
    
    # Clean each ingredient
    cleaned_ingredients = []
    
    for ingredient in ingredients_list:
        # Convert to string
        ingredient_string = str(ingredient) if ingredient is not None else ''
        if not ingredient_string or ingredient_string == 'nan':
            continue
        
        # Make lowercase
        cleaned_ingredient = ingredient_string.lower()
        
        # Remove common descriptor words
        words_to_remove = ['fresh', 'dried', 'chopped', 'minced', 'sliced', 'diced', 'ground', 'large', 'small', 'medium']
        for word in words_to_remove:
            cleaned_ingredient = cleaned_ingredient.replace(word, '')
        
        # Remove numbers
        cleaned_ingredient = re.sub(r'\d+', '', cleaned_ingredient)
        
        # Remove measurement words
        measurement_words = ['cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons', 'pound', 'pounds', 'ounce', 'ounces']
        for measurement in measurement_words:
            cleaned_ingredient = cleaned_ingredient.replace(measurement, '')
        
        # Clean up extra spaces
        cleaned_ingredient = re.sub(r'\s+', ' ', cleaned_ingredient).strip()
        
        # Only keep if it's long enough
        if cleaned_ingredient and len(cleaned_ingredient) > 2:
            cleaned_ingredients.append(cleaned_ingredient)

    
    # Put ingredients in order of importance
    ordered_ingredients = []
    
    # First, add proteins (most important)
    for ingredient in cleaned_ingredients:
        for protein in ingredient_groups['proteins']:
            if protein in ingredient:
                ordered_ingredients.append(ingredient)
                break
        
    
    # Then add vegetables, grains, and dairy
    other_groups = ['vegetables', 'grains_starches', 'dairy']
    for group_name in other_groups:
        for ingredient in cleaned_ingredients:
            if ingredient not in ordered_ingredients:
                for group_item in ingredient_groups[group_name]:
                    if group_item in ingredient:
                        ordered_ingredients.append(ingredient)
                        break
    
    # Finally, add any remaining ingredients
    for ingredient in cleaned_ingredients:
        if ingredient not in ordered_ingredients:
            ordered_ingredients.append(ingredient)
    
    return ordered_ingredients

def create_structured_recipe_text(recipe, tag_categories, ingredient_groups):
    # Get recipe tags and categorize them
    recipe_tags = recipe['tags'] if isinstance(recipe['tags'], list) else []
    categorized_tags = categorize_recipe_tags(recipe_tags, tag_categories)
    
    # Choose tags in priority order
    priority_categories = ['main_ingredient', 'cuisine', 'course', 'dietary', 'cooking_method']
    selected_tags = []
    
    for category in priority_categories:
        if category in categorized_tags:
            # Take up to 2 tags from each category
            category_tags = categorized_tags[category][:2]
            for tag in category_tags:
                selected_tags.append(tag)
    
    # Add some additional important tags
    important_keywords = ['easy', 'quick', 'healthy', 'spicy', 'sweet']
    remaining_tags = []
    
    for tag in recipe_tags:
        if tag not in selected_tags:  
            for keyword in important_keywords:
                if keyword in tag.lower():
                    remaining_tags.append(tag)
                    break
            
    
    # Add up to 3 remaining tags
    for i in range(min(3, len(remaining_tags))):
        selected_tags.append(remaining_tags[i])
    
    # Process ingredients
    recipe_ingredients = recipe['ingredients'] if isinstance(recipe['ingredients'], list) else []
    main_ingredients = extract_main_ingredients(recipe_ingredients, ingredient_groups)
    
    # Create the final structured text
    # Join first 8 ingredients
    ingredients_text = ', '.join(main_ingredients[:8])
    
    # Join first 10 tags
    tags_text = ', '.join(selected_tags[:10])
    
    # Get recipe name
    recipe_name = str(recipe['name']).replace('  ', ' ').strip()
    
    # Create final structured text
    structured_text = f"Recipe: {recipe_name}. Ingredients: {ingredients_text}. Style: {tags_text}"
    
    return structured_text
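
# Hypothetical example of the structured text produced above (not from the dataset):
#   "Recipe: quick garlic chicken. Ingredients: chicken breasts, garlic, olive oil.
#    Style: chicken, italian, main-dish, easy"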

def create_pair_data(recipes_df: pd.DataFrame, interactions_df: pd.DataFrame, num_pairs: int = 15000):
  # This function creates the (anchor, positive, negative) training triplets for the model.
  # We first analyzed the data to create categories for the tags and ingredients; under each
  # of these we keep a keyword list for cuisine, dietary, main ingredient, etc.
  # While training, we found the model was not learning the tags and ingredients well, so we
  # created a structured text representation that is easier to learn from.
  # The prompt used was: "Analyze the two csv files attached and create a structured text
  # representation to be used for training a BERT model to understand tags and ingredients,
  # such that if a user later searches for a quick recipe, it can be used to find a recipe
  # that is quick to make."

  # Set up the structured text categories and groups
  tag_categories = setup_tag_categories()
  ingredient_groups = setup_ingredient_groups()
  
  # Make a list to store all our pairs
  pair_data_list = []
  
  # create the pairs
  for pair_number in range(num_pairs):
    
    #Pick a random recipe from our dataframe
    random_recipe_data = recipes_df.iloc[random.randint(0, len(recipes_df) - 1)]
    
    # Get the tags from this recipe
    recipe_tags_list = random_recipe_data['tags']
    
    # Select up to 5 random tags (fewer if the recipe has fewer tags)
    num_tags_to_select = min(5, len(recipe_tags_list))

    # Pick a random sample of tags; they are joined into the anchor query string below
    selected_tags_list = random.sample(recipe_tags_list, num_tags_to_select)
    
    # Create the positive recipe text using structured format
    positive_recipe_text = create_structured_recipe_text(random_recipe_data, tag_categories, ingredient_groups)
    
    # Find a negative recipe that shares at most 2 tags with the anchor query
    anchor = ' '.join(selected_tags_list)
    anchor_tags_set = set(anchor.split())
    
    negative_recipe_text = None
    attempts_counter = 0
    max_attempts_allowed = 100
    
    # Keep trying until we find a good negative recipe (Added a max attempts to avoid infinite loop)
    while negative_recipe_text is None and attempts_counter < max_attempts_allowed:
      random_negative_recipe = recipes_df.iloc[random.randint(0, len(recipes_df) - 1)]
      
      # Get tags from this negative recipe
      negative_recipe_tags = random_negative_recipe['tags']
      negative_recipe_tags_set = set(negative_recipe_tags)
      
      # Count how many tags overlap
      overlap_count = 0
      for anchor_tag in anchor_tags_set:
        if anchor_tag in negative_recipe_tags_set:
          overlap_count = overlap_count + 1
      
      attempts_counter = attempts_counter + 1
      
      # If overlap is small enough (2 or less), we can use this as negative
      if overlap_count <= 2:
        # Create the negative recipe text using structured format
        negative_recipe_text = create_structured_recipe_text(random_negative_recipe, tag_categories, ingredient_groups)
        
        print(f"Found all negative recipes. Overlap: {overlap_count}")
        break

    # If we found a negative recipe, add this pair to our list
    if negative_recipe_text is not None:
      # Create a tuple with the three parts
      pair_data_list.append((anchor, positive_recipe_text, negative_recipe_text))
      print(f"Created pair {pair_number + 1}: Anchor='{anchor}', Overlap={overlap_count}")
    else:
      print(f"Could not find negative recipe for anchor '{anchor}' after {max_attempts_allowed} attempts")

    # Show progress every 1000 pairs
    if (pair_number + 1) % 1000 == 0:
      print(f"Progress: Created {pair_number + 1}/{num_pairs} pairs")

  # Convert our list to a pandas DataFrame and return it
  result_dataframe = pd.DataFrame(pair_data_list, columns=['anchor', 'positive', 'negative'])
  
  print(f"Final result: Created {len(result_dataframe)} pairs total")
  return result_dataframe

class pos_neg_pair_dataset(Dataset):
  # Dataset class that tokenizes the (anchor, positive, negative) triplets for BERT
  # and returns the input ids and attention masks for each element
  def __init__(self, pair_data, tokenizer, max_length=128):
    self.pair_data = pair_data
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.pair_data)

  def __getitem__(self, idx):

    anchor = self.tokenizer(
      self.pair_data.iloc[idx]['anchor'],
      return_tensors='pt',
      truncation=True,
      max_length=self.max_length,
      padding='max_length')
    positive = self.tokenizer(
        self.pair_data.iloc[idx]['positive'],
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        padding='max_length')
    negative = self.tokenizer(
        self.pair_data.iloc[idx]['negative'],
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        padding='max_length')

    return {
      'anchor_input_ids': anchor['input_ids'].squeeze(),
      'anchor_attention_mask': anchor['attention_mask'].squeeze(),
      'positive_input_ids': positive['input_ids'].squeeze(),
      'positive_attention_mask': positive['attention_mask'].squeeze(),
      'negative_input_ids': negative['input_ids'].squeeze(),
      'negative_attention_mask': negative['attention_mask'].squeeze()
    }

def evaluate_model(model, val_loader):
    #evaluation method, same as training but with no gradient updates
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    total_loss = 0
    criterion = nn.TripletMarginLoss(margin=1.0)
    with torch.no_grad():
        for batch in val_loader:
            anchor_input_ids = batch['anchor_input_ids'].to(device)
            anchor_attention_mask = batch['anchor_attention_mask'].to(device)
            positive_input_ids = batch['positive_input_ids'].to(device)
            positive_attention_mask = batch['positive_attention_mask'].to(device)
            negative_input_ids = batch['negative_input_ids'].to(device)
            negative_attention_mask = batch['negative_attention_mask'].to(device)

            # Forward pass - get raw BERT embeddings
            anchor_outputs = model(anchor_input_ids, anchor_attention_mask)
            positive_outputs = model(positive_input_ids, positive_attention_mask)
            negative_outputs = model(negative_input_ids, negative_attention_mask)

            # Extract [CLS] token embeddings
            anchor_emb = anchor_outputs.last_hidden_state[:, 0, :]
            positive_emb = positive_outputs.last_hidden_state[:, 0, :]
            negative_emb = negative_outputs.last_hidden_state[:, 0, :]

            # Calculate loss
            loss = criterion(anchor_emb, positive_emb, negative_emb)

            total_loss += loss.item()

    print(f"Average loss on validation set: {total_loss/len(val_loader):.4f}")

def train_model(train_loader, num_epochs=3):
    # initialize the model, criterion, and optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    criterion = nn.TripletMarginLoss(margin=1.0)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            #load the ids and masks to device 
            anchor_input_ids = batch['anchor_input_ids'].to(device)
            anchor_attention_mask = batch['anchor_attention_mask'].to(device)
            positive_input_ids = batch['positive_input_ids'].to(device)
            positive_attention_mask = batch['positive_attention_mask'].to(device)
            negative_input_ids = batch['negative_input_ids'].to(device)
            negative_attention_mask = batch['negative_attention_mask'].to(device)

            # Forward pass through BERT for each element of the triplet
            anchor_outputs = model(anchor_input_ids, anchor_attention_mask)
            positive_outputs = model(positive_input_ids, positive_attention_mask)
            negative_outputs = model(negative_input_ids, negative_attention_mask)

            # Extract the [CLS] token embeddings
            anchor_emb = anchor_outputs.last_hidden_state[:, 0, :]
            positive_emb = positive_outputs.last_hidden_state[:, 0, :]
            negative_emb = negative_outputs.last_hidden_state[:, 0, :]

            # Calculate loss
            loss = criterion(anchor_emb, positive_emb, negative_emb)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Average loss per batch: total loss / number of batches
        print(f'Epoch {epoch+1}, Average Loss: {total_loss/len(train_loader):.4f}')

    return model
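
# Illustrative sketch (not called in the pipeline below): one way the trained encoder
# could be used at search time, assuming the same [CLS]-embedding convention as in
# training. The query and recipe strings in the usage comment are hypothetical.
def embed_texts(model, tokenizer, texts, max_length=128):
    # Tokenize a batch of strings and return their [CLS] token embeddings
    device = next(model.parameters()).device
    encoded = tokenizer(texts, return_tensors='pt', truncation=True,
                        max_length=max_length, padding=True)
    encoded = {key: value.to(device) for key, value in encoded.items()}
    model.eval()
    with torch.no_grad():
        outputs = model(**encoded)
    return outputs.last_hidden_state[:, 0, :]

# Example usage (hypothetical strings):
#   query_emb = embed_texts(model, tokenizer, ['30-minutes-or-less easy chicken'])
#   recipe_emb = embed_texts(model, tokenizer,
#       ['Recipe: quick garlic chicken. Ingredients: chicken, garlic. Style: chicken, easy'])
#   score = torch.nn.functional.cosine_similarity(query_emb, recipe_emb)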

if __name__ == '__main__':

  if not os.path.exists('pair_data.parquet'):
    # Load and prepare the data
    print("Loading recipe data")
    recipes_df = pd.read_csv('RAW_recipes.csv')

    # Clean the data
    recipes_df['name'] = recipes_df['name'].apply(clean_text)
    recipes_df['tags'] = recipes_df['tags'].apply(literal_eval)
    recipes_df['ingredients'] = recipes_df['ingredients'].apply(literal_eval)

    # Filter recipes with meaningful data (no empty tags)
    recipes_df = recipes_df[recipes_df['tags'].str.len() > 0]

    # Load interactions
    print("Loading interaction data")
    interactions_df = pd.read_csv('RAW_interactions.csv')
    interactions_df = interactions_df.dropna(subset=['rating'])
    interactions_df['rating'] = pd.to_numeric(interactions_df['rating'], errors='coerce')
    interactions_df = interactions_df.dropna(subset=['rating'])

    # Create training pairs
    pair_data = create_pair_data(recipes_df, interactions_df, num_pairs=15000)

    # Save the pair data
    pair_data.to_parquet('pair_data.parquet', index=False)
    print('Data saved to pair_data.parquet')

  else:
    pair_data = pd.read_parquet('pair_data.parquet')
    print('Data loaded from pair_data.parquet')

  # Split data to training and validation (80% training, 20% validation)
  train_data, val_data = train_test_split(pair_data, test_size=0.2, random_state=42)

  # Initialize the BERT tokenizer
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

  # Create the datasets with reduced max_length for better performance
  train_dataset = pos_neg_pair_dataset(train_data, tokenizer, max_length=128)
  val_dataset = pos_neg_pair_dataset(val_data, tokenizer, max_length=128)

  # Create dataloaders with smaller batch size for stability
  train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
  val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

  # Train model
  print("Starting training...")
  model = train_model(train_loader, num_epochs=3)

  #evaluate the model
  print("Evaluating model...")
  evaluate_model(model, val_loader)

  # Save model
  torch.save(model.state_dict(), 'tag_based_bert_model.pth')
  print("Model saved to tag_based_bert_model.pth")
  print("Training Complete")