import os
import pickle
import re

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# Use Apple's MPS backend if available, otherwise fall back to CPU
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print("Device:", device)

# Load model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)

# Load the dictionary of candidate descriptions from a CSV file
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
dictionary = df['description'].tolist()


# Cosine similarity between two embedding vectors
def cosine_similarity(embedding1, embedding2):
    return torch.nn.functional.cosine_similarity(embedding1, embedding2, dim=0).item()


# Euclidean distance, negated so that "larger is more similar" holds for both measures
def euclidean_distance(embedding1, embedding2):
    return -torch.dist(embedding1, embedding2).item()


# Generate an embedding for a sentence by mean-pooling the model's
# output logits across the sequence dimension
def generate_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.logits.mean(dim=1).squeeze().cpu()
    return embedding


# Find the best match for the input word in the dictionary
def match_word(input_word, dictionary, similarity_measure):
    # Remove anything in parentheses, e.g. "(12 oz)"
    input_word_clean = re.sub(r'\(.*?\)', '', input_word).strip()

    # For hyphenated entries, keep only the right-hand term when it already
    # contains the left-hand term. Split on the first hyphen only, so inputs
    # with multiple hyphens don't raise an unpacking error.
    if '-' in input_word_clean:
        left_term, right_term = map(str.strip, input_word_clean.split('-', 1))
        if left_term.lower() in right_term.lower():
            input_word_clean = right_term

    # Pre-filter the dictionary to entries sharing at least one word with the input
    words = re.findall(r'\w+', input_word_clean.lower())
    filtered_dictionary = [desc for desc in dictionary
                           if any(word in desc.lower() for word in words)]

    input_embedding = generate_embedding(input_word_clean)

    similarities = []
    for entry in filtered_dictionary:
        entry_embedding = dictionary_embeddings[entry]  # Use pre-computed embedding
        similarity_score = similarity_measure(input_embedding, entry_embedding)
        similarities.append((entry, similarity_score))

    if not similarities:
        return None
    best_match = max(similarities, key=lambda x: x[1])
    return best_match if best_match[1] > 0.7 else None


# Load pre-computed dictionary embeddings from the pickle file if it exists;
# otherwise generate them once and cache them for later runs
if os.path.exists('dictionary_embeddings.pkl'):
    with open('dictionary_embeddings.pkl', 'rb') as f:
        dictionary_embeddings = pickle.load(f)
else:
    dictionary_embeddings = {}
    for entry in tqdm(dictionary, desc="Generating Embeddings"):
        dictionary_embeddings[entry] = generate_embedding(entry)
    with open('dictionary_embeddings.pkl', 'wb') as f:
        pickle.dump(dictionary_embeddings, f)

# Load the raw input descriptions to be matched
input_file_path = 'raw/food-forward-2023-raw-data - food-forward-2023-raw-data.csv'
df = pd.read_csv(input_file_path)
input_words = df['description'].tolist()

similarity_measure = cosine_similarity

results = []
for input_word in tqdm(input_words, desc="Matching Words"):
    try:
        matched_entry = match_word(input_word, dictionary, similarity_measure)
        if matched_entry:
            results.append({
                'input_word': input_word,
                'matched_word': matched_entry[0],
                'score': matched_entry[1]
            })
        else:
            results.append({
                'input_word': input_word,
                'matched_word': None,
                'score': None
            })
    except Exception as e:
        print("Error:", e)
        results.append({
            'input_word': input_word,
            'matched_word': None,
            'score': None
        })

# Write the matches (and misses) to a CSV file
df_results = pd.DataFrame(results)
df_results.to_csv('results/results.csv', index=False)
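
# Optional sanity check: a minimal sketch of how a single lookup behaves once
# the embeddings cache exists. The example string below is made up for
# illustration and is not taken from the input data; uncomment to try it.
# example = 'Apples - Gala Apples (12 oz)'
# print(match_word(example, dictionary, cosine_similarity))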