"""Match free-text produce names against a CSV dictionary of descriptions.

Each input name and each candidate dictionary description is run through a
causal language model; the pooled model outputs are compared with cosine
similarity, and the best-scoring description above a threshold is reported.
"""
import re
from functools import lru_cache
from typing import List, Optional, Tuple

import pandas as pd
import torch
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Use the MPS backend when it is available, otherwise fall back to CPU.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print("Device:", device)

# Load model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)

# Load dictionary from CSV file
csv_file_path = '/Users/bw/Webstuff/btest/test/dictionary.csv'
df = pd.read_csv(csv_file_path)
# Empty CSV cells come back as NaN (a float); drop them and coerce the rest to
# str so that the `.lower()` calls in match_word cannot fail on a non-string.
dictionary = df['description'].dropna().astype(str).tolist()

# Instruction text prepended to every string before it is fed to the model.
prompt = "The text sometimes comes hyphenated, where the part before the hyphen is the general category, and the item after the hyphen is the more specific item. Please generate an embedding for the following text: "

# Upper bound on memoised embeddings. Each cached vector is as long as the
# last dimension of the model's logits, so the cache is bounded, not unlimited.
EMBEDDING_CACHE_SIZE = 2048

# Amount added to the similarity when a single-word input meets a "raw" entry.
RAW_BOOST = 0.1  # Adjust this value as needed


@lru_cache(maxsize=EMBEDDING_CACHE_SIZE)
def generate_embedding(sentence: str) -> torch.Tensor:
    """Return a pooled model-output vector for ``prompt + sentence``.

    Results are memoised per sentence: the same dictionary entry is a
    candidate for many input words, and each call costs a full forward pass.
    The returned tensor is shared between callers and must not be modified
    in place.

    Args:
        sentence: Text to embed; the module-level ``prompt`` is prepended.

    Returns:
        The mean of ``outputs.logits`` over dimension 1, squeezed and moved
        to the CPU.
    """
    # Combine the prompt with the sentence
    input_text = prompt + sentence
    inputs = tokenizer(input_text, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # NOTE(review): this pools the model's logits, not its hidden states.
    # Hidden states (output_hidden_states=True) are the more usual choice for
    # sentence embeddings, but switching would shift every score, so the 0.7
    # threshold in match_word would need re-tuning first. Left unchanged.
    embeddings = outputs.logits.mean(dim=1).squeeze().cpu()
    return embeddings


def cosine_similarity(embedding1: torch.Tensor, embedding2: torch.Tensor) -> float:
    """Return the cosine similarity of two 1-D tensors as a Python float."""
    return torch.nn.functional.cosine_similarity(embedding1, embedding2, dim=0).item()


def custom_score(input_word, input_embedding, entry_embedding, entry_text) -> float:
    """Score a dictionary entry against the input.

    The base score is the cosine similarity of the two embeddings. When the
    input consists of exactly one word and the entry text contains "raw"
    (case-insensitive), ``RAW_BOOST`` is added, so the result can exceed 1.0.

    Args:
        input_word: The cleaned input text.
        input_embedding: Embedding of ``input_word``.
        entry_embedding: Embedding of ``entry_text``.
        entry_text: The dictionary description being scored.

    Returns:
        The (possibly boosted) similarity score.
    """
    similarity_score = cosine_similarity(input_embedding, entry_embedding)

    # Boost score if the input word is a single word and the entry contains
    # the preferred keyword.
    is_single_word = len(re.findall(r'\w+', input_word.lower())) == 1
    if 'raw' in entry_text.lower() and is_single_word:
        similarity_score += RAW_BOOST

    return similarity_score


def match_word(input_word: str, dictionary: List[str],
               threshold: float = 0.7) -> Optional[Tuple[str, float]]:
    """Find the best-scoring dictionary entry for ``input_word``.

    Parenthesised text is removed from the input, the dictionary is narrowed
    to entries containing at least one of the input's words as a substring
    (case-insensitive), and each remaining entry is scored with
    ``custom_score``.

    Args:
        input_word: Raw input text, e.g. ``"Kale ( 5 lbs)"``.
        dictionary: Candidate descriptions; non-string items are skipped.
        threshold: The best score must be strictly greater than this value
            for a match to be returned.

    Returns:
        ``(entry, score)`` for the best entry, or ``None`` when no entry
        passes the filter or the best score does not exceed ``threshold``.
    """
    # Remove text in parentheses
    input_word_clean = re.sub(r'\(.*?\)', '', input_word).strip()
    words = re.findall(r'\w+', input_word_clean.lower())

    filtered_dictionary = []
    for desc in dictionary:
        if not isinstance(desc, str):
            # e.g. NaN from an empty CSV cell; it cannot match any word.
            continue
        desc_lower = desc.lower()  # lower-case once per entry, not once per word
        if any(word in desc_lower for word in words):
            filtered_dictionary.append(desc)
    print(f"Filtered dictionary size: {len(filtered_dictionary)}")

    if not filtered_dictionary:
        # Nothing to compare against, so skip the model forward pass.
        return None

    input_embedding = generate_embedding(input_word_clean)
    similarities = []

    for entry in tqdm(filtered_dictionary, desc="Processing Entries"):
        entry_embedding = generate_embedding(entry)
        score = custom_score(input_word_clean, input_embedding, entry_embedding, entry)
        similarities.append((entry, score))

    best_match = max(similarities, key=lambda x: x[1])
    return best_match if best_match[1] > threshold else None


# Example usage
input_words = [
    "Pepper - Habanero Pepper",
    "Bananas (12 lbs)",
    "Squash - Yellow Squash",
    "Cauliflower",
    "Squash mix italian/yellow (30 lbs)",
    "Tomato - Roma Tomato",
    "Tomato - Grape Tomato",
    "Squash - Mexican Squash",
    "Pepper - Bell Pepper",
    "Squash - Italian Squash",
    "Pepper - Red Fresno Pepper",
    "Tomato - Cherry Tomato",
    "Pepper - Serrano Pepper",
    "Kale ( 5 lbs)",
    "Tomato - Beefsteak Tomato",
    "Pepper - Anaheim Pepper",
    "Banana - Burro Banana",
    "Squash - Butternut Squash",
    "Apricot ( 10 lbs)",
    "Squash - Acorn Squash",
    "Tomato - Heirloom Tomato",
    "Pepper - Pasilla Pepper",
    "Pepper - Jalapeno Pepper",
    "carrot (10 lbs )",
]

for input_word in tqdm(input_words, desc="Matching Words"):
    print("Input word:", input_word)
    matched_entry = match_word(input_word, dictionary)
    if matched_entry:
        print("Matched entry:", matched_entry[0])
        print("Similarity score:", matched_entry[1])
    else:
        print("Matched entry: None")
    print()