import re

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Use Apple's MPS backend if available, otherwise fall back to CPU
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print("Device:", device)

# Load model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)

# Load dictionary from CSV file
csv_file_path = '/Users/bw/Webstuff/btest/test/dictionary.csv'
df = pd.read_csv(csv_file_path)
dictionary = df['description'].tolist()

# Prompt that instructs the model to compare two texts and rate their similarity
compare_prompt = (
    "Compare the following two texts and rate their similarity on a scale from 0 to 1. "
    "Text 1: {} Text 2: {}. Similarity score: "
)


# Generate a crude sentence embedding by mean-pooling the model's output logits.
# (Note: this function is not used by match_word below; it is kept for reference.)
def generate_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.logits.mean(dim=1).squeeze().cpu()
    return embeddings


# Ask the Llama model for a similarity score and parse the last decimal number
# in its output; fall back to 0.0 if no number can be parsed.
def get_similarity_score(text1, text2):
    input_text = compare_prompt.format(text1, text2)
    inputs = tokenizer(input_text, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    try:
        score = float(re.findall(r"\d+\.\d+", generated_text)[-1])
    except (IndexError, ValueError):
        score = 0.0
    print(text1, text2, score)
    return score


# Find the best match for the input word in the dictionary
def match_word(input_word, dictionary):
    # Remove text in parentheses, e.g. "(12 lbs)"
    input_word_clean = re.sub(r'\(.*?\)', '', input_word).strip()
    words = re.findall(r'\w+', input_word_clean.lower())

    # Keep only dictionary entries that share at least one word with the input
    filtered_dictionary = [desc for desc in dictionary if any(word in desc.lower() for word in words)]
    print(f"Filtered dictionary size: {len(filtered_dictionary)}")

    similarities = []
    for entry in tqdm(filtered_dictionary, desc="Processing Entries"):
        score = get_similarity_score(input_word_clean, entry)
        if 'raw' in entry.lower() and len(words) == 1:
            score += 0.1  # Boost for raw version and single-word input
        similarities.append((entry, score))

    if similarities:
        best_match = max(similarities, key=lambda x: x[1])
        return best_match if best_match[1] > 0.7 else None
    return None


# Example usage
input_words = [
    "Pepper - Habanero Pepper", "Bananas (12 lbs)", "Squash - Yellow Squash",
    "Cauliflower", "Squash mix italian/yellow (30 lbs)", "Tomato - Roma Tomato",
    "Tomato - Grape Tomato", "Squash - Mexican Squash", "Pepper - Bell Pepper",
    "Squash - Italian Squash", "Pepper - Red Fresno Pepper", "Tomato - Cherry Tomato",
    "Pepper - Serrano Pepper", "Kale ( 5 lbs)", "Tomato - Beefsteak Tomato",
    "Pepper - Anaheim Pepper", "Banana - Burro Banana", "Squash - Butternut Squash",
    "Apricot ( 10 lbs)", "Squash - Acorn Squash", "Tomato - Heirloom Tomato",
    "Pepper - Pasilla Pepper", "Pepper - Jalapeno Pepper", "carrot (10 lbs )"
]

for input_word in tqdm(input_words, desc="Matching Words"):
    print("Input word:", input_word)
    matched_entry = match_word(input_word, dictionary)
    if matched_entry:
        print("Matched entry:", matched_entry[0])
        print("Similarity score:", matched_entry[1])
    else:
        print("Matched entry: None")
    print()