import functools
import re
from typing import List, Optional, Tuple

import pandas as pd
import torch
from transformers import pipeline

# Single source of truth for the checkpoint name; the pipeline below reads it.
model_id = "meta-llama/Meta-Llama-3-8B"

generator = pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=-1,
)

# Load dictionary from CSV file
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
# Blank cells are read as NaN (a float) and would crash the `.lower()` call in
# match_word, so drop them and make sure every remaining entry is a string.
dictionary = df['description'].dropna().astype(str).tolist()


def generate_embedding(sentence):
    """Return a vector for *sentence* built from the model's output logits.

    The sentence is tokenized, passed through ``generator.model`` once with
    gradient tracking disabled, and the logits are averaged over dimension 1
    (assumed to be the token axis of a ``(batch, tokens, vocab)`` tensor)
    before singleton dimensions are squeezed away.

    NOTE(review): averaged logits are next-token scores, not a purpose-built
    sentence embedding. Pooled hidden states may discriminate better, but
    switching would change every similarity score relative to the 0.7
    threshold, so it is deliberately left as is.

    Args:
        sentence: Text to embed.

    Returns:
        The mean-pooled logits as a ``torch.Tensor``.
    """
    input_ids = generator.tokenizer(sentence, return_tensors='pt')['input_ids']
    # Keep the ids on the same device as the weights so the forward pass also
    # works if the pipeline is moved off the CPU.
    input_ids = input_ids.to(generator.model.device)
    with torch.no_grad():
        outputs = generator.model(input_ids)
    return outputs.logits.mean(dim=1).squeeze()


@functools.lru_cache(maxsize=1024)
def _cached_embedding(sentence: str) -> torch.Tensor:
    """Memoized ``generate_embedding`` so repeated strings cost one forward pass.

    The cache is bounded so the number of retained vectors stays capped.
    Callers must treat the returned tensor as read-only because it is shared.
    """
    return generate_embedding(sentence)


def match_word(
    input_word: str,
    dictionary: List[str],
    threshold: float = 0.7,
) -> Optional[Tuple[str, float]]:
    """Find the dictionary entry most similar to *input_word*.

    Entries are first pre-filtered: an entry is kept when any ``\\w+`` token of
    the lower-cased input occurs as a substring of the lower-cased entry. The
    survivors are then ranked by cosine similarity between their embedding and
    the embedding of the full input string.

    Args:
        input_word: Free-text item name to look up.
        dictionary: Candidate descriptions to match against.
        threshold: Score the best candidate must exceed to be returned.

    Returns:
        ``(entry, similarity_score)`` for the best candidate when its score is
        strictly greater than *threshold*, otherwise ``None``.
    """
    words = set(re.findall(r'\w+', input_word.lower()))

    filtered_dictionary = []
    for desc in dictionary:
        lowered = desc.lower()  # lower-case once per entry, not once per word
        if any(word in lowered for word in words):
            filtered_dictionary.append(desc)
    print(f"Filtered dictionary size: {len(filtered_dictionary)}")

    # Nothing to rank: return before paying for a model forward pass.
    if not filtered_dictionary:
        return None

    # Compare in float32; the model emits bfloat16, which is coarse for a score
    # that is checked against a fixed threshold.
    input_embedding = _cached_embedding(input_word).float()
    similarities = [
        (
            entry,
            torch.nn.functional.cosine_similarity(
                input_embedding, _cached_embedding(entry).float(), dim=0
            ).item(),
        )
        for entry in filtered_dictionary
    ]

    best_match = max(similarities, key=lambda pair: pair[1])
    return best_match if best_match[1] > threshold else None


# Example usage
input_words = [
    "Pepper - Habanero Pepper",
    "Bananas (12 lbs)",
    "Squash - Yellow Squash",
    "Cauliflower",
    "Squash mix italian/yellow (30 lbs)",
    "Tomato - Roma Tomato",
    "Tomato - Grape Tomato",
    "Squash - Mexican Squash",
    "Pepper - Bell Pepper",
    "Squash - Italian Squash",
    "Pepper - Red Fresno Pepper",
    "Tomato - Cherry Tomato",
    "Pepper - Serrano Pepper",
    "Kale ( 5 lbs)",
    "Tomato - Beefsteak Tomato",
    "Pepper - Anaheim Pepper",
    "Banana - Burro Banana",
    "Squash - Butternut Squash",
    "Apricot ( 10 lbs)",
    "Squash - Acorn Squash",
    "Tomato - Heirloom Tomato",
    "Pepper - Pasilla Pepper",
    "Pepper - Jalapeno Pepper",
    "carrot (10 lbs )",
]

for input_word in input_words:
    matched_entry = match_word(input_word, dictionary)
    print("Input word:", input_word)
    if matched_entry:
        print("Matched entry:", matched_entry[0])
        print("Similarity score:", matched_entry[1])
    else:
        print("Matched entry: None")
    print()