# This code includes a secondary filtering step that checks for the presence of specific keywords (like "pepper" in this case). import pandas as pd from sentence_transformers import SentenceTransformer, util import re # Load pre-trained SBERT model model = SentenceTransformer('all-MiniLM-L6-v2') # Smaller and faster, but you can choose a larger model if needed # Load dictionary from CSV file csv_file_path = './dictionary/dictionary.csv' df = pd.read_csv(csv_file_path) dictionary = df['description'].tolist() # Method to compute refined similarity def refined_similarity(input_word, filtered_dictionary): input_embedding = model.encode(input_word, convert_to_tensor=True) similarities = [] for entry in filtered_dictionary: entry_embedding = model.encode(entry, convert_to_tensor=True) similarity_score = util.pytorch_cos_sim(input_embedding, entry_embedding).item() similarities.append((entry, similarity_score)) return similarities # Method to find the best match for the input word in the dictionary def match_word(input_word, dictionary): # Extract words from the input words = re.findall(r'\w+', input_word.lower()) # Filter dictionary based on words filtered_dictionary = [desc for desc in dictionary if any(word in desc.lower() for word in words)] print(f"Filtered dictionary size: {len(filtered_dictionary)}") # Refined filtering by checking for exact word presence further_filtered = [desc for desc in filtered_dictionary if "pepper" in desc.lower()] # If further_filtered is empty, fallback to filtered_dictionary if further_filtered: filtered_dictionary = further_filtered print(f"Further filtered dictionary size: {len(filtered_dictionary)}") # print(f"Filtered dictionary: {filtered_dictionary}") # Proceed with SBERT embeddings and cosine similarity on the filtered dictionary similarities = refined_similarity(input_word, filtered_dictionary) # print(similarities) if similarities: best_match = max(similarities, key=lambda x: x[1]) return best_match if best_match[1] > 0.7 else None else: return None # Example usage input_words = [ "Carrot (10 lbs )", "Pepper - Habanero Pepper", "Bananas (12 lbs)", "Squash - Yellow Squash", "Cauliflower", "Squash mix italian/yellow (30 lbs)", "Tomato - Roma Tomato", "Tomato - Grape Tomato", "Squash - Mexican Squash", "Pepper - Bell Pepper", "Squash - Italian Squash", "Pepper - Red Fresno Pepper", "Tomato - Cherry Tomato", "Pepper - Serrano Pepper", "Kale ( 5 lbs)", "Tomato - Beefsteak Tomato", "Pepper - Anaheim Pepper", "Banana - Burro Banana", "Squash - Butternut Squash", "Apricot ( 10 lbs)", "Squash - Acorn Squash", "Tomato - Heirloom Tomato", "Pepper - Pasilla Pepper", "Pepper - Jalapeno Pepper" ] for input_word in input_words: print("Input word:", input_word) matched_entry = match_word(input_word, dictionary) if matched_entry: print("Matched entry:", matched_entry[0]) print("Similarity score:", matched_entry[1]) else: print("Matched entry: None") print()