"""Match free-form input strings to dictionary descriptions with SBERT.

Each input is first narrowed to the dictionary descriptions that share at
least one word with it, and the survivors are then ranked by cosine
similarity of their sentence embeddings.
"""

import re
from typing import List, Optional, Tuple

import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load pre-trained SBERT model
model = SentenceTransformer('all-mpnet-base-v2')

# Load dictionary from CSV file
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
# Empty CSV cells are read as NaN (a float); drop them so that every
# dictionary entry is text and `.lower()` is always safe.
dictionary = df['description'].dropna().astype(str).tolist()


def match_word(
    input_word: str,
    dictionary: List[str],
    threshold: float = 0.7,
) -> Optional[Tuple[str, float]]:
    """Return the dictionary entry most similar to *input_word*.

    Args:
        input_word: Text to look up.
        dictionary: Candidate descriptions. Entries that are not strings
            (for example NaN from an empty CSV cell) are skipped.
        threshold: The best score must be strictly greater than this value
            for a match to be reported.

    Returns:
        ``(entry, similarity_score)`` for the best-scoring candidate, or
        ``None`` when no candidate shares a word with the input or the best
        score does not exceed *threshold*.
    """
    # Extract words from the input
    words = re.findall(r'\w+', input_word.lower())

    # Filter dictionary based on words. This is a substring test, so a word
    # also matches when it appears inside a longer word of the description.
    lowered_entries = (
        (desc, desc.lower()) for desc in dictionary if isinstance(desc, str)
    )
    filtered_dictionary = [
        desc
        for desc, lowered in lowered_entries
        if any(word in lowered for word in words)
    ]
    print(f"Filtered dictionary size: {len(filtered_dictionary)}")

    # Nothing to rank: skip the (expensive) embedding step entirely.
    if not filtered_dictionary:
        return None

    # Encode all candidates in one batched call instead of one call per entry.
    input_embedding = model.encode(input_word, convert_to_tensor=True)
    entry_embeddings = model.encode(filtered_dictionary, convert_to_tensor=True)

    # Row 0 holds the similarity of the single input against every candidate.
    scores = util.pytorch_cos_sim(input_embedding, entry_embeddings)[0].tolist()

    # max() keeps the first entry on ties.
    best_index = max(range(len(scores)), key=scores.__getitem__)
    best_score = scores[best_index]
    if best_score > threshold:
        return filtered_dictionary[best_index], best_score
    return None


# Example usage
input_words = [
    "Pepper - Habanero Pepper",
    "Bananas (12 lbs)",
    "Squash - Yellow Squash",
    "Cauliflower",
    "Squash mix italian/yellow (30 lbs)",
    "Tomato - Roma Tomato",
    "Tomato - Grape Tomato",
    "Squash - Mexican Squash",
    "Pepper - Bell Pepper",
    "Squash - Italian Squash",
    "Pepper - Red Fresno Pepper",
    "Tomato - Cherry Tomato",
    "Pepper - Serrano Pepper",
    "Kale ( 5 lbs)",
    "Tomato - Beefsteak Tomato",
    "Pepper - Anaheim Pepper",
    "Banana - Burro Banana",
    "Squash - Butternut Squash",
    "Apricot ( 10 lbs)",
    "Squash - Acorn Squash",
    "Tomato - Heirloom Tomato",
    "Pepper - Pasilla Pepper",
    "Pepper - Jalapeno Pepper",
    "carrot (10 lbs )",
]


def main() -> None:
    """Print the best dictionary match for every example input."""
    for input_word in input_words:
        print("Input word:", input_word)
        matched_entry = match_word(input_word, dictionary)
        if matched_entry:
            print("Matched entry:", matched_entry[0])
            print("Similarity score:", matched_entry[1])
        else:
            print("Matched entry: None")
        print()


if __name__ == "__main__":
    main()