# This code includes a secondary filtering step that checks for the presence of specific keywords (like "pepper" in this case). 


import pandas as pd
from sentence_transformers import SentenceTransformer, util
import re

# Load pre-trained SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Smaller and faster, but you can choose a larger model if needed

# Load dictionary from CSV file
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
# Empty cells are read by pandas as NaN (a float). Drop them and coerce the
# remaining values to str so every dictionary entry supports string methods
# such as .lower() in the matching code.
dictionary = df['description'].dropna().astype(str).tolist()

# Method to compute refined similarity
def refined_similarity(input_word, filtered_dictionary):
    """Score every candidate entry against ``input_word`` with SBERT cosine similarity.

    Args:
        input_word: The query text to embed.
        filtered_dictionary: Iterable of candidate description strings.

    Returns:
        A list of ``(entry, similarity_score)`` tuples, in the same order as
        the candidates. The list is empty when there are no candidates.
    """
    entries = list(filtered_dictionary)
    if not entries:
        # Nothing to compare against; skip the model call entirely.
        return []

    input_embedding = model.encode(input_word, convert_to_tensor=True)
    # Encode all candidates in one batched call instead of one model
    # invocation per entry.
    entry_embeddings = model.encode(entries, convert_to_tensor=True)

    # Result has shape (1, len(entries)); row 0 holds the score for each entry.
    scores = util.pytorch_cos_sim(input_embedding, entry_embeddings)[0].tolist()
    return list(zip(entries, scores))

# Method to find the best match for the input word in the dictionary
def match_word(input_word, dictionary, keywords=("pepper",), threshold=0.7):
    """Return the dictionary entry most similar to ``input_word``.

    The dictionary is first narrowed to entries that contain (as a substring,
    case-insensitively) at least one word of the input. It is then narrowed
    again to entries containing one of ``keywords``, but only for keywords
    that the input itself mentions, so a query about one product is never
    restricted to entries about a different one. If the keyword step would
    leave nothing, the first-stage candidates are kept.

    Args:
        input_word: The text to look up.
        dictionary: Iterable of description strings. Non-string entries
            (e.g. NaN from empty CSV cells) are ignored.
        keywords: Keyword or iterable of keywords used for the secondary
            filter. Defaults to ``("pepper",)``.
        threshold: A match is returned only if its similarity score is
            strictly greater than this value. Defaults to ``0.7``.

    Returns:
        An ``(entry, similarity_score)`` tuple for the best match, or ``None``
        when there is no candidate or the best score does not exceed
        ``threshold``.
    """
    if isinstance(keywords, str):
        # A bare string would otherwise be iterated character by character.
        keywords = (keywords,)

    lowered_input = input_word.lower()

    # Extract words from the input
    words = re.findall(r'\w+', lowered_input)

    # Filter dictionary based on words
    filtered_dictionary = []
    for desc in dictionary:
        if not isinstance(desc, str):
            continue
        lowered_desc = desc.lower()
        if any(word in lowered_desc for word in words):
            filtered_dictionary.append(desc)

    print(f"Filtered dictionary size: {len(filtered_dictionary)}")

    # Refined filtering: apply only the keywords that the input mentions
    active_keywords = [kw.lower() for kw in keywords if kw.lower() in lowered_input]
    if active_keywords:
        further_filtered = [
            desc for desc in filtered_dictionary
            if any(kw in desc.lower() for kw in active_keywords)
        ]
        # If further_filtered is empty, fallback to filtered_dictionary
        if further_filtered:
            filtered_dictionary = further_filtered

    print(f"Further filtered dictionary size: {len(filtered_dictionary)}")

    if not filtered_dictionary:
        # No candidates: avoid invoking the embedding model at all.
        return None

    # Proceed with SBERT embeddings and cosine similarity on the filtered dictionary
    similarities = refined_similarity(input_word, filtered_dictionary)
    if not similarities:
        return None

    best_match = max(similarities, key=lambda x: x[1])
    return best_match if best_match[1] > threshold else None

# Example usage: sample produce names to run through the matcher
input_words = [
    "Carrot (10 lbs )",
    "Pepper - Habanero Pepper",
    "Bananas (12 lbs)",
    "Squash - Yellow Squash",
    "Cauliflower",
    "Squash mix italian/yellow (30 lbs)",
    "Tomato - Roma Tomato",
    "Tomato - Grape Tomato",
    "Squash - Mexican Squash",
    "Pepper - Bell Pepper",
    "Squash - Italian Squash",
    "Pepper - Red Fresno Pepper",
    "Tomato - Cherry Tomato",
    "Pepper - Serrano Pepper",
    "Kale ( 5 lbs)",
    "Tomato - Beefsteak Tomato",
    "Pepper - Anaheim Pepper",
    "Banana - Burro Banana",
    "Squash - Butternut Squash",
    "Apricot ( 10 lbs)",
    "Squash - Acorn Squash",
    "Tomato - Heirloom Tomato",
    "Pepper - Pasilla Pepper",
    "Pepper - Jalapeno Pepper",
]

# Print the best dictionary match (or None) for each sample, followed by a
# blank separator line.
for input_word in input_words:
    print("Input word:", input_word)
    matched_entry = match_word(input_word, dictionary)
    if not matched_entry:
        print("Matched entry: None")
    else:
        print("Matched entry:", matched_entry[0])
        print("Similarity score:", matched_entry[1])
    print()