File size: 2,790 Bytes
9189e38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
import re

# Load the pre-trained BERT tokenizer and encoder once at import time so
# that they are not re-created for every embedding request.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Load the dictionary of candidate descriptions from the CSV file.
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
# pandas reads empty cells as NaN (a float). Drop those rows and coerce the
# remaining values to str so every dictionary entry supports string
# operations such as .lower().
dictionary = df['description'].dropna().astype(str).tolist()

def compute_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embeddings.

    Both arguments are handed unchanged to scikit-learn's
    ``cosine_similarity``; the entry at row 0, column 0 of the resulting
    similarity matrix is returned.

    NOTE(review): presumably each embedding is a 2-D array holding a single
    row -- confirm against callers.
    """
    similarity_matrix = cosine_similarity(embedding1, embedding2)
    first_row = similarity_matrix[0]
    return first_row[0]

def get_bert_embedding(text):
    """Return a mean-pooled BERT embedding of ``text`` as a NumPy array.

    ``text`` is tokenized with the module-level ``tokenizer`` (with
    truncation and padding enabled) and encoded with the module-level
    ``model``. The final hidden states are averaged over the token axis
    (``dim=1``), so the result has one row per input text.

    Padding positions are excluded from the average via the attention mask.
    For a single string no padding is added, so this matches a plain mean;
    for a batch of texts it keeps the padding of shorter texts from
    diluting their embeddings.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

        # Broadcast the mask over the hidden dimension so padded tokens
        # contribute nothing to the sum.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * mask).sum(dim=1)
        # Guard against division by zero for a (theoretical) all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = summed / token_counts

    return pooled.numpy()

def match_word(input_word, dictionary, threshold=0.7):
    """Find the dictionary entry that best matches ``input_word``.

    The dictionary is first narrowed to entries that contain, as a
    case-insensitive substring, at least one ``\\w+`` token of the input.
    Each remaining entry is then scored by cosine similarity between its
    BERT embedding and the embedding of the full input.

    Args:
        input_word: Free-text string to look up.
        dictionary: Iterable of candidate descriptions. Entries that are not
            strings (e.g. NaN from an empty CSV cell) are skipped.
        threshold: Score the best candidate must strictly exceed to be
            returned. Defaults to 0.7.

    Returns:
        The highest-scoring entry (the first one on ties) if its score is
        greater than ``threshold``; otherwise ``None``. ``None`` is also
        returned when no entry survives the keyword filter.
    """
    # Extract words from the input
    words = re.findall(r'\w+', input_word.lower())

    # Filter dictionary based on words
    filtered_dictionary = []
    for desc in dictionary:
        if not isinstance(desc, str):
            # Non-text entries cannot be lowercased or embedded.
            continue
        lowered = desc.lower()  # lowercase once per entry, not once per word
        if any(word in lowered for word in words):
            filtered_dictionary.append(desc)

    print(f"Filtered dictionary size: {len(filtered_dictionary)}")

    # Nothing to compare against: skip the BERT forward pass for the input.
    if not filtered_dictionary:
        return None

    # Proceed with BERT embeddings and cosine similarity on the filtered dictionary
    input_embedding = get_bert_embedding(input_word)

    best_entry = None
    best_score = None
    for entry in filtered_dictionary:
        entry_embedding = get_bert_embedding(entry)
        score = compute_cosine_similarity(input_embedding, entry_embedding)
        # Strict comparison keeps the earliest entry when scores tie.
        if best_score is None or score > best_score:
            best_entry, best_score = entry, score

    return best_entry if best_score > threshold else None

# Example usage: sample inputs to look up in the dictionary
input_words = [
    "Yellow Squash",
    "Cauliflower",
    "Habanero Pepper",
    "Bananas (12 lbs)",
    "Squash mix italian/yellow (30 lbs )",
]

# Sample output (presumably recorded from an earlier run -- results depend on
# the contents of the dictionary CSV):
#
# Filtered dictionary size: 224
# Input word: Yellow Squash
# Matched entry: None
#
# Filtered dictionary size: 37
# Input word: Cauliflower
# Matched entry: Fried cauliflower
#
# Filtered dictionary size: 185
# Input word: Habanero Pepper
# Matched entry: Peppers, ancho, dried
#
# Filtered dictionary size: 414
# Input word: Bananas (12 lbs)
# Matched entry: Bananas, raw
#
# Filtered dictionary size: 784
# Input word: Squash mix italian/yellow (30 lbs )
# Matched entry: Nutritional powder mix (Slim Fast)

# Look up every sample and report its match, followed by a blank line
for input_word in input_words:
    matched_entry = match_word(input_word, dictionary)
    print(f"Input word: {input_word}\nMatched entry: {matched_entry}\n")