import pandas as pd
from transformers import pipeline
import torch
import re

model_id = "meta-llama/Meta-Llama-3-8B"

# Text-generation pipeline; its tokenizer and model are reused below to extract
# embeddings. device=-1 runs on the CPU; set device=0 to use the first GPU.
generator = pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=-1,
)

# Load the matching dictionary from a CSV file (expects a 'description' column)
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
dictionary = df['description'].tolist()
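# The CSV is assumed to look roughly like this (hypothetical example rows,
# only the 'description' column is actually read):
#   description
#   Habanero Pepper
#   Roma Tomato
#   Yellow Squash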

# Generate a sentence embedding by mean-pooling the model's output logits
def generate_embedding(sentence):
    # Tokenize the sentence for the underlying causal LM
    inputs = generator.tokenizer(sentence, return_tensors='pt')['input_ids']
    with torch.no_grad():
        outputs = generator.model(inputs)
    # Average the logits over the token dimension: (1, seq_len, vocab_size) -> (vocab_size,)
    embeddings = outputs.logits.mean(dim=1).squeeze()
    return embeddings
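
# Optional alternative (illustrative sketch, not called anywhere in this script):
# mean-pool the final hidden states instead of the logits, which yields a smaller,
# more conventional sentence embedding (hidden_size instead of vocab_size dims).
def generate_hidden_state_embedding(sentence):
    inputs = generator.tokenizer(sentence, return_tensors='pt')['input_ids']
    with torch.no_grad():
        outputs = generator.model(inputs, output_hidden_states=True)
    # Mean-pool the last hidden layer: (1, seq_len, hidden_size) -> (hidden_size,)
    return outputs.hidden_states[-1].mean(dim=1).squeeze()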

# Find the best match for the input phrase in the dictionary
def match_word(input_word, dictionary):
    # Split the input into lowercase words
    words = re.findall(r'\w+', input_word.lower())

    # Keep only dictionary entries containing at least one of the input words (substring match)
    filtered_dictionary = [desc for desc in dictionary if any(word in desc.lower() for word in words)]
    
    print(f"Filtered dictionary size: {len(filtered_dictionary)}")
    # print(f"Filtered dictionary: {filtered_dictionary}")

    # Generate embeddings and calculate cosine similarity on the filtered dictionary
    input_embedding = generate_embedding(input_word)
    similarities = []

    for entry in filtered_dictionary:
        entry_embedding = generate_embedding(entry)
        similarity_score = torch.nn.functional.cosine_similarity(input_embedding, entry_embedding, dim=0).item()
        similarities.append((entry, similarity_score))

    # print(similarities)

    # Return the highest-scoring entry only if it clears the 0.7 similarity threshold
    if similarities:
        best_match = max(similarities, key=lambda x: x[1])
        return best_match if best_match[1] > 0.7 else None
    else:
        return None
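
# Minimal caching sketch (illustrative helper, not wired into match_word above):
# match_word re-embeds every filtered dictionary entry on each call, so embeddings
# for recurring descriptions could be precomputed once and looked up instead.
def build_embedding_cache(entries):
    # Map each description to its embedding tensor
    return {entry: generate_embedding(entry) for entry in entries}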

# Example usage
input_words = [
    "Pepper - Habanero Pepper", "Bananas (12 lbs)", "Squash - Yellow Squash", "Cauliflower", 
    "Squash mix italian/yellow (30 lbs)", "Tomato - Roma Tomato", "Tomato - Grape Tomato",
    "Squash - Mexican Squash", "Pepper - Bell Pepper", "Squash - Italian Squash",
    "Pepper - Red Fresno Pepper", "Tomato - Cherry Tomato", "Pepper - Serrano Pepper",
    "Kale ( 5 lbs)", "Tomato - Beefsteak Tomato", "Pepper - Anaheim Pepper",
    "Banana - Burro Banana", "Squash - Butternut Squash", "Apricot ( 10 lbs)",
    "Squash - Acorn Squash", "Tomato - Heirloom Tomato", "Pepper - Pasilla Pepper",
    "Pepper - Jalapeno Pepper", "carrot (10 lbs )"
]

for input_word in input_words:
    matched_entry = match_word(input_word, dictionary)
    if matched_entry:
        print("Input word:", input_word)
        print("Matched entry:", matched_entry[0])
        print("Similarity score:", matched_entry[1])
    else:
        print("Input word:", input_word)
        print("Matched entry: None")
    print()