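"""Match free-text food descriptions against a reference dictionary.

Each description is embedded with a causal language model (mean-pooled
logits), then compared against pre-computed dictionary embeddings by cosine
similarity; matches above a fixed threshold are written to results/results.csv.
"""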
import os
import pickle
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re
from tqdm import tqdm

# Check if MPS is available
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

print("Device:", device)

# Load model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
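# Note: bfloat16 support on the MPS backend has been limited in some PyTorch
# releases; if loading or inference fails, torch.float16 (or float32 on CPU)
# is a reasonable fallback.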
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)

# Load dictionary from CSV file
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
dictionary = df['description'].tolist()

# Cosine Similarity
def cosine_similarity(embedding1, embedding2):
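    # dim=0 works because generate_embedding() returns squeezed 1-D tensors.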
    return torch.nn.functional.cosine_similarity(embedding1, embedding2, dim=0).item()

# Euclidean Distance
def euclidean_distance(embedding1, embedding2):
    return -torch.dist(embedding1, embedding2).item()  # Negative to keep similarity comparison consistent

# Generate a sentence embedding by mean-pooling the model's logits over tokens
def generate_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.logits.mean(dim=1).squeeze().cpu()
    return embeddings
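
# Mean-pooling the vocabulary logits is a crude sentence representation. A
# common alternative (a sketch, not what this script uses) is to mean-pool
# the last hidden layer instead:
#
#     outputs = model(**inputs, output_hidden_states=True)
#     embedding = outputs.hidden_states[-1].mean(dim=1).squeeze().cpu()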

# Find the best dictionary match for input_word; returns (entry, score) or None
def match_word(input_word, dictionary, similarity_measure):
    # Remove anything in parentheses, e.g. "(12 oz)"
    input_word_clean = re.sub(r'\(.*?\)', '', input_word).strip()

    # For inputs of the form "X - Y", keep only Y when X already appears
    # inside Y. split('-', 1) tolerates additional hyphens in the right term.
    if '-' in input_word_clean:
        left_term, right_term = map(str.strip, input_word_clean.split('-', 1))
        if left_term.lower() in right_term.lower():
            input_word_clean = right_term

    # Keep only dictionary entries sharing at least one word with the input,
    # so embeddings are not scored against the entire dictionary.
    words = re.findall(r'\w+', input_word_clean.lower())
    filtered_dictionary = [desc for desc in dictionary if any(word in desc.lower() for word in words)]

    input_embedding = generate_embedding(input_word_clean)
    similarities = []

    for entry in filtered_dictionary:
        # dictionary_embeddings is the module-level cache built below.
        entry_embedding = dictionary_embeddings[entry]
        similarity_score = similarity_measure(input_embedding, entry_embedding)
        similarities.append((entry, similarity_score))

    if similarities:
        best_match = max(similarities, key=lambda x: x[1])
        # The 0.7 cutoff assumes cosine similarity (scores in [-1, 1]); the
        # negated Euclidean distance is never positive, so it would always
        # fall below this threshold.
        return best_match if best_match[1] > 0.7 else None
    return None
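
# Hypothetical usage:
#     match_word("Apples - Gala (12 oz)", dictionary, cosine_similarity)
# strips "(12 oz)", keeps "Apples - Gala" (since "apples" is not a substring
# of "gala"), and returns the best (entry, score) pair, or None if no
# candidate scores above 0.7.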

# Check if the pickle file exists
if os.path.exists('dictionary_embeddings.pkl'):
    # Load the pre-computed embeddings from the pickle file
    with open('dictionary_embeddings.pkl', 'rb') as f:
        dictionary_embeddings = pickle.load(f)
else:
    # Generate embeddings for all entries in the dictionary
    dictionary_embeddings = {}
    for entry in tqdm(dictionary, desc="Generating Embeddings"):
        dictionary_embeddings[entry] = generate_embedding(entry)

    # Save the pre-computed embeddings to a pickle file
    with open('dictionary_embeddings.pkl', 'wb') as f:
        pickle.dump(dictionary_embeddings, f)
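
# Note: the cache is keyed only by file name, not by the dictionary contents
# or model ID, so delete dictionary_embeddings.pkl after changing either.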


input_file_path = 'raw/food-forward-2023-raw-data - food-forward-2023-raw-data.csv'
df = pd.read_csv(input_file_path)
input_words = df['description'].tolist()

similarity_measure = cosine_similarity
results = []
for input_word in tqdm(input_words, desc="Matching Words"):
    try:
        matched_entry = match_word(input_word, dictionary, similarity_measure)
        if matched_entry:
            results.append({
                'input_word': input_word,
                'matched_word': matched_entry[0],
                'score': matched_entry[1]
            })
        else:
            results.append({
                'input_word': input_word,
                'matched_word': None,
                'score': None
            })
    except Exception as e:
        # Record the failure as an unmatched row rather than aborting the run.
        print("Error:", e)
        results.append({
            'input_word': input_word,
            'matched_word': None,
            'score': None
        })

df_results = pd.DataFrame(results)
os.makedirs('results', exist_ok=True)  # to_csv does not create missing directories
results_csv_path = 'results/results.csv'
df_results.to_csv(results_csv_path, index=False)