Spaces:
Paused
Paused
File size: 4,467 Bytes
9189e38 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import os
import pickle
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import re
from tqdm import tqdm
# Pick the compute device: the Metal (MPS) backend when PyTorch reports it as
# available, otherwise the CPU.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print("Device:", device)

# Load the tokenizer and the causal language model, requesting bfloat16
# weights, then move the model to the selected device.
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)

# Load the reference dictionary: the 'description' column of the CSV.
# Rows with a missing description are dropped because pandas represents them
# as float NaN, which can be neither tokenized nor lower-cased by the
# matching code further down.
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
dictionary = df['description'].dropna().tolist()
# Cosine Similarity
def cosine_similarity(embedding1: torch.Tensor, embedding2: torch.Tensor) -> float:
    """Return the cosine similarity of two embedding tensors as a Python float.

    The tensors are compared along dimension 0 and the result is read with
    ``.item()``, so it must be a single value (i.e. 1-D inputs are expected).

    Both inputs are first converted to a common dtype that is at least
    float32. Embeddings produced by a bfloat16 model would otherwise be
    multiplied and normalised in bfloat16, whose few mantissa bits make the
    score too coarse to compare reliably against a fixed threshold.
    """
    # Promote instead of calling .float() so float64 inputs are not downcast.
    work_dtype = torch.promote_types(
        torch.promote_types(embedding1.dtype, embedding2.dtype),
        torch.float32,
    )
    return torch.nn.functional.cosine_similarity(
        embedding1.to(work_dtype), embedding2.to(work_dtype), dim=0
    ).item()
# Euclidean Distance
def euclidean_distance(embedding1: torch.Tensor, embedding2: torch.Tensor) -> float:
    """Return the negated Euclidean distance between two embedding tensors.

    The distance is negated so that, like a similarity, a larger value means
    a closer pair. The result is therefore never positive: a caller that
    filters scores with a positive cut-off (such as 0.7) will reject every
    candidate and needs a threshold chosen for this scale instead.

    Both inputs are first converted to a common dtype that is at least
    float32, so that bfloat16 embeddings are not subtracted and summed in
    bfloat16 precision.
    """
    # Promote instead of calling .float() so float64 inputs are not downcast.
    work_dtype = torch.promote_types(
        torch.promote_types(embedding1.dtype, embedding2.dtype),
        torch.float32,
    )
    distance = torch.dist(embedding1.to(work_dtype), embedding2.to(work_dtype)).item()
    return -distance
# Method to generate an embedding by averaging the language model's logits
def generate_embedding(sentence):
    """Embed ``sentence`` as the mean of the model's output logits.

    The sentence is tokenized into PyTorch tensors, moved to the module-level
    ``device`` and passed through the module-level ``model`` with gradient
    tracking disabled. The logits are averaged over dimension 1 (presumably
    the token axis -- confirm against the model's output layout), size-1
    dimensions are squeezed away, and the result is returned on the CPU.
    """
    encoded = tokenizer(sentence, return_tensors='pt').to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
        pooled = logits.mean(dim=1)
        embedding = pooled.squeeze().cpu()
    return embedding
# Method to find the best match for the input word in the dictionary
def match_word(input_word, dictionary, similarity_measure, threshold=0.7):
    """Find the dictionary entry most similar to ``input_word``.

    Args:
        input_word: Raw description to look up. Non-string values (for example
            the float NaN pandas yields for an empty CSV cell) never match.
        dictionary: Candidate descriptions (strings). Every candidate that
            survives the keyword filter must be a key of the module-level
            ``dictionary_embeddings`` mapping.
        similarity_measure: Callable taking the input embedding and an entry
            embedding and returning a score where larger means more similar.
        threshold: Score the best candidate must strictly exceed to count as
            a match. The default of 0.7 is on the cosine-similarity scale;
            pass another value for measures on a different scale (e.g. a
            negative number for a negated distance).

    Returns:
        ``(entry, score)`` for the best-scoring candidate, or ``None`` when
        the input is unusable, no candidate shares a word with it, or the
        best score does not exceed ``threshold``.

    Raises:
        KeyError: If a candidate has no pre-computed embedding.
    """
    if not isinstance(input_word, str):
        return None

    # Remove anything in parentheses (i.e. (12 oz))
    input_word_clean = re.sub(r'\(.*?\)', '', input_word).strip()

    # For "left - right" inputs where the left part is contained in the right
    # part, the left part adds nothing, so only the right part is kept.
    # Splitting on the first hyphen only keeps inputs with several hyphens
    # from breaking the two-way split.
    if '-' in input_word_clean:
        left_term, _, right_term = input_word_clean.partition('-')
        left_term = left_term.strip()
        right_term = right_term.strip()
        if left_term.lower() in right_term.lower():
            input_word_clean = right_term

    words = re.findall(r'\w+', input_word_clean.lower())
    if not words:
        return None

    # Cheap pre-filter: keep only entries containing at least one input word
    # as a substring, so embeddings are compared for plausible candidates only.
    filtered_dictionary = [
        desc for desc in dictionary
        if any(word in desc.lower() for word in words)
    ]
    # Bail out before the model forward pass when there is nothing to compare.
    if not filtered_dictionary:
        return None

    input_embedding = generate_embedding(input_word_clean)
    best_match = max(
        (
            (entry, similarity_measure(input_embedding, dictionary_embeddings[entry]))
            for entry in filtered_dictionary
        ),
        key=lambda pair: pair[1],
    )
    return best_match if best_match[1] > threshold else None
# Load the pre-computed dictionary embeddings from the pickle cache if it
# exists; otherwise start from an empty cache.
# NOTE: unpickling can execute arbitrary code. Only ever load a cache file
# that this script wrote itself -- never one from an untrusted source.
embeddings_cache_path = 'dictionary_embeddings.pkl'
if os.path.exists(embeddings_cache_path):
    with open(embeddings_cache_path, 'rb') as f:
        dictionary_embeddings = pickle.load(f)
else:
    dictionary_embeddings = {}

# Embed every dictionary entry the cache does not cover yet. This handles both
# the first run (nothing cached) and a cache that is older than the dictionary
# CSV, which would otherwise make lookups of the new entries fail.
# dict.fromkeys de-duplicates while preserving order, so each distinct entry
# is embedded once.
missing_entries = list(dict.fromkeys(
    entry for entry in dictionary if entry not in dictionary_embeddings
))
if missing_entries:
    for entry in tqdm(missing_entries, desc="Generating Embeddings"):
        dictionary_embeddings[entry] = generate_embedding(entry)
    # Save the completed embeddings so later runs can skip the model pass.
    with open(embeddings_cache_path, 'wb') as f:
        pickle.dump(dictionary_embeddings, f)
# Read the raw descriptions that have to be matched against the dictionary.
input_file_path = 'raw/food-forward-2023-raw-data - food-forward-2023-raw-data.csv'
df = pd.read_csv(input_file_path)
input_words = df['description'].tolist()

similarity_measure = cosine_similarity

# Match every input and record one result row per input, in input order.
# Nothing is printed per row on success so the progress bar stays on one line.
results = []
for input_word in tqdm(input_words, desc="Matching Words"):
    try:
        matched_entry = match_word(input_word, dictionary, similarity_measure)
    except Exception as e:
        # Best effort: one bad input must not abort the whole run. Report the
        # error and record the input as unmatched.
        print("Error:", e)
        matched_entry = None

    # match_word yields (entry, score) on success and None otherwise.
    matched_word, score = matched_entry if matched_entry else (None, None)
    results.append({
        'input_word': input_word,
        'matched_word': matched_word,
        'score': score,
    })

df_results = pd.DataFrame(results)
csv_file_path = 'results/results.csv'
# Create the output directory if needed, so a missing folder cannot discard
# the results of the whole run at the final write.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
df_results.to_csv(csv_file_path, index=False)
|