Spaces:
Paused
Paused
import os | |
import pickle | |
import pandas as pd | |
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM | |
import torch | |
import re | |
from tqdm import tqdm | |
# Run on Apple's Metal backend (MPS) when PyTorch reports it as available;
# otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print("Device:", device)

# Load the tokenizer and the causal language model (weights requested as
# bfloat16), then move the model onto the selected device.
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)

# Read the reference dictionary: each value of the CSV's 'description'
# column becomes one candidate entry to match against.
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
dictionary = df['description'].tolist()
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a Python float.

    The similarity is computed along dimension 0 of both tensors and the
    resulting tensor is unwrapped with ``.item()``, so the inputs are
    expected to be 1-D vectors of equal length.
    """
    score = torch.nn.functional.cosine_similarity(
        embedding1,
        embedding2,
        dim=0,
    )
    return score.item()
def euclidean_distance(embedding1, embedding2):
    """Return the negated Euclidean distance between two embeddings.

    The sign is flipped so that, like a similarity score, a larger value
    means a closer match; the result is therefore never positive.
    """
    distance = torch.dist(embedding1, embedding2).item()
    return -distance
def generate_embedding(sentence):
    """Embed ``sentence`` by averaging the language model's output logits.

    The sentence is tokenized into PyTorch tensors on the module-level
    ``device`` and passed through the module-level ``model`` with gradient
    tracking disabled. The logits are averaged over dimension 1 (presumably
    the token axis -- confirm against the model's output layout), squeezed,
    and moved to the CPU.

    Args:
        sentence: Text to embed.

    Returns:
        The pooled logits as a CPU tensor.
    """
    encoded = tokenizer(sentence, return_tensors='pt').to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
        pooled = logits.mean(dim=1)
    return pooled.squeeze().cpu()
def match_word(input_word, dictionary, similarity_measure, threshold=0.7):
    """Find the dictionary entry that best matches ``input_word``.

    The input is cleaned (parenthesised text removed, redundant hyphen
    prefix dropped), the dictionary is narrowed to entries containing at
    least one word of the cleaned input, and the remaining candidates are
    ranked by ``similarity_measure`` against their pre-computed embeddings
    in the module-level ``dictionary_embeddings``.

    Args:
        input_word: Raw description to match. Non-string values (for
            example the NaN pandas yields for an empty CSV cell) never
            match.
        dictionary: Iterable of candidate description strings.
        similarity_measure: Callable taking two embeddings and returning a
            score where larger means more similar.
        threshold: Score the best candidate must exceed to count as a
            match. The default suits ``cosine_similarity``; a measure
            whose scores are never positive (such as a negated distance)
            needs a negative threshold, otherwise nothing can match.

    Returns:
        A ``(entry, score)`` tuple for the best candidate when its score is
        strictly greater than ``threshold``, otherwise ``None``.

    Raises:
        KeyError: If a candidate entry has no pre-computed embedding.
    """
    if not isinstance(input_word, str):
        return None

    # Remove anything in parentheses, e.g. "(12 oz)".
    input_word_clean = re.sub(r'\(.*?\)', '', input_word).strip()

    # For "apple - green apple" style inputs, where the text before the
    # hyphen is repeated inside the text after it, keep only the right-hand
    # side. Splitting on the first hyphen only keeps inputs with several
    # hyphens from failing to unpack.
    if '-' in input_word_clean:
        left_term, _, right_term = input_word_clean.partition('-')
        left_term = left_term.strip()
        right_term = right_term.strip()
        if left_term.lower() in right_term.lower():
            input_word_clean = right_term

    words = re.findall(r'\w+', input_word_clean.lower())
    if not words:
        # Nothing left to match on; skip the model forward pass.
        return None

    # Cheap substring pre-filter so only plausible entries are scored.
    candidates = []
    for desc in dictionary:
        desc_lower = desc.lower()
        if any(word in desc_lower for word in words):
            candidates.append(desc)
    if not candidates:
        return None

    input_embedding = generate_embedding(input_word_clean)
    scored = (
        (entry, similarity_measure(input_embedding, dictionary_embeddings[entry]))
        for entry in candidates
    )
    # max() keeps the first of several equally scored candidates.
    best_match = max(scored, key=lambda pair: pair[1])
    return best_match if best_match[1] > threshold else None
# Load the cached dictionary embeddings when a cache file exists; otherwise
# start from an empty cache.
# NOTE(security): pickle.load can execute arbitrary code embedded in the
# file, so only load a cache file that this script wrote itself.
embeddings_cache_path = 'dictionary_embeddings.pkl'
if os.path.exists(embeddings_cache_path):
    with open(embeddings_cache_path, 'rb') as f:
        dictionary_embeddings = pickle.load(f)
else:
    dictionary_embeddings = {}

# Embed every dictionary entry the cache does not cover yet. This builds the
# whole cache on a first run and also tops up a cache that was written for
# an older version of the dictionary, so later lookups by entry do not hit a
# missing key. dict.fromkeys de-duplicates while keeping the original order.
missing_entries = [
    entry
    for entry in dict.fromkeys(dictionary)
    if entry not in dictionary_embeddings
]
if missing_entries:
    for entry in tqdm(missing_entries, desc="Generating Embeddings"):
        dictionary_embeddings[entry] = generate_embedding(entry)
    # Persist the updated cache so the work is not repeated next run.
    with open(embeddings_cache_path, 'wb') as f:
        pickle.dump(dictionary_embeddings, f)
# Read the raw descriptions that need to be matched against the dictionary.
input_file_path = 'raw/food-forward-2023-raw-data - food-forward-2023-raw-data.csv'
df = pd.read_csv(input_file_path)
input_words = df['description'].tolist()

similarity_measure = cosine_similarity

# Create the output directory before the matching loop starts, so a missing
# folder cannot discard the results after all the model work is done.
csv_file_path = 'results/results.csv'
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

results = []
for input_word in tqdm(input_words, desc="Matching Words"):
    matched_word, score = None, None
    try:
        matched_entry = match_word(input_word, dictionary, similarity_measure)
    except Exception as e:
        # Best effort: one bad row must not abort the whole run. The row is
        # recorded as unmatched and the error is reported through
        # tqdm.write so the progress bar is not broken up.
        tqdm.write(f"Error: {e}")
    else:
        if matched_entry:
            matched_word, score = matched_entry
    results.append({
        'input_word': input_word,
        'matched_word': matched_word,
        'score': score,
    })

# Fixing the column order also guarantees a header row when there were no
# input rows at all.
df_results = pd.DataFrame(
    results,
    columns=['input_word', 'matched_word', 'score'],
)
df_results.to_csv(csv_file_path, index=False)