# brightly-ai / old_experiments / llama3-gpu.py

import os
import pickle
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re
from tqdm import tqdm
# Check if MPS is available
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print("Device:", device)
# Load model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load the weights in bfloat16 to roughly halve memory versus float32
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)
# Load dictionary from CSV file
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
dictionary = df['description'].tolist()
# Cosine similarity between two 1-D embeddings
def cosine_similarity(embedding1, embedding2):
    return torch.nn.functional.cosine_similarity(embedding1, embedding2, dim=0).item()

# Euclidean distance, negated so that "larger means more similar" holds for both measures
def euclidean_distance(embedding1, embedding2):
    return -torch.dist(embedding1, embedding2).item()
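
# Quick sanity check (illustrative addition, not part of the original script):
# identical vectors should score 1.0 under cosine similarity and 0.0 under the
# negated euclidean distance.
_v = torch.ones(4)
assert abs(cosine_similarity(_v, _v) - 1.0) < 1e-6
assert euclidean_distance(_v, _v) == 0.0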
# Generate a sentence embedding by mean-pooling the model's output logits
def generate_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Average over the sequence dimension, then move to CPU for comparison/pickling
    embeddings = outputs.logits.mean(dim=1).squeeze().cpu()
    return embeddings
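
# A sketch of a more conventional alternative (an assumption, unused below):
# mean-pool the final hidden state rather than the vocabulary logits, which is
# the usual way to derive sentence embeddings from a causal LM.
def generate_embedding_hidden(sentence):
    inputs = tokenizer(sentence, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # hidden_states[-1] has shape (batch, seq_len, hidden_size)
    return outputs.hidden_states[-1].mean(dim=1).squeeze().cpu()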
# Find the best match for input_word among the dictionary descriptions
def match_word(input_word, dictionary, similarity_measure):
    # Remove anything in parentheses, e.g. "(12 oz)"
    input_word_clean = re.sub(r'\(.*?\)', '', input_word).strip()

    # For hyphenated terms where the left side repeats inside the right side
    # (e.g. "Apples - Gala Apples"), keep only the right side
    if '-' in input_word_clean:
        left_term, right_term = map(str.strip, input_word_clean.split('-', 1))
        if left_term.lower() in right_term.lower():
            input_word_clean = right_term

    # Pre-filter: keep only dictionary entries sharing at least one word with the input
    words = re.findall(r'\w+', input_word_clean.lower())
    filtered_dictionary = [desc for desc in dictionary if any(word in desc.lower() for word in words)]

    input_embedding = generate_embedding(input_word_clean)
    similarities = []
    for entry in filtered_dictionary:
        entry_embedding = dictionary_embeddings[entry]  # pre-computed below at module level
        similarity_score = similarity_measure(input_embedding, entry_embedding)
        similarities.append((entry, similarity_score))

    if similarities:
        best_match = max(similarities, key=lambda x: x[1])
        # The 0.7 cutoff assumes cosine similarity; it is not meaningful for the
        # negated euclidean distance
        return best_match if best_match[1] > 0.7 else None
    return None
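
# Example call (hypothetical values, shown for the shape of the result only):
#   match_word("Apples - Gala Apples (12 oz)", dictionary, cosine_similarity)
#   -> ("gala apples", 0.83) on a match, or None below the 0.7 cutoff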
# Load cached dictionary embeddings if present; otherwise compute and cache them
if os.path.exists('dictionary_embeddings.pkl'):
    with open('dictionary_embeddings.pkl', 'rb') as f:
        dictionary_embeddings = pickle.load(f)
else:
    dictionary_embeddings = {}
    for entry in tqdm(dictionary, desc="Generating Embeddings"):
        dictionary_embeddings[entry] = generate_embedding(entry)
    with open('dictionary_embeddings.pkl', 'wb') as f:
        pickle.dump(dictionary_embeddings, f)
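
# Hedged guard (an addition, assuming the cache should mirror dictionary.csv):
# the pickle is keyed only on file existence, so a changed CSV would otherwise
# silently pair inputs with stale embeddings.
if set(dictionary_embeddings) != set(dictionary):
    raise RuntimeError("dictionary_embeddings.pkl is stale; delete it and rerun")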
input_file_path = 'raw/food-forward-2023-raw-data - food-forward-2023-raw-data.csv'
df = pd.read_csv(input_file_path)
input_words = df['description'].tolist()
similarity_measure = cosine_similarity  # swap in euclidean_distance to experiment
results = []
# Match every input description against the dictionary
for input_word in tqdm(input_words, desc="Matching Words"):
    try:
        matched_entry = match_word(input_word, dictionary, similarity_measure)
        if matched_entry:
            results.append({
                'input_word': input_word,
                'matched_word': matched_entry[0],
                'score': matched_entry[1]
            })
        else:
            results.append({
                'input_word': input_word,
                'matched_word': None,
                'score': None
            })
    except Exception as e:
        # Record failures as non-matches rather than aborting the run
        print("Error:", e)
        results.append({
            'input_word': input_word,
            'matched_word': None,
            'score': None
        })
# Write the matches to CSV
df_results = pd.DataFrame(results)
csv_file_path = 'results/results.csv'
df_results.to_csv(csv_file_path, index=False)
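
# Brief run summary (added for convenience; not part of the original output)
matched_count = df_results['matched_word'].notna().sum()
print(f"Matched {matched_count}/{len(df_results)} inputs; results saved to {csv_file_path}")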