Spaces:

madebybread
/

brightly-ai

Paused

App Files Files Community

brightly-ai / old_experiments /llama3-gpu.py

beweinreich

first

9189e38 about 1 month ago

raw

history blame

No virus

4.47 kB

	import os
	import pickle
	import pandas as pd
	from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
	import torch
	import re
	from tqdm import tqdm

	# Pick the compute device: the MPS backend when PyTorch reports it as available, the CPU otherwise
	if torch.backends.mps.is_available():
	    device = torch.device('mps')
	else:
	    device = torch.device('cpu')
	print("Device:", device)

	# Load the tokenizer and the causal-LM weights (requested as bfloat16), then move the model to the device
	model_id = "meta-llama/Meta-Llama-3-8B"
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	model = AutoModelForCausalLM.from_pretrained(
	    model_id,
	    torch_dtype=torch.bfloat16,
	)
	model = model.to(device)

	# Read the reference dictionary; only its 'description' column is used as the list of candidates
	csv_file_path = './dictionary/dictionary.csv'
	df = pd.read_csv(csv_file_path)
	dictionary = df['description'].tolist()

	# Cosine Similarity
	def cosine_similarity(embedding1, embedding2):
	    """Return the cosine similarity of two embedding tensors as a Python float.

	    The similarity is taken along dim 0 and converted with ``.item()``, so
	    both arguments are expected to be 1-D tensors of the same length.

	    Both tensors are upcast to float32 first. The model in this script is
	    loaded in bfloat16, and a score computed and rounded in bfloat16 keeps
	    only about three significant digits, which makes many candidates tie.

	    Args:
	        embedding1: First embedding tensor.
	        embedding2: Second embedding tensor.

	    Returns:
	        The cosine similarity as a float; larger means more similar.
	    """
	    return torch.nn.functional.cosine_similarity(
	        embedding1.float(), embedding2.float(), dim=0
	    ).item()

	# Euclidean Distance
	def euclidean_distance(embedding1, embedding2):
	    """Return the negated Euclidean distance between two embedding tensors.

	    The distance is negated so that, like a similarity, a larger value means
	    a closer pair. The result is therefore never positive; a caller that
	    compares scores against a positive cut-off (such as 0.7) will reject
	    every candidate scored with this measure.

	    Both tensors are upcast to float32 first so the distance is not rounded
	    to bfloat16 precision when the embeddings come from a bfloat16 model.

	    Args:
	        embedding1: First embedding tensor.
	        embedding2: Second embedding tensor.

	    Returns:
	        ``-distance`` as a Python float.
	    """
	    return -torch.dist(embedding1.float(), embedding2.float()).item()

	# Method to generate an embedding by running the causal LM directly on the tokenized text
	def generate_embedding(sentence):
	    """Embed ``sentence`` as the model's logits averaged over dim 1.

	    The text is tokenized, moved to the module-level ``device`` and passed
	    through the module-level ``model`` with gradient tracking disabled.
	    No ``pipeline`` object is involved.

	    Args:
	        sentence: Text to embed.

	    Returns:
	        The averaged logits with size-1 dimensions squeezed out, moved to
	        the CPU. Presumably dim 1 is the token axis, which would make this
	        one vector per sentence - TODO confirm.
	    """
	    encoded = tokenizer(sentence, return_tensors='pt').to(device)
	    with torch.no_grad():
	        logits = model(**encoded).logits
	    pooled = logits.mean(dim=1)
	    return pooled.squeeze().cpu()

	# Method to find the best match for the input word in the dictionary
	def match_word(input_word, dictionary, similarity_measure, threshold=0.7):
	    """Find the dictionary entry most similar to ``input_word``.

	    The input is cleaned, the dictionary is pre-filtered to entries sharing
	    at least one word with it, and the survivors are scored against the
	    input's embedding using the pre-computed ``dictionary_embeddings``.

	    Args:
	        input_word: Raw description text to match.
	        dictionary: Iterable of candidate description strings. Every
	            candidate that passes the pre-filter must be a key of the
	            module-level ``dictionary_embeddings``.
	        similarity_measure: Callable taking two embeddings and returning a
	            score where larger means more similar.
	        threshold: Score the best candidate must strictly exceed. Defaults
	            to 0.7, the value that used to be hard-coded.

	    Returns:
	        ``(entry, score)`` for the best candidate, or ``None`` when no
	        candidate passes the pre-filter or the best score does not exceed
	        ``threshold``.

	    Raises:
	        KeyError: If a candidate has no entry in ``dictionary_embeddings``.
	    """
	    # Remove anything in parentheses (i.e. (12 oz))
	    input_word_clean = re.sub(r'\(.*?\)', '', input_word).strip()

	    # For "left - right" where left is contained in right, keep only right.
	    # partition() splits on the first hyphen only, so inputs with several
	    # hyphens no longer fail with "too many values to unpack".
	    left_term, hyphen, right_term = input_word_clean.partition('-')
	    if hyphen:
	        left_term, right_term = left_term.strip(), right_term.strip()
	        if left_term.lower() in right_term.lower():
	            input_word_clean = right_term

	    # Cheap pre-filter: keep entries containing at least one word of the input
	    words = re.findall(r'\w+', input_word_clean.lower())
	    filtered_dictionary = [
	        desc for desc in dictionary
	        if any(word in desc.lower() for word in words)
	    ]
	    if not filtered_dictionary:
	        # Nothing to compare against, so skip the model forward pass
	        return None

	    input_embedding = generate_embedding(input_word_clean)
	    # Score lazily against the pre-computed embeddings; max() keeps the
	    # first candidate on ties, as the list-based version did
	    scored_entries = (
	        (entry, similarity_measure(input_embedding, dictionary_embeddings[entry]))
	        for entry in filtered_dictionary
	    )
	    best_match = max(scored_entries, key=lambda pair: pair[1])
	    return best_match if best_match[1] > threshold else None

	# Load the cached {description: embedding} mapping when the pickle file exists
	embeddings_cache_exists = os.path.exists('dictionary_embeddings.pkl')
	if embeddings_cache_exists:
	    # NOTE: pickle.load can execute arbitrary code from the file; only use
	    # a cache file that this script wrote itself.
	    with open('dictionary_embeddings.pkl', 'rb') as f:
	        dictionary_embeddings = pickle.load(f)
	else:
	    dictionary_embeddings = {}

	# The cache may predate the current dictionary CSV. Embed every entry it is
	# missing (all of them on a first run) so later lookups by description
	# cannot fail with KeyError.
	missing_entries = [entry for entry in dictionary if entry not in dictionary_embeddings]
	for entry in tqdm(missing_entries, desc="Generating Embeddings"):
	    dictionary_embeddings[entry] = generate_embedding(entry)

	# Save the embeddings whenever the cache was created or extended
	if missing_entries or not embeddings_cache_exists:
	    with open('dictionary_embeddings.pkl', 'wb') as f:
	        pickle.dump(dictionary_embeddings, f)


	# Read the descriptions to be matched from the raw input CSV
	input_file_path = 'raw/food-forward-2023-raw-data - food-forward-2023-raw-data.csv'
	df = pd.read_csv(input_file_path)
	input_words = df['description'].tolist()

	# Create the output directory before the long matching loop, so a missing
	# folder cannot make the final write fail and lose every result
	csv_file_path = 'results/results.csv'
	os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

	similarity_measure = cosine_similarity
	results = []
	for input_word in tqdm(input_words, desc="Matching Words"):
	    try:
	        matched_entry = match_word(input_word, dictionary, similarity_measure)
	    except Exception as e:
	        # Best-effort run: report the failure and record the input as
	        # unmatched instead of aborting the whole batch
	        print("Error:", e)
	        matched_entry = None

	    # match_word returns (entry, score) or None; unmatched rows keep empty cells
	    matched_word, score = matched_entry if matched_entry else (None, None)
	    results.append({
	        'input_word': input_word,
	        'matched_word': matched_word,
	        'score': score,
	    })

	# One output row per input description, in input order
	df_results = pd.DataFrame(results)
	df_results.to_csv(csv_file_path, index=False)