# brightly-ai / old_experiments / experiment2.py
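"""
Experiment 2: match raw food descriptions against a reference dictionary by
embedding both with Meta-Llama-3-8B (mean of the output logits over the
sequence) and ranking dictionary entries by cosine similarity.
"""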
import os
import pickle
import re

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
# Use the Apple-silicon GPU (MPS) when available; otherwise fall back to CPU
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print("Device:", device)
# Load model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)
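# Assumption (not stated in the original script): in bfloat16 the 8B model
# needs on the order of 16 GB of memory; on machines where MPS bfloat16
# support is incomplete, torch.float16 (or float32 on CPU) is the usual
# fallback dtype.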
# Preprocess the dictionary words:
# - remove the qualifiers "raw" and "nfs" (not further specified)
# - if the word is comma-separated, drop the commas and reverse the order,
#   so "Tomato, Roma" becomes "Roma Tomato"
def preprocess_dictionary_word(text):
    # Lowercase and strip leading/trailing whitespace
    text = text.strip().lower()
    # Remove the qualifier "raw"
    text = text.replace(", raw", "").replace(" raw", "")
    # Remove the qualifier "nfs" (not further specified)
    text = text.replace(", nfs", "").replace(" nfs", "")
    # If the text contains a comma, reverse the comma-separated parts
    if ',' in text:
        parts = [part.strip() for part in text.split(',')]
        text = ' '.join(reversed(parts))
    return text
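# Spot checks (illustrative, mirroring the examples above):
#   preprocess_dictionary_word("Tomato, Roma")         -> "roma tomato"
#   preprocess_dictionary_word("Squash, Italian, raw") -> "italian squash"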
def generate_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the output logits across the sequence dimension
    embeddings = outputs.logits.mean(dim=1).squeeze().cpu()
    return embeddings
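# Note: the "embedding" here is the mean of the LM head's logits, not a hidden
# state. A common alternative (a sketch, not what this experiment ran) is to
# mean-pool the last hidden layer instead:
#
#   outputs = model(**inputs, output_hidden_states=True)
#   embedding = outputs.hidden_states[-1].mean(dim=1).squeeze().cpu()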
def cosine_similarity(embedding1, embedding2):
    # Scalar cosine similarity in [-1, 1] between two 1-D embeddings
    return torch.nn.functional.cosine_similarity(embedding1, embedding2, dim=0).item()
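# e.g. (illustrative, in the spirit of the scratch pad at the bottom):
#   cosine_similarity(generate_embedding("Eggplant"),
#                     generate_embedding("Eggplant, raw"))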
# Load the dictionary
csv_file_path = './dictionary/dictionary.csv'
df_dictionary = pd.read_csv(csv_file_path)
dictionary = df_dictionary['description'].astype(str).tolist()
# Load the input words
input_file_path = 'raw/food-forward-2023-raw-data - food-forward-2023-raw-data.csv'
df_input = pd.read_csv(input_file_path)
input_words = df_input['description'].astype(str).tolist()
# Load cached dictionary embeddings if they exist; otherwise generate and cache them
pickle_file_path = './dictionary_embeddings_llama.pkl'
if os.path.exists(pickle_file_path):
    with open(pickle_file_path, 'rb') as f:
        dictionary_embeddings = pickle.load(f)
else:
    # Generate embeddings for dictionary words
    dictionary_embeddings = {}
    for desc in tqdm(dictionary, desc="Generating embeddings for dictionary words"):
        dictionary_embeddings[desc] = generate_embedding(preprocess_dictionary_word(desc))
    # Save the embeddings to a pickle file
    with open(pickle_file_path, 'wb') as f:
        pickle.dump(dictionary_embeddings, f)
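# Note: the cache is keyed only by file path, so dictionary_embeddings_llama.pkl
# must be deleted by hand after the dictionary CSV or the preprocessing above
# changes, or stale embeddings will be reused.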
# Find the most similar dictionary word for each input word
results = []
for input_word in tqdm(input_words, desc="Processing input words"):
    if not isinstance(input_word, str) or not input_word:
        continue
    # Strip parenthesized qualifiers, e.g. "Apples (bagged)" -> "Apples"
    input_word_clean = re.sub(r'\(.*?\)', '', input_word).strip()
    print(f"Processing input word: {input_word}\nCleaned: {input_word_clean}")
    input_embedding = generate_embedding(input_word_clean)
    similarities = [(desc, cosine_similarity(input_embedding, dict_embedding))
                    for desc, dict_embedding in dictionary_embeddings.items()]
    most_similar_word, highest_score = max(similarities, key=lambda x: x[1])
    print(f"Most similar word: {most_similar_word}")
    # Confidence is 1 only when no other dictionary entry scores within 0.05
    # of the best match (the best match itself is always in high_similarities)
    high_similarities = [(desc, score) for desc, score in similarities if abs(score - highest_score) <= 0.05]
    high_similarities.sort(key=lambda x: x[1], reverse=True)
    confidence_score = 1 if len(high_similarities) <= 1 else 0
    similar_words = []
    if confidence_score == 0:
        similar_words = [desc for desc, score in high_similarities[:5]]  # Limit to the top 5 near-ties
    results.append((input_word, input_word_clean, most_similar_word, highest_score, confidence_score, similar_words))
# Print the results
for input_word, input_word_clean, most_similar_word, score, confidence, similar_words in results:
    print(f"Input word: {input_word}")
    print(f"Cleaned word: {input_word_clean}")
    print(f"Most similar word: {most_similar_word}")
    print(f"Similarity score: {score}")
    print(f"Confidence score: {confidence}")
    print(f"Similar words: {similar_words}\n")
# Export results to CSV (create the results directory if it doesn't exist yet)
output_file_path = './results/experiment2.csv'
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
df_results = pd.DataFrame(results, columns=['input_word', 'input_word_clean', 'match_word', 'similarity_score', 'confidence_score', 'similar_words'])
df_results.to_csv(output_file_path, index=False)
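# Later inspection of the near-tie cases (illustrative):
#   pd.read_csv('./results/experiment2.csv').query('confidence_score == 0')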
# Scratch pad: if several results land within 0.05 of each other (the
# near-tie threshold used above), all of them need to be considered.
# cosine_similarity(generate_embedding("Italian Squash"), generate_embedding("Squash, Italian, raw"))
# cosine_similarity(generate_embedding("Italian Squash"), generate_embedding("Italian Sausage"))
# cosine_similarity(generate_embedding("Tomato - Beefsteak Tomato"), generate_embedding("Beef with tomato-based sauce"))
# cosine_similarity(generate_embedding("Tomato - Beefsteak Tomato"), generate_embedding("Tomato, Roma"))
# cosine_similarity(generate_embedding("Tomato - Beefsteak Tomato"), generate_embedding("Tomato, raw"))
# cosine_similarity(generate_embedding("Eggplant"), generate_embedding("Eggplant dip"))
# cosine_similarity(generate_embedding("Eggplant"), generate_embedding("Eggplant,raw"))
# cosine_similarity(generate_embedding("Eggplant"), generate_embedding("Eggplant raw"))
# cosine_similarity(generate_embedding("Eggplant"), generate_embedding("raw Eggplant"))