# brightly-ai / old_experiments / llama3-gpu-compare.py
# (Hugging Face page residue preserved as comments so the file parses:)
# beweinreich's picture
# first
# 9189e38
# raw / history / blame
# No virus — 3.69 kB
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import re
from tqdm import tqdm
# Pick the compute device: Apple-silicon GPU (MPS) when available, else CPU.
has_mps = torch.backends.mps.is_available()
device = torch.device('mps') if has_mps else torch.device('cpu')
print("Device:", device)

# Pull the Llama-3 8B weights and tokenizer from the Hugging Face hub and
# move the model (bfloat16 to halve memory) onto the selected device.
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)

# Candidate descriptions to match against: one per row of the CSV's
# 'description' column.
csv_file_path = '/Users/bw/Webstuff/btest/test/dictionary.csv'
df = pd.read_csv(csv_file_path)
dictionary = df['description'].tolist()

# Prompt template asking the model itself to rate how similar two texts are.
compare_prompt = "Compare the following two texts and rate their similarity on a scale from 0 to 1. Text 1: {} Text 2: {}. Similarity score: "
# Method to derive a sentence embedding directly from the causal LM
def generate_embedding(sentence):
    """Embed *sentence* as the mean of the model's output logits.

    The sentence is tokenized, run through the model with gradient
    tracking disabled, and the logits are averaged over the sequence
    dimension; the result is returned as a CPU tensor.
    """
    encoded = tokenizer(sentence, return_tensors='pt').to(device)
    with torch.no_grad():
        result = model(**encoded)
    return result.logits.mean(dim=1).squeeze().cpu()
# Method to get similarity score using Llama model's comprehension
def get_similarity_score(text1, text2):
input_text = compare_prompt.format(text1, text2)
inputs = tokenizer(input_text, return_tensors='pt').to(device)
with torch.no_grad():
outputs = model.generate(**inputs, max_new_tokens=50)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
try:
score = float(re.findall(r"\d+\.\d+", generated_text)[-1])
except:
score = 0.0
print(text1, text2, score)
return score
# Method to find the best match for the input word in the dictionary
def match_word(input_word, dictionary, threshold=0.7, raw_boost=0.1):
    """Find the dictionary description that best matches *input_word*.

    Parenthesised qualifiers such as "(12 lbs)" are stripped, the
    remaining words are used to pre-filter the dictionary, then each
    surviving candidate is scored by the model.

    Args:
        input_word: free-form product text, e.g. "Bananas (12 lbs)".
        dictionary: list of candidate description strings.
        threshold: minimum winning score required to return a match
            (default 0.7, the original hard-coded value).
        raw_boost: bonus added to entries containing "raw" when the
            query is a single word (default 0.1, as before).

    Returns:
        A ``(description, score)`` tuple for the best candidate whose
        score exceeds *threshold*, or ``None`` when nothing qualifies.
    """
    # Remove text in parentheses before matching.
    input_word_clean = re.sub(r'\(.*?\)', '', input_word).strip()
    words = re.findall(r'\w+', input_word_clean.lower())
    # NOTE(review): this is a substring test, so "raw" also matches inside
    # "strawberry" — confirm partial-word matches are intended.
    filtered_dictionary = [
        desc for desc in dictionary
        if any(word in desc.lower() for word in words)
    ]
    print(f"Filtered dictionary size: {len(filtered_dictionary)}")

    similarities = []
    for entry in tqdm(filtered_dictionary, desc="Processing Entries"):
        score = get_similarity_score(input_word_clean, entry)
        if 'raw' in entry.lower() and len(words) == 1:
            score += raw_boost  # prefer the raw variant for single-word queries
        similarities.append((entry, score))

    if not similarities:
        return None
    best_match = max(similarities, key=lambda x: x[1])
    return best_match if best_match[1] > threshold else None
# Example usage: run the matcher over a batch of grocery-style queries and
# report the winning dictionary entry (if any) for each one.
input_words = [
    "Pepper - Habanero Pepper", "Bananas (12 lbs)", "Squash - Yellow Squash", "Cauliflower",
    "Squash mix italian/yellow (30 lbs)", "Tomato - Roma Tomato", "Tomato - Grape Tomato",
    "Squash - Mexican Squash", "Pepper - Bell Pepper", "Squash - Italian Squash",
    "Pepper - Red Fresno Pepper", "Tomato - Cherry Tomato", "Pepper - Serrano Pepper",
    "Kale ( 5 lbs)", "Tomato - Beefsteak Tomato", "Pepper - Anaheim Pepper",
    "Banana - Burro Banana", "Squash - Butternut Squash", "Apricot ( 10 lbs)",
    "Squash - Acorn Squash", "Tomato - Heirloom Tomato", "Pepper - Pasilla Pepper",
    "Pepper - Jalapeno Pepper", "carrot (10 lbs )"
]

for query in tqdm(input_words, desc="Matching Words"):
    print("Input word:", query)
    result = match_word(query, dictionary)
    if result is None:
        print("Matched entry: None")
    else:
        print("Matched entry:", result[0])
        print("Similarity score:", result[1])
    print()