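# NOTE(review): "Spaces: Paused / Paused" looks like page-header residue from a
# web export of this script, not part of the program itself.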
import pandas as pd | |
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM | |
import torch | |
import re | |
from tqdm import tqdm | |
# Use the MPS backend when torch reports it as available; otherwise fall back to CPU.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print("Device:", device)

# Load model and tokenizer (weights are loaded in bfloat16 and moved to `device`).
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16
).to(device)

# Load dictionary from CSV file
csv_file_path = '/Users/bw/Webstuff/btest/test/dictionary.csv'
df = pd.read_csv(csv_file_path)
# pandas reads blank cells as float NaN. Drop those and coerce the remaining
# values to str so every dictionary entry supports string operations such as
# .lower() when it is filtered and scored later.
dictionary = df['description'].dropna().astype(str).tolist()

# Instruction text that is prepended to every string before it is embedded.
prompt = "The text sometimes comes hyphenated, where the part before the hyphen is the general category, and the item after the hyphen is the more specific item. Please generate an embedding for the following text: "
# Build an embedding vector for a piece of text from the language model's output
def generate_embedding(sentence):
    """Return an embedding tensor for ``sentence``.

    The module-level ``prompt`` is prepended to ``sentence``, the result is
    tokenized and run through ``model`` with gradient tracking disabled, and
    the output logits are averaged over dim 1 (presumably the token axis --
    confirm against the model's output layout).

    Args:
        sentence: Text to embed.

    Returns:
        The averaged logits with size-1 dimensions squeezed out, moved to CPU.
    """
    encoded = tokenizer(prompt + sentence, return_tensors='pt').to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
        pooled = logits.mean(dim=1).squeeze().cpu()
    return pooled
# Cosine similarity between two embedding tensors
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two tensors as a Python float.

    The similarity is computed along dim 0, so both arguments are expected to
    be 1-D tensors of the same length.

    Args:
        embedding1: First embedding tensor.
        embedding2: Second embedding tensor.

    Returns:
        The similarity as a plain ``float`` (via ``Tensor.item()``).
    """
    similarity = torch.nn.functional.cosine_similarity(embedding1, embedding2, dim=0)
    return similarity.item()
# Custom scoring function
def custom_score(input_word, input_embedding, entry_embedding, entry_text, raw_boost=0.1):
    """Score a dictionary entry against the input text.

    The base score is the cosine similarity of the two embeddings. When the
    input consists of exactly one word and the entry text contains the
    standalone word "raw", ``raw_boost`` is added to the score.

    Args:
        input_word: The (cleaned) input text being matched.
        input_embedding: Embedding tensor of the input text.
        entry_embedding: Embedding tensor of the dictionary entry.
        entry_text: Text of the dictionary entry.
        raw_boost: Amount added to the score when the boost applies.

    Returns:
        The similarity score as a float, including the boost when it applies.
    """
    similarity_score = cosine_similarity(input_embedding, entry_embedding)

    is_single_word = len(re.findall(r'\w+', input_word.lower())) == 1
    # Match "raw" as a whole word. A plain substring test would also fire on
    # unrelated entries such as "strawberries" or "crawfish".
    mentions_raw = re.search(r'\braw\b', entry_text.lower()) is not None

    if is_single_word and mentions_raw:
        similarity_score += raw_boost
    return similarity_score
# Method to find the best match for the input word in the dictionary
def match_word(input_word, dictionary, threshold=0.7):
    """Find the dictionary entry that best matches ``input_word``.

    Text in parentheses (for example "(12 lbs)") is removed from the input.
    The dictionary is then narrowed to entries that contain at least one word
    of the cleaned input as a substring (case-insensitive), and each remaining
    entry is scored with ``custom_score``.

    Args:
        input_word: Raw input text.
        dictionary: Iterable of entry descriptions. Items that are not
            strings (such as NaN values from blank CSV cells) are skipped.
        threshold: Score the best candidate must exceed to be returned.

    Returns:
        An ``(entry, score)`` tuple for the highest-scoring candidate when its
        score is greater than ``threshold``; otherwise ``None``.
    """
    # Remove text in parentheses
    input_word_clean = re.sub(r'\(.*?\)', '', input_word).strip()
    words = re.findall(r'\w+', input_word_clean.lower())

    filtered_dictionary = []
    for desc in dictionary:
        if not isinstance(desc, str):
            continue  # non-text items cannot be matched and would break .lower()
        desc_lower = desc.lower()  # lowercase once per entry, not once per word
        if any(word in desc_lower for word in words):
            filtered_dictionary.append(desc)
    print(f"Filtered dictionary size: {len(filtered_dictionary)}")

    # Nothing to compare against: skip the model call for the input entirely.
    if not filtered_dictionary:
        return None

    input_embedding = generate_embedding(input_word_clean)

    best_match = None
    for entry in tqdm(filtered_dictionary, desc="Processing Entries"):
        entry_embedding = generate_embedding(entry)
        score = custom_score(input_word_clean, input_embedding, entry_embedding, entry)
        # Strict ">" keeps the earliest entry on ties, like max() would.
        if best_match is None or score > best_match[1]:
            best_match = (entry, score)

    return best_match if best_match[1] > threshold else None
# Example usage: match a fixed list of produce names against the dictionary
# and print the result for each one.
input_words = [
    "Pepper - Habanero Pepper",
    "Bananas (12 lbs)",
    "Squash - Yellow Squash",
    "Cauliflower",
    "Squash mix italian/yellow (30 lbs)",
    "Tomato - Roma Tomato",
    "Tomato - Grape Tomato",
    "Squash - Mexican Squash",
    "Pepper - Bell Pepper",
    "Squash - Italian Squash",
    "Pepper - Red Fresno Pepper",
    "Tomato - Cherry Tomato",
    "Pepper - Serrano Pepper",
    "Kale ( 5 lbs)",
    "Tomato - Beefsteak Tomato",
    "Pepper - Anaheim Pepper",
    "Banana - Burro Banana",
    "Squash - Butternut Squash",
    "Apricot ( 10 lbs)",
    "Squash - Acorn Squash",
    "Tomato - Heirloom Tomato",
    "Pepper - Pasilla Pepper",
    "Pepper - Jalapeno Pepper",
    "carrot (10 lbs )",
]

for input_word in tqdm(input_words, desc="Matching Words"):
    print("Input word:", input_word)
    matched_entry = match_word(input_word, dictionary)
    if not matched_entry:
        # match_word found no candidate above its score threshold.
        print("Matched entry: None")
    else:
        print("Matched entry:", matched_entry[0])
        print("Similarity score:", matched_entry[1])
    # Blank line between the reports for consecutive inputs.
    print()