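"""Match free-text food descriptions against a reference dictionary.

Each description is embedded with a causal language model (mean-pooled
logits), then compared against pre-computed dictionary embeddings by cosine
similarity; matches above a fixed threshold are written to results/results.csv.
"""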
import os
import pickle
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re
from tqdm import tqdm

# Check if MPS is available
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

print("Device:", device)

# Load model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
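# Note: bfloat16 support on the MPS backend has been limited in some PyTorch
# releases; if loading or inference fails, torch.float16 (or float32 on CPU)
# is a reasonable fallback.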
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)

# Load dictionary from CSV file
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
dictionary = df['description'].tolist()

# Cosine Similarity
def cosine_similarity(embedding1, embedding2):
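    # dim=0 works because generate_embedding() returns squeezed 1-D tensors.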
    return torch.nn.functional.cosine_similarity(embedding1, embedding2, dim=0).item()

# Euclidean Distance
def euclidean_distance(embedding1, embedding2):
    return -torch.dist(embedding1, embedding2).item()  # Negative to keep similarity comparison consistent

# Generate a sentence embedding by mean-pooling the model's logits over tokens
def generate_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.logits.mean(dim=1).squeeze().cpu()
    return embeddings
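
# Mean-pooling the vocabulary logits is a crude sentence representation. A
# common alternative (a sketch, not what this script uses) is to mean-pool
# the last hidden layer instead:
#
#     outputs = model(**inputs, output_hidden_states=True)
#     embedding = outputs.hidden_states[-1].mean(dim=1).squeeze().cpu()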

# Find the best dictionary match for input_word; returns (entry, score) or None
def match_word(input_word, dictionary, similarity_measure):
    # Remove anything in parentheses, e.g. "(12 oz)"
    input_word_clean = re.sub(r'\(.*?\)', '', input_word).strip()

    # For inputs of the form "X - Y", keep only Y when X already appears
    # inside Y. split('-', 1) tolerates additional hyphens in the right term.
    if '-' in input_word_clean:
        left_term, right_term = map(str.strip, input_word_clean.split('-', 1))
        if left_term.lower() in right_term.lower():
            input_word_clean = right_term

    # Keep only dictionary entries sharing at least one word with the input,
    # so embeddings are not scored against the entire dictionary.
    words = re.findall(r'\w+', input_word_clean.lower())
    filtered_dictionary = [desc for desc in dictionary if any(word in desc.lower() for word in words)]

    input_embedding = generate_embedding(input_word_clean)
    similarities = []

    for entry in filtered_dictionary:
        # dictionary_embeddings is the module-level cache built below.
        entry_embedding = dictionary_embeddings[entry]
        similarity_score = similarity_measure(input_embedding, entry_embedding)
        similarities.append((entry, similarity_score))

    if similarities:
        best_match = max(similarities, key=lambda x: x[1])
        # The 0.7 cutoff assumes cosine similarity (scores in [-1, 1]); the
        # negated Euclidean distance is never positive, so it would always
        # fall below this threshold.
        return best_match if best_match[1] > 0.7 else None
    return None
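
# Hypothetical usage:
#     match_word("Apples - Gala (12 oz)", dictionary, cosine_similarity)
# strips "(12 oz)", keeps "Apples - Gala" (since "apples" is not a substring
# of "gala"), and returns the best (entry, score) pair, or None if no
# candidate scores above 0.7.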

# Check if the pickle file exists
if os.path.exists('dictionary_embeddings.pkl'):
    # Load the pre-computed embeddings from the pickle file
    with open('dictionary_embeddings.pkl', 'rb') as f:
        dictionary_embeddings = pickle.load(f)
else:
    # Generate embeddings for all entries in the dictionary
    dictionary_embeddings = {}
    for entry in tqdm(dictionary, desc="Generating Embeddings"):
        dictionary_embeddings[entry] = generate_embedding(entry)

    # Save the pre-computed embeddings to a pickle file
    with open('dictionary_embeddings.pkl', 'wb') as f:
        pickle.dump(dictionary_embeddings, f)
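
# Note: the cache is keyed only by file name, not by the dictionary contents
# or model ID, so delete dictionary_embeddings.pkl after changing either.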


input_file_path = 'raw/food-forward-2023-raw-data - food-forward-2023-raw-data.csv'
df = pd.read_csv(input_file_path)
input_words = df['description'].tolist()

similarity_measure = cosine_similarity
results = []
for input_word in tqdm(input_words, desc="Matching Words"):
    try:
        matched_entry = match_word(input_word, dictionary, similarity_measure)
        if matched_entry:
            results.append({
                'input_word': input_word,
                'matched_word': matched_entry[0],
                'score': matched_entry[1]
            })
        else:
            results.append({
                'input_word': input_word,
                'matched_word': None,
                'score': None
            })
    except Exception as e:
        # Record the failure as an unmatched row rather than aborting the run.
        print("Error:", e)
        results.append({
            'input_word': input_word,
            'matched_word': None,
            'score': None
        })

df_results = pd.DataFrame(results)
os.makedirs('results', exist_ok=True)  # to_csv does not create missing directories
results_csv_path = 'results/results.csv'
df_results.to_csv(results_csv_path, index=False)