# Source: Hugging Face Spaces - madebybread / brightly-ai (status: Paused)
# File size: 5,404 Bytes

import os
import re
import pickle
import pandas as pd
import requests
from sentence_transformers import SentenceTransformer, util
from db.db_utils import get_connection, initialize_db, store_mapping_to_db
from food_nonfood import classify_as_food_nonfood
from utils import generate_embedding, clean_word, cosine_similarity, calculate_confidence_and_similar_words_str, generate_embedded_dictionary
from add_mappings_to_embeddings import run_mappings_to_embeddings

# Embedding model used for every similarity lookup in this module.
# Alternative model that was used previously:
#   'sentence-transformers/all-MiniLM-L6-v2'
model_name = 'sentence-transformers/all-mpnet-base-v2'

# The model id contains a '/', so flatten it before using it as a file name.
filename = '-'.join(model_name.split('/'))

# Pickle caches: one for the embedded dictionary, one for the category subset.
all_pickle_file_path = './embeddings/fast/' + filename + '.pkl'
category_pickle_file_path = './embeddings/fast/' + filename + '-categories.pkl'


class SimilarityFast:
    """Match free-text input words against embedded dictionary descriptions.

    On construction the sentence-transformer named by the module-level
    ``model_name`` is loaded, every ``description`` in the ``dictionary``
    table is read through the supplied cursor, and two embedding tables are
    loaded (or built and cached as pickle files): one for all descriptions
    and one for the rows with ``fdc_id >= 9999000``, which this class uses
    as its "categories" subset.
    """

    # Preparation-state keywords that influence the similarity score.
    _PREPARATION_STATES = ('cooked', 'frozen', 'canned', 'raw')

    # Whole-word patterns: plain substring tests would treat "strawberry" or
    # "prawn" as containing "raw", and "uncooked" as containing "cooked".
    _STATE_PATTERNS = {
        state: re.compile(r'\b' + state + r'\b')
        for state in _PREPARATION_STATES
    }

    # Multiplier applied once per state shared by the entry and the input.
    _MATCH_BOOST = 1.07
    # Multiplier applied once per cooked/frozen/canned state found only in
    # the entry when nothing was shared with the input.
    _MISMATCH_PENALTY = 0.95
    # Multiplier applied when the input mentions "raw" and nothing was shared.
    _RAW_INPUT_BOOST = 1.02

    def __init__(self, db_cursor):
        """Load the model and both embedding tables.

        Args:
            db_cursor: Cursor exposing ``execute`` and ``fetchall``; it is
                used to read the ``description`` column of ``dictionary``.
        """
        self.db_cursor = db_cursor
        self.model = SentenceTransformer(model_name)

        self.db_cursor.execute("SELECT description FROM dictionary")
        dictionary = [row[0] for row in self.db_cursor.fetchall()]

        self.db_cursor.execute("SELECT description FROM dictionary where fdc_id >= 9999000")
        categories = [row[0] for row in self.db_cursor.fetchall()]

        self.dictionary_embeddings = self.load_dictionary_embeddings(dictionary, all_pickle_file_path)
        self.category_embeddings = self.load_dictionary_embeddings(categories, category_pickle_file_path)

    def preprocess_dictionary_word(self, text):
        """Normalize a dictionary description before it is embedded.

        The text is stripped and lower-cased, the markers "raw" and "nfs"
        are removed, and comma-separated parts are reversed and joined with
        spaces (e.g. ``"Beef, ground, cooked"`` -> ``"cooked ground beef"``).

        Args:
            text: Dictionary description string.

        Returns:
            The normalized string.
        """
        text = text.strip().lower()
        text = text.replace(", raw", "").replace(" raw", "")
        text = text.replace(", nfs", "").replace(" nfs", "")
        if ',' in text:
            parts = [part.strip() for part in text.split(',')]
            text = ' '.join(reversed(parts))
        # Strip again: empty comma-separated parts leave stray spaces.
        return text.strip()

    def load_dictionary_embeddings(self, data, file_path):
        """Return the embeddings cached at ``file_path``, building them if absent.

        Args:
            data: Descriptions to embed when no cache file exists.
            file_path: Location of the pickle cache.

        Returns:
            The embedding mapping, either unpickled from ``file_path`` or
            freshly generated and merged with the entries returned by
            ``run_mappings_to_embeddings``.
        """
        if os.path.exists(file_path):
            # SECURITY: pickle.load can execute arbitrary code; this is only
            # safe while the cache files come from a trusted location.
            with open(file_path, 'rb') as f:
                return pickle.load(f)

        dictionary_embeddings = generate_embedded_dictionary(data, self.model, self.preprocess_dictionary_word)

        # Create the cache directory on first run so the write cannot fail
        # merely because the folder has not been created yet.
        cache_dir = os.path.dirname(file_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(file_path, 'wb') as f:
            pickle.dump(dictionary_embeddings, f)

        # NOTE(review): the cache is written before the mapping entries are
        # merged. Presumably run_mappings_to_embeddings updates the pickle
        # itself; if it does not, cached loads will lack these entries.
        # Ordering kept as in the original; confirm before changing.
        new_entries = run_mappings_to_embeddings(self.model)
        dictionary_embeddings.update(new_entries)

        return dictionary_embeddings

    @classmethod
    def _preparation_states(cls, text):
        """Return the preparation-state keywords that occur in ``text`` as whole words."""
        return frozenset(
            state for state, pattern in cls._STATE_PATTERNS.items()
            if pattern.search(text)
        )

    @classmethod
    def _adjust_score(cls, similarity_score, key_states, input_states):
        """Scale a raw similarity by how the preparation states line up.

        Args:
            similarity_score: Unadjusted similarity value.
            key_states: States found in the dictionary entry's key.
            input_states: States found in the cleaned input word.

        Returns:
            The adjusted score. It is not clamped and may exceed 1.
        """
        shared = key_states & input_states
        if shared:
            # One boost per state present on both sides.
            for _ in range(len(shared)):
                similarity_score *= cls._MATCH_BOOST
            return similarity_score

        # Nothing shared: penalize each cooked/frozen/canned marker that only
        # the entry carries ("raw" on the entry alone is not penalized).
        for _ in range(len(key_states - {'raw'})):
            similarity_score *= cls._MISMATCH_PENALTY

        # Slight boost for every such entry when the input mentions "raw".
        if 'raw' in input_states:
            similarity_score *= cls._RAW_INPUT_BOOST
        return similarity_score

    def calculate_similarity_score(self, input_word_clean, embeddings):
        """Score every entry against the input and pick the best match.

        Args:
            input_word_clean: Cleaned input word to embed and compare.
            embeddings: Iterable of ``(key, value)`` pairs where ``value['v']``
                is compared against the input embedding and ``value['d']`` is
                returned as the dictionary word (presumably the embedding
                vector and the original description, respectively).

        Returns:
            Tuple ``(most_similar_word, dictionary_word, highest_score,
            confidence_score, similar_words_str)``; ``highest_score`` is
            clamped to the range 0..1.

        Raises:
            ValueError: If ``embeddings`` yields no entries.
        """
        input_embedding = generate_embedding(self.model, input_word_clean)
        # The input's states are the same for every entry, so compute once.
        input_states = self._preparation_states(input_word_clean)

        similarities = []
        for key, val in embeddings:
            raw_score = cosine_similarity(input_embedding, val['v'])
            adjusted_score = self._adjust_score(
                raw_score,
                self._preparation_states(key.lower()),
                input_states,
            )
            similarities.append((key, val['d'], adjusted_score))

        if not similarities:
            raise ValueError("cannot calculate similarity: no embeddings to compare against")

        most_similar_word, dictionary_word, highest_score = max(similarities, key=lambda x: x[2])

        # The boosts can push a score above 1 and a cosine can be negative;
        # clamp the reported best score into 0..1.
        highest_score = max(0, min(1, highest_score))

        confidence_score, similar_words_str = calculate_confidence_and_similar_words_str(similarities, highest_score)

        return most_similar_word, dictionary_word, highest_score, confidence_score, similar_words_str

    def find_most_similar_word(self, input_word, only_categories=False):
        """Find the dictionary entry most similar to ``input_word``.

        Args:
            input_word: Raw input text.
            only_categories: When true, search only the category embeddings
                instead of the full dictionary embeddings.

        Returns:
            ``None`` if ``input_word`` is not a non-empty string; otherwise a
            dict with the keys ``input_word``, ``cleaned_word``,
            ``most_similar_word``, ``dictionary_word``, ``similarity_score``,
            ``confidence_score`` and ``similar_words``.
        """
        if not isinstance(input_word, str) or not input_word:
            return None

        source = self.category_embeddings if only_categories else self.dictionary_embeddings
        embeddings = source.items()

        input_word_clean = clean_word(input_word)
        (
            most_similar_word,
            dictionary_word,
            highest_score,
            confidence_score,
            similar_words_str,
        ) = self.calculate_similarity_score(input_word_clean, embeddings)

        return {
            'input_word': input_word,
            'cleaned_word': input_word_clean,
            'most_similar_word': most_similar_word,
            'dictionary_word': dictionary_word,
            'similarity_score': highest_score,
            'confidence_score': confidence_score,
            'similar_words': similar_words_str,
        }