# Hosting-page residue captured with this file (not part of the program):
# beweinreich's picture | first | 9189e38 | raw | history blame | No virus | 2.34 kB
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import re
# Load pre-trained SBERT model
model = SentenceTransformer('all-mpnet-base-v2')

# Load dictionary from CSV file
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
# Empty CSV cells are read as NaN (a float); drop them and coerce the rest to
# str so later string operations on the entries (e.g. `.lower()`) cannot fail.
dictionary = df['description'].dropna().astype(str).tolist()
def match_word(input_word, dictionary, threshold=0.7):
    """Return the dictionary entry most similar to ``input_word``.

    The dictionary is first narrowed to entries that contain at least one
    word of the input as a case-insensitive substring. The remaining
    candidates are ranked by cosine similarity between their SBERT embedding
    and the embedding of the full input string.

    Args:
        input_word: Free-text item to look up, e.g. ``"Bananas (12 lbs)"``.
        dictionary: Iterable of candidate descriptions. Non-string entries
            (such as the NaN that pandas yields for empty CSV cells) are
            skipped instead of raising.
        threshold: Score the best candidate must strictly exceed to count
            as a match. Defaults to 0.7.

    Returns:
        A ``(entry, similarity_score)`` tuple for the best candidate, or
        ``None`` when no candidate survives the word filter or the best
        score does not exceed ``threshold``.
    """
    # Extract words from the input
    words = re.findall(r'\w+', input_word.lower())

    # Filter dictionary based on words; lower-case each entry only once.
    filtered_dictionary = []
    for desc in dictionary:
        if not isinstance(desc, str):
            continue
        lowered = desc.lower()
        if any(word in lowered for word in words):
            filtered_dictionary.append(desc)
    print(f"Filtered dictionary size: {len(filtered_dictionary)}")

    # Nothing to rank: skip the (expensive) embedding step entirely.
    if not filtered_dictionary:
        return None

    # Encode all candidates in one batch rather than one model call each.
    input_embedding = model.encode(input_word, convert_to_tensor=True)
    entry_embeddings = model.encode(filtered_dictionary, convert_to_tensor=True)

    # Result has shape (1, len(filtered_dictionary)); row 0 holds the scores.
    scores = util.pytorch_cos_sim(input_embedding, entry_embeddings)[0].tolist()

    # max() keeps the first entry on ties, like the original per-entry loop.
    best_match = max(zip(filtered_dictionary, scores), key=lambda pair: pair[1])
    return best_match if best_match[1] > threshold else None
# Example usage: look up a batch of sample produce names and report the
# best dictionary match (if any) for each one.
input_words = [
    "Pepper - Habanero Pepper",
    "Bananas (12 lbs)",
    "Squash - Yellow Squash",
    "Cauliflower",
    "Squash mix italian/yellow (30 lbs)",
    "Tomato - Roma Tomato",
    "Tomato - Grape Tomato",
    "Squash - Mexican Squash",
    "Pepper - Bell Pepper",
    "Squash - Italian Squash",
    "Pepper - Red Fresno Pepper",
    "Tomato - Cherry Tomato",
    "Pepper - Serrano Pepper",
    "Kale ( 5 lbs)",
    "Tomato - Beefsteak Tomato",
    "Pepper - Anaheim Pepper",
    "Banana - Burro Banana",
    "Squash - Butternut Squash",
    "Apricot ( 10 lbs)",
    "Squash - Acorn Squash",
    "Tomato - Heirloom Tomato",
    "Pepper - Pasilla Pepper",
    "Pepper - Jalapeno Pepper",
    "carrot (10 lbs )",
]

for input_word in input_words:
    print("Input word:", input_word)
    matched_entry = match_word(input_word, dictionary)
    if not matched_entry:
        print("Matched entry: None")
    else:
        # A successful match is an (entry, similarity_score) pair.
        print("Matched entry:", matched_entry[0])
        print("Similarity score:", matched_entry[1])
    # Blank line between the reports for consecutive inputs.
    print()