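# Page residue from the hosting site's file viewer, kept as a comment so the
# module parses: beweinreich's picture | first | 9189e38 | raw | history blame |
# No virus | 2.95 kB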
import pandas as pd
from transformers import pipeline
import torch
import re
# Checkpoint used for every model call in this script. Defined once so the
# pipeline below cannot drift from the documented model id.
model_id = "meta-llama/Meta-Llama-3-8B"

# A text-generation pipeline is built only to get a matching tokenizer/model
# pair; generate_embedding() calls generator.tokenizer and generator.model
# directly rather than generating text.
generator = pipeline(
    "text-generation",
    model=model_id,
    # Load the weights in bfloat16.
    model_kwargs={"torch_dtype": torch.bfloat16},
    # -1 selects the CPU in the transformers pipeline API.
    device=-1,
)
# Load the candidate descriptions that inputs are matched against.
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
# Blank cells are read by pandas as NaN (a float). Drop them and coerce the
# remaining values to str so every entry supports the .lower() call made when
# match_word() filters the dictionary.
dictionary = df['description'].dropna().astype(str).tolist()
def generate_embedding(sentence):
    """Return a 1-D vector for *sentence* by mean-pooling the model's logits.

    The sentence is tokenized with ``generator.tokenizer``, run through
    ``generator.model`` without gradient tracking, and the resulting logits
    are averaged over dim 1 (presumably the token axis of a
    ``(batch, seq_len, vocab_size)`` tensor -- confirm against the model's
    output type). Note that this pools output logits, not hidden states.

    Args:
        sentence: A single string to encode.

    Returns:
        A float32 ``torch.Tensor`` with the leading batch dimension removed.
    """
    input_ids = generator.tokenizer(sentence, return_tensors='pt')['input_ids']
    # Keep the ids on the same device as the weights so the forward pass does
    # not fail with a device mismatch if the pipeline is moved off the CPU.
    input_ids = input_ids.to(generator.model.device)

    with torch.no_grad():
        outputs = generator.model(input_ids)

    # The pipeline loads the model in bfloat16; upcast before pooling so the
    # cosine similarities computed from this vector are not quantised to
    # bfloat16 precision.
    pooled = outputs.logits.float().mean(dim=1)

    # Drop only the batch axis (size 1 for a single sentence).
    return pooled.squeeze(0)
def match_word(input_word, dictionary, threshold=0.7):
    """Find the dictionary entry most similar to *input_word*.

    The dictionary is first narrowed to entries that contain at least one of
    the input's ``\\w+`` tokens (case-insensitive substring test). Each
    surviving entry is then scored against the input by cosine similarity of
    the vectors returned by ``generate_embedding``.

    Args:
        input_word: Free-text label to look up.
        dictionary: Iterable of candidate descriptions. Entries that are not
            strings (e.g. NaN from a blank CSV cell) are ignored.
        threshold: Minimum similarity, exclusive, for a match to be reported.
            Defaults to 0.7.

    Returns:
        A ``(entry, similarity_score)`` tuple for the best-scoring entry, or
        ``None`` when no entry passes the filter or the best score does not
        exceed *threshold*.
    """
    words = re.findall(r'\w+', input_word.lower())

    # NOTE(review): this is a substring test, so short tokens such as "lbs"
    # or "5" taken from the input also pull in unrelated entries.
    filtered_dictionary = []
    for desc in dictionary:
        if not isinstance(desc, str):
            continue
        lowered = desc.lower()  # lower-case once per entry, not once per word
        if any(word in lowered for word in words):
            filtered_dictionary.append(desc)
    print(f"Filtered dictionary size: {len(filtered_dictionary)}")

    # Nothing to compare against: skip the model call for the input entirely.
    if not filtered_dictionary:
        return None

    input_embedding = generate_embedding(input_word)
    similarities = [
        (
            entry,
            torch.nn.functional.cosine_similarity(
                input_embedding, generate_embedding(entry), dim=0
            ).item(),
        )
        for entry in filtered_dictionary
    ]

    # max() keeps the first entry on ties, i.e. dictionary order.
    best_match = max(similarities, key=lambda pair: pair[1])
    return best_match if best_match[1] > threshold else None
# Demo run: look up a batch of sample produce labels and report each result.
input_words = [
    "Pepper - Habanero Pepper",
    "Bananas (12 lbs)",
    "Squash - Yellow Squash",
    "Cauliflower",
    "Squash mix italian/yellow (30 lbs)",
    "Tomato - Roma Tomato",
    "Tomato - Grape Tomato",
    "Squash - Mexican Squash",
    "Pepper - Bell Pepper",
    "Squash - Italian Squash",
    "Pepper - Red Fresno Pepper",
    "Tomato - Cherry Tomato",
    "Pepper - Serrano Pepper",
    "Kale ( 5 lbs)",
    "Tomato - Beefsteak Tomato",
    "Pepper - Anaheim Pepper",
    "Banana - Burro Banana",
    "Squash - Butternut Squash",
    "Apricot ( 10 lbs)",
    "Squash - Acorn Squash",
    "Tomato - Heirloom Tomato",
    "Pepper - Pasilla Pepper",
    "Pepper - Jalapeno Pepper",
    "carrot (10 lbs )",
]

for input_word in input_words:
    # Run the lookup first so its own diagnostic output precedes the report.
    matched_entry = match_word(input_word, dictionary)
    print("Input word:", input_word)
    if not matched_entry:
        print("Matched entry: None")
    else:
        print("Matched entry:", matched_entry[0])
        print("Similarity score:", matched_entry[1])
    # Blank line between reports.
    print()