Spaces:
Paused
Paused
File size: 2,731 Bytes
9189e38 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
import re
# Load pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
# Load dictionary from CSV file
csv_file_path = './dictionary/dictionary.csv'
df = pd.read_csv(csv_file_path)
dictionary = df['description'].tolist()
# Method to compute cosine similarity between two embeddings
def compute_cosine_similarity(embedding1, embedding2):
return cosine_similarity(embedding1, embedding2)[0][0]
# Method to get RoBERTa embeddings
def get_roberta_embedding(text):
inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
outputs = model(**inputs)
return outputs.last_hidden_state.mean(dim=1).detach().numpy()
# Method to find the best match for the input word in the dictionary
def match_word(input_word, dictionary):
# Extract words from the input
words = re.findall(r'\w+', input_word.lower())
# Filter dictionary based on words
filtered_dictionary = [desc for desc in dictionary if any(word in desc.lower() for word in words)]
print(f"Filtered dictionary size: {len(filtered_dictionary)}")
# print(f"Filtered dictionary: {filtered_dictionary}")
# Proceed with RoBERTa embeddings and cosine similarity on the filtered dictionary
input_embedding = get_roberta_embedding(input_word)
similarities = []
for entry in filtered_dictionary:
entry_embedding = get_roberta_embedding(entry)
similarity_score = compute_cosine_similarity(input_embedding, entry_embedding)
similarities.append((entry, similarity_score))
if similarities:
best_match = max(similarities, key=lambda x: x[1])
return best_match[0] if best_match[1] > 0.7 else None
else:
return None
# Example usage
# Filtered dictionary size: 224
# Input word: Squash - Yellow Squash
# Matched entry: Squash, Indian, raw (Navajo)
# Filtered dictionary size: 37
# Input word: Cauliflower
# Matched entry: Cauliflower, raw
# Filtered dictionary size: 185
# Input word: Pepper - Habanero Pepper
# Matched entry: Stuffed green pepper, Puerto Rican style
# Filtered dictionary size: 414
# Input word: Bananas (12 lbs)
# Matched entry: Bananas, slightly ripe
# Filtered dictionary size: 784
# Input word: Squash mix italian/yellow (30 lbs )
# Matched entry: Squash, Indian, cooked, boiled (Navajo)
for input_word in input_words:
matched_entry = match_word(input_word, dictionary)
print("Input word:", input_word)
print("Matched entry:", matched_entry)
print()
|