fastlane / services /utils.py
hrguarinv's picture
Update services/utils.py
a9e0812 verified
raw
history blame
1.85 kB
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
# Fetch the NLTK resources used by clean_text() at import time.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource
# instead of the legacy pickled 'punkt' package; without it word_tokenize()
# raises LookupError. Both are fetched so old and new NLTK versions work.
nltk.download('punkt_tab')
nltk.download('wordnet')

# Shared, module-level NLP helpers (built once, reused by every call).
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Sentence-embedding models. `clip_model` backs the text/image embedding
# helpers in this module; `model` is not referenced in this file and is
# presumably consumed by importers of this module.
model = SentenceTransformer('all-mpnet-base-v2')
clip_model = SentenceTransformer('clip-ViT-B-32')
def clean_text(text):
    """Return a normalized version of *text* for downstream matching.

    The text is lowercased, every character other than ``a-z`` and
    whitespace is dropped, the remainder is tokenized, English stopwords
    are discarded, and each surviving token is lemmatized. The tokens are
    returned joined by single spaces.
    """
    lowered = text.lower()
    # Digits and punctuation are removed outright (not replaced by a space).
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept_tokens = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_tokens)
def extract_order_id_from_query(text):
    """Return the first order id of the form ``B-<digits>`` found in *text*.

    The id must sit on word boundaries and the ``B`` is case-sensitive.
    Returns ``None`` when no such id is present.
    """
    found = re.search(r'\bB-\d+\b', text)
    return found.group(0) if found else None
def generate_text_embedding(text):
    """Encode *text* with the module-level CLIP model and return a NumPy array.

    This is a best-effort helper: any exception raised while encoding is
    reported on stdout and a zero vector of length 512 is returned instead
    (presumably matching the CLIP embedding size -- confirm against the model).
    """
    try:
        encoded = clip_model.encode(text, convert_to_tensor=True)
        # Move to host memory before converting so the result is a plain array.
        result = encoded.cpu().numpy()
    except Exception as e:
        print(f"Error processing text '{text}': {e}")
        result = np.zeros((512,))
    return result
def generate_image_embedding(image_path):
    """Encode the image at *image_path* with the module-level CLIP model.

    The image is opened with Pillow, converted to RGB, and passed to
    ``clip_model.encode``; the embedding is returned as a NumPy array for
    easy storage.

    This is a best-effort helper that never raises: any failure (missing
    file, unreadable image, model error, Pillow not installed) is reported
    on stdout and a zero vector of length 512 is returned instead
    (presumably matching the CLIP embedding size -- confirm against the model).
    """
    try:
        # `Image` was never imported at file level, so every call used to hit
        # a NameError that the handler below swallowed, silently yielding a
        # zero vector for every image. Importing inside the try keeps the
        # "never raises" contract even if Pillow is unavailable.
        from PIL import Image

        # Image.open is lazy and holds the file handle open; the context
        # manager releases it once convert() has copied the pixel data.
        with Image.open(image_path) as source_image:
            rgb_image = source_image.convert('RGB')

        image_embedding = clip_model.encode(rgb_image, convert_to_tensor=True)
        return image_embedding.cpu().numpy()  # Convert to numpy array for easy storage
    except Exception as e:
        print(f"Error processing image from {image_path}: {e}")
        return np.zeros((512,))
def clear_chat():
    """Return a fresh, empty chat history."""
    empty_history = list()
    return empty_history
def undo_last_message(chatbot):
    """Drop the most recent entry from *chatbot* in place and return it.

    A falsy history (empty or ``None``) is returned unchanged.
    """
    if not chatbot:
        return chatbot
    chatbot.pop()
    return chatbot