|
import re |
|
import numpy as np |
|
import nltk |
|
from nltk.tokenize import word_tokenize |
|
from nltk.corpus import stopwords |
|
from nltk.stem import WordNetLemmatizer |
|
from sentence_transformers import SentenceTransformer |
|
|
|
# Fetch required NLTK data at import time; these are no-ops once downloaded.
# NOTE(review): newer NLTK releases may also require 'punkt_tab' for
# word_tokenize — confirm against the pinned nltk version.
nltk.download('stopwords')

nltk.download('punkt')

nltk.download('wordnet')

# English stopword set consumed by clean_text().
stop_words = set(stopwords.words('english'))

# WordNet lemmatizer consumed by clean_text().
lemmatizer = WordNetLemmatizer()

# General-purpose sentence embedding model.  NOTE(review): `model` is not
# referenced in this chunk — presumably used elsewhere in the file; verify.
model = SentenceTransformer('all-mpnet-base-v2')

# CLIP model used for both text and image embeddings below
# (512-dim output, matching the np.zeros((512,)) error fallbacks).
clip_model = SentenceTransformer('clip-ViT-B-32')
|
|
|
def clean_text(text):
    """Normalize raw text for downstream embedding/matching.

    Lowercases the input, strips every character that is not a lowercase
    letter or whitespace, tokenizes, drops English stopwords, lemmatizes
    the remaining tokens, and returns them space-joined.
    """
    lowered = text.lower()

    # Keep only a-z and whitespace; digits and punctuation are discarded.
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    tokens = word_tokenize(letters_only)

    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return ' '.join(kept)
|
|
|
def extract_order_id_from_query(text):
    """Return the first order ID of the form 'B-<digits>' in *text*, or None.

    The ID must stand on word boundaries, so substrings such as 'AB-12'
    do not match.
    """
    found = re.search(r'\bB-\d+\b', text)
    return found.group(0) if found else None
|
|
|
def generate_text_embedding(text):
    """Encode *text* with the module-level CLIP model.

    Returns the embedding as a numpy array.  On any failure the error is
    printed and a 512-dim zero vector is returned instead, so callers
    always receive a usable vector.
    """
    try:
        encoded = clip_model.encode(text, convert_to_tensor=True)
        return encoded.cpu().numpy()
    except Exception as e:
        # Best-effort: report and fall back to a zero vector rather than raise.
        print(f"Error processing text '{text}': {e}")
        return np.zeros((512,))
|
|
|
def generate_image_embedding(image_path):
    """Load the image at *image_path* and encode it with the CLIP model.

    Returns the embedding as a numpy array.  On any failure (missing file,
    unreadable image, encoder error) the error is printed and a 512-dim
    zero vector is returned, so callers always receive a usable vector.
    """
    # Fix: `Image` was used without ever being imported in this file, so the
    # original raised NameError on every call.  Pillow is a hard dependency
    # of the already-imported sentence-transformers package, so this adds no
    # new requirement.  (Consider hoisting to the top-level import block.)
    from PIL import Image

    try:
        # Fix: use a context manager so the underlying file handle is closed
        # instead of leaking; convert('RGB') yields an independent copy.
        with Image.open(image_path) as img:
            rgb_image = img.convert('RGB')
        image_embedding = clip_model.encode(rgb_image, convert_to_tensor=True)
        return image_embedding.cpu().numpy()
    except Exception as e:
        # Best-effort: report and fall back to a zero vector rather than raise.
        print(f"Error processing image from {image_path}: {e}")
        return np.zeros((512,))
|
|
|
def clear_chat():
    """Return a fresh, empty chat history list."""
    empty_history = []
    return empty_history
|
|
|
|
|
def undo_last_message(chatbot):
    """Drop the most recent entry from *chatbot* in place and return it.

    An empty (or falsy) history is returned unchanged.
    """
    if chatbot:
        del chatbot[-1]
    return chatbot