More_Advanced_Embeddings_Comparator

Running

Chris4K committed on Oct 18, 2024

Commit

ce988dc

verified ·

1 Parent(s): a5f7e3b

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -146,6 +146,30 @@ def phonetic_match(text, query, method='levenshtein_distance'):
         return jellyfish.levenshtein_distance(text_phonetic, query_phonetic)
     return 0
 # Custom Tokenizer
 def create_custom_tokenizer(file_path):
     with open(file_path, 'r', encoding='utf-8') as f:
@@ -396,7 +420,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
         # Custom embedding handling
         if use_custom_embedding:
-            custom_model = create_custom_embedding(chunks)
             embedding_model = CustomEmbeddings(custom_model)
         # Optimizing vocabulary if required

         return jellyfish.levenshtein_distance(text_phonetic, query_phonetic)
     return 0
+def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
+    """Build a Word2Vec or FastText model from whitespace-tokenized texts.
+
+    Args:
+        texts: Iterable of strings; each one is split on whitespace and
+            used as a single sentence.
+        model_type: 'word2vec' or 'fasttext'.
+        vector_size: Forwarded unchanged to the model constructor.
+        window: Forwarded unchanged to the model constructor.
+        min_count: Forwarded unchanged to the model constructor.
+
+    Returns:
+        The constructed Word2Vec or FastText model object.
+
+    Raises:
+        ValueError: If model_type is neither 'word2vec' nor 'fasttext'.
+    """
+    # Tokenization happens before the model type is checked, as before
+    sentences = [doc.split() for doc in texts]
+    if model_type == 'word2vec':
+        model_cls = Word2Vec
+    elif model_type == 'fasttext':
+        model_cls = FastText
+    else:
+        raise ValueError("Unsupported model type")
+    return model_cls(
+        sentences=sentences,
+        vector_size=vector_size,
+        window=window,
+        min_count=min_count,
+        workers=4,
+    )
+class CustomEmbeddings(HuggingFaceEmbeddings):
+    """Embeddings adapter backed by a Word2Vec/FastText-style model.
+
+    Each text is embedded as the mean of the vectors of its whitespace
+    tokens, so every text maps to exactly one fixed-length vector.
+    """
+    # NOTE(review): the HuggingFaceEmbeddings initializer is not called here
+    # (unchanged from the original). Confirm the base class allows assigning
+    # ``self.model`` this way.
+    def __init__(self, model_path):
+        """Wrap an already-built model, or load one from disk.
+
+        Args:
+            model_path: Either a model object exposing ``.wv`` (such as the
+                return value of ``create_custom_embedding``) or a path that
+                ``Word2Vec.load`` accepts.
+        """
+        if hasattr(model_path, 'wv'):
+            # The visible call site passes the model object itself, not a path
+            self.model = model_path
+        else:
+            self.model = Word2Vec.load(model_path)  # or FastText.load() for FastText models
+    def _embed_text(self, text):
+        """Return the mean token vector of *text* as a list of floats."""
+        vectors = self.model.wv
+        # Drop tokens the model cannot look up instead of raising KeyError
+        tokens = [tok for tok in text.split() if tok in vectors]
+        if not tokens:
+            # Empty or fully out-of-vocabulary text: fall back to a zero vector
+            return [0.0] * vectors.vector_size
+        return vectors[tokens].mean(axis=0).tolist()
+    def embed_documents(self, texts):
+        """Return one mean-pooled vector (list of floats) per text."""
+        return [self._embed_text(text) for text in texts]
+    def embed_query(self, text):
+        """Return the mean-pooled vector (list of floats) for one query."""
+        return self._embed_text(text)
 # Custom Tokenizer
 def create_custom_tokenizer(file_path):
     with open(file_path, 'r', encoding='utf-8') as f:
         # Custom embedding handling
         if use_custom_embedding:
+            custom_model = create_custom_embedding(chunks) # TODO: add custom model by name; must come from the Gradio front end
             embedding_model = CustomEmbeddings(custom_model)
         # Optimizing vocabulary if required