Update app.py
app.py CHANGED
@@ -146,6 +146,30 @@ def phonetic_match(text, query, method='levenshtein_distance'):
         return jellyfish.levenshtein_distance(text_phonetic, query_phonetic)
     return 0
 
+def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
+    # Tokenize the texts
+    tokenized_texts = [text.split() for text in texts]
+
+    if model_type == 'word2vec':
+        model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
+    elif model_type == 'fasttext':
+        model = FastText(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
+    else:
+        raise ValueError("Unsupported model type")
+
+    return model
+
+class CustomEmbeddings(HuggingFaceEmbeddings):
+    def __init__(self, model_path):
+        self.model = Word2Vec.load(model_path)  # or FastText.load() for FastText models
+
+    def embed_documents(self, texts):
+        return [self.model.wv[text.split()] for text in texts]
+
+    def embed_query(self, text):
+        return self.model.wv[text.split()]
+
+
 # Custom Tokenizer
 def create_custom_tokenizer(file_path):
     with open(file_path, 'r', encoding='utf-8') as f:
@@ -396,7 +420,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
 
     # Custom embedding handling
     if use_custom_embedding:
-        custom_model = create_custom_embedding(chunks)
+        custom_model = create_custom_embedding(chunks)  # add custom model by name, must come from gradio FE
         embedding_model = CustomEmbeddings(custom_model)
 
     # Optimizing vocabulary if required
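Note on the added embedding classes: as committed, create_custom_embedding(chunks) returns an in-memory gensim model, while CustomEmbeddings.__init__ expects a saved model path (it calls Word2Vec.load), and embed_documents returns a matrix of per-word vectors for each chunk rather than a single vector per chunk. The sketch below shows one way these pieces could be reconciled; it is an illustration under assumptions, not part of this commit. PooledCustomEmbeddings is a hypothetical name; it takes the trained model object directly and mean-pools word vectors into one fixed-size vector per text.

# Sketch only: assumes the Word2Vec/FastText model object returned by
# create_custom_embedding() is passed in directly instead of a file path.
import numpy as np

class PooledCustomEmbeddings:  # hypothetical name, not in the commit
    def __init__(self, model):
        self.model = model  # trained gensim Word2Vec or FastText model

    def _pool(self, text):
        # Average the vectors of in-vocabulary tokens; fall back to zeros if none match.
        tokens = [t for t in text.split() if t in self.model.wv]
        if not tokens:
            return np.zeros(self.model.vector_size).tolist()
        return np.mean(self.model.wv[tokens], axis=0).tolist()

    def embed_documents(self, texts):
        # One fixed-size vector per chunk, as downstream vector stores expect.
        return [self._pool(text) for text in texts]

    def embed_query(self, text):
        return self._pool(text)

With a wrapper along these lines, the call site in the second hunk (custom_model = create_custom_embedding(chunks) followed by CustomEmbeddings(custom_model)) keeps its shape; only the constructor's expectation changes from a file path to the in-memory model.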