Chris4K committed on
Commit
ce988dc
·
verified ·
1 Parent(s): a5f7e3b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -1
app.py CHANGED
@@ -146,6 +146,30 @@ def phonetic_match(text, query, method='levenshtein_distance'):
146
  return jellyfish.levenshtein_distance(text_phonetic, query_phonetic)
147
  return 0
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  # Custom Tokenizer
150
  def create_custom_tokenizer(file_path):
151
  with open(file_path, 'r', encoding='utf-8') as f:
@@ -396,7 +420,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
396
 
397
  # Custom embedding handling
398
  if use_custom_embedding:
399
- custom_model = create_custom_embedding(chunks)
400
  embedding_model = CustomEmbeddings(custom_model)
401
 
402
  # Optimizing vocabulary if required
 
146
  return jellyfish.levenshtein_distance(text_phonetic, query_phonetic)
147
  return 0
148
 
149
def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
    """Build a Word2Vec or FastText model from whitespace-tokenized texts.

    Args:
        texts: Iterable of strings; each one is split on whitespace and
            passed to the model as a single sentence.
        model_type: Either 'word2vec' or 'fasttext'.
        vector_size: Forwarded unchanged to the model constructor.
        window: Forwarded unchanged to the model constructor.
        min_count: Forwarded unchanged to the model constructor.

    Returns:
        The constructed Word2Vec or FastText model instance.

    Raises:
        ValueError: If model_type is neither 'word2vec' nor 'fasttext'.
    """
    # Tokenize first so that problems with `texts` surface before the
    # model type is validated.
    sentences = [document.split() for document in texts]

    # Pick the model class; both share the same constructor keywords here.
    if model_type == 'word2vec':
        model_cls = Word2Vec
    elif model_type == 'fasttext':
        model_cls = FastText
    else:
        raise ValueError("Unsupported model type")

    return model_cls(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=4,
    )
161
+
162
class CustomEmbeddings(HuggingFaceEmbeddings):
    """Embeddings adapter backed by a Word2Vec/FastText-style model.

    Each text is split on whitespace and represented by the mean of its
    token vectors, so every text maps to exactly one fixed-length vector.
    """

    def __init__(self, model_path):
        """Wrap an in-memory model or load one from disk.

        Args:
            model_path: Either an already-built model exposing a ``wv``
                attribute (such as the value returned by
                create_custom_embedding) or a path to hand to
                ``Word2Vec.load``.
        """
        if hasattr(model_path, "wv"):
            # Callers pass the freshly built model object, not a file path.
            model = model_path
        else:
            model = Word2Vec.load(model_path)  # or FastText.load() for FastText models
        # NOTE(review): the base-class __init__ is intentionally not called,
        # matching the original. If the base class is a pydantic model, plain
        # attribute assignment may be rejected, hence object.__setattr__ —
        # confirm against the installed HuggingFaceEmbeddings.
        object.__setattr__(self, "model", model)

    def _embed_text(self, text):
        """Return the mean token vector of ``text`` as a list of floats."""
        keyed_vectors = self.model.wv
        token_vectors = []
        for token in text.split():
            try:
                token_vectors.append(keyed_vectors[token])
            except KeyError:
                # Out-of-vocabulary token: it contributes nothing to the mean.
                continue
        if not token_vectors:
            # Empty text or no known tokens: fall back to a zero vector so
            # every result has the same length.
            return [0.0] * keyed_vectors.vector_size
        # Built-in sum() adds the arrays element-wise; no extra import needed.
        return (sum(token_vectors) / len(token_vectors)).tolist()

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per input text."""
        return [self._embed_text(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding (list of floats) for a single query text."""
        return self._embed_text(text)
171
+
172
+
173
  # Custom Tokenizer
174
  def create_custom_tokenizer(file_path):
175
  with open(file_path, 'r', encoding='utf-8') as f:
 
420
 
421
  # Custom embedding handling
422
  if use_custom_embedding:
423
+ custom_model = create_custom_embedding(chunks) # TODO: select the custom model by name; the name must come from the Gradio front end
424
  embedding_model = CustomEmbeddings(custom_model)
425
 
426
  # Optimizing vocabulary if required