Mel Seto committed on
Commit
c443bc0
·
1 Parent(s): 456f698

update embedding model to multilingual

Browse files
src/retrieval/__init__.py CHANGED
@@ -1 +0,0 @@
1
- EMBEDDING_MODEL = "multilingual-e5-small"
 
 
src/retrieval/constants.py ADDED
@@ -0,0 +1 @@
 
 
1
+ EMBEDDING_MODEL = "intfloat/multilingual-e5-small"
src/retrieval/embed_corpus.py CHANGED
@@ -2,10 +2,13 @@ import json
2
  import numpy as np
3
  from sentence_transformers import SentenceTransformer
4
 
 
 
 
5
  INPUT_FILE = "data/idioms-and-definitions.json"
6
  EMBED_FILE = "data/idiom_embeddings.npy"
7
 
8
- embedder = SentenceTransformer("all-MiniLM-L6-v2")
9
 
10
  # Load idioms
11
  with open(INPUT_FILE, "r", encoding="utf-8") as f:
 
2
  import numpy as np
3
  from sentence_transformers import SentenceTransformer
4
 
5
+ from .constants import EMBEDDING_MODEL
6
+
7
+
8
  INPUT_FILE = "data/idioms-and-definitions.json"
9
  EMBED_FILE = "data/idiom_embeddings.npy"
10
 
11
+ embedder = SentenceTransformer(EMBEDDING_MODEL)
12
 
13
  # Load idioms
14
  with open(INPUT_FILE, "r", encoding="utf-8") as f:
src/retrieval/retriever.py CHANGED
@@ -4,7 +4,7 @@ import requests
4
  from sentence_transformers import SentenceTransformer
5
  import os
6
 
7
- from retrieval import EMBEDDING_MODEL
8
 
9
 
10
  # HF Dataset URL for the embeddings
 
4
  from sentence_transformers import SentenceTransformer
5
  import os
6
 
7
+ from .constants import EMBEDDING_MODEL
8
 
9
 
10
  # HF Dataset URL for the embeddings