Mishig
committed on
Update embedding model for WebSearch (#437)
Browse files
* Use `gte-base` as the embedding model
* use `bge-small-en-v1.5`
* Revert "use `bge-small-en-v1.5`"
This reverts commit 8cfe084d26884c130d6034afa095b7bbf0f7fa1b.
* Use `gte-small`
src/lib/server/websearch/sentenceSimilarity.ts
CHANGED
@@ -6,16 +6,14 @@ function innerProduct(tensor1: Tensor, tensor2: Tensor) {
|
|
6 |
return 1.0 - dot(tensor1.data, tensor2.data);
|
7 |
}
|
8 |
|
9 |
-
const extractor = await pipeline("feature-extraction", "Xenova/
|
10 |
|
11 |
export async function findSimilarSentences(
|
12 |
query: string,
|
13 |
sentences: string[],
|
14 |
{ topK = 5 }: { topK: number }
|
15 |
) {
|
16 |
-
|
17 |
-
// see more: https://huggingface.co/intfloat/e5-small-v2/blob/main/README.md?code=true#L2631
|
18 |
-
const input = [`query: ${query}`, ...sentences.map((s) => `passage: ${s}`)];
|
19 |
const output: Tensor = await extractor(input, { pooling: "mean", normalize: true });
|
20 |
|
21 |
const queryTensor: Tensor = output[0];
|
|
|
6 |
return 1.0 - dot(tensor1.data, tensor2.data);
|
7 |
}
|
8 |
|
9 |
+
const extractor = await pipeline("feature-extraction", "Xenova/gte-small");
|
10 |
|
11 |
export async function findSimilarSentences(
|
12 |
query: string,
|
13 |
sentences: string[],
|
14 |
{ topK = 5 }: { topK: number }
|
15 |
) {
|
16 |
+
const input = [query, ...sentences];
|
|
|
|
|
17 |
const output: Tensor = await extractor(input, { pooling: "mean", normalize: true });
|
18 |
|
19 |
const queryTensor: Tensor = output[0];
|