Mishig
committed on
Commit
•
3acc11d
1
Parent(s):
9960338
Make embedding model settings more future-proof (#454)
Browse files
src/lib/server/websearch/runWebSearch.ts
CHANGED
@@ -4,7 +4,10 @@ import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch";
|
|
4 |
import { generateQuery } from "$lib/server/websearch/generateQuery";
|
5 |
import { parseWeb } from "$lib/server/websearch/parseWeb";
|
6 |
import { chunk } from "$lib/utils/chunk";
|
7 |
-
import { findSimilarSentences } from "$lib/server/websearch/sentenceSimilarity";
|
|
|
|
|
|
|
8 |
import type { Conversation } from "$lib/types/Conversation";
|
9 |
import type { MessageUpdate } from "$lib/types/MessageUpdate";
|
10 |
|
@@ -62,7 +65,6 @@ export async function runWebSearch(
|
|
62 |
} catch (e) {
|
63 |
console.error(`Error parsing webpage "${link}"`, e);
|
64 |
}
|
65 |
-
const CHUNK_CAR_LEN = 512;
|
66 |
const MAX_N_CHUNKS = 100;
|
67 |
const texts = chunk(text, CHUNK_CAR_LEN).slice(0, MAX_N_CHUNKS);
|
68 |
return texts.map((t) => ({ source: result, text: t }));
|
|
|
4 |
import { generateQuery } from "$lib/server/websearch/generateQuery";
|
5 |
import { parseWeb } from "$lib/server/websearch/parseWeb";
|
6 |
import { chunk } from "$lib/utils/chunk";
|
7 |
+
import {
|
8 |
+
MAX_SEQ_LEN as CHUNK_CAR_LEN,
|
9 |
+
findSimilarSentences,
|
10 |
+
} from "$lib/server/websearch/sentenceSimilarity";
|
11 |
import type { Conversation } from "$lib/types/Conversation";
|
12 |
import type { MessageUpdate } from "$lib/types/MessageUpdate";
|
13 |
|
|
|
65 |
} catch (e) {
|
66 |
console.error(`Error parsing webpage "${link}"`, e);
|
67 |
}
|
|
|
68 |
const MAX_N_CHUNKS = 100;
|
69 |
const texts = chunk(text, CHUNK_CAR_LEN).slice(0, MAX_N_CHUNKS);
|
70 |
return texts.map((t) => ({ source: result, text: t }));
|
src/lib/server/websearch/sentenceSimilarity.ts
CHANGED
@@ -6,7 +6,10 @@ function innerProduct(tensor1: Tensor, tensor2: Tensor) {
|
|
6 |
return 1.0 - dot(tensor1.data, tensor2.data);
|
7 |
}
|
8 |
|
9 |
-
const
|
|
|
|
|
|
|
10 |
|
11 |
export async function findSimilarSentences(
|
12 |
query: string,
|
|
|
6 |
return 1.0 - dot(tensor1.data, tensor2.data);
|
7 |
}
|
8 |
|
9 |
+
const modelId = "Xenova/gte-small";
|
10 |
+
const extractor = await pipeline("feature-extraction", modelId);
|
11 |
+
// see https://huggingface.co/thenlper/gte-small/blob/d8e2604cadbeeda029847d19759d219e0ce2e6d8/README.md?code=true#L2625
|
12 |
+
export const MAX_SEQ_LEN = 512 as const;
|
13 |
|
14 |
export async function findSimilarSentences(
|
15 |
query: string,
|