File size: 4,394 Bytes
e943a05 3a01622 e943a05 4606755 3a01622 e943a05 e3af794 e943a05 d5559df e943a05 d5559df e943a05 4606755 e943a05 e3af794 e63c1d7 e943a05 e63c1d7 e943a05 e3af794 e943a05 3a01622 e943a05 4606755 e943a05 3a01622 e943a05 3a01622 e943a05 5071731 e943a05 bccd811 e943a05 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import { searchWeb } from "$lib/server/websearch/searchWeb";
import type { Message } from "$lib/types/Message";
import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch";
import { generateQuery } from "$lib/server/websearch/generateQuery";
import { parseWeb } from "$lib/server/websearch/parseWeb";
import { chunk } from "$lib/utils/chunk";
import { findSimilarSentences } from "$lib/server/sentenceSimilarity";
import type { Conversation } from "$lib/types/Conversation";
import type { MessageUpdate } from "$lib/types/MessageUpdate";
import { getWebSearchProvider } from "./searchWeb";
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
/** Upper bound on how many search results are kept for scraping. */
const MAX_N_PAGES_SCRAPE = 10;

/** Upper bound on how many scraped pages contribute chunks to the embedding step. */
const MAX_N_PAGES_EMBED = 5;

/** Domains that are excluded from the web search results. */
const DOMAIN_BLOCKLIST: string[] = ["youtube.com", "twitter.com"];
/**
 * Runs a web search for `prompt` and condenses the results into context text.
 *
 * Steps (progress is reported through `updatePad`):
 *  1. Generate a search query from the conversation messages plus the new prompt.
 *  2. Query the search provider; keep at most MAX_N_PAGES_SCRAPE results whose
 *     link parses as a URL and whose hostname is not blocklisted.
 *  3. Get text for every kept result (text supplied with the result, otherwise
 *     `parseWeb`) and split it into chunks of `embeddingModel.chunkCharLength`.
 *  4. Rank the chunks of the first MAX_N_PAGES_EMBED pages against the prompt with
 *     `findSimilarSentences`, then store the best ones in `webSearch.context` and
 *     their de-duplicated sources in `webSearch.contextSources`.
 *
 * Errors do not propagate to the caller: anything thrown inside the pipeline is
 * reported as an "error" update and the (partially filled) record is returned.
 *
 * @param conv - Conversation providing the previous messages and the embedding model id.
 * @param prompt - The new user prompt to search for.
 * @param updatePad - Callback that receives progress, error and sources updates.
 * @returns The web search record, filled in as far as the pipeline got.
 */
export async function runWebSearch(
	conv: Conversation,
	prompt: string,
	updatePad: (upd: MessageUpdate) => void
): Promise<WebSearch> {
	// Upper bound on the number of chunks taken from a single page.
	const MAX_N_CHUNKS = 100;
	// Number of best-matching chunks that make up the final context.
	const topKClosestParagraphs = 8;

	// The query is generated from the existing conversation plus the new prompt.
	const messages = [
		...conv.messages,
		{ content: prompt, from: "user", id: crypto.randomUUID() },
	] satisfies Message[];

	const webSearch: WebSearch = {
		prompt,
		searchQuery: "",
		results: [],
		context: "",
		contextSources: [],
		createdAt: new Date(),
		updatedAt: new Date(),
	};

	/** Sends a web-search status message; `type` defaults to "update". */
	function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
		updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
	}

	try {
		webSearch.searchQuery = await generateQuery(messages);
		const searchProvider = getWebSearchProvider();
		appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);
		const results = await searchWeb(webSearch.searchQuery);

		// Normalize the provider results. Entries whose link is not a valid URL, or
		// whose host is blocklisted, are mapped to null and removed just below.
		webSearch.results =
			results.organic_results?.map((el: { title?: string; link: string; text?: string }) => {
				try {
					const { title, link, text } = el;
					const { hostname } = new URL(link);
					// Match on the parsed hostname rather than the raw link, so a URL that
					// merely mentions a blocked domain in its path or query is kept.
					if (isBlocklistedHost(hostname)) {
						return null;
					}
					return { title, link, hostname, text };
				} catch {
					// `new URL` throws on malformed links; such results are skipped.
					return null;
				}
			}) ?? [];
		webSearch.results = webSearch.results
			.filter((value) => value !== null)
			.slice(0, MAX_N_PAGES_SCRAPE); // limit the number of pages that get scraped

		// Use the conversation's embedding model, falling back to the default one.
		const embeddingModel =
			embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
		if (!embeddingModel) {
			throw new Error(`Embedding model ${conv.embeddingModel} not available anymore`);
		}

		if (webSearch.results.length === 0) {
			throw new Error("No results found for this search query");
		}

		appendUpdate("Browsing results");
		const promises = webSearch.results.map(async (result) => {
			const { link } = result;
			let text = result.text ?? "";
			if (!text) {
				try {
					text = await parseWeb(link);
					appendUpdate("Browsing webpage", [link]);
				} catch {
					// Deliberate best effort: a page that cannot be fetched or parsed
					// keeps its empty text and therefore contributes no chunks.
				}
			}
			const texts = chunk(text, embeddingModel.chunkCharLength).slice(0, MAX_N_CHUNKS);
			return texts.map((t) => ({ source: result, text: t }));
		});

		// NOTE(review): every kept result is scraped, but only the first
		// MAX_N_PAGES_EMBED pages are used, even when some of them produced no text.
		// Confirm whether empty pages should be skipped before this cut.
		const nestedParagraphChunks = (await Promise.all(promises)).slice(0, MAX_N_PAGES_EMBED);
		const paragraphChunks: { source: WebSearchSource; text: string }[] =
			nestedParagraphChunks.flat();
		if (!paragraphChunks.length) {
			throw new Error("No text found on the first 5 results");
		}

		appendUpdate("Extracting relevant information");
		const texts = paragraphChunks.map(({ text }) => text);
		const indices = await findSimilarSentences(embeddingModel, prompt, texts, {
			topK: topKClosestParagraphs,
		});
		// The selected chunks are concatenated without any separator.
		webSearch.context = indices.map((idx) => texts[idx]).join("");

		// Record each source once, in the order its first selected chunk appears.
		const usedSources = new Set<string>();
		for (const idx of indices) {
			const { source } = paragraphChunks[idx];
			if (!usedSources.has(source.link)) {
				usedSources.add(source.link);
				webSearch.contextSources.push(source);
			}
		}
		updatePad({
			type: "webSearch",
			messageType: "sources",
			message: "sources",
			sources: webSearch.contextSources,
		});
	} catch (searchError) {
		// Report every failure, including thrown values that are not Error instances;
		// those used to be dropped silently, leaving the caller without any feedback.
		const reason = searchError instanceof Error ? searchError.message : String(searchError);
		appendUpdate("An error occurred", [JSON.stringify(reason)], "error");
	}

	return webSearch;
}

/** Returns true when `hostname` is a blocklisted domain or one of its subdomains. */
function isBlocklistedHost(hostname: string): boolean {
	return DOMAIN_BLOCKLIST.some(
		(domain) => hostname === domain || hostname.endsWith(`.${domain}`)
	);
}
|