// File: chat-ui/src/lib/server/websearch/runWebSearch.ts
// Last commit: "Move vars to dynamic, add metrics (#1085)" (98b1c51, unverified) by nsarrazin (HF staff)
import { searchWeb } from "$lib/server/websearch/searchWeb";
import { generateQuery } from "$lib/server/websearch/generateQuery";
import { parseWeb } from "$lib/server/websearch/parseWeb";
import { chunk } from "$lib/utils/chunk";
import { findSimilarSentences } from "$lib/server/sentenceSimilarity";
import { getWebSearchProvider } from "./searchWeb";
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
import { env } from "$env/dynamic/private";
import type { Conversation } from "$lib/types/Conversation";
import type { MessageUpdate } from "$lib/types/MessageUpdate";
import type { Message } from "$lib/types/Message";
import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch";
import type { Assistant } from "$lib/types/Assistant";
import { z } from "zod";
import JSON5 from "json5";
import { isURLLocal } from "../isURLLocal";
/** Upper bound on the number of search results kept for fetching and parsing. */
const MAX_N_PAGES_SCRAPE = 10 as const;
/** Upper bound on the number of fetched pages whose chunks are kept for embedding. */
const MAX_N_PAGES_EMBED = 5 as const;

const listSchema = z.array(z.string()).default([]);

/**
 * Parses a JSON5-encoded list of strings read from an environment variable.
 *
 * The variables come from the dynamic (runtime) environment, so they may be
 * unset or empty; both cases yield an empty list instead of making
 * `JSON5.parse` throw while the module is being loaded.
 *
 * @param raw - Raw value of the environment variable, if any.
 * @returns The parsed list of strings.
 * @throws If the value is non-empty but is not valid JSON5, or does not
 *   describe an array of strings.
 */
function parseEnvList(raw: string | undefined): string[] {
	return listSchema.parse(JSON5.parse(raw || "[]"));
}

/** Global list of domains that searches are restricted to (`site:` filters). */
const allowList = parseEnvList(env.WEBSEARCH_ALLOWLIST);
/** Global list of domains excluded from searches and from the result links. */
const blockList = parseEnvList(env.WEBSEARCH_BLOCKLIST);
/**
 * Runs the web-search pipeline for the last message of a conversation.
 *
 * Steps:
 *  1. Pick the pages to read: either the links pinned by the assistant
 *     (`ragSettings.allowedLinks`) or the organic results of a web search
 *     built from a generated query plus `site:` / `-site:` filters.
 *  2. Fetch and chunk the text of those pages.
 *  3. Select the chunks closest to the prompt with the embedding model and
 *     group them by source in `contextSources`.
 *
 * Progress, failures and the final sources are reported through `updatePad`.
 * Anything thrown inside the pipeline is reported as an error update instead
 * of being rethrown, so the returned object may be only partially filled.
 *
 * @param conv - Conversation; only `embeddingModel` is read here.
 * @param messages - Conversation messages. The content of the last one is the
 *   prompt, so the array must not be empty (it is read before the `try`).
 * @param updatePad - Callback receiving every progress/error/sources update.
 * @param ragSettings - Optional assistant settings restricting links/domains.
 * @returns The `WebSearch` record describing the query, results and sources.
 */
export async function runWebSearch(
	conv: Conversation,
	messages: Message[],
	updatePad: (upd: MessageUpdate) => void,
	ragSettings?: Assistant["rag"]
) {
	const MAX_N_CHUNKS = 100;
	const topKClosestParagraphs = 8;

	const prompt = messages[messages.length - 1].content;
	const webSearch: WebSearch = {
		prompt,
		searchQuery: "",
		results: [],
		contextSources: [],
		createdAt: new Date(),
		updatedAt: new Date(),
	};

	function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
		updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
	}

	try {
		// if the assistant specified direct links, skip the websearch
		if (ragSettings && ragSettings.allowedLinks.length > 0) {
			appendUpdate("Using links specified in Assistant");

			const allowLocalFetch = env.ENABLE_LOCAL_FETCH === "true";
			const candidates = await Promise.all(
				ragSettings.allowedLinks.map(async (link): Promise<WebSearchSource | null> => {
					try {
						const url = new URL(link);
						// Unless local fetches are enabled, links resolving to a local
						// address are dropped.
						if (!allowLocalFetch && (await isURLLocal(url))) {
							return null;
						}
						return { link, hostname: url.hostname, title: "", text: "" };
					} catch {
						// A malformed link (or a failed locality check) only discards
						// that link; it must not abort the whole search.
						return null;
					}
				})
			);
			webSearch.results = candidates.filter(
				(candidate): candidate is WebSearchSource => candidate !== null
			);
		} else {
			webSearch.searchQuery = await generateQuery(messages);
			const searchProvider = getWebSearchProvider();
			appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);

			// Each entry is one whitespace-free-standing group of search operators.
			const filterGroups: string[] = [];
			if (ragSettings && ragSettings.allowedDomains.length > 0) {
				appendUpdate("Filtering on specified domains");
				filterGroups.push(ragSettings.allowedDomains.map((item) => "site:" + item).join(" OR "));
			}
			// handle the global lists
			filterGroups.push(
				allowList.map((item) => "site:" + item).join(" OR "),
				blockList.map((item) => "-site:" + item).join(" ")
			);

			// Join the non-empty groups with single spaces so that adjacent groups
			// are never fused into one token (e.g. "site:a.comsite:b.com") and an
			// unfiltered query gets no leading whitespace.
			webSearch.searchQuery = [...filterGroups, webSearch.searchQuery]
				.filter((part) => part.length > 0)
				.join(" ");

			const results = await searchWeb(webSearch.searchQuery);
			webSearch.results =
				(results.organic_results &&
					results.organic_results.map((el: { title?: string; link: string; text?: string }) => {
						try {
							const { title, link, text } = el;
							const { hostname } = new URL(link);
							return { title, link, hostname, text };
						} catch (e) {
							// Results whose link is not a valid URL are discarded below.
							return null;
						}
					})) ??
				[];
		}

		webSearch.results = webSearch.results
			.filter((value) => value !== null)
			.filter(({ link }) => !blockList.some((el) => link.includes(el))) // filter out blocklist links
			.slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only

		// fetch the model
		const embeddingModel =
			embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
		if (!embeddingModel) {
			throw new Error(`Embedding model ${conv.embeddingModel} not available anymore`);
		}

		if (webSearch.results.length === 0) {
			throw new Error("No results found for this search query");
		}

		appendUpdate("Browsing results");
		const promises = webSearch.results.map(async (result) => {
			const { link } = result;
			// Some providers return the page text with the result; otherwise fetch it.
			let text = result.text ?? "";
			if (!text) {
				try {
					text = await parseWeb(link);
					appendUpdate("Browsing webpage", [link]);
				} catch (e) {
					// A page that cannot be parsed contributes no chunks.
					const reason = e instanceof Error ? e.message : String(e);
					appendUpdate("Failed to parse webpage", [reason, link], "error");
				}
			}
			const texts = chunk(text, embeddingModel.chunkCharLength).slice(0, MAX_N_CHUNKS);
			return texts.map((t) => ({ source: result, text: t }));
		});

		// Only the chunks of the first MAX_N_PAGES_EMBED results are kept.
		const nestedParagraphChunks = (await Promise.all(promises)).slice(0, MAX_N_PAGES_EMBED);
		const paragraphChunks: { source: WebSearchSource; text: string }[] =
			nestedParagraphChunks.flat();
		if (!paragraphChunks.length) {
			throw new Error("No text found on the first 5 results");
		}

		appendUpdate("Extracting relevant information");
		const texts = paragraphChunks.map(({ text }) => text);
		const indices = await findSimilarSentences(embeddingModel, prompt, texts, {
			topK: topKClosestParagraphs,
		});

		// Group the selected chunks by the page they came from.
		for (const idx of indices) {
			const { source } = paragraphChunks[idx];
			const contextWithId = { idx, text: texts[idx] };
			const usedSource = webSearch.contextSources.find((cSource) => cSource.link === source.link);
			if (usedSource) {
				usedSource.context.push(contextWithId);
			} else {
				webSearch.contextSources.push({ ...source, context: [contextWithId] });
			}
		}

		updatePad({
			type: "webSearch",
			messageType: "sources",
			message: "sources",
			sources: webSearch.contextSources,
		});
	} catch (searchError) {
		// Report every failure, including thrown values that are not Error instances.
		const reason = searchError instanceof Error ? searchError.message : String(searchError);
		appendUpdate("An error occurred", [JSON.stringify(reason)], "error");
	}

	return webSearch;
}