import { searchWeb } from "$lib/server/websearch/searchWeb";
import { generateQuery } from "$lib/server/websearch/generateQuery";
import { parseWeb } from "$lib/server/websearch/parseWeb";
import { chunk } from "$lib/utils/chunk";
import { findSimilarSentences } from "$lib/server/sentenceSimilarity";
import { getWebSearchProvider } from "./searchWeb";
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
import { WEBSEARCH_ALLOWLIST, WEBSEARCH_BLOCKLIST, ENABLE_LOCAL_FETCH } from "$env/static/private";

import type { Conversation } from "$lib/types/Conversation";
import type { MessageUpdate } from "$lib/types/MessageUpdate";
import type { Message } from "$lib/types/Message";
import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch";
import type { Assistant } from "$lib/types/Assistant";

import { z } from "zod";
import JSON5 from "json5";
import { isURLLocal } from "../isURLLocal";

/** Maximum number of search results that are kept for scraping. */
const MAX_N_PAGES_SCRAPE = 10 as const;
/** Maximum number of scraped pages whose chunks are passed to the embedding step. */
const MAX_N_PAGES_EMBED = 5 as const;

/** Schema for the allow/block list env vars: an array of strings. */
const listSchema = z.array(z.string()).default([]);

// Global site lists, parsed once at module load from JSON5-encoded env vars.
const allowList = listSchema.parse(JSON5.parse(WEBSEARCH_ALLOWLIST));
const blockList = listSchema.parse(JSON5.parse(WEBSEARCH_BLOCKLIST));

/**
 * Builds the search-operator prefix for a query.
 *
 * Allowed sites are combined into a single `site:a OR site:b` group and blocked
 * sites are appended as `-site:x` terms. Empty groups are omitted so the result
 * never contains leading, trailing or doubled separators.
 *
 * @param allowedSites - Sites the search should be restricted to.
 * @param blockedSites - Sites the search should exclude.
 * @returns The filter string, or `""` when both lists are empty.
 */
function buildSiteFilters(allowedSites: string[], blockedSites: string[]): string {
	const allowed = allowedSites.map((site) => "site:" + site).join(" OR ");
	const blocked = blockedSites.map((site) => "-site:" + site).join(" ");
	return [allowed, blocked].filter((part) => part.length > 0).join(" ");
}

/**
 * Runs a web search for the last message of a conversation and extracts the
 * paragraphs most similar to the prompt.
 *
 * Flow:
 * 1. If the assistant's RAG settings list direct links, those are used as results
 *    (local URLs are dropped unless `ENABLE_LOCAL_FETCH` is `"true"`; unparseable
 *    links are always dropped). Otherwise a query is generated from `messages`,
 *    prefixed with site filters, and sent to `searchWeb`.
 * 2. Results matching the block list are removed and at most
 *    `MAX_N_PAGES_SCRAPE` results are kept.
 * 3. Results without text are fetched with `parseWeb`; each page is chunked and
 *    the chunks of the first `MAX_N_PAGES_EMBED` pages are ranked with
 *    `findSimilarSentences`.
 * 4. The top-ranked chunks are grouped by source link into
 *    `webSearch.contextSources` and emitted through `updatePad`.
 *
 * Errors raised inside the search flow are reported through `updatePad` as
 * `"error"` updates instead of being rethrown; the (possibly partial)
 * `WebSearch` object is returned in every case.
 *
 * @param conv - Conversation; only `embeddingModel` is read here.
 * @param messages - Conversation messages; the last one supplies the prompt,
 *   so the array must be non-empty.
 * @param updatePad - Callback receiving progress, error and sources updates.
 * @param ragSettings - Optional assistant RAG settings (`allowedLinks`,
 *   `allowedDomains`).
 * @returns The populated `WebSearch` record.
 */
export async function runWebSearch(
	conv: Conversation,
	messages: Message[],
	updatePad: (upd: MessageUpdate) => void,
	ragSettings?: Assistant["rag"]
) {
	const prompt = messages[messages.length - 1].content;
	const webSearch: WebSearch = {
		prompt,
		searchQuery: "",
		results: [],
		contextSources: [],
		createdAt: new Date(),
		updatedAt: new Date(),
	};

	function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
		updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
	}

	try {
		// if the assistant specified direct links, skip the websearch
		if (ragSettings && ragSettings.allowedLinks.length > 0) {
			appendUpdate("Using links specified in Assistant");

			const candidateLinks = [...ragSettings.allowedLinks];

			// A link is usable when it parses as a URL and, unless local fetching is
			// enabled, does not resolve to a local address. Any failure (invalid URL,
			// failed locality check) marks the link as unusable rather than aborting
			// the whole search.
			const usable = await Promise.all(
				candidateLinks.map(async (link) => {
					try {
						const url = new URL(link);
						if (ENABLE_LOCAL_FETCH === "true") {
							return true;
						}
						return !(await isURLLocal(url));
					} catch (e) {
						return false;
					}
				})
			);
			const linksToUse = candidateLinks.filter((_, index) => usable[index]);

			webSearch.results = linksToUse.map((link) => {
				return { link, hostname: new URL(link).hostname, title: "", text: "" };
			});
		} else {
			webSearch.searchQuery = await generateQuery(messages);
			const searchProvider = getWebSearchProvider();
			appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);

			const assistantDomains = ragSettings?.allowedDomains ?? [];
			if (assistantDomains.length > 0) {
				appendUpdate("Filtering on specified domains");
			}

			// Assistant domains and the global allow list form one OR group; the
			// global block list is appended as exclusions.
			const filters = buildSiteFilters([...assistantDomains, ...allowList], blockList);
			webSearch.searchQuery = [filters, webSearch.searchQuery]
				.filter((part) => part.length > 0)
				.join(" ");

			const results = await searchWeb(webSearch.searchQuery);
			webSearch.results =
				(results.organic_results &&
					results.organic_results.map((el: { title?: string; link: string; text?: string }) => {
						try {
							const { title, link, text } = el;
							const { hostname } = new URL(link);
							return { title, link, hostname, text };
						} catch (e) {
							// Results whose link is not a valid URL are dropped below.
							return null;
						}
					})) ??
				[];
		}

		webSearch.results = webSearch.results.filter((value) => value !== null);
		webSearch.results = webSearch.results
			.filter(({ link }) => !blockList.some((el) => link.includes(el))) // filter out blocklist links
			.slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only

		// fetch the model
		const embeddingModel =
			embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;

		if (!embeddingModel) {
			throw new Error(`Embedding model ${conv.embeddingModel} not available anymore`);
		}

		let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
		if (webSearch.results.length > 0) {
			appendUpdate("Browsing results");
			const promises = webSearch.results.map(async (result) => {
				const { link } = result;
				let text = result.text ?? "";
				if (!text) {
					try {
						text = await parseWeb(link);
						appendUpdate("Browsing webpage", [link]);
					} catch (e) {
						// A page that fails to parse contributes no chunks; the other
						// pages are still processed.
						const reason = e instanceof Error ? e.message : String(e);
						appendUpdate("Failed to parse webpage", [reason, link], "error");
					}
				}
				const MAX_N_CHUNKS = 100;
				const texts = chunk(text, embeddingModel.chunkCharLength).slice(0, MAX_N_CHUNKS);
				return texts.map((t) => ({ source: result, text: t }));
			});
			const nestedParagraphChunks = (await Promise.all(promises)).slice(0, MAX_N_PAGES_EMBED);
			paragraphChunks = nestedParagraphChunks.flat();
			if (!paragraphChunks.length) {
				throw new Error("No text found on the first 5 results");
			}
		} else {
			throw new Error("No results found for this search query");
		}

		appendUpdate("Extracting relevant information");
		const topKClosestParagraphs = 8;
		const texts = paragraphChunks.map(({ text }) => text);
		const indices = await findSimilarSentences(embeddingModel, prompt, texts, {
			topK: topKClosestParagraphs,
		});

		// Group the selected chunks by the page they came from.
		for (const idx of indices) {
			const { source } = paragraphChunks[idx];
			const contextWithId = { idx, text: texts[idx] };
			const usedSource = webSearch.contextSources.find((cSource) => cSource.link === source.link);
			if (usedSource) {
				usedSource.context.push(contextWithId);
			} else {
				webSearch.contextSources.push({ ...source, context: [contextWithId] });
			}
		}
		updatePad({
			type: "webSearch",
			messageType: "sources",
			message: "sources",
			sources: webSearch.contextSources,
		});
	} catch (searchError) {
		// Report every failure, including thrown values that are not Error instances.
		const reason = searchError instanceof Error ? searchError.message : String(searchError);
		appendUpdate("An error occurred", [JSON.stringify(reason)], "error");
	}

	return webSearch;
}