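// Runs a web search for the latest user message: either uses the assistant's
// pinned links or generates a search query, scrapes the result pages, ranks
// text chunks by embedding similarity to the prompt, and streams progress
// updates to the client while building the returned WebSearch object.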
import { searchWeb } from "$lib/server/websearch/searchWeb";
import { generateQuery } from "$lib/server/websearch/generateQuery";
import { parseWeb } from "$lib/server/websearch/parseWeb";
import { chunk } from "$lib/utils/chunk";
import { findSimilarSentences } from "$lib/server/sentenceSimilarity";
import { getWebSearchProvider } from "./searchWeb";
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
import { WEBSEARCH_ALLOWLIST, WEBSEARCH_BLOCKLIST, ENABLE_LOCAL_FETCH } from "$env/static/private";

import type { Conversation } from "$lib/types/Conversation";
import type { MessageUpdate } from "$lib/types/MessageUpdate";
import type { Message } from "$lib/types/Message";
import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch";
import type { Assistant } from "$lib/types/Assistant";

import { z } from "zod";
import JSON5 from "json5";
import { isURLLocal } from "../isURLLocal";

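// hard caps on how many result pages get scraped and how many get embedded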
const MAX_N_PAGES_SCRAPE = 10 as const;
const MAX_N_PAGES_EMBED = 5 as const;

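// global allow/block lists come from env vars as JSON5 arrays of domain strings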
const listSchema = z.array(z.string()).default([]);

const allowList = listSchema.parse(JSON5.parse(WEBSEARCH_ALLOWLIST));
const blockList = listSchema.parse(JSON5.parse(WEBSEARCH_BLOCKLIST));

export async function runWebSearch(
	conv: Conversation,
	messages: Message[],
	updatePad: (upd: MessageUpdate) => void,
	ragSettings?: Assistant["rag"]
) {
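	// the content of the most recent message labels the search and is later
	// used to rank scraped chunks by similarity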
	const prompt = messages[messages.length - 1].content;
	const webSearch: WebSearch = {
		prompt,
		searchQuery: "",
		results: [],
		contextSources: [],
		createdAt: new Date(),
		updatedAt: new Date(),
	};

	function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
		updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
	}

	try {
		// if the assistant specified direct links, skip the websearch
		if (ragSettings && ragSettings.allowedLinks.length > 0) {
			appendUpdate("Using links specified in Assistant");

			let linksToUse = [...ragSettings.allowedLinks];

			if (ENABLE_LOCAL_FETCH !== "true") {
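				// probe each link: URLs that resolve to local addresses (or fail to
				// parse) are marked true and filtered out below, guarding local fetches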
				const localLinks = await Promise.all(
					linksToUse.map(async (link) => {
						try {
							const url = new URL(link);
							return await isURLLocal(url);
						} catch (e) {
							return true;
						}
					})
				);

				linksToUse = linksToUse.filter((_, index) => !localLinks[index]);
			}

			webSearch.results = linksToUse.map((link) => {
				return { link, hostname: new URL(link).hostname, title: "", text: "" };
			});
		} else {
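			// no pinned links: derive a standalone search query from the conversation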
			webSearch.searchQuery = await generateQuery(messages);
			const searchProvider = getWebSearchProvider();
			appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);

			let filters = "";
			if (ragSettings && ragSettings.allowedDomains.length > 0) {
				appendUpdate("Filtering on specified domains");
				filters += ragSettings.allowedDomains.map((item) => "site:" + item).join(" OR ");
			}

			// append the global allow/block lists; the leading space keeps them
			// separated from any per-assistant domain filters added above
			filters +=
				" " +
				allowList.map((item) => "site:" + item).join(" OR ") +
				" " +
				blockList.map((item) => "-site:" + item).join(" ");

			webSearch.searchQuery = (filters + " " + webSearch.searchQuery).trim();

			const results = await searchWeb(webSearch.searchQuery);
			webSearch.results =
				(results.organic_results &&
					results.organic_results.map((el: { title?: string; link: string; text?: string }) => {
						try {
							const { title, link, text } = el;
							const { hostname } = new URL(link);
							return { title, link, hostname, text };
						} catch (e) {
							// skip results whose link is not a valid URL
							return null;
						}
					})) ??
				[];
		}

		webSearch.results = webSearch.results
			.filter((value) => value !== null) // drop results whose URL failed to parse above
			.filter(({ link }) => !blockList.some((el) => link.includes(el))) // drop blocklisted links
			.slice(0, MAX_N_PAGES_SCRAPE); // keep at most MAX_N_PAGES_SCRAPE pages to scrape

		// pick the embedding model for this conversation, falling back to the default
		const embeddingModel =
			embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;

		if (!embeddingModel) {
			throw new Error(`Embedding model ${conv.embeddingModel} is no longer available`);
		}

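		// scrape each result page (unless the search already returned text) and
		// split it into chunks sized for the embedding model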
		let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
		if (webSearch.results.length > 0) {
			appendUpdate("Browsing results");
			const promises = webSearch.results.map(async (result) => {
				const { link } = result;
				let text = result.text ?? "";
				if (!text) {
					try {
						text = await parseWeb(link);
						appendUpdate("Browsing webpage", [link]);
					} catch (e) {
						appendUpdate("Failed to parse webpage", [(e as Error).message, link], "error");
						// ignore errors
					}
				}
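				// cap chunks per page so one very long page cannot dominate the context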
				const MAX_N_CHUNKS = 100;
				const texts = chunk(text, embeddingModel.chunkCharLength).slice(0, MAX_N_CHUNKS);
				return texts.map((t) => ({ source: result, text: t }));
			});
			const nestedParagraphChunks = (await Promise.all(promises)).slice(0, MAX_N_PAGES_EMBED);
			paragraphChunks = nestedParagraphChunks.flat();
			if (!paragraphChunks.length) {
				throw new Error("No text found on the first 5 results");
			}
		} else {
			throw new Error("No results found for this search query");
		}

		appendUpdate("Extracting relevant information");
		const topKClosestParagraphs = 8;
		const texts = paragraphChunks.map(({ text }) => text);
		const indices = await findSimilarSentences(embeddingModel, prompt, texts, {
			topK: topKClosestParagraphs,
		});

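		// group the selected chunks by source page so each link appears once,
		// carrying all of its context snippets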
		for (const idx of indices) {
			const { source } = paragraphChunks[idx];
			const contextWithId = { idx, text: texts[idx] };
			const usedSource = webSearch.contextSources.find((cSource) => cSource.link === source.link);
			if (usedSource) {
				usedSource.context.push(contextWithId);
			} else {
				webSearch.contextSources.push({ ...source, context: [contextWithId] });
			}
		}
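		// send the deduplicated sources (with their context) to the client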
		updatePad({
			type: "webSearch",
			messageType: "sources",
			message: "sources",
			sources: webSearch.contextSources,
		});
	} catch (searchError) {
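		// report the failure to the client; the partially filled webSearch
		// object is still returned below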
		if (searchError instanceof Error) {
			appendUpdate("An error occurred", [JSON.stringify(searchError.message)], "error");
		}
	}

	return webSearch;
}