mrbean David Li Mishig committed on
Commit
4606755
1 Parent(s): 4e58dda

add you.com integration (#514)

Browse files

* add you.com integration

* [Feat] Add types (#1)

* feat: add types

* feat: specify search provider

* [Feat] add return values (#3)

* feat: add types

* feat: specify search provider

* add values for ui

* add idea to gitignore

* lint and downgrade

* Revert "lint and downgrade"

This reverts commit fbfe012cf4d6aaf5ea00508ca0f99a408ce152e6.

* Updates to you.com integration (#4)

* `npm run format`

* enable search when `YDC_API_KEY` is available

* typing nit

* enum WebSearchProvider

* eslint-disable-next-line no-shadow

* downgrade

* You.com already provides text snippets (#5)

* Order desc you.com results by text length

* You.com already provides texts from webpages

* add to env

* Update .env

Co-authored-by: Mishig <mishig.davaadorj@coloradocollege.edu>

* Update src/lib/server/websearch/searchWeb.ts

---------

Co-authored-by: David Li <david.li3100@gmail.com>
Co-authored-by: Mishig <mishig.davaadorj@coloradocollege.edu>

.env CHANGED
@@ -10,6 +10,7 @@ HF_ACCESS_TOKEN=#hf_<token> from https://huggingface.co/settings/token
10
  HF_API_ROOT=https://api-inference.huggingface.co/models
11
 
12
  # used to activate search with web functionality. disabled if none are defined. choose one of the following:
 
13
  SERPER_API_KEY=#your serper.dev api key here
14
  SERPAPI_KEY=#your serpapi key here
15
 
 
10
  HF_API_ROOT=https://api-inference.huggingface.co/models
11
 
12
  # used to activate search with web functionality. disabled if none are defined. choose one of the following:
13
+ YDC_API_KEY=#your docs.you.com api key here
14
  SERPER_API_KEY=#your serper.dev api key here
15
  SERPAPI_KEY=#your serpapi key here
16
 
.gitignore CHANGED
@@ -9,4 +9,5 @@ node_modules
9
  !.env.template
10
  vite.config.js.timestamp-*
11
  vite.config.ts.timestamp-*
12
- SECRET_CONFIG
 
 
9
  !.env.template
10
  vite.config.js.timestamp-*
11
  vite.config.ts.timestamp-*
12
+ SECRET_CONFIG
13
+ .idea
README.md CHANGED
@@ -76,8 +76,8 @@ npm run dev
76
 
77
  Chat UI features a powerful Web Search feature. It works by:
78
 
79
- 1. Generating an appropriate Google query from the user prompt.
80
- 2. Performing Google search and extracting content from webpages.
81
  3. Creating embeddings from texts using [transformers.js](https://huggingface.co/docs/transformers.js). Specifically, using [Xenova/gte-small](https://huggingface.co/Xenova/gte-small) model.
82
  4. From these embeddings, find the ones that are closest to the user query using vector similarity search. Specifically, we use `inner product` distance.
83
  5. Get the corresponding texts to those closest embeddings and perform [Retrieval-Augmented Generation](https://huggingface.co/papers/2005.11401) (i.e. expand user prompt by adding those texts so that a LLM can use this information).
@@ -122,7 +122,7 @@ PUBLIC_APP_DISCLAIMER=
122
 
123
  ### Web Search config
124
 
125
- You can enable the web search by adding either `SERPER_API_KEY` ([serper.dev](https://serper.dev/)) or `SERPAPI_KEY` ([serpapi.com](https://serpapi.com/)) to your `.env.local`.
126
 
127
  ### Custom models
128
 
@@ -209,7 +209,7 @@ The following is the default `webSearchQueryPromptTemplate`.
209
  ```prompt
210
  {{userMessageToken}}
211
  My question is: {{message.content}}.
212
- Based on the conversation history (my previous questions are: {{previousMessages}}), give me an appropriate query to answer my question for google search. You should not say more than query. You should not say any words except the query. For the context, today is {{currentDate}}
213
  {{userMessageEndToken}}
214
  {{assistantMessageToken}}
215
  ```
 
76
 
77
  Chat UI features a powerful Web Search feature. It works by:
78
 
79
+ 1. Generating an appropriate search query from the user prompt.
80
+ 2. Performing web search and extracting content from webpages.
81
  3. Creating embeddings from texts using [transformers.js](https://huggingface.co/docs/transformers.js). Specifically, using [Xenova/gte-small](https://huggingface.co/Xenova/gte-small) model.
82
  4. From these embeddings, find the ones that are closest to the user query using vector similarity search. Specifically, we use `inner product` distance.
83
  5. Get the corresponding texts to those closest embeddings and perform [Retrieval-Augmented Generation](https://huggingface.co/papers/2005.11401) (i.e. expand user prompt by adding those texts so that a LLM can use this information).
 
122
 
123
  ### Web Search config
124
 
125
+ You can enable the web search by adding any of `YDC_API_KEY` ([docs.you.com](https://docs.you.com)), `SERPER_API_KEY` ([serper.dev](https://serper.dev/)), or `SERPAPI_KEY` ([serpapi.com](https://serpapi.com/)) to your `.env.local`.
126
 
127
  ### Custom models
128
 
 
209
  ```prompt
210
  {{userMessageToken}}
211
  My question is: {{message.content}}.
212
+ Based on the conversation history (my previous questions are: {{previousMessages}}), give me an appropriate query to answer my question for web search. You should not say more than query. You should not say any words except the query. For the context, today is {{currentDate}}
213
  {{userMessageEndToken}}
214
  {{assistantMessageToken}}
215
  ```
src/lib/server/websearch/runWebSearch.ts CHANGED
@@ -10,6 +10,7 @@ import {
10
  } from "$lib/server/websearch/sentenceSimilarity";
11
  import type { Conversation } from "$lib/types/Conversation";
12
  import type { MessageUpdate } from "$lib/types/MessageUpdate";
 
13
 
14
  const MAX_N_PAGES_SCRAPE = 10 as const;
15
  const MAX_N_PAGES_EMBED = 5 as const;
@@ -39,14 +40,15 @@ export async function runWebSearch(
39
 
40
  try {
41
  webSearch.searchQuery = await generateQuery(messages);
42
- appendUpdate("Searching Google", [webSearch.searchQuery]);
 
43
  const results = await searchWeb(webSearch.searchQuery);
44
  webSearch.results =
45
  (results.organic_results &&
46
- results.organic_results.map((el: { title: string; link: string }) => {
47
- const { title, link } = el;
48
  const { hostname } = new URL(link);
49
- return { title, link, hostname };
50
  })) ??
51
  [];
52
  webSearch.results = webSearch.results
@@ -58,12 +60,14 @@ export async function runWebSearch(
58
  appendUpdate("Browsing results");
59
  const promises = webSearch.results.map(async (result) => {
60
  const { link } = result;
61
- let text = "";
62
- try {
63
- text = await parseWeb(link);
64
- appendUpdate("Browsing webpage", [link]);
65
- } catch (e) {
66
- // ignore errors
 
 
67
  }
68
  const MAX_N_CHUNKS = 100;
69
  const texts = chunk(text, CHUNK_CAR_LEN).slice(0, MAX_N_CHUNKS);
 
10
  } from "$lib/server/websearch/sentenceSimilarity";
11
  import type { Conversation } from "$lib/types/Conversation";
12
  import type { MessageUpdate } from "$lib/types/MessageUpdate";
13
+ import { getWebSearchProvider } from "./searchWeb";
14
 
15
  const MAX_N_PAGES_SCRAPE = 10 as const;
16
  const MAX_N_PAGES_EMBED = 5 as const;
 
40
 
41
  try {
42
  webSearch.searchQuery = await generateQuery(messages);
43
+ const searchProvider = getWebSearchProvider();
44
+ appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);
45
  const results = await searchWeb(webSearch.searchQuery);
46
  webSearch.results =
47
  (results.organic_results &&
48
+ results.organic_results.map((el: { title: string; link: string; text?: string }) => {
49
+ const { title, link, text } = el;
50
  const { hostname } = new URL(link);
51
+ return { title, link, hostname, text };
52
  })) ??
53
  [];
54
  webSearch.results = webSearch.results
 
60
  appendUpdate("Browsing results");
61
  const promises = webSearch.results.map(async (result) => {
62
  const { link } = result;
63
+ let text = result.text ?? "";
64
+ if (!text) {
65
+ try {
66
+ text = await parseWeb(link);
67
+ appendUpdate("Browsing webpage", [link]);
68
+ } catch (e) {
69
+ // ignore errors
70
+ }
71
  }
72
  const MAX_N_CHUNKS = 100;
73
  const texts = chunk(text, CHUNK_CAR_LEN).slice(0, MAX_N_CHUNKS);
src/lib/server/websearch/searchWeb.ts CHANGED
@@ -1,17 +1,26 @@
1
- import { SERPAPI_KEY, SERPER_API_KEY } from "$env/static/private";
2
-
 
3
  import { getJson } from "serpapi";
4
  import type { GoogleParameters } from "serpapi";
5
 
 
 
 
 
 
6
  // Show result as JSON
7
  export async function searchWeb(query: string) {
8
  if (SERPER_API_KEY) {
9
  return await searchWebSerper(query);
10
  }
 
 
 
11
  if (SERPAPI_KEY) {
12
  return await searchWebSerpApi(query);
13
  }
14
- throw new Error("No Serper.dev or SerpAPI key found");
15
  }
16
 
17
  export async function searchWebSerper(query: string) {
@@ -59,3 +68,31 @@ export async function searchWebSerpApi(query: string) {
59
 
60
  return response;
61
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { YouWebSearch } from "../../types/WebSearch";
2
+ import { WebSearchProvider } from "../../types/WebSearch";
3
+ import { SERPAPI_KEY, SERPER_API_KEY, YDC_API_KEY } from "$env/static/private";
4
  import { getJson } from "serpapi";
5
  import type { GoogleParameters } from "serpapi";
6
 
7
/**
 * Reports which provider `searchWeb` will actually query, so the UI can name it.
 *
 * `searchWeb` checks `SERPER_API_KEY` before `YDC_API_KEY`, so You.com is only
 * the active provider when its key is set and no Serper key is configured.
 * Every other configuration is labelled as Google.
 *
 * @returns `WebSearchProvider.YOU` when You.com serves the results,
 *          otherwise `WebSearchProvider.GOOGLE`.
 */
export function getWebSearchProvider() {
  // Mirror the precedence used by `searchWeb`: a Serper key wins over a You.com key.
  const youIsActive = Boolean(YDC_API_KEY) && !SERPER_API_KEY;
  return youIsActive ? WebSearchProvider.YOU : WebSearchProvider.GOOGLE;
}
11
+
12
  // Show result as JSON
13
  export async function searchWeb(query: string) {
14
  if (SERPER_API_KEY) {
15
  return await searchWebSerper(query);
16
  }
17
+ if (YDC_API_KEY) {
18
+ return await searchWebYouApi(query);
19
+ }
20
  if (SERPAPI_KEY) {
21
  return await searchWebSerpApi(query);
22
  }
23
+ throw new Error("No You.com or Serper.dev or SerpAPI key found");
24
  }
25
 
26
  export async function searchWebSerper(query: string) {
 
68
 
69
  return response;
70
  }
71
+
72
/**
 * Queries the You.com search API and reshapes its hits into an object with an
 * `organic_results` array.
 *
 * You.com returns text snippets with every hit; they are joined with newlines
 * into `text`. Results are ordered by descending `text` length.
 *
 * @param query - Search query; it is URL-encoded before being sent.
 * @returns `{ organic_results }`, where each entry has `title`, `link`, `text` and `hostname`.
 * @throws Error when the API responds with a non-OK status.
 * @throws TypeError when a hit carries a URL that `new URL` cannot parse.
 */
export async function searchWebYouApi(query: string) {
  // Encode the query: a raw `&`, `#`, `+` or space would otherwise corrupt the request URL.
  const endpoint = `https://api.ydc-index.io/search?query=${encodeURIComponent(query)}`;

  const response = await fetch(endpoint, {
    method: "GET",
    headers: {
      "X-API-Key": YDC_API_KEY,
      "Content-type": "application/json; charset=UTF-8",
    },
  });

  if (!response.ok) {
    throw new Error(`You.com API returned error code ${response.status} - ${response.statusText}`);
  }

  const data = (await response.json()) as YouWebSearch;

  // The payload is unvalidated JSON, so tolerate a response that has no `hits`.
  const formattedResultsWithSnippets = (data.hits ?? [])
    .map(({ title, url, snippets }) => ({
      title,
      link: url,
      text: snippets?.join("\n") ?? "",
      hostname: new URL(url).hostname,
    }))
    .sort((a, b) => b.text.length - a.text.length); // desc order by text length

  return {
    organic_results: formattedResultsWithSnippets,
  };
}
src/lib/types/WebSearch.ts CHANGED
@@ -18,9 +18,28 @@ export interface WebSearchSource {
18
  title: string;
19
  link: string;
20
  hostname: string;
 
21
  }
22
 
23
  export type WebSearchMessageSources = {
24
  type: "sources";
25
  sources: WebSearchSource[];
26
  };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  title: string;
19
  link: string;
20
  hostname: string;
21
+ text?: string; // You.com provides text of webpage right away
22
  }
23
 
24
  export type WebSearchMessageSources = {
25
  type: "sources";
26
  sources: WebSearchSource[];
27
  };
28
+
29
/** Shape that the JSON body of a You.com search response is cast to. */
export interface YouWebSearch {
  /** Presumably the request latency reported by the API; the unit is not visible here. */
  latency: number;
  /** Search results returned for the query. */
  hits: Array<YouSearchHit>;
}
33
+
34
/** One search result inside a You.com response. */
interface YouSearchHit {
  /** Title of the result. */
  title: string;
  /** Address of the result page. */
  url: string;
  /** Description of the result. */
  description: string;
  /** Text snippets supplied with the result. */
  snippets: Array<string>;
}
40
+
41
/**
 * Search providers that can be named in the UI.
 * Each member's value is the human-readable label of that provider.
 */
// eslint-disable-next-line no-shadow
export enum WebSearchProvider {
  YOU = "You.com",
  GOOGLE = "Google",
}
src/routes/+layout.server.ts CHANGED
@@ -6,7 +6,12 @@ import { UrlDependency } from "$lib/types/UrlDependency";
6
  import { defaultModel, models, oldModels, validateModel } from "$lib/server/models";
7
  import { authCondition, requiresUser } from "$lib/server/auth";
8
  import { DEFAULT_SETTINGS } from "$lib/types/Settings";
9
- import { SERPAPI_KEY, SERPER_API_KEY, MESSAGES_BEFORE_LOGIN } from "$env/static/private";
 
 
 
 
 
10
 
11
  export const load: LayoutServerLoad = async ({ locals, depends, url }) => {
12
  const { conversations } = collections;
@@ -82,7 +87,7 @@ export const load: LayoutServerLoad = async ({ locals, depends, url }) => {
82
  ethicsModalAcceptedAt: settings?.ethicsModalAcceptedAt ?? null,
83
  activeModel: settings?.activeModel ?? DEFAULT_SETTINGS.activeModel,
84
  hideEmojiOnSidebar: settings?.hideEmojiOnSidebar ?? false,
85
- searchEnabled: !!(SERPAPI_KEY || SERPER_API_KEY),
86
  customPrompts: settings?.customPrompts ?? {},
87
  },
88
  models: models.map((model) => ({
 
6
  import { defaultModel, models, oldModels, validateModel } from "$lib/server/models";
7
  import { authCondition, requiresUser } from "$lib/server/auth";
8
  import { DEFAULT_SETTINGS } from "$lib/types/Settings";
9
+ import {
10
+ SERPAPI_KEY,
11
+ SERPER_API_KEY,
12
+ MESSAGES_BEFORE_LOGIN,
13
+ YDC_API_KEY,
14
+ } from "$env/static/private";
15
 
16
  export const load: LayoutServerLoad = async ({ locals, depends, url }) => {
17
  const { conversations } = collections;
 
87
  ethicsModalAcceptedAt: settings?.ethicsModalAcceptedAt ?? null,
88
  activeModel: settings?.activeModel ?? DEFAULT_SETTINGS.activeModel,
89
  hideEmojiOnSidebar: settings?.hideEmojiOnSidebar ?? false,
90
+ searchEnabled: !!(SERPAPI_KEY || SERPER_API_KEY || YDC_API_KEY),
91
  customPrompts: settings?.customPrompts ?? {},
92
  },
93
  models: models.map((model) => ({