nomagick committed on
Commit
e9d69e6
·
unverified ·
1 Parent(s): 51779dc

searcher: image/news/bing search (#1176)

Browse files

* wip

* refactor: more serper search features

* fix: usage naming in crawl

src/api/crawler.ts CHANGED
@@ -279,7 +279,7 @@ export class CrawlerHost extends RPCHost {
279
  return;
280
  }
281
  if (chargeAmount) {
282
- auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
283
  this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
284
  });
285
  apiRoll.chargeAmount = chargeAmount;
@@ -914,12 +914,12 @@ export class CrawlerHost extends RPCHost {
914
  amount += x1 * 2;
915
  }
916
  amount += x1;
917
- } else if (formatted.description) {
918
- amount += estimateToken(formatted.description);
919
  }
 
920
  if (formatted.text) {
921
  amount += estimateToken(formatted.text);
922
  }
 
923
  if (formatted.html) {
924
  amount += estimateToken(formatted.html);
925
  }
 
279
  return;
280
  }
281
  if (chargeAmount) {
282
+ auth.reportUsage(chargeAmount, `reader-crawl`).catch((err) => {
283
  this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
284
  });
285
  apiRoll.chargeAmount = chargeAmount;
 
914
  amount += x1 * 2;
915
  }
916
  amount += x1;
 
 
917
  }
918
+
919
  if (formatted.text) {
920
  amount += estimateToken(formatted.text);
921
  }
922
+
923
  if (formatted.html) {
924
  amount += estimateToken(formatted.html);
925
  }
src/api/searcher-serper.ts CHANGED
@@ -10,7 +10,7 @@ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
10
 
11
  import { CrawlerHost, ExtraScrappingOptions } from './crawler';
12
  import { SerperSearchResult } from '../db/searched';
13
- import { CrawlerOptions } from '../dto/crawler-options';
14
  import { SnapshotFormatter, FormattedPage as RealFormattedPage } from '../services/snapshot-formatter';
15
  import { GoogleSearchExplicitOperatorsDto, SerperSearchService } from '../services/serper-search';
16
 
@@ -20,7 +20,8 @@ import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
20
  import { OutputServerEventStream } from '../lib/transform-server-event-stream';
21
  import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
22
  import { InsufficientBalanceError } from '../services/errors';
23
- import { SerperSearchQueryParams, SerperSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
 
24
 
25
  const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES);
26
 
@@ -87,6 +88,10 @@ export class SearcherHost extends RPCHost {
87
  searchExplicitOperators: GoogleSearchExplicitOperatorsDto,
88
  @Param('count', { validate: (v: number) => v >= 0 && v <= 20 })
89
  count: number,
 
 
 
 
90
  @Param('num', { validate: (v: number) => v >= 0 && v <= 20 })
91
  num?: number,
92
  @Param('gl', { validate: (v: string) => WORLD_COUNTRY_CODES.includes(v) }) gl?: string,
@@ -104,6 +109,8 @@ export class SearcherHost extends RPCHost {
104
  // Return content by default
105
  const crawlWithoutContent = crawlerOptions.respondWith.includes('no-content');
106
  const withFavicon = Boolean(ctx.get('X-With-Favicons'));
 
 
107
 
108
  let chargeAmount = 0;
109
  const noSlashPath = decodeURIComponent(ctx.path).slice(1);
@@ -163,7 +170,18 @@ export class SearcherHost extends RPCHost {
163
  fetchNum = count > 10 ? 30 : 20;
164
  }
165
 
166
- const r = await this.cachedWebSearch({
 
 
 
 
 
 
 
 
 
 
 
167
  q: searchQuery,
168
  num: fetchNum,
169
  gl,
@@ -172,7 +190,24 @@ export class SearcherHost extends RPCHost {
172
  page,
173
  }, crawlerOptions.noCache);
174
 
175
- if (!r.organic.length) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
177
  }
178
 
@@ -183,29 +218,36 @@ export class SearcherHost extends RPCHost {
183
 
184
  let lastScrapped: any[] | undefined;
185
  const targetResultCount = crawlWithoutContent ? count : count + 2;
186
- const organicSearchResults = r.organic.slice(0, targetResultCount);
187
- if (crawlWithoutContent || count === 0) {
188
- const fakeResults = await this.fakeResult(crawlerOptions, organicSearchResults, !crawlWithoutContent, withFavicon);
189
- lastScrapped = fakeResults;
190
- chargeAmount = this.assignChargeAmount(!crawlWithoutContent ? lastScrapped : [], count);
191
-
192
- this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
193
- if ((!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) || count === 0) {
194
- return lastScrapped;
195
  }
196
- return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
197
  }
198
-
199
- const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
200
- CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
201
- count,
202
- withFavicon
203
  );
204
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
206
  const sseStream = new OutputServerEventStream();
207
  rpcReflect.return(sseStream);
208
-
209
  try {
210
  for await (const scrapped of it) {
211
  if (!scrapped) {
@@ -215,7 +257,8 @@ export class SearcherHost extends RPCHost {
215
  break;
216
  }
217
 
218
- chargeAmount = this.assignChargeAmount(scrapped, count);
 
219
  sseStream.write({
220
  event: 'data',
221
  data: scrapped,
@@ -243,13 +286,12 @@ export class SearcherHost extends RPCHost {
243
  if (earlyReturnTimer) {
244
  return;
245
  }
246
- earlyReturnTimer = setTimeout(() => {
247
  if (!lastScrapped) {
248
  return;
249
  }
250
- chargeAmount = this.assignChargeAmount(lastScrapped, count);
251
-
252
- this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
253
  rpcReflect.return(lastScrapped);
254
  earlyReturn = true;
255
  }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
@@ -257,7 +299,7 @@ export class SearcherHost extends RPCHost {
257
 
258
  for await (const scrapped of it) {
259
  lastScrapped = scrapped;
260
- if (rpcReflect.signal.aborted) {
261
  break;
262
  }
263
  if (_.some(scrapped, (x) => this.pageQualified(x))) {
@@ -269,9 +311,9 @@ export class SearcherHost extends RPCHost {
269
  if (earlyReturnTimer) {
270
  clearTimeout(earlyReturnTimer);
271
  }
272
- chargeAmount = this.assignChargeAmount(scrapped, count);
 
273
 
274
- this.assignTokenUsage(scrapped, chargeAmount, crawlWithoutContent);
275
  return scrapped;
276
  }
277
 
@@ -284,10 +326,10 @@ export class SearcherHost extends RPCHost {
284
  }
285
 
286
  if (!earlyReturn) {
287
- chargeAmount = this.assignChargeAmount(lastScrapped, count);
 
288
  }
289
 
290
- this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
291
  return lastScrapped;
292
  }
293
 
@@ -296,11 +338,12 @@ export class SearcherHost extends RPCHost {
296
  if (earlyReturnTimer) {
297
  return;
298
  }
299
- earlyReturnTimer = setTimeout(() => {
300
  if (!lastScrapped) {
301
  return;
302
  }
303
- chargeAmount = this.assignChargeAmount(lastScrapped, count);
 
304
  rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
305
  earlyReturn = true;
306
  }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
@@ -308,7 +351,7 @@ export class SearcherHost extends RPCHost {
308
 
309
  for await (const scrapped of it) {
310
  lastScrapped = scrapped;
311
- if (rpcReflect.signal.aborted) {
312
  break;
313
  }
314
  if (_.some(scrapped, (x) => this.pageQualified(x))) {
@@ -322,8 +365,8 @@ export class SearcherHost extends RPCHost {
322
  if (earlyReturnTimer) {
323
  clearTimeout(earlyReturnTimer);
324
  }
325
-
326
- chargeAmount = this.assignChargeAmount(scrapped, count);
327
 
328
  return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
329
  }
@@ -337,141 +380,52 @@ export class SearcherHost extends RPCHost {
337
  }
338
 
339
  if (!earlyReturn) {
340
- chargeAmount = this.assignChargeAmount(lastScrapped, count);
 
341
  }
342
 
343
  return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
344
  }
345
 
346
- assignTokenUsage(result: FormattedPage[], chargeAmount: number, crawlWithoutContent: boolean) {
347
- if (crawlWithoutContent) {
348
- if (result) {
349
- result.forEach((x) => {
350
- delete x.usage;
351
- });
352
- }
353
- }
354
-
355
- assignMeta(result, { usage: { tokens: chargeAmount } });
356
- }
357
-
358
- async fakeResult(
359
- crawlerOptions: CrawlerOptions,
360
- searchResults?: SerperSearchResponse['organic'],
361
- withContent: boolean = false,
362
- withFavicon: boolean = false,
363
- ) {
364
- const mode: string | 'markdown' | 'html' | 'text' | 'screenshot' = crawlerOptions.respondWith;
365
-
366
- if (!searchResults) {
367
- return [];
368
- }
369
-
370
- const resultArray = await Promise.all(searchResults.map(async (upstreamSearchResult, index) => {
371
- const result = {
372
- url: upstreamSearchResult.link,
373
- title: upstreamSearchResult.title,
374
- description: upstreamSearchResult.snippet,
375
- date: upstreamSearchResult.date,
376
- } as FormattedPage;
377
-
378
- const dataItems = [
379
- { key: 'title', label: 'Title' },
380
- { key: 'url', label: 'URL Source' },
381
- { key: 'description', label: 'Description' },
382
- ];
383
-
384
- if (upstreamSearchResult.date) {
385
- dataItems.push({ key: 'date', label: 'Date' });
386
- }
387
-
388
- if (withContent) {
389
- result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
390
- }
391
-
392
- if (withFavicon) {
393
- const url = new URL(upstreamSearchResult.link);
394
- result.favicon = await this.getFavicon(url.origin);
395
- dataItems.push({
396
- key: 'favicon',
397
- label: 'Favicon',
398
- });
399
- }
400
-
401
- result.toString = function () {
402
- const self = this as any;
403
- return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n';
404
- };
405
- return result;
406
- }));
407
-
408
- resultArray.toString = function () {
409
- return this.map((x, i) => x ? x.toString() : '').join('\n\n').trimEnd() + '\n';
410
- };
411
-
412
- return resultArray;
413
- }
414
-
415
  async *fetchSearchResults(
416
  mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'favicon' | 'content',
417
- searchResults?: SerperSearchResponse['organic'],
418
  options?: ExtraScrappingOptions,
419
  crawlerOptions?: CrawlerOptions,
420
  count?: number,
421
- withFavicon?: boolean,
422
  ) {
423
  if (!searchResults) {
424
  return;
425
  }
426
- const urls = searchResults.map((x) => new URL(x.link));
427
  const snapshotMap = new WeakMap();
428
  for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
429
  const mapped = scrapped.map((x, i) => {
430
- const upstreamSearchResult = searchResults[i];
431
- const url = upstreamSearchResult.link;
432
-
433
  if (!x) {
434
- return {
435
- url,
436
- title: upstreamSearchResult.title,
437
- description: upstreamSearchResult.snippet,
438
- date: upstreamSearchResult.date,
439
- content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''
440
- };
441
  }
442
  if (snapshotMap.has(x)) {
443
  return snapshotMap.get(x);
444
  }
445
  return this.crawler.formatSnapshotWithPDFSideLoad(mode, x, urls[i], undefined, options).then((r) => {
446
- r.title ??= upstreamSearchResult.title;
447
- r.description = upstreamSearchResult.snippet;
448
- r.date ??= upstreamSearchResult.date;
449
  snapshotMap.set(x, r);
450
 
451
  return r;
452
  }).catch((err) => {
453
  this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
454
 
455
- return {
456
- url,
457
- title: upstreamSearchResult.title,
458
- description: upstreamSearchResult.snippet,
459
- date: upstreamSearchResult.date,
460
- content: x.text,
461
- };
462
  });
463
- }).map(async (x) => {
464
- const page = await x;
465
- if (withFavicon && page.url) {
466
- const url = new URL(page.url);
467
- page.favicon = await this.getFavicon(url.origin);
468
- }
469
- return page;
470
  });
471
 
472
  const resultArray = await Promise.all(mapped) as FormattedPage[];
 
 
 
 
 
473
 
474
- yield this.reOrganizeSearchResults(resultArray, count);
475
  }
476
  }
477
 
@@ -487,78 +441,33 @@ export class SearcherHost extends RPCHost {
487
 
488
  const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
489
 
490
- const resultArray = filtered.map((x, i) => {
491
-
492
- return {
493
- ...x,
494
- toString(this: any) {
495
- if (!this.content && this.description) {
496
- if (this.title || x.textRepresentation) {
497
- const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
498
- return `[${i + 1}] Title: ${this.title}
499
- [${i + 1}] URL Source: ${this.url}
500
- [${i + 1}] Description: ${this.description}${textRep}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}${this.date ? `\n[${i + 1}] Date: ${this.date}` : ''}
501
- `;
502
- }
503
-
504
- return `[${i + 1}] No content available for ${this.url}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}`;
505
- }
506
 
507
- const mixins = [];
508
- if (this.description) {
509
- mixins.push(`[${i + 1}] Description: ${this.description}`);
510
- }
511
- if (this.publishedTime) {
512
- mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
513
- }
514
 
515
- const suffixMixins = [];
516
- if (this.images) {
517
- const imageSummaryChunks = [`[${i + 1}] Images:`];
518
- for (const [k, v] of Object.entries(this.images)) {
519
- imageSummaryChunks.push(`- ![${k}](${v})`);
520
- }
521
- if (imageSummaryChunks.length === 1) {
522
- imageSummaryChunks.push('This page does not seem to contain any images.');
523
- }
524
- suffixMixins.push(imageSummaryChunks.join('\n'));
525
- }
526
- if (this.links) {
527
- const linkSummaryChunks = [`[${i + 1}] Links/Buttons:`];
528
- for (const [k, v] of Object.entries(this.links)) {
529
- linkSummaryChunks.push(`- [${k}](${v})`);
530
- }
531
- if (linkSummaryChunks.length === 1) {
532
- linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
533
- }
534
- suffixMixins.push(linkSummaryChunks.join('\n'));
535
- }
536
 
537
- return `[${i + 1}] Title: ${this.title}
538
- [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}${this.date ? `\n[${i + 1}] Date: ${this.date}` : ''}
539
- [${i + 1}] Markdown Content:
540
- ${this.content}
541
- ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
542
- }
543
- };
544
- });
545
 
546
- resultArray.toString = function () {
547
- return this.map((x, i) => x ? x.toString() : `[${i + 1}] No content available for ${this[i].url}`).join('\n\n').trimEnd() + '\n';
548
- };
 
549
 
550
- return resultArray;
551
- }
552
 
553
- assignChargeAmount(formatted: FormattedPage[], num: number) {
554
- const countentCharge = _.sum(
555
- formatted.map((x) => this.crawler.assignChargeAmount(x) || 0)
556
- );
557
 
558
- const numCharge = Math.ceil(num / 10) * 10000;
559
 
560
- return Math.max(countentCharge, numCharge);
561
 
 
562
  }
563
 
564
  pageQualified(formattedPage: FormattedPage) {
@@ -592,8 +501,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
592
  }
593
  }
594
 
595
- async cachedWebSearch(query: SerperSearchQueryParams, noCache: boolean = false) {
596
  const queryDigest = objHashMd5B64Of(query);
 
597
  let cache;
598
  if (!noCache) {
599
  cache = (await SerperSearchResult.fromFirestoreQuery(
@@ -615,7 +525,24 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
615
  }
616
 
617
  try {
618
- const r = await this.serperSearchService.webSearch(query);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
619
 
620
  const nowDate = new Date();
621
  const record = SerperSearchResult.from({
@@ -641,4 +568,95 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
641
  }
642
 
643
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
644
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  import { CrawlerHost, ExtraScrappingOptions } from './crawler';
12
  import { SerperSearchResult } from '../db/searched';
13
+ import { CrawlerOptions, RESPOND_TIMING } from '../dto/crawler-options';
14
  import { SnapshotFormatter, FormattedPage as RealFormattedPage } from '../services/snapshot-formatter';
15
  import { GoogleSearchExplicitOperatorsDto, SerperSearchService } from '../services/serper-search';
16
 
 
20
  import { OutputServerEventStream } from '../lib/transform-server-event-stream';
21
  import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
22
  import { InsufficientBalanceError } from '../services/errors';
23
+ import { SerperImageSearchResponse, SerperNewsSearchResponse, SerperSearchQueryParams, SerperSearchResponse, SerperWebSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
24
+ import { toAsyncGenerator } from '../utils/misc';
25
 
26
  const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES);
27
 
 
88
  searchExplicitOperators: GoogleSearchExplicitOperatorsDto,
89
  @Param('count', { validate: (v: number) => v >= 0 && v <= 20 })
90
  count: number,
91
+ @Param('variant', { type: new Set(['web', 'images', 'news']), default: 'web' })
92
+ variant: 'web' | 'images' | 'news',
93
+ @Param('provider', { type: new Set(['google', 'bing']), default: 'google' })
94
+ searchEngine: 'google' | 'bing',
95
  @Param('num', { validate: (v: number) => v >= 0 && v <= 20 })
96
  num?: number,
97
  @Param('gl', { validate: (v: string) => WORLD_COUNTRY_CODES.includes(v) }) gl?: string,
 
109
  // Return content by default
110
  const crawlWithoutContent = crawlerOptions.respondWith.includes('no-content');
111
  const withFavicon = Boolean(ctx.get('X-With-Favicons'));
112
+ this.threadLocal.set('collect-favicon', withFavicon);
113
+ crawlerOptions.respondTiming ??= RESPOND_TIMING.VISIBLE_CONTENT;
114
 
115
  let chargeAmount = 0;
116
  const noSlashPath = decodeURIComponent(ctx.path).slice(1);
 
170
  fetchNum = count > 10 ? 30 : 20;
171
  }
172
 
173
+ let chargeAmountScaler = 1;
174
+ if (searchEngine === 'bing') {
175
+ this.threadLocal.set('bing-preferred', true);
176
+ chargeAmountScaler = 2;
177
+ }
178
+ if (variant !== 'web') {
179
+ chargeAmountScaler = 3;
180
+ }
181
+
182
+ const r = await this.cachedSearch({
183
+ variant,
184
+ provider: searchEngine,
185
  q: searchQuery,
186
  num: fetchNum,
187
  gl,
 
190
  page,
191
  }, crawlerOptions.noCache);
192
 
193
+ let results;
194
+ switch (variant) {
195
+ case 'images': {
196
+ results = (r as SerperImageSearchResponse).images;
197
+ break;
198
+ }
199
+ case 'news': {
200
+ results = (r as SerperNewsSearchResponse).news;
201
+ break;
202
+ }
203
+ case 'web':
204
+ default: {
205
+ results = (r as SerperWebSearchResponse).organic;
206
+ break;
207
+ }
208
+ }
209
+
210
+ if (!results.length) {
211
  throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
212
  }
213
 
 
218
 
219
  let lastScrapped: any[] | undefined;
220
  const targetResultCount = crawlWithoutContent ? count : count + 2;
221
+ const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
222
+ trimmedResults.toString = function () {
223
+ return this.map((x, i) => x ? Reflect.apply(x.toString, x, [i]) : '').join('\n\n').trimEnd() + '\n';
224
+ };
225
+ if (!crawlerOptions.respondWith.includes('no-content') &&
226
+ ['html', 'text', 'shot', 'markdown', 'content'].some((x) => crawlerOptions.respondWith.includes(x))
227
+ ) {
228
+ for (const x of trimmedResults) {
229
+ x.content ??= '';
230
  }
 
231
  }
232
+ const assigningOfGeneralMixins = Promise.allSettled(
233
+ trimmedResults.map((x) => this.assignGeneralMixin(x))
 
 
 
234
  );
235
 
236
+ let it;
237
+
238
+ if (crawlWithoutContent || count === 0) {
239
+ it = toAsyncGenerator(trimmedResults);
240
+ await assigningOfGeneralMixins;
241
+ } else {
242
+ it = this.fetchSearchResults(crawlerOptions.respondWith, trimmedResults, crawlOpts,
243
+ CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
244
+ count,
245
+ );
246
+ }
247
+
248
  if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
249
  const sseStream = new OutputServerEventStream();
250
  rpcReflect.return(sseStream);
 
251
  try {
252
  for await (const scrapped of it) {
253
  if (!scrapped) {
 
257
  break;
258
  }
259
 
260
+ chargeAmount = this.assignChargeAmount(scrapped, count, chargeAmountScaler);
261
+ lastScrapped = scrapped;
262
  sseStream.write({
263
  event: 'data',
264
  data: scrapped,
 
286
  if (earlyReturnTimer) {
287
  return;
288
  }
289
+ earlyReturnTimer = setTimeout(async () => {
290
  if (!lastScrapped) {
291
  return;
292
  }
293
+ await assigningOfGeneralMixins;
294
+ chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler);
 
295
  rpcReflect.return(lastScrapped);
296
  earlyReturn = true;
297
  }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
 
299
 
300
  for await (const scrapped of it) {
301
  lastScrapped = scrapped;
302
+ if (rpcReflect.signal.aborted || earlyReturn) {
303
  break;
304
  }
305
  if (_.some(scrapped, (x) => this.pageQualified(x))) {
 
311
  if (earlyReturnTimer) {
312
  clearTimeout(earlyReturnTimer);
313
  }
314
+ await assigningOfGeneralMixins;
315
+ chargeAmount = this.assignChargeAmount(scrapped, count, chargeAmountScaler);
316
 
 
317
  return scrapped;
318
  }
319
 
 
326
  }
327
 
328
  if (!earlyReturn) {
329
+ await assigningOfGeneralMixins;
330
+ chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler);
331
  }
332
 
 
333
  return lastScrapped;
334
  }
335
 
 
338
  if (earlyReturnTimer) {
339
  return;
340
  }
341
+ earlyReturnTimer = setTimeout(async () => {
342
  if (!lastScrapped) {
343
  return;
344
  }
345
+ await assigningOfGeneralMixins;
346
+ chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler);
347
  rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
348
  earlyReturn = true;
349
  }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
 
351
 
352
  for await (const scrapped of it) {
353
  lastScrapped = scrapped;
354
+ if (rpcReflect.signal.aborted || earlyReturn) {
355
  break;
356
  }
357
  if (_.some(scrapped, (x) => this.pageQualified(x))) {
 
365
  if (earlyReturnTimer) {
366
  clearTimeout(earlyReturnTimer);
367
  }
368
+ await assigningOfGeneralMixins;
369
+ chargeAmount = this.assignChargeAmount(scrapped, count, chargeAmountScaler);
370
 
371
  return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
372
  }
 
380
  }
381
 
382
  if (!earlyReturn) {
383
+ await assigningOfGeneralMixins;
384
+ chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler);
385
  }
386
 
387
  return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
388
  }
389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  async *fetchSearchResults(
391
  mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'favicon' | 'content',
392
+ searchResults?: FormattedPage[],
393
  options?: ExtraScrappingOptions,
394
  crawlerOptions?: CrawlerOptions,
395
  count?: number,
 
396
  ) {
397
  if (!searchResults) {
398
  return;
399
  }
400
+ const urls = searchResults.map((x) => new URL(x.url!));
401
  const snapshotMap = new WeakMap();
402
  for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
403
  const mapped = scrapped.map((x, i) => {
 
 
 
404
  if (!x) {
405
+ return {};
 
 
 
 
 
 
406
  }
407
  if (snapshotMap.has(x)) {
408
  return snapshotMap.get(x);
409
  }
410
  return this.crawler.formatSnapshotWithPDFSideLoad(mode, x, urls[i], undefined, options).then((r) => {
 
 
 
411
  snapshotMap.set(x, r);
412
 
413
  return r;
414
  }).catch((err) => {
415
  this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
416
 
417
+ return {};
 
 
 
 
 
 
418
  });
 
 
 
 
 
 
 
419
  });
420
 
421
  const resultArray = await Promise.all(mapped) as FormattedPage[];
422
+ for (const [i, v] of resultArray.entries()) {
423
+ if (v) {
424
+ Object.assign(searchResults[i], v);
425
+ }
426
+ }
427
 
428
+ yield this.reOrganizeSearchResults(searchResults, count);
429
  }
430
  }
431
 
 
441
 
442
  const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
443
 
444
+ const resultArray = filtered;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
 
446
+ resultArray.toString = searchResults.toString;
 
 
 
 
 
 
447
 
448
+ return resultArray;
449
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
 
451
+ assignChargeAmount(formatted: FormattedPage[], num: number, scaler: number) {
452
+ let contentCharge = 0;
453
+ for (const x of formatted) {
454
+ const itemAmount = this.crawler.assignChargeAmount(x) || 0;
 
 
 
 
455
 
456
+ if (!itemAmount) {
457
+ Reflect.deleteProperty(x, 'usage');
458
+ continue;
459
+ }
460
 
461
+ contentCharge += itemAmount;
462
+ }
463
 
464
+ const numCharge = Math.ceil(formatted.length / 10) * 10000 * scaler;
 
 
 
465
 
466
+ const final = Math.max(contentCharge, numCharge);
467
 
468
+ assignMeta(formatted, { usage: { tokens: final } });
469
 
470
+ return final;
471
  }
472
 
473
  pageQualified(formattedPage: FormattedPage) {
 
501
  }
502
  }
503
 
504
+ async cachedSearch(query: SerperSearchQueryParams & { variant: 'web' | 'images' | 'news'; provider?: string; }, noCache: boolean = false) {
505
  const queryDigest = objHashMd5B64Of(query);
506
+ Reflect.deleteProperty(query, 'provider');
507
  let cache;
508
  if (!noCache) {
509
  cache = (await SerperSearchResult.fromFirestoreQuery(
 
525
  }
526
 
527
  try {
528
+ let r;
529
+ const variant = query.variant;
530
+ Reflect.deleteProperty(query, 'variant');
531
+ switch (variant) {
532
+ case 'images': {
533
+ r = await this.serperSearchService.imageSearch(query);
534
+ break;
535
+ }
536
+ case 'news': {
537
+ r = await this.serperSearchService.newsSearch(query);
538
+ break;
539
+ }
540
+ case 'web':
541
+ default: {
542
+ r = await this.serperSearchService.webSearch(query);
543
+ break;
544
+ }
545
+ }
546
 
547
  const nowDate = new Date();
548
  const record = SerperSearchResult.from({
 
568
  }
569
 
570
  }
571
+
572
+ mapToFinalResults(input:
573
+ | SerperImageSearchResponse['images'][0]
574
+ | SerperWebSearchResponse['organic'][0]
575
+ | SerperNewsSearchResponse['news'][0],
576
+ ) {
577
+ const whitelistedProps = [
578
+ 'imageUrl', 'imageWidth', 'imageHeight', 'source', 'date'
579
+ ];
580
+ const result = {
581
+ title: input.title,
582
+ url: input.link,
583
+ description: Reflect.get(input, 'snippet'),
584
+ ..._.pick(input, whitelistedProps),
585
+ } as FormattedPage;
586
+
587
+ return result;
588
+ }
589
+
590
+ async assignGeneralMixin(result: FormattedPage) {
591
+ const collectFavicon = this.threadLocal.get('collect-favicon');
592
+
593
+ if (collectFavicon && result.url) {
594
+ const url = new URL(result.url);
595
+ Reflect.set(result, 'favicon', await this.getFavicon(url.origin));
596
+ }
597
+
598
+ Object.setPrototypeOf(result, searchResultProto);
599
+ }
600
  }
601
+
602
+ const dataItems = [
603
+ { key: 'title', label: 'Title' },
604
+ { key: 'source', label: 'Source' },
605
+ { key: 'url', label: 'URL Source' },
606
+ { key: 'imageUrl', label: 'Image URL' },
607
+ { key: 'description', label: 'Description' },
608
+ { key: 'publishedTime', label: 'Published Time' },
609
+ { key: 'imageWidth', label: 'Image Width' },
610
+ { key: 'imageHeight', label: 'Image Height' },
611
+ { key: 'date', label: 'Date' },
612
+ { key: 'favicon', label: 'Favicon' },
613
+ ];
614
+
615
+ const searchResultProto = {
616
+ toString(this: FormattedPage, i?: number) {
617
+ const chunks = [];
618
+ for (const item of dataItems) {
619
+ const v = Reflect.get(this, item.key);
620
+ if (typeof v !== 'undefined') {
621
+ if (i === undefined) {
622
+ chunks.push(`[${item.label}]: ${v}`);
623
+ } else {
624
+ chunks.push(`[${i + 1}] ${item.label}: ${v}`);
625
+ }
626
+ }
627
+ }
628
+
629
+ if (this.content) {
630
+ chunks.push(`\n${this.content}`);
631
+ }
632
+
633
+ if (this.images) {
634
+ const imageSummaryChunks = [`${i === undefined ? '' : `[${i + 1}] `}Images:`];
635
+ for (const [k, v] of Object.entries(this.images)) {
636
+ imageSummaryChunks.push(`- ![${k}](${v})`);
637
+ }
638
+ if (imageSummaryChunks.length === 1) {
639
+ imageSummaryChunks.push('This page does not seem to contain any images.');
640
+ }
641
+ chunks.push(imageSummaryChunks.join('\n'));
642
+ }
643
+ if (this.links) {
644
+ const linkSummaryChunks = [`${i === undefined ? '' : `[${i + 1}] `}Links/Buttons:`];
645
+ if (Array.isArray(this.links)) {
646
+ for (const [k, v] of this.links) {
647
+ linkSummaryChunks.push(`- [${k}](${v})`);
648
+ }
649
+ } else {
650
+ for (const [k, v] of Object.entries(this.links)) {
651
+ linkSummaryChunks.push(`- [${k}](${v})`);
652
+ }
653
+ }
654
+ if (linkSummaryChunks.length === 1) {
655
+ linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
656
+ }
657
+ chunks.push(linkSummaryChunks.join('\n'));
658
+ }
659
+
660
+ return chunks.join('\n');
661
+ }
662
+ };
src/services/serper-search.ts CHANGED
@@ -4,16 +4,18 @@ import { GlobalLogger } from './logger';
4
  import { SecretExposer } from '../shared/services/secrets';
5
  import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
6
  import { AsyncLocalContext } from './async-context';
7
- import { SerperGoogleHTTP, SerperSearchQueryParams, WORLD_COUNTRIES } from '../shared/3rd-party/serper-search';
8
  import { BlackHoleDetector } from './blackhole-detector';
9
  import { Context } from './registry';
 
10
 
11
  @singleton()
12
  export class SerperSearchService extends AsyncService {
13
 
14
  logger = this.globalLogger.child({ service: this.constructor.name });
15
 
16
- serperSearchHTTP!: SerperGoogleHTTP;
 
17
 
18
  constructor(
19
  protected globalLogger: GlobalLogger,
@@ -29,10 +31,24 @@ export class SerperSearchService extends AsyncService {
29
  await this.dependencyReady();
30
  this.emit('ready');
31
 
32
- this.serperSearchHTTP = new SerperGoogleHTTP(this.secretExposer.SERPER_SEARCH_API_KEY);
 
33
  }
34
 
35
- async webSearch(query: SerperSearchQueryParams) {
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  const ip = this.threadLocal.get('ip');
37
  if (ip) {
38
  const geoip = await this.geoipControl.lookupCity(ip, GEOIP_SUPPORTED_LANGUAGES.EN);
@@ -57,27 +73,89 @@ export class SerperSearchService extends AsyncService {
57
  }
58
  }
59
 
 
 
 
 
 
 
60
  let maxTries = 3;
61
 
62
  while (maxTries--) {
63
  try {
64
  this.logger.debug(`Doing external search`, query);
65
- const r = await this.serperSearchHTTP.webSearch(query);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  this.blackHoleDetector.itWorked();
67
 
68
  return r.parsed;
69
  } catch (err: any) {
70
- this.logger.error(`Web search failed: ${err?.message}`, { err: marshalErrorLike(err) });
71
  if (err?.status === 429) {
72
  await delay(500 + 1000 * Math.random());
73
  continue;
74
  }
 
 
 
75
 
76
- throw new DownstreamServiceFailureError({ message: `Search failed` });
77
  }
78
  }
79
 
80
- throw new DownstreamServiceFailureError({ message: `Search failed` });
 
 
 
 
 
 
 
 
 
 
 
81
  }
82
 
83
  }
 
4
  import { SecretExposer } from '../shared/services/secrets';
5
  import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
6
  import { AsyncLocalContext } from './async-context';
7
+ import { SerperBingHTTP, SerperGoogleHTTP, SerperImageSearchResponse, SerperNewsSearchResponse, SerperSearchQueryParams, SerperWebSearchResponse, WORLD_COUNTRIES } from '../shared/3rd-party/serper-search';
8
  import { BlackHoleDetector } from './blackhole-detector';
9
  import { Context } from './registry';
10
+ import { ServiceBadAttemptError } from '../shared';
11
 
12
  @singleton()
13
  export class SerperSearchService extends AsyncService {
14
 
15
  logger = this.globalLogger.child({ service: this.constructor.name });
16
 
17
+ serperGoogleSearchHTTP!: SerperGoogleHTTP;
18
+ serperBingSearchHTTP!: SerperBingHTTP;
19
 
20
  constructor(
21
  protected globalLogger: GlobalLogger,
 
31
  await this.dependencyReady();
32
  this.emit('ready');
33
 
34
+ this.serperGoogleSearchHTTP = new SerperGoogleHTTP(this.secretExposer.SERPER_SEARCH_API_KEY);
35
+ this.serperBingSearchHTTP = new SerperBingHTTP(this.secretExposer.SERPER_SEARCH_API_KEY);
36
  }
37
 
38
/**
 * Produces the sequence of serper HTTP clients a search should try, in order.
 *
 * When the thread-local `bing-preferred` value is truthy, the Bing client is
 * yielded once as the first candidate. After that the Google client is yielded
 * on every subsequent step, so this generator never completes on its own.
 */
*iterClient() {
    if (this.threadLocal.get('bing-preferred')) {
        // Bing is offered exactly once, ahead of the Google fallback.
        yield this.serperBingSearchHTTP;
    }

    // Endless tail: every further request for a client gets the Google one.
    while (true) {
        yield this.serperGoogleSearchHTTP;
    }
}
47
+
48
+ doSearch(variant: 'web', query: SerperSearchQueryParams): Promise<SerperWebSearchResponse>;
49
+ doSearch(variant: 'images', query: SerperSearchQueryParams): Promise<SerperImageSearchResponse>;
50
+ doSearch(variant: 'news', query: SerperSearchQueryParams): Promise<SerperNewsSearchResponse>;
51
+ async doSearch(variant: 'web' | 'images' | 'news', query: SerperSearchQueryParams) {
52
  const ip = this.threadLocal.get('ip');
53
  if (ip) {
54
  const geoip = await this.geoipControl.lookupCity(ip, GEOIP_SUPPORTED_LANGUAGES.EN);
 
73
  }
74
  }
75
 
76
+ const clientIt = this.iterClient();
77
+ let client = clientIt.next().value;
78
+ if (!client) {
79
+ throw new Error(`Error iterating serper client`);
80
+ }
81
+
82
  let maxTries = 3;
83
 
84
  while (maxTries--) {
85
  try {
86
  this.logger.debug(`Doing external search`, query);
87
+ let r;
88
+ switch (variant) {
89
+ case 'images': {
90
+ r = await client.imageSearch(query);
91
+ const nextClient = clientIt.next().value;
92
+ if (nextClient && nextClient !== client) {
93
+ const results = r.parsed.images;
94
+ if (!results.length) {
95
+ client = nextClient;
96
+ throw new ServiceBadAttemptError('No results found');
97
+ }
98
+ }
99
+
100
+ break;
101
+ }
102
+ case 'news': {
103
+ r = await client.newsSearch(query);
104
+ const nextClient = clientIt.next().value;
105
+ if (nextClient && nextClient !== client) {
106
+ const results = r.parsed.news;
107
+ if (!results.length) {
108
+ client = nextClient;
109
+ throw new ServiceBadAttemptError('No results found');
110
+ }
111
+ }
112
+
113
+ break;
114
+ }
115
+ case 'web':
116
+ default: {
117
+ r = await client.webSearch(query);
118
+ const nextClient = clientIt.next().value;
119
+ if (nextClient && nextClient !== client) {
120
+ const results = r.parsed.organic;
121
+ if (!results.length) {
122
+ client = nextClient;
123
+ throw new ServiceBadAttemptError('No results found');
124
+ }
125
+ }
126
+
127
+ break;
128
+ }
129
+ }
130
  this.blackHoleDetector.itWorked();
131
 
132
  return r.parsed;
133
  } catch (err: any) {
134
+ this.logger.error(`${variant} search failed: ${err?.message}`, { err: marshalErrorLike(err) });
135
  if (err?.status === 429) {
136
  await delay(500 + 1000 * Math.random());
137
  continue;
138
  }
139
+ if (err instanceof ServiceBadAttemptError) {
140
+ continue;
141
+ }
142
 
143
+ throw new DownstreamServiceFailureError({ message: `Search(${variant}) failed` });
144
  }
145
  }
146
 
147
+ throw new DownstreamServiceFailureError({ message: `Search(${variant}) failed` });
148
+ }
149
+
150
+
151
/**
 * Convenience entry point for a web search.
 *
 * @param query - Search parameters, forwarded unchanged.
 * @returns Whatever `doSearch` resolves with for the `'web'` variant.
 */
async webSearch(query: SerperSearchQueryParams) {
    const variant = 'web';

    return this.doSearch(variant, query);
}
154
/**
 * Convenience entry point for an image search.
 *
 * @param query - Search parameters, forwarded unchanged.
 * @returns Whatever `doSearch` resolves with for the `'images'` variant.
 */
async imageSearch(query: SerperSearchQueryParams) {
    const variant = 'images';

    return this.doSearch(variant, query);
}
157
/**
 * Convenience entry point for a news search.
 *
 * @param query - Search parameters, forwarded unchanged.
 * @returns Whatever `doSearch` resolves with for the `'news'` variant.
 */
async newsSearch(query: SerperSearchQueryParams) {
    const variant = 'news';

    return this.doSearch(variant, query);
}
160
 
161
  }
src/utils/misc.ts CHANGED
@@ -16,3 +16,12 @@ export function tryDecodeURIComponent(input: string) {
16
  throw new ParamValidationError(`Invalid URIComponent: ${input}`);
17
  }
18
  }
 
 
 
 
 
 
 
 
 
 
16
  throw new ParamValidationError(`Invalid URIComponent: ${input}`);
17
  }
18
  }
19
+
20
+
21
/**
 * Wraps a single value in an async generator.
 *
 * The returned generator yields `val` exactly once and then completes.
 * Because `yield` inside an async generator awaits its operand, a
 * promise-like `val` is delivered to the consumer as its settled value.
 *
 * @param val - The one item the generator should produce.
 */
export async function* toAsyncGenerator<T>(val: T) {
    const onlyItem = val;

    yield onlyItem;
}
24
+
25
/**
 * Wraps a single value in a synchronous generator.
 *
 * The returned generator yields `val` exactly once and then completes.
 *
 * This was previously declared `async function*`, which made it an exact
 * duplicate of `toAsyncGenerator` and returned an async generator despite
 * the name. It is now a plain generator, so it can be spread, destructured
 * and consumed with `for...of`. Consumers that use `for await...of` or
 * `await gen.next()` keep working, since both accept synchronous iterators.
 *
 * @param val - The one item the generator should produce.
 * @returns A generator that yields `val` once.
 */
export function* toGenerator<T>(val: T): Generator<T, void, unknown> {
    yield val;
}
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit a2ebcb882fa92644cc3dfd6b8d8e66f06dd940e9
 
1
+ Subproject commit 8c31e85dc52dfcc7d1d86df0328df3a94319b534