Aaron Ji nomagick committed on
Commit
21ae52a
·
unverified ·
1 Parent(s): cd6a4ee

feat: support fetching favicon (#1155)

Browse files

* feat: support fetching favicon

* Update backend/functions/src/dto/scrapping-options.ts

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>

* Update backend/functions/src/dto/scrapping-options.ts

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>

* chore: remove 'withFavicon' from CrawlerOptions

---------

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>

backend/functions/src/cloud-functions/searcher-serper.ts CHANGED
@@ -92,6 +92,7 @@ export class SearcherHost extends RPCHost {
92
  // Return content by default
93
  const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
94
  const crawlWithoutContent = respondWith.includes('no-content');
 
95
 
96
  let chargeAmount = 0;
97
  const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
@@ -164,7 +165,7 @@ export class SearcherHost extends RPCHost {
164
  const targetResultCount = crawlWithoutContent ? count : count + 2;
165
  const organicSearchResults = r.organic.slice(0, targetResultCount);
166
  if (crawlWithoutContent || count === 0) {
167
- const fakeResults = await this.fakeResult(crawlerOptions.respondWith, organicSearchResults, !crawlWithoutContent);
168
  lastScrapped = fakeResults;
169
  if (!crawlWithoutContent) {
170
  chargeAmount = this.assignChargeAmount(lastScrapped);
@@ -181,6 +182,7 @@ export class SearcherHost extends RPCHost {
181
  const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
182
  CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
183
  count,
 
184
  );
185
 
186
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
@@ -328,10 +330,13 @@ export class SearcherHost extends RPCHost {
328
  }
329
 
330
  async fakeResult(
331
- mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
332
  searchResults?: SerperSearchResponse['organic'],
333
- withContent: boolean = false
 
334
  ) {
 
 
335
  if (!searchResults) {
336
  return [];
337
  }
@@ -352,7 +357,8 @@ export class SearcherHost extends RPCHost {
352
  if (withContent) {
353
  result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
354
  }
355
- if (mode.includes('no-content')) {
 
356
  const url = new URL(upstreamSearchResult.link);
357
  result.favicon = await this.getFavicon(url.origin);
358
  dataItems.push({
@@ -381,6 +387,7 @@ export class SearcherHost extends RPCHost {
381
  options?: ExtraScrappingOptions,
382
  crawlerOptions?: CrawlerOptions,
383
  count?: number,
 
384
  ) {
385
  if (!searchResults) {
386
  return;
@@ -391,9 +398,11 @@ export class SearcherHost extends RPCHost {
391
  for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
392
  const mapped = scrapped.map((x, i) => {
393
  const upstreamSearchResult = searchResults[i];
 
 
394
  if (!x) {
395
  return {
396
- url: upstreamSearchResult.link,
397
  title: upstreamSearchResult.title,
398
  description: upstreamSearchResult.snippet,
399
  content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''
@@ -412,12 +421,19 @@ export class SearcherHost extends RPCHost {
412
  this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
413
 
414
  return {
415
- url: upstreamSearchResult.link,
416
  title: upstreamSearchResult.title,
417
  description: upstreamSearchResult.snippet,
418
  content: x.text,
419
  };
420
  });
 
 
 
 
 
 
 
421
  });
422
 
423
  const resultArray = await Promise.all(mapped) as FormattedPage[];
@@ -448,11 +464,11 @@ export class SearcherHost extends RPCHost {
448
  const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
449
  return `[${i + 1}] Title: ${this.title}
450
  [${i + 1}] URL Source: ${this.url}
451
- [${i + 1}] Description: ${this.description}${textRep}
452
  `;
453
  }
454
 
455
- return `[${i + 1}] No content available for ${this.url}`;
456
  }
457
 
458
  const mixins = [];
@@ -486,7 +502,7 @@ export class SearcherHost extends RPCHost {
486
  }
487
 
488
  return `[${i + 1}] Title: ${this.title}
489
- [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
490
  [${i + 1}] Markdown Content:
491
  ${this.content}
492
  ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
 
92
  // Return content by default
93
  const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
94
  const crawlWithoutContent = respondWith.includes('no-content');
95
+ const withFavicon = ctx.req.get('X-With-Favicons') === 'true';
96
 
97
  let chargeAmount = 0;
98
  const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
 
165
  const targetResultCount = crawlWithoutContent ? count : count + 2;
166
  const organicSearchResults = r.organic.slice(0, targetResultCount);
167
  if (crawlWithoutContent || count === 0) {
168
+ const fakeResults = await this.fakeResult(crawlerOptions, organicSearchResults, !crawlWithoutContent, withFavicon);
169
  lastScrapped = fakeResults;
170
  if (!crawlWithoutContent) {
171
  chargeAmount = this.assignChargeAmount(lastScrapped);
 
182
  const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
183
  CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
184
  count,
185
+ withFavicon
186
  );
187
 
188
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
 
330
  }
331
 
332
  async fakeResult(
333
+ crawlerOptions: CrawlerOptions,
334
  searchResults?: SerperSearchResponse['organic'],
335
+ withContent: boolean = false,
336
+ withFavicon: boolean = false,
337
  ) {
338
+ const mode: string | 'markdown' | 'html' | 'text' | 'screenshot' = crawlerOptions.respondWith;
339
+
340
  if (!searchResults) {
341
  return [];
342
  }
 
357
  if (withContent) {
358
  result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
359
  }
360
+
361
+ if (withFavicon) {
362
  const url = new URL(upstreamSearchResult.link);
363
  result.favicon = await this.getFavicon(url.origin);
364
  dataItems.push({
 
387
  options?: ExtraScrappingOptions,
388
  crawlerOptions?: CrawlerOptions,
389
  count?: number,
390
+ withFavicon?: boolean,
391
  ) {
392
  if (!searchResults) {
393
  return;
 
398
  for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
399
  const mapped = scrapped.map((x, i) => {
400
  const upstreamSearchResult = searchResults[i];
401
+ const url = upstreamSearchResult.link;
402
+
403
  if (!x) {
404
  return {
405
+ url,
406
  title: upstreamSearchResult.title,
407
  description: upstreamSearchResult.snippet,
408
  content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''
 
421
  this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
422
 
423
  return {
424
+ url,
425
  title: upstreamSearchResult.title,
426
  description: upstreamSearchResult.snippet,
427
  content: x.text,
428
  };
429
  });
430
+ }).map(async (x) => {
431
+ const page = await x;
432
+ if (withFavicon && page.url) {
433
+ const url = new URL(page.url);
434
+ page.favicon = await this.getFavicon(url.origin);
435
+ }
436
+ return page;
437
  });
438
 
439
  const resultArray = await Promise.all(mapped) as FormattedPage[];
 
464
  const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
465
  return `[${i + 1}] Title: ${this.title}
466
  [${i + 1}] URL Source: ${this.url}
467
+ [${i + 1}] Description: ${this.description}${textRep}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
468
  `;
469
  }
470
 
471
+ return `[${i + 1}] No content available for ${this.url}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}`;
472
  }
473
 
474
  const mixins = [];
 
502
  }
503
 
504
  return `[${i + 1}] Title: ${this.title}
505
+ [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
506
  [${i + 1}] Markdown Content:
507
  ${this.content}
508
  ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;