Spaces:
Build error
Build error
feat: support fetching favicon (#1155)
Browse files
* feat: support fetching favicon
* Update backend/functions/src/dto/scrapping-options.ts
Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
* Update backend/functions/src/dto/scrapping-options.ts
Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
* chore: remove 'withFavicon' from CrawlerOptions
---------
Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
backend/functions/src/cloud-functions/searcher-serper.ts
CHANGED
|
@@ -92,6 +92,7 @@ export class SearcherHost extends RPCHost {
|
|
| 92 |
// Return content by default
|
| 93 |
const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
|
| 94 |
const crawlWithoutContent = respondWith.includes('no-content');
|
|
|
|
| 95 |
|
| 96 |
let chargeAmount = 0;
|
| 97 |
const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
|
|
@@ -164,7 +165,7 @@ export class SearcherHost extends RPCHost {
|
|
| 164 |
const targetResultCount = crawlWithoutContent ? count : count + 2;
|
| 165 |
const organicSearchResults = r.organic.slice(0, targetResultCount);
|
| 166 |
if (crawlWithoutContent || count === 0) {
|
| 167 |
-
const fakeResults = await this.fakeResult(crawlerOptions
|
| 168 |
lastScrapped = fakeResults;
|
| 169 |
if (!crawlWithoutContent) {
|
| 170 |
chargeAmount = this.assignChargeAmount(lastScrapped);
|
|
@@ -181,6 +182,7 @@ export class SearcherHost extends RPCHost {
|
|
| 181 |
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
|
| 182 |
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
| 183 |
count,
|
|
|
|
| 184 |
);
|
| 185 |
|
| 186 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
|
@@ -328,10 +330,13 @@ export class SearcherHost extends RPCHost {
|
|
| 328 |
}
|
| 329 |
|
| 330 |
async fakeResult(
|
| 331 |
-
|
| 332 |
searchResults?: SerperSearchResponse['organic'],
|
| 333 |
-
withContent: boolean = false
|
|
|
|
| 334 |
) {
|
|
|
|
|
|
|
| 335 |
if (!searchResults) {
|
| 336 |
return [];
|
| 337 |
}
|
|
@@ -352,7 +357,8 @@ export class SearcherHost extends RPCHost {
|
|
| 352 |
if (withContent) {
|
| 353 |
result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
|
| 354 |
}
|
| 355 |
-
|
|
|
|
| 356 |
const url = new URL(upstreamSearchResult.link);
|
| 357 |
result.favicon = await this.getFavicon(url.origin);
|
| 358 |
dataItems.push({
|
|
@@ -381,6 +387,7 @@ export class SearcherHost extends RPCHost {
|
|
| 381 |
options?: ExtraScrappingOptions,
|
| 382 |
crawlerOptions?: CrawlerOptions,
|
| 383 |
count?: number,
|
|
|
|
| 384 |
) {
|
| 385 |
if (!searchResults) {
|
| 386 |
return;
|
|
@@ -391,9 +398,11 @@ export class SearcherHost extends RPCHost {
|
|
| 391 |
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
| 392 |
const mapped = scrapped.map((x, i) => {
|
| 393 |
const upstreamSearchResult = searchResults[i];
|
|
|
|
|
|
|
| 394 |
if (!x) {
|
| 395 |
return {
|
| 396 |
-
url
|
| 397 |
title: upstreamSearchResult.title,
|
| 398 |
description: upstreamSearchResult.snippet,
|
| 399 |
content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''
|
|
@@ -412,12 +421,19 @@ export class SearcherHost extends RPCHost {
|
|
| 412 |
this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
|
| 413 |
|
| 414 |
return {
|
| 415 |
-
url
|
| 416 |
title: upstreamSearchResult.title,
|
| 417 |
description: upstreamSearchResult.snippet,
|
| 418 |
content: x.text,
|
| 419 |
};
|
| 420 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
});
|
| 422 |
|
| 423 |
const resultArray = await Promise.all(mapped) as FormattedPage[];
|
|
@@ -448,11 +464,11 @@ export class SearcherHost extends RPCHost {
|
|
| 448 |
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
|
| 449 |
return `[${i + 1}] Title: ${this.title}
|
| 450 |
[${i + 1}] URL Source: ${this.url}
|
| 451 |
-
[${i + 1}] Description: ${this.description}${textRep}
|
| 452 |
`;
|
| 453 |
}
|
| 454 |
|
| 455 |
-
return `[${i + 1}] No content available for ${this.url}`;
|
| 456 |
}
|
| 457 |
|
| 458 |
const mixins = [];
|
|
@@ -486,7 +502,7 @@ export class SearcherHost extends RPCHost {
|
|
| 486 |
}
|
| 487 |
|
| 488 |
return `[${i + 1}] Title: ${this.title}
|
| 489 |
-
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
|
| 490 |
[${i + 1}] Markdown Content:
|
| 491 |
${this.content}
|
| 492 |
${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
|
|
|
| 92 |
// Return content by default
|
| 93 |
const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
|
| 94 |
const crawlWithoutContent = respondWith.includes('no-content');
|
| 95 |
+
const withFavicon = ctx.req.get('X-With-Favicons') === 'true';
|
| 96 |
|
| 97 |
let chargeAmount = 0;
|
| 98 |
const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
|
|
|
|
| 165 |
const targetResultCount = crawlWithoutContent ? count : count + 2;
|
| 166 |
const organicSearchResults = r.organic.slice(0, targetResultCount);
|
| 167 |
if (crawlWithoutContent || count === 0) {
|
| 168 |
+
const fakeResults = await this.fakeResult(crawlerOptions, organicSearchResults, !crawlWithoutContent, withFavicon);
|
| 169 |
lastScrapped = fakeResults;
|
| 170 |
if (!crawlWithoutContent) {
|
| 171 |
chargeAmount = this.assignChargeAmount(lastScrapped);
|
|
|
|
| 182 |
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
|
| 183 |
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
| 184 |
count,
|
| 185 |
+
withFavicon
|
| 186 |
);
|
| 187 |
|
| 188 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
|
|
|
| 330 |
}
|
| 331 |
|
| 332 |
async fakeResult(
|
| 333 |
+
crawlerOptions: CrawlerOptions,
|
| 334 |
searchResults?: SerperSearchResponse['organic'],
|
| 335 |
+
withContent: boolean = false,
|
| 336 |
+
withFavicon: boolean = false,
|
| 337 |
) {
|
| 338 |
+
const mode: string | 'markdown' | 'html' | 'text' | 'screenshot' = crawlerOptions.respondWith;
|
| 339 |
+
|
| 340 |
if (!searchResults) {
|
| 341 |
return [];
|
| 342 |
}
|
|
|
|
| 357 |
if (withContent) {
|
| 358 |
result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
|
| 359 |
}
|
| 360 |
+
|
| 361 |
+
if (withFavicon) {
|
| 362 |
const url = new URL(upstreamSearchResult.link);
|
| 363 |
result.favicon = await this.getFavicon(url.origin);
|
| 364 |
dataItems.push({
|
|
|
|
| 387 |
options?: ExtraScrappingOptions,
|
| 388 |
crawlerOptions?: CrawlerOptions,
|
| 389 |
count?: number,
|
| 390 |
+
withFavicon?: boolean,
|
| 391 |
) {
|
| 392 |
if (!searchResults) {
|
| 393 |
return;
|
|
|
|
| 398 |
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
| 399 |
const mapped = scrapped.map((x, i) => {
|
| 400 |
const upstreamSearchResult = searchResults[i];
|
| 401 |
+
const url = upstreamSearchResult.link;
|
| 402 |
+
|
| 403 |
if (!x) {
|
| 404 |
return {
|
| 405 |
+
url,
|
| 406 |
title: upstreamSearchResult.title,
|
| 407 |
description: upstreamSearchResult.snippet,
|
| 408 |
content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''
|
|
|
|
| 421 |
this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
|
| 422 |
|
| 423 |
return {
|
| 424 |
+
url,
|
| 425 |
title: upstreamSearchResult.title,
|
| 426 |
description: upstreamSearchResult.snippet,
|
| 427 |
content: x.text,
|
| 428 |
};
|
| 429 |
});
|
| 430 |
+
}).map(async (x) => {
|
| 431 |
+
const page = await x;
|
| 432 |
+
if (withFavicon && page.url) {
|
| 433 |
+
const url = new URL(page.url);
|
| 434 |
+
page.favicon = await this.getFavicon(url.origin);
|
| 435 |
+
}
|
| 436 |
+
return page;
|
| 437 |
});
|
| 438 |
|
| 439 |
const resultArray = await Promise.all(mapped) as FormattedPage[];
|
|
|
|
| 464 |
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
|
| 465 |
return `[${i + 1}] Title: ${this.title}
|
| 466 |
[${i + 1}] URL Source: ${this.url}
|
| 467 |
+
[${i + 1}] Description: ${this.description}${textRep}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
|
| 468 |
`;
|
| 469 |
}
|
| 470 |
|
| 471 |
+
return `[${i + 1}] No content available for ${this.url}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}`;
|
| 472 |
}
|
| 473 |
|
| 474 |
const mixins = [];
|
|
|
|
| 502 |
}
|
| 503 |
|
| 504 |
return `[${i + 1}] Title: ${this.title}
|
| 505 |
+
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
|
| 506 |
[${i + 1}] Markdown Content:
|
| 507 |
${this.content}
|
| 508 |
${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|