Spaces:
Build error
Build error
Aaron Ji committed on
feat: support fallback query (#1179)
Browse files
* feat: support fallback query
* chore: update scaler value
- src/api/searcher-serper.ts +100 -13
src/api/searcher-serper.ts
CHANGED
|
@@ -112,6 +112,7 @@ export class SearcherHost extends RPCHost {
|
|
| 112 |
@Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) hl?: string,
|
| 113 |
@Param('location') location?: string,
|
| 114 |
@Param('page') page?: number,
|
|
|
|
| 115 |
@Param('q') q?: string,
|
| 116 |
) {
|
| 117 |
// We want to make our search API follow SERP schema, so we need to expose 'num' parameter.
|
|
@@ -262,16 +263,19 @@ export class SearcherHost extends RPCHost {
|
|
| 262 |
fetchNum = count > 10 ? 30 : 20;
|
| 263 |
}
|
| 264 |
|
|
|
|
| 265 |
let chargeAmountScaler = 1;
|
| 266 |
if (searchEngine === 'bing') {
|
| 267 |
this.threadLocal.set('bing-preferred', true);
|
| 268 |
chargeAmountScaler = 3;
|
| 269 |
}
|
|
|
|
| 270 |
if (variant !== 'web') {
|
| 271 |
chargeAmountScaler = 5;
|
| 272 |
}
|
| 273 |
|
| 274 |
-
|
|
|
|
| 275 |
variant,
|
| 276 |
provider: searchEngine,
|
| 277 |
q: searchQuery,
|
|
@@ -280,7 +284,14 @@ export class SearcherHost extends RPCHost {
|
|
| 280 |
hl,
|
| 281 |
location,
|
| 282 |
page,
|
| 283 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
let results;
|
| 286 |
switch (variant) {
|
|
@@ -312,7 +323,11 @@ export class SearcherHost extends RPCHost {
|
|
| 312 |
const targetResultCount = crawlWithoutContent ? count : count + 2;
|
| 313 |
const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
|
| 314 |
trimmedResults.toString = function () {
|
| 315 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
};
|
| 317 |
if (!crawlerOptions.respondWith.includes('no-content') &&
|
| 318 |
['html', 'text', 'shot', 'markdown', 'content'].some((x) => crawlerOptions.respondWith.includes(x))
|
|
@@ -349,8 +364,16 @@ export class SearcherHost extends RPCHost {
|
|
| 349 |
break;
|
| 350 |
}
|
| 351 |
|
| 352 |
-
chargeAmount = this.assignChargeAmount(scrapped, count, chargeAmountScaler);
|
| 353 |
lastScrapped = scrapped;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
sseStream.write({
|
| 355 |
event: 'data',
|
| 356 |
data: scrapped,
|
|
@@ -383,7 +406,8 @@ export class SearcherHost extends RPCHost {
|
|
| 383 |
return;
|
| 384 |
}
|
| 385 |
await assigningOfGeneralMixins;
|
| 386 |
-
chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler);
|
|
|
|
| 387 |
rpcReflect.return(lastScrapped);
|
| 388 |
earlyReturn = true;
|
| 389 |
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
|
@@ -404,7 +428,7 @@ export class SearcherHost extends RPCHost {
|
|
| 404 |
clearTimeout(earlyReturnTimer);
|
| 405 |
}
|
| 406 |
await assigningOfGeneralMixins;
|
| 407 |
-
chargeAmount = this.assignChargeAmount(scrapped, count, chargeAmountScaler);
|
| 408 |
|
| 409 |
return scrapped;
|
| 410 |
}
|
|
@@ -419,7 +443,7 @@ export class SearcherHost extends RPCHost {
|
|
| 419 |
|
| 420 |
if (!earlyReturn) {
|
| 421 |
await assigningOfGeneralMixins;
|
| 422 |
-
chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler);
|
| 423 |
}
|
| 424 |
|
| 425 |
return lastScrapped;
|
|
@@ -435,7 +459,8 @@ export class SearcherHost extends RPCHost {
|
|
| 435 |
return;
|
| 436 |
}
|
| 437 |
await assigningOfGeneralMixins;
|
| 438 |
-
chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler);
|
|
|
|
| 439 |
rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
|
| 440 |
earlyReturn = true;
|
| 441 |
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
|
@@ -458,7 +483,7 @@ export class SearcherHost extends RPCHost {
|
|
| 458 |
clearTimeout(earlyReturnTimer);
|
| 459 |
}
|
| 460 |
await assigningOfGeneralMixins;
|
| 461 |
-
chargeAmount = this.assignChargeAmount(scrapped, count, chargeAmountScaler);
|
| 462 |
|
| 463 |
return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
|
| 464 |
}
|
|
@@ -473,12 +498,70 @@ export class SearcherHost extends RPCHost {
|
|
| 473 |
|
| 474 |
if (!earlyReturn) {
|
| 475 |
await assigningOfGeneralMixins;
|
| 476 |
-
chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler);
|
| 477 |
}
|
| 478 |
|
| 479 |
return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
|
| 480 |
}
|
| 481 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
async *fetchSearchResults(
|
| 483 |
mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'favicon' | 'content',
|
| 484 |
searchResults?: FormattedPage[],
|
|
@@ -540,7 +623,7 @@ export class SearcherHost extends RPCHost {
|
|
| 540 |
return resultArray;
|
| 541 |
}
|
| 542 |
|
| 543 |
-
assignChargeAmount(formatted: FormattedPage[], num: number, scaler: number) {
|
| 544 |
let contentCharge = 0;
|
| 545 |
for (const x of formatted) {
|
| 546 |
const itemAmount = this.crawler.assignChargeAmount(x) || 0;
|
|
@@ -562,8 +645,12 @@ export class SearcherHost extends RPCHost {
|
|
| 562 |
}
|
| 563 |
}
|
| 564 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 565 |
|
| 566 |
-
assignMeta(formatted,
|
| 567 |
|
| 568 |
return final;
|
| 569 |
}
|
|
@@ -757,4 +844,4 @@ const searchResultProto = {
|
|
| 757 |
|
| 758 |
return chunks.join('\n');
|
| 759 |
}
|
| 760 |
-
};
|
|
|
|
| 112 |
@Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) hl?: string,
|
| 113 |
@Param('location') location?: string,
|
| 114 |
@Param('page') page?: number,
|
| 115 |
+
@Param('fallback', { type: Boolean, default: false }) fallback?: boolean,
|
| 116 |
@Param('q') q?: string,
|
| 117 |
) {
|
| 118 |
// We want to make our search API follow SERP schema, so we need to expose 'num' parameter.
|
|
|
|
| 263 |
fetchNum = count > 10 ? 30 : 20;
|
| 264 |
}
|
| 265 |
|
| 266 |
+
let fallbackQuery: string | undefined;
|
| 267 |
let chargeAmountScaler = 1;
|
| 268 |
if (searchEngine === 'bing') {
|
| 269 |
this.threadLocal.set('bing-preferred', true);
|
| 270 |
chargeAmountScaler = 3;
|
| 271 |
}
|
| 272 |
+
|
| 273 |
if (variant !== 'web') {
|
| 274 |
chargeAmountScaler = 5;
|
| 275 |
}
|
| 276 |
|
| 277 |
+
// Search with fallback logic if enabled
|
| 278 |
+
const searchParams = {
|
| 279 |
variant,
|
| 280 |
provider: searchEngine,
|
| 281 |
q: searchQuery,
|
|
|
|
| 284 |
hl,
|
| 285 |
location,
|
| 286 |
page,
|
| 287 |
+
};
|
| 288 |
+
|
| 289 |
+
const { response: r, query: successQuery, tryTimes } = await this.searchWithFallback(
|
| 290 |
+
searchParams, fallback, crawlerOptions.noCache
|
| 291 |
+
);
|
| 292 |
+
chargeAmountScaler *= tryTimes;
|
| 293 |
+
|
| 294 |
+
fallbackQuery = successQuery !== searchQuery ? successQuery : undefined;
|
| 295 |
|
| 296 |
let results;
|
| 297 |
switch (variant) {
|
|
|
|
| 323 |
const targetResultCount = crawlWithoutContent ? count : count + 2;
|
| 324 |
const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));
|
| 325 |
trimmedResults.toString = function () {
|
| 326 |
+
let r = this.map((x, i) => x ? Reflect.apply(x.toString, x, [i]) : '').join('\n\n').trimEnd() + '\n';
|
| 327 |
+
if (fallbackQuery) {
|
| 328 |
+
r = `Fallback query: ${fallbackQuery}\n\n${r}`;
|
| 329 |
+
}
|
| 330 |
+
return r;
|
| 331 |
};
|
| 332 |
if (!crawlerOptions.respondWith.includes('no-content') &&
|
| 333 |
['html', 'text', 'shot', 'markdown', 'content'].some((x) => crawlerOptions.respondWith.includes(x))
|
|
|
|
| 364 |
break;
|
| 365 |
}
|
| 366 |
|
| 367 |
+
chargeAmount = this.assignChargeAmount(scrapped, count, chargeAmountScaler, fallbackQuery);
|
| 368 |
lastScrapped = scrapped;
|
| 369 |
+
|
| 370 |
+
if (fallbackQuery) {
|
| 371 |
+
sseStream.write({
|
| 372 |
+
event: 'meta',
|
| 373 |
+
data: { fallback: fallbackQuery },
|
| 374 |
+
});
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
sseStream.write({
|
| 378 |
event: 'data',
|
| 379 |
data: scrapped,
|
|
|
|
| 406 |
return;
|
| 407 |
}
|
| 408 |
await assigningOfGeneralMixins;
|
| 409 |
+
chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler, fallbackQuery);
|
| 410 |
+
|
| 411 |
rpcReflect.return(lastScrapped);
|
| 412 |
earlyReturn = true;
|
| 413 |
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
|
|
|
| 428 |
clearTimeout(earlyReturnTimer);
|
| 429 |
}
|
| 430 |
await assigningOfGeneralMixins;
|
| 431 |
+
chargeAmount = this.assignChargeAmount(scrapped, count, chargeAmountScaler, fallbackQuery);
|
| 432 |
|
| 433 |
return scrapped;
|
| 434 |
}
|
|
|
|
| 443 |
|
| 444 |
if (!earlyReturn) {
|
| 445 |
await assigningOfGeneralMixins;
|
| 446 |
+
chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler, fallbackQuery);
|
| 447 |
}
|
| 448 |
|
| 449 |
return lastScrapped;
|
|
|
|
| 459 |
return;
|
| 460 |
}
|
| 461 |
await assigningOfGeneralMixins;
|
| 462 |
+
chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler, fallbackQuery);
|
| 463 |
+
|
| 464 |
rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
|
| 465 |
earlyReturn = true;
|
| 466 |
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
|
|
|
| 483 |
clearTimeout(earlyReturnTimer);
|
| 484 |
}
|
| 485 |
await assigningOfGeneralMixins;
|
| 486 |
+
chargeAmount = this.assignChargeAmount(scrapped, count, chargeAmountScaler, fallbackQuery);
|
| 487 |
|
| 488 |
return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
|
| 489 |
}
|
|
|
|
| 498 |
|
| 499 |
if (!earlyReturn) {
|
| 500 |
await assigningOfGeneralMixins;
|
| 501 |
+
chargeAmount = this.assignChargeAmount(lastScrapped, count, chargeAmountScaler, fallbackQuery);
|
| 502 |
}
|
| 503 |
|
| 504 |
return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
|
| 505 |
}
|
| 506 |
|
| 507 |
+
/**
 * Run a search and, when it yields nothing, retry with progressively shorter queries.
 *
 * The original query is always tried first. If it produces no results and
 * `useFallback` is true, the last whitespace-separated term is dropped and the
 * search is repeated, until a query returns results or only one term is left.
 *
 * @param params Search parameters; `params.q` is the original query and
 *   `params.variant` selects which result list of the response is inspected
 * @param useFallback Whether to use the fallback mechanism
 * @param noCache Whether to bypass cache (forwarded to `cachedSearch`)
 * @returns The response to use, the query that produced it, and `tryTimes`,
 *   the number of searches issued. When every fallback query also comes back
 *   empty, the original response and original query are returned.
 */
async searchWithFallback(
    params: SerperSearchQueryParams & { variant: 'web' | 'images' | 'news'; provider?: string; },
    useFallback: boolean = false,
    noCache: boolean = false
): Promise<{ response: SerperSearchResponse; query: string; tryTimes: number }> {
    // Pick the result list matching the variant. A response that lacks the
    // list is treated as "no results" instead of crashing on `.length`.
    const pickResults = (res: SerperSearchResponse): unknown[] => {
        switch (params.variant) {
            case 'images': return (res as SerperImageSearchResponse).images || [];
            case 'news': return (res as SerperNewsSearchResponse).news || [];
            case 'web': default: return (res as SerperWebSearchResponse).organic || [];
        }
    };

    // Try original query first
    const originalQuery = params.q;
    let tryTimes = 1;
    const response = await this.cachedSearch(params, noCache);

    // Return early if fallback is disabled or we got results
    if (!useFallback || pickResults(response).length > 0) {
        return { response, query: originalQuery, tryTimes };
    }

    // Try with progressively shorter queries
    const terms = originalQuery.trim().split(/\s+/);

    this.logger.info(`No results for "${originalQuery}", trying fallback queries`);

    while (terms.length > 1) {
        terms.pop(); // Remove last term
        const shortenedQuery = terms.join(' ');

        const fallbackResponse = await this.cachedSearch({ ...params, q: shortenedQuery }, noCache);

        // Count every issued search, successful or not
        tryTimes++;
        if (pickResults(fallbackResponse).length > 0) {
            return { response: fallbackResponse, query: shortenedQuery, tryTimes };
        }
    }

    // Nothing matched: hand back the original (empty) response
    return { response, query: originalQuery, tryTimes };
}
|
| 564 |
+
|
| 565 |
async *fetchSearchResults(
|
| 566 |
mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'favicon' | 'content',
|
| 567 |
searchResults?: FormattedPage[],
|
|
|
|
| 623 |
return resultArray;
|
| 624 |
}
|
| 625 |
|
| 626 |
+
assignChargeAmount(formatted: FormattedPage[], num: number, scaler: number, fallbackQuery?: string) {
|
| 627 |
let contentCharge = 0;
|
| 628 |
for (const x of formatted) {
|
| 629 |
const itemAmount = this.crawler.assignChargeAmount(x) || 0;
|
|
|
|
| 645 |
}
|
| 646 |
}
|
| 647 |
|
| 648 |
+
const metadata: Record<string, any> = { usage: { tokens: final } };
|
| 649 |
+
if (fallbackQuery) {
|
| 650 |
+
metadata.fallback = fallbackQuery;
|
| 651 |
+
}
|
| 652 |
|
| 653 |
+
assignMeta(formatted, metadata);
|
| 654 |
|
| 655 |
return final;
|
| 656 |
}
|
|
|
|
| 844 |
|
| 845 |
return chunks.join('\n');
|
| 846 |
}
|
| 847 |
+
};
|