Spaces:
Build error
Build error
feat(adaptive-crawler): optimize relevance detection
Browse files
backend/functions/src/cloud-functions/adaptive-crawler.ts
CHANGED
|
@@ -362,10 +362,9 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
|
| 362 |
|
| 363 |
const title = json.data.title;
|
| 364 |
const description = json.data.description;
|
| 365 |
-
const rerankQuery = `TITLE: ${title}; DESCRIPTION: ${description}`;
|
| 366 |
const links = json.data.links as Record<string, string>;
|
| 367 |
|
| 368 |
-
const relevantUrls = await this.getRelevantUrls(token, { query: rerankQuery, links });
|
| 369 |
this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`);
|
| 370 |
|
| 371 |
for (const url of relevantUrls) {
|
|
@@ -418,9 +417,10 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
|
| 418 |
}
|
| 419 |
|
| 420 |
async getRelevantUrls(token: string, {
|
| 421 |
-
query, links
|
| 422 |
}: {
|
| 423 |
-
query: string;
|
|
|
|
| 424 |
links: Record<string, string>;
|
| 425 |
}) {
|
| 426 |
const invalidSuffix = [
|
|
@@ -434,6 +434,13 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
|
| 434 |
.map(([title, link]) => link)
|
| 435 |
.filter(link => link.startsWith('http') && !invalidSuffix.some(suffix => link.endsWith(suffix)));
|
| 436 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
const data = {
|
| 438 |
model: 'jina-reranker-v2-base-multilingual',
|
| 439 |
query,
|
|
@@ -460,7 +467,8 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
|
| 460 |
}[];
|
| 461 |
};
|
| 462 |
|
| 463 |
-
|
|
|
|
| 464 |
}
|
| 465 |
|
| 466 |
getIndex(user?: JinaEmbeddingsTokenAccount) {
|
|
|
|
| 362 |
|
| 363 |
const title = json.data.title;
|
| 364 |
const description = json.data.description;
|
|
|
|
| 365 |
const links = json.data.links as Record<string, string>;
|
| 366 |
|
| 367 |
+
const relevantUrls = await this.getRelevantUrls(token, { title, description, links });
|
| 368 |
this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`);
|
| 369 |
|
| 370 |
for (const url of relevantUrls) {
|
|
|
|
| 417 |
}
|
| 418 |
|
| 419 |
async getRelevantUrls(token: string, {
|
| 420 |
+
title, description, links
|
| 421 |
}: {
|
| 422 |
+
title: string;
|
| 423 |
+
description: string;
|
| 424 |
links: Record<string, string>;
|
| 425 |
}) {
|
| 426 |
const invalidSuffix = [
|
|
|
|
| 434 |
.map(([title, link]) => link)
|
| 435 |
.filter(link => link.startsWith('http') && !invalidSuffix.some(suffix => link.endsWith(suffix)));
|
| 436 |
|
| 437 |
+
let query = '';
|
| 438 |
+
if (!description) {
|
| 439 |
+
query += title;
|
| 440 |
+
} else {
|
| 441 |
+
query += `TITLE: ${title}; DESCRIPTION: ${description}`;
|
| 442 |
+
}
|
| 443 |
+
|
| 444 |
const data = {
|
| 445 |
model: 'jina-reranker-v2-base-multilingual',
|
| 446 |
query,
|
|
|
|
| 467 |
}[];
|
| 468 |
};
|
| 469 |
|
| 470 |
+
const highestRelevanceScore = json.results[0]?.relevance_score ?? 0;
|
| 471 |
+
return json.results.filter(r => r.relevance_score > Math.max(highestRelevanceScore * 0.6, 0.1)).map(r => removeURLHash(r.document.text));
|
| 472 |
}
|
| 473 |
|
| 474 |
getIndex(user?: JinaEmbeddingsTokenAccount) {
|