Spaces:
Build error
Build error
feat: serp endpoint (#1180)
Browse files
* wip
* wip
* fix
* wip
* fix: add jitter to user cache
* cd
* fix
* fix
* fix: user cache age comparison
* fix: try to partition apiroll query
* bump: deps
* wip
* cd
* feat: fallback for serp
* fix
* cd
* fix
* fix
* serp: stop hiding expense
* serp: enable fallback by default
- .github/workflows/cd.yml +7 -1
- .vscode/launch.json +22 -0
- src/api/crawler.ts +63 -6
- src/api/searcher.ts +0 -503
- src/api/serp.ts +505 -0
- src/db/searched.ts +4 -0
- src/dto/crawler-options.ts +2 -0
- src/dto/jina-embeddings-auth.ts +21 -4
- src/services/geoip.ts +9 -0
- src/services/misc.ts +9 -1
- src/services/puppeteer.ts +2 -5
- src/services/serp/compat.ts +12 -0
- src/services/serp/google.ts +314 -0
- src/services/serp/puppeteer.ts +692 -0
- src/services/serp/serper.ts +165 -0
- src/stand-alone/serp.ts +160 -0
- thinapps-shared +1 -1
.github/workflows/cd.yml
CHANGED
|
@@ -75,9 +75,15 @@ jobs:
|
|
| 75 |
- name: Deploy SEARCH with Tag
|
| 76 |
run: |
|
| 77 |
gcloud beta run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
|
|
|
|
|
|
|
|
|
|
| 78 |
- name: Deploy CRAWL-EU with Tag
|
| 79 |
run: |
|
| 80 |
gcloud beta run deploy crawl-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
|
| 81 |
- name: Deploy SEARCH-EU with Tag
|
| 82 |
run: |
|
| 83 |
-
gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
- name: Deploy SEARCH with Tag
|
| 76 |
run: |
|
| 77 |
gcloud beta run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
|
| 78 |
+
- name: Deploy SERP with Tag
|
| 79 |
+
run: |
|
| 80 |
+
gcloud beta run deploy serp --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
|
| 81 |
- name: Deploy CRAWL-EU with Tag
|
| 82 |
run: |
|
| 83 |
gcloud beta run deploy crawl-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
|
| 84 |
- name: Deploy SEARCH-EU with Tag
|
| 85 |
run: |
|
| 86 |
+
gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
|
| 87 |
+
- name: Deploy SERP-JP with Tag
|
| 88 |
+
run: |
|
| 89 |
+
gcloud beta run deploy serp-jp --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-northeast1 --async --min-instances 0 --deploy-health-check --use-http2
|
.vscode/launch.json
CHANGED
|
@@ -102,5 +102,27 @@
|
|
| 102 |
"preLaunchTask": "Backend:build:watch",
|
| 103 |
"killBehavior": "forceful"
|
| 104 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
]
|
| 106 |
}
|
|
|
|
| 102 |
"preLaunchTask": "Backend:build:watch",
|
| 103 |
"killBehavior": "forceful"
|
| 104 |
},
|
| 105 |
+
{
|
| 106 |
+
"name": "Debug Stand Alone SERP",
|
| 107 |
+
"request": "launch",
|
| 108 |
+
"runtimeArgs": [
|
| 109 |
+
"--env-file=.secret.local",
|
| 110 |
+
],
|
| 111 |
+
"env": {
|
| 112 |
+
"GCLOUD_PROJECT": "reader-6b7dc",
|
| 113 |
+
"PREFERRED_PROXY_COUNTRY": "hk",
|
| 114 |
+
"OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk",
|
| 115 |
+
"LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
|
| 116 |
+
},
|
| 117 |
+
"cwd": "${workspaceFolder}",
|
| 118 |
+
"program": "build/stand-alone/serp.js",
|
| 119 |
+
"skipFiles": [
|
| 120 |
+
"<node_internals>/**"
|
| 121 |
+
],
|
| 122 |
+
"type": "node",
|
| 123 |
+
"outputCapture": "std",
|
| 124 |
+
"preLaunchTask": "Backend:build:watch",
|
| 125 |
+
"killBehavior": "forceful"
|
| 126 |
+
},
|
| 127 |
]
|
| 128 |
}
|
src/api/crawler.ts
CHANGED
|
@@ -48,6 +48,7 @@ import { RobotsTxtService } from '../services/robots-text';
|
|
| 48 |
import { TempFileManager } from '../services/temp-file';
|
| 49 |
import { MiscService } from '../services/misc';
|
| 50 |
import { HTTPServiceError } from 'civkit';
|
|
|
|
| 51 |
|
| 52 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 53 |
withIframe?: boolean | 'quoted';
|
|
@@ -58,6 +59,7 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
|
|
| 58 |
engine?: string;
|
| 59 |
allocProxy?: string;
|
| 60 |
private?: boolean;
|
|
|
|
| 61 |
}
|
| 62 |
|
| 63 |
const indexProto = {
|
|
@@ -94,6 +96,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 94 |
protected threadLocal: AsyncLocalContext,
|
| 95 |
protected robotsTxtService: RobotsTxtService,
|
| 96 |
protected tempFileManager: TempFileManager,
|
|
|
|
| 97 |
protected miscService: MiscService,
|
| 98 |
) {
|
| 99 |
super(...arguments);
|
|
@@ -511,15 +514,16 @@ export class CrawlerHost extends RPCHost {
|
|
| 511 |
});
|
| 512 |
}
|
| 513 |
|
| 514 |
-
const
|
| 515 |
-
if (this.puppeteerControl.circuitBreakerHosts.has(
|
| 516 |
throw new SecurityCompromiseError({
|
| 517 |
-
message: `Circular hostname: ${
|
| 518 |
path: 'url'
|
| 519 |
});
|
| 520 |
}
|
|
|
|
| 521 |
|
| 522 |
-
return
|
| 523 |
}
|
| 524 |
|
| 525 |
getUrlDigest(urlToCrawl: URL) {
|
|
@@ -886,7 +890,11 @@ export class CrawlerHost extends RPCHost {
|
|
| 886 |
}
|
| 887 |
}
|
| 888 |
} else if (crawlOpts?.allocProxy && crawlOpts.allocProxy !== 'none' && !crawlOpts.proxyUrl) {
|
| 889 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 890 |
}
|
| 891 |
|
| 892 |
try {
|
|
@@ -1030,6 +1038,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 1030 |
proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
|
| 1031 |
private: Boolean(opts.doNotTrack),
|
| 1032 |
};
|
|
|
|
| 1033 |
if (crawlOpts.targetSelector?.length) {
|
| 1034 |
if (typeof crawlOpts.targetSelector === 'string') {
|
| 1035 |
crawlOpts.targetSelector = [crawlOpts.targetSelector];
|
|
@@ -1046,6 +1055,18 @@ export class CrawlerHost extends RPCHost {
|
|
| 1046 |
}
|
| 1047 |
}
|
| 1048 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1049 |
if (opts.locale) {
|
| 1050 |
crawlOpts.extraHeaders ??= {};
|
| 1051 |
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
|
@@ -1221,6 +1242,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 1221 |
};
|
| 1222 |
}
|
| 1223 |
|
|
|
|
| 1224 |
@retryWith((err) => {
|
| 1225 |
if (err instanceof ServiceBadApproachError) {
|
| 1226 |
return false;
|
|
@@ -1239,7 +1261,14 @@ export class CrawlerHost extends RPCHost {
|
|
| 1239 |
if (opts?.allocProxy === 'none') {
|
| 1240 |
return this.curlControl.sideLoad(url, opts);
|
| 1241 |
}
|
| 1242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1243 |
const r = await this.curlControl.sideLoad(url, {
|
| 1244 |
...opts,
|
| 1245 |
proxyUrl: proxy.href,
|
|
@@ -1252,6 +1281,34 @@ export class CrawlerHost extends RPCHost {
|
|
| 1252 |
return { ...r, proxy };
|
| 1253 |
}
|
| 1254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1255 |
knownUrlThatSideLoadingWouldCrashTheBrowser(url: URL) {
|
| 1256 |
if (url.hostname === 'chromewebstore.google.com') {
|
| 1257 |
return true;
|
|
|
|
| 48 |
import { TempFileManager } from '../services/temp-file';
|
| 49 |
import { MiscService } from '../services/misc';
|
| 50 |
import { HTTPServiceError } from 'civkit';
|
| 51 |
+
import { GeoIPService } from '../services/geoip';
|
| 52 |
|
| 53 |
export interface ExtraScrappingOptions extends ScrappingOptions {
|
| 54 |
withIframe?: boolean | 'quoted';
|
|
|
|
| 59 |
engine?: string;
|
| 60 |
allocProxy?: string;
|
| 61 |
private?: boolean;
|
| 62 |
+
countryHint?: string;
|
| 63 |
}
|
| 64 |
|
| 65 |
const indexProto = {
|
|
|
|
| 96 |
protected threadLocal: AsyncLocalContext,
|
| 97 |
protected robotsTxtService: RobotsTxtService,
|
| 98 |
protected tempFileManager: TempFileManager,
|
| 99 |
+
protected geoIpService: GeoIPService,
|
| 100 |
protected miscService: MiscService,
|
| 101 |
) {
|
| 102 |
super(...arguments);
|
|
|
|
| 514 |
});
|
| 515 |
}
|
| 516 |
|
| 517 |
+
const { url: safeURL, ips } = await this.miscService.assertNormalizedUrl(url);
|
| 518 |
+
if (this.puppeteerControl.circuitBreakerHosts.has(safeURL.hostname.toLowerCase())) {
|
| 519 |
throw new SecurityCompromiseError({
|
| 520 |
+
message: `Circular hostname: ${safeURL.protocol}`,
|
| 521 |
path: 'url'
|
| 522 |
});
|
| 523 |
}
|
| 524 |
+
crawlerOptions._hintIps = ips;
|
| 525 |
|
| 526 |
+
return safeURL;
|
| 527 |
}
|
| 528 |
|
| 529 |
getUrlDigest(urlToCrawl: URL) {
|
|
|
|
| 890 |
}
|
| 891 |
}
|
| 892 |
} else if (crawlOpts?.allocProxy && crawlOpts.allocProxy !== 'none' && !crawlOpts.proxyUrl) {
|
| 893 |
+
const proxyUrl = await this.proxyProvider.alloc(this.figureOutBestProxyCountry(crawlOpts));
|
| 894 |
+
if (proxyUrl.protocol === 'socks5h:') {
|
| 895 |
+
proxyUrl.protocol = 'socks5:';
|
| 896 |
+
}
|
| 897 |
+
crawlOpts.proxyUrl = proxyUrl.href;
|
| 898 |
}
|
| 899 |
|
| 900 |
try {
|
|
|
|
| 1038 |
proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
|
| 1039 |
private: Boolean(opts.doNotTrack),
|
| 1040 |
};
|
| 1041 |
+
|
| 1042 |
if (crawlOpts.targetSelector?.length) {
|
| 1043 |
if (typeof crawlOpts.targetSelector === 'string') {
|
| 1044 |
crawlOpts.targetSelector = [crawlOpts.targetSelector];
|
|
|
|
| 1055 |
}
|
| 1056 |
}
|
| 1057 |
|
| 1058 |
+
if (opts._hintIps?.length) {
|
| 1059 |
+
const hints = await this.geoIpService.lookupCities(opts._hintIps);
|
| 1060 |
+
const board: Record<string, number> = {};
|
| 1061 |
+
for (const x of hints) {
|
| 1062 |
+
if (x.country?.code) {
|
| 1063 |
+
board[x.country.code] = (board[x.country.code] || 0) + 1;
|
| 1064 |
+
}
|
| 1065 |
+
}
|
| 1066 |
+
const hintCountry = _.maxBy(Array.from(Object.entries(board)), 1)?.[0];
|
| 1067 |
+
crawlOpts.countryHint = hintCountry?.toLowerCase();
|
| 1068 |
+
}
|
| 1069 |
+
|
| 1070 |
if (opts.locale) {
|
| 1071 |
crawlOpts.extraHeaders ??= {};
|
| 1072 |
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
|
|
|
| 1242 |
};
|
| 1243 |
}
|
| 1244 |
|
| 1245 |
+
retryDet = new WeakSet<ExtraScrappingOptions>();
|
| 1246 |
@retryWith((err) => {
|
| 1247 |
if (err instanceof ServiceBadApproachError) {
|
| 1248 |
return false;
|
|
|
|
| 1261 |
if (opts?.allocProxy === 'none') {
|
| 1262 |
return this.curlControl.sideLoad(url, opts);
|
| 1263 |
}
|
| 1264 |
+
|
| 1265 |
+
const proxy = await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts));
|
| 1266 |
+
if (opts) {
|
| 1267 |
+
if (this.retryDet.has(opts) && proxy.protocol === 'socks5h:') {
|
| 1268 |
+
proxy.protocol = 'socks5:';
|
| 1269 |
+
}
|
| 1270 |
+
this.retryDet.add(opts);
|
| 1271 |
+
}
|
| 1272 |
const r = await this.curlControl.sideLoad(url, {
|
| 1273 |
...opts,
|
| 1274 |
proxyUrl: proxy.href,
|
|
|
|
| 1281 |
return { ...r, proxy };
|
| 1282 |
}
|
| 1283 |
|
| 1284 |
+
/**
 * Decides which country/pool identifier to hand to `this.proxyProvider.alloc(...)`.
 *
 * Precedence, exactly as implemented below:
 *  1. No options object at all            -> 'auto'.
 *  2. `allocProxy` is not supported by the provider and equals 'none' -> 'none'.
 *  3. `allocProxy` is supported by the provider -> that value.
 *  4. `countryHint` is supported by the provider -> that value;
 *     an unsupported hint of 'cn' is substituted with 'hk'.
 *  5. Otherwise the raw `allocProxy` value if truthy, else 'auto'.
 *
 * @param opts Scrapping options; only `allocProxy` and `countryHint` are read.
 * @returns The identifier to request from the proxy provider.
 */
protected figureOutBestProxyCountry(opts?: ExtraScrappingOptions) {
    if (!opts) {
        return 'auto';
    }

    const requested = opts.allocProxy;
    const hinted = opts.countryHint;
    let chosen;

    if (requested) {
        const requestedSupported = this.proxyProvider.supports(requested);
        // 'none' only short-circuits when the provider does not itself claim to support it.
        if (!requestedSupported && requested === 'none') {
            return 'none';
        }
        if (requestedSupported) {
            chosen = requested;
        }
    }

    if (hinted) {
        // The provider is always consulted about the hint, even when an explicit
        // request already won; the hint only fills in when nothing was chosen yet.
        const hintedSupported = this.proxyProvider.supports(hinted);
        let fromHint;
        if (hintedSupported) {
            fromHint = hinted;
        } else if (hinted === 'cn') {
            fromHint = 'hk';
        }
        chosen ??= fromHint;
    }

    // Last resort: pass the caller's raw request through, or let the provider decide.
    return chosen ?? (requested || 'auto');
}
|
| 1311 |
+
|
| 1312 |
knownUrlThatSideLoadingWouldCrashTheBrowser(url: URL) {
|
| 1313 |
if (url.hostname === 'chromewebstore.google.com') {
|
| 1314 |
return true;
|
src/api/searcher.ts
DELETED
|
@@ -1,503 +0,0 @@
|
|
| 1 |
-
import { singleton } from 'tsyringe';
|
| 2 |
-
import _ from 'lodash';
|
| 3 |
-
|
| 4 |
-
import {
|
| 5 |
-
assignTransferProtocolMeta, RPCHost, RPCReflection,
|
| 6 |
-
AssertionFailureError,
|
| 7 |
-
RawString,
|
| 8 |
-
} from 'civkit/civ-rpc';
|
| 9 |
-
import { marshalErrorLike } from 'civkit/lang';
|
| 10 |
-
import { objHashMd5B64Of } from 'civkit/hash';
|
| 11 |
-
|
| 12 |
-
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 13 |
-
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
|
| 14 |
-
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
| 15 |
-
|
| 16 |
-
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
| 17 |
-
import { SearchResult } from '../db/searched';
|
| 18 |
-
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 19 |
-
import { CrawlerOptions } from '../dto/crawler-options';
|
| 20 |
-
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
|
| 21 |
-
|
| 22 |
-
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
|
| 23 |
-
import { GlobalLogger } from '../services/logger';
|
| 24 |
-
import { AsyncLocalContext } from '../services/async-context';
|
| 25 |
-
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
| 26 |
-
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
| 27 |
-
import { InsufficientBalanceError } from '../services/errors';
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
@singleton()
|
| 31 |
-
export class SearcherHost extends RPCHost {
|
| 32 |
-
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 33 |
-
|
| 34 |
-
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
| 35 |
-
cacheValidMs = 1000 * 3600;
|
| 36 |
-
pageCacheToleranceMs = 1000 * 3600 * 24;
|
| 37 |
-
|
| 38 |
-
reasonableDelayMs = 15_000;
|
| 39 |
-
|
| 40 |
-
targetResultCount = 5;
|
| 41 |
-
|
| 42 |
-
constructor(
|
| 43 |
-
protected globalLogger: GlobalLogger,
|
| 44 |
-
protected rateLimitControl: RateLimitControl,
|
| 45 |
-
protected threadLocal: AsyncLocalContext,
|
| 46 |
-
protected braveSearchService: BraveSearchService,
|
| 47 |
-
protected crawler: CrawlerHost,
|
| 48 |
-
protected snapshotFormatter: SnapshotFormatter,
|
| 49 |
-
) {
|
| 50 |
-
super(...arguments);
|
| 51 |
-
}
|
| 52 |
-
|
| 53 |
-
override async init() {
|
| 54 |
-
await this.dependencyReady();
|
| 55 |
-
|
| 56 |
-
this.emit('ready');
|
| 57 |
-
}
|
| 58 |
-
|
| 59 |
-
@Method({
|
| 60 |
-
name: 'searchIndex',
|
| 61 |
-
ext: {
|
| 62 |
-
http: {
|
| 63 |
-
action: ['get', 'post'],
|
| 64 |
-
path: '/search'
|
| 65 |
-
}
|
| 66 |
-
},
|
| 67 |
-
tags: ['search'],
|
| 68 |
-
returnType: [String, OutputServerEventStream],
|
| 69 |
-
})
|
| 70 |
-
@Method({
|
| 71 |
-
ext: {
|
| 72 |
-
http: {
|
| 73 |
-
action: ['get', 'post'],
|
| 74 |
-
path: '::q'
|
| 75 |
-
}
|
| 76 |
-
},
|
| 77 |
-
tags: ['search'],
|
| 78 |
-
returnType: [String, OutputServerEventStream, RawString],
|
| 79 |
-
})
|
| 80 |
-
async search(
|
| 81 |
-
@RPCReflect() rpcReflect: RPCReflection,
|
| 82 |
-
@Ctx() ctx: Context,
|
| 83 |
-
auth: JinaEmbeddingsAuthDTO,
|
| 84 |
-
@Param('count', { default: 5, validate: (v) => v >= 0 && v <= 10 })
|
| 85 |
-
count: number,
|
| 86 |
-
crawlerOptions: CrawlerOptions,
|
| 87 |
-
braveSearchExplicitOperators: BraveSearchExplicitOperatorsDto,
|
| 88 |
-
@Param('q') q?: string,
|
| 89 |
-
) {
|
| 90 |
-
const uid = await auth.solveUID();
|
| 91 |
-
let chargeAmount = 0;
|
| 92 |
-
const noSlashPath = decodeURIComponent(ctx.path).slice(1);
|
| 93 |
-
if (!noSlashPath && !q) {
|
| 94 |
-
const index = await this.crawler.getIndex(auth);
|
| 95 |
-
if (!uid) {
|
| 96 |
-
index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
|
| 97 |
-
}
|
| 98 |
-
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
| 99 |
-
|
| 100 |
-
return index;
|
| 101 |
-
}
|
| 102 |
-
|
| 103 |
-
return assignTransferProtocolMeta(`${index}`,
|
| 104 |
-
{ contentType: 'text/plain', envelope: null }
|
| 105 |
-
);
|
| 106 |
-
}
|
| 107 |
-
|
| 108 |
-
const user = await auth.assertUser();
|
| 109 |
-
if (!(user.wallet.total_balance > 0)) {
|
| 110 |
-
throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
|
| 111 |
-
}
|
| 112 |
-
|
| 113 |
-
const rateLimitPolicy = auth.getRateLimits(rpcReflect.name.toUpperCase()) || [
|
| 114 |
-
parseInt(user.metadata?.speed_level) >= 2 ?
|
| 115 |
-
RateLimitDesc.from({
|
| 116 |
-
occurrence: 100,
|
| 117 |
-
periodSeconds: 60
|
| 118 |
-
}) :
|
| 119 |
-
RateLimitDesc.from({
|
| 120 |
-
occurrence: 40,
|
| 121 |
-
periodSeconds: 60
|
| 122 |
-
})
|
| 123 |
-
];
|
| 124 |
-
|
| 125 |
-
const apiRoll = await this.rateLimitControl.simpleRPCUidBasedLimit(
|
| 126 |
-
rpcReflect, uid!, [rpcReflect.name.toUpperCase()],
|
| 127 |
-
...rateLimitPolicy
|
| 128 |
-
);
|
| 129 |
-
|
| 130 |
-
rpcReflect.finally(() => {
|
| 131 |
-
if (chargeAmount) {
|
| 132 |
-
auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
|
| 133 |
-
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
| 134 |
-
});
|
| 135 |
-
apiRoll.chargeAmount = chargeAmount;
|
| 136 |
-
}
|
| 137 |
-
});
|
| 138 |
-
|
| 139 |
-
delete crawlerOptions.html;
|
| 140 |
-
|
| 141 |
-
const crawlOpts = await this.crawler.configure(crawlerOptions);
|
| 142 |
-
const searchQuery = braveSearchExplicitOperators.addTo(q || noSlashPath);
|
| 143 |
-
const r = await this.cachedWebSearch({
|
| 144 |
-
q: searchQuery,
|
| 145 |
-
count: count ? Math.floor(count + 2) : 20
|
| 146 |
-
}, crawlerOptions.noCache);
|
| 147 |
-
|
| 148 |
-
if (!r.web?.results.length) {
|
| 149 |
-
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
|
| 150 |
-
}
|
| 151 |
-
|
| 152 |
-
if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
|
| 153 |
-
delete crawlOpts.timeoutMs;
|
| 154 |
-
}
|
| 155 |
-
|
| 156 |
-
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results.slice(0, count + 2), crawlOpts,
|
| 157 |
-
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
| 158 |
-
count,
|
| 159 |
-
);
|
| 160 |
-
|
| 161 |
-
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
| 162 |
-
const sseStream = new OutputServerEventStream();
|
| 163 |
-
rpcReflect.return(sseStream);
|
| 164 |
-
|
| 165 |
-
try {
|
| 166 |
-
for await (const scrapped of it) {
|
| 167 |
-
if (!scrapped) {
|
| 168 |
-
continue;
|
| 169 |
-
}
|
| 170 |
-
|
| 171 |
-
chargeAmount = this.assignChargeAmount(scrapped);
|
| 172 |
-
sseStream.write({
|
| 173 |
-
event: 'data',
|
| 174 |
-
data: scrapped,
|
| 175 |
-
});
|
| 176 |
-
}
|
| 177 |
-
} catch (err: any) {
|
| 178 |
-
this.logger.error(`Failed to collect search result for query ${searchQuery}`,
|
| 179 |
-
{ err: marshalErrorLike(err) }
|
| 180 |
-
);
|
| 181 |
-
sseStream.write({
|
| 182 |
-
event: 'error',
|
| 183 |
-
data: marshalErrorLike(err),
|
| 184 |
-
});
|
| 185 |
-
}
|
| 186 |
-
|
| 187 |
-
sseStream.end();
|
| 188 |
-
|
| 189 |
-
return sseStream;
|
| 190 |
-
}
|
| 191 |
-
|
| 192 |
-
let lastScrapped: any[] | undefined;
|
| 193 |
-
let earlyReturn = false;
|
| 194 |
-
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
| 195 |
-
let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
|
| 196 |
-
const setEarlyReturnTimer = () => {
|
| 197 |
-
if (earlyReturnTimer) {
|
| 198 |
-
return;
|
| 199 |
-
}
|
| 200 |
-
earlyReturnTimer = setTimeout(() => {
|
| 201 |
-
if (!lastScrapped) {
|
| 202 |
-
return;
|
| 203 |
-
}
|
| 204 |
-
chargeAmount = this.assignChargeAmount(lastScrapped);
|
| 205 |
-
rpcReflect.return(lastScrapped);
|
| 206 |
-
earlyReturn = true;
|
| 207 |
-
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
| 208 |
-
};
|
| 209 |
-
|
| 210 |
-
for await (const scrapped of it) {
|
| 211 |
-
lastScrapped = scrapped;
|
| 212 |
-
if (_.some(scrapped, (x) => this.pageQualified(x))) {
|
| 213 |
-
setEarlyReturnTimer();
|
| 214 |
-
}
|
| 215 |
-
if (!this.searchResultsQualified(scrapped, count)) {
|
| 216 |
-
continue;
|
| 217 |
-
}
|
| 218 |
-
if (earlyReturnTimer) {
|
| 219 |
-
clearTimeout(earlyReturnTimer);
|
| 220 |
-
}
|
| 221 |
-
chargeAmount = this.assignChargeAmount(scrapped);
|
| 222 |
-
|
| 223 |
-
return scrapped;
|
| 224 |
-
}
|
| 225 |
-
|
| 226 |
-
if (earlyReturnTimer) {
|
| 227 |
-
clearTimeout(earlyReturnTimer);
|
| 228 |
-
}
|
| 229 |
-
|
| 230 |
-
if (!lastScrapped) {
|
| 231 |
-
throw new AssertionFailureError(`No content available for query ${searchQuery}`);
|
| 232 |
-
}
|
| 233 |
-
|
| 234 |
-
if (!earlyReturn) {
|
| 235 |
-
chargeAmount = this.assignChargeAmount(lastScrapped);
|
| 236 |
-
}
|
| 237 |
-
|
| 238 |
-
return lastScrapped;
|
| 239 |
-
}
|
| 240 |
-
|
| 241 |
-
let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
|
| 242 |
-
const setEarlyReturnTimer = () => {
|
| 243 |
-
if (earlyReturnTimer) {
|
| 244 |
-
return;
|
| 245 |
-
}
|
| 246 |
-
earlyReturnTimer = setTimeout(() => {
|
| 247 |
-
if (!lastScrapped) {
|
| 248 |
-
return;
|
| 249 |
-
}
|
| 250 |
-
chargeAmount = this.assignChargeAmount(lastScrapped);
|
| 251 |
-
rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
|
| 252 |
-
earlyReturn = true;
|
| 253 |
-
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
| 254 |
-
};
|
| 255 |
-
|
| 256 |
-
for await (const scrapped of it) {
|
| 257 |
-
lastScrapped = scrapped;
|
| 258 |
-
|
| 259 |
-
if (_.some(scrapped, (x) => this.pageQualified(x))) {
|
| 260 |
-
setEarlyReturnTimer();
|
| 261 |
-
}
|
| 262 |
-
|
| 263 |
-
if (!this.searchResultsQualified(scrapped, count)) {
|
| 264 |
-
continue;
|
| 265 |
-
}
|
| 266 |
-
|
| 267 |
-
if (earlyReturnTimer) {
|
| 268 |
-
clearTimeout(earlyReturnTimer);
|
| 269 |
-
}
|
| 270 |
-
|
| 271 |
-
chargeAmount = this.assignChargeAmount(scrapped);
|
| 272 |
-
|
| 273 |
-
return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
|
| 274 |
-
}
|
| 275 |
-
|
| 276 |
-
if (earlyReturnTimer) {
|
| 277 |
-
clearTimeout(earlyReturnTimer);
|
| 278 |
-
}
|
| 279 |
-
|
| 280 |
-
if (!lastScrapped) {
|
| 281 |
-
throw new AssertionFailureError(`No content available for query ${searchQuery}`);
|
| 282 |
-
}
|
| 283 |
-
|
| 284 |
-
if (!earlyReturn) {
|
| 285 |
-
chargeAmount = this.assignChargeAmount(lastScrapped);
|
| 286 |
-
}
|
| 287 |
-
|
| 288 |
-
return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
|
| 289 |
-
}
|
| 290 |
-
|
| 291 |
-
async *fetchSearchResults(
|
| 292 |
-
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
| 293 |
-
searchResults?: WebSearchResult[],
|
| 294 |
-
options?: ExtraScrappingOptions,
|
| 295 |
-
crawlerOptions?: CrawlerOptions,
|
| 296 |
-
count?: number,
|
| 297 |
-
) {
|
| 298 |
-
if (!searchResults) {
|
| 299 |
-
return;
|
| 300 |
-
}
|
| 301 |
-
if (count === 0) {
|
| 302 |
-
const resultArray = searchResults.map((upstreamSearchResult, i) => ({
|
| 303 |
-
url: upstreamSearchResult.url,
|
| 304 |
-
title: upstreamSearchResult.title,
|
| 305 |
-
description: upstreamSearchResult.description,
|
| 306 |
-
content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : '',
|
| 307 |
-
toString() {
|
| 308 |
-
return `[${i + 1}] Title: ${this.title}
|
| 309 |
-
[${i + 1}] URL Source: ${this.url}
|
| 310 |
-
[${i + 1}] Description: ${this.description}
|
| 311 |
-
`;
|
| 312 |
-
}
|
| 313 |
-
|
| 314 |
-
})) as FormattedPage[];
|
| 315 |
-
resultArray.toString = function () {
|
| 316 |
-
return this.map((x, i) => x ? x.toString() : '').join('\n\n').trimEnd() + '\n';
|
| 317 |
-
};
|
| 318 |
-
yield resultArray;
|
| 319 |
-
return;
|
| 320 |
-
}
|
| 321 |
-
const urls = searchResults.map((x) => new URL(x.url));
|
| 322 |
-
const snapshotMap = new WeakMap();
|
| 323 |
-
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
| 324 |
-
const mapped = scrapped.map((x, i) => {
|
| 325 |
-
const upstreamSearchResult = searchResults[i];
|
| 326 |
-
if (!x) {
|
| 327 |
-
return {
|
| 328 |
-
url: upstreamSearchResult.url,
|
| 329 |
-
title: upstreamSearchResult.title,
|
| 330 |
-
description: upstreamSearchResult.description,
|
| 331 |
-
content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''
|
| 332 |
-
};
|
| 333 |
-
}
|
| 334 |
-
if (snapshotMap.has(x)) {
|
| 335 |
-
return snapshotMap.get(x);
|
| 336 |
-
}
|
| 337 |
-
return this.snapshotFormatter.formatSnapshot(mode, x, urls[i]).then((r) => {
|
| 338 |
-
r.title ??= upstreamSearchResult.title;
|
| 339 |
-
r.description = upstreamSearchResult.description;
|
| 340 |
-
snapshotMap.set(x, r);
|
| 341 |
-
|
| 342 |
-
return r;
|
| 343 |
-
}).catch((err) => {
|
| 344 |
-
this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
|
| 345 |
-
|
| 346 |
-
return {
|
| 347 |
-
url: upstreamSearchResult.url,
|
| 348 |
-
title: upstreamSearchResult.title,
|
| 349 |
-
description: upstreamSearchResult.description,
|
| 350 |
-
content: x.text,
|
| 351 |
-
};
|
| 352 |
-
});
|
| 353 |
-
});
|
| 354 |
-
|
| 355 |
-
const resultArray = await Promise.all(mapped) as FormattedPage[];
|
| 356 |
-
|
| 357 |
-
yield this.reOrganizeSearchResults(resultArray, count);
|
| 358 |
-
}
|
| 359 |
-
}
|
| 360 |
-
|
| 361 |
-
reOrganizeSearchResults(searchResults: FormattedPage[], count?: number) {
|
| 362 |
-
const targetResultCount = count || this.targetResultCount;
|
| 363 |
-
const [qualifiedPages, unqualifiedPages] = _.partition(searchResults, (x) => this.pageQualified(x));
|
| 364 |
-
const acceptSet = new Set(qualifiedPages);
|
| 365 |
-
|
| 366 |
-
const n = targetResultCount - qualifiedPages.length;
|
| 367 |
-
for (const x of unqualifiedPages.slice(0, n >= 0 ? n : 0)) {
|
| 368 |
-
acceptSet.add(x);
|
| 369 |
-
}
|
| 370 |
-
|
| 371 |
-
const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
|
| 372 |
-
|
| 373 |
-
const resultArray = filtered.map((x, i) => {
|
| 374 |
-
return {
|
| 375 |
-
...x,
|
| 376 |
-
toString(this: any) {
|
| 377 |
-
if (!this.content && this.description) {
|
| 378 |
-
if (this.title || x.textRepresentation) {
|
| 379 |
-
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
|
| 380 |
-
return `[${i + 1}] Title: ${this.title}
|
| 381 |
-
[${i + 1}] URL Source: ${this.url}
|
| 382 |
-
[${i + 1}] Description: ${this.description}${textRep}
|
| 383 |
-
`;
|
| 384 |
-
}
|
| 385 |
-
|
| 386 |
-
return `[${i + 1}] No content available for ${this.url}`;
|
| 387 |
-
}
|
| 388 |
-
|
| 389 |
-
const mixins = [];
|
| 390 |
-
if (this.description) {
|
| 391 |
-
mixins.push(`[${i + 1}] Description: ${this.description}`);
|
| 392 |
-
}
|
| 393 |
-
if (this.publishedTime) {
|
| 394 |
-
mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
|
| 395 |
-
}
|
| 396 |
-
|
| 397 |
-
const suffixMixins = [];
|
| 398 |
-
if (this.images) {
|
| 399 |
-
const imageSummaryChunks = [`[${i + 1}] Images:`];
|
| 400 |
-
for (const [k, v] of Object.entries(this.images)) {
|
| 401 |
-
imageSummaryChunks.push(`- `);
|
| 402 |
-
}
|
| 403 |
-
if (imageSummaryChunks.length === 1) {
|
| 404 |
-
imageSummaryChunks.push('This page does not seem to contain any images.');
|
| 405 |
-
}
|
| 406 |
-
suffixMixins.push(imageSummaryChunks.join('\n'));
|
| 407 |
-
}
|
| 408 |
-
if (this.links) {
|
| 409 |
-
const linkSummaryChunks = [`[${i + 1}] Links/Buttons:`];
|
| 410 |
-
for (const [k, v] of Object.entries(this.links)) {
|
| 411 |
-
linkSummaryChunks.push(`- [${k}](${v})`);
|
| 412 |
-
}
|
| 413 |
-
if (linkSummaryChunks.length === 1) {
|
| 414 |
-
linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
|
| 415 |
-
}
|
| 416 |
-
suffixMixins.push(linkSummaryChunks.join('\n'));
|
| 417 |
-
}
|
| 418 |
-
|
| 419 |
-
return `[${i + 1}] Title: ${this.title}
|
| 420 |
-
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
|
| 421 |
-
[${i + 1}] Markdown Content:
|
| 422 |
-
${this.content}
|
| 423 |
-
${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
| 424 |
-
}
|
| 425 |
-
};
|
| 426 |
-
});
|
| 427 |
-
|
| 428 |
-
resultArray.toString = function () {
|
| 429 |
-
return this.map((x, i) => x ? x.toString() : `[${i + 1}] No content available for ${this[i].url}`).join('\n\n').trimEnd() + '\n';
|
| 430 |
-
};
|
| 431 |
-
|
| 432 |
-
return resultArray;
|
| 433 |
-
}
|
| 434 |
-
|
| 435 |
-
assignChargeAmount(formatted: FormattedPage[]) {
|
| 436 |
-
return _.sum(
|
| 437 |
-
formatted.map((x) => this.crawler.assignChargeAmount(x) || 0)
|
| 438 |
-
);
|
| 439 |
-
}
|
| 440 |
-
|
| 441 |
-
pageQualified(formattedPage: FormattedPage) {
|
| 442 |
-
return formattedPage.title &&
|
| 443 |
-
formattedPage.content ||
|
| 444 |
-
formattedPage.screenshotUrl ||
|
| 445 |
-
formattedPage.pageshotUrl ||
|
| 446 |
-
formattedPage.text ||
|
| 447 |
-
formattedPage.html;
|
| 448 |
-
}
|
| 449 |
-
|
| 450 |
-
searchResultsQualified(results: FormattedPage[], targetResultCount = this.targetResultCount) {
|
| 451 |
-
return _.every(results, (x) => this.pageQualified(x)) && results.length >= targetResultCount;
|
| 452 |
-
}
|
| 453 |
-
|
| 454 |
-
/**
 * Web search with a Firestore-backed cache in front of `braveSearchService`.
 *
 * Flow:
 *  1. Unless `noCache` is set, load the newest cached record for the query digest.
 *     A record younger than `cacheValidMs` is returned right away.
 *  2. Otherwise query the search service, store the fresh response
 *     (fire-and-forget, expiring after `cacheRetentionMs`) and return it.
 *  3. If the live query fails and a stale record was found in step 1, the stale
 *     response is returned instead of the error.
 *
 * @param query Search parameters; also hashed to build the cache key.
 * @param noCache When true, the cache lookup is skipped entirely (so no stale fallback either).
 * @returns The search response, fresh or cached.
 * @throws Whatever the search service threw, when no cached record is available.
 */
async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
    const queryDigest = objHashMd5B64Of(query);

    let cache;
    if (!noCache) {
        const newestForDigest = SearchResult.COLLECTION
            .where('queryDigest', '==', queryDigest)
            .orderBy('createdAt', 'desc')
            .limit(1);
        const matches = await SearchResult.fromFirestoreQuery(newestForDigest);
        cache = matches[0];
    }

    if (cache) {
        const createdAtMs = cache.createdAt.valueOf();
        const ageMs = Date.now() - createdAtMs;
        const isStale = createdAtMs < (Date.now() - this.cacheValidMs);
        const verdict = isStale ? 'Stale cache exists' : 'Cache hit';
        this.logger.info(`${verdict} for search query "${query.q}", normalized digest: ${queryDigest}, ${ageMs}ms old`, {
            query, digest: queryDigest, age: ageMs, stale: isStale
        });

        if (!isStale) {
            return cache.response as WebSearchApiResponse;
        }
    }

    try {
        const fresh = await this.braveSearchService.webSearch(query);

        const createdAt = new Date();
        const expireAt = new Date(createdAt.valueOf() + this.cacheRetentionMs);
        const record = SearchResult.from({
            query,
            queryDigest,
            response: fresh,
            createdAt,
            expireAt
        });
        // Cache write is best-effort: a failure is logged, never surfaced to the caller.
        SearchResult.save(record.degradeForFireStore()).catch((err) => {
            this.logger.warn(`Failed to cache search result`, { err });
        });

        return fresh;
    } catch (err: any) {
        if (!cache) {
            throw err;
        }

        this.logger.warn(`Failed to fetch search result, but a stale cache is available. falling back to stale cache`, { err: marshalErrorLike(err) });

        return cache.response as WebSearchApiResponse;
    }
}
|
| 503 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/api/serp.ts
ADDED
|
@@ -0,0 +1,505 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { singleton } from 'tsyringe';
|
| 2 |
+
import {
|
| 3 |
+
RPCHost, RPCReflection, assignMeta, RawString,
|
| 4 |
+
ParamValidationError,
|
| 5 |
+
assignTransferProtocolMeta,
|
| 6 |
+
} from 'civkit/civ-rpc';
|
| 7 |
+
import { marshalErrorLike } from 'civkit/lang';
|
| 8 |
+
import _ from 'lodash';
|
| 9 |
+
|
| 10 |
+
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
| 11 |
+
|
| 12 |
+
import { GlobalLogger } from '../services/logger';
|
| 13 |
+
import { AsyncLocalContext } from '../services/async-context';
|
| 14 |
+
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
| 15 |
+
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
| 16 |
+
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
| 17 |
+
import { InsufficientBalanceError, RateLimitTriggeredError } from '../services/errors';
|
| 18 |
+
import { WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
|
| 19 |
+
import { GoogleSERP } from '../services/serp/google';
|
| 20 |
+
import { WebSearchEntry } from '../services/serp/compat';
|
| 21 |
+
import { CrawlerOptions } from '../dto/crawler-options';
|
| 22 |
+
import { ScrappingOptions } from '../services/serp/puppeteer';
|
| 23 |
+
import { objHashMd5B64Of } from 'civkit/hash';
|
| 24 |
+
import { SERPResult } from '../db/searched';
|
| 25 |
+
import { SerperBingSearchService, SerperGoogleSearchService } from '../services/serp/serper';
|
| 26 |
+
import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
| 27 |
+
import { LRUCache } from 'lru-cache';
|
| 28 |
+
|
| 29 |
+
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
|
| 30 |
+
|
| 31 |
+
type RateLimitCache = {
|
| 32 |
+
blockedUntil?: Date;
|
| 33 |
+
user?: JinaEmbeddingsTokenAccount;
|
| 34 |
+
};
|
| 35 |
+
|
| 36 |
+
/**
 * Prototype for the index ("help") object returned on `/`.
 *
 * Its `toString` renders every own enumerable entry as
 * `[Upper first lower case key] value`, one per line; an entry with an empty
 * key is rendered as a blank line. The output always ends with a newline.
 */
const indexProto = {
    toString(): string {
        const lines = _.toPairs(this).map(([key, value]) => {
            if (!key) {
                return '';
            }

            return `[${_.upperFirst(_.lowerCase(key))}] ${value}`;
        });

        return lines.join('\n') + '\n';
    }
};
|
| 45 |
+
|
| 46 |
+
/**
 * RPC host for the SERP (search engine result page) endpoint.
 *
 * Handles authentication, balance and rate-limit checks, then runs the query
 * through the configured search providers with a Firestore-backed cache in
 * front of them (see `cachedSearch`).
 */
@singleton()
export class SerpHost extends RPCHost {
    logger = this.globalLogger.child({ service: this.constructor.name });

    /** How long a cached search record is kept (used for `expireAt`): 7 days. */
    cacheRetentionMs = 1000 * 3600 * 24 * 7;
    /** A cached record younger than this is served directly: 1 hour. */
    cacheValidMs = 1000 * 3600;
    pageCacheToleranceMs = 1000 * 3600 * 24;

    reasonableDelayMs = 15_000;

    targetResultCount = 5;

    /**
     * In-memory state for bearer tokens whose rate limit reaches the premium
     * threshold. For these keys `search` does not wait for the rate-limit
     * roll; it relies on the cached user and the `blockedUntil` marker instead.
     */
    highFreqKeyCache = new LRUCache<string, RateLimitCache>({
        max: 256,
        ttl: 60 * 60 * 1000,
        updateAgeOnGet: false,
        updateAgeOnHas: false,
    });

    /**
     * Builds the index object shown when the endpoint is hit without a query.
     *
     * The object inherits `indexProto`, so stringifying it yields a plain-text page.
     *
     * @param ctx Request context; `ctx.origin` is used to build the local usage hint.
     * @param auth Optional auth DTO; when it carries a user, identity and balance are included.
     */
    async getIndex(ctx: Context, auth?: JinaEmbeddingsAuthDTO) {
        const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
        Object.assign(indexObject, {
            usage1: 'https://r.jina.ai/YOUR_URL',
            usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
            usage3: `${ctx.origin}/search/YOUR_SEARCH_QUERY`,
            homepage: 'https://jina.ai/reader',
            sourceCode: 'https://github.com/jina-ai/reader',
        });

        if (auth && auth.user) {
            // The empty key renders as a blank separator line in indexProto.toString().
            indexObject[''] = undefined;
            indexObject.authenticatedAs = `${auth.user.user_id} (${auth.user.full_name})`;
            indexObject.balanceLeft = auth.user.wallet.total_balance;
        } else {
            indexObject.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
        }

        return indexObject;
    }

    constructor(
        protected globalLogger: GlobalLogger,
        protected rateLimitControl: RateLimitControl,
        protected threadLocal: AsyncLocalContext,
        protected googleSerp: GoogleSERP,
        protected serperGoogle: SerperGoogleSearchService,
        protected serperBing: SerperBingSearchService,
    ) {
        super(...arguments);
    }

    /** Waits for the injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Search entry point (GET/POST), also mounted on `/` as `searchIndex`.
     *
     * Without `q`, a request to `/` returns the index page (JSON or plain text
     * depending on the Accept header); any other path raises a validation error.
     * With `q`, the caller must be authenticated with a positive balance and
     * within the rate limit. When `fallback` is enabled and the first page
     * yields nothing, the query is retried with one term dropped at a time.
     *
     * @returns The mapped result list, with `query`, `fallback` and `usage` attached as meta.
     * @throws ParamValidationError when `q` is missing on a non-index path.
     * @throws InsufficientBalanceError when the wallet balance is not positive.
     * @throws RateLimitTriggeredError when a high-frequency key is currently blocked.
     */
    @Method({
        name: 'searchIndex',
        ext: {
            http: {
                action: ['get', 'post'],
                path: '/'
            }
        },
        tags: ['search'],
        returnType: [String, OutputServerEventStream, RawString],
    })
    @Method({
        ext: {
            http: {
                action: ['get', 'post'],
            }
        },
        tags: ['search'],
        returnType: [String, OutputServerEventStream, RawString],
    })
    async search(
        @RPCReflect() rpcReflect: RPCReflection,
        @Ctx() ctx: Context,
        crawlerOptions: CrawlerOptions,
        auth: JinaEmbeddingsAuthDTO,
        @Param('type', { type: new Set(['web', 'images', 'news']), default: 'web' })
        variant: 'web' | 'images' | 'news',
        @Param('q') q?: string,
        @Param('provider', { type: new Set(['google', 'bing']) })
        searchEngine?: 'google' | 'bing',
        @Param('num', { validate: (v: number) => v >= 0 && v <= 20 })
        num?: number,
        @Param('gl', { validate: (v: string) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) }) gl?: string,
        @Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) hl?: string,
        @Param('location') location?: string,
        @Param('page') page?: number,
        @Param('fallback', { default: true }) fallback?: boolean,
    ) {
        const authToken = auth.bearerToken;
        let highFreqKey: RateLimitCache | undefined;
        if (authToken && this.highFreqKeyCache.has(authToken)) {
            // Known high-frequency key: reuse the cached user on the auth DTO.
            highFreqKey = this.highFreqKeyCache.get(authToken)!;
            auth.user = highFreqKey.user;
            auth.uid = highFreqKey.user?.user_id;
        }

        const uid = await auth.solveUID();
        if (!q) {
            if (ctx.path === '/') {
                // getIndex is async: it must be awaited, otherwise the template
                // literal below would render "[object Promise]".
                const indexObject = await this.getIndex(ctx, auth);
                if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
                    return indexObject;
                }

                return assignTransferProtocolMeta(`${indexObject}`,
                    { contentType: 'text/plain; charset=utf-8', envelope: null }
                );
            }
            throw new ParamValidationError({
                path: 'q',
                message: `Required but not provided`
            });
        }
        // Return content by default
        const user = await auth.assertUser();
        if (!(user.wallet.total_balance > 0)) {
            throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
        }

        if (highFreqKey?.blockedUntil) {
            const now = new Date();
            const blockedTimeRemaining = (highFreqKey.blockedUntil.valueOf() - now.valueOf());
            if (blockedTimeRemaining > 0) {
                throw RateLimitTriggeredError.from({
                    message: `Per UID rate limit exceeded (async)`,
                    retryAfter: Math.ceil(blockedTimeRemaining / 1000),
                });
            }
        }

        const PREMIUM_KEY_LIMIT = 400;
        const rateLimitPolicy = auth.getRateLimits('SEARCH') || [
            parseInt(user.metadata?.speed_level) >= 2 ?
                RateLimitDesc.from({
                    occurrence: PREMIUM_KEY_LIMIT,
                    periodSeconds: 60
                }) :
                RateLimitDesc.from({
                    occurrence: 40,
                    periodSeconds: 60
                })
        ];

        const apiRollPromise = this.rateLimitControl.simpleRPCUidBasedLimit(
            rpcReflect, uid!, ['SEARCH'],
            ...rateLimitPolicy
        );

        if (!highFreqKey) {
            // Normal path
            await apiRollPromise;

            // Keys whose policy allows at least PREMIUM_KEY_LIMIT requests per
            // minute are remembered and take the high freq key path next time.
            if (rateLimitPolicy.some(
                (x) => {
                    const rpm = x.occurrence / (x.periodSeconds / 60);
                    if (rpm >= PREMIUM_KEY_LIMIT) {
                        return true;
                    }

                    return false;
                })
            ) {
                this.highFreqKeyCache.set(auth.bearerToken!, {
                    user,
                });
            }
        } else {
            // High freq key path: the rate-limit roll is not awaited; its outcome
            // only updates the `blockedUntil` marker consulted by later requests.
            apiRollPromise.then(
                // Rate limit not triggered, make sure not blocking.
                () => {
                    delete highFreqKey.blockedUntil;
                },
                // Rate limit triggered
                (err) => {
                    if (!(err instanceof RateLimitTriggeredError)) {
                        return;
                    }
                    const now = Date.now();
                    let tgtDate;
                    if (err.retryAfter) {
                        tgtDate = new Date(now + err.retryAfter * 1000);
                    } else if (err.retryAfterDate) {
                        tgtDate = err.retryAfterDate;
                    }

                    if (tgtDate) {
                        const dt = tgtDate.valueOf() - now;
                        highFreqKey.blockedUntil = tgtDate;
                        // Clear the marker once it expires, unless a newer block replaced it.
                        setTimeout(() => {
                            if (highFreqKey.blockedUntil === tgtDate) {
                                delete highFreqKey.blockedUntil;
                            }
                        }, dt).unref();
                    }
                }
            ).finally(async () => {
                // Always asynchronously update user(wallet);
                const user = await auth.getBrief().catch(() => undefined);
                if (user) {
                    highFreqKey.user = user;
                }
            });
        }

        // NOTE(review): `chargeAmount` is never reassigned in this method (the
        // return value of assignChargeAmount() below is discarded), so the usage
        // report in this callback never fires. Confirm whether that is intended
        // before wiring it up, since it affects billing.
        let chargeAmount = 0;
        rpcReflect.finally(async () => {
            if (chargeAmount) {
                auth.reportUsage(chargeAmount, `reader-serp`).catch((err) => {
                    this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
                });
                const apiRoll = await apiRollPromise;
                apiRoll.chargeAmount = chargeAmount;
            }
        });

        // Price multiplier: bing costs 3x; any non-web variant costs 5x
        // (the variant rule overrides the bing rule).
        let chargeAmountScaler = 1;
        if (searchEngine === 'bing') {
            chargeAmountScaler = 3;
        }
        if (variant !== 'web') {
            chargeAmountScaler = 5;
        }

        let realQuery = q;
        let results = await this.cachedSearch(variant, {
            provider: searchEngine,
            q,
            num,
            gl,
            hl,
            location,
            page,
        }, crawlerOptions);


        if (fallback && !results?.length && (!page || page === 1)) {
            let tryTimes = 1;
            const containsRTL = /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\u0590-\u05FF\uFB1D-\uFB4F\u0700-\u074F\u0780-\u07BF\u07C0-\u07FF]/.test(q);
            const terms = q.split(/\s+/g).filter((x) => !!x);
            while (terms.length > 1) {
                containsRTL ? terms.shift() : terms.pop(); // reduce the query by one term at a time
                realQuery = terms.join(' ').trim();
                if (!realQuery) {
                    break;
                }
                this.logger.info(`Retrying search with fallback query: "${realQuery}"`);
                results = await this.cachedSearch(variant, {
                    provider: searchEngine,
                    q: realQuery,
                    num,
                    gl,
                    hl,
                    location,
                }, crawlerOptions);
                tryTimes += 1;
                if (results?.length) {
                    break;
                }
            }
            // Every attempt counts towards the reported usage.
            chargeAmountScaler *= tryTimes;
        }

        if (!results?.length) {
            results = [];
        }

        const finalResults = results.map((x: any) => this.mapToFinalResults(x));

        await Promise.all(finalResults.map((x: any) => this.assignGeneralMixin(x)));

        this.assignChargeAmount(finalResults, chargeAmountScaler);
        assignMeta(finalResults, {
            query: realQuery,
            fallback: realQuery === q ? undefined : realQuery,
        });

        return finalResults;
    }


    /**
     * Computes the token charge for a result list and attaches it as `usage` meta.
     *
     * The charge is 10000 tokens per started group of 10 items, times `scaler`.
     *
     * @param items Result list (only its length is used).
     * @param scaler Price multiplier.
     * @returns The computed token amount.
     */
    assignChargeAmount(items: unknown[], scaler: number) {
        const numCharge = Math.ceil(items.length / 10) * 10000 * scaler;
        assignMeta(items, { usage: { tokens: numCharge } });

        return numCharge;
    }

    /**
     * Fetches a 32px favicon through Google's favicon service.
     *
     * @param domain Value passed as the `domain_url` query parameter.
     * @returns A base64 data URL (labelled `image/png`), or '' on a non-OK response or any error.
     */
    async getFavicon(domain: string) {
        const url = `https://www.google.com/s2/favicons?sz=32&domain_url=${domain}`;

        try {
            const response = await fetch(url);
            if (!response.ok) {
                return '';
            }
            const ab = await response.arrayBuffer();
            const buffer = Buffer.from(ab);
            const base64 = buffer.toString('base64');
            return `data:image/png;base64,${base64}`;
        } catch (error: any) {
            this.logger.warn(`Failed to get favicon base64 string`, { err: marshalErrorLike(error) });
            return '';
        }
    }

    /**
     * Translates request-level crawler options into scrapping options for the
     * search providers.
     *
     * A `proxy` value ending in '+' enables `proxyResources` and is passed on
     * without the trailing '+'. A locale is also sent as `Accept-Language`.
     */
    async configure(opts: CrawlerOptions) {
        const crawlOpts: ScrappingOptions = {
            proxyUrl: opts.proxyUrl,
            cookies: opts.setCookies,
            overrideUserAgent: opts.userAgent,
            timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
            locale: opts.locale,
            referer: opts.referer,
            viewport: opts.viewport,
            proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
            allocProxy: opts.proxy?.endsWith('+') ? opts.proxy.slice(0, -1) : opts.proxy,
        };

        if (opts.locale) {
            crawlOpts.extraHeaders ??= {};
            crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
        }

        return crawlOpts;
    }

    /**
     * Maps a provider entry to the public result shape: `link` becomes `url`,
     * `snippet` becomes `description`, and only whitelisted extras are kept.
     */
    mapToFinalResults(input: WebSearchEntry) {
        const whitelistedProps = [
            'imageUrl', 'imageWidth', 'imageHeight', 'source', 'date', 'siteLinks'
        ];
        const result = {
            title: input.title,
            url: input.link,
            description: Reflect.get(input, 'snippet'),
            ..._.pick(input, whitelistedProps),
        };

        return result;
    }

    /**
     * Yields the search providers in the order they should be tried.
     *
     * `googleSerp` appears twice in a row in two of the orders — presumably so
     * it gets a second attempt before moving on (confirm with the author).
     *
     * @param preference 'bing', 'google', or anything else for the default order.
     */
    *iterProviders(preference?: string) {
        if (preference === 'bing') {
            yield this.serperBing;
            yield this.serperGoogle;
            yield this.googleSerp;

            return;
        }

        if (preference === 'google') {
            yield this.googleSerp;
            yield this.googleSerp;
            yield this.serperGoogle;

            return;
        }

        yield this.serperGoogle;
        yield this.googleSerp;
        yield this.googleSerp;
    }

    /**
     * Runs a search through the providers with a Firestore-backed cache in front.
     *
     * A cached record younger than `cacheValidMs` is returned directly (unless
     * `opts.noCache`). Otherwise providers from `iterProviders` are tried in
     * order until one call completes without throwing. A non-empty result is
     * cached (fire-and-forget). If the search fails and a stale record was
     * found, the stale response is returned instead.
     *
     * Note: `provider` is removed from the passed-in `query` object after the
     * digest has been computed, so the caller's object is mutated.
     *
     * @param variant Which kind of search to run.
     * @param query Search parameters, optionally with a `provider` preference.
     * @param opts Crawler options; supplies `noCache` and the scrapping configuration.
     * @returns The provider (or cached) result list; may be undefined or empty.
     * @throws The last provider error when the result is empty and no cached record exists.
     */
    async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<string, any>, opts: CrawlerOptions) {
        const queryDigest = objHashMd5B64Of({ ...query, variant });
        const provider = query.provider;
        Reflect.deleteProperty(query, 'provider');
        const noCache = opts.noCache;
        let cache;
        if (!noCache) {
            cache = (await SERPResult.fromFirestoreQuery(
                SERPResult.COLLECTION.where('queryDigest', '==', queryDigest)
                    .orderBy('createdAt', 'desc')
                    .limit(1)
            ))[0];
            if (cache) {
                const age = Date.now() - cache.createdAt.valueOf();
                const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
                this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${query.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
                    query, digest: queryDigest, age, stale
                });

                if (!stale) {
                    return cache.response as any;
                }
            }
        }
        const scrappingOptions = await this.configure(opts);

        try {
            let r: any[] | undefined;
            let lastError;
            // The first provider call that does not throw ends the loop, even if
            // it returned an empty list.
            outerLoop:
            for (const client of this.iterProviders(provider)) {
                try {
                    switch (variant) {
                        case 'images': {
                            r = await Reflect.apply(client.imageSearch, client, [query, scrappingOptions]);
                            break outerLoop;
                        }
                        case 'news': {
                            r = await Reflect.apply(client.newsSearch, client, [query, scrappingOptions]);
                            break outerLoop;
                        }
                        case 'web':
                        default: {
                            r = await Reflect.apply(client.webSearch, client, [query, scrappingOptions]);
                            break outerLoop;
                        }
                    }
                } catch (err) {
                    lastError = err;
                    this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err });
                }
            }

            if (r?.length) {
                const nowDate = new Date();
                const record = SERPResult.from({
                    query,
                    queryDigest,
                    response: r,
                    createdAt: nowDate,
                    expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
                });
                // Best-effort cache write: failures are logged, not propagated.
                SERPResult.save(record.degradeForFireStore()).catch((err) => {
                    this.logger.warn(`Failed to cache search result`, { err });
                });
            } else if (lastError) {
                throw lastError;
            }

            return r;
        } catch (err: any) {
            if (cache) {
                this.logger.warn(`Failed to fetch search result, but a stale cache is available. falling back to stale cache`, { err: marshalErrorLike(err) });

                return cache.response as any;
            }

            throw err;
        }
    }

    /**
     * Attaches optional extras to a result entry; currently only `favicon`,
     * and only when the 'collect-favicon' flag is set on the thread-local context.
     *
     * The page address is read from `link`, falling back to `url`, because the
     * entries produced by `mapToFinalResults` expose the address as `url`.
     * An address that cannot be parsed is skipped instead of failing the request.
     *
     * @param result Entry to decorate in place.
     */
    async assignGeneralMixin(result: Partial<WebSearchEntry>) {
        const collectFavicon = this.threadLocal.get('collect-favicon');
        if (!collectFavicon) {
            return;
        }

        const pageUrl: unknown = result.link || Reflect.get(result, 'url');
        if (typeof pageUrl !== 'string' || !pageUrl) {
            return;
        }

        let origin: string;
        try {
            origin = new URL(pageUrl).origin;
        } catch {
            // Not a valid absolute URL: leave the entry without a favicon.
            return;
        }

        Reflect.set(result, 'favicon', await this.getFavicon(origin));
    }
}
|
src/db/searched.ts
CHANGED
|
@@ -62,3 +62,7 @@ export class SearchResult extends FirestoreRecord {
|
|
| 62 |
/**
 * Search result record stored under the 'serperSearchResults' collection name.
 * Everything else is inherited from SearchResult.
 */
export class SerperSearchResult extends SearchResult {
|
| 63 |
static override collectionName = 'serperSearchResults';
|
| 64 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
/**
 * Search result record stored under the 'serperSearchResults' collection name.
 * Everything else is inherited from SearchResult.
 */
export class SerperSearchResult extends SearchResult {
|
| 63 |
static override collectionName = 'serperSearchResults';
|
| 64 |
}
|
| 65 |
+
|
| 66 |
+
/**
 * Search result record stored under the 'SERPResults' collection name.
 * Everything else is inherited from SearchResult.
 */
export class SERPResult extends SearchResult {
|
| 67 |
+
static override collectionName = 'SERPResults';
|
| 68 |
+
}
|
src/dto/crawler-options.ts
CHANGED
|
@@ -429,6 +429,8 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 429 |
})
|
| 430 |
respondTiming?: RESPOND_TIMING;
|
| 431 |
|
|
|
|
|
|
|
| 432 |
static override from(input: any) {
|
| 433 |
const instance = super.from(input) as CrawlerOptions;
|
| 434 |
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
|
|
|
| 429 |
})
|
| 430 |
respondTiming?: RESPOND_TIMING;
|
| 431 |
|
| 432 |
+
_hintIps?: string[];
|
| 433 |
+
|
| 434 |
static override from(input: any) {
|
| 435 |
const instance = super.from(input) as CrawlerOptions;
|
| 436 |
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
src/dto/jina-embeddings-auth.ts
CHANGED
|
@@ -1,8 +1,9 @@
|
|
| 1 |
import _ from 'lodash';
|
| 2 |
import {
|
| 3 |
Also, AuthenticationFailedError, AuthenticationRequiredError,
|
| 4 |
-
|
| 5 |
AutoCastable,
|
|
|
|
| 6 |
} from 'civkit/civ-rpc';
|
| 7 |
import { htmlEscape } from 'civkit/escape';
|
| 8 |
import { marshalErrorLike } from 'civkit/lang';
|
|
@@ -96,12 +97,14 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable {
|
|
| 96 |
});
|
| 97 |
}
|
| 98 |
|
|
|
|
| 99 |
let account;
|
| 100 |
try {
|
| 101 |
account = await JinaEmbeddingsTokenAccount.fromFirestore(this.bearerToken);
|
| 102 |
} catch (err) {
|
| 103 |
// FireStore would not accept any string as input and may throw if not happy with it
|
| 104 |
-
|
|
|
|
| 105 |
}
|
| 106 |
|
| 107 |
|
|
@@ -109,7 +112,7 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable {
|
|
| 109 |
const jitter = Math.ceil(Math.random() * 30 * 1000);
|
| 110 |
|
| 111 |
if (account && !ignoreCache) {
|
| 112 |
-
if (
|
| 113 |
this.user = account;
|
| 114 |
this.uid = this.user?.user_id;
|
| 115 |
|
|
@@ -117,6 +120,20 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable {
|
|
| 117 |
}
|
| 118 |
}
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
try {
|
| 121 |
const r = await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken);
|
| 122 |
const brief = r.data;
|
|
@@ -148,7 +165,7 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable {
|
|
| 148 |
}
|
| 149 |
|
| 150 |
|
| 151 |
-
throw new
|
| 152 |
}
|
| 153 |
}
|
| 154 |
|
|
|
|
| 1 |
import _ from 'lodash';
|
| 2 |
import {
|
| 3 |
Also, AuthenticationFailedError, AuthenticationRequiredError,
|
| 4 |
+
RPC_CALL_ENVIRONMENT,
|
| 5 |
AutoCastable,
|
| 6 |
+
DownstreamServiceError,
|
| 7 |
} from 'civkit/civ-rpc';
|
| 8 |
import { htmlEscape } from 'civkit/escape';
|
| 9 |
import { marshalErrorLike } from 'civkit/lang';
|
|
|
|
| 97 |
});
|
| 98 |
}
|
| 99 |
|
| 100 |
+
let firestoreDegradation = false;
|
| 101 |
let account;
|
| 102 |
try {
|
| 103 |
account = await JinaEmbeddingsTokenAccount.fromFirestore(this.bearerToken);
|
| 104 |
} catch (err) {
|
| 105 |
// FireStore would not accept any string as input and may throw if not happy with it
|
| 106 |
+
firestoreDegradation = true;
|
| 107 |
+
logger.warn(`Firestore issue`, { err });
|
| 108 |
}
|
| 109 |
|
| 110 |
|
|
|
|
| 112 |
const jitter = Math.ceil(Math.random() * 30 * 1000);
|
| 113 |
|
| 114 |
if (account && !ignoreCache) {
|
| 115 |
+
if ((age < (180_000 - jitter)) && (account.wallet?.total_balance > 0)) {
|
| 116 |
this.user = account;
|
| 117 |
this.uid = this.user?.user_id;
|
| 118 |
|
|
|
|
| 120 |
}
|
| 121 |
}
|
| 122 |
|
| 123 |
+
if (firestoreDegradation) {
|
| 124 |
+
logger.debug(`Using remote UC cached user`);
|
| 125 |
+
const r = await this.jinaEmbeddingsDashboard.authorization(this.bearerToken);
|
| 126 |
+
const brief = r.data;
|
| 127 |
+
const draftAccount = JinaEmbeddingsTokenAccount.from({
|
| 128 |
+
...account, ...brief, _id: this.bearerToken,
|
| 129 |
+
lastSyncedAt: new Date()
|
| 130 |
+
});
|
| 131 |
+
this.user = draftAccount;
|
| 132 |
+
this.uid = this.user?.user_id;
|
| 133 |
+
|
| 134 |
+
return draftAccount;
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
try {
|
| 138 |
const r = await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken);
|
| 139 |
const brief = r.data;
|
|
|
|
| 165 |
}
|
| 166 |
|
| 167 |
|
| 168 |
+
throw new DownstreamServiceError(`Failed to authenticate: ${err}`);
|
| 169 |
}
|
| 170 |
}
|
| 171 |
|
src/services/geoip.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { CityResponse, Reader } from 'maxmind';
|
|
| 4 |
import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit';
|
| 5 |
import { GlobalLogger } from './logger';
|
| 6 |
import path from 'path';
|
|
|
|
| 7 |
|
| 8 |
export enum GEOIP_SUPPORTED_LANGUAGES {
|
| 9 |
EN = 'en',
|
|
@@ -85,6 +86,7 @@ export class GeoIPService extends AsyncService {
|
|
| 85 |
}
|
| 86 |
|
| 87 |
|
|
|
|
| 88 |
async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
|
| 89 |
await this._lazyload();
|
| 90 |
|
|
@@ -116,6 +118,13 @@ export class GeoIPService extends AsyncService {
|
|
| 116 |
});
|
| 117 |
}
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
}
|
| 120 |
|
| 121 |
const instance = container.resolve(GeoIPService);
|
|
|
|
| 4 |
import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit';
|
| 5 |
import { GlobalLogger } from './logger';
|
| 6 |
import path from 'path';
|
| 7 |
+
import { Threaded } from './threaded';
|
| 8 |
|
| 9 |
export enum GEOIP_SUPPORTED_LANGUAGES {
|
| 10 |
EN = 'en',
|
|
|
|
| 86 |
}
|
| 87 |
|
| 88 |
|
| 89 |
+
@Threaded()
|
| 90 |
async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
|
| 91 |
await this._lazyload();
|
| 92 |
|
|
|
|
| 118 |
});
|
| 119 |
}
|
| 120 |
|
| 121 |
+
/**
 * Batch variant of `lookupCity`: resolves every address in `ips` concurrently
 * and returns only the lookups that produced a value.
 *
 * NOTE(review): `@Threaded()` presumably moves the call onto a worker thread;
 * its exact semantics are not visible from this method - confirm.
 *
 * @param ips Addresses to resolve.
 * @param lang Language passed through to each `lookupCity` call (defaults to English).
 * @returns The truthy lookup results, in input order with the misses removed.
 */
@Threaded()
async lookupCities(ips: string[], lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
    const pendingLookups = ips.map((ip) => this.lookupCity(ip, lang));
    const settled = await Promise.all(pendingLookups);

    // Drop addresses for which lookupCity returned a falsy value.
    return settled.filter(Boolean) as GeoIPCityResponse[];
}
|
| 127 |
+
|
| 128 |
}
|
| 129 |
|
| 130 |
const instance = container.resolve(GeoIPService);
|
src/services/misc.ts
CHANGED
|
@@ -57,7 +57,11 @@ export class MiscService extends AsyncService {
|
|
| 57 |
}
|
| 58 |
|
| 59 |
const normalizedHostname = result.hostname.startsWith('[') ? result.hostname.slice(1, -1) : result.hostname;
|
|
|
|
| 60 |
const isIp = isIP(normalizedHostname);
|
|
|
|
|
|
|
|
|
|
| 61 |
if (
|
| 62 |
(result.hostname === 'localhost') ||
|
| 63 |
(isIp && isIPInNonPublicRange(normalizedHostname))
|
|
@@ -88,12 +92,16 @@ export class MiscService extends AsyncService {
|
|
| 88 |
path: 'url'
|
| 89 |
});
|
| 90 |
}
|
|
|
|
| 91 |
}
|
| 92 |
|
| 93 |
}
|
| 94 |
}
|
| 95 |
|
| 96 |
-
return
|
|
|
|
|
|
|
|
|
|
| 97 |
}
|
| 98 |
|
| 99 |
}
|
|
|
|
| 57 |
}
|
| 58 |
|
| 59 |
const normalizedHostname = result.hostname.startsWith('[') ? result.hostname.slice(1, -1) : result.hostname;
|
| 60 |
+
let ips: string[] = [];
|
| 61 |
const isIp = isIP(normalizedHostname);
|
| 62 |
+
if (isIp) {
|
| 63 |
+
ips.push(normalizedHostname);
|
| 64 |
+
}
|
| 65 |
if (
|
| 66 |
(result.hostname === 'localhost') ||
|
| 67 |
(isIp && isIPInNonPublicRange(normalizedHostname))
|
|
|
|
| 92 |
path: 'url'
|
| 93 |
});
|
| 94 |
}
|
| 95 |
+
ips.push(x.address);
|
| 96 |
}
|
| 97 |
|
| 98 |
}
|
| 99 |
}
|
| 100 |
|
| 101 |
+
return {
|
| 102 |
+
url: result,
|
| 103 |
+
ips
|
| 104 |
+
};
|
| 105 |
}
|
| 106 |
|
| 107 |
}
|
src/services/puppeteer.ts
CHANGED
|
@@ -562,7 +562,8 @@ export class PuppeteerControl extends AsyncService {
|
|
| 562 |
headless: !Boolean(process.env.DEBUG_BROWSER),
|
| 563 |
executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
|
| 564 |
args: [
|
| 565 |
-
'--disable-dev-shm-usage',
|
|
|
|
| 566 |
]
|
| 567 |
}).catch((err: any) => {
|
| 568 |
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
|
@@ -1618,11 +1619,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1618 |
}
|
| 1619 |
}
|
| 1620 |
try {
|
| 1621 |
-
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
| 1622 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 1623 |
-
if (snapshot) {
|
| 1624 |
-
snapshot.childFrames = await pSubFrameSnapshots;
|
| 1625 |
-
}
|
| 1626 |
} catch (err: any) {
|
| 1627 |
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
|
| 1628 |
if (stuff instanceof Error) {
|
|
|
|
| 562 |
headless: !Boolean(process.env.DEBUG_BROWSER),
|
| 563 |
executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
|
| 564 |
args: [
|
| 565 |
+
'--disable-dev-shm-usage',
|
| 566 |
+
'--disable-blink-features=AutomationControlled'
|
| 567 |
]
|
| 568 |
}).catch((err: any) => {
|
| 569 |
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
|
|
|
| 1619 |
}
|
| 1620 |
}
|
| 1621 |
try {
|
|
|
|
| 1622 |
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
|
|
|
|
|
|
|
|
|
| 1623 |
} catch (err: any) {
|
| 1624 |
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
|
| 1625 |
if (stuff instanceof Error) {
|
src/services/serp/compat.ts
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * One search result in the provider-neutral shape shared by the SERP services.
 */
export interface WebSearchEntry {
    /** URL the result points to. */
    link: string;
    /** Headline of the result. */
    title: string;
    /** Name of the site or publisher, when one could be extracted. */
    source?: string;
    /** Date text as shown by the search engine (not normalized). */
    date?: string;
    /** Short excerpt shown under the result. */
    snippet?: string;
    /** Image associated with the result, if any. */
    imageUrl?: string;
    /** Secondary links listed under the main result. */
    siteLinks?: Array<{
        link: string;
        title: string;
        snippet?: string;
    }>;
    /** Which kind of search produced the entry. */
    variant?: 'web' | 'images' | 'news';
}
|
src/services/serp/google.ts
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { singleton } from 'tsyringe';
|
| 2 |
+
import { AsyncService } from 'civkit/async-service';
|
| 3 |
+
import { GlobalLogger } from '../logger';
|
| 4 |
+
import { JSDomControl } from '../jsdom';
|
| 5 |
+
import { isMainThread } from 'worker_threads';
|
| 6 |
+
import _ from 'lodash';
|
| 7 |
+
import { WebSearchEntry } from './compat';
|
| 8 |
+
import { ScrappingOptions, SERPSpecializedPuppeteerControl } from './puppeteer';
|
| 9 |
+
import { CurlControl } from '../curl';
|
| 10 |
+
import { readFile } from 'fs/promises';
|
| 11 |
+
import { ApplicationError } from 'civkit/civ-rpc';
|
| 12 |
+
import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors';
|
| 13 |
+
import { parseJSONText } from 'civkit/vectorize';
|
| 14 |
+
import { retryWith } from 'civkit/decorators';
|
| 15 |
+
import { ProxyProvider } from '../../shared/services/proxy-provider';
|
| 16 |
+
|
| 17 |
+
/**
 * Scrapes Google result pages.
 *
 * Web and news searches first side-load the result URL through curl (normally
 * via an allocated proxy) and then hand the page to the SERP-specialised
 * puppeteer control together with an in-page extractor. Image search reads
 * Google's JSON payload directly from the side-loaded response.
 */
@singleton()
export class GoogleSERP extends AsyncService {

    /** Hostname used for every search; overridable through `OVERRIDE_GOOGLE_DOMAIN`. */
    googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com';

    constructor(
        protected globalLogger: GlobalLogger,
        protected puppeteerControl: SERPSpecializedPuppeteerControl,
        protected jsDomControl: JSDomControl,
        protected curlControl: CurlControl,
        protected proxyProvider: ProxyProvider,
    ) {
        // Outside the main thread the puppeteer control is withheld from the
        // dependency list passed to AsyncService.
        const filteredDeps = isMainThread ? arguments : _.without(arguments, puppeteerControl);
        super(...filteredDeps);
    }

    /** Waits for the injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /** Option objects that already went through at least one side-load attempt. */
    retryDet = new WeakSet<ScrappingOptions>();

    /**
     * Side-loads `url` with curl, through a freshly allocated proxy unless
     * `opts.allocProxy` is `'none'`.
     *
     * The retry predicate stops on `ServiceBadApproachError` and on any other
     * `ApplicationError`, and keeps going on `ServiceBadAttemptError`.
     * NOTE(review): returning `undefined` presumably defers to `retryWith`'s
     * default, and `3` is presumably the attempt limit - confirm against civkit.
     *
     * @param url Page to fetch.
     * @param opts Scrapping options; `proxyUrl` is filled in (if unset) when `allocProxy` is set.
     * @returns The curl side-load result plus the `proxy` URL that was used.
     * @throws ServiceBadAttemptError when Google answers with HTTP 429.
     */
    @retryWith((err) => {
        if (err instanceof ServiceBadApproachError) {
            return false;
        }
        if (err instanceof ServiceBadAttemptError) {
            // Keep trying
            return true;
        }
        if (err instanceof ApplicationError) {
            // Quit with this error
            return false;
        }
        return undefined;
    }, 3)
    async sideLoadWithAllocatedProxy(url: URL, opts?: ScrappingOptions) {
        if (opts?.allocProxy === 'none') {
            return this.curlControl.sideLoad(url, opts);
        }

        const proxy = await this.proxyProvider.alloc(
            process.env.PREFERRED_PROXY_COUNTRY || 'auto'
        );
        if (opts) {
            // Seeing the same opts object again means this is a retry: switch
            // a socks5h proxy to plain socks5 for the new attempt.
            if (this.retryDet.has(opts) && proxy.protocol === 'socks5h:') {
                proxy.protocol = 'socks5:';
            }
            this.retryDet.add(opts);
        }
        const r = await this.curlControl.sideLoad(url, {
            ...opts,
            proxyUrl: proxy.href,
        });

        if (r.status === 429) {
            throw new ServiceBadAttemptError('Google returned a 429 error. This may happen due to various reasons, including rate limiting or other issues.');
        }

        if (opts && opts.allocProxy) {
            // Remember the proxy so later steps sharing these opts reuse it.
            opts.proxyUrl ??= proxy.href;
        }

        return { ...r, proxy };
    }

    /**
     * Turns an API-level query object into a Google `/search` URL.
     *
     * `page` (1-based) is converted to Google's `start` offset using `num`
     * (default 10) and replaces any explicit `start`; `location` is dropped;
     * `null`/`undefined` values are skipped; everything else is stringified
     * into the query string.
     *
     * A `page` that does not yield a positive finite offset (page 1, zero,
     * negative, or not a number) produces no `start` parameter at all instead
     * of `start=NaN` or a negative offset.
     *
     * @param query Raw query parameters; not mutated.
     * @returns The search URL on `googleDomain`.
     */
    digestQuery(query: { [k: string]: any; }) {
        const url = new URL(`https://${this.googleDomain}/search`);
        const clone = { ...query };
        const num = clone.num || 10;
        if (clone.page) {
            const page = parseInt(clone.page, 10);
            delete clone.page;
            const start = (page - 1) * num;
            if (Number.isFinite(start) && start > 0) {
                clone.start = start;
            } else {
                delete clone.start;
            }
        }
        if (clone.location) {
            delete clone.location;
        }

        for (const [k, v] of Object.entries(clone)) {
            if (v === undefined || v === null) {
                continue;
            }
            url.searchParams.set(k, `${v}`);
        }

        return url;
    }

    /**
     * Runs a regular web search.
     *
     * @param query Raw query parameters, see `digestQuery`.
     * @param opts Scrapping options; `sideLoad` is overwritten with the side-load result when available.
     * @returns Whatever `controlledScrap` yields for the `getWebSearchResults` extractor.
     */
    async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
        const url = this.digestQuery(query);

        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
        if (opts && sideLoaded.sideLoadOpts) {
            opts.sideLoad = sideLoaded.sideLoadOpts;
        }

        const snapshot = await this.puppeteerControl.controlledScrap(url, getWebSearchResults, opts);

        return snapshot;
    }

    /**
     * Runs a news search (`tbm=nws`).
     *
     * @param query Raw query parameters, see `digestQuery`.
     * @param opts Scrapping options; `sideLoad` is overwritten with the side-load result when available.
     * @returns Whatever `controlledScrap` yields for the `getNewsSearchResults` extractor.
     */
    async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
        const url = this.digestQuery(query);

        url.searchParams.set('tbm', 'nws');

        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
        if (opts && sideLoaded.sideLoadOpts) {
            opts.sideLoad = sideLoaded.sideLoadOpts;
        }

        const snapshot = await this.puppeteerControl.controlledScrap(url, getNewsSearchResults, opts);

        return snapshot;
    }

    /**
     * Runs an image search against Google's JSON image endpoint; no browser
     * is involved.
     *
     * The `ijn` page index is derived from the effective `start` offset of
     * the digested URL, so both an explicit `start` and a `page` parameter
     * move to the corresponding image page.
     *
     * @param query Raw query parameters, see `digestQuery`.
     * @param opts Scrapping options forwarded to the side-load.
     * @returns Image results; an empty array when the payload carries no metadata list.
     * @throws ServiceBadAttemptError when the response is not a 200 with a body,
     *         or when the body does not contain the expected JSON payload.
     */
    async imageSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
        const url = this.digestQuery(query);

        const start = Number(url.searchParams.get('start')) || 0;
        const pageSize = Number(query.num) || 10;

        url.searchParams.set('tbm', 'isch');
        url.searchParams.set('asearch', 'isch');
        url.searchParams.set('async', `_fmt:json,p:1,ijn:${start ? Math.floor(start / pageSize) : 0}`);

        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);

        if (sideLoaded.status !== 200 || !sideLoaded.file) {
            throw new ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
        }

        const jsonTxt = (await readFile((await sideLoaded.file.filePath))).toString();
        // The JSON document is preceded by a non-JSON prefix; locate its start.
        const payloadStart = jsonTxt.indexOf('{"ischj":');
        if (payloadStart < 0) {
            throw new ServiceBadAttemptError('Google returned an unexpected image search payload. This may happen due to various reasons, including rate limiting or other issues.');
        }
        const rJSON = parseJSONText(jsonTxt.slice(payloadStart));

        const metadata = _.get(rJSON, 'ischj.metadata');
        if (!Array.isArray(metadata)) {
            // No metadata list in the payload: report "no results" rather than crashing.
            return [] as WebSearchEntry[];
        }

        return metadata.map((x: any) => {

            return {
                link: _.get(x, 'result.referrer_url'),
                title: _.get(x, 'result.page_title'),
                snippet: _.get(x, 'text_in_grid.snippet'),
                source: _.get(x, 'result.site_title'),
                imageWidth: _.get(x, 'original_image.width'),
                imageHeight: _.get(x, 'original_image.height'),
                imageUrl: _.get(x, 'original_image.url'),
                variant: 'images',
            };
        }) as WebSearchEntry[];
    }
}
|
| 169 |
+
|
| 170 |
+
/**
 * In-page extractor for Google web results.
 *
 * It is passed as a value to `controlledScrap` and relies on page globals
 * (`location`, `document`, `window.waitForSelector`), so it must stay fully
 * self-contained: no references to module-level helpers or imports.
 *
 * @returns The extracted entries, or `undefined` when the results container
 *          or its query marker cannot be found.
 * @throws Error when the page is one of Google's `/sorry` or `/error` pages.
 */
async function getWebSearchResults() {
    const onBlockedPage = ['/sorry', '/error'].some((prefix) => location.pathname.startsWith(prefix));
    if (onBlockedPage) {
        throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
    }

    const resultsSelector = 'div[data-async-context^="query"]';
    // @ts-ignore
    await Promise.race([window.waitForSelector(resultsSelector), window.waitForSelector('#botstuff .mnr-c')]);

    const container = document.querySelector(resultsSelector);
    if (!container) {
        return undefined;
    }

    // The attribute carries the searched text after a "query:" marker.
    const asyncContext = container.getAttribute('data-async-context');
    const searchedFor = decodeURIComponent(asyncContext?.split('query:')[1] || '');
    if (!searchedFor) {
        return undefined;
    }

    const entries: unknown[] = [];
    for (const block of Array.from(container.querySelectorAll('div[lang],div[data-surl]'))) {
        const anchor = block.querySelector('a:not([href="#"])');
        if (!anchor) {
            continue;
        }
        const href = anchor.getAttribute('href');

        // Anchors wrapping a heading div are a different card type and are dropped on purpose.
        if (anchor.querySelector('div[role="heading"]')) {
            continue;
        }

        const title = anchor.querySelector('h3')?.textContent;
        const firstLabelledSpan = Array.from(anchor.querySelectorAll('span')).find((span) => span.textContent);
        const source = firstLabelledSpan?.textContent;
        const citeText = anchor.querySelector('cite[role=text]')?.textContent;

        const snippetSpans = Array.from(block.querySelectorAll('div[data-sncf*="1"] span'));
        const lastSpanText = snippetSpans[snippetSpans.length - 1]?.textContent;
        // Fall back to the legacy snippet container only when the last span is empty.
        const snippet = lastSpanText || block.querySelector('div.IsZvec')?.textContent?.trim() || undefined;
        // Prefer the date after the "·" in the cite line, else the span preceding the snippet.
        const date = citeText?.split('·')[1]?.trim() ?? snippetSpans[snippetSpans.length - 2]?.textContent?.trim();

        const rawImage = block.querySelector('div[data-sncf*="1"] img[src]:not(img[src^="data"])')?.getAttribute('src');
        const imageUrl = rawImage?.startsWith('data:') ? undefined : rawImage;

        let siteLinks: any[] = Array.from(block.querySelectorAll('div[data-sncf*="3"] a[href]')).map((a) => ({
            link: a.getAttribute('href'),
            title: a.textContent,
        }));
        const enclosingCard = block.parentElement?.closest('div[data-hveid]');
        if (!siteLinks.length && enclosingCard) {
            // Alternative layout: site links rendered as table cells with h3 headings.
            const headings = Array.from(enclosingCard.querySelectorAll('td h3'));
            if (headings.length) {
                siteLinks = [];
                for (const heading of headings) {
                    const subLink = heading.querySelector('a');
                    if (!subLink) {
                        continue;
                    }
                    siteLinks.push({
                        link: subLink.getAttribute('href'),
                        title: subLink.textContent,
                        snippet: heading.nextElementSibling?.textContent,
                    });
                }
            }
        }

        entries.push({
            link: href,
            title,
            source,
            date,
            snippet,
            imageUrl,
            siteLinks: siteLinks.length ? siteLinks : undefined,
            variant: 'web',
        });
    }

    return entries as WebSearchEntry[];
}
|
| 263 |
+
|
| 264 |
+
/**
 * In-page extractor for Google news results.
 *
 * It is passed as a value to `controlledScrap` and relies on page globals
 * (`location`, `document`, `window.waitForSelector`), so it must stay fully
 * self-contained: no references to module-level helpers or imports.
 *
 * @returns The extracted entries, or `undefined` when the results container
 *          or its query marker cannot be found.
 * @throws Error when the page is one of Google's `/sorry` or `/error` pages.
 */
async function getNewsSearchResults() {
    const onBlockedPage = ['/sorry', '/error'].some((prefix) => location.pathname.startsWith(prefix));
    if (onBlockedPage) {
        throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
    }

    const resultsSelector = 'div[data-async-context^="query"]';
    // @ts-ignore
    await Promise.race([window.waitForSelector(resultsSelector), window.waitForSelector('#botstuff .mnr-c')]);

    const container = document.querySelector(resultsSelector);
    if (!container) {
        return undefined;
    }

    // The attribute carries the searched text after a "query:" marker.
    const asyncContext = container.getAttribute('data-async-context');
    const searchedFor = decodeURIComponent(asyncContext?.split('query:')[1] || '');
    if (!searchedFor) {
        return undefined;
    }

    const entries: unknown[] = [];
    for (const article of Array.from(container.querySelectorAll('div[data-news-doc-id]'))) {
        const anchor = article.querySelector('a:not([href="#"])');
        if (!anchor) {
            continue;
        }
        const href = anchor.getAttribute('href');

        // News cards keep their headline in a heading div; cards without one are skipped.
        const heading = anchor.querySelector('div[role="heading"]');
        if (!heading) {
            continue;
        }

        // Publisher sits right before the heading, the summary right after it.
        const title = heading.textContent?.trim();
        const source = heading.previousElementSibling?.textContent?.trim();
        const snippet = heading.nextElementSibling?.textContent?.trim();

        // The last span next to the heading holds the date text.
        const siblingSpans = Array.from(heading.parentElement?.querySelectorAll('span') || []);
        const date = siblingSpans[siblingSpans.length - 1]?.textContent?.trim();

        entries.push({
            link: href,
            title,
            source,
            date,
            snippet,
            variant: 'news',
        });
    }

    return entries as WebSearchEntry[];
}
|
src/services/serp/puppeteer.ts
ADDED
|
@@ -0,0 +1,692 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import _ from 'lodash';
|
| 2 |
+
import { readFile } from 'fs/promises';
|
| 3 |
+
import { container, singleton } from 'tsyringe';
|
| 4 |
+
|
| 5 |
+
import type { Browser, CookieParam, GoToOptions, Page, Viewport } from 'puppeteer';
|
| 6 |
+
import type { Cookie } from 'set-cookie-parser';
|
| 7 |
+
import puppeteer, { TimeoutError } from 'puppeteer';
|
| 8 |
+
|
| 9 |
+
import { Defer } from 'civkit/defer';
|
| 10 |
+
import { AssertionFailureError, ParamValidationError } from 'civkit/civ-rpc';
|
| 11 |
+
import { AsyncService } from 'civkit/async-service';
|
| 12 |
+
import { FancyFile } from 'civkit/fancy-file';
|
| 13 |
+
import { delay } from 'civkit/timeout';
|
| 14 |
+
|
| 15 |
+
import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../../shared/lib/errors';
|
| 16 |
+
import { CurlControl } from '../curl';
|
| 17 |
+
import { AsyncLocalContext } from '../async-context';
|
| 18 |
+
import { GlobalLogger } from '../logger';
|
| 19 |
+
import { minimalStealth } from '../minimal-stealth';
|
| 20 |
+
import { BlackHoleDetector } from '../blackhole-detector';
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
/**
 * Per-request options understood by the SERP scraping services.
 * Every field is optional.
 */
export interface ScrappingOptions {
    /** Proxy URL to route the request through. */
    proxyUrl?: string;
    /** Cookies to apply to the request. */
    cookies?: Cookie[];
    /** User-agent string to use instead of the default one. */
    overrideUserAgent?: string;
    /** Time budget for the request, in milliseconds. */
    timeoutMs?: number;
    /** Locale to request content in. */
    locale?: string;
    /** Referer to present with the request. */
    referer?: string;
    /** Additional request headers. */
    extraHeaders?: Record<string, string>;
    /** Browser viewport to use. */
    viewport?: Viewport;
    /** NOTE(review): presumably routes sub-resource requests through the proxy too - confirm. */
    proxyResources?: boolean;
    /** Proxy allocation hint; `'none'` makes the Google side-load skip proxy allocation. */
    allocProxy?: string;

    /** Pre-fetched responses and per-origin proxies produced by a curl side-load. */
    sideLoad?: {
        /** Canned responses keyed by URL. */
        impersonate: Record<string, {
            status: number;
            headers: Record<string, string | string[]>;
            contentType?: string;
            body?: FancyFile;
        }>;
        /** Proxy URL keyed by origin. */
        proxyOrigin: Record<string, string>;
    };

}
|
| 48 |
+
|
| 49 |
+
/**
 * Page script that replaces `window.IntersectionObserver` with a subclass
 * tracking every live observer and its targets, and exposes
 * `window.simulateScroll()`.
 *
 * `simulateScroll()` calls each observer's callback with a synthetic
 * "fully intersecting" entry for every observed target, then (on the next
 * timer tick) replays the last entry recorded for that target with a fresh
 * timestamp.
 *
 * The replayed entry is built by `cloneIntersectionObserverEntry`, so it keeps
 * the `IntersectionObserverEntry` prototype; spreading the clone into a new
 * object literal would silently drop that prototype.
 */
const SIMULATE_SCROLL = `
(function () {
    function createIntersectionObserverEntry(target, isIntersecting, timestamp) {
        const targetRect = target.getBoundingClientRect();
        const record = {
            target,
            isIntersecting,
            time: timestamp,
            // If intersecting, intersectionRect matches boundingClientRect
            // If not intersecting, intersectionRect is empty (0x0)
            intersectionRect: isIntersecting
                ? targetRect
                : new DOMRectReadOnly(0, 0, 0, 0),
            // Current bounding client rect of the target
            boundingClientRect: targetRect,
            // Intersection ratio is either 0 (not intersecting) or 1 (fully intersecting)
            intersectionRatio: isIntersecting ? 1 : 0,
            // Root bounds (viewport in our case)
            rootBounds: new DOMRectReadOnly(
                0,
                0,
                window.innerWidth,
                window.innerHeight
            )
        };
        Object.setPrototypeOf(record, window.IntersectionObserverEntry.prototype);
        return record;
    }
    function cloneIntersectionObserverEntry(entry, time = entry.time) {
        const record = {
            target: entry.target,
            isIntersecting: entry.isIntersecting,
            time,
            intersectionRect: entry.intersectionRect,
            boundingClientRect: entry.boundingClientRect,
            intersectionRatio: entry.intersectionRatio,
            rootBounds: entry.rootBounds
        };
        Object.setPrototypeOf(record, window.IntersectionObserverEntry.prototype);
        return record;
    }
    const orig = window.IntersectionObserver;
    const kCallback = Symbol('callback');
    const kLastEntryMap = Symbol('lastEntryMap');
    const liveObservers = new Map();
    class MangledIntersectionObserver extends orig {
        constructor(callback, options) {
            super((entries, observer) => {
                const lastEntryMap = observer[kLastEntryMap];
                const lastEntry = entries[entries.length - 1];
                lastEntryMap.set(lastEntry.target, lastEntry);
                return callback(entries, observer);
            }, options);
            this[kCallback] = callback;
            this[kLastEntryMap] = new WeakMap();
            liveObservers.set(this, new Set());
        }
        disconnect() {
            liveObservers.get(this)?.clear();
            liveObservers.delete(this);
            return super.disconnect();
        }
        observe(target) {
            const observer = liveObservers.get(this);
            observer?.add(target);
            return super.observe(target);
        }
        unobserve(target) {
            const observer = liveObservers.get(this);
            observer?.delete(target);
            return super.unobserve(target);
        }
    }
    Object.defineProperty(MangledIntersectionObserver, 'name', { value: 'IntersectionObserver', writable: false });
    window.IntersectionObserver = MangledIntersectionObserver;
    function simulateScroll() {
        for (const [observer, targets] of liveObservers.entries()) {
            const t0 = performance.now();
            for (const target of targets) {
                const entry = createIntersectionObserverEntry(target, true, t0);
                observer[kCallback]([entry], observer);
                setTimeout(() => {
                    const t1 = performance.now();
                    const lastEntry = observer[kLastEntryMap].get(target);
                    if (!lastEntry) {
                        return;
                    }
                    const entry2 = cloneIntersectionObserverEntry(lastEntry, t1);
                    observer[kCallback]([entry2], observer);
                });
            }
        }
    }
    window.simulateScroll = simulateScroll;
})();
`;
|
| 145 |
+
|
| 146 |
+
/**
 * Page script that dispatches a `mutationIdle` CustomEvent on `document`
 * once the DOM has seen no child-list mutation for 200ms.
 *
 * The watcher starts on `DOMContentLoaded` while the document is still
 * loading, and immediately when the script is evaluated in a document that
 * has already been parsed (where `DOMContentLoaded` would never fire again).
 */
const MUTATION_IDLE_WATCH = `
(function () {
    let timeout;
    const sendMsg = () => {
        document.dispatchEvent(new CustomEvent('mutationIdle'));
    };

    const cb = () => {
        // Only re-arm once the watcher has started (timeout is set by start()).
        if (timeout) {
            clearTimeout(timeout);
            timeout = setTimeout(sendMsg, 200);
        }
    };
    const mutationObserver = new MutationObserver(cb);

    const start = () => {
        mutationObserver.observe(document.documentElement, {
            childList: true,
            subtree: true,
        });
        timeout = setTimeout(sendMsg, 200);
    };

    if (document.readyState === 'loading') {
        document.addEventListener('DOMContentLoaded', start, { once: true });
    } else {
        start();
    }
})();
`;
|
| 170 |
+
|
| 171 |
+
/**
 * Full script injected into page frames: the scroll simulator, the
 * mutation-idle watcher, the stealth patch, and a `window.waitForSelector`
 * helper used by the in-page extractors.
 *
 * `waitForSelector(selectorText)` resolves with the first element matching
 * the selector. It checks the current DOM first; if the document is still
 * loading it waits for `DOMContentLoaded` and checks again before falling
 * back to a MutationObserver, so an element that was parsed in the meantime
 * is not missed when no later mutation happens. The promise never rejects and
 * has no timeout of its own.
 */
const SCRIPT_TO_INJECT_INTO_FRAME = `
${SIMULATE_SCROLL}
${MUTATION_IDLE_WATCH}
(${minimalStealth.toString()})();

(function(){

    let lastMutationIdle = 0;
    document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());

    function waitForSelector(selectorText) {
        return new Promise((resolve) => {
            const existing = document.querySelector(selectorText);
            if (existing) {
                resolve(existing);
                return;
            }
            const watch = () => {
                // The element may have been parsed since the first check.
                const parsed = document.querySelector(selectorText);
                if (parsed) {
                    resolve(parsed);
                    return;
                }
                const observer = new MutationObserver(() => {
                    const elem = document.querySelector(selectorText);
                    if (elem) {
                        observer.disconnect();
                        resolve(elem);
                    }
                });
                observer.observe(document.documentElement, {
                    childList: true,
                    subtree: true
                });
            };
            if (document.readyState === 'loading') {
                document.addEventListener('DOMContentLoaded', watch, { once: true });
                return;
            }
            watch();
        });
    }
    window.waitForSelector = waitForSelector;
})();
`;
|
| 221 |
+
|
| 222 |
+
@singleton()
|
| 223 |
+
export class SERPSpecializedPuppeteerControl extends AsyncService {
|
| 224 |
+
|
| 225 |
+
_sn = 0;
|
| 226 |
+
browser!: Browser;
|
| 227 |
+
logger = this.globalLogger.child({ service: this.constructor.name });
|
| 228 |
+
|
| 229 |
+
__loadedPage: Page[] = [];
|
| 230 |
+
|
| 231 |
+
finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
|
| 232 |
+
snMap = new WeakMap<Page, number>();
|
| 233 |
+
livePages = new Set<Page>();
|
| 234 |
+
lastPageCratedAt: number = 0;
|
| 235 |
+
ua: string = '';
|
| 236 |
+
|
| 237 |
+
protected _REPORT_FUNCTION_NAME = 'bingo';
|
| 238 |
+
|
| 239 |
+
lifeCycleTrack = new WeakMap();
|
| 240 |
+
|
| 241 |
+
/**
 * Stores the injected services and installs the 'crippled' handler.
 *
 * Each 'crippled' event empties the pre-loaded page queue and the live page
 * set. Once the event has fired more than five times, an 'error' event is
 * emitted on the next tick.
 */
constructor(
    protected globalLogger: GlobalLogger,
    protected asyncLocalContext: AsyncLocalContext,
    protected curlControl: CurlControl,
    protected blackHoleDetector: BlackHoleDetector,
) {
    super(...arguments);
    this.setMaxListeners(Infinity);

    let crashCount = 0;
    const onCrippled = () => {
        crashCount++;
        // Forget every page we were tracking.
        this.__loadedPage.length = 0;
        this.livePages.clear();
        if (crashCount <= 5) {
            return;
        }
        process.nextTick(() => {
            this.emit('error', new Error('Browser crashed too many times, quitting...'));
            // process.exit(1);
        });
    };
    this.on('crippled', onCrippled);
}
|
| 263 |
+
|
| 264 |
+
/**
 * Launches the browser and announces readiness.
 *
 * In dry-run mode (NODE_ENV contains 'dry-run') no browser is started.
 * Otherwise any previous browser is closed (or killed when it is no longer
 * connected), a new one is launched, and one page is pre-loaded before
 * 'ready' is emitted. A launch failure is logged, re-emitted as 'error' on
 * the next tick, and rethrown.
 */
override async init() {
    await this.dependencyReady();
    const isDryRun = process.env.NODE_ENV?.includes('dry-run');
    if (isDryRun) {
        this.emit('ready');
        return;
    }

    const previous = this.browser;
    if (previous) {
        if (previous.connected) {
            await previous.close();
        } else {
            previous.process()?.kill('SIGKILL');
        }
    }

    try {
        this.browser = await puppeteer.launch({
            timeout: 10_000,
            headless: !Boolean(process.env.DEBUG_BROWSER),
            executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
            args: [
                '--disable-dev-shm-usage', '--disable-blink-features=AutomationControlled'
            ]
        });
    } catch (err: any) {
        this.logger.error(`Unknown firebase issue, just die fast.`, { err });
        process.nextTick(() => {
            this.emit('error', err);
            // process.exit(1);
        });
        throw err;
    }

    // On disconnect: report the crash, then ask for service readiness again on the next tick.
    this.browser.once('disconnected', () => {
        this.logger.warn(`Browser disconnected`);
        if (this.browser) {
            this.emit('crippled');
        }
        process.nextTick(() => this.serviceReady());
    });

    this.ua = await this.browser.userAgent();
    this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`);
    this.curlControl.impersonateChrome(this.ua.replace(/Headless/i, ''));

    // A truthy argument makes newPage() skip its own serviceReady() wait,
    // which would otherwise wait on this very init().
    const warmPage = await this.newPage('beware_deadlock');
    this.__loadedPage.push(warmPage);

    this.emit('ready');
}
|
| 308 |
+
|
| 309 |
+
/**
 * Opens a page inside its own browser context and prepares it for scraping.
 *
 * @param bewareDeadLock When truthy, the serviceReady() wait is skipped.
 * @returns The prepared page, already registered in `snMap` and `livePages`.
 * @throws ServiceNodeResourceDrainError when the context or page cannot be
 *         created; the browser process is killed in that case.
 */
async newPage<T>(bewareDeadLock: any = false) {
    if (!bewareDeadLock) {
        await this.serviceReady();
    }
    const sn = this._sn++;
    let page;
    try {
        const dedicatedContext = await this.browser.createBrowserContext();
        page = await dedicatedContext.newPage();
    } catch (err: any) {
        this.logger.warn(`Failed to create page ${sn}`, { err });
        this.browser.process()?.kill('SIGKILL');
        throw new ServiceNodeResourceDrainError(`This specific worker node failed to open a new page, try again.`);
    }

    const visibleUa = this.ua.replace(/Headless/i, '');
    // All preparation steps are started in order and awaited together.
    await Promise.all([
        page.setUserAgent(visibleUa),
        // page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`),
        // page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`),
        page.setBypassCSP(true),
        page.setViewport({ width: 1024, height: 1024 }),
        // In-page calls to the report function are re-emitted as an event on the page object.
        page.exposeFunction(this._REPORT_FUNCTION_NAME, (thing: T) => {
            page.emit(this._REPORT_FUNCTION_NAME, thing);
        }),
        page.exposeFunction('setViewport', (viewport: Viewport | null) => {
            page.setViewport(viewport).catch(() => undefined);
        }),
        page.evaluateOnNewDocument(SCRIPT_TO_INJECT_INTO_FRAME),
    ]);

    this.snMap.set(page, sn);
    this.logger.debug(`Page ${sn} created.`);
    this.lastPageCratedAt = Date.now();
    this.livePages.add(page);

    return page;
}
|
| 347 |
+
|
| 348 |
+
/**
 * Hands out a page for one scrape.
 *
 * A pre-loaded page is taken from the queue when available; when the queue
 * then holds one page or fewer, a replacement is created in the background.
 * Otherwise a fresh page is created on the spot. Every page handed out gets
 * a 5-minute timer that ditches it, stored in `finalizerMap`.
 */
async getNextPage() {
    const pooled = this.__loadedPage.shift();
    if (pooled && this.__loadedPage.length <= 1) {
        process.nextTick(() => {
            this.newPage()
                .then((r) => this.__loadedPage.push(r))
                .catch((err) => {
                    this.logger.warn(`Failed to load new page ahead of time`, { err });
                });
        });
    }

    const thePage: Page = pooled ?? await this.newPage();

    const expiry = setTimeout(() => {
        this.logger.warn(`Page is not allowed to live past 5 minutes, ditching page ${this.snMap.get(thePage)}...`);
        this.ditchPage(thePage);
    }, 300 * 1000);
    this.finalizerMap.set(thePage, expiry);

    return thePage;
}
|
| 376 |
+
|
| 377 |
+
/**
 * Disposes of a page handed out by getNextPage().
 *
 * Cancels the page's pending expiry timer, removes the page from
 * `livePages`, and closes the page together with its browser context.
 * Closing is raced against a 5 second delay so a hung close cannot block
 * the caller; close failures are logged, never thrown.
 *
 * @param page The page to dispose of. Already-closed pages are only
 *             untracked.
 */
async ditchPage(page: Page) {
    const timer = this.finalizerMap.get(page);
    if (timer !== undefined) {
        clearTimeout(timer);
        this.finalizerMap.delete(page);
    }
    if (page.isClosed()) {
        // Still drop the bookkeeping entry, otherwise the set keeps a stale reference.
        this.livePages.delete(page);
        return;
    }
    const sn = this.snMap.get(page);
    this.logger.debug(`Closing page ${sn}`);
    await Promise.race([
        (async () => {
            const ctx = page.browserContext();
            try {
                await page.close();
            } finally {
                // The dedicated context is closed even when closing the page fails.
                await ctx.close();
            }
        })(),
        delay(5000)
    ]).catch((err) => {
        this.logger.error(`Failed to destroy page ${sn}`, { err });
    });
    this.livePages.delete(page);
}
|
| 402 |
+
|
| 403 |
+
/**
 * Navigates a fresh page to `parsedUrl` and runs `func` inside it.
 *
 * `func` is serialized with `toString()` and injected so that it runs in the
 * top frame of every new document; its resolved value (or rejection) is sent
 * back through the exposed report function. Requests are intercepted:
 * media, font, image and stylesheet requests are aborted, configured
 * impersonations are answered locally, and proxied origins are side-loaded
 * through `curlControl`.
 *
 * @param parsedUrl The URL to open.
 * @param func      Self-contained async function evaluated in the page. It
 *                  cannot reference anything from this module's scope.
 * @param options   Scraping options (headers, cookies, locale, proxy,
 *                  viewport, timeout, ...).
 * @returns The value reported by `func`.
 * @throws ParamValidationError when the supplied cookies cannot be set.
 *         The returned promise also rejects with ServiceCrashedError when
 *         the browser crashes, SecurityCompromiseError on an 'abuse' event,
 *         and TimeoutError when nothing is reported within 5 seconds after
 *         navigation settles.
 */
async controlledScrap<T>(parsedUrl: URL, func: (this: void) => Promise<T>, options: ScrappingOptions = {}): Promise<T> {
    // parsedUrl.search = '';
    const url = parsedUrl.toString();
    const page = await this.getNextPage();
    this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
    page.on('response', (_resp) => {
        this.blackHoleDetector.itWorked();
    });
    page.on('request', async (req) => {
        if (req.isInterceptResolutionHandled()) {
            return;
        }
        // Continue at priority 1 with the caller's extra headers layered on top of the request's own.
        const continueWithExtraHeaders = () => {
            const overrides = req.continueRequestOverrides();
            const continueArgs = [{
                ...overrides,
                headers: {
                    ...req.headers(),
                    ...overrides?.headers,
                    ...options.extraHeaders,
                }
            }, 1] as const;

            return req.continue(continueArgs[0], continueArgs[1]);
        };
        // Continue at priority 0 without touching the request.
        const continueUntouched = () => req.continue(req.continueRequestOverrides(), 0);

        const reqUrlParsed = new URL(req.url());
        if (!reqUrlParsed.protocol.startsWith('http')) {
            return continueUntouched();
        }
        const typ = req.resourceType();
        if (typ === 'media' || typ === 'font' || typ === 'image' || typ === 'stylesheet') {
            // Non-cooperative answer to block all media requests.
            return req.abort('blockedbyclient');
        }
        if (!options.proxyResources) {
            const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
            if (!isDocRequest) {
                if (options.extraHeaders) {
                    return continueWithExtraHeaders();
                }

                return continueUntouched();
            }
        }
        const sideload = options.sideLoad;

        const impersonate = sideload?.impersonate[reqUrlParsed.href];
        if (impersonate) {
            let body;
            if (impersonate.body) {
                body = await readFile(await impersonate.body.filePath);
                // Another handler may have resolved the request while the file was being read.
                if (req.isInterceptResolutionHandled()) {
                    return;
                }
            }
            return req.respond({
                status: impersonate.status,
                headers: impersonate.headers,
                contentType: impersonate.contentType,
                body: body ? Uint8Array.from(body) : undefined,
            }, 999);
        }

        const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
        const ctx = this.lifeCycleTrack.get(page);
        if (proxy && ctx) {
            return await this.asyncLocalContext.bridge(ctx, async () => {
                try {
                    const curled = await this.curlControl.sideLoad(reqUrlParsed, {
                        ...options,
                        method: req.method(),
                        body: req.postData(),
                        extraHeaders: {
                            ...req.headers(),
                            ...options.extraHeaders,
                        },
                        proxyUrl: proxy
                    });
                    if (req.isInterceptResolutionHandled()) {
                        return;
                    }

                    if (curled.chain.length === 1) {
                        if (!curled.file) {
                            return req.respond({
                                status: curled.status,
                                headers: _.omit(curled.headers, 'result'),
                                contentType: curled.contentType,
                            }, 3);
                        }
                        const body = await readFile(await curled.file.filePath);
                        if (req.isInterceptResolutionHandled()) {
                            return;
                        }
                        return req.respond({
                            status: curled.status,
                            headers: _.omit(curled.headers, 'result'),
                            contentType: curled.contentType,
                            body: Uint8Array.from(body),
                        }, 3);
                    }
                    // More than one hop: remember the side-load results and answer with the first hop.
                    options.sideLoad ??= curled.sideLoadOpts;
                    _.merge(options.sideLoad, curled.sideLoadOpts);
                    const firstReq = curled.chain[0];

                    return req.respond({
                        status: firstReq.result!.code,
                        headers: _.omit(firstReq, 'result'),
                    }, 3);
                } catch (err: any) {
                    this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
                }
                // Side-loading failed: fall back to letting the browser perform the request.
                if (req.isInterceptResolutionHandled()) {
                    return;
                }

                return continueWithExtraHeaders();
            });
        }

        if (req.isInterceptResolutionHandled()) {
            return;
        }

        return continueWithExtraHeaders();
    });
    await page.setRequestInterception(true);

    const sn = this.snMap.get(page);
    this.logger.info(`Page ${sn}: Scraping ${url}`, { url });

    await page.evaluateOnNewDocument(`(function () {
    if (window.top !== window.self) {
        return;
    }
    const func = ${func.toString()};

    func().then((result) => {
        window.${this._REPORT_FUNCTION_NAME}({data: result});
    }).catch((err) => {
        window.${this._REPORT_FUNCTION_NAME}({err: err});
    });

})();`);

    if (options.locale) {
        // Add headers via request interception to walk around this bug
        // https://github.com/puppeteer/puppeteer/issues/10235
        // await page.setExtraHTTPHeaders({
        //     'Accept-Language': options.locale
        // });

        // The callback is serialized and executed inside the page, where `options`
        // does not exist; the locale therefore has to travel as an argument.
        await page.evaluateOnNewDocument((locale: string) => {
            Object.defineProperty(navigator, "language", {
                get: function () {
                    return locale;
                }
            });
            Object.defineProperty(navigator, "languages", {
                get: function () {
                    return [locale];
                }
            });
        }, options.locale);
    }

    if (options.cookies) {
        const mapped = options.cookies.map((x) => {
            const draft: CookieParam = {
                name: x.name,
                value: encodeURIComponent(x.value),
                secure: x.secure,
                domain: x.domain,
                path: x.path,
                expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined,
                sameSite: x.sameSite as any,
            };
            if (!draft.expires && x.maxAge) {
                draft.expires = Math.floor(Date.now() / 1000) + x.maxAge;
            }
            if (!draft.domain) {
                // Without a domain, the cookie is scoped to the target URL instead.
                draft.url = parsedUrl.toString();
            }

            return draft;
        });
        try {
            await page.setCookie(...mapped);
        } catch (err: any) {
            this.logger.warn(`Page ${sn}: Failed to set cookies`, { err });
            throw new ParamValidationError({
                path: 'cookies',
                message: `Failed to set cookies: ${err?.message}`
            });
        }
    }
    if (options.overrideUserAgent) {
        await page.setUserAgent(options.overrideUserAgent);
    }
    if (options.viewport) {
        await page.setViewport(options.viewport);
    }

    const resultDeferred = Defer<T>();
    const crippleListener = () => resultDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
    this.once('crippled', crippleListener);
    // `.finally()` returns a new promise that mirrors a rejection; swallow it here so the
    // cleanup hook does not surface as an unhandled rejection. Callers still observe the
    // rejection through `resultDeferred.promise` itself.
    resultDeferred.promise.finally(() => {
        this.off('crippled', crippleListener);
    }).catch(() => undefined);
    const hdl = (s: {
        err?: any;
        data?: T;
    }) => {
        if (s.err) {
            resultDeferred.reject(s.err);
            return;
        }
        resultDeferred.resolve(s.data);
    };
    page.on(this._REPORT_FUNCTION_NAME, hdl as any);
    page.once('abuse', (event: any) => {
        this.emit('abuse', { ...event, url: parsedUrl });

        resultDeferred.reject(
            new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
        );
    });

    const timeout = options.timeoutMs || 30_000;
    const goToOptions: GoToOptions = {
        waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
        timeout,
    };

    if (options.referer) {
        goToOptions.referer = options.referer;
    }

    const gotoPromise = page.goto(url, goToOptions)
        .catch((err) => {
            if (err instanceof TimeoutError) {
                this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err });
                return new AssertionFailureError({
                    message: `Failed to goto ${url}: ${err}`,
                    cause: err,
                });
            }

            this.logger.warn(`Page ${sn}: Browsing of ${url} aborted`, { err });
            return undefined;
        }).then(async (r) => {
            // Give the injected function 5 more seconds after navigation settles.
            await delay(5000);
            resultDeferred.reject(new TimeoutError(`Control function did not respond in time`));
            return r;
        });

    try {
        await Promise.race([resultDeferred.promise, gotoPromise]);

        return resultDeferred.promise;
    } finally {
        page.off(this._REPORT_FUNCTION_NAME, hdl as any);
        this.ditchPage(page);
        resultDeferred.resolve();
    }
}
|
| 687 |
+
|
| 688 |
+
}
|
| 689 |
+
|
| 690 |
+
// Resolve the container-managed instance at module load and expose it as the default export.
export default container.resolve(SERPSpecializedPuppeteerControl);
|
src/services/serp/serper.ts
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import { singleton } from 'tsyringe';
|
| 3 |
+
import { GlobalLogger } from '../logger';
|
| 4 |
+
import { SecretExposer } from '../../shared/services/secrets';
|
| 5 |
+
import { AsyncLocalContext } from '../async-context';
|
| 6 |
+
import { SerperBingHTTP, SerperGoogleHTTP, SerperImageSearchResponse, SerperNewsSearchResponse, SerperSearchQueryParams, SerperWebSearchResponse } from '../../shared/3rd-party/serper-search';
|
| 7 |
+
import { BlackHoleDetector } from '../blackhole-detector';
|
| 8 |
+
import { Context } from '../registry';
|
| 9 |
+
import { AsyncService } from 'civkit/async-service';
|
| 10 |
+
import { AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit/civ-rpc';
|
| 11 |
+
|
| 12 |
+
/**
 * Thin service wrapper around the Serper Google HTTP client.
 *
 * Exposes web, image and news search and reports every successful call to
 * the black hole detector.
 */
@singleton()
export class SerperGoogleSearchService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    client!: SerperGoogleHTTP;

    constructor(
        protected globalLogger: GlobalLogger,
        protected secretExposer: SecretExposer,
        protected threadLocal: AsyncLocalContext,
        protected blackHoleDetector: BlackHoleDetector,
    ) {
        super(...arguments);
    }

    /**
     * Waits for dependencies, builds the HTTP client and only then announces
     * readiness, so 'ready' listeners never observe an unset `client`.
     */
    override async init() {
        await this.dependencyReady();

        this.client = new SerperGoogleHTTP(this.secretExposer.SERPER_SEARCH_API_KEY);

        this.emit('ready');
    }


    doSearch(variant: 'web', query: SerperSearchQueryParams): Promise<SerperWebSearchResponse['organic']>;
    doSearch(variant: 'images', query: SerperSearchQueryParams): Promise<SerperImageSearchResponse['images']>;
    doSearch(variant: 'news', query: SerperSearchQueryParams): Promise<SerperNewsSearchResponse['news']>;
    /**
     * Runs one search against the client.
     *
     * @param variant Which endpoint to query; anything other than 'images'
     *                or 'news' falls back to web search.
     * @param query   Query parameters forwarded unchanged to the client.
     * @returns The result list of the chosen variant (`images`, `news` or
     *          `organic`) from the parsed response.
     */
    async doSearch(variant: 'web' | 'images' | 'news', query: SerperSearchQueryParams) {
        this.logger.debug(`Doing external search`, query);
        let results;
        switch (variant) {
            case 'images': {
                const r = await this.client.imageSearch(query);

                results = r.parsed.images;
                break;
            }
            case 'news': {
                const r = await this.client.newsSearch(query);

                results = r.parsed.news;
                break;
            }
            case 'web':
            default: {
                const r = await this.client.webSearch(query);

                results = r.parsed.organic;
                break;
            }
        }

        // Only reached when the request did not throw.
        this.blackHoleDetector.itWorked();

        return results;
    }


    /** Web search shortcut for `doSearch('web', query)`. */
    async webSearch(query: SerperSearchQueryParams) {
        return this.doSearch('web', query);
    }
    /** Image search shortcut for `doSearch('images', query)`. */
    async imageSearch(query: SerperSearchQueryParams) {
        return this.doSearch('images', query);
    }
    /** News search shortcut for `doSearch('news', query)`. */
    async newsSearch(query: SerperSearchQueryParams) {
        return this.doSearch('news', query);
    }

}
|
| 81 |
+
|
| 82 |
+
/**
 * Same search surface as SerperGoogleSearchService, backed by the Serper
 * Bing HTTP client.
 */
@singleton()
export class SerperBingSearchService extends SerperGoogleSearchService {
    override client!: SerperBingHTTP;

    /**
     * Waits for dependencies, builds the Bing client and only then announces
     * readiness, so 'ready' listeners never observe an unset `client`.
     */
    override async init() {
        await this.dependencyReady();

        this.client = new SerperBingHTTP(this.secretExposer.SERPER_SEARCH_API_KEY);

        this.emit('ready');
    }
}
|
| 93 |
+
|
| 94 |
+
/**
 * Optional search operators (`ext:`, `filetype:`, `intitle:`, `loc:`,
 * `site:`) that can be appended to a search term.
 *
 * Values come from the cast input and can be overridden per request through
 * `x-<name>` / `<name>` values read from the RPC call environment.
 */
export class GoogleSearchExplicitOperatorsDto extends AutoCastable {
    @Prop({
        arrayOf: String,
        desc: `Returns web pages with a specific file extension. Example: to find the Honda GX120 Owner’s manual in PDF, type “Honda GX120 owners manual ext:pdf”.`
    })
    ext?: string | string[];

    @Prop({
        arrayOf: String,
        desc: `Returns web pages created in the specified file type. Example: to find a web page created in PDF format about the evaluation of age-related cognitive changes, type “evaluation of age cognitive changes filetype:pdf”.`
    })
    filetype?: string | string[];

    @Prop({
        arrayOf: String,
        desc: `Returns webpages containing the specified term in the title of the page. Example: to find pages about SEO conferences making sure the results contain 2023 in the title, type “seo conference intitle:2023”.`
    })
    intitle?: string | string[];

    @Prop({
        arrayOf: String,
        desc: `Returns web pages from a specified country or region. The country code must be in the ISO 3166-1 alpha-2 format. Example: to find web pages from Canada about Niagara Falls, type “niagara falls loc:ca”.`
    })
    loc?: string | string[];

    @Prop({
        arrayOf: String,
        desc: `Returns web pages coming only from a specific web site. Example: to find information about Goggles only on Brave pages, type “goggles site:brave.com”.`
    })
    site?: string | string[];

    /**
     * Appends the configured operators to a search term.
     *
     * Multiple values of one operator are joined with ` OR `; when more than
     * one operator is set, each group is parenthesized and the groups are
     * joined with ` AND `.
     *
     * @param searchTerm The plain search term.
     * @returns The term followed by the operator expression, or the term
     *          unchanged when no operator is set.
     */
    addTo(searchTerm: string) {
        const chunks: string[] = [];
        for (const [key, value] of Object.entries(this)) {
            if (!value) {
                continue;
            }
            const values = Array.isArray(value) ? value : [value];
            const textValue = values.map((v) => `${key}:${v}`).join(' OR ');
            if (textValue) {
                chunks.push(textValue);
            }
        }

        if (!chunks.length) {
            return searchTerm;
        }

        // A single group needs no parentheses.
        const opPart = chunks.length > 1 ? chunks.map((x) => `(${x})`).join(' AND ') : chunks[0];

        return [searchTerm, opPart].join(' ');
    }

    /**
     * Casts the input and then applies per-request overrides.
     *
     * For each operator, a non-empty `x-<name>` (or `<name>`) value from the
     * call environment is split on `', '` and replaces the cast value.
     */
    static override from(input: any) {
        const instance = super.from(input) as GoogleSearchExplicitOperatorsDto;
        // Reflect.get throws on non-objects, so only look up the environment on real objects.
        const ctx = (input !== null && typeof input === 'object')
            ? Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined
            : undefined;

        const params = ['ext', 'filetype', 'intitle', 'loc', 'site'];

        for (const p of params) {
            const customValue = ctx?.get(`x-${p}`) || ctx?.get(`${p}`);
            if (!customValue) {
                continue;
            }

            const filtered = customValue.split(', ').filter(Boolean);
            if (filtered.length) {
                Reflect.set(instance, p, filtered);
            }
        }

        return instance;
    }
}
|
src/stand-alone/serp.ts
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import 'reflect-metadata';
|
| 2 |
+
import { container, singleton } from 'tsyringe';
|
| 3 |
+
|
| 4 |
+
import { KoaServer } from 'civkit/civ-rpc/koa';
|
| 5 |
+
import http2 from 'http2';
|
| 6 |
+
import http from 'http';
|
| 7 |
+
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
|
| 8 |
+
import path from 'path';
|
| 9 |
+
import fs from 'fs';
|
| 10 |
+
import { mimeOfExt } from 'civkit/mime';
|
| 11 |
+
import { Context, Next } from 'koa';
|
| 12 |
+
import { RPCRegistry } from '../services/registry';
|
| 13 |
+
import { AsyncResource } from 'async_hooks';
|
| 14 |
+
import { runOnce } from 'civkit/decorators';
|
| 15 |
+
import { randomUUID } from 'crypto';
|
| 16 |
+
import { ThreadedServiceRegistry } from '../services/threaded';
|
| 17 |
+
import { GlobalLogger } from '../services/logger';
|
| 18 |
+
import { AsyncLocalContext } from '../services/async-context';
|
| 19 |
+
import finalizer, { Finalizer } from '../services/finalizer';
|
| 20 |
+
import { SerpHost } from '../api/serp';
|
| 21 |
+
|
| 22 |
+
/**
 * Stand-alone Koa server for the SERP API.
 *
 * Serves static files found under the `public` directory and routes
 * everything else to the RPC registry. After `h2c()` the main server is a
 * cleartext HTTP/2 server and the original HTTP server is kept as an
 * alternative listening on `port + 1`.
 */
@singleton()
export class SERPStandAloneServer extends KoaServer {
    logger = this.globalLogger.child({ service: this.constructor.name });

    httpAlternativeServer?: typeof this['httpServer'];
    assets = new Map<string, WalkOutEntity>();

    constructor(
        protected globalLogger: GlobalLogger,
        protected registry: RPCRegistry,
        protected serpHost: SerpHost,
        protected threadLocal: AsyncLocalContext,
        protected threads: ThreadedServiceRegistry,
    ) {
        super(...arguments);
    }

    /**
     * Swaps the main server for an HTTP/2 server driving the same Koa app and
     * keeps the previous server as `httpAlternativeServer`.
     *
     * @returns This instance, for chaining.
     */
    h2c() {
        this.httpAlternativeServer = this.httpServer;
        const fn = this.koaApp.callback();
        this.httpServer = http2.createServer((req, res) => {
            // Run each request in its own async resource.
            const ar = new AsyncResource('HTTP2ServerRequest');
            ar.runInAsyncScope(fn, this.koaApp, req, res);
        });
        // useResourceBasedDefaultTracker();

        return this;
    }

    /**
     * Indexes static assets, waits for dependencies and removes every
     * registry entry tagged 'crawl' before running the base initialization.
     */
    override async init() {
        await this.walkForAssets();
        await this.dependencyReady();

        for (const [k, v] of this.registry.conf.entries()) {
            if (v.tags?.includes('crawl')) {
                this.registry.conf.delete(k);
            }
        }

        await super.init();
    }

    /** Records every regular file under `public`, keyed by its relative path. */
    async walkForAssets() {
        const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));

        for (const file of files) {
            if (file.type !== 'file') {
                continue;
            }
            this.assets.set(file.relativePath.toString(), file);
        }
    }

    /**
     * Starts listening on `port`; when an alternative server exists it is
     * started on `port + 1` as well.
     */
    override listen(port: number) {
        const r = super.listen(port);
        if (this.httpAlternativeServer) {
            const altPort = port + 1;
            this.httpAlternativeServer.listen(altPort, () => {
                this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
            });
        }

        return r;
    }

    /**
     * Builds the middleware that answers requests whose path (without the
     * leading slash) matches an indexed asset; everything else is passed on.
     */
    makeAssetsServingController() {
        return (ctx: Context, next: Next) => {
            const requestPath = ctx.path;
            const file = requestPath.slice(1);
            if (!file) {
                return next();
            }

            const asset = this.assets.get(file);
            if (asset?.type !== 'file') {
                return next();
            }

            ctx.body = fs.createReadStream(asset.path);
            ctx.type = mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream';
            ctx.set('Content-Length', asset.stats.size.toString());

            return;
        };
    }

    /** Installs the asset middleware first, then the RPC registry controller. */
    registerRoutes(): void {
        this.koaApp.use(this.makeAssetsServingController());
        this.koaApp.use(this.registry.makeShimController());
    }


    // Using h2c server has an implication that multiple requests may share the same connection and x-cloud-trace-context
    // TraceId is expected to be request-bound and unique. So these two has to be distinguished.
    @runOnce()
    override insertAsyncHookMiddleware() {
        const asyncHookMiddleware = async (ctx: Context, next: () => Promise<void>) => {
            // Keep only the part of the header before the first '/'.
            const googleTraceId = ctx.get('x-cloud-trace-context').split('/')?.[0];
            this.threadLocal.setup({
                traceId: randomUUID(),
                traceT0: new Date(),
                googleTraceId,
            });

            return next();
        };

        this.koaApp.use(asyncHookMiddleware);
    }

    /**
     * Shuts down the alternative server (when listening) and the main server
     * in parallel.
     *
     * `close()` is called exactly once on the alternative server: a second
     * call on a server that is already closing reports
     * `ERR_SERVER_NOT_RUNNING` to its callback, which would turn a clean
     * shutdown into a rejection.
     */
    @Finalizer()
    override async standDown() {
        const tasks: Promise<any>[] = [];
        const altServer = this.httpAlternativeServer;
        if (altServer?.listening) {
            (altServer as http.Server).closeIdleConnections?.();
            tasks.push(new Promise<void>((resolve, reject) => {
                altServer.close((err) => {
                    if (err) {
                        return reject(err);
                    }
                    resolve();
                });
            }));
        }
        tasks.push(super.standDown());
        await Promise.all(tasks);
    }

}
|
| 152 |
+
// Resolve the server from the container at module load.
const instance = container.resolve(SERPStandAloneServer);

export default instance;

// Dry runs only verify that the service can become ready, then terminate via the finalizer.
// Normal runs switch to h2c and listen on PORT (3000 when unset or not a number).
const whenReady = instance.serviceReady();
if (process.env.NODE_ENV?.includes('dry-run')) {
    whenReady.then(() => finalizer.terminate());
} else {
    whenReady.then((s) => {
        const port = parseInt(process.env.PORT || '') || 3000;

        return s.h2c().listen(port);
    });
}
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit ca09ea8fcbb84aeea4eb8015bf8e98eef1813048
|