nomagick committed on
Commit
12ba1bc
·
unverified ·
1 Parent(s): 919a81d

feat: serp endpoint (#1180)

Browse files

* wip

* wip

* fix

* wip

* fix: add jitter to user cache

* cd

* fix

* fix

* fix: user cache age comparison

* fix: try to partition apiroll query

* bump: deps

* wip

* cd

* feat: fallback for serp

* fix

* cd

* fix

* fix

* serp: stop hiding expense

* serp: enable fallback by default

.github/workflows/cd.yml CHANGED
@@ -75,9 +75,15 @@ jobs:
75
  - name: Deploy SEARCH with Tag
76
  run: |
77
  gcloud beta run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
 
 
 
78
  - name: Deploy CRAWL-EU with Tag
79
  run: |
80
  gcloud beta run deploy crawl-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
81
  - name: Deploy SEARCH-EU with Tag
82
  run: |
83
- gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
 
 
 
 
75
  - name: Deploy SEARCH with Tag
76
  run: |
77
  gcloud beta run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
78
+ - name: Deploy SERP with Tag
79
+ run: |
80
+ gcloud beta run deploy serp --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
81
  - name: Deploy CRAWL-EU with Tag
82
  run: |
83
  gcloud beta run deploy crawl-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
84
  - name: Deploy SEARCH-EU with Tag
85
  run: |
86
+ gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
87
+ - name: Deploy SERP-JP with Tag
88
+ run: |
89
+ gcloud beta run deploy serp-jp --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-northeast1 --async --min-instances 0 --deploy-health-check --use-http2
.vscode/launch.json CHANGED
@@ -102,5 +102,27 @@
102
  "preLaunchTask": "Backend:build:watch",
103
  "killBehavior": "forceful"
104
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  ]
106
  }
 
102
  "preLaunchTask": "Backend:build:watch",
103
  "killBehavior": "forceful"
104
  },
105
+ {
106
+ "name": "Debug Stand Alone SERP",
107
+ "request": "launch",
108
+ "runtimeArgs": [
109
+ "--env-file=.secret.local",
110
+ ],
111
+ "env": {
112
+ "GCLOUD_PROJECT": "reader-6b7dc",
113
+ "PREFERRED_PROXY_COUNTRY": "hk",
114
+ "OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk",
115
+ "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
116
+ },
117
+ "cwd": "${workspaceFolder}",
118
+ "program": "build/stand-alone/serp.js",
119
+ "skipFiles": [
120
+ "<node_internals>/**"
121
+ ],
122
+ "type": "node",
123
+ "outputCapture": "std",
124
+ "preLaunchTask": "Backend:build:watch",
125
+ "killBehavior": "forceful"
126
+ },
127
  ]
128
  }
src/api/crawler.ts CHANGED
@@ -48,6 +48,7 @@ import { RobotsTxtService } from '../services/robots-text';
48
  import { TempFileManager } from '../services/temp-file';
49
  import { MiscService } from '../services/misc';
50
  import { HTTPServiceError } from 'civkit';
 
51
 
52
  export interface ExtraScrappingOptions extends ScrappingOptions {
53
  withIframe?: boolean | 'quoted';
@@ -58,6 +59,7 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
58
  engine?: string;
59
  allocProxy?: string;
60
  private?: boolean;
 
61
  }
62
 
63
  const indexProto = {
@@ -94,6 +96,7 @@ export class CrawlerHost extends RPCHost {
94
  protected threadLocal: AsyncLocalContext,
95
  protected robotsTxtService: RobotsTxtService,
96
  protected tempFileManager: TempFileManager,
 
97
  protected miscService: MiscService,
98
  ) {
99
  super(...arguments);
@@ -511,15 +514,16 @@ export class CrawlerHost extends RPCHost {
511
  });
512
  }
513
 
514
- const result = await this.miscService.assertNormalizedUrl(url);
515
- if (this.puppeteerControl.circuitBreakerHosts.has(result.hostname.toLowerCase())) {
516
  throw new SecurityCompromiseError({
517
- message: `Circular hostname: ${result.protocol}`,
518
  path: 'url'
519
  });
520
  }
 
521
 
522
- return result;
523
  }
524
 
525
  getUrlDigest(urlToCrawl: URL) {
@@ -886,7 +890,11 @@ export class CrawlerHost extends RPCHost {
886
  }
887
  }
888
  } else if (crawlOpts?.allocProxy && crawlOpts.allocProxy !== 'none' && !crawlOpts.proxyUrl) {
889
- crawlOpts.proxyUrl = (await this.proxyProvider.alloc(crawlOpts.allocProxy)).href;
 
 
 
 
890
  }
891
 
892
  try {
@@ -1030,6 +1038,7 @@ export class CrawlerHost extends RPCHost {
1030
  proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
1031
  private: Boolean(opts.doNotTrack),
1032
  };
 
1033
  if (crawlOpts.targetSelector?.length) {
1034
  if (typeof crawlOpts.targetSelector === 'string') {
1035
  crawlOpts.targetSelector = [crawlOpts.targetSelector];
@@ -1046,6 +1055,18 @@ export class CrawlerHost extends RPCHost {
1046
  }
1047
  }
1048
 
 
 
 
 
 
 
 
 
 
 
 
 
1049
  if (opts.locale) {
1050
  crawlOpts.extraHeaders ??= {};
1051
  crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
@@ -1221,6 +1242,7 @@ export class CrawlerHost extends RPCHost {
1221
  };
1222
  }
1223
 
 
1224
  @retryWith((err) => {
1225
  if (err instanceof ServiceBadApproachError) {
1226
  return false;
@@ -1239,7 +1261,14 @@ export class CrawlerHost extends RPCHost {
1239
  if (opts?.allocProxy === 'none') {
1240
  return this.curlControl.sideLoad(url, opts);
1241
  }
1242
- const proxy = await this.proxyProvider.alloc(opts?.allocProxy);
 
 
 
 
 
 
 
1243
  const r = await this.curlControl.sideLoad(url, {
1244
  ...opts,
1245
  proxyUrl: proxy.href,
@@ -1252,6 +1281,34 @@ export class CrawlerHost extends RPCHost {
1252
  return { ...r, proxy };
1253
  }
1254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1255
  knownUrlThatSideLoadingWouldCrashTheBrowser(url: URL) {
1256
  if (url.hostname === 'chromewebstore.google.com') {
1257
  return true;
 
48
  import { TempFileManager } from '../services/temp-file';
49
  import { MiscService } from '../services/misc';
50
  import { HTTPServiceError } from 'civkit';
51
+ import { GeoIPService } from '../services/geoip';
52
 
53
  export interface ExtraScrappingOptions extends ScrappingOptions {
54
  withIframe?: boolean | 'quoted';
 
59
  engine?: string;
60
  allocProxy?: string;
61
  private?: boolean;
62
+ countryHint?: string;
63
  }
64
 
65
  const indexProto = {
 
96
  protected threadLocal: AsyncLocalContext,
97
  protected robotsTxtService: RobotsTxtService,
98
  protected tempFileManager: TempFileManager,
99
+ protected geoIpService: GeoIPService,
100
  protected miscService: MiscService,
101
  ) {
102
  super(...arguments);
 
514
  });
515
  }
516
 
517
+ const { url: safeURL, ips } = await this.miscService.assertNormalizedUrl(url);
518
+ if (this.puppeteerControl.circuitBreakerHosts.has(safeURL.hostname.toLowerCase())) {
519
  throw new SecurityCompromiseError({
520
+ message: `Circular hostname: ${safeURL.protocol}`,
521
  path: 'url'
522
  });
523
  }
524
+ crawlerOptions._hintIps = ips;
525
 
526
+ return safeURL;
527
  }
528
 
529
  getUrlDigest(urlToCrawl: URL) {
 
890
  }
891
  }
892
  } else if (crawlOpts?.allocProxy && crawlOpts.allocProxy !== 'none' && !crawlOpts.proxyUrl) {
893
+ const proxyUrl = await this.proxyProvider.alloc(this.figureOutBestProxyCountry(crawlOpts));
894
+ if (proxyUrl.protocol === 'socks5h:') {
895
+ proxyUrl.protocol = 'socks5:';
896
+ }
897
+ crawlOpts.proxyUrl = proxyUrl.href;
898
  }
899
 
900
  try {
 
1038
  proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
1039
  private: Boolean(opts.doNotTrack),
1040
  };
1041
+
1042
  if (crawlOpts.targetSelector?.length) {
1043
  if (typeof crawlOpts.targetSelector === 'string') {
1044
  crawlOpts.targetSelector = [crawlOpts.targetSelector];
 
1055
  }
1056
  }
1057
 
1058
+ if (opts._hintIps?.length) {
1059
+ const hints = await this.geoIpService.lookupCities(opts._hintIps);
1060
+ const board: Record<string, number> = {};
1061
+ for (const x of hints) {
1062
+ if (x.country?.code) {
1063
+ board[x.country.code] = (board[x.country.code] || 0) + 1;
1064
+ }
1065
+ }
1066
+ const hintCountry = _.maxBy(Array.from(Object.entries(board)), 1)?.[0];
1067
+ crawlOpts.countryHint = hintCountry?.toLowerCase();
1068
+ }
1069
+
1070
  if (opts.locale) {
1071
  crawlOpts.extraHeaders ??= {};
1072
  crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
 
1242
  };
1243
  }
1244
 
1245
+ retryDet = new WeakSet<ExtraScrappingOptions>();
1246
  @retryWith((err) => {
1247
  if (err instanceof ServiceBadApproachError) {
1248
  return false;
 
1261
  if (opts?.allocProxy === 'none') {
1262
  return this.curlControl.sideLoad(url, opts);
1263
  }
1264
+
1265
+ const proxy = await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts));
1266
+ if (opts) {
1267
+ if (this.retryDet.has(opts) && proxy.protocol === 'socks5h:') {
1268
+ proxy.protocol = 'socks5:';
1269
+ }
1270
+ this.retryDet.add(opts);
1271
+ }
1272
  const r = await this.curlControl.sideLoad(url, {
1273
  ...opts,
1274
  proxyUrl: proxy.href,
 
1281
  return { ...r, proxy };
1282
  }
1283
 
1284
+ protected figureOutBestProxyCountry(opts?: ExtraScrappingOptions) {
1285
+ if (!opts) {
1286
+ return 'auto';
1287
+ }
1288
+
1289
+ let draft;
1290
+
1291
+ if (opts.allocProxy) {
1292
+ if (this.proxyProvider.supports(opts.allocProxy)) {
1293
+ draft = opts.allocProxy;
1294
+ } else if (opts.allocProxy === 'none') {
1295
+ return 'none';
1296
+ }
1297
+ }
1298
+
1299
+ if (opts.countryHint) {
1300
+ if (this.proxyProvider.supports(opts.countryHint)) {
1301
+ draft ??= opts.countryHint;
1302
+ } else if (opts.countryHint === 'cn') {
1303
+ draft ??= 'hk';
1304
+ }
1305
+ }
1306
+
1307
+ draft ??= opts.allocProxy || 'auto';
1308
+
1309
+ return draft;
1310
+ }
1311
+
1312
  knownUrlThatSideLoadingWouldCrashTheBrowser(url: URL) {
1313
  if (url.hostname === 'chromewebstore.google.com') {
1314
  return true;
src/api/searcher.ts DELETED
@@ -1,503 +0,0 @@
1
- import { singleton } from 'tsyringe';
2
- import _ from 'lodash';
3
-
4
- import {
5
- assignTransferProtocolMeta, RPCHost, RPCReflection,
6
- AssertionFailureError,
7
- RawString,
8
- } from 'civkit/civ-rpc';
9
- import { marshalErrorLike } from 'civkit/lang';
10
- import { objHashMd5B64Of } from 'civkit/hash';
11
-
12
- import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
13
- import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
14
- import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
15
-
16
- import { CrawlerHost, ExtraScrappingOptions } from './crawler';
17
- import { SearchResult } from '../db/searched';
18
- import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
19
- import { CrawlerOptions } from '../dto/crawler-options';
20
- import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
21
-
22
- import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
23
- import { GlobalLogger } from '../services/logger';
24
- import { AsyncLocalContext } from '../services/async-context';
25
- import { OutputServerEventStream } from '../lib/transform-server-event-stream';
26
- import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
27
- import { InsufficientBalanceError } from '../services/errors';
28
-
29
-
30
- @singleton()
31
- export class SearcherHost extends RPCHost {
32
- logger = this.globalLogger.child({ service: this.constructor.name });
33
-
34
- cacheRetentionMs = 1000 * 3600 * 24 * 7;
35
- cacheValidMs = 1000 * 3600;
36
- pageCacheToleranceMs = 1000 * 3600 * 24;
37
-
38
- reasonableDelayMs = 15_000;
39
-
40
- targetResultCount = 5;
41
-
42
- constructor(
43
- protected globalLogger: GlobalLogger,
44
- protected rateLimitControl: RateLimitControl,
45
- protected threadLocal: AsyncLocalContext,
46
- protected braveSearchService: BraveSearchService,
47
- protected crawler: CrawlerHost,
48
- protected snapshotFormatter: SnapshotFormatter,
49
- ) {
50
- super(...arguments);
51
- }
52
-
53
- override async init() {
54
- await this.dependencyReady();
55
-
56
- this.emit('ready');
57
- }
58
-
59
- @Method({
60
- name: 'searchIndex',
61
- ext: {
62
- http: {
63
- action: ['get', 'post'],
64
- path: '/search'
65
- }
66
- },
67
- tags: ['search'],
68
- returnType: [String, OutputServerEventStream],
69
- })
70
- @Method({
71
- ext: {
72
- http: {
73
- action: ['get', 'post'],
74
- path: '::q'
75
- }
76
- },
77
- tags: ['search'],
78
- returnType: [String, OutputServerEventStream, RawString],
79
- })
80
- async search(
81
- @RPCReflect() rpcReflect: RPCReflection,
82
- @Ctx() ctx: Context,
83
- auth: JinaEmbeddingsAuthDTO,
84
- @Param('count', { default: 5, validate: (v) => v >= 0 && v <= 10 })
85
- count: number,
86
- crawlerOptions: CrawlerOptions,
87
- braveSearchExplicitOperators: BraveSearchExplicitOperatorsDto,
88
- @Param('q') q?: string,
89
- ) {
90
- const uid = await auth.solveUID();
91
- let chargeAmount = 0;
92
- const noSlashPath = decodeURIComponent(ctx.path).slice(1);
93
- if (!noSlashPath && !q) {
94
- const index = await this.crawler.getIndex(auth);
95
- if (!uid) {
96
- index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
97
- }
98
- if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
99
-
100
- return index;
101
- }
102
-
103
- return assignTransferProtocolMeta(`${index}`,
104
- { contentType: 'text/plain', envelope: null }
105
- );
106
- }
107
-
108
- const user = await auth.assertUser();
109
- if (!(user.wallet.total_balance > 0)) {
110
- throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
111
- }
112
-
113
- const rateLimitPolicy = auth.getRateLimits(rpcReflect.name.toUpperCase()) || [
114
- parseInt(user.metadata?.speed_level) >= 2 ?
115
- RateLimitDesc.from({
116
- occurrence: 100,
117
- periodSeconds: 60
118
- }) :
119
- RateLimitDesc.from({
120
- occurrence: 40,
121
- periodSeconds: 60
122
- })
123
- ];
124
-
125
- const apiRoll = await this.rateLimitControl.simpleRPCUidBasedLimit(
126
- rpcReflect, uid!, [rpcReflect.name.toUpperCase()],
127
- ...rateLimitPolicy
128
- );
129
-
130
- rpcReflect.finally(() => {
131
- if (chargeAmount) {
132
- auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
133
- this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
134
- });
135
- apiRoll.chargeAmount = chargeAmount;
136
- }
137
- });
138
-
139
- delete crawlerOptions.html;
140
-
141
- const crawlOpts = await this.crawler.configure(crawlerOptions);
142
- const searchQuery = braveSearchExplicitOperators.addTo(q || noSlashPath);
143
- const r = await this.cachedWebSearch({
144
- q: searchQuery,
145
- count: count ? Math.floor(count + 2) : 20
146
- }, crawlerOptions.noCache);
147
-
148
- if (!r.web?.results.length) {
149
- throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
150
- }
151
-
152
- if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
153
- delete crawlOpts.timeoutMs;
154
- }
155
-
156
- const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results.slice(0, count + 2), crawlOpts,
157
- CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
158
- count,
159
- );
160
-
161
- if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
162
- const sseStream = new OutputServerEventStream();
163
- rpcReflect.return(sseStream);
164
-
165
- try {
166
- for await (const scrapped of it) {
167
- if (!scrapped) {
168
- continue;
169
- }
170
-
171
- chargeAmount = this.assignChargeAmount(scrapped);
172
- sseStream.write({
173
- event: 'data',
174
- data: scrapped,
175
- });
176
- }
177
- } catch (err: any) {
178
- this.logger.error(`Failed to collect search result for query ${searchQuery}`,
179
- { err: marshalErrorLike(err) }
180
- );
181
- sseStream.write({
182
- event: 'error',
183
- data: marshalErrorLike(err),
184
- });
185
- }
186
-
187
- sseStream.end();
188
-
189
- return sseStream;
190
- }
191
-
192
- let lastScrapped: any[] | undefined;
193
- let earlyReturn = false;
194
- if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
195
- let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
196
- const setEarlyReturnTimer = () => {
197
- if (earlyReturnTimer) {
198
- return;
199
- }
200
- earlyReturnTimer = setTimeout(() => {
201
- if (!lastScrapped) {
202
- return;
203
- }
204
- chargeAmount = this.assignChargeAmount(lastScrapped);
205
- rpcReflect.return(lastScrapped);
206
- earlyReturn = true;
207
- }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
208
- };
209
-
210
- for await (const scrapped of it) {
211
- lastScrapped = scrapped;
212
- if (_.some(scrapped, (x) => this.pageQualified(x))) {
213
- setEarlyReturnTimer();
214
- }
215
- if (!this.searchResultsQualified(scrapped, count)) {
216
- continue;
217
- }
218
- if (earlyReturnTimer) {
219
- clearTimeout(earlyReturnTimer);
220
- }
221
- chargeAmount = this.assignChargeAmount(scrapped);
222
-
223
- return scrapped;
224
- }
225
-
226
- if (earlyReturnTimer) {
227
- clearTimeout(earlyReturnTimer);
228
- }
229
-
230
- if (!lastScrapped) {
231
- throw new AssertionFailureError(`No content available for query ${searchQuery}`);
232
- }
233
-
234
- if (!earlyReturn) {
235
- chargeAmount = this.assignChargeAmount(lastScrapped);
236
- }
237
-
238
- return lastScrapped;
239
- }
240
-
241
- let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
242
- const setEarlyReturnTimer = () => {
243
- if (earlyReturnTimer) {
244
- return;
245
- }
246
- earlyReturnTimer = setTimeout(() => {
247
- if (!lastScrapped) {
248
- return;
249
- }
250
- chargeAmount = this.assignChargeAmount(lastScrapped);
251
- rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
252
- earlyReturn = true;
253
- }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
254
- };
255
-
256
- for await (const scrapped of it) {
257
- lastScrapped = scrapped;
258
-
259
- if (_.some(scrapped, (x) => this.pageQualified(x))) {
260
- setEarlyReturnTimer();
261
- }
262
-
263
- if (!this.searchResultsQualified(scrapped, count)) {
264
- continue;
265
- }
266
-
267
- if (earlyReturnTimer) {
268
- clearTimeout(earlyReturnTimer);
269
- }
270
-
271
- chargeAmount = this.assignChargeAmount(scrapped);
272
-
273
- return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
274
- }
275
-
276
- if (earlyReturnTimer) {
277
- clearTimeout(earlyReturnTimer);
278
- }
279
-
280
- if (!lastScrapped) {
281
- throw new AssertionFailureError(`No content available for query ${searchQuery}`);
282
- }
283
-
284
- if (!earlyReturn) {
285
- chargeAmount = this.assignChargeAmount(lastScrapped);
286
- }
287
-
288
- return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
289
- }
290
-
291
- async *fetchSearchResults(
292
- mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
293
- searchResults?: WebSearchResult[],
294
- options?: ExtraScrappingOptions,
295
- crawlerOptions?: CrawlerOptions,
296
- count?: number,
297
- ) {
298
- if (!searchResults) {
299
- return;
300
- }
301
- if (count === 0) {
302
- const resultArray = searchResults.map((upstreamSearchResult, i) => ({
303
- url: upstreamSearchResult.url,
304
- title: upstreamSearchResult.title,
305
- description: upstreamSearchResult.description,
306
- content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : '',
307
- toString() {
308
- return `[${i + 1}] Title: ${this.title}
309
- [${i + 1}] URL Source: ${this.url}
310
- [${i + 1}] Description: ${this.description}
311
- `;
312
- }
313
-
314
- })) as FormattedPage[];
315
- resultArray.toString = function () {
316
- return this.map((x, i) => x ? x.toString() : '').join('\n\n').trimEnd() + '\n';
317
- };
318
- yield resultArray;
319
- return;
320
- }
321
- const urls = searchResults.map((x) => new URL(x.url));
322
- const snapshotMap = new WeakMap();
323
- for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
324
- const mapped = scrapped.map((x, i) => {
325
- const upstreamSearchResult = searchResults[i];
326
- if (!x) {
327
- return {
328
- url: upstreamSearchResult.url,
329
- title: upstreamSearchResult.title,
330
- description: upstreamSearchResult.description,
331
- content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''
332
- };
333
- }
334
- if (snapshotMap.has(x)) {
335
- return snapshotMap.get(x);
336
- }
337
- return this.snapshotFormatter.formatSnapshot(mode, x, urls[i]).then((r) => {
338
- r.title ??= upstreamSearchResult.title;
339
- r.description = upstreamSearchResult.description;
340
- snapshotMap.set(x, r);
341
-
342
- return r;
343
- }).catch((err) => {
344
- this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
345
-
346
- return {
347
- url: upstreamSearchResult.url,
348
- title: upstreamSearchResult.title,
349
- description: upstreamSearchResult.description,
350
- content: x.text,
351
- };
352
- });
353
- });
354
-
355
- const resultArray = await Promise.all(mapped) as FormattedPage[];
356
-
357
- yield this.reOrganizeSearchResults(resultArray, count);
358
- }
359
- }
360
-
361
- reOrganizeSearchResults(searchResults: FormattedPage[], count?: number) {
362
- const targetResultCount = count || this.targetResultCount;
363
- const [qualifiedPages, unqualifiedPages] = _.partition(searchResults, (x) => this.pageQualified(x));
364
- const acceptSet = new Set(qualifiedPages);
365
-
366
- const n = targetResultCount - qualifiedPages.length;
367
- for (const x of unqualifiedPages.slice(0, n >= 0 ? n : 0)) {
368
- acceptSet.add(x);
369
- }
370
-
371
- const filtered = searchResults.filter((x) => acceptSet.has(x)).slice(0, targetResultCount);
372
-
373
- const resultArray = filtered.map((x, i) => {
374
- return {
375
- ...x,
376
- toString(this: any) {
377
- if (!this.content && this.description) {
378
- if (this.title || x.textRepresentation) {
379
- const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
380
- return `[${i + 1}] Title: ${this.title}
381
- [${i + 1}] URL Source: ${this.url}
382
- [${i + 1}] Description: ${this.description}${textRep}
383
- `;
384
- }
385
-
386
- return `[${i + 1}] No content available for ${this.url}`;
387
- }
388
-
389
- const mixins = [];
390
- if (this.description) {
391
- mixins.push(`[${i + 1}] Description: ${this.description}`);
392
- }
393
- if (this.publishedTime) {
394
- mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
395
- }
396
-
397
- const suffixMixins = [];
398
- if (this.images) {
399
- const imageSummaryChunks = [`[${i + 1}] Images:`];
400
- for (const [k, v] of Object.entries(this.images)) {
401
- imageSummaryChunks.push(`- ![${k}](${v})`);
402
- }
403
- if (imageSummaryChunks.length === 1) {
404
- imageSummaryChunks.push('This page does not seem to contain any images.');
405
- }
406
- suffixMixins.push(imageSummaryChunks.join('\n'));
407
- }
408
- if (this.links) {
409
- const linkSummaryChunks = [`[${i + 1}] Links/Buttons:`];
410
- for (const [k, v] of Object.entries(this.links)) {
411
- linkSummaryChunks.push(`- [${k}](${v})`);
412
- }
413
- if (linkSummaryChunks.length === 1) {
414
- linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
415
- }
416
- suffixMixins.push(linkSummaryChunks.join('\n'));
417
- }
418
-
419
- return `[${i + 1}] Title: ${this.title}
420
- [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
421
- [${i + 1}] Markdown Content:
422
- ${this.content}
423
- ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
424
- }
425
- };
426
- });
427
-
428
- resultArray.toString = function () {
429
- return this.map((x, i) => x ? x.toString() : `[${i + 1}] No content available for ${this[i].url}`).join('\n\n').trimEnd() + '\n';
430
- };
431
-
432
- return resultArray;
433
- }
434
-
435
- assignChargeAmount(formatted: FormattedPage[]) {
436
- return _.sum(
437
- formatted.map((x) => this.crawler.assignChargeAmount(x) || 0)
438
- );
439
- }
440
-
441
- pageQualified(formattedPage: FormattedPage) {
442
- return formattedPage.title &&
443
- formattedPage.content ||
444
- formattedPage.screenshotUrl ||
445
- formattedPage.pageshotUrl ||
446
- formattedPage.text ||
447
- formattedPage.html;
448
- }
449
-
450
- searchResultsQualified(results: FormattedPage[], targetResultCount = this.targetResultCount) {
451
- return _.every(results, (x) => this.pageQualified(x)) && results.length >= targetResultCount;
452
- }
453
-
454
- async cachedWebSearch(query: WebSearchQueryParams, noCache: boolean = false) {
455
- const queryDigest = objHashMd5B64Of(query);
456
- let cache;
457
- if (!noCache) {
458
- cache = (await SearchResult.fromFirestoreQuery(
459
- SearchResult.COLLECTION.where('queryDigest', '==', queryDigest)
460
- .orderBy('createdAt', 'desc')
461
- .limit(1)
462
- ))[0];
463
- if (cache) {
464
- const age = Date.now() - cache.createdAt.valueOf();
465
- const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
466
- this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${query.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
467
- query, digest: queryDigest, age, stale
468
- });
469
-
470
- if (!stale) {
471
- return cache.response as WebSearchApiResponse;
472
- }
473
- }
474
- }
475
-
476
- try {
477
- const r = await this.braveSearchService.webSearch(query);
478
-
479
- const nowDate = new Date();
480
- const record = SearchResult.from({
481
- query,
482
- queryDigest,
483
- response: r,
484
- createdAt: nowDate,
485
- expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
486
- });
487
- SearchResult.save(record.degradeForFireStore()).catch((err) => {
488
- this.logger.warn(`Failed to cache search result`, { err });
489
- });
490
-
491
- return r;
492
- } catch (err: any) {
493
- if (cache) {
494
- this.logger.warn(`Failed to fetch search result, but a stale cache is available. falling back to stale cache`, { err: marshalErrorLike(err) });
495
-
496
- return cache.response as WebSearchApiResponse;
497
- }
498
-
499
- throw err;
500
- }
501
-
502
- }
503
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/api/serp.ts ADDED
@@ -0,0 +1,505 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { singleton } from 'tsyringe';
2
+ import {
3
+ RPCHost, RPCReflection, assignMeta, RawString,
4
+ ParamValidationError,
5
+ assignTransferProtocolMeta,
6
+ } from 'civkit/civ-rpc';
7
+ import { marshalErrorLike } from 'civkit/lang';
8
+ import _ from 'lodash';
9
+
10
+ import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
11
+
12
+ import { GlobalLogger } from '../services/logger';
13
+ import { AsyncLocalContext } from '../services/async-context';
14
+ import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
15
+ import { OutputServerEventStream } from '../lib/transform-server-event-stream';
16
+ import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
17
+ import { InsufficientBalanceError, RateLimitTriggeredError } from '../services/errors';
18
+ import { WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
19
+ import { GoogleSERP } from '../services/serp/google';
20
+ import { WebSearchEntry } from '../services/serp/compat';
21
+ import { CrawlerOptions } from '../dto/crawler-options';
22
+ import { ScrappingOptions } from '../services/serp/puppeteer';
23
+ import { objHashMd5B64Of } from 'civkit/hash';
24
+ import { SERPResult } from '../db/searched';
25
+ import { SerperBingSearchService, SerperGoogleSearchService } from '../services/serp/serper';
26
+ import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
27
+ import { LRUCache } from 'lru-cache';
28
+
29
+ const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
30
+
31
+ type RateLimitCache = {
32
+ blockedUntil?: Date;
33
+ user?: JinaEmbeddingsTokenAccount;
34
+ };
35
+
36
+ const indexProto = {
37
+ toString: function (): string {
38
+ return _(this)
39
+ .toPairs()
40
+ .map(([k, v]) => k ? `[${_.upperFirst(_.lowerCase(k))}] ${v}` : '')
41
+ .value()
42
+ .join('\n') + '\n';
43
+ }
44
+ };
45
+
46
+ @singleton()
47
+ export class SerpHost extends RPCHost {
48
+ logger = this.globalLogger.child({ service: this.constructor.name });
49
+
50
+ cacheRetentionMs = 1000 * 3600 * 24 * 7;
51
+ cacheValidMs = 1000 * 3600;
52
+ pageCacheToleranceMs = 1000 * 3600 * 24;
53
+
54
+ reasonableDelayMs = 15_000;
55
+
56
+ targetResultCount = 5;
57
+
58
+ highFreqKeyCache = new LRUCache<string, RateLimitCache>({
59
+ max: 256,
60
+ ttl: 60 * 60 * 1000,
61
+ updateAgeOnGet: false,
62
+ updateAgeOnHas: false,
63
+ });
64
+
65
/**
 * Builds the index/help object served when no query is supplied on `/`.
 * The object inherits `toString` from `indexProto`, so it can be rendered as plain text.
 *
 * @param ctx request context; `ctx.origin` is used for the self-hosted usage hint.
 * @param auth optional auth DTO; when it carries a user, identity and balance are listed.
 * @returns the populated index object.
 */
async getIndex(ctx: Context, auth?: JinaEmbeddingsAuthDTO) {
    const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
    indexObject.usage1 = 'https://r.jina.ai/YOUR_URL';
    indexObject.usage2 = 'https://s.jina.ai/YOUR_SEARCH_QUERY';
    indexObject.usage3 = `${ctx.origin}/search/YOUR_SEARCH_QUERY`;
    indexObject.homepage = 'https://jina.ai/reader';
    indexObject.sourceCode = 'https://github.com/jina-ai/reader';

    const user = auth?.user;
    if (!user) {
        indexObject.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';

        return indexObject;
    }

    // The empty key is rendered by indexProto.toString as a blank line.
    indexObject[''] = undefined;
    indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
    indexObject.balanceLeft = user.wallet.total_balance;

    return indexObject;
}
85
+
86
// Dependencies are declared as protected parameter properties; the full argument list is
// forwarded to the RPCHost base constructor.
+ constructor(
87
+ protected globalLogger: GlobalLogger,
88
+ protected rateLimitControl: RateLimitControl,
89
+ protected threadLocal: AsyncLocalContext,
90
+ protected googleSerp: GoogleSERP,
91
+ protected serperGoogle: SerperGoogleSearchService,
92
+ protected serperBing: SerperBingSearchService,
93
+ ) {
94
+ super(...arguments);
95
+ }
96
+
97
// Waits for `dependencyReady()` to settle, then emits `ready`.
+ override async init() {
98
+ await this.dependencyReady();
99
+
100
+ this.emit('ready');
101
+ }
102
+
103
/**
 * SERP endpoint. Without `q` on path `/` it serves the index page; otherwise it runs a
 * web / images / news search and returns the mapped result list with usage metadata attached.
 *
 * Flow: resolve the caller (optionally from the in-memory high-frequency key cache), check the
 * wallet balance and rate limits, run the cached search, optionally retry with a shortened
 * query when nothing was found on the first page, then assign charge and query metadata.
 *
 * @throws ParamValidationError when `q` is missing on any path other than `/`.
 * @throws InsufficientBalanceError when the caller's wallet balance is not positive.
 * @throws RateLimitTriggeredError when a cached high-frequency key is currently blocked.
 */
@Method({
    name: 'searchIndex',
    ext: {
        http: {
            action: ['get', 'post'],
            path: '/'
        }
    },
    tags: ['search'],
    returnType: [String, OutputServerEventStream, RawString],
})
@Method({
    ext: {
        http: {
            action: ['get', 'post'],
        }
    },
    tags: ['search'],
    returnType: [String, OutputServerEventStream, RawString],
})
async search(
    @RPCReflect() rpcReflect: RPCReflection,
    @Ctx() ctx: Context,
    crawlerOptions: CrawlerOptions,
    auth: JinaEmbeddingsAuthDTO,
    @Param('type', { type: new Set(['web', 'images', 'news']), default: 'web' })
    variant: 'web' | 'images' | 'news',
    @Param('q') q?: string,
    @Param('provider', { type: new Set(['google', 'bing']) })
    searchEngine?: 'google' | 'bing',
    @Param('num', { validate: (v: number) => v >= 0 && v <= 20 })
    num?: number,
    @Param('gl', { validate: (v: string) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) }) gl?: string,
    @Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) hl?: string,
    @Param('location') location?: string,
    @Param('page') page?: number,
    @Param('fallback', { default: true }) fallback?: boolean,
) {
    const authToken = auth.bearerToken;
    let highFreqKey: RateLimitCache | undefined;
    if (authToken && this.highFreqKeyCache.has(authToken)) {
        // Known high-frequency key: reuse the cached account on the auth DTO.
        highFreqKey = this.highFreqKeyCache.get(authToken)!;
        auth.user = highFreqKey.user;
        auth.uid = highFreqKey.user?.user_id;
    }

    const uid = await auth.solveUID();
    if (!q) {
        if (ctx.path === '/') {
            // getIndex is async; without the await the text rendering below would be "[object Promise]".
            const indexObject = await this.getIndex(ctx, auth);
            if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
                return indexObject;
            }

            return assignTransferProtocolMeta(`${indexObject}`,
                { contentType: 'text/plain; charset=utf-8', envelope: null }
            );
        }
        throw new ParamValidationError({
            path: 'q',
            message: `Required but not provided`
        });
    }
    // Return content by default
    const user = await auth.assertUser();
    if (!(user.wallet.total_balance > 0)) {
        throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
    }

    if (highFreqKey?.blockedUntil) {
        const now = new Date();
        const blockedTimeRemaining = (highFreqKey.blockedUntil.valueOf() - now.valueOf());
        if (blockedTimeRemaining > 0) {
            throw RateLimitTriggeredError.from({
                message: `Per UID rate limit exceeded (async)`,
                retryAfter: Math.ceil(blockedTimeRemaining / 1000),
            });
        }
    }

    const PREMIUM_KEY_LIMIT = 400;
    const rateLimitPolicy = auth.getRateLimits('SEARCH') || [
        parseInt(user.metadata?.speed_level) >= 2 ?
            RateLimitDesc.from({
                occurrence: PREMIUM_KEY_LIMIT,
                periodSeconds: 60
            }) :
            RateLimitDesc.from({
                occurrence: 40,
                periodSeconds: 60
            })
    ];

    const apiRollPromise = this.rateLimitControl.simpleRPCUidBasedLimit(
        rpcReflect, uid!, ['SEARCH'],
        ...rateLimitPolicy
    );

    if (!highFreqKey) {
        // Normal path: the rate limit check is awaited before doing any work.
        await apiRollPromise;

        // Keys whose policy allows at least PREMIUM_KEY_LIMIT requests per minute are
        // remembered so that later requests take the asynchronous path below.
        const isHighFreq = rateLimitPolicy.some(
            (x) => (x.occurrence / (x.periodSeconds / 60)) >= PREMIUM_KEY_LIMIT
        );
        if (isHighFreq) {
            this.highFreqKeyCache.set(auth.bearerToken!, {
                user,
            });
        }
    } else {
        // High freq key path: the check runs in the background and only updates the cache entry.
        apiRollPromise.then(
            // Rate limit not triggered, make sure not blocking.
            () => {
                delete highFreqKey.blockedUntil;
            },
            // Rate limit triggered
            (err) => {
                if (!(err instanceof RateLimitTriggeredError)) {
                    return;
                }
                const now = Date.now();
                let tgtDate;
                if (err.retryAfter) {
                    tgtDate = new Date(now + err.retryAfter * 1000);
                } else if (err.retryAfterDate) {
                    tgtDate = err.retryAfterDate;
                }

                if (tgtDate) {
                    const dt = tgtDate.valueOf() - now;
                    highFreqKey.blockedUntil = tgtDate;
                    // Lift the block when it expires, unless a newer block replaced it meanwhile.
                    setTimeout(() => {
                        if (highFreqKey.blockedUntil === tgtDate) {
                            delete highFreqKey.blockedUntil;
                        }
                    }, dt).unref();
                }
            }
        ).finally(async () => {
            // Always asynchronously update user(wallet);
            const user = await auth.getBrief().catch(() => undefined);
            if (user) {
                highFreqKey.user = user;
            }
        });
    }

    let chargeAmount = 0;
    rpcReflect.finally(async () => {
        if (!chargeAmount) {
            return;
        }
        auth.reportUsage(chargeAmount, `reader-serp`).catch((err) => {
            this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
        });
        // On the high-frequency path the roll may have been rejected by the rate limiter;
        // in that case there is no roll record to annotate and nothing must be thrown here.
        const apiRoll = await apiRollPromise.catch(() => undefined);
        if (apiRoll) {
            apiRoll.chargeAmount = chargeAmount;
        }
    });

    // Note: a non-web variant replaces (does not multiply) the bing multiplier.
    let chargeAmountScaler = 1;
    if (searchEngine === 'bing') {
        chargeAmountScaler = 3;
    }
    if (variant !== 'web') {
        chargeAmountScaler = 5;
    }

    let realQuery = q;
    let results = await this.cachedSearch(variant, {
        provider: searchEngine,
        q,
        num,
        gl,
        hl,
        location,
        page,
    }, crawlerOptions);


    if (fallback && !results?.length && (!page || page === 1)) {
        // Nothing found on the first page: retry with progressively shorter queries.
        let tryTimes = 1;
        const containsRTL = /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\u0590-\u05FF\uFB1D-\uFB4F\u0700-\u074F\u0780-\u07BF\u07C0-\u07FF]/.test(q);
        const terms = q.split(/\s+/g).filter((x) => !!x);
        while (terms.length > 1) {
            containsRTL ? terms.shift() : terms.pop(); // reduce the query by one term at a time
            realQuery = terms.join(' ').trim();
            if (!realQuery) {
                break;
            }
            this.logger.info(`Retrying search with fallback query: "${realQuery}"`);
            results = await this.cachedSearch(variant, {
                provider: searchEngine,
                q: realQuery,
                num,
                gl,
                hl,
                location,
            }, crawlerOptions);
            tryTimes += 1;
            if (results?.length) {
                break;
            }
        }
        // Every attempt is billed.
        chargeAmountScaler *= tryTimes;
    }

    if (!results?.length) {
        results = [];
    }

    const finalResults = results.map((x: any) => this.mapToFinalResults(x));

    await Promise.all(finalResults.map((x: any) => this.assignGeneralMixin(x)));

    // Keep the computed amount so the finally hook registered above actually reports usage.
    chargeAmount = this.assignChargeAmount(finalResults, chargeAmountScaler);
    assignMeta(finalResults, {
        query: realQuery,
        fallback: realQuery === q ? undefined : realQuery,
    });

    return finalResults;
}
332
+
333
+
334
/**
 * Computes the token charge for a result list and attaches it as `usage.tokens` metadata.
 * Results are billed per started batch of 10 items, at 10000 tokens per batch times `scaler`.
 *
 * @returns the number of tokens charged.
 */
assignChargeAmount(items: unknown[], scaler: number) {
    const startedBatches = Math.ceil(items.length / 10);
    const tokens = startedBatches * 10000 * scaler;
    assignMeta(items, { usage: { tokens } });

    return tokens;
}
340
+
341
/**
 * Fetches a 32px favicon for `domain` from Google's s2 favicon service and returns it as a
 * base64 `data:` URI.
 *
 * @param domain value sent as the `domain_url` query parameter (callers pass a URL origin).
 * @returns the data URI, or an empty string when the request fails or is not OK.
 */
async getFavicon(domain: string) {
    // Build the query through URLSearchParams so that `domain` is always properly escaped.
    const endpoint = new URL('https://www.google.com/s2/favicons');
    endpoint.searchParams.set('sz', '32');
    endpoint.searchParams.set('domain_url', domain);

    try {
        const response = await fetch(endpoint);
        if (!response.ok) {
            return '';
        }
        const ab = await response.arrayBuffer();
        const base64 = Buffer.from(ab).toString('base64');
        return `data:image/png;base64,${base64}`;
    } catch (error: any) {
        // Best effort: a missing favicon must not fail the search.
        this.logger.warn(`Failed to get favicon base64 string`, { err: marshalErrorLike(error) });
        return '';
    }
}
358
+
359
/**
 * Translates request-level crawler options into the scrapping options handed to the
 * search providers.
 *
 * A `proxy` value ending in `+` additionally turns on `proxyResources`; the `+` is stripped
 * before the value is passed on as `allocProxy`.
 */
async configure(opts: CrawlerOptions) {
    const proxyEndsWithPlus = opts.proxy?.endsWith('+');
    const timeoutMs = opts.timeout ? opts.timeout * 1000 : undefined;

    const crawlOpts: ScrappingOptions = {
        proxyUrl: opts.proxyUrl,
        cookies: opts.setCookies,
        overrideUserAgent: opts.userAgent,
        timeoutMs,
        locale: opts.locale,
        referer: opts.referer,
        viewport: opts.viewport,
        proxyResources: Boolean(opts.proxyUrl || proxyEndsWithPlus),
        allocProxy: proxyEndsWithPlus ? opts.proxy?.slice(0, -1) : opts.proxy,
    };

    if (opts.locale) {
        // Mirror the requested locale in the Accept-Language header.
        crawlOpts.extraHeaders = { ...crawlOpts.extraHeaders, 'Accept-Language': opts.locale };
    }

    return crawlOpts;
}
379
+
380
/**
 * Maps a provider search entry to the public response shape:
 * `link` becomes `url`, `snippet` becomes `description`, and a fixed set of optional
 * properties is passed through when present.
 */
mapToFinalResults(input: WebSearchEntry) {
    const passThroughProps = [
        'imageUrl', 'imageWidth', 'imageHeight', 'source', 'date', 'siteLinks'
    ];
    const extras = _.pick(input, passThroughProps);

    return {
        title: input.title,
        url: input.link,
        description: Reflect.get(input, 'snippet'),
        ...extras,
    };
}
393
+
394
/**
 * Yields the search providers to try, in order, for the given preference.
 * `googleSerp` appears twice in the `google` and default orders, so it is attempted twice
 * (presumably as a retry; confirm intent).
 */
*iterProviders(preference?: string) {
    switch (preference) {
        case 'bing': {
            yield* [this.serperBing, this.serperGoogle, this.googleSerp];
            return;
        }
        case 'google': {
            yield* [this.googleSerp, this.googleSerp, this.serperGoogle];
            return;
        }
        default: {
            yield* [this.serperGoogle, this.googleSerp, this.googleSerp];
        }
    }
}
415
+
416
/**
 * Runs a search through the provider chain with a Firestore-backed cache in front of it.
 *
 * - A cache record younger than `cacheValidMs` is returned directly (unless `opts.noCache`).
 * - Otherwise providers from `iterProviders` are tried in order until one returns without throwing.
 * - Non-empty results are saved asynchronously with an expiry of `cacheRetentionMs`.
 * - If fetching fails and a stale cache record exists, the stale response is returned instead.
 *
 * The caller's `query` object is not modified.
 *
 * @param variant which kind of search to run.
 * @param query search parameters; an optional `provider` entry selects the provider preference.
 * @param opts crawler options, converted via `configure` for the providers.
 * @returns the provider (or cached) result list; may be `undefined` or empty when nothing was found.
 * @throws the last provider error when no provider produced results and no cache is available.
 */
async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<string, any>, opts: CrawlerOptions) {
    // The digest is computed over the full query, including the provider preference.
    const queryDigest = objHashMd5B64Of({ ...query, variant });
    // Split the provider hint off into a copy instead of deleting it from the caller's object.
    const { provider, ...searchQuery } = query;
    const noCache = opts.noCache;
    let cache;
    if (!noCache) {
        cache = (await SERPResult.fromFirestoreQuery(
            SERPResult.COLLECTION.where('queryDigest', '==', queryDigest)
                .orderBy('createdAt', 'desc')
                .limit(1)
        ))[0];
        if (cache) {
            const age = Date.now() - cache.createdAt.valueOf();
            const stale = cache.createdAt.valueOf() < (Date.now() - this.cacheValidMs);
            this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for search query "${searchQuery.q}", normalized digest: ${queryDigest}, ${age}ms old`, {
                query: searchQuery, digest: queryDigest, age, stale
            });

            if (!stale) {
                return cache.response as any;
            }
        }
    }
    const scrappingOptions = await this.configure(opts);

    // Anything that is not 'images' or 'news' is treated as a web search.
    const methodName = variant === 'images' ? 'imageSearch' : (variant === 'news' ? 'newsSearch' : 'webSearch');

    try {
        let r: any[] | undefined;
        let lastError;
        for (const client of this.iterProviders(provider)) {
            try {
                r = await Reflect.apply(client[methodName], client, [searchQuery, scrappingOptions]);
                // First provider that answers without throwing wins, even with an empty list.
                break;
            } catch (err) {
                lastError = err;
                this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err });
            }
        }

        if (r?.length) {
            const nowDate = new Date();
            const record = SERPResult.from({
                query: searchQuery,
                queryDigest,
                response: r,
                createdAt: nowDate,
                expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
            });
            // Fire and forget: a failed cache write must not fail the request.
            SERPResult.save(record.degradeForFireStore()).catch((err) => {
                this.logger.warn(`Failed to cache search result`, { err });
            });
        } else if (lastError) {
            throw lastError;
        }

        return r;
    } catch (err: any) {
        if (cache) {
            this.logger.warn(`Failed to fetch search result, but a stale cache is available. falling back to stale cache`, { err: marshalErrorLike(err) });

            return cache.response as any;
        }

        throw err;
    }
}
496
+
497
/**
 * Attaches a `favicon` data URI to a result when the `collect-favicon` flag is set in the
 * async-local context.
 *
 * The page address is read from `link` (provider shape) or `url` (the shape produced by
 * `mapToFinalResults`, which is what `search` passes in). Results without a parsable
 * address are left untouched instead of failing the whole request.
 */
async assignGeneralMixin(result: Partial<WebSearchEntry>) {
    const collectFavicon = this.threadLocal.get('collect-favicon');
    if (!collectFavicon) {
        return;
    }

    const href = result.link || Reflect.get(result, 'url');
    if (typeof href !== 'string' || !href) {
        return;
    }

    let origin: string;
    try {
        origin = new URL(href).origin;
    } catch {
        // Not an absolute URL; there is no site to look a favicon up for.
        return;
    }

    Reflect.set(result, 'favicon', await this.getFavicon(origin));
}
505
+ }
src/db/searched.ts CHANGED
@@ -62,3 +62,7 @@ export class SearchResult extends FirestoreRecord {
62
  export class SerperSearchResult extends SearchResult {
63
  static override collectionName = 'serperSearchResults';
64
  }
 
 
 
 
 
62
  export class SerperSearchResult extends SearchResult {
63
  static override collectionName = 'serperSearchResults';
64
  }
65
+
66
// SearchResult record type stored under the 'SERPResults' collection name; used by SerpHost.cachedSearch as its cache.
+ export class SERPResult extends SearchResult {
67
+ static override collectionName = 'SERPResults';
68
+ }
src/dto/crawler-options.ts CHANGED
@@ -429,6 +429,8 @@ export class CrawlerOptions extends AutoCastable {
429
  })
430
  respondTiming?: RESPOND_TIMING;
431
 
 
 
432
  static override from(input: any) {
433
  const instance = super.from(input) as CrawlerOptions;
434
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
 
429
  })
430
  respondTiming?: RESPOND_TIMING;
431
 
432
+ _hintIps?: string[];
433
+
434
  static override from(input: any) {
435
  const instance = super.from(input) as CrawlerOptions;
436
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
src/dto/jina-embeddings-auth.ts CHANGED
@@ -1,8 +1,9 @@
1
  import _ from 'lodash';
2
  import {
3
  Also, AuthenticationFailedError, AuthenticationRequiredError,
4
- DownstreamServiceFailureError, RPC_CALL_ENVIRONMENT,
5
  AutoCastable,
 
6
  } from 'civkit/civ-rpc';
7
  import { htmlEscape } from 'civkit/escape';
8
  import { marshalErrorLike } from 'civkit/lang';
@@ -96,12 +97,14 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable {
96
  });
97
  }
98
 
 
99
  let account;
100
  try {
101
  account = await JinaEmbeddingsTokenAccount.fromFirestore(this.bearerToken);
102
  } catch (err) {
103
  // FireStore would not accept any string as input and may throw if not happy with it
104
- void 0;
 
105
  }
106
 
107
 
@@ -109,7 +112,7 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable {
109
  const jitter = Math.ceil(Math.random() * 30 * 1000);
110
 
111
  if (account && !ignoreCache) {
112
- if (account && (age < (180_000 - jitter))) {
113
  this.user = account;
114
  this.uid = this.user?.user_id;
115
 
@@ -117,6 +120,20 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable {
117
  }
118
  }
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  try {
121
  const r = await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken);
122
  const brief = r.data;
@@ -148,7 +165,7 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable {
148
  }
149
 
150
 
151
- throw new DownstreamServiceFailureError(`Failed to authenticate: ${err}`);
152
  }
153
  }
154
 
 
1
  import _ from 'lodash';
2
  import {
3
  Also, AuthenticationFailedError, AuthenticationRequiredError,
4
+ RPC_CALL_ENVIRONMENT,
5
  AutoCastable,
6
+ DownstreamServiceError,
7
  } from 'civkit/civ-rpc';
8
  import { htmlEscape } from 'civkit/escape';
9
  import { marshalErrorLike } from 'civkit/lang';
 
97
  });
98
  }
99
 
100
+ let firestoreDegradation = false;
101
  let account;
102
  try {
103
  account = await JinaEmbeddingsTokenAccount.fromFirestore(this.bearerToken);
104
  } catch (err) {
105
  // FireStore would not accept any string as input and may throw if not happy with it
106
+ firestoreDegradation = true;
107
+ logger.warn(`Firestore issue`, { err });
108
  }
109
 
110
 
 
112
  const jitter = Math.ceil(Math.random() * 30 * 1000);
113
 
114
  if (account && !ignoreCache) {
115
+ if ((age < (180_000 - jitter)) && (account.wallet?.total_balance > 0)) {
116
  this.user = account;
117
  this.uid = this.user?.user_id;
118
 
 
120
  }
121
  }
122
 
123
+ if (firestoreDegradation) {
124
+ logger.debug(`Using remote UC cached user`);
125
+ const r = await this.jinaEmbeddingsDashboard.authorization(this.bearerToken);
126
+ const brief = r.data;
127
+ const draftAccount = JinaEmbeddingsTokenAccount.from({
128
+ ...account, ...brief, _id: this.bearerToken,
129
+ lastSyncedAt: new Date()
130
+ });
131
+ this.user = draftAccount;
132
+ this.uid = this.user?.user_id;
133
+
134
+ return draftAccount;
135
+ }
136
+
137
  try {
138
  const r = await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken);
139
  const brief = r.data;
 
165
  }
166
 
167
 
168
+ throw new DownstreamServiceError(`Failed to authenticate: ${err}`);
169
  }
170
  }
171
 
src/services/geoip.ts CHANGED
@@ -4,6 +4,7 @@ import { CityResponse, Reader } from 'maxmind';
4
  import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit';
5
  import { GlobalLogger } from './logger';
6
  import path from 'path';
 
7
 
8
  export enum GEOIP_SUPPORTED_LANGUAGES {
9
  EN = 'en',
@@ -85,6 +86,7 @@ export class GeoIPService extends AsyncService {
85
  }
86
 
87
 
 
88
  async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
89
  await this._lazyload();
90
 
@@ -116,6 +118,13 @@ export class GeoIPService extends AsyncService {
116
  });
117
  }
118
 
 
 
 
 
 
 
 
119
  }
120
 
121
  const instance = container.resolve(GeoIPService);
 
4
  import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit';
5
  import { GlobalLogger } from './logger';
6
  import path from 'path';
7
+ import { Threaded } from './threaded';
8
 
9
  export enum GEOIP_SUPPORTED_LANGUAGES {
10
  EN = 'en',
 
86
  }
87
 
88
 
89
+ @Threaded()
90
  async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
91
  await this._lazyload();
92
 
 
118
  });
119
  }
120
 
121
/**
 * Looks up several IPs via `lookupCity` concurrently and returns only the truthy results.
 */
@Threaded()
async lookupCities(ips: string[], lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
    const pending = ips.map((ip) => this.lookupCity(ip, lang));
    const resolved = await Promise.all(pending);

    return resolved.filter(Boolean) as GeoIPCityResponse[];
}
127
+
128
  }
129
 
130
  const instance = container.resolve(GeoIPService);
src/services/misc.ts CHANGED
@@ -57,7 +57,11 @@ export class MiscService extends AsyncService {
57
  }
58
 
59
  const normalizedHostname = result.hostname.startsWith('[') ? result.hostname.slice(1, -1) : result.hostname;
 
60
  const isIp = isIP(normalizedHostname);
 
 
 
61
  if (
62
  (result.hostname === 'localhost') ||
63
  (isIp && isIPInNonPublicRange(normalizedHostname))
@@ -88,12 +92,16 @@ export class MiscService extends AsyncService {
88
  path: 'url'
89
  });
90
  }
 
91
  }
92
 
93
  }
94
  }
95
 
96
- return result;
 
 
 
97
  }
98
 
99
  }
 
57
  }
58
 
59
  const normalizedHostname = result.hostname.startsWith('[') ? result.hostname.slice(1, -1) : result.hostname;
60
+ let ips: string[] = [];
61
  const isIp = isIP(normalizedHostname);
62
+ if (isIp) {
63
+ ips.push(normalizedHostname);
64
+ }
65
  if (
66
  (result.hostname === 'localhost') ||
67
  (isIp && isIPInNonPublicRange(normalizedHostname))
 
92
  path: 'url'
93
  });
94
  }
95
+ ips.push(x.address);
96
  }
97
 
98
  }
99
  }
100
 
101
+ return {
102
+ url: result,
103
+ ips
104
+ };
105
  }
106
 
107
  }
src/services/puppeteer.ts CHANGED
@@ -562,7 +562,8 @@ export class PuppeteerControl extends AsyncService {
562
  headless: !Boolean(process.env.DEBUG_BROWSER),
563
  executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
564
  args: [
565
- '--disable-dev-shm-usage', '--disable-blink-features=AutomationControlled'
 
566
  ]
567
  }).catch((err: any) => {
568
  this.logger.error(`Unknown firebase issue, just die fast.`, { err });
@@ -1618,11 +1619,7 @@ export class PuppeteerControl extends AsyncService {
1618
  }
1619
  }
1620
  try {
1621
- const pSubFrameSnapshots = this.snapshotChildFrames(page);
1622
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
1623
- if (snapshot) {
1624
- snapshot.childFrames = await pSubFrameSnapshots;
1625
- }
1626
  } catch (err: any) {
1627
  this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
1628
  if (stuff instanceof Error) {
 
562
  headless: !Boolean(process.env.DEBUG_BROWSER),
563
  executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
564
  args: [
565
+ '--disable-dev-shm-usage',
566
+ '--disable-blink-features=AutomationControlled'
567
  ]
568
  }).catch((err: any) => {
569
  this.logger.error(`Unknown firebase issue, just die fast.`, { err });
 
1619
  }
1620
  }
1621
  try {
 
1622
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
 
 
 
1623
  } catch (err: any) {
1624
  this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
1625
  if (stuff instanceof Error) {
src/services/serp/compat.ts ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Provider-independent shape of a single search result.
 */
export interface WebSearchEntry {
    /** Address of the result page. */
    link: string;
    title: string;
    source?: string;
    date?: string;
    snippet?: string;
    imageUrl?: string;
    /** Original image width, as reported by image search (presumably pixels; confirm). */
    imageWidth?: number;
    /** Original image height, as reported by image search (presumably pixels; confirm). */
    imageHeight?: number;
    /** Secondary links listed under the main result. */
    siteLinks?: {
        link: string; title: string; snippet?: string;
    }[];
    variant?: 'web' | 'images' | 'news';
}
src/services/serp/google.ts ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { singleton } from 'tsyringe';
2
+ import { AsyncService } from 'civkit/async-service';
3
+ import { GlobalLogger } from '../logger';
4
+ import { JSDomControl } from '../jsdom';
5
+ import { isMainThread } from 'worker_threads';
6
+ import _ from 'lodash';
7
+ import { WebSearchEntry } from './compat';
8
+ import { ScrappingOptions, SERPSpecializedPuppeteerControl } from './puppeteer';
9
+ import { CurlControl } from '../curl';
10
+ import { readFile } from 'fs/promises';
11
+ import { ApplicationError } from 'civkit/civ-rpc';
12
+ import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors';
13
+ import { parseJSONText } from 'civkit/vectorize';
14
+ import { retryWith } from 'civkit/decorators';
15
+ import { ProxyProvider } from '../../shared/services/proxy-provider';
16
+
17
+ @singleton()
18
+ export class GoogleSERP extends AsyncService {
19
+
20
+ googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com';
21
+
22
// Dependencies are declared as protected parameter properties. Off the main thread the
// puppeteer control is removed from the argument list passed to the AsyncService base constructor.
+ constructor(
23
+ protected globalLogger: GlobalLogger,
24
+ protected puppeteerControl: SERPSpecializedPuppeteerControl,
25
+ protected jsDomControl: JSDomControl,
26
+ protected curlControl: CurlControl,
27
+ protected proxyProvider: ProxyProvider,
28
+ ) {
29
+ const filteredDeps = isMainThread ? arguments : _.without(arguments, puppeteerControl);
30
+ super(...filteredDeps);
31
+ }
32
+
33
// Waits for `dependencyReady()` to settle, then emits `ready`.
+ override async init() {
34
+ await this.dependencyReady();
35
+
36
+ this.emit('ready');
37
+ }
38
+
39
+ retryDet = new WeakSet<ScrappingOptions>();
40
/**
 * Side-loads `url` with curl through a freshly allocated proxy.
 *
 * - With `opts.allocProxy === 'none'` no proxy is allocated and curl is called with `opts` as is.
 * - When the same `opts` object is seen again (tracked in `retryDet`), a `socks5h:` proxy is
 *   switched to `socks5:`.
 * - An HTTP 429 raises `ServiceBadAttemptError`, which the retry predicate treats as retryable.
 * - When `opts.allocProxy` is set, the allocated proxy is written back to `opts.proxyUrl`
 *   unless one is already present.
 *
 * @returns the curl side-load result, plus the allocated `proxy` when one was used.
 */
@retryWith((err) => {
    // Order matters: the specific service errors are checked before the generic ApplicationError.
    if (err instanceof ServiceBadApproachError) { return false; }
    // Keep trying
    if (err instanceof ServiceBadAttemptError) { return true; }
    // Quit with this error
    if (err instanceof ApplicationError) { return false; }

    return undefined;
}, 3)
async sideLoadWithAllocatedProxy(url: URL, opts?: ScrappingOptions) {
    if (opts?.allocProxy === 'none') {
        return this.curlControl.sideLoad(url, opts);
    }

    const preferredCountry = process.env.PREFERRED_PROXY_COUNTRY || 'auto';
    const proxy = await this.proxyProvider.alloc(preferredCountry);
    if (opts) {
        const seenBefore = this.retryDet.has(opts);
        if (seenBefore && proxy.protocol === 'socks5h:') {
            proxy.protocol = 'socks5:';
        }
        this.retryDet.add(opts);
    }

    const sideLoaded = await this.curlControl.sideLoad(url, { ...opts, proxyUrl: proxy.href });
    if (sideLoaded.status === 429) {
        throw new ServiceBadAttemptError('Google returned a 429 error. This may happen due to various reasons, including rate limiting or other issues.');
    }

    if (opts?.allocProxy) {
        opts.proxyUrl ??= proxy.href;
    }

    return { ...sideLoaded, proxy };
}
83
+
84
/**
 * Converts a query object into a Google `/search` URL on `googleDomain`.
 *
 * A truthy `page` is replaced by a `start` offset of `(page - 1) * num` (`num` defaults to 10;
 * an offset of 0 is omitted). A truthy `location` is dropped. Every remaining entry that is
 * not `null`/`undefined` becomes a query parameter.
 */
digestQuery(query: { [k: string]: any; }) {
    const url = new URL(`https://${this.googleDomain}/search`);
    const params = { ...query };
    const perPage = params.num || 10;

    if (params.page) {
        const pageNumber = parseInt(params.page);
        delete params.page;
        const offset = (pageNumber - 1) * perPage;
        if (offset === 0) {
            delete params.start;
        } else {
            params.start = offset;
        }
    }
    if (params.location) {
        delete params.location;
    }

    Object.entries(params)
        .filter(([, value]) => value !== undefined && value !== null)
        .forEach(([key, value]) => {
            url.searchParams.set(key, `${value}`);
        });

    return url;
}
109
+
110
/**
 * Runs a Google web search: side-loads the search URL through a proxy, then extracts the
 * results from the page with `getWebSearchResults` via the puppeteer control.
 */
async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
    const searchUrl = this.digestQuery(query);

    const { sideLoadOpts } = await this.sideLoadWithAllocatedProxy(searchUrl, opts);
    if (opts && sideLoadOpts) {
        // Hand the side-loaded data to the browser run.
        opts.sideLoad = sideLoadOpts;
    }

    return await this.puppeteerControl.controlledScrap(searchUrl, getWebSearchResults, opts);
}
122
+
123
/**
 * Runs a Google news search (`tbm=nws`): side-loads the search URL through a proxy, then
 * extracts the results from the page with `getNewsSearchResults` via the puppeteer control.
 */
async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
    const searchUrl = this.digestQuery(query);
    searchUrl.searchParams.set('tbm', 'nws');

    const { sideLoadOpts } = await this.sideLoadWithAllocatedProxy(searchUrl, opts);
    if (opts && sideLoadOpts) {
        // Hand the side-loaded data to the browser run.
        opts.sideLoad = sideLoadOpts;
    }

    return await this.puppeteerControl.controlledScrap(searchUrl, getNewsSearchResults, opts);
}
137
+
138
/**
 * Runs a Google image search through the JSON (`asearch=isch`) endpoint, using curl only.
 *
 * The `ijn` page index is derived from the `start` offset that `digestQuery` computes from
 * `page`, so pagination follows the `page` parameter callers actually send.
 *
 * @throws ServiceBadAttemptError when Google answers with a non-200 status, no body, or a
 *         body that does not contain the expected `ischj` payload.
 */
async imageSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
    const url = this.digestQuery(query);

    // digestQuery has already translated `page` into `start` (or passed `start` through).
    const start = parseInt(url.searchParams.get('start') || '0', 10);
    const perPage = query.num || 10;
    const ijn = start > 0 ? Math.floor(start / perPage) : 0;

    url.searchParams.set('tbm', 'isch');
    url.searchParams.set('asearch', 'isch');
    url.searchParams.set('async', `_fmt:json,p:1,ijn:${ijn}`);

    const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);

    if (sideLoaded.status !== 200 || !sideLoaded.file) {
        throw new ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
    }

    const jsonTxt = (await readFile((await sideLoaded.file.filePath))).toString();
    // The JSON document may be preceded by other text; parse from where the payload starts.
    const jsonStart = jsonTxt.indexOf('{"ischj":');
    const rJSON = jsonStart >= 0 ? parseJSONText(jsonTxt.slice(jsonStart)) : undefined;
    const metadata = _.get(rJSON, 'ischj.metadata');
    if (!Array.isArray(metadata)) {
        throw new ServiceBadAttemptError('Google returned an unexpected image search payload.');
    }

    return metadata.map((x: any) => {

        return {
            link: _.get(x, 'result.referrer_url'),
            title: _.get(x, 'result.page_title'),
            snippet: _.get(x, 'text_in_grid.snippet'),
            source: _.get(x, 'result.site_title'),
            imageWidth: _.get(x, 'original_image.width'),
            imageHeight: _.get(x, 'original_image.height'),
            imageUrl: _.get(x, 'original_image.url'),
            variant: 'images',
        };
    }) as WebSearchEntry[];
}
168
+ }
169
+
170
/**
 * Extracts web search results from the current Google results document.
 * It reads the page globals (`location`, `document`, `window.waitForSelector`) and is passed
 * to `controlledScrap`, so it must stay self-contained.
 *
 * @returns the extracted entries, or `undefined` when the results container or the echoed
 *          query cannot be found.
 * @throws Error when the page is a Google `/sorry` or `/error` page.
 */
async function getWebSearchResults() {
    const errorPrefixes = ['/sorry', '/error'];
    if (errorPrefixes.some((prefix) => location.pathname.startsWith(prefix))) {
        throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
    }

    // @ts-ignore
    await Promise.race([window.waitForSelector('div[data-async-context^="query"]'), window.waitForSelector('#botstuff .mnr-c')]);

    const resultsRoot = document.querySelector('div[data-async-context^="query"]');
    if (!resultsRoot) {
        return undefined;
    }

    const echoedQuery = decodeURIComponent(resultsRoot.getAttribute('data-async-context')?.split('query:')[1] || '');
    if (!echoedQuery) {
        return undefined;
    }

    const entries: unknown[] = [];
    for (const block of Array.from(resultsRoot.querySelectorAll('div[lang],div[data-surl]'))) {
        const anchor = block.querySelector('a:not([href="#"])');
        if (!anchor) {
            continue;
        }
        const href = anchor.getAttribute('href');

        // Blocks whose link wraps a heading div are skipped (the disabled code in the
        // previous revision labelled them as the 'video' variant).
        if (anchor.querySelector('div[role="heading"]')) {
            continue;
        }

        const title = anchor.querySelector('h3')?.textContent;
        const source = Array.from(anchor.querySelectorAll('span')).find((span) => span.textContent)?.textContent;
        const citeText = anchor.querySelector('cite[role=text]')?.textContent;

        const snippetSpans = Array.from(block.querySelectorAll('div[data-sncf*="1"] span'));
        const lastSpanText = snippetSpans[snippetSpans.length - 1]?.textContent;
        const snippet = lastSpanText
            ? lastSpanText
            : (block.querySelector('div.IsZvec')?.textContent?.trim() || null);

        // Prefer the date after the '·' in the cite line; otherwise the second-to-last snippet span.
        const date = citeText?.split('·')[1]?.trim()
            ?? snippetSpans[snippetSpans.length - 2]?.textContent?.trim();

        const imageSrc = block.querySelector('div[data-sncf*="1"] img[src]:not(img[src^="data"])')?.getAttribute('src');

        let siteLinks = Array.from(block.querySelectorAll('div[data-sncf*="3"] a[href]')).map((a) => {
            return {
                link: a.getAttribute('href'),
                title: a.textContent,
            };
        });
        const enclosing = block.parentElement?.closest('div[data-hveid]');
        if (!siteLinks?.length && enclosing) {
            // Alternative layout: site links are `td h3` cells under the enclosing container.
            const headings = Array.from(enclosing.querySelectorAll('td h3'));
            if (headings.length) {
                const nested: unknown[] = [];
                for (const heading of headings) {
                    const subLink = heading.querySelector('a');
                    if (!subLink) {
                        continue;
                    }
                    nested.push({
                        link: subLink.getAttribute('href'),
                        title: subLink.textContent,
                        snippet: heading.nextElementSibling?.textContent,
                    });
                }
                siteLinks = nested as any;
            }
        }

        entries.push({
            link: href,
            title,
            source,
            date,
            snippet: snippet ?? undefined,
            imageUrl: imageSrc?.startsWith('data:') ? undefined : imageSrc,
            siteLinks: siteLinks.length ? siteLinks : undefined,
            variant: 'web',
        });
    }

    return entries as WebSearchEntry[];
}
263
+
264
/**
 * Extracts news search results from the current Google results document.
 * It reads the page globals (`location`, `document`, `window.waitForSelector`) and is passed
 * to `controlledScrap`, so it must stay self-contained.
 *
 * @returns the extracted entries, or `undefined` when the results container or the echoed
 *          query cannot be found.
 * @throws Error when the page is a Google `/sorry` or `/error` page.
 */
async function getNewsSearchResults() {
    const errorPrefixes = ['/sorry', '/error'];
    if (errorPrefixes.some((prefix) => location.pathname.startsWith(prefix))) {
        throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
    }

    // @ts-ignore
    await Promise.race([window.waitForSelector('div[data-async-context^="query"]'), window.waitForSelector('#botstuff .mnr-c')]);

    const resultsRoot = document.querySelector('div[data-async-context^="query"]');
    if (!resultsRoot) {
        return undefined;
    }

    const echoedQuery = decodeURIComponent(resultsRoot.getAttribute('data-async-context')?.split('query:')[1] || '');
    if (!echoedQuery) {
        return undefined;
    }

    const entries: unknown[] = [];
    for (const card of Array.from(resultsRoot.querySelectorAll('div[data-news-doc-id]'))) {
        const anchor = card.querySelector('a:not([href="#"])');
        if (!anchor) {
            continue;
        }
        const href = anchor.getAttribute('href');
        const heading = anchor.querySelector('div[role="heading"]');
        if (!heading) {
            continue;
        }

        // Source and snippet are the heading's previous and next sibling elements;
        // the date is the last <span> under the heading's parent.
        const spans = Array.from(heading.parentElement?.querySelectorAll('span') || []);

        entries.push({
            link: href,
            title: heading.textContent?.trim(),
            source: heading.previousElementSibling?.textContent?.trim(),
            date: spans[spans.length - 1]?.textContent?.trim(),
            snippet: heading.nextElementSibling?.textContent?.trim(),
            variant: 'news',
        });
    }

    return entries as WebSearchEntry[];
}
src/services/serp/puppeteer.ts ADDED
@@ -0,0 +1,692 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import _ from 'lodash';
2
+ import { readFile } from 'fs/promises';
3
+ import { container, singleton } from 'tsyringe';
4
+
5
+ import type { Browser, CookieParam, GoToOptions, Page, Viewport } from 'puppeteer';
6
+ import type { Cookie } from 'set-cookie-parser';
7
+ import puppeteer, { TimeoutError } from 'puppeteer';
8
+
9
+ import { Defer } from 'civkit/defer';
10
+ import { AssertionFailureError, ParamValidationError } from 'civkit/civ-rpc';
11
+ import { AsyncService } from 'civkit/async-service';
12
+ import { FancyFile } from 'civkit/fancy-file';
13
+ import { delay } from 'civkit/timeout';
14
+
15
+ import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../../shared/lib/errors';
16
+ import { CurlControl } from '../curl';
17
+ import { AsyncLocalContext } from '../async-context';
18
+ import { GlobalLogger } from '../logger';
19
+ import { minimalStealth } from '../minimal-stealth';
20
+ import { BlackHoleDetector } from '../blackhole-detector';
21
+
22
+
23
/**
 * Per-request knobs accepted by `SERPSpecializedPuppeteerControl.controlledScrap`.
 */
export interface ScrappingOptions {
    /** Proxy used when side-loading intercepted requests through curl. */
    proxyUrl?: string;
    /** Cookies installed on the page before navigation. */
    cookies?: Cookie[];
    /** Replaces the page's default user agent. */
    overrideUserAgent?: string;
    /** Navigation timeout in milliseconds; 30 000 is used when falsy. */
    timeoutMs?: number;
    /** Value reported by `navigator.language` / `navigator.languages` inside the page. */
    locale?: string;
    /** Referer passed to the navigation. */
    referer?: string;
    /** Extra headers merged into the intercepted requests that are continued with header overrides. */
    extraHeaders?: Record<string, string>;
    /** Viewport applied to the page before navigation. */
    viewport?: Viewport;
    /** When falsy, sub-resource requests bypass the impersonate/proxy side-load path. */
    proxyResources?: boolean;
    /** Not read in this file — NOTE(review): presumably consumed by the curl side-loader; confirm. */
    allocProxy?: string;

    /** Pre-computed responses and per-origin proxies for intercepted requests. */
    sideLoad?: {
        /** Canned responses keyed by the absolute request URL. */
        impersonate: Record<string, {
            status: number;
            headers: Record<string, string | string[]>;
            contentType?: string;
            body?: FancyFile;
        }>;
        /** Proxy URL to use for a given request origin. */
        proxyOrigin: Record<string, string>;
    };

}
48
+
49
/**
 * Browser-side script that wraps `window.IntersectionObserver` so that a call
 * to `window.simulateScroll()` makes every observed target look like it
 * scrolled into view, and then replays the last genuine entry for that target.
 *
 * For every live observer/target pair `simulateScroll()`:
 *  1. synchronously invokes the user callback with a synthetic, fully
 *     intersecting entry;
 *  2. on the next timer tick, if a genuine entry was recorded for that target,
 *     replays a copy of it with a fresh timestamp.
 */
const SIMULATE_SCROLL = `
(function () {
    function createIntersectionObserverEntry(target, isIntersecting, timestamp) {
        const targetRect = target.getBoundingClientRect();
        const record = {
            target,
            isIntersecting,
            time: timestamp,
            // If intersecting, intersectionRect matches boundingClientRect
            // If not intersecting, intersectionRect is empty (0x0)
            intersectionRect: isIntersecting
                ? targetRect
                : new DOMRectReadOnly(0, 0, 0, 0),
            // Current bounding client rect of the target
            boundingClientRect: targetRect,
            // Intersection ratio is either 0 (not intersecting) or 1 (fully intersecting)
            intersectionRatio: isIntersecting ? 1 : 0,
            // Root bounds (viewport in our case)
            rootBounds: new DOMRectReadOnly(
                0,
                0,
                window.innerWidth,
                window.innerHeight
            )
        };
        Object.setPrototypeOf(record, window.IntersectionObserverEntry.prototype);
        return record;
    }
    function cloneIntersectionObserverEntry(entry) {
        const record = {
            target: entry.target,
            isIntersecting: entry.isIntersecting,
            time: entry.time,
            intersectionRect: entry.intersectionRect,
            boundingClientRect: entry.boundingClientRect,
            intersectionRatio: entry.intersectionRatio,
            rootBounds: entry.rootBounds
        };
        Object.setPrototypeOf(record, window.IntersectionObserverEntry.prototype);
        return record;
    }
    const orig = window.IntersectionObserver;
    const kCallback = Symbol('callback');
    const kLastEntryMap = Symbol('lastEntryMap');
    const liveObservers = new Map();
    class MangledIntersectionObserver extends orig {
        constructor(callback, options) {
            super((entries, observer) => {
                const lastEntryMap = observer[kLastEntryMap];
                // A batch may carry entries for several targets; remember the latest
                // genuine entry of each of them, not only of the last one.
                for (const entry of entries) {
                    lastEntryMap.set(entry.target, entry);
                }
                return callback(entries, observer);
            }, options);
            this[kCallback] = callback;
            this[kLastEntryMap] = new WeakMap();
            liveObservers.set(this, new Set());
        }
        disconnect() {
            liveObservers.get(this)?.clear();
            liveObservers.delete(this);
            return super.disconnect();
        }
        observe(target) {
            const observer = liveObservers.get(this);
            observer?.add(target);
            return super.observe(target);
        }
        unobserve(target) {
            const observer = liveObservers.get(this);
            observer?.delete(target);
            return super.unobserve(target);
        }
    }
    Object.defineProperty(MangledIntersectionObserver, 'name', { value: 'IntersectionObserver', writable: false });
    window.IntersectionObserver = MangledIntersectionObserver;
    function simulateScroll() {
        for (const [observer, targets] of liveObservers.entries()) {
            const t0 = performance.now();
            for (const target of targets) {
                const entry = createIntersectionObserverEntry(target, true, t0);
                observer[kCallback]([entry], observer);
                setTimeout(() => {
                    const t1 = performance.now();
                    const lastEntry = observer[kLastEntryMap].get(target);
                    if (!lastEntry) {
                        return;
                    }
                    // Mutate the clone instead of spreading it: a spread would drop the
                    // IntersectionObserverEntry prototype installed by the clone helper.
                    const entry2 = cloneIntersectionObserverEntry(lastEntry);
                    entry2.time = t1;
                    observer[kCallback]([entry2], observer);
                });
            }
        }
    }
    window.simulateScroll = simulateScroll;
})();
`;
145
+
146
/**
 * Browser-side script that dispatches a `mutationIdle` CustomEvent on
 * `document` once the DOM has seen no childList mutation for 200 ms.
 *
 * Watching starts at DOMContentLoaded, or immediately when the document has
 * already finished parsing by the time the script runs (previously the
 * watcher silently never started in that case).
 */
const MUTATION_IDLE_WATCH = `
(function () {
    let timeout;
    const sendMsg = () => {
        document.dispatchEvent(new CustomEvent('mutationIdle'));
    };

    // Every mutation pushes the idle notification another 200 ms into the future.
    const cb = () => {
        if (timeout) {
            clearTimeout(timeout);
            timeout = setTimeout(sendMsg, 200);
        }
    };
    const mutationObserver = new MutationObserver(cb);

    const startWatching = () => {
        mutationObserver.observe(document.documentElement, {
            childList: true,
            subtree: true,
        });
        timeout = setTimeout(sendMsg, 200);
    };

    if (document.readyState === 'loading') {
        document.addEventListener('DOMContentLoaded', startWatching, { once: true });
    } else {
        // DOMContentLoaded will not fire again; start right away.
        startWatching();
    }
})();
`;
170
+
171
/**
 * Script installed into every new document. It bundles the scroll simulator,
 * the mutation-idle watcher and the stealth patch, and exposes
 * `window.waitForSelector(selectorText)`.
 *
 * `waitForSelector` resolves with the first element matching the selector,
 * immediately if it already exists, otherwise as soon as a DOM mutation makes
 * it appear. It never rejects and has no timeout of its own.
 */
const SCRIPT_TO_INJECT_INTO_FRAME = `
${SIMULATE_SCROLL}
${MUTATION_IDLE_WATCH}
(${minimalStealth.toString()})();

(function(){

let lastMutationIdle = 0;
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());

function waitForSelector(selectorText) {
    return new Promise((resolve) => {
        const existing = document.querySelector(selectorText);
        if (existing) {
            resolve(existing);
            return;
        }
        const watch = () => {
            // Re-check first: the element may have been parsed in while we were
            // waiting for DOMContentLoaded, with no further mutation to come.
            const found = document.querySelector(selectorText);
            if (found) {
                resolve(found);
                return;
            }
            const observer = new MutationObserver(() => {
                const elem = document.querySelector(selectorText);
                if (elem) {
                    observer.disconnect();
                    resolve(elem);
                }
            });
            observer.observe(document.documentElement, {
                childList: true,
                subtree: true
            });
        };
        if (document.readyState === 'loading') {
            document.addEventListener('DOMContentLoaded', watch, { once: true });
            return;
        }
        watch();
    });
}
window.waitForSelector = waitForSelector;
})();
`;
221
+
222
/**
 * Owns one Chrome instance and hands out short-lived pages, each in its own
 * browser context, for running a caller-supplied function inside a search
 * result page.
 *
 * Emits `crippled` when the browser disconnects; after more than five such
 * events it emits `error`.
 */
@singleton()
export class SERPSpecializedPuppeteerControl extends AsyncService {

    /** Monotonic serial number handed to every created page (used in logs). */
    _sn = 0;
    browser!: Browser;
    logger = this.globalLogger.child({ service: this.constructor.name });

    /** Pages created ahead of time, waiting to be handed out by `getNextPage`. */
    __loadedPage: Page[] = [];

    /** Kill timers of the pages handed out by `getNextPage`. */
    finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
    snMap = new WeakMap<Page, number>();
    livePages = new Set<Page>();
    lastPageCratedAt: number = 0;
    ua: string = '';

    /** Name of the function exposed to the page for reporting results back. */
    protected _REPORT_FUNCTION_NAME = 'bingo';

    /** Async-context snapshot of the request that owns each page. */
    lifeCycleTrack = new WeakMap();

    constructor(
        protected globalLogger: GlobalLogger,
        protected asyncLocalContext: AsyncLocalContext,
        protected curlControl: CurlControl,
        protected blackHoleDetector: BlackHoleDetector,
    ) {
        super(...arguments);
        this.setMaxListeners(Infinity);

        let crippledTimes = 0;
        this.on('crippled', () => {
            crippledTimes += 1;
            // Pages of a dead browser are useless; forget them.
            this.__loadedPage.length = 0;
            this.livePages.clear();
            if (crippledTimes > 5) {
                process.nextTick(() => {
                    this.emit('error', new Error('Browser crashed too many times, quitting...'));
                    // process.exit(1);
                });
            }
        });
    }

    /**
     * (Re)launches the browser, records its user agent, and pre-creates one page.
     * Does nothing but emit `ready` when NODE_ENV contains `dry-run`.
     */
    override async init() {
        await this.dependencyReady();
        if (process.env.NODE_ENV?.includes('dry-run')) {
            this.emit('ready');
            return;
        }

        if (this.browser) {
            if (this.browser.connected) {
                await this.browser.close();
            } else {
                this.browser.process()?.kill('SIGKILL');
            }
        }
        this.browser = await puppeteer.launch({
            timeout: 10_000,
            headless: !Boolean(process.env.DEBUG_BROWSER),
            executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
            args: [
                '--disable-dev-shm-usage', '--disable-blink-features=AutomationControlled'
            ]
        }).catch((err: any) => {
            this.logger.error(`Failed to launch browser, just die fast.`, { err });
            process.nextTick(() => {
                this.emit('error', err);
                // process.exit(1);
            });
            return Promise.reject(err);
        });
        this.browser.once('disconnected', () => {
            this.logger.warn(`Browser disconnected`);
            if (this.browser) {
                this.emit('crippled');
            }
            process.nextTick(() => this.serviceReady());
        });
        this.ua = await this.browser.userAgent();
        this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`);
        this.curlControl.impersonateChrome(this.ua.replace(/Headless/i, ''));

        // The service is not ready yet, so newPage must not wait for serviceReady() here.
        await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r));

        this.emit('ready');
    }

    /**
     * Creates a page in a dedicated browser context and installs the common
     * user agent, viewport, exposed functions and injected script.
     *
     * @param bewareDeadLock Truthy to skip waiting for `serviceReady()` (needed while `init()` itself is running).
     * @throws ServiceNodeResourceDrainError when the page cannot be created; the browser process is killed in that case.
     */
    async newPage<T>(bewareDeadLock: any = false) {
        if (!bewareDeadLock) {
            await this.serviceReady();
        }
        const sn = this._sn++;
        let page;
        try {
            const dedicatedContext = await this.browser.createBrowserContext();
            page = await dedicatedContext.newPage();
        } catch (err: any) {
            this.logger.warn(`Failed to create page ${sn}`, { err });
            this.browser.process()?.kill('SIGKILL');
            throw new ServiceNodeResourceDrainError(`This specific worker node failed to open a new page, try again.`);
        }
        const preparations = [];

        preparations.push(page.setUserAgent(this.ua.replace(/Headless/i, '')));
        // preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
        // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
        preparations.push(page.setBypassCSP(true));
        preparations.push(page.setViewport({ width: 1024, height: 1024 }));
        // Whatever the page reports is re-emitted as an event on the Page object.
        preparations.push(page.exposeFunction(this._REPORT_FUNCTION_NAME, (thing: T) => {
            page.emit(this._REPORT_FUNCTION_NAME, thing);
        }));
        preparations.push(page.exposeFunction('setViewport', (viewport: Viewport | null) => {
            page.setViewport(viewport).catch(() => undefined);
        }));
        preparations.push(page.evaluateOnNewDocument(SCRIPT_TO_INJECT_INTO_FRAME));

        await Promise.all(preparations);

        this.snMap.set(page, sn);
        this.logger.debug(`Page ${sn} created.`);
        this.lastPageCratedAt = Date.now();
        this.livePages.add(page);

        return page;
    }

    /**
     * Hands out a page, preferring a pre-created one, and arms a 5-minute
     * timer that ditches the page if nobody did so earlier.
     */
    async getNextPage() {
        let thePage: Page | undefined;
        if (this.__loadedPage.length) {
            thePage = this.__loadedPage.shift();
            if (this.__loadedPage.length <= 1) {
                // Refill the pool in the background.
                process.nextTick(() => {
                    this.newPage()
                        .then((r) => this.__loadedPage.push(r))
                        .catch((err) => {
                            this.logger.warn(`Failed to load new page ahead of time`, { err });
                        });
                });
            }
        }

        if (!thePage) {
            thePage = await this.newPage();
        }

        const timer = setTimeout(() => {
            this.logger.warn(`Page is not allowed to live past 5 minutes, ditching page ${this.snMap.get(thePage!)}...`);
            this.ditchPage(thePage!);
        }, 300 * 1000);

        this.finalizerMap.set(thePage, timer);

        return thePage;
    }

    /**
     * Closes a page together with its browser context, waiting at most five
     * seconds, and removes it from the bookkeeping. Never rejects on close failure.
     */
    async ditchPage(page: Page) {
        if (this.finalizerMap.has(page)) {
            clearTimeout(this.finalizerMap.get(page)!);
            this.finalizerMap.delete(page);
        }
        if (page.isClosed()) {
            // An already closed page must not linger in the live set.
            this.livePages.delete(page);
            return;
        }
        const sn = this.snMap.get(page);
        this.logger.debug(`Closing page ${sn}`);
        await Promise.race([
            (async () => {
                const ctx = page.browserContext();
                try {
                    await page.close();
                } finally {
                    await ctx.close();
                }
            })(),
            delay(5000)
        ]).catch((err) => {
            this.logger.error(`Failed to destroy page ${sn}`, { err });
        });
        this.livePages.delete(page);
    }

    /**
     * Navigates a fresh page to `parsedUrl` and resolves with whatever `func`
     * returns inside the page.
     *
     * `func` is serialized with `toString()` and evaluated in the top frame of
     * every new document, so it must not reference anything from this module.
     * The page is always ditched before the returned promise settles, including
     * when preparing the page fails.
     *
     * @param parsedUrl Address to open.
     * @param func Self-contained async function to run in the page.
     * @param options Request tuning, see {@link ScrappingOptions}.
     * @throws ParamValidationError when the supplied cookies are rejected by the browser.
     * @throws ServiceCrashedError when the browser disconnects while waiting.
     * @throws SecurityCompromiseError when the page emits an `abuse` event.
     * @throws TimeoutError when `func` has not reported within 5 s after navigation settled.
     */
    async controlledScrap<T>(parsedUrl: URL, func: (this: void) => Promise<T>, options: ScrappingOptions = {}): Promise<T> {
        // parsedUrl.search = '';
        const url = parsedUrl.toString();
        const page = await this.getNextPage();
        const sn = this.snMap.get(page);

        const resultDeferred = Defer<T>();
        const crippleListener = () => resultDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
        const hdl = (s: {
            err?: any;
            data?: T;
        }) => {
            if (s.err) {
                resultDeferred.reject(s.err);
                return;
            }
            resultDeferred.resolve(s.data);
        };

        // Everything after the page was obtained lives in this try block so that a
        // failure during preparation (e.g. bad cookies) does not leak the page.
        try {
            this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
            page.on('response', (_resp) => {
                this.blackHoleDetector.itWorked();
            });
            page.on('request', async (req) => {
                if (req.isInterceptResolutionHandled()) {
                    return;
                }
                // Continue the request with the caller's extra headers layered on top.
                const continueWithExtraHeaders = () => {
                    const overrides = req.continueRequestOverrides();

                    return req.continue({
                        ...overrides,
                        headers: {
                            ...req.headers(),
                            ...overrides?.headers,
                            ...options.extraHeaders,
                        }
                    }, 1);
                };

                const reqUrlParsed = new URL(req.url());
                if (!reqUrlParsed.protocol.startsWith('http')) {
                    return req.continue(req.continueRequestOverrides(), 0);
                }
                const typ = req.resourceType();
                if (typ === 'media' || typ === 'font' || typ === 'image' || typ === 'stylesheet') {
                    // Non-cooperative answer to block all media requests.
                    return req.abort('blockedbyclient');
                }
                if (!options.proxyResources) {
                    const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
                    if (!isDocRequest) {
                        if (options.extraHeaders) {
                            return continueWithExtraHeaders();
                        }

                        return req.continue(req.continueRequestOverrides(), 0);
                    }
                }
                const sideload = options.sideLoad;

                // A canned response for exactly this URL wins over everything else.
                const impersonate = sideload?.impersonate[reqUrlParsed.href];
                if (impersonate) {
                    let body;
                    if (impersonate.body) {
                        body = await readFile(await impersonate.body.filePath);
                        if (req.isInterceptResolutionHandled()) {
                            return;
                        }
                    }
                    return req.respond({
                        status: impersonate.status,
                        headers: impersonate.headers,
                        contentType: impersonate.contentType,
                        body: body ? Uint8Array.from(body) : undefined,
                    }, 999);
                }

                const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
                const ctx = this.lifeCycleTrack.get(page);
                if (proxy && ctx) {
                    // Fetch through curl + proxy within the owning request's async context.
                    return await this.asyncLocalContext.bridge(ctx, async () => {
                        try {
                            const curled = await this.curlControl.sideLoad(reqUrlParsed, {
                                ...options,
                                method: req.method(),
                                body: req.postData(),
                                extraHeaders: {
                                    ...req.headers(),
                                    ...options.extraHeaders,
                                },
                                proxyUrl: proxy
                            });
                            if (req.isInterceptResolutionHandled()) {
                                return;
                            }

                            if (curled.chain.length === 1) {
                                if (!curled.file) {
                                    return req.respond({
                                        status: curled.status,
                                        headers: _.omit(curled.headers, 'result'),
                                        contentType: curled.contentType,
                                    }, 3);
                                }
                                const body = await readFile(await curled.file.filePath);
                                if (req.isInterceptResolutionHandled()) {
                                    return;
                                }
                                return req.respond({
                                    status: curled.status,
                                    headers: _.omit(curled.headers, 'result'),
                                    contentType: curled.contentType,
                                    body: Uint8Array.from(body),
                                }, 3);
                            }
                            // Redirect chain: answer with the first hop and remember the
                            // rest so the follow-up requests can be served from it.
                            options.sideLoad ??= curled.sideLoadOpts;
                            _.merge(options.sideLoad, curled.sideLoadOpts);
                            const firstReq = curled.chain[0];

                            return req.respond({
                                status: firstReq.result!.code,
                                headers: _.omit(firstReq, 'result'),
                            }, 3);
                        } catch (err: any) {
                            this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
                        }
                        if (req.isInterceptResolutionHandled()) {
                            return;
                        }

                        // Side-loading failed; let the browser fetch it by itself.
                        return continueWithExtraHeaders();
                    });
                }

                if (req.isInterceptResolutionHandled()) {
                    return;
                }

                return continueWithExtraHeaders();
            });
            await page.setRequestInterception(true);

            this.logger.info(`Page ${sn}: Scraping ${url}`, { url });

            await page.evaluateOnNewDocument(`(function () {
    if (window.top !== window.self) {
        return;
    }
    const func = ${func.toString()};

    func().then((result) => {
        window.${this._REPORT_FUNCTION_NAME}({data: result});
    }).catch((err) => {
        window.${this._REPORT_FUNCTION_NAME}({err: err});
    });

})();`);

            if (options.locale) {
                // Add headers via request interception to walk around this bug
                // https://github.com/puppeteer/puppeteer/issues/10235
                // await page.setExtraHTTPHeaders({
                //     'Accept-Language': options.locale
                // });

                // The function below runs inside the page, where Node-side variables do
                // not exist; the locale therefore has to travel as an argument.
                await page.evaluateOnNewDocument((locale: string) => {
                    Object.defineProperty(navigator, "language", {
                        get: function () {
                            return locale;
                        }
                    });
                    Object.defineProperty(navigator, "languages", {
                        get: function () {
                            return [locale];
                        }
                    });
                }, options.locale);
            }

            if (options.cookies) {
                const mapped = options.cookies.map((x) => {
                    const draft: CookieParam = {
                        name: x.name,
                        value: encodeURIComponent(x.value),
                        secure: x.secure,
                        domain: x.domain,
                        path: x.path,
                        expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined,
                        sameSite: x.sameSite as any,
                    };
                    if (!draft.expires && x.maxAge) {
                        draft.expires = Math.floor(Date.now() / 1000) + x.maxAge;
                    }
                    if (!draft.domain) {
                        // Cookies without a domain are bound to the target URL.
                        draft.url = parsedUrl.toString();
                    }

                    return draft;
                });
                try {
                    await page.setCookie(...mapped);
                } catch (err: any) {
                    this.logger.warn(`Page ${sn}: Failed to set cookies`, { err });
                    throw new ParamValidationError({
                        path: 'cookies',
                        message: `Failed to set cookies: ${err?.message}`
                    });
                }
            }
            if (options.overrideUserAgent) {
                await page.setUserAgent(options.overrideUserAgent);
            }
            if (options.viewport) {
                await page.setViewport(options.viewport);
            }

            this.once('crippled', crippleListener);
            page.on(this._REPORT_FUNCTION_NAME, hdl as any);
            page.once('abuse', (event: any) => {
                this.emit('abuse', { ...event, url: parsedUrl });

                resultDeferred.reject(
                    new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
                );
            });

            const timeout = options.timeoutMs || 30_000;
            const goToOptions: GoToOptions = {
                waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
                timeout,
            };

            if (options.referer) {
                goToOptions.referer = options.referer;
            }

            const gotoPromise = page.goto(url, goToOptions)
                .catch((err) => {
                    if (err instanceof TimeoutError) {
                        this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err });
                        return new AssertionFailureError({
                            message: `Failed to goto ${url}: ${err}`,
                            cause: err,
                        });
                    }

                    this.logger.warn(`Page ${sn}: Browsing of ${url} aborted`, { err });
                    return undefined;
                }).then(async (r) => {
                    // Navigation is over one way or another; give the in-page function
                    // five more seconds to report before giving up.
                    await delay(5000);
                    resultDeferred.reject(new TimeoutError(`Control function did not respond in time`));
                    return r;
                });

            await Promise.race([resultDeferred.promise, gotoPromise]);

            return resultDeferred.promise;
        } finally {
            this.off('crippled', crippleListener);
            page.off(this._REPORT_FUNCTION_NAME, hdl as any);
            this.ditchPage(page);
            // Make sure the deferred is settled even when preparation failed early.
            resultDeferred.resolve();
        }
    }

}
689
+
690
// Resolve the singleton through the DI container and expose it as the module's default export.
const puppeteerControl = container.resolve(SERPSpecializedPuppeteerControl);

export { puppeteerControl as default };
src/services/serp/serper.ts ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import { singleton } from 'tsyringe';
3
+ import { GlobalLogger } from '../logger';
4
+ import { SecretExposer } from '../../shared/services/secrets';
5
+ import { AsyncLocalContext } from '../async-context';
6
+ import { SerperBingHTTP, SerperGoogleHTTP, SerperImageSearchResponse, SerperNewsSearchResponse, SerperSearchQueryParams, SerperWebSearchResponse } from '../../shared/3rd-party/serper-search';
7
+ import { BlackHoleDetector } from '../blackhole-detector';
8
+ import { Context } from '../registry';
9
+ import { AsyncService } from 'civkit/async-service';
10
+ import { AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit/civ-rpc';
11
+
12
/**
 * Thin service around the Serper Google HTTP client that returns only the
 * result list of each search variant.
 */
@singleton()
export class SerperGoogleSearchService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    client!: SerperGoogleHTTP;

    constructor(
        protected globalLogger: GlobalLogger,
        protected secretExposer: SecretExposer,
        protected threadLocal: AsyncLocalContext,
        protected blackHoleDetector: BlackHoleDetector,
    ) {
        super(...arguments);
    }

    /** Builds the HTTP client once dependencies are ready, then announces readiness. */
    override async init() {
        await this.dependencyReady();

        // Create the client before emitting `ready`, so that `ready` listeners
        // (which run synchronously inside emit) never observe an undefined client.
        this.client = new SerperGoogleHTTP(this.secretExposer.SERPER_SEARCH_API_KEY);

        this.emit('ready');
    }


    doSearch(variant: 'web', query: SerperSearchQueryParams): Promise<SerperWebSearchResponse['organic']>;
    doSearch(variant: 'images', query: SerperSearchQueryParams): Promise<SerperImageSearchResponse['images']>;
    doSearch(variant: 'news', query: SerperSearchQueryParams): Promise<SerperNewsSearchResponse['news']>;
    /**
     * Runs a search of the given variant and returns its result list
     * (`images`, `news`, or `organic` of the parsed response).
     * Any variant other than `images` and `news` is treated as `web`.
     * Errors of the underlying client propagate to the caller.
     */
    async doSearch(variant: 'web' | 'images' | 'news', query: SerperSearchQueryParams) {
        this.logger.debug(`Doing external search`, query);
        let results;
        switch (variant) {
            case 'images': {
                const r = await this.client.imageSearch(query);

                results = r.parsed.images;
                break;
            }
            case 'news': {
                const r = await this.client.newsSearch(query);

                results = r.parsed.news;
                break;
            }
            case 'web':
            default: {
                const r = await this.client.webSearch(query);

                results = r.parsed.organic;
                break;
            }
        }

        // Only reached when the upstream call succeeded.
        this.blackHoleDetector.itWorked();

        return results;
    }


    /** Shorthand for `doSearch('web', query)`. */
    async webSearch(query: SerperSearchQueryParams) {
        return this.doSearch('web', query);
    }
    /** Shorthand for `doSearch('images', query)`. */
    async imageSearch(query: SerperSearchQueryParams) {
        return this.doSearch('images', query);
    }
    /** Shorthand for `doSearch('news', query)`. */
    async newsSearch(query: SerperSearchQueryParams) {
        return this.doSearch('news', query);
    }

}
81
+
82
/**
 * Same service surface as {@link SerperGoogleSearchService}, backed by the
 * Serper Bing HTTP client instead.
 */
@singleton()
export class SerperBingSearchService extends SerperGoogleSearchService {
    override client!: SerperBingHTTP;

    /** Builds the Bing client once dependencies are ready, then announces readiness. */
    override async init() {
        await this.dependencyReady();

        // Create the client before emitting `ready`, so that `ready` listeners
        // (which run synchronously inside emit) never observe an undefined client.
        this.client = new SerperBingHTTP(this.secretExposer.SERPER_SEARCH_API_KEY);

        this.emit('ready');
    }
}
93
+
94
/**
 * Explicit search operators (`ext:`, `filetype:`, `intitle:`, `loc:`, `site:`)
 * that can be supplied separately from the search term and appended to it.
 */
export class GoogleSearchExplicitOperatorsDto extends AutoCastable {
    @Prop({
        arrayOf: String,
        desc: `Returns web pages with a specific file extension. Example: to find the Honda GX120 Owner’s manual in PDF, type “Honda GX120 owners manual ext:pdf”.`
    })
    ext?: string | string[];

    @Prop({
        arrayOf: String,
        desc: `Returns web pages created in the specified file type. Example: to find a web page created in PDF format about the evaluation of age-related cognitive changes, type “evaluation of age cognitive changes filetype:pdf”.`
    })
    filetype?: string | string[];

    @Prop({
        arrayOf: String,
        desc: `Returns webpages containing the specified term in the title of the page. Example: to find pages about SEO conferences making sure the results contain 2023 in the title, type “seo conference intitle:2023”.`
    })
    intitle?: string | string[];

    @Prop({
        arrayOf: String,
        desc: `Returns web pages from a specified country or region. The country code must be in the ISO 3166-1 alpha-2 format. Example: to find web pages from Canada about the Niagara Falls, type “niagara falls loc:ca”.`
    })
    loc?: string | string[];

    @Prop({
        arrayOf: String,
        desc: `Returns web pages coming only from a specific web site. Example: to find information about Goggles only on Brave pages, type “goggles site:brave.com”.`
    })
    site?: string | string[];

    /**
     * Appends the operators held by this instance to a search term.
     *
     * Multiple values of one operator are joined with ` OR `; when more than one
     * operator is present each group is parenthesized and the groups are joined
     * with ` AND `.
     *
     * @param searchTerm The plain search term.
     * @returns The term followed by the operator part, or the term unchanged when no operator is set.
     */
    addTo(searchTerm: string) {
        const chunks: string[] = [];
        for (const [key, value] of Object.entries(this)) {
            if (!value) {
                continue;
            }
            const values = Array.isArray(value) ? value : [value];
            const textValue = values.map((v) => `${key}:${v}`).join(' OR ');
            if (textValue) {
                chunks.push(textValue);
            }
        }

        if (!chunks.length) {
            return searchTerm;
        }

        // A single group needs no parentheses.
        const opPart = chunks.length > 1 ? chunks.map((x) => `(${x})`).join(' AND ') : chunks[0];

        return [searchTerm, opPart].join(' ');
    }

    /**
     * Builds the DTO from the input and then lets request headers override it:
     * for every operator, `x-<name>` (or `<name>`) read from the RPC call
     * environment replaces the value, split on `", "`.
     */
    static override from(input: any) {
        const instance = super.from(input) as GoogleSearchExplicitOperatorsDto;
        const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;

        const params = ['ext', 'filetype', 'intitle', 'loc', 'site'];

        for (const p of params) {
            const customValue = ctx?.get(`x-${p}`) || ctx?.get(`${p}`);
            if (!customValue) {
                continue;
            }

            const filtered = customValue.split(', ').filter(Boolean);
            if (filtered.length) {
                Reflect.set(instance, p, filtered);
            }
        }

        return instance;
    }
}
src/stand-alone/serp.ts ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import 'reflect-metadata';
2
+ import { container, singleton } from 'tsyringe';
3
+
4
+ import { KoaServer } from 'civkit/civ-rpc/koa';
5
+ import http2 from 'http2';
6
+ import http from 'http';
7
+ import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
8
+ import path from 'path';
9
+ import fs from 'fs';
10
+ import { mimeOfExt } from 'civkit/mime';
11
+ import { Context, Next } from 'koa';
12
+ import { RPCRegistry } from '../services/registry';
13
+ import { AsyncResource } from 'async_hooks';
14
+ import { runOnce } from 'civkit/decorators';
15
+ import { randomUUID } from 'crypto';
16
+ import { ThreadedServiceRegistry } from '../services/threaded';
17
+ import { GlobalLogger } from '../services/logger';
18
+ import { AsyncLocalContext } from '../services/async-context';
19
+ import finalizer, { Finalizer } from '../services/finalizer';
20
+ import { SerpHost } from '../api/serp';
21
+
22
/**
 * Stand-alone Koa server for the SERP API. It serves static assets from the
 * `public` directory and the RPC routes of the registry, optionally over
 * clear-text HTTP/2 with the original HTTP server kept on `port + 1`.
 */
@singleton()
export class SERPStandAloneServer extends KoaServer {
    logger = this.globalLogger.child({ service: this.constructor.name });

    /** The server that was in place before `h2c()` swapped in the HTTP/2 one. */
    httpAlternativeServer?: typeof this['httpServer'];
    /** Static files found by `walkForAssets`, keyed by their relative path. */
    assets = new Map<string, WalkOutEntity>();

    constructor(
        protected globalLogger: GlobalLogger,
        protected registry: RPCRegistry,
        protected serpHost: SerpHost,
        protected threadLocal: AsyncLocalContext,
        protected threads: ThreadedServiceRegistry,
    ) {
        super(...arguments);
    }

    /**
     * Replaces `httpServer` with an HTTP/2 server driving the same Koa app and
     * keeps the previous server as `httpAlternativeServer`.
     *
     * @returns This instance, for chaining.
     */
    h2c() {
        this.httpAlternativeServer = this.httpServer;
        const fn = this.koaApp.callback();
        this.httpServer = http2.createServer((req, res) => {
            // Give every request its own async resource scope.
            const ar = new AsyncResource('HTTP2ServerRequest');
            ar.runInAsyncScope(fn, this.koaApp, req, res);
        });
        // useResourceBasedDefaultTracker();

        return this;
    }

    /** Indexes the static assets and drops every registry entry tagged `crawl` before the base init. */
    override async init() {
        await this.walkForAssets();
        await this.dependencyReady();

        for (const [k, v] of this.registry.conf.entries()) {
            if (v.tags?.includes('crawl')) {
                this.registry.conf.delete(k);
            }
        }

        await super.init();
    }

    /** Collects every regular file under `<project>/public` into `assets`. */
    async walkForAssets() {
        const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));

        for (const file of files) {
            if (file.type !== 'file') {
                continue;
            }
            this.assets.set(file.relativePath.toString(), file);
        }
    }

    /** Listens on `port`; the alternative server, if any, listens on `port + 1`. */
    override listen(port: number) {
        const r = super.listen(port);
        if (this.httpAlternativeServer) {
            const altPort = port + 1;
            this.httpAlternativeServer.listen(altPort, () => {
                this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
            });
        }

        return r;
    }

    /** Middleware that streams a known static asset, or defers to the next middleware. */
    makeAssetsServingController() {
        return (ctx: Context, next: Next) => {
            const requestPath = ctx.path;
            // Asset keys are relative paths without the leading slash.
            const file = requestPath.slice(1);
            if (!file) {
                return next();
            }

            const asset = this.assets.get(file);
            if (asset?.type !== 'file') {
                return next();
            }

            ctx.body = fs.createReadStream(asset.path);
            ctx.type = mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream';
            ctx.set('Content-Length', asset.stats.size.toString());

            return;
        };
    }

    registerRoutes(): void {
        this.koaApp.use(this.makeAssetsServingController());
        this.koaApp.use(this.registry.makeShimController());
    }


    // Using an h2c server implies that multiple requests may share the same connection and x-cloud-trace-context.
    // The traceId is expected to be request-bound and unique, so the two have to be distinguished.
    @runOnce()
    override insertAsyncHookMiddleware() {
        const asyncHookMiddleware = async (ctx: Context, next: () => Promise<void>) => {
            const googleTraceId = ctx.get('x-cloud-trace-context').split('/')?.[0];
            this.threadLocal.setup({
                traceId: randomUUID(),
                traceT0: new Date(),
                googleTraceId,
            });

            return next();
        };

        this.koaApp.use(asyncHookMiddleware);
    }

    /**
     * Shuts down the alternative server (if listening) together with the base server.
     */
    @Finalizer()
    override async standDown() {
        const tasks: Promise<any>[] = [];
        const altServer = this.httpAlternativeServer;
        if (altServer?.listening) {
            // close() must be called exactly once: a second call on a server that
            // is no longer listening reports ERR_SERVER_NOT_RUNNING to its callback,
            // which used to make the whole stand-down reject.
            tasks.push(new Promise<void>((resolve, reject) => {
                altServer.close((err) => {
                    if (err) {
                        return reject(err);
                    }
                    resolve();
                });
            }));
            // Drop idle keep-alive sockets so the close above can complete.
            (altServer as http.Server).closeIdleConnections?.();
        }
        tasks.push(super.standDown());
        await Promise.all(tasks);
    }

}
152
// Resolve the server singleton and expose it as the module's default export.
const instance = container.resolve(SERPStandAloneServer);

export default instance;

// In dry-run mode the process only proves that the service can start and then
// terminates; otherwise it starts serving h2c on PORT (default 3000).
const isDryRun = Boolean(process.env.NODE_ENV?.includes('dry-run'));

instance.serviceReady().then((s) => {
    if (isDryRun) {
        return finalizer.terminate();
    }

    return s.h2c().listen(parseInt(process.env.PORT || '') || 3000);
});
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit 2f45bd58ddfc007d04dfdb9cb0814d74dc25e3f3
 
1
+ Subproject commit ca09ea8fcbb84aeea4eb8015bf8e98eef1813048