nomagick committed on
Commit
f7dbadf
·
unverified ·
1 Parent(s): 5141814

behavior change: ditch content-based return timing, adopt mutationIdle as default timing

Browse files
src/api/crawler.ts CHANGED
@@ -19,7 +19,6 @@ import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE }
19
 
20
  import { Crawled } from '../db/crawled';
21
  import { DomainBlockade } from '../db/domain-blockade';
22
- import { DomainProfile } from '../db/domain-profile';
23
  import { OutputServerEventStream } from '../lib/transform-server-event-stream';
24
 
25
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@@ -317,6 +316,9 @@ export class CrawlerHost extends RPCHost {
317
  if (crawlerOptions.robotsTxt) {
318
  await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
319
  }
 
 
 
320
  if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
321
  const sseStream = new OutputServerEventStream();
322
  rpcReflect.return(sseStream);
@@ -363,10 +365,7 @@ export class CrawlerHost extends RPCHost {
363
  if (rpcReflect.signal.aborted) {
364
  break;
365
  }
366
- if (!crawlerOptions.isEarlyReturnApplicable()) {
367
- continue;
368
- }
369
- if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
370
  continue;
371
  }
372
 
@@ -412,11 +411,7 @@ export class CrawlerHost extends RPCHost {
412
  if (rpcReflect.signal.aborted) {
413
  break;
414
  }
415
- if (!crawlerOptions.isEarlyReturnApplicable()) {
416
- continue;
417
- }
418
-
419
- if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
420
  continue;
421
  }
422
 
@@ -427,13 +422,11 @@ export class CrawlerHost extends RPCHost {
427
  }
428
 
429
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
430
-
431
  return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
432
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
433
  );
434
  }
435
  if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
436
-
437
  return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
438
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
439
  );
@@ -705,7 +698,11 @@ export class CrawlerHost extends RPCHost {
705
  return;
706
  }
707
 
708
- if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
 
 
 
 
709
  const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
710
  await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
711
  await this.curlControl.sideLoad(urlToCrawl, crawlOpts);
@@ -779,6 +776,7 @@ export class CrawlerHost extends RPCHost {
779
 
780
  let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
781
  draftSnapshot.title ??= analyzed.title;
 
782
  let fallbackProxyIsUsed = false;
783
  if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
784
  (analyzed.tokens < 42 || sideLoaded.status !== 200)
@@ -798,6 +796,7 @@ export class CrawlerHost extends RPCHost {
798
  analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
799
  if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
800
  draftSnapshot = proxySnapshot;
 
801
  sideLoaded = proxyLoaded;
802
  fallbackProxyIsUsed = true;
803
  }
@@ -986,7 +985,7 @@ export class CrawlerHost extends RPCHost {
986
  crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
987
  }
988
 
989
- if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
990
  crawlOpts.favorScreenshot = true;
991
  }
992
 
@@ -1142,62 +1141,6 @@ export class CrawlerHost extends RPCHost {
1142
  return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
1143
  }
1144
 
1145
- async exploreDirectEngine(knownSnapshot: PageSnapshot) {
1146
- const realUrl = new URL(knownSnapshot.href);
1147
- const { digest, path } = this.getDomainProfileUrlDigest(realUrl);
1148
- const profile = await DomainProfile.fromFirestore(digest);
1149
-
1150
- if (!profile) {
1151
- const record = DomainProfile.from({
1152
- _id: digest,
1153
- origin: realUrl.origin.toLowerCase(),
1154
- path,
1155
- triggerUrl: realUrl.href,
1156
- engine: knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT,
1157
- createdAt: new Date(),
1158
- expireAt: new Date(Date.now() + this.domainProfileRetentionMs),
1159
- });
1160
- await DomainProfile.save(record);
1161
-
1162
- return;
1163
- }
1164
-
1165
- if (profile.engine === ENGINE_TYPE.BROWSER) {
1166
- // Mixed engine, always use browser
1167
- return;
1168
- }
1169
-
1170
- profile.origin = realUrl.origin.toLowerCase();
1171
- profile.triggerUrl = realUrl.href;
1172
- profile.path = path;
1173
- profile.engine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT;
1174
- profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);
1175
-
1176
- await DomainProfile.save(profile);
1177
-
1178
- return;
1179
- }
1180
-
1181
- async snapshotNotGoodEnough(snapshot: PageSnapshot) {
1182
- if (snapshot.pdfs?.length) {
1183
- return false;
1184
- }
1185
- if (!snapshot.title) {
1186
- return true;
1187
- }
1188
- if (snapshot.parsed?.content) {
1189
- return false;
1190
- }
1191
- if (snapshot.html) {
1192
- const r = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);
1193
- const tokens = r.tokens;
1194
- if (tokens < 200) {
1195
- return true;
1196
- }
1197
- }
1198
- return false;
1199
- }
1200
-
1201
  getDomainProfileUrlDigest(url: URL) {
1202
  const pathname = url.pathname;
1203
  const pathVec = pathname.split('/');
 
19
 
20
  import { Crawled } from '../db/crawled';
21
  import { DomainBlockade } from '../db/domain-blockade';
 
22
  import { OutputServerEventStream } from '../lib/transform-server-event-stream';
23
 
24
  import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
 
316
  if (crawlerOptions.robotsTxt) {
317
  await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
318
  }
319
+ if (rpcReflect.signal.aborted) {
320
+ return;
321
+ }
322
  if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
323
  const sseStream = new OutputServerEventStream();
324
  rpcReflect.return(sseStream);
 
365
  if (rpcReflect.signal.aborted) {
366
  break;
367
  }
368
+ if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
 
 
 
369
  continue;
370
  }
371
 
 
411
  if (rpcReflect.signal.aborted) {
412
  break;
413
  }
414
+ if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
 
 
 
 
415
  continue;
416
  }
417
 
 
422
  }
423
 
424
  if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
 
425
  return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
426
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
427
  );
428
  }
429
  if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
 
430
  return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
431
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
432
  );
 
698
  return;
699
  }
700
 
701
+ if (
702
+ crawlOpts?.engine === ENGINE_TYPE.CURL ||
703
+ // deprecated name
704
+ crawlOpts?.engine === 'direct'
705
+ ) {
706
  const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
707
  await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
708
  await this.curlControl.sideLoad(urlToCrawl, crawlOpts);
 
776
 
777
  let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
778
  draftSnapshot.title ??= analyzed.title;
779
+ draftSnapshot.isIntermediate = true;
780
  let fallbackProxyIsUsed = false;
781
  if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
782
  (analyzed.tokens < 42 || sideLoaded.status !== 200)
 
796
  analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
797
  if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
798
  draftSnapshot = proxySnapshot;
799
+ draftSnapshot.isIntermediate = true;
800
  sideLoaded = proxyLoaded;
801
  fallbackProxyIsUsed = true;
802
  }
 
985
  crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
986
  }
987
 
988
+ if (opts.respondWith.includes(CONTENT_FORMAT.VLM)) {
989
  crawlOpts.favorScreenshot = true;
990
  }
991
 
 
1141
  return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
1142
  }
1143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1144
  getDomainProfileUrlDigest(url: URL) {
1145
  const pathname = url.pathname;
1146
  const pathVec = pathname.split('/');
src/dto/crawler-options.ts CHANGED
@@ -3,6 +3,7 @@ import { FancyFile } from 'civkit/fancy-file';
3
  import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
4
  import { Context } from '../services/registry';
5
  import { TurnDownTweakableOptions } from './turndown-tweakable-options';
 
6
 
7
  export enum CONTENT_FORMAT {
8
  CONTENT = 'content',
@@ -18,12 +19,18 @@ export enum CONTENT_FORMAT {
18
  export enum ENGINE_TYPE {
19
  AUTO = 'auto',
20
  BROWSER = 'browser',
21
- DIRECT = 'direct',
22
- VLM = 'vlm',
23
- READER_LM = 'readerlm-v2',
24
  CF_BROWSER_RENDERING = 'cf-browser-rendering',
25
  }
26
 
 
 
 
 
 
 
 
 
27
  const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
28
 
29
  export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
@@ -213,6 +220,15 @@ class Viewport extends AutoCastable {
213
  in: 'header',
214
  schema: { type: 'string' }
215
  },
 
 
 
 
 
 
 
 
 
216
  'X-Engine': {
217
  description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, cf-browser-rendering',
218
  in: 'header',
@@ -405,6 +421,11 @@ export class CrawlerOptions extends AutoCastable {
405
  @Prop()
406
  markdown?: TurnDownTweakableOptions;
407
 
 
 
 
 
 
408
  static override from(input: any) {
409
  const instance = super.from(input) as CrawlerOptions;
410
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
@@ -498,10 +519,10 @@ export class CrawlerOptions extends AutoCastable {
498
  if (instance.engine) {
499
  instance.engine = instance.engine.toLowerCase();
500
  }
501
- if (instance.engine === ENGINE_TYPE.VLM) {
502
  instance.engine = ENGINE_TYPE.BROWSER;
503
  instance.respondWith = CONTENT_FORMAT.VLM;
504
- } else if (instance.engine === ENGINE_TYPE.READER_LM) {
505
  instance.engine = ENGINE_TYPE.AUTO;
506
  instance.respondWith = CONTENT_FORMAT.READER_LM;
507
  }
@@ -558,6 +579,16 @@ export class CrawlerOptions extends AutoCastable {
558
  const dnt = ctx?.get('dnt');
559
  instance.doNotTrack ??= (parseInt(dnt || '') || null);
560
 
 
 
 
 
 
 
 
 
 
 
561
  if (instance.cacheTolerance) {
562
  instance.cacheTolerance = instance.cacheTolerance * 1000;
563
  }
@@ -569,11 +600,36 @@ export class CrawlerOptions extends AutoCastable {
569
  return instance;
570
  }
571
 
572
- isEarlyReturnApplicable() {
573
- if (this.timeout !== undefined) {
574
  return false;
575
  }
576
- if (this.waitForSelector?.length) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
  return false;
578
  }
579
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
@@ -583,7 +639,7 @@ export class CrawlerOptions extends AutoCastable {
583
  return false;
584
  }
585
 
586
- return true;
587
  }
588
 
589
  isCacheQueryApplicable() {
@@ -611,6 +667,9 @@ export class CrawlerOptions extends AutoCastable {
611
  }
612
 
613
  browserIsNotRequired() {
 
 
 
614
  if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
615
  return false;
616
  }
 
3
  import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
4
  import { Context } from '../services/registry';
5
  import { TurnDownTweakableOptions } from './turndown-tweakable-options';
6
+ import type { PageSnapshot } from '../services/puppeteer';
7
 
8
  export enum CONTENT_FORMAT {
9
  CONTENT = 'content',
 
19
  export enum ENGINE_TYPE {
20
  AUTO = 'auto',
21
  BROWSER = 'browser',
22
+ CURL = 'curl',
 
 
23
  CF_BROWSER_RENDERING = 'cf-browser-rendering',
24
  }
25
 
26
+ export enum RESPOND_TIMING {
27
+ HTML = 'html',
28
+ MUTATION_IDLE = 'mutation-idle',
29
+ RESOURCE_IDLE = 'resource-idle',
30
+ MEDIA_IDLE = 'media-idle',
31
+ NETWORK_IDLE = 'network-idle',
32
+ }
33
+
34
  const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
35
 
36
  export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
 
220
  in: 'header',
221
  schema: { type: 'string' }
222
  },
223
+ 'X-Respond-Timing': {
224
+ description: `Explicitly specify the respond timing. One of the following:\n\n` +
225
+ `- html: unrendered HTML is enough to return\n` +
226
+ `- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
227
+ `- resource-idle: wait for no additional resources that would affect page logic and content SUCCEEDED loading for at least 0.5s\n` +
228
+ `- media-idle: wait for no additional resources, including media resources, SUCCEEDED loading for at least 0.5s\n\n`,
229
+ in: 'header',
230
+ schema: { type: 'string' }
231
+ },
232
  'X-Engine': {
233
  description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, cf-browser-rendering',
234
  in: 'header',
 
421
  @Prop()
422
  markdown?: TurnDownTweakableOptions;
423
 
424
+ @Prop({
425
+ type: RESPOND_TIMING,
426
+ })
427
+ respondTiming?: RESPOND_TIMING;
428
+
429
  static override from(input: any) {
430
  const instance = super.from(input) as CrawlerOptions;
431
  const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
 
519
  if (instance.engine) {
520
  instance.engine = instance.engine.toLowerCase();
521
  }
522
+ if (instance.engine === 'vlm') {
523
  instance.engine = ENGINE_TYPE.BROWSER;
524
  instance.respondWith = CONTENT_FORMAT.VLM;
525
+ } else if (instance.engine === 'readerlm-v2') {
526
  instance.engine = ENGINE_TYPE.AUTO;
527
  instance.respondWith = CONTENT_FORMAT.READER_LM;
528
  }
 
579
  const dnt = ctx?.get('dnt');
580
  instance.doNotTrack ??= (parseInt(dnt || '') || null);
581
 
582
+ const respondTiming = ctx?.get('x-respond-timing');
583
+ if (respondTiming) {
584
+ instance.respondTiming ??= respondTiming as RESPOND_TIMING;
585
+ }
586
+ instance.respondTiming ??= (
587
+ instance.timeout ||
588
+ instance.respondWith.includes('shot') ||
589
+ instance.respondWith.includes('vlm')
590
+ ) ? RESPOND_TIMING.MEDIA_IDLE : RESPOND_TIMING.MUTATION_IDLE;
591
+
592
  if (instance.cacheTolerance) {
593
  instance.cacheTolerance = instance.cacheTolerance * 1000;
594
  }
 
600
  return instance;
601
  }
602
 
603
+ isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
604
+ if (this.waitForSelector?.length) {
605
  return false;
606
  }
607
+ if (this.respondTiming === RESPOND_TIMING.HTML && snapshot.html) {
608
+ return true;
609
+ }
610
+ if (this.respondTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded) {
611
+ const now = Date.now();
612
+ if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
613
+ return true;
614
+ }
615
+ }
616
+ if ((this.respondWith.includes('vlm') || this.respondWith.includes('pageshot')) && !snapshot.pageshot) {
617
+ return false;
618
+ }
619
+ if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
620
+ return false;
621
+ }
622
+ if (this.respondTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
623
+ return true;
624
+ }
625
+ if (this.respondTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded) {
626
+ const now = Date.now();
627
+ if ((snapshot.lastContentResourceLoaded + 500) < now) {
628
+ return true;
629
+ }
630
+ }
631
+
632
+ if (this.respondTiming === RESPOND_TIMING.NETWORK_IDLE) {
633
  return false;
634
  }
635
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
 
639
  return false;
640
  }
641
 
642
+ return false;
643
  }
644
 
645
  isCacheQueryApplicable() {
 
667
  }
668
 
669
  browserIsNotRequired() {
670
+ if (this.respondTiming && this.respondTiming !== RESPOND_TIMING.HTML) {
671
+ return false;
672
+ }
673
  if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
674
  return false;
675
  }
src/services/puppeteer.ts CHANGED
@@ -55,13 +55,10 @@ export interface PageSnapshot {
55
  href: string;
56
  rebase?: string;
57
  html: string;
58
- htmlModifiedByJs?: boolean;
59
  shadowExpanded?: string;
60
  text: string;
61
  status?: number;
62
  statusText?: string;
63
- isIntermediate?: boolean;
64
- isFromCache?: boolean;
65
  parsed?: Partial<ReadabilityParsed> | null;
66
  screenshot?: Buffer;
67
  pageshot?: Buffer;
@@ -70,6 +67,11 @@ export interface PageSnapshot {
70
  maxElemDepth?: number;
71
  elemCount?: number;
72
  childFrames?: PageSnapshot[];
 
 
 
 
 
73
  }
74
 
75
  export interface ExtendedSnapshot extends PageSnapshot {
@@ -374,9 +376,10 @@ function shadowDomPresent(rootElement = document.documentElement) {
374
  return false;
375
  }
376
 
377
- let initialHTML;
 
 
378
  function giveSnapshot(stopActiveSnapshot) {
379
- initialHTML ??= document.documentElement?.outerHTML;
380
  if (stopActiveSnapshot) {
381
  window.haltSnapshot = true;
382
  }
@@ -392,17 +395,14 @@ function giveSnapshot(stopActiveSnapshot) {
392
  description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
393
  href: document.location.href,
394
  html: document.documentElement?.outerHTML,
395
- htmlModifiedByJs: false,
396
  text: document.body?.innerText,
397
  shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
398
  parsed: parsed,
399
  imgs: [],
400
  maxElemDepth: domAnalysis.maxDepth,
401
  elemCount: domAnalysis.elementCount,
 
402
  };
403
- if (initialHTML) {
404
- r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
405
- }
406
  if (document.baseURI !== r.href) {
407
  r.rebase = document.baseURI;
408
  }
@@ -445,9 +445,20 @@ window.briefImgs = briefImgs;
445
  })();
446
  `;
447
 
 
 
 
 
 
 
 
 
448
  class PageReqCtrlKit {
449
  reqSet: Set<HTTPRequest> = new Set();
450
  blockers: Deferred<void>[] = [];
 
 
 
451
 
452
  constructor(
453
  public concurrency: number,
@@ -472,6 +483,15 @@ class PageReqCtrlKit {
472
  this.reqSet.delete(req);
473
  const deferred = this.blockers.shift();
474
  deferred?.resolve();
 
 
 
 
 
 
 
 
 
475
  }
476
  }
477
 
@@ -491,7 +511,7 @@ export class PuppeteerControl extends AsyncService {
491
  lastPageCratedAt: number = 0;
492
  ua: string = '';
493
 
494
- concurrentRequestsPerPage: number = 16;
495
  pageReqCtrl = new WeakMap<Page, PageReqCtrlKit>();
496
 
497
  lastReqSentAt: number = 0;
@@ -1050,6 +1070,11 @@ export class PuppeteerControl extends AsyncService {
1050
  return;
1051
  }
1052
  snapshot = s;
 
 
 
 
 
1053
  if (s?.maxElemDepth && s.maxElemDepth > 256) {
1054
  return;
1055
  }
 
55
  href: string;
56
  rebase?: string;
57
  html: string;
 
58
  shadowExpanded?: string;
59
  text: string;
60
  status?: number;
61
  statusText?: string;
 
 
62
  parsed?: Partial<ReadabilityParsed> | null;
63
  screenshot?: Buffer;
64
  pageshot?: Buffer;
 
67
  maxElemDepth?: number;
68
  elemCount?: number;
69
  childFrames?: PageSnapshot[];
70
+ isIntermediate?: boolean;
71
+ isFromCache?: boolean;
72
+ lastMutationIdle?: number;
73
+ lastContentResourceLoaded?: number;
74
+ lastMediaResourceLoaded?: number;
75
  }
76
 
77
  export interface ExtendedSnapshot extends PageSnapshot {
 
376
  return false;
377
  }
378
 
379
+ let lastMutationIdle = 0;
380
+ document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
381
+
382
  function giveSnapshot(stopActiveSnapshot) {
 
383
  if (stopActiveSnapshot) {
384
  window.haltSnapshot = true;
385
  }
 
395
  description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
396
  href: document.location.href,
397
  html: document.documentElement?.outerHTML,
 
398
  text: document.body?.innerText,
399
  shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
400
  parsed: parsed,
401
  imgs: [],
402
  maxElemDepth: domAnalysis.maxDepth,
403
  elemCount: domAnalysis.elementCount,
404
+ lastMutationIdle,
405
  };
 
 
 
406
  if (document.baseURI !== r.href) {
407
  r.rebase = document.baseURI;
408
  }
 
445
  })();
446
  `;
447
 
448
+ const documentResourceTypes = new Set([
449
+ 'document', 'script', 'xhr', 'fetch', 'prefetch', 'eventsource', 'websocket', 'preflight'
450
+ ]);
451
+ const mediaResourceTypes = new Set([
452
+ 'stylesheet', 'image', 'font', 'media'
453
+ ]);
454
+
455
+
456
  class PageReqCtrlKit {
457
  reqSet: Set<HTTPRequest> = new Set();
458
  blockers: Deferred<void>[] = [];
459
+ lastResourceLoadedAt: number = 0;
460
+ lastContentResourceLoadedAt: number = 0;
461
+ lastMediaResourceLoadedAt: number = 0;
462
 
463
  constructor(
464
  public concurrency: number,
 
483
  this.reqSet.delete(req);
484
  const deferred = this.blockers.shift();
485
  deferred?.resolve();
486
+ const now = Date.now();
487
+ this.lastResourceLoadedAt = now;
488
+ const typ = req.resourceType();
489
+ if (documentResourceTypes.has(typ)) {
490
+ this.lastContentResourceLoadedAt = now;
491
+ }
492
+ if (mediaResourceTypes.has(typ)) {
493
+ this.lastMediaResourceLoadedAt = now;
494
+ }
495
  }
496
  }
497
 
 
511
  lastPageCratedAt: number = 0;
512
  ua: string = '';
513
 
514
+ concurrentRequestsPerPage: number = 32;
515
  pageReqCtrl = new WeakMap<Page, PageReqCtrlKit>();
516
 
517
  lastReqSentAt: number = 0;
 
1070
  return;
1071
  }
1072
  snapshot = s;
1073
+ if (snapshot) {
1074
+ const kit = this.pageReqCtrl.get(page);
1075
+ snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
1076
+ snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
1077
+ }
1078
  if (s?.maxElemDepth && s.maxElemDepth > 256) {
1079
  return;
1080
  }