nomagick committed on
Commit
ff595c2
·
unverified ·
1 Parent(s): a5e3c2d

improve: cache usage and detection of js-required pages

Browse files
src/api/crawler.ts CHANGED
@@ -509,7 +509,7 @@ export class CrawlerHost extends RPCHost {
509
  return digest;
510
  }
511
 
512
- async queryCache(urlToCrawl: URL, cacheTolerance: number) {
513
  const digest = this.getUrlDigest(urlToCrawl);
514
 
515
  const cache = (
@@ -526,8 +526,10 @@ export class CrawlerHost extends RPCHost {
526
  }))
527
  )?.[0];
528
 
 
 
529
  if (!cache) {
530
- return undefined;
531
  }
532
 
533
  const age = Date.now() - cache.createdAt.valueOf();
@@ -561,7 +563,7 @@ export class CrawlerHost extends RPCHost {
561
  return undefined;
562
  }
563
 
564
- return {
565
  isFresh: !stale,
566
  ...cache,
567
  snapshot: {
@@ -585,7 +587,7 @@ export class CrawlerHost extends RPCHost {
585
  url: urlToCrawl.toString(),
586
  createdAt: nowDate,
587
  expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
588
- htmlModifiedByJs: snapshot.htmlModifiedByJs,
589
  urlPathDigest: digest,
590
  });
591
 
@@ -726,19 +728,21 @@ export class CrawlerHost extends RPCHost {
726
  return;
727
  }
728
 
729
- let cache;
730
-
731
- if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
732
- const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
733
- cache = await this.queryCache(urlToCrawl, cacheTolerance);
734
- }
735
 
736
- if (cache?.htmlModifiedByJs === false) {
 
737
  if (crawlerOpts) {
738
  crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
739
  }
740
  }
741
 
 
 
 
 
 
742
  if (cache?.isFresh &&
743
  (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
744
  (_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
 
509
  return digest;
510
  }
511
 
512
+ async *queryCache(urlToCrawl: URL, cacheTolerance: number) {
513
  const digest = this.getUrlDigest(urlToCrawl);
514
 
515
  const cache = (
 
526
  }))
527
  )?.[0];
528
 
529
+ yield cache;
530
+
531
  if (!cache) {
532
+ return;
533
  }
534
 
535
  const age = Date.now() - cache.createdAt.valueOf();
 
563
  return undefined;
564
  }
565
 
566
+ yield {
567
  isFresh: !stale,
568
  ...cache,
569
  snapshot: {
 
587
  url: urlToCrawl.toString(),
588
  createdAt: nowDate,
589
  expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
590
+ htmlSignificantlyModifiedByJs: snapshot.htmlSignificantlyModifiedByJs,
591
  urlPathDigest: digest,
592
  });
593
 
 
728
  return;
729
  }
730
 
731
+ const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
732
+ const cacheIt = this.queryCache(urlToCrawl, cacheTolerance);
 
 
 
 
733
 
734
+ let cache = (await cacheIt.next()).value;
735
+ if (cache?.htmlSignificantlyModifiedByJs === false) {
736
  if (crawlerOpts) {
737
  crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
738
  }
739
  }
740
 
741
+ if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
742
+ cache = (await cacheIt.next()).value;
743
+ }
744
+ cacheIt.return(undefined);
745
+
746
  if (cache?.isFresh &&
747
  (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
748
  (_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
src/db/crawled.ts CHANGED
@@ -22,7 +22,7 @@ export class Crawled extends FirestoreRecord {
22
  urlPathDigest!: string;
23
 
24
  @Prop()
25
- htmlModifiedByJs?: boolean;
26
 
27
  @Prop()
28
  snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
 
22
  urlPathDigest!: string;
23
 
24
  @Prop()
25
+ htmlSignificantlyModifiedByJs?: boolean;
26
 
27
  @Prop()
28
  snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
src/dto/crawler-options.ts CHANGED
@@ -615,9 +615,6 @@ export class CrawlerOptions extends AutoCastable {
615
  return false;
616
  }
617
  const presumedTiming = this.presumedRespondTiming;
618
- if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
619
- return true;
620
- }
621
  if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
622
  const now = Date.now();
623
  if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
@@ -636,10 +633,12 @@ export class CrawlerOptions extends AutoCastable {
636
  return true;
637
  }
638
  }
639
-
640
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
641
  return false;
642
  }
 
 
 
643
  if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) {
644
  return false;
645
  }
 
615
  return false;
616
  }
617
  const presumedTiming = this.presumedRespondTiming;
 
 
 
618
  if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
619
  const now = Date.now();
620
  if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
 
633
  return true;
634
  }
635
  }
 
636
  if (this.injectFrameScript?.length || this.injectPageScript?.length) {
637
  return false;
638
  }
639
+ if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
640
+ return true;
641
+ }
642
  if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) {
643
  return false;
644
  }
src/services/puppeteer.ts CHANGED
@@ -6,8 +6,7 @@ import { container, singleton } from 'tsyringe';
6
 
7
  import type { Browser, CookieParam, GoToOptions, HTTPRequest, HTTPResponse, Page, Viewport } from 'puppeteer';
8
  import type { Cookie } from 'set-cookie-parser';
9
- import puppeteer from 'puppeteer-extra';
10
- import { TimeoutError } from 'puppeteer';
11
 
12
  import { Defer, Deferred } from 'civkit/defer';
13
  import { AssertionFailureError, ParamValidationError } from 'civkit/civ-rpc';
@@ -15,7 +14,6 @@ import { AsyncService } from 'civkit/async-service';
15
  import { FancyFile } from 'civkit/fancy-file';
16
  import { delay } from 'civkit/timeout';
17
 
18
- import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
19
  import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
20
  import { CurlControl } from './curl';
21
  import { BlackHoleDetector } from './blackhole-detector';
@@ -55,7 +53,7 @@ export interface PageSnapshot {
55
  href: string;
56
  rebase?: string;
57
  html: string;
58
- htmlModifiedByJs?: boolean;
59
  shadowExpanded?: string;
60
  text: string;
61
  status?: number;
@@ -110,11 +108,6 @@ export interface ScrappingOptions {
110
 
111
  }
112
 
113
- puppeteer.use(puppeteerBlockResources({
114
- blockedTypes: new Set(['media']),
115
- interceptResolutionPriority: 1,
116
- }));
117
-
118
  const SIMULATE_SCROLL = `
119
  (function () {
120
  function createIntersectionObserverEntry(target, isIntersecting, timestamp) {
@@ -265,7 +258,7 @@ function briefImgs(elem) {
265
  };
266
  });
267
  }
268
- function getMaxDepthAndCountUsingTreeWalker(root) {
269
  let maxDepth = 0;
270
  let currentDepth = 0;
271
  let elementCount = 0;
@@ -378,11 +371,10 @@ function shadowDomPresent(rootElement = document.documentElement) {
378
  }
379
 
380
  let lastMutationIdle = 0;
381
- let initialHTML;
382
  document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
383
 
384
- function giveSnapshot(stopActiveSnapshot) {
385
- initialHTML ??= document.documentElement?.outerHTML;
386
  if (stopActiveSnapshot) {
387
  window.haltSnapshot = true;
388
  }
@@ -392,13 +384,18 @@ function giveSnapshot(stopActiveSnapshot) {
392
  } catch (err) {
393
  void 0;
394
  }
395
- const domAnalysis = getMaxDepthAndCountUsingTreeWalker(document.documentElement);
 
 
 
 
 
396
  const r = {
397
  title: document.title,
398
  description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
399
  href: document.location.href,
400
  html: document.documentElement?.outerHTML,
401
- htmlModifiedByJs: false,
402
  text: document.body?.innerText,
403
  shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
404
  parsed: parsed,
@@ -407,9 +404,6 @@ function giveSnapshot(stopActiveSnapshot) {
407
  elemCount: domAnalysis.elementCount,
408
  lastMutationIdle,
409
  };
410
- if (initialHTML) {
411
- r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
412
- }
413
  if (document.baseURI !== r.href) {
414
  r.rebase = document.baseURI;
415
  }
@@ -446,6 +440,7 @@ function waitForSelector(selectorText) {
446
  });
447
  });
448
  }
 
449
  window.waitForSelector = waitForSelector;
450
  window.giveSnapshot = giveSnapshot;
451
  window.briefImgs = briefImgs;
@@ -566,7 +561,7 @@ export class PuppeteerControl extends AsyncService {
566
  }
567
  this.browser = await puppeteer.launch({
568
  timeout: 10_000,
569
- headless: true,
570
  executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
571
  args: ['--disable-dev-shm-usage']
572
  }).catch((err: any) => {
@@ -735,23 +730,45 @@ export class PuppeteerControl extends AsyncService {
735
  await page.evaluateOnNewDocument(`
736
  (function () {
737
  if (window.self === window.top) {
738
- let lastTextLength = 0;
 
739
  const handlePageLoad = () => {
740
- const thisTextLength = (document.body.innerText || '').length;
741
- const deltaLength = Math.abs(thisTextLength - lastTextLength);
742
- if (10 * deltaLength < lastTextLength) {
743
- // Change is not significant
 
 
 
744
  return;
745
  }
746
- lastTextLength = thisTextLength;
747
- if (window.haltSnapshot) {
 
 
 
 
 
 
 
 
748
  return;
749
  }
750
- const r = giveSnapshot();
 
 
 
 
751
  window.reportSnapshot(r);
752
  };
753
- document.addEventListener('readystatechange', handlePageLoad);
 
 
 
 
754
  document.addEventListener('load', handlePageLoad);
 
 
755
  document.addEventListener('mutationIdle', handlePageLoad);
756
  }
757
  document.addEventListener('DOMContentLoaded', ()=> window.simulateScroll(), { once: true });
@@ -772,11 +789,13 @@ export class PuppeteerControl extends AsyncService {
772
  if (this.__loadedPage.length) {
773
  thePage = this.__loadedPage.shift();
774
  if (this.__loadedPage.length <= 1) {
775
- this.newPage()
776
- .then((r) => this.__loadedPage.push(r))
777
- .catch((err) => {
778
- this.logger.warn(`Failed to load new page ahead of time`, { err });
779
- });
 
 
780
  }
781
  }
782
 
@@ -860,6 +879,10 @@ export class PuppeteerControl extends AsyncService {
860
  return req.continue(overrides, 0);
861
  }
862
  const typ = req.resourceType();
 
 
 
 
863
  if (!options.proxyResources) {
864
  const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
865
  if (!isDocRequest) {
@@ -925,7 +948,7 @@ export class PuppeteerControl extends AsyncService {
925
  status: curled.status,
926
  headers: _.omit(curled.headers, 'result'),
927
  contentType: curled.contentType,
928
- }, 999);
929
  }
930
  const body = await readFile(await curled.file.filePath);
931
  if (req.isInterceptResolutionHandled()) {
@@ -936,7 +959,7 @@ export class PuppeteerControl extends AsyncService {
936
  headers: _.omit(curled.headers, 'result'),
937
  contentType: curled.contentType,
938
  body: Uint8Array.from(body),
939
- }, 999);
940
  }
941
  options.sideLoad ??= curled.sideLoadOpts;
942
  _.merge(options.sideLoad, curled.sideLoadOpts);
@@ -945,7 +968,7 @@ export class PuppeteerControl extends AsyncService {
945
  return req.respond({
946
  status: firstReq.result!.code,
947
  headers: _.omit(firstReq, 'result'),
948
- }, 999);
949
  } catch (err: any) {
950
  this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
951
  }
 
6
 
7
  import type { Browser, CookieParam, GoToOptions, HTTPRequest, HTTPResponse, Page, Viewport } from 'puppeteer';
8
  import type { Cookie } from 'set-cookie-parser';
9
+ import puppeteer, { TimeoutError } from 'puppeteer';
 
10
 
11
  import { Defer, Deferred } from 'civkit/defer';
12
  import { AssertionFailureError, ParamValidationError } from 'civkit/civ-rpc';
 
14
  import { FancyFile } from 'civkit/fancy-file';
15
  import { delay } from 'civkit/timeout';
16
 
 
17
  import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
18
  import { CurlControl } from './curl';
19
  import { BlackHoleDetector } from './blackhole-detector';
 
53
  href: string;
54
  rebase?: string;
55
  html: string;
56
+ htmlSignificantlyModifiedByJs?: boolean;
57
  shadowExpanded?: string;
58
  text: string;
59
  status?: number;
 
108
 
109
  }
110
 
 
 
 
 
 
111
  const SIMULATE_SCROLL = `
112
  (function () {
113
  function createIntersectionObserverEntry(target, isIntersecting, timestamp) {
 
258
  };
259
  });
260
  }
261
+ function getMaxDepthAndElemCountUsingTreeWalker(root=document.documentElement) {
262
  let maxDepth = 0;
263
  let currentDepth = 0;
264
  let elementCount = 0;
 
371
  }
372
 
373
  let lastMutationIdle = 0;
374
+ let initialAnalytics;
375
  document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
376
 
377
+ function giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) {
 
378
  if (stopActiveSnapshot) {
379
  window.haltSnapshot = true;
380
  }
 
384
  } catch (err) {
385
  void 0;
386
  }
387
+ const domAnalysis = overrideDomAnalysis || getMaxDepthAndElemCountUsingTreeWalker(document.documentElement);
388
+ initialAnalytics ??= domAnalysis;
389
+
390
+ const thisElemCount = domAnalysis.elementCount;
391
+ const initialElemCount = initialAnalytics.elementCount;
392
+ Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON)
393
  const r = {
394
  title: document.title,
395
  description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
396
  href: document.location.href,
397
  html: document.documentElement?.outerHTML,
398
+ htmlSignificantlyModifiedByJs: Boolean(Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON) > 0.1),
399
  text: document.body?.innerText,
400
  shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
401
  parsed: parsed,
 
404
  elemCount: domAnalysis.elementCount,
405
  lastMutationIdle,
406
  };
 
 
 
407
  if (document.baseURI !== r.href) {
408
  r.rebase = document.baseURI;
409
  }
 
440
  });
441
  });
442
  }
443
+ window.getMaxDepthAndElemCountUsingTreeWalker = getMaxDepthAndElemCountUsingTreeWalker;
444
  window.waitForSelector = waitForSelector;
445
  window.giveSnapshot = giveSnapshot;
446
  window.briefImgs = briefImgs;
 
561
  }
562
  this.browser = await puppeteer.launch({
563
  timeout: 10_000,
564
+ headless: false,
565
  executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
566
  args: ['--disable-dev-shm-usage']
567
  }).catch((err: any) => {
 
730
  await page.evaluateOnNewDocument(`
731
  (function () {
732
  if (window.self === window.top) {
733
+ let lastAnalytics;
734
+ let lastReportedAt = 0;
735
  const handlePageLoad = () => {
736
+ const now = Date.now();
737
+ const dt = now - lastReportedAt;
738
+ const previousAnalytics = lastAnalytics;
739
+ const thisAnalytics = getMaxDepthAndElemCountUsingTreeWalker();
740
+ let dElem = 0;
741
+
742
+ if (window.haltSnapshot) {
743
  return;
744
  }
745
+
746
+ const thisElemCount = thisAnalytics.elementCount;
747
+ if (previousAnalytics) {
748
+ const previousElemCount = previousAnalytics.elementCount;
749
+
750
+ const delta = Math.abs(thisElemCount - previousElemCount);
751
+ dElem = delta /(previousElemCount + Number.EPSILON);
752
+ }
753
+
754
+ if (dt < 1500 && dElem < 0.1) {
755
  return;
756
  }
757
+
758
+ lastAnalytics = thisAnalytics;
759
+ lastReportedAt = now;
760
+
761
+ const r = giveSnapshot(false, lastAnalytics);
762
  window.reportSnapshot(r);
763
  };
764
+ document.addEventListener('readystatechange', ()=> {
765
+ if (document.readyState === 'interactive') {
766
+ handlePageLoad();
767
+ }
768
+ });
769
  document.addEventListener('load', handlePageLoad);
770
+ window.addEventListener('load', handlePageLoad);
771
+ document.addEventListener('DOMContentLoaded', handlePageLoad);
772
  document.addEventListener('mutationIdle', handlePageLoad);
773
  }
774
  document.addEventListener('DOMContentLoaded', ()=> window.simulateScroll(), { once: true });
 
789
  if (this.__loadedPage.length) {
790
  thePage = this.__loadedPage.shift();
791
  if (this.__loadedPage.length <= 1) {
792
+ process.nextTick(() => {
793
+ this.newPage()
794
+ .then((r) => this.__loadedPage.push(r))
795
+ .catch((err) => {
796
+ this.logger.warn(`Failed to load new page ahead of time`, { err });
797
+ });
798
+ });
799
  }
800
  }
801
 
 
879
  return req.continue(overrides, 0);
880
  }
881
  const typ = req.resourceType();
882
+ if (typ === 'media') {
883
+ // Non-cooperative answer to block all media requests.
884
+ return req.abort('blockedbyclient');
885
+ }
886
  if (!options.proxyResources) {
887
  const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
888
  if (!isDocRequest) {
 
948
  status: curled.status,
949
  headers: _.omit(curled.headers, 'result'),
950
  contentType: curled.contentType,
951
+ }, 3);
952
  }
953
  const body = await readFile(await curled.file.filePath);
954
  if (req.isInterceptResolutionHandled()) {
 
959
  headers: _.omit(curled.headers, 'result'),
960
  contentType: curled.contentType,
961
  body: Uint8Array.from(body),
962
+ }, 3);
963
  }
964
  options.sideLoad ??= curled.sideLoadOpts;
965
  _.merge(options.sideLoad, curled.sideLoadOpts);
 
968
  return req.respond({
969
  status: firstReq.result!.code,
970
  headers: _.omit(firstReq, 'result'),
971
+ }, 3);
972
  } catch (err: any) {
973
  this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
974
  }