nomagick committed on
Commit
ec7c2ab
·
1 Parent(s): 3235906

fix: scrap timing

Browse files
backend/functions/src/services/puppeteer.ts CHANGED
@@ -12,6 +12,7 @@ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
12
  import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
13
  import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
14
  import { Readability } from '@mozilla/readability';
 
15
  const tldExtract = require('tld-extract');
16
 
17
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
@@ -370,9 +371,6 @@ const handlePageLoad = () => {
370
  if (window.haltSnapshot) {
371
  return;
372
  }
373
- if (document.readyState === 'loading') {
374
- return;
375
- }
376
  const thisTextLength = (document.body.innerText || '').length;
377
  const deltaLength = Math.abs(thisTextLength - lastTextLength);
378
  if (10 * deltaLength < lastTextLength) {
@@ -383,7 +381,7 @@ const handlePageLoad = () => {
383
  window.reportSnapshot(r);
384
  lastTextLength = thisTextLength;
385
  };
386
- setInterval(handlePageLoad, 500);
387
  document.addEventListener('readystatechange', handlePageLoad);
388
  document.addEventListener('load', handlePageLoad);
389
  `);
@@ -495,62 +493,116 @@ document.addEventListener('load', handlePageLoad);
495
  );
496
  });
497
 
 
 
498
  const gotoPromise = page.goto(url, {
499
  waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
500
- timeout: options?.timeoutMs || 30_000
501
  })
502
  .catch((err) => {
503
- this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
 
 
 
 
 
 
 
 
504
  return Promise.reject(new AssertionFailureError({
505
  message: `Failed to goto ${url}: ${err}`,
506
  cause: err,
507
  }));
508
- }).finally(async () => {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  if (!snapshot?.html) {
510
- finalized = true;
511
- return;
 
 
512
  }
513
- snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
514
- screenshot = await page.screenshot();
515
- if ((!snapshot.title || !snapshot.parsed?.content) && !(snapshot.pdfs?.length)) {
516
- const salvaged = await this.salvage(url, page);
517
- if (salvaged) {
518
- snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
519
- screenshot = await page.screenshot();
520
  }
 
 
521
  }
 
522
  finalized = true;
523
- this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
524
- this.emit(
525
- 'crawled',
526
- { ...snapshot, screenshot },
527
- { ...options, url: parsedUrl }
528
- );
 
 
529
  });
 
530
  if (options?.waitForSelector) {
531
- const waitPromise = Array.isArray(options.waitForSelector) ? Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x))) : page.waitForSelector(options.waitForSelector);
532
- waitPromise
533
- .then(async () => {
534
- snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
535
- screenshot = await page.screenshot();
536
- finalized = true;
537
- nextSnapshotDeferred.resolve(snapshot);
538
- })
539
- .catch((err) => {
540
- this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
541
- });
 
 
 
 
 
 
 
 
 
542
  }
543
 
544
  try {
545
  let lastHTML = snapshot?.html;
546
  while (true) {
547
  const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
 
 
 
548
  if (options?.minIntervalMs) {
549
  ckpt.push(delay(options.minIntervalMs));
550
  }
551
  let error;
552
  await Promise.race(ckpt).catch((err) => error = err);
553
  if (finalized && !error) {
 
 
 
 
 
 
554
  yield { ...snapshot, screenshot } as PageSnapshot;
555
  break;
556
  }
@@ -566,7 +618,7 @@ document.addEventListener('load', handlePageLoad);
566
  }
567
  }
568
  } finally {
569
- gotoPromise.finally(() => {
570
  page.off('snapshot', hdl);
571
  this.ditchPage(page);
572
  });
 
12
  import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
13
  import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
14
  import { Readability } from '@mozilla/readability';
15
+ import { TimeoutError } from 'puppeteer';
16
  const tldExtract = require('tld-extract');
17
 
18
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
 
371
  if (window.haltSnapshot) {
372
  return;
373
  }
 
 
 
374
  const thisTextLength = (document.body.innerText || '').length;
375
  const deltaLength = Math.abs(thisTextLength - lastTextLength);
376
  if (10 * deltaLength < lastTextLength) {
 
381
  window.reportSnapshot(r);
382
  lastTextLength = thisTextLength;
383
  };
384
+ setInterval(handlePageLoad, 800);
385
  document.addEventListener('readystatechange', handlePageLoad);
386
  document.addEventListener('load', handlePageLoad);
387
  `);
 
493
  );
494
  });
495
 
496
+ const timeout = options?.timeoutMs || 30_000;
497
+
498
  const gotoPromise = page.goto(url, {
499
  waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
500
+ timeout,
501
  })
502
  .catch((err) => {
503
+ if (err instanceof TimeoutError) {
504
+ this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) });
505
+ return new AssertionFailureError({
506
+ message: `Failed to goto ${url}: ${err}`,
507
+ cause: err,
508
+ });
509
+ }
510
+
511
+ this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err: marshalErrorLike(err) });
512
  return Promise.reject(new AssertionFailureError({
513
  message: `Failed to goto ${url}: ${err}`,
514
  cause: err,
515
  }));
516
+ }).then(async (stuff) => {
517
+ // This check is necessary because, without a snapshot, the state of the page is unknown;
518
+ // calling evaluate directly in that case may stall the process.
519
+ if (!snapshot) {
520
+ if (stuff instanceof Error) {
521
+ finalized = true;
522
+ throw stuff;
523
+ }
524
+ }
525
+ try {
526
+ snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
527
+ screenshot = await page.screenshot();
528
+ } catch (err: any) {
529
+ this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err: marshalErrorLike(err) });
530
+ if (stuff instanceof Error) {
531
+ finalized = true;
532
+ throw stuff;
533
+ }
534
+ }
535
  if (!snapshot?.html) {
536
+ if (stuff instanceof Error) {
537
+ finalized = true;
538
+ throw stuff;
539
+ }
540
  }
541
+ try {
542
+ if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) {
543
+ const salvaged = await this.salvage(url, page);
544
+ if (salvaged) {
545
+ snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
546
+ screenshot = await page.screenshot();
547
+ }
548
  }
549
+ } catch (err: any) {
550
+ this.logger.warn(`Page ${sn}: Failed to salvage ${url}`, { err: marshalErrorLike(err) });
551
  }
552
+
553
  finalized = true;
554
+ if (snapshot?.html) {
555
+ this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
556
+ this.emit(
557
+ 'crawled',
558
+ { ...snapshot, screenshot },
559
+ { ...options, url: parsedUrl }
560
+ );
561
+ }
562
  });
563
+ let waitForPromise: Promise<any> | undefined;
564
  if (options?.waitForSelector) {
565
+ const t0 = Date.now();
566
+ waitForPromise = nextSnapshotDeferred.promise.then(() => {
567
+ const t1 = Date.now();
568
+ const elapsed = t1 - t0;
569
+ const remaining = timeout - elapsed;
570
+ const thisTimeout = remaining > 100 ? remaining : 100;
571
+ const p = (Array.isArray(options.waitForSelector) ?
572
+ Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
573
+ page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
574
+ .then(async () => {
575
+ snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
576
+ screenshot = await page.screenshot();
577
+ finalized = true;
578
+ })
579
+ .catch((err) => {
580
+ this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
581
+ waitForPromise = undefined;
582
+ });
583
+ return p as any;
584
+ });
585
  }
586
 
587
  try {
588
  let lastHTML = snapshot?.html;
589
  while (true) {
590
  const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
591
+ if (waitForPromise) {
592
+ ckpt.push(waitForPromise);
593
+ }
594
  if (options?.minIntervalMs) {
595
  ckpt.push(delay(options.minIntervalMs));
596
  }
597
  let error;
598
  await Promise.race(ckpt).catch((err) => error = err);
599
  if (finalized && !error) {
600
+ if (!snapshot && !screenshot) {
601
+ if (error) {
602
+ throw error;
603
+ }
604
+ throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
605
+ }
606
  yield { ...snapshot, screenshot } as PageSnapshot;
607
  break;
608
  }
 
618
  }
619
  }
620
  } finally {
621
+ (waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
622
  page.off('snapshot', hdl);
623
  this.ditchPage(page);
624
  });