Spaces:
Build error
Build error
fix: scrap timing
Browse files
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -12,6 +12,7 @@ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
|
| 12 |
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 13 |
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
| 14 |
import { Readability } from '@mozilla/readability';
|
|
|
|
| 15 |
const tldExtract = require('tld-extract');
|
| 16 |
|
| 17 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
|
@@ -370,9 +371,6 @@ const handlePageLoad = () => {
|
|
| 370 |
if (window.haltSnapshot) {
|
| 371 |
return;
|
| 372 |
}
|
| 373 |
-
if (document.readyState === 'loading') {
|
| 374 |
-
return;
|
| 375 |
-
}
|
| 376 |
const thisTextLength = (document.body.innerText || '').length;
|
| 377 |
const deltaLength = Math.abs(thisTextLength - lastTextLength);
|
| 378 |
if (10 * deltaLength < lastTextLength) {
|
|
@@ -383,7 +381,7 @@ const handlePageLoad = () => {
|
|
| 383 |
window.reportSnapshot(r);
|
| 384 |
lastTextLength = thisTextLength;
|
| 385 |
};
|
| 386 |
-
setInterval(handlePageLoad,
|
| 387 |
document.addEventListener('readystatechange', handlePageLoad);
|
| 388 |
document.addEventListener('load', handlePageLoad);
|
| 389 |
`);
|
|
@@ -495,62 +493,116 @@ document.addEventListener('load', handlePageLoad);
|
|
| 495 |
);
|
| 496 |
});
|
| 497 |
|
|
|
|
|
|
|
| 498 |
const gotoPromise = page.goto(url, {
|
| 499 |
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
|
| 500 |
-
timeout
|
| 501 |
})
|
| 502 |
.catch((err) => {
|
| 503 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
return Promise.reject(new AssertionFailureError({
|
| 505 |
message: `Failed to goto ${url}: ${err}`,
|
| 506 |
cause: err,
|
| 507 |
}));
|
| 508 |
-
}).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
if (!snapshot?.html) {
|
| 510 |
-
|
| 511 |
-
|
|
|
|
|
|
|
| 512 |
}
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
}
|
|
|
|
|
|
|
| 521 |
}
|
|
|
|
| 522 |
finalized = true;
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
|
|
|
|
|
|
| 529 |
});
|
|
|
|
| 530 |
if (options?.waitForSelector) {
|
| 531 |
-
const
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
}
|
| 543 |
|
| 544 |
try {
|
| 545 |
let lastHTML = snapshot?.html;
|
| 546 |
while (true) {
|
| 547 |
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
|
|
|
|
|
|
|
|
|
|
| 548 |
if (options?.minIntervalMs) {
|
| 549 |
ckpt.push(delay(options.minIntervalMs));
|
| 550 |
}
|
| 551 |
let error;
|
| 552 |
await Promise.race(ckpt).catch((err) => error = err);
|
| 553 |
if (finalized && !error) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 554 |
yield { ...snapshot, screenshot } as PageSnapshot;
|
| 555 |
break;
|
| 556 |
}
|
|
@@ -566,7 +618,7 @@ document.addEventListener('load', handlePageLoad);
|
|
| 566 |
}
|
| 567 |
}
|
| 568 |
} finally {
|
| 569 |
-
gotoPromise.finally(() => {
|
| 570 |
page.off('snapshot', hdl);
|
| 571 |
this.ditchPage(page);
|
| 572 |
});
|
|
|
|
| 12 |
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 13 |
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
| 14 |
import { Readability } from '@mozilla/readability';
|
| 15 |
+
import { TimeoutError } from 'puppeteer';
|
| 16 |
const tldExtract = require('tld-extract');
|
| 17 |
|
| 18 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
|
|
|
| 371 |
if (window.haltSnapshot) {
|
| 372 |
return;
|
| 373 |
}
|
|
|
|
|
|
|
|
|
|
| 374 |
const thisTextLength = (document.body.innerText || '').length;
|
| 375 |
const deltaLength = Math.abs(thisTextLength - lastTextLength);
|
| 376 |
if (10 * deltaLength < lastTextLength) {
|
|
|
|
| 381 |
window.reportSnapshot(r);
|
| 382 |
lastTextLength = thisTextLength;
|
| 383 |
};
|
| 384 |
+
setInterval(handlePageLoad, 800);
|
| 385 |
document.addEventListener('readystatechange', handlePageLoad);
|
| 386 |
document.addEventListener('load', handlePageLoad);
|
| 387 |
`);
|
|
|
|
| 493 |
);
|
| 494 |
});
|
| 495 |
|
| 496 |
+
const timeout = options?.timeoutMs || 30_000;
|
| 497 |
+
|
| 498 |
const gotoPromise = page.goto(url, {
|
| 499 |
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
|
| 500 |
+
timeout,
|
| 501 |
})
|
| 502 |
.catch((err) => {
|
| 503 |
+
if (err instanceof TimeoutError) {
|
| 504 |
+
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) });
|
| 505 |
+
return new AssertionFailureError({
|
| 506 |
+
message: `Failed to goto ${url}: ${err}`,
|
| 507 |
+
cause: err,
|
| 508 |
+
});
|
| 509 |
+
}
|
| 510 |
+
|
| 511 |
+
this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err: marshalErrorLike(err) });
|
| 512 |
return Promise.reject(new AssertionFailureError({
|
| 513 |
message: `Failed to goto ${url}: ${err}`,
|
| 514 |
cause: err,
|
| 515 |
}));
|
| 516 |
+
}).then(async (stuff) => {
|
| 517 |
+
// This check is necessary because without snapshot, the condition of the page is unclear
|
| 518 |
+
// Calling evaluate directly may stall the process.
|
| 519 |
+
if (!snapshot) {
|
| 520 |
+
if (stuff instanceof Error) {
|
| 521 |
+
finalized = true;
|
| 522 |
+
throw stuff;
|
| 523 |
+
}
|
| 524 |
+
}
|
| 525 |
+
try {
|
| 526 |
+
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 527 |
+
screenshot = await page.screenshot();
|
| 528 |
+
} catch (err: any) {
|
| 529 |
+
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err: marshalErrorLike(err) });
|
| 530 |
+
if (stuff instanceof Error) {
|
| 531 |
+
finalized = true;
|
| 532 |
+
throw stuff;
|
| 533 |
+
}
|
| 534 |
+
}
|
| 535 |
if (!snapshot?.html) {
|
| 536 |
+
if (stuff instanceof Error) {
|
| 537 |
+
finalized = true;
|
| 538 |
+
throw stuff;
|
| 539 |
+
}
|
| 540 |
}
|
| 541 |
+
try {
|
| 542 |
+
if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) {
|
| 543 |
+
const salvaged = await this.salvage(url, page);
|
| 544 |
+
if (salvaged) {
|
| 545 |
+
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 546 |
+
screenshot = await page.screenshot();
|
| 547 |
+
}
|
| 548 |
}
|
| 549 |
+
} catch (err: any) {
|
| 550 |
+
this.logger.warn(`Page ${sn}: Failed to salvage ${url}`, { err: marshalErrorLike(err) });
|
| 551 |
}
|
| 552 |
+
|
| 553 |
finalized = true;
|
| 554 |
+
if (snapshot?.html) {
|
| 555 |
+
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
| 556 |
+
this.emit(
|
| 557 |
+
'crawled',
|
| 558 |
+
{ ...snapshot, screenshot },
|
| 559 |
+
{ ...options, url: parsedUrl }
|
| 560 |
+
);
|
| 561 |
+
}
|
| 562 |
});
|
| 563 |
+
let waitForPromise: Promise<any> | undefined;
|
| 564 |
if (options?.waitForSelector) {
|
| 565 |
+
const t0 = Date.now();
|
| 566 |
+
waitForPromise = nextSnapshotDeferred.promise.then(() => {
|
| 567 |
+
const t1 = Date.now();
|
| 568 |
+
const elapsed = t1 - t0;
|
| 569 |
+
const remaining = timeout - elapsed;
|
| 570 |
+
const thisTimeout = remaining > 100 ? remaining : 100;
|
| 571 |
+
const p = (Array.isArray(options.waitForSelector) ?
|
| 572 |
+
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
|
| 573 |
+
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
|
| 574 |
+
.then(async () => {
|
| 575 |
+
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
| 576 |
+
screenshot = await page.screenshot();
|
| 577 |
+
finalized = true;
|
| 578 |
+
})
|
| 579 |
+
.catch((err) => {
|
| 580 |
+
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
|
| 581 |
+
waitForPromise = undefined;
|
| 582 |
+
});
|
| 583 |
+
return p as any;
|
| 584 |
+
});
|
| 585 |
}
|
| 586 |
|
| 587 |
try {
|
| 588 |
let lastHTML = snapshot?.html;
|
| 589 |
while (true) {
|
| 590 |
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
|
| 591 |
+
if (waitForPromise) {
|
| 592 |
+
ckpt.push(waitForPromise);
|
| 593 |
+
}
|
| 594 |
if (options?.minIntervalMs) {
|
| 595 |
ckpt.push(delay(options.minIntervalMs));
|
| 596 |
}
|
| 597 |
let error;
|
| 598 |
await Promise.race(ckpt).catch((err) => error = err);
|
| 599 |
if (finalized && !error) {
|
| 600 |
+
if (!snapshot && !screenshot) {
|
| 601 |
+
if (error) {
|
| 602 |
+
throw error;
|
| 603 |
+
}
|
| 604 |
+
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
| 605 |
+
}
|
| 606 |
yield { ...snapshot, screenshot } as PageSnapshot;
|
| 607 |
break;
|
| 608 |
}
|
|
|
|
| 618 |
}
|
| 619 |
}
|
| 620 |
} finally {
|
| 621 |
+
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
|
| 622 |
page.off('snapshot', hdl);
|
| 623 |
this.ditchPage(page);
|
| 624 |
});
|