Spaces:
Build error
Build error
Behavior change: ditch content-based return timing; adopt mutationIdle as the default timing
Browse files
- src/api/crawler.ts +13 -70
- src/dto/crawler-options.ts +68 -9
- src/services/puppeteer.ts +35 -10
src/api/crawler.ts
CHANGED
|
@@ -19,7 +19,6 @@ import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE }
|
|
| 19 |
|
| 20 |
import { Crawled } from '../db/crawled';
|
| 21 |
import { DomainBlockade } from '../db/domain-blockade';
|
| 22 |
-
import { DomainProfile } from '../db/domain-profile';
|
| 23 |
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
| 24 |
|
| 25 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
|
@@ -317,6 +316,9 @@ export class CrawlerHost extends RPCHost {
|
|
| 317 |
if (crawlerOptions.robotsTxt) {
|
| 318 |
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
|
| 319 |
}
|
|
|
|
|
|
|
|
|
|
| 320 |
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
| 321 |
const sseStream = new OutputServerEventStream();
|
| 322 |
rpcReflect.return(sseStream);
|
|
@@ -363,10 +365,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 363 |
if (rpcReflect.signal.aborted) {
|
| 364 |
break;
|
| 365 |
}
|
| 366 |
-
if (!crawlerOptions.
|
| 367 |
-
continue;
|
| 368 |
-
}
|
| 369 |
-
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
|
| 370 |
continue;
|
| 371 |
}
|
| 372 |
|
|
@@ -412,11 +411,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 412 |
if (rpcReflect.signal.aborted) {
|
| 413 |
break;
|
| 414 |
}
|
| 415 |
-
if (!crawlerOptions.
|
| 416 |
-
continue;
|
| 417 |
-
}
|
| 418 |
-
|
| 419 |
-
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
|
| 420 |
continue;
|
| 421 |
}
|
| 422 |
|
|
@@ -427,13 +422,11 @@ export class CrawlerHost extends RPCHost {
|
|
| 427 |
}
|
| 428 |
|
| 429 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 430 |
-
|
| 431 |
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
| 432 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
| 433 |
);
|
| 434 |
}
|
| 435 |
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
| 436 |
-
|
| 437 |
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
| 438 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
| 439 |
);
|
|
@@ -705,7 +698,11 @@ export class CrawlerHost extends RPCHost {
|
|
| 705 |
return;
|
| 706 |
}
|
| 707 |
|
| 708 |
-
if (
|
|
|
|
|
|
|
|
|
|
|
|
|
| 709 |
const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
| 710 |
await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
|
| 711 |
await this.curlControl.sideLoad(urlToCrawl, crawlOpts);
|
|
@@ -779,6 +776,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 779 |
|
| 780 |
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
| 781 |
draftSnapshot.title ??= analyzed.title;
|
|
|
|
| 782 |
let fallbackProxyIsUsed = false;
|
| 783 |
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
| 784 |
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
|
@@ -798,6 +796,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 798 |
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
| 799 |
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
| 800 |
draftSnapshot = proxySnapshot;
|
|
|
|
| 801 |
sideLoaded = proxyLoaded;
|
| 802 |
fallbackProxyIsUsed = true;
|
| 803 |
}
|
|
@@ -986,7 +985,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 986 |
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
| 987 |
}
|
| 988 |
|
| 989 |
-
if (opts.
|
| 990 |
crawlOpts.favorScreenshot = true;
|
| 991 |
}
|
| 992 |
|
|
@@ -1142,62 +1141,6 @@ export class CrawlerHost extends RPCHost {
|
|
| 1142 |
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
| 1143 |
}
|
| 1144 |
|
| 1145 |
-
async exploreDirectEngine(knownSnapshot: PageSnapshot) {
|
| 1146 |
-
const realUrl = new URL(knownSnapshot.href);
|
| 1147 |
-
const { digest, path } = this.getDomainProfileUrlDigest(realUrl);
|
| 1148 |
-
const profile = await DomainProfile.fromFirestore(digest);
|
| 1149 |
-
|
| 1150 |
-
if (!profile) {
|
| 1151 |
-
const record = DomainProfile.from({
|
| 1152 |
-
_id: digest,
|
| 1153 |
-
origin: realUrl.origin.toLowerCase(),
|
| 1154 |
-
path,
|
| 1155 |
-
triggerUrl: realUrl.href,
|
| 1156 |
-
engine: knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT,
|
| 1157 |
-
createdAt: new Date(),
|
| 1158 |
-
expireAt: new Date(Date.now() + this.domainProfileRetentionMs),
|
| 1159 |
-
});
|
| 1160 |
-
await DomainProfile.save(record);
|
| 1161 |
-
|
| 1162 |
-
return;
|
| 1163 |
-
}
|
| 1164 |
-
|
| 1165 |
-
if (profile.engine === ENGINE_TYPE.BROWSER) {
|
| 1166 |
-
// Mixed engine, always use browser
|
| 1167 |
-
return;
|
| 1168 |
-
}
|
| 1169 |
-
|
| 1170 |
-
profile.origin = realUrl.origin.toLowerCase();
|
| 1171 |
-
profile.triggerUrl = realUrl.href;
|
| 1172 |
-
profile.path = path;
|
| 1173 |
-
profile.engine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT;
|
| 1174 |
-
profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);
|
| 1175 |
-
|
| 1176 |
-
await DomainProfile.save(profile);
|
| 1177 |
-
|
| 1178 |
-
return;
|
| 1179 |
-
}
|
| 1180 |
-
|
| 1181 |
-
async snapshotNotGoodEnough(snapshot: PageSnapshot) {
|
| 1182 |
-
if (snapshot.pdfs?.length) {
|
| 1183 |
-
return false;
|
| 1184 |
-
}
|
| 1185 |
-
if (!snapshot.title) {
|
| 1186 |
-
return true;
|
| 1187 |
-
}
|
| 1188 |
-
if (snapshot.parsed?.content) {
|
| 1189 |
-
return false;
|
| 1190 |
-
}
|
| 1191 |
-
if (snapshot.html) {
|
| 1192 |
-
const r = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);
|
| 1193 |
-
const tokens = r.tokens;
|
| 1194 |
-
if (tokens < 200) {
|
| 1195 |
-
return true;
|
| 1196 |
-
}
|
| 1197 |
-
}
|
| 1198 |
-
return false;
|
| 1199 |
-
}
|
| 1200 |
-
|
| 1201 |
getDomainProfileUrlDigest(url: URL) {
|
| 1202 |
const pathname = url.pathname;
|
| 1203 |
const pathVec = pathname.split('/');
|
|
|
|
| 19 |
|
| 20 |
import { Crawled } from '../db/crawled';
|
| 21 |
import { DomainBlockade } from '../db/domain-blockade';
|
|
|
|
| 22 |
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
| 23 |
|
| 24 |
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
|
|
|
| 316 |
if (crawlerOptions.robotsTxt) {
|
| 317 |
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
|
| 318 |
}
|
| 319 |
+
if (rpcReflect.signal.aborted) {
|
| 320 |
+
return;
|
| 321 |
+
}
|
| 322 |
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
| 323 |
const sseStream = new OutputServerEventStream();
|
| 324 |
rpcReflect.return(sseStream);
|
|
|
|
| 365 |
if (rpcReflect.signal.aborted) {
|
| 366 |
break;
|
| 367 |
}
|
| 368 |
+
if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
|
|
|
|
|
|
|
|
|
|
| 369 |
continue;
|
| 370 |
}
|
| 371 |
|
|
|
|
| 411 |
if (rpcReflect.signal.aborted) {
|
| 412 |
break;
|
| 413 |
}
|
| 414 |
+
if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
continue;
|
| 416 |
}
|
| 417 |
|
|
|
|
| 422 |
}
|
| 423 |
|
| 424 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
|
|
|
| 425 |
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
| 426 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
| 427 |
);
|
| 428 |
}
|
| 429 |
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
|
|
|
| 430 |
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
| 431 |
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
| 432 |
);
|
|
|
|
| 698 |
return;
|
| 699 |
}
|
| 700 |
|
| 701 |
+
if (
|
| 702 |
+
crawlOpts?.engine === ENGINE_TYPE.CURL ||
|
| 703 |
+
// deprecated name
|
| 704 |
+
crawlOpts?.engine === 'direct'
|
| 705 |
+
) {
|
| 706 |
const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
| 707 |
await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
|
| 708 |
await this.curlControl.sideLoad(urlToCrawl, crawlOpts);
|
|
|
|
| 776 |
|
| 777 |
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
| 778 |
draftSnapshot.title ??= analyzed.title;
|
| 779 |
+
draftSnapshot.isIntermediate = true;
|
| 780 |
let fallbackProxyIsUsed = false;
|
| 781 |
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
| 782 |
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
|
|
|
| 796 |
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
| 797 |
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
| 798 |
draftSnapshot = proxySnapshot;
|
| 799 |
+
draftSnapshot.isIntermediate = true;
|
| 800 |
sideLoaded = proxyLoaded;
|
| 801 |
fallbackProxyIsUsed = true;
|
| 802 |
}
|
|
|
|
| 985 |
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
| 986 |
}
|
| 987 |
|
| 988 |
+
if (opts.respondWith.includes(CONTENT_FORMAT.VLM)) {
|
| 989 |
crawlOpts.favorScreenshot = true;
|
| 990 |
}
|
| 991 |
|
|
|
|
| 1141 |
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
| 1142 |
}
|
| 1143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1144 |
getDomainProfileUrlDigest(url: URL) {
|
| 1145 |
const pathname = url.pathname;
|
| 1146 |
const pathVec = pathname.split('/');
|
src/dto/crawler-options.ts
CHANGED
|
@@ -3,6 +3,7 @@ import { FancyFile } from 'civkit/fancy-file';
|
|
| 3 |
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 4 |
import { Context } from '../services/registry';
|
| 5 |
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
|
|
|
|
| 6 |
|
| 7 |
export enum CONTENT_FORMAT {
|
| 8 |
CONTENT = 'content',
|
|
@@ -18,12 +19,18 @@ export enum CONTENT_FORMAT {
|
|
| 18 |
export enum ENGINE_TYPE {
|
| 19 |
AUTO = 'auto',
|
| 20 |
BROWSER = 'browser',
|
| 21 |
-
|
| 22 |
-
VLM = 'vlm',
|
| 23 |
-
READER_LM = 'readerlm-v2',
|
| 24 |
CF_BROWSER_RENDERING = 'cf-browser-rendering',
|
| 25 |
}
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
| 28 |
|
| 29 |
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
|
|
@@ -213,6 +220,15 @@ class Viewport extends AutoCastable {
|
|
| 213 |
in: 'header',
|
| 214 |
schema: { type: 'string' }
|
| 215 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
'X-Engine': {
|
| 217 |
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, cf-browser-rendering',
|
| 218 |
in: 'header',
|
|
@@ -405,6 +421,11 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 405 |
@Prop()
|
| 406 |
markdown?: TurnDownTweakableOptions;
|
| 407 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
static override from(input: any) {
|
| 409 |
const instance = super.from(input) as CrawlerOptions;
|
| 410 |
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
|
@@ -498,10 +519,10 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 498 |
if (instance.engine) {
|
| 499 |
instance.engine = instance.engine.toLowerCase();
|
| 500 |
}
|
| 501 |
-
if (instance.engine ===
|
| 502 |
instance.engine = ENGINE_TYPE.BROWSER;
|
| 503 |
instance.respondWith = CONTENT_FORMAT.VLM;
|
| 504 |
-
} else if (instance.engine ===
|
| 505 |
instance.engine = ENGINE_TYPE.AUTO;
|
| 506 |
instance.respondWith = CONTENT_FORMAT.READER_LM;
|
| 507 |
}
|
|
@@ -558,6 +579,16 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 558 |
const dnt = ctx?.get('dnt');
|
| 559 |
instance.doNotTrack ??= (parseInt(dnt || '') || null);
|
| 560 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 561 |
if (instance.cacheTolerance) {
|
| 562 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 563 |
}
|
|
@@ -569,11 +600,36 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 569 |
return instance;
|
| 570 |
}
|
| 571 |
|
| 572 |
-
|
| 573 |
-
if (this.
|
| 574 |
return false;
|
| 575 |
}
|
| 576 |
-
if (this.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 577 |
return false;
|
| 578 |
}
|
| 579 |
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
|
@@ -583,7 +639,7 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 583 |
return false;
|
| 584 |
}
|
| 585 |
|
| 586 |
-
return
|
| 587 |
}
|
| 588 |
|
| 589 |
isCacheQueryApplicable() {
|
|
@@ -611,6 +667,9 @@ export class CrawlerOptions extends AutoCastable {
|
|
| 611 |
}
|
| 612 |
|
| 613 |
browserIsNotRequired() {
|
|
|
|
|
|
|
|
|
|
| 614 |
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
|
| 615 |
return false;
|
| 616 |
}
|
|
|
|
| 3 |
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
| 4 |
import { Context } from '../services/registry';
|
| 5 |
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
|
| 6 |
+
import type { PageSnapshot } from '../services/puppeteer';
|
| 7 |
|
| 8 |
export enum CONTENT_FORMAT {
|
| 9 |
CONTENT = 'content',
|
|
|
|
| 19 |
export enum ENGINE_TYPE {
|
| 20 |
AUTO = 'auto',
|
| 21 |
BROWSER = 'browser',
|
| 22 |
+
CURL = 'curl',
|
|
|
|
|
|
|
| 23 |
CF_BROWSER_RENDERING = 'cf-browser-rendering',
|
| 24 |
}
|
| 25 |
|
| 26 |
+
export enum RESPOND_TIMING {
|
| 27 |
+
HTML = 'html',
|
| 28 |
+
MUTATION_IDLE = 'mutation-idle',
|
| 29 |
+
RESOURCE_IDLE = 'resource-idle',
|
| 30 |
+
MEDIA_IDLE = 'media-idle',
|
| 31 |
+
NETWORK_IDLE = 'network-idle',
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
| 35 |
|
| 36 |
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
|
|
|
|
| 220 |
in: 'header',
|
| 221 |
schema: { type: 'string' }
|
| 222 |
},
|
| 223 |
+
'X-Respond-Timing': {
|
| 224 |
+
description: `Explicitly specify the respond timing. One of the following:\n\n` +
|
| 225 |
+
`- html: unrendered HTML is enough to return\n` +
|
| 226 |
+
`- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
|
| 227 |
+
`- resource-idle: wait for no additional resources that would affect page logic and content SUCCEEDED loading for at least 0.5s\n` +
|
| 228 |
+
`- media-idle: wait for no additional resources, including media resources, SUCCEEDED loading for at least 0.5s\n\n`,
|
| 229 |
+
in: 'header',
|
| 230 |
+
schema: { type: 'string' }
|
| 231 |
+
},
|
| 232 |
'X-Engine': {
|
| 233 |
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, cf-browser-rendering',
|
| 234 |
in: 'header',
|
|
|
|
| 421 |
@Prop()
|
| 422 |
markdown?: TurnDownTweakableOptions;
|
| 423 |
|
| 424 |
+
@Prop({
|
| 425 |
+
type: RESPOND_TIMING,
|
| 426 |
+
})
|
| 427 |
+
respondTiming?: RESPOND_TIMING;
|
| 428 |
+
|
| 429 |
static override from(input: any) {
|
| 430 |
const instance = super.from(input) as CrawlerOptions;
|
| 431 |
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
|
|
|
| 519 |
if (instance.engine) {
|
| 520 |
instance.engine = instance.engine.toLowerCase();
|
| 521 |
}
|
| 522 |
+
if (instance.engine === 'vlm') {
|
| 523 |
instance.engine = ENGINE_TYPE.BROWSER;
|
| 524 |
instance.respondWith = CONTENT_FORMAT.VLM;
|
| 525 |
+
} else if (instance.engine === 'readerlm-v2') {
|
| 526 |
instance.engine = ENGINE_TYPE.AUTO;
|
| 527 |
instance.respondWith = CONTENT_FORMAT.READER_LM;
|
| 528 |
}
|
|
|
|
| 579 |
const dnt = ctx?.get('dnt');
|
| 580 |
instance.doNotTrack ??= (parseInt(dnt || '') || null);
|
| 581 |
|
| 582 |
+
const respondTiming = ctx?.get('x-respond-timing');
|
| 583 |
+
if (respondTiming) {
|
| 584 |
+
instance.respondTiming ??= respondTiming as RESPOND_TIMING;
|
| 585 |
+
}
|
| 586 |
+
instance.respondTiming ??= (
|
| 587 |
+
instance.timeout ||
|
| 588 |
+
instance.respondWith.includes('shot') ||
|
| 589 |
+
instance.respondWith.includes('vlm')
|
| 590 |
+
) ? RESPOND_TIMING.MEDIA_IDLE : RESPOND_TIMING.MUTATION_IDLE;
|
| 591 |
+
|
| 592 |
if (instance.cacheTolerance) {
|
| 593 |
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
| 594 |
}
|
|
|
|
| 600 |
return instance;
|
| 601 |
}
|
| 602 |
|
| 603 |
+
isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
|
| 604 |
+
if (this.waitForSelector?.length) {
|
| 605 |
return false;
|
| 606 |
}
|
| 607 |
+
if (this.respondTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
| 608 |
+
return true;
|
| 609 |
+
}
|
| 610 |
+
if (this.respondTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded) {
|
| 611 |
+
const now = Date.now();
|
| 612 |
+
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
|
| 613 |
+
return true;
|
| 614 |
+
}
|
| 615 |
+
}
|
| 616 |
+
if ((this.respondWith.includes('vlm') || this.respondWith.includes('pageshot')) && !snapshot.pageshot) {
|
| 617 |
+
return false;
|
| 618 |
+
}
|
| 619 |
+
if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
|
| 620 |
+
return false;
|
| 621 |
+
}
|
| 622 |
+
if (this.respondTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
|
| 623 |
+
return true;
|
| 624 |
+
}
|
| 625 |
+
if (this.respondTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded) {
|
| 626 |
+
const now = Date.now();
|
| 627 |
+
if ((snapshot.lastContentResourceLoaded + 500) < now) {
|
| 628 |
+
return true;
|
| 629 |
+
}
|
| 630 |
+
}
|
| 631 |
+
|
| 632 |
+
if (this.respondTiming === RESPOND_TIMING.NETWORK_IDLE) {
|
| 633 |
return false;
|
| 634 |
}
|
| 635 |
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
|
|
|
| 639 |
return false;
|
| 640 |
}
|
| 641 |
|
| 642 |
+
return false;
|
| 643 |
}
|
| 644 |
|
| 645 |
isCacheQueryApplicable() {
|
|
|
|
| 667 |
}
|
| 668 |
|
| 669 |
browserIsNotRequired() {
|
| 670 |
+
if (this.respondTiming && this.respondTiming !== RESPOND_TIMING.HTML) {
|
| 671 |
+
return false;
|
| 672 |
+
}
|
| 673 |
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
|
| 674 |
return false;
|
| 675 |
}
|
src/services/puppeteer.ts
CHANGED
|
@@ -55,13 +55,10 @@ export interface PageSnapshot {
|
|
| 55 |
href: string;
|
| 56 |
rebase?: string;
|
| 57 |
html: string;
|
| 58 |
-
htmlModifiedByJs?: boolean;
|
| 59 |
shadowExpanded?: string;
|
| 60 |
text: string;
|
| 61 |
status?: number;
|
| 62 |
statusText?: string;
|
| 63 |
-
isIntermediate?: boolean;
|
| 64 |
-
isFromCache?: boolean;
|
| 65 |
parsed?: Partial<ReadabilityParsed> | null;
|
| 66 |
screenshot?: Buffer;
|
| 67 |
pageshot?: Buffer;
|
|
@@ -70,6 +67,11 @@ export interface PageSnapshot {
|
|
| 70 |
maxElemDepth?: number;
|
| 71 |
elemCount?: number;
|
| 72 |
childFrames?: PageSnapshot[];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
}
|
| 74 |
|
| 75 |
export interface ExtendedSnapshot extends PageSnapshot {
|
|
@@ -374,9 +376,10 @@ function shadowDomPresent(rootElement = document.documentElement) {
|
|
| 374 |
return false;
|
| 375 |
}
|
| 376 |
|
| 377 |
-
let
|
|
|
|
|
|
|
| 378 |
function giveSnapshot(stopActiveSnapshot) {
|
| 379 |
-
initialHTML ??= document.documentElement?.outerHTML;
|
| 380 |
if (stopActiveSnapshot) {
|
| 381 |
window.haltSnapshot = true;
|
| 382 |
}
|
|
@@ -392,17 +395,14 @@ function giveSnapshot(stopActiveSnapshot) {
|
|
| 392 |
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
| 393 |
href: document.location.href,
|
| 394 |
html: document.documentElement?.outerHTML,
|
| 395 |
-
htmlModifiedByJs: false,
|
| 396 |
text: document.body?.innerText,
|
| 397 |
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
| 398 |
parsed: parsed,
|
| 399 |
imgs: [],
|
| 400 |
maxElemDepth: domAnalysis.maxDepth,
|
| 401 |
elemCount: domAnalysis.elementCount,
|
|
|
|
| 402 |
};
|
| 403 |
-
if (initialHTML) {
|
| 404 |
-
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
|
| 405 |
-
}
|
| 406 |
if (document.baseURI !== r.href) {
|
| 407 |
r.rebase = document.baseURI;
|
| 408 |
}
|
|
@@ -445,9 +445,20 @@ window.briefImgs = briefImgs;
|
|
| 445 |
})();
|
| 446 |
`;
|
| 447 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
class PageReqCtrlKit {
|
| 449 |
reqSet: Set<HTTPRequest> = new Set();
|
| 450 |
blockers: Deferred<void>[] = [];
|
|
|
|
|
|
|
|
|
|
| 451 |
|
| 452 |
constructor(
|
| 453 |
public concurrency: number,
|
|
@@ -472,6 +483,15 @@ class PageReqCtrlKit {
|
|
| 472 |
this.reqSet.delete(req);
|
| 473 |
const deferred = this.blockers.shift();
|
| 474 |
deferred?.resolve();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
}
|
| 476 |
}
|
| 477 |
|
|
@@ -491,7 +511,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 491 |
lastPageCratedAt: number = 0;
|
| 492 |
ua: string = '';
|
| 493 |
|
| 494 |
-
concurrentRequestsPerPage: number =
|
| 495 |
pageReqCtrl = new WeakMap<Page, PageReqCtrlKit>();
|
| 496 |
|
| 497 |
lastReqSentAt: number = 0;
|
|
@@ -1050,6 +1070,11 @@ export class PuppeteerControl extends AsyncService {
|
|
| 1050 |
return;
|
| 1051 |
}
|
| 1052 |
snapshot = s;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1053 |
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
| 1054 |
return;
|
| 1055 |
}
|
|
|
|
| 55 |
href: string;
|
| 56 |
rebase?: string;
|
| 57 |
html: string;
|
|
|
|
| 58 |
shadowExpanded?: string;
|
| 59 |
text: string;
|
| 60 |
status?: number;
|
| 61 |
statusText?: string;
|
|
|
|
|
|
|
| 62 |
parsed?: Partial<ReadabilityParsed> | null;
|
| 63 |
screenshot?: Buffer;
|
| 64 |
pageshot?: Buffer;
|
|
|
|
| 67 |
maxElemDepth?: number;
|
| 68 |
elemCount?: number;
|
| 69 |
childFrames?: PageSnapshot[];
|
| 70 |
+
isIntermediate?: boolean;
|
| 71 |
+
isFromCache?: boolean;
|
| 72 |
+
lastMutationIdle?: number;
|
| 73 |
+
lastContentResourceLoaded?: number;
|
| 74 |
+
lastMediaResourceLoaded?: number;
|
| 75 |
}
|
| 76 |
|
| 77 |
export interface ExtendedSnapshot extends PageSnapshot {
|
|
|
|
| 376 |
return false;
|
| 377 |
}
|
| 378 |
|
| 379 |
+
let lastMutationIdle = 0;
|
| 380 |
+
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
|
| 381 |
+
|
| 382 |
function giveSnapshot(stopActiveSnapshot) {
|
|
|
|
| 383 |
if (stopActiveSnapshot) {
|
| 384 |
window.haltSnapshot = true;
|
| 385 |
}
|
|
|
|
| 395 |
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
| 396 |
href: document.location.href,
|
| 397 |
html: document.documentElement?.outerHTML,
|
|
|
|
| 398 |
text: document.body?.innerText,
|
| 399 |
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
| 400 |
parsed: parsed,
|
| 401 |
imgs: [],
|
| 402 |
maxElemDepth: domAnalysis.maxDepth,
|
| 403 |
elemCount: domAnalysis.elementCount,
|
| 404 |
+
lastMutationIdle,
|
| 405 |
};
|
|
|
|
|
|
|
|
|
|
| 406 |
if (document.baseURI !== r.href) {
|
| 407 |
r.rebase = document.baseURI;
|
| 408 |
}
|
|
|
|
| 445 |
})();
|
| 446 |
`;
|
| 447 |
|
| 448 |
+
const documentResourceTypes = new Set([
|
| 449 |
+
'document', 'script', 'xhr', 'fetch', 'prefetch', 'eventsource', 'websocket', 'preflight'
|
| 450 |
+
]);
|
| 451 |
+
const mediaResourceTypes = new Set([
|
| 452 |
+
'stylesheet', 'image', 'font', 'media'
|
| 453 |
+
]);
|
| 454 |
+
|
| 455 |
+
|
| 456 |
class PageReqCtrlKit {
|
| 457 |
reqSet: Set<HTTPRequest> = new Set();
|
| 458 |
blockers: Deferred<void>[] = [];
|
| 459 |
+
lastResourceLoadedAt: number = 0;
|
| 460 |
+
lastContentResourceLoadedAt: number = 0;
|
| 461 |
+
lastMediaResourceLoadedAt: number = 0;
|
| 462 |
|
| 463 |
constructor(
|
| 464 |
public concurrency: number,
|
|
|
|
| 483 |
this.reqSet.delete(req);
|
| 484 |
const deferred = this.blockers.shift();
|
| 485 |
deferred?.resolve();
|
| 486 |
+
const now = Date.now();
|
| 487 |
+
this.lastResourceLoadedAt = now;
|
| 488 |
+
const typ = req.resourceType();
|
| 489 |
+
if (documentResourceTypes.has(typ)) {
|
| 490 |
+
this.lastContentResourceLoadedAt = now;
|
| 491 |
+
}
|
| 492 |
+
if (mediaResourceTypes.has(typ)) {
|
| 493 |
+
this.lastMediaResourceLoadedAt = now;
|
| 494 |
+
}
|
| 495 |
}
|
| 496 |
}
|
| 497 |
|
|
|
|
| 511 |
lastPageCratedAt: number = 0;
|
| 512 |
ua: string = '';
|
| 513 |
|
| 514 |
+
concurrentRequestsPerPage: number = 32;
|
| 515 |
pageReqCtrl = new WeakMap<Page, PageReqCtrlKit>();
|
| 516 |
|
| 517 |
lastReqSentAt: number = 0;
|
|
|
|
| 1070 |
return;
|
| 1071 |
}
|
| 1072 |
snapshot = s;
|
| 1073 |
+
if (snapshot) {
|
| 1074 |
+
const kit = this.pageReqCtrl.get(page);
|
| 1075 |
+
snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
|
| 1076 |
+
snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
|
| 1077 |
+
}
|
| 1078 |
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
| 1079 |
return;
|
| 1080 |
}
|