nomagick committed on
Commit
d3f3a85
·
unverified ·
1 Parent(s): 57cbae8

fix: revert screenshot behavior and introduce pageshot

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -45,6 +45,8 @@ export interface FormattedPage {
45
  text?: string;
46
  screenshotUrl?: string;
47
  screenshot?: Buffer;
 
 
48
  links?: { [k: string]: string; };
49
  images?: { [k: string]: string; };
50
 
@@ -282,8 +284,9 @@ export class CrawlerHost extends RPCHost {
282
  return mixin;
283
  }
284
 
285
- async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
286
  screenshotUrl?: string;
 
287
  }, nominalUrl?: URL) {
288
  if (mode === 'screenshot') {
289
  if (snapshot.screenshot && !snapshot.screenshotUrl) {
@@ -305,6 +308,26 @@ export class CrawlerHost extends RPCHost {
305
  }
306
  } as FormattedPage;
307
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  if (mode === 'html') {
309
  return {
310
  ...this.getGeneralSnapshotMixins(snapshot),
@@ -761,6 +784,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
761
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
762
  );
763
  }
 
 
 
 
 
 
764
 
765
  return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
766
  }
@@ -778,6 +807,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
778
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
779
  );
780
  }
 
 
 
 
 
 
781
 
782
  return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
783
  }
@@ -810,6 +845,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
810
 
811
  let snapshot: PageSnapshot | undefined;
812
  let screenshotUrl: string | undefined;
 
813
  const preparations = [
814
  this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
815
  snapshot = JSON.parse(r.toString('utf-8'));
@@ -818,6 +854,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
818
  this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
819
  screenshotUrl = r;
820
  }) :
 
 
 
 
 
821
  Promise.resolve(undefined)
822
  ];
823
  try {
@@ -833,8 +874,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
833
  snapshot: {
834
  ...snapshot,
835
  screenshot: undefined,
 
836
  screenshotUrl,
837
- } as PageSnapshot & { screenshotUrl?: string; }
 
838
  };
839
  }
840
 
@@ -878,6 +921,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
878
  });
879
  cache.screenshotAvailable = true;
880
  }
 
 
 
 
 
 
 
 
881
  await savingOfSnapshot;
882
  const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
883
  this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
@@ -1013,7 +1064,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
1013
  const crawlOpts: ExtraScrappingOptions = {
1014
  proxyUrl: opts.proxyUrl,
1015
  cookies: opts.setCookies,
1016
- favorScreenshot: opts.respondWith === 'screenshot',
1017
  removeSelector: opts.removeSelector,
1018
  targetSelector: opts.targetSelector,
1019
  waitForSelector: opts.waitForSelector,
 
45
  text?: string;
46
  screenshotUrl?: string;
47
  screenshot?: Buffer;
48
+ pageshotUrl?: string;
49
+ pageshot?: Buffer;
50
  links?: { [k: string]: string; };
51
  images?: { [k: string]: string; };
52
 
 
284
  return mixin;
285
  }
286
 
287
+ async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
288
  screenshotUrl?: string;
289
+ pageshotUrl?: string;
290
  }, nominalUrl?: URL) {
291
  if (mode === 'screenshot') {
292
  if (snapshot.screenshot && !snapshot.screenshotUrl) {
 
308
  }
309
  } as FormattedPage;
310
  }
311
+ if (mode === 'pageshot') {
312
+ if (snapshot.pageshot && !snapshot.pageshotUrl) {
313
+ const fid = `instant-screenshots/${randomUUID()}`;
314
+ await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
315
+ metadata: {
316
+ contentType: 'image/png',
317
+ }
318
+ });
319
+ snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
320
+ }
321
+
322
+ return {
323
+ ...this.getGeneralSnapshotMixins(snapshot),
324
+ html: snapshot.html,
325
+ pageshotUrl: snapshot.pageshotUrl,
326
+ toString() {
327
+ return this.pageshotUrl;
328
+ }
329
+ } as FormattedPage;
330
+ }
331
  if (mode === 'html') {
332
  return {
333
  ...this.getGeneralSnapshotMixins(snapshot),
 
784
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
785
  );
786
  }
787
+ if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
788
+
789
+ return assignTransferProtocolMeta(`${formatted}`,
790
+ { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
791
+ );
792
+ }
793
 
794
  return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
795
  }
 
807
  { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
808
  );
809
  }
810
+ if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
811
+
812
+ return assignTransferProtocolMeta(`${formatted}`,
813
+ { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
814
+ );
815
+ }
816
 
817
  return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
818
  }
 
845
 
846
  let snapshot: PageSnapshot | undefined;
847
  let screenshotUrl: string | undefined;
848
+ let pageshotUrl: string | undefined;
849
  const preparations = [
850
  this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
851
  snapshot = JSON.parse(r.toString('utf-8'));
 
854
  this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
855
  screenshotUrl = r;
856
  }) :
857
+ Promise.resolve(undefined),
858
+ cache.pageshotAvailable ?
859
+ this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
860
+ pageshotUrl = r;
861
+ }) :
862
  Promise.resolve(undefined)
863
  ];
864
  try {
 
874
  snapshot: {
875
  ...snapshot,
876
  screenshot: undefined,
877
+ pageshot: undefined,
878
  screenshotUrl,
879
+ pageshotUrl,
880
+ } as PageSnapshot & { screenshotUrl?: string; pageshotUrl?: string; }
881
  };
882
  }
883
 
 
921
  });
922
  cache.screenshotAvailable = true;
923
  }
924
+ if (snapshot.pageshot) {
925
+ await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, {
926
+ metadata: {
927
+ contentType: 'image/png',
928
+ }
929
+ });
930
+ cache.pageshotAvailable = true;
931
+ }
932
  await savingOfSnapshot;
933
  const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
934
  this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
 
1064
  const crawlOpts: ExtraScrappingOptions = {
1065
  proxyUrl: opts.proxyUrl,
1066
  cookies: opts.setCookies,
1067
+ favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith),
1068
  removeSelector: opts.removeSelector,
1069
  targetSelector: opts.targetSelector,
1070
  waitForSelector: opts.waitForSelector,
backend/functions/src/db/crawled.ts CHANGED
@@ -22,11 +22,14 @@ export class Crawled extends FirestoreRecord {
22
  urlPathDigest!: string;
23
 
24
  @Prop()
25
- snapshot?: PageSnapshot & { screenshot: never; };
26
 
27
  @Prop()
28
  screenshotAvailable?: boolean;
29
 
 
 
 
30
  @Prop()
31
  snapshotAvailable?: boolean;
32
 
 
22
  urlPathDigest!: string;
23
 
24
  @Prop()
25
+ snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
26
 
27
  @Prop()
28
  screenshotAvailable?: boolean;
29
 
30
+ @Prop()
31
+ pageshotAvailable?: boolean;
32
+
33
  @Prop()
34
  snapshotAvailable?: boolean;
35
 
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -34,6 +34,7 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
34
  `- markdown\n` +
35
  `- html\n` +
36
  `- text\n` +
 
37
  `- screenshot\n`
38
  ,
39
  in: 'header',
 
34
  `- markdown\n` +
35
  `- html\n` +
36
  `- text\n` +
37
+ `- pageshot\n` +
38
  `- screenshot\n`
39
  ,
40
  in: 'header',
backend/functions/src/services/puppeteer.ts CHANGED
@@ -46,6 +46,7 @@ export interface PageSnapshot {
46
  text: string;
47
  parsed?: Partial<ReadabilityParsed> | null;
48
  screenshot?: Buffer;
 
49
  imgs?: ImgBrief[];
50
  pdfs?: string[];
51
  maxElemDepth?: number;
@@ -448,6 +449,7 @@ document.addEventListener('load', handlePageLoad);
448
 
449
  let snapshot: PageSnapshot | undefined;
450
  let screenshot: Buffer | undefined;
 
451
  const page = await this.getNextPage();
452
  const sn = this.snMap.get(page);
453
  this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
@@ -524,7 +526,7 @@ document.addEventListener('load', handlePageLoad);
524
  try {
525
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
526
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
527
- screenshot = await page.screenshot({ fullPage: true });
528
  if (snapshot) {
529
  snapshot.childFrames = await pSubFrameSnapshots;
530
  }
@@ -547,7 +549,8 @@ document.addEventListener('load', handlePageLoad);
547
  if (salvaged) {
548
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
549
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
550
- screenshot = await page.screenshot({ fullPage: true });
 
551
  if (snapshot) {
552
  snapshot.childFrames = await pSubFrameSnapshots;
553
  }
@@ -562,7 +565,7 @@ document.addEventListener('load', handlePageLoad);
562
  this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
563
  this.emit(
564
  'crawled',
565
- { ...snapshot, screenshot },
566
  { ...options, url: parsedUrl }
567
  );
568
  }
@@ -581,7 +584,8 @@ document.addEventListener('load', handlePageLoad);
581
  .then(async () => {
582
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
583
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
584
- screenshot = await page.screenshot({ fullPage: true });
 
585
  if (snapshot) {
586
  snapshot.childFrames = await pSubFrameSnapshots;
587
  }
@@ -614,15 +618,16 @@ document.addEventListener('load', handlePageLoad);
614
  }
615
  throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
616
  }
617
- yield { ...snapshot, screenshot } as PageSnapshot;
618
  break;
619
  }
620
  if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
621
- screenshot = await page.screenshot({ fullPage: true });
 
622
  lastHTML = snapshot.html;
623
  }
624
  if (snapshot || screenshot) {
625
- yield { ...snapshot, screenshot } as PageSnapshot;
626
  }
627
  if (error) {
628
  throw error;
 
46
  text: string;
47
  parsed?: Partial<ReadabilityParsed> | null;
48
  screenshot?: Buffer;
49
+ pageshot?: Buffer;
50
  imgs?: ImgBrief[];
51
  pdfs?: string[];
52
  maxElemDepth?: number;
 
449
 
450
  let snapshot: PageSnapshot | undefined;
451
  let screenshot: Buffer | undefined;
452
+ let pageshot: Buffer | undefined;
453
  const page = await this.getNextPage();
454
  const sn = this.snMap.get(page);
455
  this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
 
526
  try {
527
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
528
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
529
+ screenshot = await page.screenshot();
530
  if (snapshot) {
531
  snapshot.childFrames = await pSubFrameSnapshots;
532
  }
 
549
  if (salvaged) {
550
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
551
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
552
+ screenshot = await page.screenshot();
553
+ pageshot = await page.screenshot({ fullPage: true });
554
  if (snapshot) {
555
  snapshot.childFrames = await pSubFrameSnapshots;
556
  }
 
565
  this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
566
  this.emit(
567
  'crawled',
568
+ { ...snapshot, screenshot, pageshot },
569
  { ...options, url: parsedUrl }
570
  );
571
  }
 
584
  .then(async () => {
585
  const pSubFrameSnapshots = this.snapshotChildFrames(page);
586
  snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
587
+ screenshot = await page.screenshot();
588
+ pageshot = await page.screenshot({ fullPage: true });
589
  if (snapshot) {
590
  snapshot.childFrames = await pSubFrameSnapshots;
591
  }
 
618
  }
619
  throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
620
  }
621
+ yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
622
  break;
623
  }
624
  if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
625
+ screenshot = await page.screenshot();
626
+ pageshot = await page.screenshot({ fullPage: true });
627
  lastHTML = snapshot.html;
628
  }
629
  if (snapshot || screenshot) {
630
+ yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
631
  }
632
  if (error) {
633
  throw error;