nomagick committed on
Commit
0a33207
·
unverified ·
1 Parent(s): e658e81

fix: another approach to suspected DoS abuse

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -375,12 +375,22 @@ export class CrawlerHost extends RPCHost {
375
  let contentText = '';
376
  const imageSummary = {} as { [k: string]: string; };
377
  const imageIdxTrack = new Map<string, number[]>();
 
378
  do {
379
  if (pdfMode) {
380
  contentText = snapshot.parsed?.content || snapshot.text;
381
  break;
382
  }
383
 
 
 
 
 
 
 
 
 
 
384
  const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
385
  let toBeTurnedToMd = jsDomElementOfHTML;
386
  let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
 
375
  let contentText = '';
376
  const imageSummary = {} as { [k: string]: string; };
377
  const imageIdxTrack = new Map<string, number[]>();
378
+ const uid = this.threadLocal.get('uid');
379
  do {
380
  if (pdfMode) {
381
  contentText = snapshot.parsed?.content || snapshot.text;
382
  break;
383
  }
384
 
385
+ if (
386
+ snapshot.maxElemDepth! > 256 ||
387
+ (!uid && snapshot.elemCount! > 10_000) ||
388
+ snapshot.text.length > 70_000
389
+ ) {
390
+ contentText = snapshot.text;
391
+ break;
392
+ }
393
+
394
  const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
395
  let toBeTurnedToMd = jsDomElementOfHTML;
396
  let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -334,6 +334,15 @@ export class SearcherHost extends RPCHost {
334
  r.description = upstreamSearchResult.description;
335
 
336
  return r;
 
 
 
 
 
 
 
 
 
337
  });
338
  });
339
 
 
334
  r.description = upstreamSearchResult.description;
335
 
336
  return r;
337
+ }).catch((err)=> {
338
+ this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
339
+
340
+ return {
341
+ url: upstreamSearchResult.url,
342
+ title: upstreamSearchResult.title,
343
+ description: upstreamSearchResult.description,
344
+ content: x.text,
345
+ };
346
  });
347
  });
348
 
backend/functions/src/services/puppeteer.ts CHANGED
@@ -11,7 +11,6 @@ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
11
  import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
12
  import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
13
  import { TimeoutError } from 'puppeteer';
14
- import { AsyncContext } from '../shared';
15
  const tldExtract = require('tld-extract');
16
 
17
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
@@ -129,7 +128,7 @@ function getMaxDepthAndCountUsingTreeWalker(root) {
129
  NodeFilter.SHOW_ELEMENT,
130
  (node) => {
131
  const nodeName = node.nodeName.toLowerCase();
132
- return (nodeName === 'svg' || nodeName === 'code') ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT;
133
  },
134
  false
135
  );
@@ -215,7 +214,6 @@ export class PuppeteerControl extends AsyncService {
215
 
216
  constructor(
217
  protected globalLogger: Logger,
218
- protected threadLocal: AsyncContext,
219
  ) {
220
  super(...arguments);
221
  this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
@@ -491,17 +489,13 @@ document.addEventListener('load', handlePageLoad);
491
  if (snapshot === s) {
492
  return;
493
  }
 
494
  if (s?.maxElemDepth && s.maxElemDepth > 256) {
495
- page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: DOM tree too deep` });
496
  return;
497
  }
498
- if (s?.elemCount && s.elemCount > 20_000) {
499
- if (!this.threadLocal.get('uid')) {
500
- page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: too many DOM elements` });
501
- return;
502
- }
503
  }
504
- snapshot = s;
505
  nextSnapshotDeferred.resolve(s);
506
  nextSnapshotDeferred = Defer();
507
  this.once('crippled', crippleListener);
 
11
  import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
12
  import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
13
  import { TimeoutError } from 'puppeteer';
 
14
  const tldExtract = require('tld-extract');
15
 
16
  const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
 
128
  NodeFilter.SHOW_ELEMENT,
129
  (node) => {
130
  const nodeName = node.nodeName.toLowerCase();
131
+ return (nodeName === 'svg') ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT;
132
  },
133
  false
134
  );
 
214
 
215
  constructor(
216
  protected globalLogger: Logger,
 
217
  ) {
218
  super(...arguments);
219
  this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
 
489
  if (snapshot === s) {
490
  return;
491
  }
492
+ snapshot = s;
493
  if (s?.maxElemDepth && s.maxElemDepth > 256) {
 
494
  return;
495
  }
496
+ if (s?.elemCount && s.elemCount > 10_000) {
497
+ return;
 
 
 
498
  }
 
499
  nextSnapshotDeferred.resolve(s);
500
  nextSnapshotDeferred = Defer();
501
  this.once('crippled', crippleListener);