Spaces:
Build error
Build error
fix: another approach to suspected DoS abuse
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -375,12 +375,22 @@ export class CrawlerHost extends RPCHost {
|
|
| 375 |
let contentText = '';
|
| 376 |
const imageSummary = {} as { [k: string]: string; };
|
| 377 |
const imageIdxTrack = new Map<string, number[]>();
|
|
|
|
| 378 |
do {
|
| 379 |
if (pdfMode) {
|
| 380 |
contentText = snapshot.parsed?.content || snapshot.text;
|
| 381 |
break;
|
| 382 |
}
|
| 383 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
| 385 |
let toBeTurnedToMd = jsDomElementOfHTML;
|
| 386 |
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
|
|
|
| 375 |
let contentText = '';
|
| 376 |
const imageSummary = {} as { [k: string]: string; };
|
| 377 |
const imageIdxTrack = new Map<string, number[]>();
|
| 378 |
+
const uid = this.threadLocal.get('uid');
|
| 379 |
do {
|
| 380 |
if (pdfMode) {
|
| 381 |
contentText = snapshot.parsed?.content || snapshot.text;
|
| 382 |
break;
|
| 383 |
}
|
| 384 |
|
| 385 |
+
if (
|
| 386 |
+
snapshot.maxElemDepth! > 256 ||
|
| 387 |
+
(!uid && snapshot.elemCount! > 10_000) ||
|
| 388 |
+
snapshot.text.length > 70_000
|
| 389 |
+
) {
|
| 390 |
+
contentText = snapshot.text;
|
| 391 |
+
break;
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
| 395 |
let toBeTurnedToMd = jsDomElementOfHTML;
|
| 396 |
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -334,6 +334,15 @@ export class SearcherHost extends RPCHost {
|
|
| 334 |
r.description = upstreamSearchResult.description;
|
| 335 |
|
| 336 |
return r;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
});
|
| 338 |
});
|
| 339 |
|
|
|
|
| 334 |
r.description = upstreamSearchResult.description;
|
| 335 |
|
| 336 |
return r;
|
| 337 |
+
}).catch((err)=> {
|
| 338 |
+
this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
|
| 339 |
+
|
| 340 |
+
return {
|
| 341 |
+
url: upstreamSearchResult.url,
|
| 342 |
+
title: upstreamSearchResult.title,
|
| 343 |
+
description: upstreamSearchResult.description,
|
| 344 |
+
content: x.text,
|
| 345 |
+
};
|
| 346 |
});
|
| 347 |
});
|
| 348 |
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -11,7 +11,6 @@ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
|
| 11 |
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 12 |
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
| 13 |
import { TimeoutError } from 'puppeteer';
|
| 14 |
-
import { AsyncContext } from '../shared';
|
| 15 |
const tldExtract = require('tld-extract');
|
| 16 |
|
| 17 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
|
@@ -129,7 +128,7 @@ function getMaxDepthAndCountUsingTreeWalker(root) {
|
|
| 129 |
NodeFilter.SHOW_ELEMENT,
|
| 130 |
(node) => {
|
| 131 |
const nodeName = node.nodeName.toLowerCase();
|
| 132 |
-
return (nodeName === 'svg'
|
| 133 |
},
|
| 134 |
false
|
| 135 |
);
|
|
@@ -215,7 +214,6 @@ export class PuppeteerControl extends AsyncService {
|
|
| 215 |
|
| 216 |
constructor(
|
| 217 |
protected globalLogger: Logger,
|
| 218 |
-
protected threadLocal: AsyncContext,
|
| 219 |
) {
|
| 220 |
super(...arguments);
|
| 221 |
this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
|
|
@@ -491,17 +489,13 @@ document.addEventListener('load', handlePageLoad);
|
|
| 491 |
if (snapshot === s) {
|
| 492 |
return;
|
| 493 |
}
|
|
|
|
| 494 |
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
| 495 |
-
page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: DOM tree too deep` });
|
| 496 |
return;
|
| 497 |
}
|
| 498 |
-
if (s?.elemCount && s.elemCount >
|
| 499 |
-
|
| 500 |
-
page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: too many DOM elements` });
|
| 501 |
-
return;
|
| 502 |
-
}
|
| 503 |
}
|
| 504 |
-
snapshot = s;
|
| 505 |
nextSnapshotDeferred.resolve(s);
|
| 506 |
nextSnapshotDeferred = Defer();
|
| 507 |
this.once('crippled', crippleListener);
|
|
|
|
| 11 |
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
| 12 |
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
| 13 |
import { TimeoutError } from 'puppeteer';
|
|
|
|
| 14 |
const tldExtract = require('tld-extract');
|
| 15 |
|
| 16 |
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
|
|
|
| 128 |
NodeFilter.SHOW_ELEMENT,
|
| 129 |
(node) => {
|
| 130 |
const nodeName = node.nodeName.toLowerCase();
|
| 131 |
+
return (nodeName === 'svg') ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT;
|
| 132 |
},
|
| 133 |
false
|
| 134 |
);
|
|
|
|
| 214 |
|
| 215 |
constructor(
|
| 216 |
protected globalLogger: Logger,
|
|
|
|
| 217 |
) {
|
| 218 |
super(...arguments);
|
| 219 |
this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
|
|
|
|
| 489 |
if (snapshot === s) {
|
| 490 |
return;
|
| 491 |
}
|
| 492 |
+
snapshot = s;
|
| 493 |
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
|
|
|
| 494 |
return;
|
| 495 |
}
|
| 496 |
+
if (s?.elemCount && s.elemCount > 10_000) {
|
| 497 |
+
return;
|
|
|
|
|
|
|
|
|
|
| 498 |
}
|
|
|
|
| 499 |
nextSnapshotDeferred.resolve(s);
|
| 500 |
nextSnapshotDeferred = Defer();
|
| 501 |
this.once('crippled', crippleListener);
|