Spaces:
Build error
Build error
fix: detect when readability does not work
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -327,8 +327,19 @@ export class CrawlerHost extends RPCHost {
|
|
| 327 |
break;
|
| 328 |
}
|
| 329 |
|
| 330 |
-
|
| 331 |
-
let turnDownService =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
for (const plugin of this.turnDownPlugins) {
|
| 333 |
turnDownService = turnDownService.use(plugin);
|
| 334 |
}
|
|
@@ -585,7 +596,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 585 |
let urlToCrawl;
|
| 586 |
const normalizeUrl = (await pNormalizeUrl).default;
|
| 587 |
try {
|
| 588 |
-
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false, sortQueryParameters:false }));
|
| 589 |
} catch (err) {
|
| 590 |
throw new ParamValidationError({
|
| 591 |
message: `${err}`,
|
|
|
|
| 327 |
break;
|
| 328 |
}
|
| 329 |
|
| 330 |
+
let toBeTurnedToMd = snapshot.html;
|
| 331 |
+
let turnDownService = this.getTurndown({ url: nominalUrl });
|
| 332 |
+
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
| 333 |
+
const par1 = turnDownService.turndown(toBeTurnedToMd);
|
| 334 |
+
const par2 = turnDownService.turndown(snapshot.parsed.content)
|
| 335 |
+
|
| 336 |
+
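// Use Readability's extracted content only if its markdown is at least 30% as long as the full page's markdown; otherwise assume extraction failed and keep the full HTML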
|
| 337 |
+
if (par2.length >= 0.3 * par1.length) {
|
| 338 |
+
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href });
|
| 339 |
+
toBeTurnedToMd = snapshot.parsed.content;
|
| 340 |
+
}
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
for (const plugin of this.turnDownPlugins) {
|
| 344 |
turnDownService = turnDownService.use(plugin);
|
| 345 |
}
|
|
|
|
| 596 |
let urlToCrawl;
|
| 597 |
const normalizeUrl = (await pNormalizeUrl).default;
|
| 598 |
try {
|
| 599 |
+
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false, sortQueryParameters: false }));
|
| 600 |
} catch (err) {
|
| 601 |
throw new ParamValidationError({
|
| 602 |
message: `${err}`,
|