nomagick committed on
Commit
579f259
·
unverified ·
1 Parent(s): eaa0678

fix: detect when readability does not work

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -327,8 +327,19 @@ export class CrawlerHost extends RPCHost {
327
  break;
328
  }
329
 
330
- const toBeTurnedToMd = mode === 'markdown' ? snapshot.html : snapshot.parsed?.content;
331
- let turnDownService = mode === 'markdown' ? this.getTurndown({ url: snapshot.href }) : this.getTurndown({ noRules: true, url: snapshot.href });
 
 
 
 
 
 
 
 
 
 
 
332
  for (const plugin of this.turnDownPlugins) {
333
  turnDownService = turnDownService.use(plugin);
334
  }
@@ -585,7 +596,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
585
  let urlToCrawl;
586
  const normalizeUrl = (await pNormalizeUrl).default;
587
  try {
588
- urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false, sortQueryParameters:false }));
589
  } catch (err) {
590
  throw new ParamValidationError({
591
  message: `${err}`,
 
327
  break;
328
  }
329
 
330
+ let toBeTurnedToMd = snapshot.html;
331
+ let turnDownService = this.getTurndown({ url: nominalUrl });
332
+ if (mode !== 'markdown' && snapshot.parsed?.content) {
333
+ const par1 = turnDownService.turndown(toBeTurnedToMd);
334
+ const par2 = turnDownService.turndown(snapshot.parsed.content)
335
+
336
+ // If Readability did its job
337
+ if (par2.length >= 0.3 * par1.length) {
338
+ turnDownService = this.getTurndown({ noRules: true, url: snapshot.href });
339
+ toBeTurnedToMd = snapshot.parsed.content;
340
+ }
341
+ }
342
+
343
  for (const plugin of this.turnDownPlugins) {
344
  turnDownService = turnDownService.use(plugin);
345
  }
 
596
  let urlToCrawl;
597
  const normalizeUrl = (await pNormalizeUrl).default;
598
  try {
599
+ urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false, sortQueryParameters: false }));
600
  } catch (err) {
601
  throw new ParamValidationError({
602
  message: `${err}`,