nomagick committed on
Commit
69231ad
·
unverified ·
1 Parent(s): 0f70723

feat: full markdown mode

Browse files
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -58,7 +58,23 @@ export class CrawlerHost extends RPCHost {
58
  this.emit('ready');
59
  }
60
 
61
- async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  screenshotUrl?: string;
63
  }, nominalUrl?: URL) {
64
  if (mode === 'screenshot') {
@@ -96,8 +112,8 @@ export class CrawlerHost extends RPCHost {
96
  };
97
  }
98
 
99
- const toBeTurnedToMd = snapshot.parsed?.content;
100
- let turnDownService = new TurndownService();
101
  for (const plugin of this.turnDownPlugins) {
102
  turnDownService = turnDownService.use(plugin);
103
  }
@@ -129,7 +145,7 @@ export class CrawlerHost extends RPCHost {
129
  if (mapped) {
130
  return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
131
  }
132
- return `![Image ${imgIdx}: ${alt}](${src})`;
133
  }
134
  });
135
 
@@ -139,7 +155,7 @@ export class CrawlerHost extends RPCHost {
139
  contentText = turnDownService.turndown(toBeTurnedToMd).trim();
140
  } catch (err) {
141
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
142
- const vanillaTurnDownService = new TurndownService();
143
  try {
144
  contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
145
  } catch (err2) {
@@ -148,12 +164,15 @@ export class CrawlerHost extends RPCHost {
148
  }
149
  }
150
 
151
- if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
 
 
 
152
  try {
153
  contentText = turnDownService.turndown(snapshot.html);
154
  } catch (err) {
155
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
156
- const vanillaTurnDownService = new TurndownService();
157
  try {
158
  contentText = vanillaTurnDownService.turndown(snapshot.html);
159
  } catch (err2) {
@@ -179,6 +198,10 @@ export class CrawlerHost extends RPCHost {
179
  mixins.push(`Published Time: ${this.publishedTime}`);
180
  }
181
 
 
 
 
 
182
  return `Title: ${this.title}
183
 
184
  URL Source: ${this.url}
@@ -233,6 +256,7 @@ ${this.content}
233
  description: `Specifies the form factor of the crawled data you prefer. \n\n` +
234
  `Supported formats:\n` +
235
  `- markdown\n` +
 
236
  `- html\n` +
237
  `- text\n` +
238
  `- screenshot\n\n` +
 
58
  this.emit('ready');
59
  }
60
 
61
+ getTurndown(noRules?: boolean | string) {
62
+ const turnDownService = new TurndownService();
63
+ if (!noRules) {
64
+ turnDownService.addRule('remove-irrelevant', {
65
+ filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea'],
66
+ replacement: () => ''
67
+ });
68
+ turnDownService.addRule('title-as-h1', {
69
+ filter: ['title'],
70
+ replacement: (innerText) => `${innerText}\n===============\n`
71
+ });
72
+ }
73
+
74
+ return turnDownService;
75
+ }
76
+
77
+ async formatSnapshot(mode: string | 'markdown' | 'full-markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
78
  screenshotUrl?: string;
79
  }, nominalUrl?: URL) {
80
  if (mode === 'screenshot') {
 
112
  };
113
  }
114
 
115
+ const toBeTurnedToMd = mode === 'full-markdown' ? snapshot.html : snapshot.parsed?.content;
116
+ let turnDownService = mode === 'markdown' ? this.getTurndown('without any rule') : this.getTurndown();
117
  for (const plugin of this.turnDownPlugins) {
118
  turnDownService = turnDownService.use(plugin);
119
  }
 
145
  if (mapped) {
146
  return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
147
  }
148
+ return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
149
  }
150
  });
151
 
 
155
  contentText = turnDownService.turndown(toBeTurnedToMd).trim();
156
  } catch (err) {
157
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
158
+ const vanillaTurnDownService = this.getTurndown();
159
  try {
160
  contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
161
  } catch (err2) {
 
164
  }
165
  }
166
 
167
+ if (
168
+ !contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
169
+ && toBeTurnedToMd !== snapshot.html
170
+ ) {
171
  try {
172
  contentText = turnDownService.turndown(snapshot.html);
173
  } catch (err) {
174
  this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
175
+ const vanillaTurnDownService = this.getTurndown();
176
  try {
177
  contentText = vanillaTurnDownService.turndown(snapshot.html);
178
  } catch (err2) {
 
198
  mixins.push(`Published Time: ${this.publishedTime}`);
199
  }
200
 
201
+ if (mode === 'full-markdown') {
202
+ return this.content;
203
+ }
204
+
205
  return `Title: ${this.title}
206
 
207
  URL Source: ${this.url}
 
256
  description: `Specifies the form factor of the crawled data you prefer. \n\n` +
257
  `Supported formats:\n` +
258
  `- markdown\n` +
259
+ `- full-markdown\n` +
260
  `- html\n` +
261
  `- text\n` +
262
  `- screenshot\n\n` +