Spaces:
Build error
Build error
feat: full markdown mode
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -58,7 +58,23 @@ export class CrawlerHost extends RPCHost {
|
|
| 58 |
this.emit('ready');
|
| 59 |
}
|
| 60 |
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
screenshotUrl?: string;
|
| 63 |
}, nominalUrl?: URL) {
|
| 64 |
if (mode === 'screenshot') {
|
|
@@ -96,8 +112,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 96 |
};
|
| 97 |
}
|
| 98 |
|
| 99 |
-
const toBeTurnedToMd = snapshot.parsed?.content;
|
| 100 |
-
let turnDownService =
|
| 101 |
for (const plugin of this.turnDownPlugins) {
|
| 102 |
turnDownService = turnDownService.use(plugin);
|
| 103 |
}
|
|
@@ -129,7 +145,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 129 |
if (mapped) {
|
| 130 |
return ``;
|
| 131 |
}
|
| 132 |
-
return ``;
|
| 133 |
}
|
| 134 |
});
|
| 135 |
|
|
@@ -139,7 +155,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 139 |
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
| 140 |
} catch (err) {
|
| 141 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 142 |
-
const vanillaTurnDownService =
|
| 143 |
try {
|
| 144 |
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
| 145 |
} catch (err2) {
|
|
@@ -148,12 +164,15 @@ export class CrawlerHost extends RPCHost {
|
|
| 148 |
}
|
| 149 |
}
|
| 150 |
|
| 151 |
-
if (
|
|
|
|
|
|
|
|
|
|
| 152 |
try {
|
| 153 |
contentText = turnDownService.turndown(snapshot.html);
|
| 154 |
} catch (err) {
|
| 155 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 156 |
-
const vanillaTurnDownService =
|
| 157 |
try {
|
| 158 |
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
| 159 |
} catch (err2) {
|
|
@@ -179,6 +198,10 @@ export class CrawlerHost extends RPCHost {
|
|
| 179 |
mixins.push(`Published Time: ${this.publishedTime}`);
|
| 180 |
}
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
return `Title: ${this.title}
|
| 183 |
|
| 184 |
URL Source: ${this.url}
|
|
@@ -233,6 +256,7 @@ ${this.content}
|
|
| 233 |
description: `Specifies the form factor of the crawled data you prefer. \n\n` +
|
| 234 |
`Supported formats:\n` +
|
| 235 |
`- markdown\n` +
|
|
|
|
| 236 |
`- html\n` +
|
| 237 |
`- text\n` +
|
| 238 |
`- screenshot\n\n` +
|
|
|
|
| 58 |
this.emit('ready');
|
| 59 |
}
|
| 60 |
|
| 61 |
+
/**
 * Creates a new TurndownService instance for HTML-to-markdown conversion.
 *
 * Unless `noRules` is truthy, two custom rules are registered on the instance:
 * - `remove-irrelevant`: replaces `meta`, `style`, `script`, `noscript`,
 *   `link` and `textarea` elements with an empty string.
 * - `title-as-h1`: emits the content of `title` followed by a line of `=`
 *   characters.
 *
 * @param noRules - Any truthy value (`true` or a non-empty string) skips the
 *   custom rules; omitted, `false` or `''` registers them.
 * @returns The newly constructed service.
 */
getTurndown(noRules?: boolean | string) {
    const service = new TurndownService();

    // Guard clause: the caller asked for a service without the custom rules.
    if (noRules) {
        return service;
    }

    service.addRule('remove-irrelevant', {
        filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea'],
        replacement: () => ''
    });
    service.addRule('title-as-h1', {
        filter: ['title'],
        replacement: (innerText) => `${innerText}\n===============\n`
    });

    return service;
}
|
| 76 |
+
|
| 77 |
+
async formatSnapshot(mode: string | 'markdown' | 'full-markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
|
| 78 |
screenshotUrl?: string;
|
| 79 |
}, nominalUrl?: URL) {
|
| 80 |
if (mode === 'screenshot') {
|
|
|
|
| 112 |
};
|
| 113 |
}
|
| 114 |
|
| 115 |
+
const toBeTurnedToMd = mode === 'full-markdown' ? snapshot.html : snapshot.parsed?.content;
|
| 116 |
+
let turnDownService = mode === 'markdown' ? this.getTurndown('without any rule') : this.getTurndown();
|
| 117 |
for (const plugin of this.turnDownPlugins) {
|
| 118 |
turnDownService = turnDownService.use(plugin);
|
| 119 |
}
|
|
|
|
| 145 |
if (mapped) {
|
| 146 |
return ``;
|
| 147 |
}
|
| 148 |
+
return alt ? `` : ``;
|
| 149 |
}
|
| 150 |
});
|
| 151 |
|
|
|
|
| 155 |
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
| 156 |
} catch (err) {
|
| 157 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 158 |
+
const vanillaTurnDownService = this.getTurndown();
|
| 159 |
try {
|
| 160 |
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
| 161 |
} catch (err2) {
|
|
|
|
| 164 |
}
|
| 165 |
}
|
| 166 |
|
| 167 |
+
if (
|
| 168 |
+
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
|
| 169 |
+
&& toBeTurnedToMd !== snapshot.html
|
| 170 |
+
) {
|
| 171 |
try {
|
| 172 |
contentText = turnDownService.turndown(snapshot.html);
|
| 173 |
} catch (err) {
|
| 174 |
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
| 175 |
+
const vanillaTurnDownService = this.getTurndown();
|
| 176 |
try {
|
| 177 |
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
| 178 |
} catch (err2) {
|
|
|
|
| 198 |
mixins.push(`Published Time: ${this.publishedTime}`);
|
| 199 |
}
|
| 200 |
|
| 201 |
+
if (mode === 'full-markdown') {
|
| 202 |
+
return this.content;
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
return `Title: ${this.title}
|
| 206 |
|
| 207 |
URL Source: ${this.url}
|
|
|
|
| 256 |
description: `Specifies the form factor of the crawled data you prefer. \n\n` +
|
| 257 |
`Supported formats:\n` +
|
| 258 |
`- markdown\n` +
|
| 259 |
+
`- full-markdown\n` +
|
| 260 |
`- html\n` +
|
| 261 |
`- text\n` +
|
| 262 |
`- screenshot\n\n` +
|