Spaces:
Build error
Build error
fix
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -32,13 +32,21 @@ export class CrawlerHost extends RPCHost {
|
|
| 32 |
const toBeTurnedToMd = snapshot.parsed?.content;
|
| 33 |
const contentText = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd) : snapshot.text;
|
| 34 |
|
| 35 |
-
const formatted =
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
Markdown Content:
|
| 40 |
-
${contentText
|
| 41 |
`;
|
|
|
|
|
|
|
| 42 |
|
| 43 |
return formatted;
|
| 44 |
}
|
|
@@ -47,6 +55,7 @@ ${contentText.trim()}
|
|
| 47 |
runtime: {
|
| 48 |
memory: '4GiB',
|
| 49 |
timeoutSeconds: 540,
|
|
|
|
| 50 |
},
|
| 51 |
httpMethod: ['get', 'post'],
|
| 52 |
returnType: [String, OutputServerEventStream],
|
|
@@ -60,20 +69,22 @@ ${contentText.trim()}
|
|
| 60 |
) {
|
| 61 |
const noSlashURL = ctx.req.url.slice(1);
|
| 62 |
const urlToCrawl = new URL(normalizeUrl(noSlashURL));
|
|
|
|
|
|
|
| 63 |
|
| 64 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
| 65 |
const sseStream = new OutputServerEventStream();
|
| 66 |
rpcReflect.return(sseStream);
|
| 67 |
|
| 68 |
try {
|
| 69 |
-
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) {
|
| 70 |
if (!scrapped) {
|
| 71 |
continue;
|
| 72 |
}
|
| 73 |
|
| 74 |
const formatted = this.formatSnapshot(scrapped);
|
| 75 |
|
| 76 |
-
if (scrapped.screenshot) {
|
| 77 |
sseStream.write({
|
| 78 |
event: 'screenshot',
|
| 79 |
data: scrapped.screenshot.toString('base64'),
|
|
@@ -99,37 +110,25 @@ ${contentText.trim()}
|
|
| 99 |
}
|
| 100 |
|
| 101 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 102 |
-
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) {
|
| 103 |
if (!scrapped?.parsed?.content) {
|
| 104 |
continue;
|
| 105 |
}
|
| 106 |
|
| 107 |
const formatted = this.formatSnapshot(scrapped);
|
| 108 |
|
| 109 |
-
if (scrapped.screenshot) {
|
| 110 |
-
|
| 111 |
-
return [
|
| 112 |
-
{
|
| 113 |
-
type: 'image_url', image_url: {
|
| 114 |
-
url: `data:image/jpeg;base64,${scrapped.screenshot.toString('base64')}`,
|
| 115 |
-
}
|
| 116 |
-
},
|
| 117 |
-
{ type: 'text', content: formatted },
|
| 118 |
-
];
|
| 119 |
-
}
|
| 120 |
-
|
| 121 |
return formatted;
|
| 122 |
}
|
| 123 |
}
|
| 124 |
|
| 125 |
-
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) {
|
| 126 |
if (!scrapped?.parsed?.content) {
|
| 127 |
continue;
|
| 128 |
}
|
| 129 |
|
| 130 |
const formatted = this.formatSnapshot(scrapped);
|
| 131 |
|
| 132 |
-
return assignTransferProtocolMeta(formatted, { contentType: 'text/plain', envelope: null });
|
| 133 |
}
|
| 134 |
|
| 135 |
throw new Error('Unreachable');
|
|
|
|
| 32 |
const toBeTurnedToMd = snapshot.parsed?.content;
|
| 33 |
const contentText = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd) : snapshot.text;
|
| 34 |
|
| 35 |
+
const formatted = {
|
| 36 |
+
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
| 37 |
+
urlSource: snapshot.href.trim(),
|
| 38 |
+
markdownContent: contentText.trim(),
|
| 39 |
|
| 40 |
+
toString() {
|
| 41 |
+
return `Title: ${this.title}
|
| 42 |
+
|
| 43 |
+
URL Source: ${this.urlSource}
|
| 44 |
|
| 45 |
Markdown Content:
|
| 46 |
+
${contentText}
|
| 47 |
`;
|
| 48 |
+
}
|
| 49 |
+
};
|
| 50 |
|
| 51 |
return formatted;
|
| 52 |
}
|
|
|
|
| 55 |
runtime: {
|
| 56 |
memory: '4GiB',
|
| 57 |
timeoutSeconds: 540,
|
| 58 |
+
concurrency: 4,
|
| 59 |
},
|
| 60 |
httpMethod: ['get', 'post'],
|
| 61 |
returnType: [String, OutputServerEventStream],
|
|
|
|
| 69 |
) {
|
| 70 |
const noSlashURL = ctx.req.url.slice(1);
|
| 71 |
const urlToCrawl = new URL(normalizeUrl(noSlashURL));
|
| 72 |
+
const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']);
|
| 73 |
+
const noCache = Boolean(ctx.req.headers['x-no-cache']);
|
| 74 |
|
| 75 |
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
| 76 |
const sseStream = new OutputServerEventStream();
|
| 77 |
rpcReflect.return(sseStream);
|
| 78 |
|
| 79 |
try {
|
| 80 |
+
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
| 81 |
if (!scrapped) {
|
| 82 |
continue;
|
| 83 |
}
|
| 84 |
|
| 85 |
const formatted = this.formatSnapshot(scrapped);
|
| 86 |
|
| 87 |
+
if (scrapped.screenshot && screenshotEnabled) {
|
| 88 |
sseStream.write({
|
| 89 |
event: 'screenshot',
|
| 90 |
data: scrapped.screenshot.toString('base64'),
|
|
|
|
| 110 |
}
|
| 111 |
|
| 112 |
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
| 113 |
+
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
| 114 |
if (!scrapped?.parsed?.content) {
|
| 115 |
continue;
|
| 116 |
}
|
| 117 |
|
| 118 |
const formatted = this.formatSnapshot(scrapped);
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
return formatted;
|
| 121 |
}
|
| 122 |
}
|
| 123 |
|
| 124 |
+
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
|
| 125 |
if (!scrapped?.parsed?.content) {
|
| 126 |
continue;
|
| 127 |
}
|
| 128 |
|
| 129 |
const formatted = this.formatSnapshot(scrapped);
|
| 130 |
|
| 131 |
+
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
| 132 |
}
|
| 133 |
|
| 134 |
throw new Error('Unreachable');
|
backend/functions/src/services/puppeteer.ts
CHANGED
|
@@ -49,9 +49,9 @@ export class PuppeteerControl extends AsyncService {
|
|
| 49 |
return page.browser().connected && !page.isClosed();
|
| 50 |
}
|
| 51 |
}, {
|
| 52 |
-
max: 1 + Math.floor(os.freemem() / 1024 * 1024 * 1024),
|
| 53 |
min: 1,
|
| 54 |
-
acquireTimeoutMillis:
|
| 55 |
testOnBorrow: true,
|
| 56 |
testOnReturn: true,
|
| 57 |
});
|
|
@@ -72,7 +72,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 72 |
}
|
| 73 |
this.browser = await puppeteer.launch({
|
| 74 |
headless: true,
|
| 75 |
-
timeout:
|
| 76 |
});
|
| 77 |
this.browser.once('disconnected', () => {
|
| 78 |
this.logger.warn(`Browser disconnected`);
|
|
@@ -91,6 +91,7 @@ export class PuppeteerControl extends AsyncService {
|
|
| 91 |
const preparations = [];
|
| 92 |
|
| 93 |
preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
|
|
|
| 94 |
preparations.push(page.setViewport({ width: 1920, height: 1080 }));
|
| 95 |
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
|
| 96 |
page.emit('snapshot', snapshot);
|
|
|
|
| 49 |
return page.browser().connected && !page.isClosed();
|
| 50 |
}
|
| 51 |
}, {
|
| 52 |
+
max: Math.max(1 + Math.floor(os.freemem() / 1024 * 1024 * 1024), 4),
|
| 53 |
min: 1,
|
| 54 |
+
acquireTimeoutMillis: 60_000,
|
| 55 |
testOnBorrow: true,
|
| 56 |
testOnReturn: true,
|
| 57 |
});
|
|
|
|
| 72 |
}
|
| 73 |
this.browser = await puppeteer.launch({
|
| 74 |
headless: true,
|
| 75 |
+
timeout: 60_000
|
| 76 |
});
|
| 77 |
this.browser.once('disconnected', () => {
|
| 78 |
this.logger.warn(`Browser disconnected`);
|
|
|
|
| 91 |
const preparations = [];
|
| 92 |
|
| 93 |
preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
| 94 |
+
preparations.push(page.setBypassCSP(true));
|
| 95 |
preparations.push(page.setViewport({ width: 1920, height: 1080 }));
|
| 96 |
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
|
| 97 |
page.emit('snapshot', snapshot);
|