nomagick committed on
Commit
664d4b1
·
unverified ·
1 Parent(s): 2dc0850
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -32,13 +32,21 @@ export class CrawlerHost extends RPCHost {
32
  const toBeTurnedToMd = snapshot.parsed?.content;
33
  const contentText = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd) : snapshot.text;
34
 
35
- const formatted = `Title: ${(snapshot.parsed?.title || snapshot.title || '').trim()}
 
 
 
36
 
37
- URL Source: ${snapshot.href.trim()}
 
 
 
38
 
39
  Markdown Content:
40
- ${contentText.trim()}
41
  `;
 
 
42
 
43
  return formatted;
44
  }
@@ -47,6 +55,7 @@ ${contentText.trim()}
47
  runtime: {
48
  memory: '4GiB',
49
  timeoutSeconds: 540,
 
50
  },
51
  httpMethod: ['get', 'post'],
52
  returnType: [String, OutputServerEventStream],
@@ -60,20 +69,22 @@ ${contentText.trim()}
60
  ) {
61
  const noSlashURL = ctx.req.url.slice(1);
62
  const urlToCrawl = new URL(normalizeUrl(noSlashURL));
 
 
63
 
64
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
65
  const sseStream = new OutputServerEventStream();
66
  rpcReflect.return(sseStream);
67
 
68
  try {
69
- for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) {
70
  if (!scrapped) {
71
  continue;
72
  }
73
 
74
  const formatted = this.formatSnapshot(scrapped);
75
 
76
- if (scrapped.screenshot) {
77
  sseStream.write({
78
  event: 'screenshot',
79
  data: scrapped.screenshot.toString('base64'),
@@ -99,37 +110,25 @@ ${contentText.trim()}
99
  }
100
 
101
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
102
- for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) {
103
  if (!scrapped?.parsed?.content) {
104
  continue;
105
  }
106
 
107
  const formatted = this.formatSnapshot(scrapped);
108
 
109
- if (scrapped.screenshot) {
110
-
111
- return [
112
- {
113
- type: 'image_url', image_url: {
114
- url: `data:image/jpeg;base64,${scrapped.screenshot.toString('base64')}`,
115
- }
116
- },
117
- { type: 'text', content: formatted },
118
- ];
119
- }
120
-
121
  return formatted;
122
  }
123
  }
124
 
125
- for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString())) {
126
  if (!scrapped?.parsed?.content) {
127
  continue;
128
  }
129
 
130
  const formatted = this.formatSnapshot(scrapped);
131
 
132
- return assignTransferProtocolMeta(formatted, { contentType: 'text/plain', envelope: null });
133
  }
134
 
135
  throw new Error('Unreachable');
 
32
  const toBeTurnedToMd = snapshot.parsed?.content;
33
  const contentText = toBeTurnedToMd ? this.turnDownService.turndown(toBeTurnedToMd) : snapshot.text;
34
 
35
+ const formatted = {
36
+ title: (snapshot.parsed?.title || snapshot.title || '').trim(),
37
+ urlSource: snapshot.href.trim(),
38
+ markdownContent: contentText.trim(),
39
 
40
+ toString() {
41
+ return `Title: ${this.title}
42
+
43
+ URL Source: ${this.urlSource}
44
 
45
  Markdown Content:
46
+ ${contentText}
47
  `;
48
+ }
49
+ };
50
 
51
  return formatted;
52
  }
 
55
  runtime: {
56
  memory: '4GiB',
57
  timeoutSeconds: 540,
58
+ concurrency: 4,
59
  },
60
  httpMethod: ['get', 'post'],
61
  returnType: [String, OutputServerEventStream],
 
69
  ) {
70
  const noSlashURL = ctx.req.url.slice(1);
71
  const urlToCrawl = new URL(normalizeUrl(noSlashURL));
72
+ const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']);
73
+ const noCache = Boolean(ctx.req.headers['x-no-cache']);
74
 
75
  if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
76
  const sseStream = new OutputServerEventStream();
77
  rpcReflect.return(sseStream);
78
 
79
  try {
80
+ for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
81
  if (!scrapped) {
82
  continue;
83
  }
84
 
85
  const formatted = this.formatSnapshot(scrapped);
86
 
87
+ if (scrapped.screenshot && screenshotEnabled) {
88
  sseStream.write({
89
  event: 'screenshot',
90
  data: scrapped.screenshot.toString('base64'),
 
110
  }
111
 
112
  if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
113
+ for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
114
  if (!scrapped?.parsed?.content) {
115
  continue;
116
  }
117
 
118
  const formatted = this.formatSnapshot(scrapped);
119
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  return formatted;
121
  }
122
  }
123
 
124
+ for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) {
125
  if (!scrapped?.parsed?.content) {
126
  continue;
127
  }
128
 
129
  const formatted = this.formatSnapshot(scrapped);
130
 
131
+ return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
132
  }
133
 
134
  throw new Error('Unreachable');
backend/functions/src/services/puppeteer.ts CHANGED
@@ -49,9 +49,9 @@ export class PuppeteerControl extends AsyncService {
49
  return page.browser().connected && !page.isClosed();
50
  }
51
  }, {
52
- max: 1 + Math.floor(os.freemem() / 1024 * 1024 * 1024),
53
  min: 1,
54
- acquireTimeoutMillis: 15_000,
55
  testOnBorrow: true,
56
  testOnReturn: true,
57
  });
@@ -72,7 +72,7 @@ export class PuppeteerControl extends AsyncService {
72
  }
73
  this.browser = await puppeteer.launch({
74
  headless: true,
75
- timeout: 300_000
76
  });
77
  this.browser.once('disconnected', () => {
78
  this.logger.warn(`Browser disconnected`);
@@ -91,6 +91,7 @@ export class PuppeteerControl extends AsyncService {
91
  const preparations = [];
92
 
93
  preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
 
94
  preparations.push(page.setViewport({ width: 1920, height: 1080 }));
95
  preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
96
  page.emit('snapshot', snapshot);
 
49
  return page.browser().connected && !page.isClosed();
50
  }
51
  }, {
52
+ max: Math.max(1 + Math.floor(os.freemem() / 1024 * 1024 * 1024), 4),
53
  min: 1,
54
+ acquireTimeoutMillis: 60_000,
55
  testOnBorrow: true,
56
  testOnReturn: true,
57
  });
 
72
  }
73
  this.browser = await puppeteer.launch({
74
  headless: true,
75
+ timeout: 60_000
76
  });
77
  this.browser.once('disconnected', () => {
78
  this.logger.warn(`Browser disconnected`);
 
91
  const preparations = [];
92
 
93
  preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
94
+ preparations.push(page.setBypassCSP(true));
95
  preparations.push(page.setViewport({ width: 1920, height: 1080 }));
96
  preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
97
  page.emit('snapshot', snapshot);