nomagick committed on
Commit
78c8444
·
unverified ·
1 Parent(s): 629ab27
backend/functions/package.json CHANGED
@@ -18,7 +18,8 @@
18
  "from-preset": "npm run build && npm run emu:reset && npm run emu:start",
19
  "start": "npm run shell",
20
  "deploy": "firebase deploy --only functions",
21
- "logs": "firebase functions:log"
 
22
  },
23
  "engines": {
24
  "node": "18"
 
18
  "from-preset": "npm run build && npm run emu:reset && npm run emu:start",
19
  "start": "npm run shell",
20
  "deploy": "firebase deploy --only functions",
21
+ "logs": "firebase functions:log",
22
+ "gcp-build": "npx puppeteer browsers install chrome"
23
  },
24
  "engines": {
25
  "node": "18"
backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -36,16 +36,16 @@ export class CrawlerHost extends RPCHost {
36
 
37
  const formatted = {
38
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
39
- urlSource: snapshot.href.trim(),
40
- markdownContent: contentText.trim(),
41
 
42
  toString() {
43
  return `Title: ${this.title}
44
 
45
- URL Source: ${this.urlSource}
46
 
47
  Markdown Content:
48
- ${contentText}
49
  `;
50
  }
51
  };
 
36
 
37
  const formatted = {
38
  title: (snapshot.parsed?.title || snapshot.title || '').trim(),
39
+ url: snapshot.href.trim(),
40
+ content: contentText.trim(),
41
 
42
  toString() {
43
  return `Title: ${this.title}
44
 
45
+ URL Source: ${this.url}
46
 
47
  Markdown Content:
48
+ ${this.content}
49
  `;
50
  }
51
  };
backend/functions/src/services/puppeteer.ts CHANGED
@@ -145,7 +145,7 @@ function giveSnapshot() {
145
 
146
  async *scrap(url: string, noCache: string | boolean = false) {
147
  const parsedUrl = new URL(url);
148
- parsedUrl.search = '';
149
  parsedUrl.hash = '';
150
  const normalizedUrl = parsedUrl.toString().toLowerCase();
151
  const digest = md5Hasher.hash(normalizedUrl);
@@ -191,7 +191,17 @@ function giveSnapshot() {
191
  page.on('snapshot', hdl);
192
 
193
  const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
194
- .then(async (r) => {
 
 
 
 
 
 
 
 
 
 
195
  screenshot = await page.screenshot({
196
  type: 'jpeg',
197
  quality: 85,
@@ -210,16 +220,6 @@ function giveSnapshot() {
210
  ).catch((err) => {
211
  this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) });
212
  });
213
-
214
- return r;
215
- }).catch((err) => {
216
- this.logger.warn(`Failed to goto ${url}`, { err: marshalErrorLike(err) });
217
- return Promise.reject(new AssertionFailureError({
218
- message: `Failed to goto ${url}: ${err}`,
219
- cause: err,
220
- }));
221
- }).finally(() => {
222
- finalized = true;
223
  });
224
 
225
  try {
 
145
 
146
  async *scrap(url: string, noCache: string | boolean = false) {
147
  const parsedUrl = new URL(url);
148
+ // parsedUrl.search = '';
149
  parsedUrl.hash = '';
150
  const normalizedUrl = parsedUrl.toString().toLowerCase();
151
  const digest = md5Hasher.hash(normalizedUrl);
 
191
  page.on('snapshot', hdl);
192
 
193
  const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
194
+ .catch((err) => {
195
+ this.logger.warn(`Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
196
+ return Promise.reject(new AssertionFailureError({
197
+ message: `Failed to goto ${url}: ${err}`,
198
+ cause: err,
199
+ }));
200
+ }).finally(async () => {
201
+ finalized = true;
202
+ if (!snapshot?.html) {
203
+ return;
204
+ }
205
  screenshot = await page.screenshot({
206
  type: 'jpeg',
207
  quality: 85,
 
220
  ).catch((err) => {
221
  this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) });
222
  });
 
 
 
 
 
 
 
 
 
 
223
  });
224
 
225
  try {