nomagick committed · commit 9ac4060 · unverified · 1 parent: 62ccacf

fix: bulk fix multiple issues

backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -19,7 +19,7 @@ import { randomUUID } from 'crypto';
 import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
 
 import { countGPTToken as estimateToken } from '../shared/utils/openai';
-import { CrawlerOptions } from '../dto/scrapping-options';
+import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
 import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
 import { PDFExtractor } from '../services/pdf-extract';
 
@@ -230,7 +230,9 @@ export class CrawlerHost extends RPCHost {
 
         let pdfMode = false;
         if (snapshot.pdfs?.length && !snapshot.title) {
-            const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0]);
+            const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
+                this.threadLocal.get('cacheTolerance')
+            );
             if (pdf) {
                 pdfMode = true;
                 snapshot.title = pdf.meta?.Title;
@@ -432,7 +434,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         runtime: {
             memory: '4GiB',
             timeoutSeconds: 300,
-            concurrency: 4,
+            concurrency: 22,
         },
         tags: ['Crawler'],
         httpMethod: ['get', 'post'],
@@ -442,9 +444,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
     @CloudHTTPv2({
         runtime: {
             memory: '4GiB',
-            cpu: 2,
+            cpu: 4,
             timeoutSeconds: 300,
-            concurrency: 11,
+            concurrency: 22,
             maxInstances: 455,
         },
         openapi: {
@@ -543,11 +545,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
             res: Response,
         },
         auth: JinaEmbeddingsAuthDTO,
-        crawlerOptions: CrawlerOptions,
+        crawlerOptions: CrawlerOptionsHeaderOnly,
     ) {
         const uid = await auth.solveUID();
         let chargeAmount = 0;
-        const noSlashURL = ctx.req.url.slice(1).trimStart();
+        const noSlashURL = ctx.req.url.slice(1);
         if (!noSlashURL) {
             const latestUser = uid ? await auth.assertUser() : undefined;
             if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
@@ -911,6 +913,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
         this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
         this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
+        this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
 
         const crawlOpts: ExtraScrappingOptions = {
             proxyUrl: opts.proxyUrl,
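Together with the scrapping-options.ts hunks further down, these hunks thread a per-request cache tolerance from the `x-cache-tolerance` header through the request-scoped threadLocal store into the PDF extractor. A minimal sketch of that flow, with a plain Map standing in for the framework's threadLocal (hypothetical; the real store comes from the RPC host):

    // Hypothetical stand-in for the RPC framework's per-request store.
    const threadLocal = new Map<string, unknown>();

    // During option parsing, 'x-cache-tolerance' arrives in seconds and is
    // scaled to milliseconds (see the scrapping-options.ts hunk below).
    const headerValue = '3600'; // illustrative value: one hour
    threadLocal.set('cacheTolerance', parseInt(headerValue, 10) * 1000);

    // Later, when a PDF snapshot is formatted, the crawler forwards it:
    // const pdf = await this.pdfExtractor.cachedExtract(
    //     snapshot.pdfs[0],
    //     this.threadLocal.get('cacheTolerance'),
    // );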
backend/functions/src/cloud-functions/data-crunching.ts CHANGED
@@ -118,27 +118,10 @@ export class DataCrunchingHost extends RPCHost {
         },
         tags: ['DataCrunching'],
     })
-    // @CloudHTTPv2({
-    //     runtime: {
-    //         cpu: 2,
-    //         memory: '4GiB',
-    //         timeoutSeconds: 3600,
-    //         concurrency: 2,
-    //         maxInstances: 200,
-    //     },
-    //     tags: ['DataCrunching'],
-    // })
-    async dispatchPageCacheCrunching(
-        @RPCReflect() rpcReflect: RPCReflection,
-    ) {
-        const sse = new OutputServerEventStream({ highWaterMark: 4096 });
-        rpcReflect.return(sse);
-        rpcReflect.catch((err) => {
-            sse.end({ data: `Error: ${err.message}` });
-        });
+    async dispatchPageCacheCrunching() {
         for await (const { fileName, date, offset } of this.iterPageCacheChunks()) {
             this.logger.info(`Dispatching ${fileName}...`);
-            sse.write({ data: `Dispatching ${fileName}...` });
+            // sse.write({ data: `Dispatching ${fileName}...` });
 
             await getFunctions().taskQueue('crunchPageCacheWorker').enqueue({ date, offset }, {
                 dispatchDeadlineSeconds: 1800,
@@ -146,12 +129,42 @@
             });
         }
 
-        sse.end({ data: 'done' });
-        sse.resume();
-
         return true;
     }
 
+    // @CloudHTTPv2({
+    //     runtime: {
+    //         cpu: 2,
+    //         memory: '4GiB',
+    //         timeoutSeconds: 3600,
+    //         concurrency: 2,
+    //         maxInstances: 200,
+    //     },
+    //     tags: ['DataCrunching'],
+    // })
+    // async dispatchPageCacheCrunching(
+    //     @RPCReflect() rpcReflect: RPCReflection
+    // ) {
+    //     const sse = new OutputServerEventStream({ highWaterMark: 4096 });
+    //     rpcReflect.return(sse);
+    //     rpcReflect.catch((err) => {
+    //         sse.end({ data: `Error: ${err.message}` });
+    //     });
+    //     for await (const { fileName, date, offset } of this.iterPageCacheChunks()) {
+    //         this.logger.info(`Dispatching ${fileName}...`);
+    //         sse.write({ data: `Dispatching ${fileName}...` });
+
+    //         await getFunctions().taskQueue('crunchPageCacheWorker').enqueue({ date, offset }, {
+    //             dispatchDeadlineSeconds: 1800,
+    //             uri: await getFunctionUrl('crunchPageCacheWorker'),
+    //         });
+    //     }
+
+    //     sse.end({ data: 'done' });
+
+    //     return true;
+    // }
+
     async* iterPageCacheRecords(date?: string, inputOffset?: number | string) {
         const startOfToday = dayjs().utc().startOf('day');
         const startingPoint = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day');
@@ -234,8 +247,6 @@
             if (nRecords) {
                 yield { fileName, date: theDay.toISOString(), offset };
             }
-
-            continue;
         }
     }
 
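These hunks reduce the dispatcher to a plain task-queue fan-out: one Cloud Tasks job per page-cache chunk, with the SSE progress stream dropped (the old streaming version survives only as a commented-out block). A sketch of the remaining pattern, using the same firebase-admin Functions API the hunk calls; the standalone wrapper is illustrative:

    import { getFunctions } from 'firebase-admin/functions';

    // Enqueue one worker task per chunk; no response stream is held open,
    // so the function can return as soon as everything is queued.
    async function dispatchChunks(
        chunks: AsyncIterable<{ fileName: string; date: string; offset: number; }>,
    ) {
        for await (const { fileName, date, offset } of chunks) {
            console.info(`Dispatching ${fileName}...`);
            await getFunctions().taskQueue('crunchPageCacheWorker').enqueue(
                { date, offset },
                { dispatchDeadlineSeconds: 1800 }, // the real code also sets `uri`
            );
        }
        return true;
    }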
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -53,6 +53,7 @@ export class SearcherHost extends RPCHost {
     @CloudHTTPv2({
         name: 'search2',
         runtime: {
+            cpu: 4,
             memory: '4GiB',
             timeoutSeconds: 300,
             concurrency: 4,
@@ -64,10 +65,10 @@ export class SearcherHost extends RPCHost {
     })
     @CloudHTTPv2({
         runtime: {
-            cpu: 4,
+            cpu: 8,
             memory: '8GiB',
             timeoutSeconds: 300,
-            concurrency: 4,
+            concurrency: 6,
             maxInstances: 200,
         },
         openapi: {
@@ -265,28 +266,40 @@ export class SearcherHost extends RPCHost {
         let lastScrapped: any[] | undefined;
         let earlyReturn = false;
         if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
-            const earlyReturnTimer = setTimeout(() => {
-                if (!lastScrapped) {
+            let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
+            const setEarlyReturnTimer = () => {
+                if (earlyReturnTimer) {
                     return;
                 }
-                chargeAmount = this.getChargeAmount(lastScrapped);
-                rpcReflect.return(lastScrapped);
-                earlyReturn = true;
-            }, this.reasonableDelayMs);
+                earlyReturnTimer = setTimeout(() => {
+                    if (!lastScrapped) {
+                        return;
+                    }
+                    chargeAmount = this.getChargeAmount(lastScrapped);
+                    rpcReflect.return(lastScrapped);
+                    earlyReturn = true;
+                }, this.reasonableDelayMs);
+            };
 
             for await (const scrapped of it) {
                 lastScrapped = scrapped;
-
+                if (_.some(scrapped, (x) => this.pageQualified(x))) {
+                    setEarlyReturnTimer();
+                }
                 if (!this.searchResultsQualified(scrapped)) {
                     continue;
                 }
-                clearTimeout(earlyReturnTimer);
+                if (earlyReturnTimer) {
+                    clearTimeout(earlyReturnTimer);
+                }
                 chargeAmount = this.getChargeAmount(scrapped);
 
                 return scrapped;
             }
 
-            clearTimeout(earlyReturnTimer);
+            if (earlyReturnTimer) {
+                clearTimeout(earlyReturnTimer);
+            }
 
             if (!lastScrapped) {
                 throw new AssertionFailureError(`No content available for query ${searchQuery}`);
@@ -299,29 +312,44 @@ export class SearcherHost extends RPCHost {
             return lastScrapped;
         }
 
-        const earlyReturnTimer = setTimeout(() => {
-            if (!lastScrapped) {
+        let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
+        const setEarlyReturnTimer = () => {
+            if (earlyReturnTimer) {
                 return;
             }
-            chargeAmount = this.getChargeAmount(lastScrapped);
-            rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
-            earlyReturn = true;
-        }, this.reasonableDelayMs);
+            earlyReturnTimer = setTimeout(() => {
+                if (!lastScrapped) {
+                    return;
+                }
+                chargeAmount = this.getChargeAmount(lastScrapped);
+                rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
+                earlyReturn = true;
+            }, this.reasonableDelayMs);
+        };
 
         for await (const scrapped of it) {
             lastScrapped = scrapped;
 
+            if (_.some(scrapped, (x) => this.pageQualified(x))) {
+                setEarlyReturnTimer();
+            }
+
             if (!this.searchResultsQualified(scrapped)) {
                 continue;
            }
 
-            clearTimeout(earlyReturnTimer);
+            if (earlyReturnTimer) {
+                clearTimeout(earlyReturnTimer);
+            }
+
             chargeAmount = this.getChargeAmount(scrapped);
 
             return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
         }
 
-        clearTimeout(earlyReturnTimer);
+        if (earlyReturnTimer) {
+            clearTimeout(earlyReturnTimer);
+        }
 
         if (!lastScrapped) {
             throw new AssertionFailureError(`No content available for query ${searchQuery}`);
@@ -331,7 +359,6 @@ export class SearcherHost extends RPCHost {
             chargeAmount = this.getChargeAmount(lastScrapped);
         }
 
-
         return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
     }
 
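The behavioral core of this change: the early-return timer is no longer armed unconditionally before the scrape loop, but lazily, the first time any scraped page qualifies, so the partial-result flush can never fire while there is nothing worth returning. A reduced sketch of the pattern (names follow the hunk; the surrounding RPC machinery is elided):

    // Arm-once timer: the first qualified page starts the countdown;
    // subsequent calls are no-ops, preserving the original deadline.
    let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;

    function setEarlyReturnTimer(flushPartial: () => void, delayMs: number) {
        if (earlyReturnTimer) {
            return;
        }
        earlyReturnTimer = setTimeout(flushPartial, delayMs);
    }

    function clearEarlyReturnTimer() {
        if (earlyReturnTimer) {
            clearTimeout(earlyReturnTimer);
        }
    }

    // Per iteration: arm only once something is worth flushing,
    // and clear as soon as a fully qualified result is returned.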
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -52,44 +52,44 @@ export class CrawlerOptions extends AutoCastable {
         const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
             req: Request,
             res: Response,
-        };
+        } | undefined;
 
-        const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format');
+        const customMode = ctx?.req.get('x-respond-with') || ctx?.req.get('x-return-format');
         if (customMode !== undefined) {
             instance.respondWith = customMode;
         }
 
-        const withGeneratedAlt = ctx.req.get('x-with-generated-alt');
+        const withGeneratedAlt = ctx?.req.get('x-with-generated-alt');
         if (withGeneratedAlt !== undefined) {
             instance.withGeneratedAlt = Boolean(withGeneratedAlt);
         }
-        const withLinksSummary = ctx.req.get('x-with-links-summary');
+        const withLinksSummary = ctx?.req.get('x-with-links-summary');
         if (withLinksSummary !== undefined) {
             instance.withLinksSummary = Boolean(withLinksSummary);
         }
-        const withImagesSummary = ctx.req.get('x-with-images-summary');
+        const withImagesSummary = ctx?.req.get('x-with-images-summary');
         if (withImagesSummary !== undefined) {
             instance.withImagesSummary = Boolean(withImagesSummary);
         }
-        const noCache = ctx.req.get('x-no-cache');
+        const noCache = ctx?.req.get('x-no-cache');
         if (noCache !== undefined) {
             instance.noCache = Boolean(noCache);
-            if (instance.noCache && instance.cacheTolerance === undefined) {
-                instance.cacheTolerance = 0;
-            }
         }
-        let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '');
+        if (instance.noCache && instance.cacheTolerance === undefined) {
+            instance.cacheTolerance = 0;
+        }
+        let cacheTolerance = parseInt(ctx?.req.get('x-cache-tolerance') || '');
         if (!isNaN(cacheTolerance)) {
             instance.cacheTolerance = cacheTolerance;
         }
 
-        const targetSelector = ctx.req.get('x-target-selector');
+        const targetSelector = ctx?.req.get('x-target-selector');
         instance.targetSelector ??= targetSelector;
-        const waitForSelector = ctx.req.get('x-wait-for-selector');
+        const waitForSelector = ctx?.req.get('x-wait-for-selector');
         instance.waitForSelector ??= waitForSelector || instance.targetSelector;
 
         const cookies: CookieParam[] = [];
-        const setCookieHeaders = ctx.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
+        const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
         if (Array.isArray(setCookieHeaders)) {
             for (const setCookie of setCookieHeaders) {
                 cookies.push({
@@ -102,9 +102,23 @@
                 });
             }
 
-        const proxyUrl = ctx.req.get('x-proxy-url');
+        const proxyUrl = ctx?.req.get('x-proxy-url');
         instance.proxyUrl ??= proxyUrl;
 
+        if (instance.cacheTolerance) {
+            instance.cacheTolerance = instance.cacheTolerance * 1000;
+        }
+
+        return instance;
+    }
+}
+
+export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
+    static override from(input: any) {
+        const instance = super.from({
+            [RPC_CALL_ENVIRONMENT]: Reflect.get(input, RPC_CALL_ENVIRONMENT),
+        }) as CrawlerOptionsHeaderOnly;
+
         return instance;
     }
 }
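Two details in this hunk are easy to miss: every `ctx.req` access becomes optional-chained, so the parser also works when no HTTP environment is attached (exactly what `CrawlerOptionsHeaderOnly` relies on, rebuilding options from the RPC environment alone and ignoring any request body), and `cacheTolerance` is now read as seconds on the wire and scaled to milliseconds once at the end of `from()`. A restatement of the unit handling (illustrative, not the library code):

    function parseCacheToleranceSeconds(header: string | undefined): number | undefined {
        const seconds = parseInt(header ?? '', 10);
        if (isNaN(seconds)) {
            return undefined; // header absent or not numeric
        }
        return seconds * 1000; // stored internally in milliseconds
    }

    parseCacheToleranceSeconds('3600');    // -> 3_600_000 (one hour)
    parseCacheToleranceSeconds(undefined); // -> undefined (no header sent)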
backend/functions/src/index.ts CHANGED
@@ -13,6 +13,7 @@ Object.assign(exports, registry.exportGrouped({
     memory: '4GiB',
     timeoutSeconds: 540,
 }));
+registry.allHandsOnDeck().catch(() => void 0);
 registry.title = 'reader';
 registry.version = '0.1.0';
 
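`allHandsOnDeck()` appears to eagerly initialize every registered service at cold start; chaining `.catch(() => void 0)` makes the warm-up fire-and-forget, so a failed initialization surfaces later through normal request handling instead of as an unhandled rejection. The generic shape of that pattern:

    // Fire-and-forget warm-up: start the work, don't await it, and swallow
    // the rejection so no unhandledRejection is raised at process level.
    function warmUpInBackground(init: () => Promise<unknown>): void {
        init().catch(() => void 0);
    }

    warmUpInBackground(async () => {
        // e.g. open DB connections, launch the browser pool (illustrative)
    });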
backend/functions/src/services/pdf-extract.ts CHANGED
@@ -6,6 +6,8 @@ import { AsyncService, HashManager } from 'civkit';
 import { Logger } from '../shared/services/logger';
 import { PDFContent } from '../db/pdf';
 import dayjs from 'dayjs';
+import { FirebaseStorageBucketControl } from '../shared';
+import { randomUUID } from 'crypto';
 const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
 dayjs.extend(utc); // Extend dayjs with the UTC plugin
 const timezone = require('dayjs/plugin/timezone');
@@ -46,6 +48,7 @@ export class PDFExtractor extends AsyncService {
 
     constructor(
         protected globalLogger: Logger,
+        protected firebaseObjectStorage: FirebaseStorageBucketControl,
     ) {
         super(...arguments);
     }
@@ -225,22 +228,46 @@ export class PDFExtractor extends AsyncService {
         return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
     }
 
-    async cachedExtract(url: string | URL) {
+    async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24) {
         if (!url) {
             return undefined;
         }
 
         const digest = md5Hasher.hash(url.toString());
-        const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
 
-        const existing = await PDFContent.fromFirestore(shortDigest);
+        const cache: PDFContent | undefined = (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
 
-        if (existing) {
-            return {
-                meta: existing.meta,
-                content: existing.content,
-                text: existing.text
-            };
+        if (cache) {
+            const age = Date.now() - cache?.createdAt.valueOf();
+            const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
+            this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${url}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
+                url, digest, age, stale, cacheTolerance
+            });
+
+            if (!stale) {
+                if (cache.content && cache.text) {
+                    return {
+                        meta: cache.meta,
+                        content: cache.content,
+                        text: cache.text
+                    };
+                }
+
+                try {
+                    const r = await this.firebaseObjectStorage.downloadFile(`pdfs/${cache._id}`);
+                    let cached = JSON.parse(r.toString('utf-8'));
+
+                    return {
+                        meta: cached.meta,
+                        content: cached.content,
+                        text: cached.text
+                    };
+                } catch (err) {
+                    this.logger.warn(`Unable to load cached content for ${url}`, { err });
+
+                    return undefined;
+                }
+            }
         }
 
         let extracted;
@@ -253,14 +280,16 @@
 
         // Don't try again until the next day
         const expireMixin = extracted ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) };
+        const theID = randomUUID();
+        await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
+            Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
 
-        await PDFContent.COLLECTION.doc(shortDigest).set(
+        await PDFContent.COLLECTION.doc(theID).set(
             {
-                _id: shortDigest,
                 src: url.toString(),
                 meta: extracted?.meta || {},
-                content: extracted?.content || '',
                 text: extracted?.text || '',
+                content: extracted?.content || '',
                 urlDigest: digest,
                 createdAt: new Date(),
                 ...expireMixin
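The cache layout changes shape here: entries are keyed by a random UUID and found via a `urlDigest` query ordered by `createdAt` descending, the full extraction payload is mirrored to a storage object under `pdfs/<id>`, and freshness is judged against the caller-supplied tolerance (default 24 h). The staleness rule, restated as a standalone check on the assumption that `createdAt` is a JS Date:

    // An entry is stale once its age exceeds the tolerance (both in ms).
    function isStale(createdAt: Date, cacheToleranceMs: number, now = Date.now()): boolean {
        return createdAt.valueOf() < now - cacheToleranceMs;
    }

    const dayMs = 1000 * 3600 * 24; // default tolerance in cachedExtract()
    isStale(new Date(Date.now() - 25 * 3600 * 1000), dayMs); // true  -> re-extract
    isStale(new Date(Date.now() - 1 * 3600 * 1000), dayMs);  // false -> serve cached copy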
backend/functions/src/services/puppeteer.ts CHANGED
@@ -380,7 +380,7 @@ document.addEventListener('load', handlePageLoad);
     let screenshot: Buffer | undefined;
     const page = await this.getNextPage();
     const sn = this.snMap.get(page);
-    this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
+    this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
     if (options?.proxyUrl) {
         await page.useProxy(options.proxyUrl);
     }
thinapps-shared CHANGED
@@ -1 +1 @@
-Subproject commit b0b597800a36e2aa8ee3d52715aa7c998b388f47
+Subproject commit a3a13b13fbef8e9f5d388bde6fca6b459e6f92a6