nomagick committed on
Commit
3b1978f
·
unverified ·
1 Parent(s): 1a2754c

fix: implement DNT in alt-gen and pdf-extract

Browse files
src/api/crawler.ts CHANGED
@@ -904,6 +904,7 @@ export class CrawlerHost extends RPCHost {
904
  }
905
  this.threadLocal.set('retainImages', opts.retainImages);
906
  this.threadLocal.set('noGfm', opts.noGfm);
 
907
 
908
  const crawlOpts: ExtraScrappingOptions = {
909
  proxyUrl: opts.proxyUrl,
 
904
  }
905
  this.threadLocal.set('retainImages', opts.retainImages);
906
  this.threadLocal.set('noGfm', opts.noGfm);
907
+ this.threadLocal.set('DNT', Boolean(opts.doNotTrack))
908
 
909
  const crawlOpts: ExtraScrappingOptions = {
910
  proxyUrl: opts.proxyUrl,
src/services/alt-text.ts CHANGED
@@ -5,6 +5,7 @@ import { CanvasService } from '../shared/services/canvas';
5
  import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
6
  import { ImgBrief } from './puppeteer';
7
  import { ImgAlt } from '../db/img-alt';
 
8
 
9
  const md5Hasher = new HashManager('md5', 'hex');
10
 
@@ -17,7 +18,8 @@ export class AltTextService extends AsyncService {
17
  constructor(
18
  protected globalLogger: Logger,
19
  protected imageInterrogator: ImageInterrogationManager,
20
- protected canvasService: CanvasService
 
21
  ) {
22
  super(...arguments);
23
  }
@@ -69,6 +71,11 @@ export class AltTextService extends AsyncService {
69
  this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err });
70
  }
71
 
 
 
 
 
 
72
  // Don't try again until the next day
73
  const expireMixin = generatedCaption ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) };
74
 
 
5
  import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
6
  import { ImgBrief } from './puppeteer';
7
  import { ImgAlt } from '../db/img-alt';
8
+ import { AsyncLocalContext } from './async-context';
9
 
10
  const md5Hasher = new HashManager('md5', 'hex');
11
 
 
18
  constructor(
19
  protected globalLogger: Logger,
20
  protected imageInterrogator: ImageInterrogationManager,
21
+ protected canvasService: CanvasService,
22
+ protected asyncLocalContext: AsyncLocalContext
23
  ) {
24
  super(...arguments);
25
  }
 
71
  this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err });
72
  }
73
 
74
+ if (this.asyncLocalContext.ctx.DNT) {
75
+ // Don't cache alt text if DNT is set
76
+ return;
77
+ }
78
+
79
  // Don't try again until the next day
80
  const expireMixin = generatedCaption ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) };
81
 
src/services/pdf-extract.ts CHANGED
@@ -10,6 +10,7 @@ import { FirebaseStorageBucketControl } from '../shared';
10
  import { randomUUID } from 'crypto';
11
  import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
12
  import path from 'path';
 
13
  const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
14
  dayjs.extend(utc); // Extend dayjs with the UTC plugin
15
  const timezone = require('dayjs/plugin/timezone');
@@ -56,6 +57,7 @@ export class PDFExtractor extends AsyncService {
56
  constructor(
57
  protected globalLogger: Logger,
58
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
 
59
  ) {
60
  super(...arguments);
61
  }
@@ -324,21 +326,23 @@ export class PDFExtractor extends AsyncService {
324
  try {
325
  extracted = await this.extract(data);
326
 
327
- const theID = randomUUID();
328
- await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
329
- Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
330
- PDFContent.save(
331
- PDFContent.from({
332
- _id: theID,
333
- src: nameUrl,
334
- meta: extracted?.meta || {},
335
- urlDigest: digest,
336
- createdAt: new Date(),
337
- expireAt: new Date(Date.now() + this.cacheRetentionMs)
338
- }).degradeForFireStore()
339
- ).catch((r) => {
340
- this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
341
- });
 
 
342
  } catch (err) {
343
  this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
344
  throw err;
 
10
  import { randomUUID } from 'crypto';
11
  import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
12
  import path from 'path';
13
+ import { AsyncLocalContext } from './async-context';
14
  const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
15
  dayjs.extend(utc); // Extend dayjs with the UTC plugin
16
  const timezone = require('dayjs/plugin/timezone');
 
57
  constructor(
58
  protected globalLogger: Logger,
59
  protected firebaseObjectStorage: FirebaseStorageBucketControl,
60
+ protected asyncLocalContext: AsyncLocalContext,
61
  ) {
62
  super(...arguments);
63
  }
 
326
  try {
327
  extracted = await this.extract(data);
328
 
329
+ if (!this.asyncLocalContext.ctx.DNT) {
330
+ const theID = randomUUID();
331
+ await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
332
+ Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
333
+ PDFContent.save(
334
+ PDFContent.from({
335
+ _id: theID,
336
+ src: nameUrl,
337
+ meta: extracted?.meta || {},
338
+ urlDigest: digest,
339
+ createdAt: new Date(),
340
+ expireAt: new Date(Date.now() + this.cacheRetentionMs)
341
+ }).degradeForFireStore()
342
+ ).catch((r) => {
343
+ this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
344
+ });
345
+ }
346
  } catch (err) {
347
  this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
348
  throw err;