Spaces:
Build error
Build error
fix: implement DNT in alt-gen and pdf-extract
Browse files
- src/api/crawler.ts +1 -0
- src/services/alt-text.ts +8 -1
- src/services/pdf-extract.ts +19 -15
src/api/crawler.ts
CHANGED
|
@@ -904,6 +904,7 @@ export class CrawlerHost extends RPCHost {
|
|
| 904 |
}
|
| 905 |
this.threadLocal.set('retainImages', opts.retainImages);
|
| 906 |
this.threadLocal.set('noGfm', opts.noGfm);
|
|
|
|
| 907 |
|
| 908 |
const crawlOpts: ExtraScrappingOptions = {
|
| 909 |
proxyUrl: opts.proxyUrl,
|
|
|
|
| 904 |
}
|
| 905 |
this.threadLocal.set('retainImages', opts.retainImages);
|
| 906 |
this.threadLocal.set('noGfm', opts.noGfm);
|
| 907 |
+
this.threadLocal.set('DNT', Boolean(opts.doNotTrack))
|
| 908 |
|
| 909 |
const crawlOpts: ExtraScrappingOptions = {
|
| 910 |
proxyUrl: opts.proxyUrl,
|
src/services/alt-text.ts
CHANGED
|
@@ -5,6 +5,7 @@ import { CanvasService } from '../shared/services/canvas';
|
|
| 5 |
import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
|
| 6 |
import { ImgBrief } from './puppeteer';
|
| 7 |
import { ImgAlt } from '../db/img-alt';
|
|
|
|
| 8 |
|
| 9 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 10 |
|
|
@@ -17,7 +18,8 @@ export class AltTextService extends AsyncService {
|
|
| 17 |
constructor(
|
| 18 |
protected globalLogger: Logger,
|
| 19 |
protected imageInterrogator: ImageInterrogationManager,
|
| 20 |
-
protected canvasService: CanvasService
|
|
|
|
| 21 |
) {
|
| 22 |
super(...arguments);
|
| 23 |
}
|
|
@@ -69,6 +71,11 @@ export class AltTextService extends AsyncService {
|
|
| 69 |
this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err });
|
| 70 |
}
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
// Don't try again until the next day
|
| 73 |
const expireMixin = generatedCaption ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) };
|
| 74 |
|
|
|
|
| 5 |
import { ImageInterrogationManager } from '../shared/services/common-iminterrogate';
|
| 6 |
import { ImgBrief } from './puppeteer';
|
| 7 |
import { ImgAlt } from '../db/img-alt';
|
| 8 |
+
import { AsyncLocalContext } from './async-context';
|
| 9 |
|
| 10 |
const md5Hasher = new HashManager('md5', 'hex');
|
| 11 |
|
|
|
|
| 18 |
constructor(
|
| 19 |
protected globalLogger: Logger,
|
| 20 |
protected imageInterrogator: ImageInterrogationManager,
|
| 21 |
+
protected canvasService: CanvasService,
|
| 22 |
+
protected asyncLocalContext: AsyncLocalContext
|
| 23 |
) {
|
| 24 |
super(...arguments);
|
| 25 |
}
|
|
|
|
| 71 |
this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err });
|
| 72 |
}
|
| 73 |
|
| 74 |
+
if (this.asyncLocalContext.ctx.DNT) {
|
| 75 |
+
// Don't cache alt text if DNT is set
|
| 76 |
+
return;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
// Don't try again until the next day
|
| 80 |
const expireMixin = generatedCaption ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) };
|
| 81 |
|
src/services/pdf-extract.ts
CHANGED
|
@@ -10,6 +10,7 @@ import { FirebaseStorageBucketControl } from '../shared';
|
|
| 10 |
import { randomUUID } from 'crypto';
|
| 11 |
import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
|
| 12 |
import path from 'path';
|
|
|
|
| 13 |
const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
|
| 14 |
dayjs.extend(utc); // Extend dayjs with the UTC plugin
|
| 15 |
const timezone = require('dayjs/plugin/timezone');
|
|
@@ -56,6 +57,7 @@ export class PDFExtractor extends AsyncService {
|
|
| 56 |
constructor(
|
| 57 |
protected globalLogger: Logger,
|
| 58 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
|
|
|
| 59 |
) {
|
| 60 |
super(...arguments);
|
| 61 |
}
|
|
@@ -324,21 +326,23 @@ export class PDFExtractor extends AsyncService {
|
|
| 324 |
try {
|
| 325 |
extracted = await this.extract(data);
|
| 326 |
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
PDFContent.
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
|
|
|
|
|
|
| 342 |
} catch (err) {
|
| 343 |
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
|
| 344 |
throw err;
|
|
|
|
| 10 |
import { randomUUID } from 'crypto';
|
| 11 |
import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
|
| 12 |
import path from 'path';
|
| 13 |
+
import { AsyncLocalContext } from './async-context';
|
| 14 |
const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
|
| 15 |
dayjs.extend(utc); // Extend dayjs with the UTC plugin
|
| 16 |
const timezone = require('dayjs/plugin/timezone');
|
|
|
|
| 57 |
constructor(
|
| 58 |
protected globalLogger: Logger,
|
| 59 |
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
| 60 |
+
protected asyncLocalContext: AsyncLocalContext,
|
| 61 |
) {
|
| 62 |
super(...arguments);
|
| 63 |
}
|
|
|
|
| 326 |
try {
|
| 327 |
extracted = await this.extract(data);
|
| 328 |
|
| 329 |
+
if (!this.asyncLocalContext.ctx.DNT) {
|
| 330 |
+
const theID = randomUUID();
|
| 331 |
+
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
|
| 332 |
+
Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
|
| 333 |
+
PDFContent.save(
|
| 334 |
+
PDFContent.from({
|
| 335 |
+
_id: theID,
|
| 336 |
+
src: nameUrl,
|
| 337 |
+
meta: extracted?.meta || {},
|
| 338 |
+
urlDigest: digest,
|
| 339 |
+
createdAt: new Date(),
|
| 340 |
+
expireAt: new Date(Date.now() + this.cacheRetentionMs)
|
| 341 |
+
}).degradeForFireStore()
|
| 342 |
+
).catch((r) => {
|
| 343 |
+
this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
|
| 344 |
+
});
|
| 345 |
+
}
|
| 346 |
} catch (err) {
|
| 347 |
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
|
| 348 |
throw err;
|