nomagick committed · commit 9ac4060 · unverified · 1 parent: 62ccacf

fix: bulk fix multiple issues

backend/functions/src/cloud-functions/crawler.ts CHANGED
@@ -19,7 +19,7 @@ import { randomUUID } from 'crypto';
 import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
 
 import { countGPTToken as estimateToken } from '../shared/utils/openai';
-import { CrawlerOptions } from '../dto/scrapping-options';
+import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
 import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
 import { PDFExtractor } from '../services/pdf-extract';
 
@@ -230,7 +230,9 @@ export class CrawlerHost extends RPCHost {
 
         let pdfMode = false;
         if (snapshot.pdfs?.length && !snapshot.title) {
-            const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0]);
+            const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
+                this.threadLocal.get('cacheTolerance')
+            );
             if (pdf) {
                 pdfMode = true;
                 snapshot.title = pdf.meta?.Title;
@@ -432,7 +434,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         runtime: {
             memory: '4GiB',
             timeoutSeconds: 300,
-            concurrency: 4,
+            concurrency: 22,
         },
         tags: ['Crawler'],
         httpMethod: ['get', 'post'],
@@ -442,9 +444,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
     @CloudHTTPv2({
         runtime: {
             memory: '4GiB',
-            cpu: 2,
+            cpu: 4,
             timeoutSeconds: 300,
-            concurrency: 11,
+            concurrency: 22,
             maxInstances: 455,
         },
         openapi: {
@@ -543,11 +545,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
             res: Response,
         },
         auth: JinaEmbeddingsAuthDTO,
-        crawlerOptions: CrawlerOptions,
+        crawlerOptions: CrawlerOptionsHeaderOnly,
     ) {
         const uid = await auth.solveUID();
         let chargeAmount = 0;
-        const noSlashURL = ctx.req.url.slice(1).trimStart();
+        const noSlashURL = ctx.req.url.slice(1);
         if (!noSlashURL) {
             const latestUser = uid ? await auth.assertUser() : undefined;
             if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
@@ -911,6 +913,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt);
         this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
         this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
+        this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
 
         const crawlOpts: ExtraScrappingOptions = {
             proxyUrl: opts.proxyUrl,
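Together with the scrapping-options.ts hunks further down, these hunks thread a per-request cache tolerance from the `x-cache-tolerance` header through the request-scoped threadLocal store into the PDF extractor. A minimal sketch of that flow, with a plain Map standing in for the framework's threadLocal (hypothetical; the real store comes from the RPC host):

    // Hypothetical stand-in for the RPC framework's per-request store.
    const threadLocal = new Map<string, unknown>();

    // During option parsing, 'x-cache-tolerance' arrives in seconds and is
    // scaled to milliseconds (see the scrapping-options.ts hunk below).
    const headerValue = '3600'; // illustrative value: one hour
    threadLocal.set('cacheTolerance', parseInt(headerValue, 10) * 1000);

    // Later, when a PDF snapshot is formatted, the crawler forwards it:
    // const pdf = await this.pdfExtractor.cachedExtract(
    //     snapshot.pdfs[0],
    //     this.threadLocal.get('cacheTolerance'),
    // );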
backend/functions/src/cloud-functions/data-crunching.ts CHANGED
@@ -118,27 +118,10 @@ export class DataCrunchingHost extends RPCHost {
         },
         tags: ['DataCrunching'],
     })
-    // @CloudHTTPv2({
-    //     runtime: {
-    //         cpu: 2,
-    //         memory: '4GiB',
-    //         timeoutSeconds: 3600,
-    //         concurrency: 2,
-    //         maxInstances: 200,
-    //     },
-    //     tags: ['DataCrunching'],
-    // })
-    async dispatchPageCacheCrunching(
-        @RPCReflect() rpcReflect: RPCReflection,
-    ) {
-        const sse = new OutputServerEventStream({ highWaterMark: 4096 });
-        rpcReflect.return(sse);
-        rpcReflect.catch((err) => {
-            sse.end({ data: `Error: ${err.message}` });
-        });
+    async dispatchPageCacheCrunching() {
         for await (const { fileName, date, offset } of this.iterPageCacheChunks()) {
             this.logger.info(`Dispatching ${fileName}...`);
-            sse.write({ data: `Dispatching ${fileName}...` });
+            // sse.write({ data: `Dispatching ${fileName}...` });
 
             await getFunctions().taskQueue('crunchPageCacheWorker').enqueue({ date, offset }, {
                 dispatchDeadlineSeconds: 1800,
@@ -146,12 +129,42 @@
             });
         }
 
-        sse.end({ data: 'done' });
-        sse.resume();
-
         return true;
     }
 
+    // @CloudHTTPv2({
+    //     runtime: {
+    //         cpu: 2,
+    //         memory: '4GiB',
+    //         timeoutSeconds: 3600,
+    //         concurrency: 2,
+    //         maxInstances: 200,
+    //     },
+    //     tags: ['DataCrunching'],
+    // })
+    // async dispatchPageCacheCrunching(
+    //     @RPCReflect() rpcReflect: RPCReflection
+    // ) {
+    //     const sse = new OutputServerEventStream({ highWaterMark: 4096 });
+    //     rpcReflect.return(sse);
+    //     rpcReflect.catch((err) => {
+    //         sse.end({ data: `Error: ${err.message}` });
+    //     });
+    //     for await (const { fileName, date, offset } of this.iterPageCacheChunks()) {
+    //         this.logger.info(`Dispatching ${fileName}...`);
+    //         sse.write({ data: `Dispatching ${fileName}...` });
+
+    //         await getFunctions().taskQueue('crunchPageCacheWorker').enqueue({ date, offset }, {
+    //             dispatchDeadlineSeconds: 1800,
+    //             uri: await getFunctionUrl('crunchPageCacheWorker'),
+    //         });
+    //     }
+
+    //     sse.end({ data: 'done' });
+
+    //     return true;
+    // }
+
     async* iterPageCacheRecords(date?: string, inputOffset?: number | string) {
         const startOfToday = dayjs().utc().startOf('day');
         const startingPoint = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day');
@@ -234,8 +247,6 @@
             if (nRecords) {
                 yield { fileName, date: theDay.toISOString(), offset };
             }
-
-            continue;
         }
     }
 
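These hunks reduce the dispatcher to a plain task-queue fan-out: one Cloud Tasks job per page-cache chunk, with the SSE progress stream dropped (the old streaming version survives only as a commented-out block). A sketch of the remaining pattern, using the same firebase-admin Functions API the hunk calls; the standalone wrapper is illustrative:

    import { getFunctions } from 'firebase-admin/functions';

    // Enqueue one worker task per chunk; no response stream is held open,
    // so the function can return as soon as everything is queued.
    async function dispatchChunks(
        chunks: AsyncIterable<{ fileName: string; date: string; offset: number; }>,
    ) {
        for await (const { fileName, date, offset } of chunks) {
            console.info(`Dispatching ${fileName}...`);
            await getFunctions().taskQueue('crunchPageCacheWorker').enqueue(
                { date, offset },
                { dispatchDeadlineSeconds: 1800 }, // the real code also sets `uri`
            );
        }
        return true;
    }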
backend/functions/src/cloud-functions/searcher.ts CHANGED
@@ -53,6 +53,7 @@ export class SearcherHost extends RPCHost {
     @CloudHTTPv2({
         name: 'search2',
         runtime: {
+            cpu: 4,
             memory: '4GiB',
             timeoutSeconds: 300,
             concurrency: 4,
@@ -64,10 +65,10 @@ export class SearcherHost extends RPCHost {
     })
     @CloudHTTPv2({
         runtime: {
-            cpu: 4,
+            cpu: 8,
             memory: '8GiB',
             timeoutSeconds: 300,
-            concurrency: 4,
+            concurrency: 6,
             maxInstances: 200,
         },
         openapi: {
@@ -265,28 +266,40 @@ export class SearcherHost extends RPCHost {
         let lastScrapped: any[] | undefined;
         let earlyReturn = false;
         if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
-            const earlyReturnTimer = setTimeout(() => {
-                if (!lastScrapped) {
+            let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
+            const setEarlyReturnTimer = () => {
+                if (earlyReturnTimer) {
                     return;
                 }
-                chargeAmount = this.getChargeAmount(lastScrapped);
-                rpcReflect.return(lastScrapped);
-                earlyReturn = true;
-            }, this.reasonableDelayMs);
+                earlyReturnTimer = setTimeout(() => {
+                    if (!lastScrapped) {
+                        return;
+                    }
+                    chargeAmount = this.getChargeAmount(lastScrapped);
+                    rpcReflect.return(lastScrapped);
+                    earlyReturn = true;
+                }, this.reasonableDelayMs);
+            };
 
             for await (const scrapped of it) {
                 lastScrapped = scrapped;
-
+                if (_.some(scrapped, (x) => this.pageQualified(x))) {
+                    setEarlyReturnTimer();
+                }
                 if (!this.searchResultsQualified(scrapped)) {
                     continue;
                 }
-                clearTimeout(earlyReturnTimer);
+                if (earlyReturnTimer) {
+                    clearTimeout(earlyReturnTimer);
+                }
                 chargeAmount = this.getChargeAmount(scrapped);
 
                 return scrapped;
             }
 
-            clearTimeout(earlyReturnTimer);
+            if (earlyReturnTimer) {
+                clearTimeout(earlyReturnTimer);
+            }
 
             if (!lastScrapped) {
                 throw new AssertionFailureError(`No content available for query ${searchQuery}`);
@@ -299,29 +312,44 @@ export class SearcherHost extends RPCHost {
             return lastScrapped;
         }
 
-        const earlyReturnTimer = setTimeout(() => {
-            if (!lastScrapped) {
+        let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
+        const setEarlyReturnTimer = () => {
+            if (earlyReturnTimer) {
                 return;
             }
-            chargeAmount = this.getChargeAmount(lastScrapped);
-            rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
-            earlyReturn = true;
-        }, this.reasonableDelayMs);
+            earlyReturnTimer = setTimeout(() => {
+                if (!lastScrapped) {
+                    return;
+                }
+                chargeAmount = this.getChargeAmount(lastScrapped);
+                rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
+                earlyReturn = true;
+            }, this.reasonableDelayMs);
+        };
 
         for await (const scrapped of it) {
             lastScrapped = scrapped;
 
+            if (_.some(scrapped, (x) => this.pageQualified(x))) {
+                setEarlyReturnTimer();
+            }
+
             if (!this.searchResultsQualified(scrapped)) {
                 continue;
            }
 
-            clearTimeout(earlyReturnTimer);
+            if (earlyReturnTimer) {
+                clearTimeout(earlyReturnTimer);
+            }
+
             chargeAmount = this.getChargeAmount(scrapped);
 
             return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
         }
 
-        clearTimeout(earlyReturnTimer);
+        if (earlyReturnTimer) {
+            clearTimeout(earlyReturnTimer);
+        }
 
         if (!lastScrapped) {
             throw new AssertionFailureError(`No content available for query ${searchQuery}`);
@@ -331,7 +359,6 @@ export class SearcherHost extends RPCHost {
             chargeAmount = this.getChargeAmount(lastScrapped);
         }
 
-
         return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
     }
 
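The behavioral core of this change: the early-return timer is no longer armed unconditionally before the scrape loop, but lazily, the first time any scraped page qualifies, so the partial-result flush can never fire while there is nothing worth returning. A reduced sketch of the pattern (names follow the hunk; the surrounding RPC machinery is elided):

    // Arm-once timer: the first qualified page starts the countdown;
    // subsequent calls are no-ops, preserving the original deadline.
    let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;

    function setEarlyReturnTimer(flushPartial: () => void, delayMs: number) {
        if (earlyReturnTimer) {
            return;
        }
        earlyReturnTimer = setTimeout(flushPartial, delayMs);
    }

    function clearEarlyReturnTimer() {
        if (earlyReturnTimer) {
            clearTimeout(earlyReturnTimer);
        }
    }

    // Per iteration: arm only once something is worth flushing,
    // and clear as soon as a fully qualified result is returned.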
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -52,44 +52,44 @@ export class CrawlerOptions extends AutoCastable {
         const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
             req: Request,
             res: Response,
-        };
+        } | undefined;
 
-        const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format');
+        const customMode = ctx?.req.get('x-respond-with') || ctx?.req.get('x-return-format');
         if (customMode !== undefined) {
             instance.respondWith = customMode;
         }
 
-        const withGeneratedAlt = ctx.req.get('x-with-generated-alt');
+        const withGeneratedAlt = ctx?.req.get('x-with-generated-alt');
         if (withGeneratedAlt !== undefined) {
             instance.withGeneratedAlt = Boolean(withGeneratedAlt);
         }
-        const withLinksSummary = ctx.req.get('x-with-links-summary');
+        const withLinksSummary = ctx?.req.get('x-with-links-summary');
         if (withLinksSummary !== undefined) {
             instance.withLinksSummary = Boolean(withLinksSummary);
         }
-        const withImagesSummary = ctx.req.get('x-with-images-summary');
+        const withImagesSummary = ctx?.req.get('x-with-images-summary');
         if (withImagesSummary !== undefined) {
             instance.withImagesSummary = Boolean(withImagesSummary);
         }
-        const noCache = ctx.req.get('x-no-cache');
+        const noCache = ctx?.req.get('x-no-cache');
         if (noCache !== undefined) {
             instance.noCache = Boolean(noCache);
-            if (instance.noCache && instance.cacheTolerance === undefined) {
-                instance.cacheTolerance = 0;
-            }
         }
-        let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '');
+        if (instance.noCache && instance.cacheTolerance === undefined) {
+            instance.cacheTolerance = 0;
+        }
+        let cacheTolerance = parseInt(ctx?.req.get('x-cache-tolerance') || '');
         if (!isNaN(cacheTolerance)) {
             instance.cacheTolerance = cacheTolerance;
         }
 
-        const targetSelector = ctx.req.get('x-target-selector');
+        const targetSelector = ctx?.req.get('x-target-selector');
         instance.targetSelector ??= targetSelector;
-        const waitForSelector = ctx.req.get('x-wait-for-selector');
+        const waitForSelector = ctx?.req.get('x-wait-for-selector');
         instance.waitForSelector ??= waitForSelector || instance.targetSelector;
 
         const cookies: CookieParam[] = [];
-        const setCookieHeaders = ctx.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
+        const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
         if (Array.isArray(setCookieHeaders)) {
             for (const setCookie of setCookieHeaders) {
                 cookies.push({
@@ -102,9 +102,23 @@
                 });
             }
 
-        const proxyUrl = ctx.req.get('x-proxy-url');
+        const proxyUrl = ctx?.req.get('x-proxy-url');
         instance.proxyUrl ??= proxyUrl;
 
+        if (instance.cacheTolerance) {
+            instance.cacheTolerance = instance.cacheTolerance * 1000;
+        }
+
+        return instance;
+    }
+}
+
+export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
+    static override from(input: any) {
+        const instance = super.from({
+            [RPC_CALL_ENVIRONMENT]: Reflect.get(input, RPC_CALL_ENVIRONMENT),
+        }) as CrawlerOptionsHeaderOnly;
+
         return instance;
     }
 }
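Two details in this hunk are easy to miss: every `ctx.req` access becomes optional-chained, so the parser also works when no HTTP environment is attached (exactly what `CrawlerOptionsHeaderOnly` relies on, rebuilding options from the RPC environment alone and ignoring any request body), and `cacheTolerance` is now read as seconds on the wire and scaled to milliseconds once at the end of `from()`. A restatement of the unit handling (illustrative, not the library code):

    function parseCacheToleranceSeconds(header: string | undefined): number | undefined {
        const seconds = parseInt(header ?? '', 10);
        if (isNaN(seconds)) {
            return undefined; // header absent or not numeric
        }
        return seconds * 1000; // stored internally in milliseconds
    }

    parseCacheToleranceSeconds('3600');    // -> 3_600_000 (one hour)
    parseCacheToleranceSeconds(undefined); // -> undefined (no header sent)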
backend/functions/src/index.ts CHANGED
@@ -13,6 +13,7 @@ Object.assign(exports, registry.exportGrouped({
     memory: '4GiB',
     timeoutSeconds: 540,
 }));
+registry.allHandsOnDeck().catch(() => void 0);
 registry.title = 'reader';
 registry.version = '0.1.0';
 
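`allHandsOnDeck()` appears to eagerly initialize every registered service at cold start; chaining `.catch(() => void 0)` makes the warm-up fire-and-forget, so a failed initialization surfaces later through normal request handling instead of as an unhandled rejection. The generic shape of that pattern:

    // Fire-and-forget warm-up: start the work, don't await it, and swallow
    // the rejection so no unhandledRejection is raised at process level.
    function warmUpInBackground(init: () => Promise<unknown>): void {
        init().catch(() => void 0);
    }

    warmUpInBackground(async () => {
        // e.g. open DB connections, launch the browser pool (illustrative)
    });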
backend/functions/src/services/pdf-extract.ts CHANGED
@@ -6,6 +6,8 @@ import { AsyncService, HashManager } from 'civkit';
 import { Logger } from '../shared/services/logger';
 import { PDFContent } from '../db/pdf';
 import dayjs from 'dayjs';
+import { FirebaseStorageBucketControl } from '../shared';
+import { randomUUID } from 'crypto';
 const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
 dayjs.extend(utc); // Extend dayjs with the UTC plugin
 const timezone = require('dayjs/plugin/timezone');
@@ -46,6 +48,7 @@ export class PDFExtractor extends AsyncService {
 
     constructor(
         protected globalLogger: Logger,
+        protected firebaseObjectStorage: FirebaseStorageBucketControl,
     ) {
         super(...arguments);
     }
@@ -225,22 +228,46 @@ export class PDFExtractor extends AsyncService {
         return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
     }
 
-    async cachedExtract(url: string | URL) {
+    async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24) {
         if (!url) {
             return undefined;
         }
 
         const digest = md5Hasher.hash(url.toString());
-        const shortDigest = Buffer.from(digest, 'hex').toString('base64url');
 
-        const existing = await PDFContent.fromFirestore(shortDigest);
+        const cache: PDFContent | undefined = (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
 
-        if (existing) {
-            return {
-                meta: existing.meta,
-                content: existing.content,
-                text: existing.text
-            };
+        if (cache) {
+            const age = Date.now() - cache?.createdAt.valueOf();
+            const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
+            this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${url}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
+                url, digest, age, stale, cacheTolerance
+            });
+
+            if (!stale) {
+                if (cache.content && cache.text) {
+                    return {
+                        meta: cache.meta,
+                        content: cache.content,
+                        text: cache.text
+                    };
+                }
+
+                try {
+                    const r = await this.firebaseObjectStorage.downloadFile(`pdfs/${cache._id}`);
+                    let cached = JSON.parse(r.toString('utf-8'));
+
+                    return {
+                        meta: cached.meta,
+                        content: cached.content,
+                        text: cached.text
+                    };
+                } catch (err) {
+                    this.logger.warn(`Unable to load cached content for ${url}`, { err });
+
+                    return undefined;
+                }
+            }
         }
 
         let extracted;
@@ -253,14 +280,16 @@
 
         // Don't try again until the next day
         const expireMixin = extracted ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) };
+        const theID = randomUUID();
+        await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
+            Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
 
-        await PDFContent.COLLECTION.doc(shortDigest).set(
+        await PDFContent.COLLECTION.doc(theID).set(
             {
-                _id: shortDigest,
                 src: url.toString(),
                 meta: extracted?.meta || {},
-                content: extracted?.content || '',
                 text: extracted?.text || '',
+                content: extracted?.content || '',
                 urlDigest: digest,
                 createdAt: new Date(),
                 ...expireMixin
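The cache layout changes shape here: entries are keyed by a random UUID and found via a `urlDigest` query ordered by `createdAt` descending, the full extraction payload is mirrored to a storage object under `pdfs/<id>`, and freshness is judged against the caller-supplied tolerance (default 24 h). The staleness rule, restated as a standalone check on the assumption that `createdAt` is a JS Date:

    // An entry is stale once its age exceeds the tolerance (both in ms).
    function isStale(createdAt: Date, cacheToleranceMs: number, now = Date.now()): boolean {
        return createdAt.valueOf() < now - cacheToleranceMs;
    }

    const dayMs = 1000 * 3600 * 24; // default tolerance in cachedExtract()
    isStale(new Date(Date.now() - 25 * 3600 * 1000), dayMs); // true  -> re-extract
    isStale(new Date(Date.now() - 1 * 3600 * 1000), dayMs);  // false -> serve cached copy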
backend/functions/src/services/puppeteer.ts CHANGED
@@ -380,7 +380,7 @@ document.addEventListener('load', handlePageLoad);
     let screenshot: Buffer | undefined;
     const page = await this.getNextPage();
     const sn = this.snMap.get(page);
-    this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
+    this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
     if (options?.proxyUrl) {
         await page.useProxy(options.proxyUrl);
     }
thinapps-shared CHANGED
@@ -1 +1 @@
-Subproject commit b0b597800a36e2aa8ee3d52715aa7c998b388f47
+Subproject commit a3a13b13fbef8e9f5d388bde6fca6b459e6f92a6