Spaces:
Build error
Build error
feat: return usage tokens in json
Browse files
backend/functions/src/cloud-functions/crawler.ts
CHANGED
|
@@ -49,6 +49,11 @@ export interface FormattedPage {
|
|
| 49 |
pageshot?: Buffer;
|
| 50 |
links?: { [k: string]: string; };
|
| 51 |
images?: { [k: string]: string; };
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
toString: () => string;
|
| 54 |
}
|
|
@@ -743,7 +748,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 743 |
}
|
| 744 |
|
| 745 |
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
|
| 746 |
-
chargeAmount = this.
|
| 747 |
sseStream.write({
|
| 748 |
event: 'data',
|
| 749 |
data: formatted,
|
|
@@ -771,7 +776,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 771 |
}
|
| 772 |
|
| 773 |
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
|
| 774 |
-
chargeAmount = this.
|
| 775 |
|
| 776 |
if (crawlerOptions.timeout === undefined) {
|
| 777 |
return formatted;
|
|
@@ -783,7 +788,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 783 |
}
|
| 784 |
|
| 785 |
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
|
| 786 |
-
chargeAmount = this.
|
| 787 |
|
| 788 |
return formatted;
|
| 789 |
}
|
|
@@ -795,7 +800,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 795 |
}
|
| 796 |
|
| 797 |
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
|
| 798 |
-
chargeAmount = this.
|
| 799 |
|
| 800 |
if (crawlerOptions.timeout === undefined) {
|
| 801 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
|
@@ -820,7 +825,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 820 |
}
|
| 821 |
|
| 822 |
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
|
| 823 |
-
chargeAmount = this.
|
| 824 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 825 |
|
| 826 |
return assignTransferProtocolMeta(`${formatted}`,
|
|
@@ -1005,25 +1010,31 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
| 1005 |
}
|
| 1006 |
}
|
| 1007 |
|
| 1008 |
-
|
| 1009 |
if (!formatted) {
|
| 1010 |
return undefined;
|
| 1011 |
}
|
| 1012 |
|
| 1013 |
const textContent = formatted?.content || formatted?.description || formatted?.text || formatted?.html;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1014 |
|
| 1015 |
-
|
| 1016 |
-
return estimateToken(textContent);
|
| 1017 |
-
}
|
| 1018 |
|
| 1019 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1020 |
|
| 1021 |
-
|
| 1022 |
-
// OpenAI image token count for 1024x1024 image
|
| 1023 |
-
return 765;
|
| 1024 |
-
}
|
| 1025 |
|
| 1026 |
-
return
|
| 1027 |
}
|
| 1028 |
|
| 1029 |
|
|
|
|
| 49 |
pageshot?: Buffer;
|
| 50 |
links?: { [k: string]: string; };
|
| 51 |
images?: { [k: string]: string; };
|
| 52 |
+
usage?: {
|
| 53 |
+
total_tokens?: number;
|
| 54 |
+
totalTokens?: number;
|
| 55 |
+
tokens?: number;
|
| 56 |
+
};
|
| 57 |
|
| 58 |
toString: () => string;
|
| 59 |
}
|
|
|
|
| 748 |
}
|
| 749 |
|
| 750 |
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
|
| 751 |
+
chargeAmount = this.assignChargeAmount(formatted);
|
| 752 |
sseStream.write({
|
| 753 |
event: 'data',
|
| 754 |
data: formatted,
|
|
|
|
| 776 |
}
|
| 777 |
|
| 778 |
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
|
| 779 |
+
chargeAmount = this.assignChargeAmount(formatted);
|
| 780 |
|
| 781 |
if (crawlerOptions.timeout === undefined) {
|
| 782 |
return formatted;
|
|
|
|
| 788 |
}
|
| 789 |
|
| 790 |
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
|
| 791 |
+
chargeAmount = this.assignChargeAmount(formatted);
|
| 792 |
|
| 793 |
return formatted;
|
| 794 |
}
|
|
|
|
| 800 |
}
|
| 801 |
|
| 802 |
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, urlToCrawl);
|
| 803 |
+
chargeAmount = this.assignChargeAmount(formatted);
|
| 804 |
|
| 805 |
if (crawlerOptions.timeout === undefined) {
|
| 806 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
|
|
|
| 825 |
}
|
| 826 |
|
| 827 |
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, urlToCrawl);
|
| 828 |
+
chargeAmount = this.assignChargeAmount(formatted);
|
| 829 |
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
| 830 |
|
| 831 |
return assignTransferProtocolMeta(`${formatted}`,
|
|
|
|
| 1010 |
}
|
| 1011 |
}
|
| 1012 |
|
| 1013 |
+
/**
 * Works out the token charge for a formatted page and records it on the
 * page itself as `usage.tokens`.
 *
 * - If the page has textual content (the first truthy value among
 *   `content`, `description`, `text`, `html`), the charge is
 *   `estimateToken()` of that text.
 * - Otherwise, if the page has a `screenshotUrl` or `screenshot`, a flat
 *   image charge is used.
 * - Otherwise the charge stays `undefined`; `usage.tokens` is still
 *   assigned (as `undefined`).
 *
 * @param formatted - The page to charge for. It is mutated: a `usage`
 *   object is assigned onto it.
 * @returns The charge that was recorded, or `undefined` when `formatted`
 *   is falsy or nothing chargeable was found.
 */
assignChargeAmount(formatted: FormattedPage) {
    if (!formatted) {
        return undefined;
    }

    const text = formatted.content || formatted.description || formatted.text || formatted.html;

    let tokens;
    if (typeof text === 'string') {
        tokens = estimateToken(text);
    } else if (formatted.screenshotUrl || formatted.screenshot) {
        // OpenAI image token count for 1024x1024 image
        tokens = 765;
    }

    // Attach the figure to the page object that callers go on to return.
    Object.assign(formatted, { usage: { tokens } });

    return tokens;
}
|
| 1039 |
|
| 1040 |
|
backend/functions/src/cloud-functions/searcher.ts
CHANGED
|
@@ -178,7 +178,7 @@ export class SearcherHost extends RPCHost {
|
|
| 178 |
continue;
|
| 179 |
}
|
| 180 |
|
| 181 |
-
chargeAmount = this.
|
| 182 |
sseStream.write({
|
| 183 |
event: 'data',
|
| 184 |
data: scrapped,
|
|
@@ -211,7 +211,7 @@ export class SearcherHost extends RPCHost {
|
|
| 211 |
if (!lastScrapped) {
|
| 212 |
return;
|
| 213 |
}
|
| 214 |
-
chargeAmount = this.
|
| 215 |
rpcReflect.return(lastScrapped);
|
| 216 |
earlyReturn = true;
|
| 217 |
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
|
@@ -228,7 +228,7 @@ export class SearcherHost extends RPCHost {
|
|
| 228 |
if (earlyReturnTimer) {
|
| 229 |
clearTimeout(earlyReturnTimer);
|
| 230 |
}
|
| 231 |
-
chargeAmount = this.
|
| 232 |
|
| 233 |
return scrapped;
|
| 234 |
}
|
|
@@ -242,7 +242,7 @@ export class SearcherHost extends RPCHost {
|
|
| 242 |
}
|
| 243 |
|
| 244 |
if (!earlyReturn) {
|
| 245 |
-
chargeAmount = this.
|
| 246 |
}
|
| 247 |
|
| 248 |
return lastScrapped;
|
|
@@ -257,7 +257,7 @@ export class SearcherHost extends RPCHost {
|
|
| 257 |
if (!lastScrapped) {
|
| 258 |
return;
|
| 259 |
}
|
| 260 |
-
chargeAmount = this.
|
| 261 |
rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
|
| 262 |
earlyReturn = true;
|
| 263 |
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
|
@@ -278,7 +278,7 @@ export class SearcherHost extends RPCHost {
|
|
| 278 |
clearTimeout(earlyReturnTimer);
|
| 279 |
}
|
| 280 |
|
| 281 |
-
chargeAmount = this.
|
| 282 |
|
| 283 |
return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
|
| 284 |
}
|
|
@@ -292,7 +292,7 @@ export class SearcherHost extends RPCHost {
|
|
| 292 |
}
|
| 293 |
|
| 294 |
if (!earlyReturn) {
|
| 295 |
-
chargeAmount = this.
|
| 296 |
}
|
| 297 |
|
| 298 |
return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
|
|
@@ -423,9 +423,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
|
| 423 |
return resultArray;
|
| 424 |
}
|
| 425 |
|
| 426 |
-
|
| 427 |
return _.sum(
|
| 428 |
-
formatted.map((x) => this.crawler.
|
| 429 |
);
|
| 430 |
}
|
| 431 |
|
|
|
|
| 178 |
continue;
|
| 179 |
}
|
| 180 |
|
| 181 |
+
chargeAmount = this.assignChargeAmount(scrapped);
|
| 182 |
sseStream.write({
|
| 183 |
event: 'data',
|
| 184 |
data: scrapped,
|
|
|
|
| 211 |
if (!lastScrapped) {
|
| 212 |
return;
|
| 213 |
}
|
| 214 |
+
chargeAmount = this.assignChargeAmount(lastScrapped);
|
| 215 |
rpcReflect.return(lastScrapped);
|
| 216 |
earlyReturn = true;
|
| 217 |
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
|
|
|
| 228 |
if (earlyReturnTimer) {
|
| 229 |
clearTimeout(earlyReturnTimer);
|
| 230 |
}
|
| 231 |
+
chargeAmount = this.assignChargeAmount(scrapped);
|
| 232 |
|
| 233 |
return scrapped;
|
| 234 |
}
|
|
|
|
| 242 |
}
|
| 243 |
|
| 244 |
if (!earlyReturn) {
|
| 245 |
+
chargeAmount = this.assignChargeAmount(lastScrapped);
|
| 246 |
}
|
| 247 |
|
| 248 |
return lastScrapped;
|
|
|
|
| 257 |
if (!lastScrapped) {
|
| 258 |
return;
|
| 259 |
}
|
| 260 |
+
chargeAmount = this.assignChargeAmount(lastScrapped);
|
| 261 |
rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
|
| 262 |
earlyReturn = true;
|
| 263 |
}, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
|
|
|
|
| 278 |
clearTimeout(earlyReturnTimer);
|
| 279 |
}
|
| 280 |
|
| 281 |
+
chargeAmount = this.assignChargeAmount(scrapped);
|
| 282 |
|
| 283 |
return assignTransferProtocolMeta(`${scrapped}`, { contentType: 'text/plain', envelope: null });
|
| 284 |
}
|
|
|
|
| 292 |
}
|
| 293 |
|
| 294 |
if (!earlyReturn) {
|
| 295 |
+
chargeAmount = this.assignChargeAmount(lastScrapped);
|
| 296 |
}
|
| 297 |
|
| 298 |
return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
|
|
|
|
| 423 |
return resultArray;
|
| 424 |
}
|
| 425 |
|
| 426 |
+
/**
 * Computes the total charge for a list of formatted pages.
 *
 * Each page is passed to the crawler's `assignChargeAmount`; a falsy
 * result for a page counts as 0 towards the total.
 *
 * @param formatted - The pages to charge for.
 * @returns The sum of the per-page charges.
 */
assignChargeAmount(formatted: FormattedPage[]) {
    const perPageTokens = formatted.map((page) => {
        const tokens = this.crawler.assignChargeAmount(page);

        return tokens || 0;
    });

    return _.sum(perPageTokens);
}
|
| 431 |
|