Spaces:
Build error
Build error
saas: save cache in batch
Browse files
- src/api/crawler.ts +29 -5
- src/api/searcher.ts +24 -3
- src/api/serp.ts +23 -3
- thinapps-shared +1 -1
src/api/crawler.ts
CHANGED
|
@@ -82,6 +82,8 @@ export class CrawlerHost extends RPCHost {
|
|
| 82 |
abuseBlockMs = 1000 * 3600;
|
| 83 |
domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
|
| 84 |
|
|
|
|
|
|
|
| 85 |
constructor(
|
| 86 |
protected globalLogger: GlobalLogger,
|
| 87 |
protected puppeteerControl: PuppeteerControl,
|
|
@@ -152,6 +154,27 @@ export class CrawlerHost extends RPCHost {
|
|
| 152 |
});
|
| 153 |
|
| 154 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
}
|
| 156 |
|
| 157 |
override async init() {
|
|
@@ -633,13 +656,14 @@ export class CrawlerHost extends RPCHost {
|
|
| 633 |
cache.pageshotAvailable = true;
|
| 634 |
}
|
| 635 |
await savingOfSnapshot;
|
| 636 |
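-
const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
|
| 637 |
-
this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
|
|
|
| 638 |
|
| 639 |
-
return undefined;
|
| 640 |
-
});
|
| 641 |
|
| 642 |
-
return r;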
|
| 643 |
}
|
| 644 |
|
| 645 |
async *iterSnapshots(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
|
|
|
| 82 |
abuseBlockMs = 1000 * 3600;
|
| 83 |
domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
|
| 84 |
|
| 85 |
+
batchedCaches: Crawled[] = [];
|
| 86 |
+
|
| 87 |
constructor(
|
| 88 |
protected globalLogger: GlobalLogger,
|
| 89 |
protected puppeteerControl: PuppeteerControl,
|
|
|
|
| 154 |
});
|
| 155 |
|
| 156 |
});
|
| 157 |
+
|
| 158 |
+
setInterval(() => {
|
| 159 |
+
const thisBatch = this.batchedCaches;
|
| 160 |
+
this.batchedCaches = [];
|
| 161 |
+
if (!thisBatch.length) {
|
| 162 |
+
return;
|
| 163 |
+
}
|
| 164 |
+
const batch = Crawled.DB.batch();
|
| 165 |
+
|
| 166 |
+
for (const x of thisBatch) {
|
| 167 |
+
batch.set(Crawled.COLLECTION.doc(x._id), x.degradeForFireStore(), { merge: true });
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
batch.commit()
|
| 171 |
+
.then(() => {
|
| 172 |
+
this.logger.debug(`Saved ${thisBatch.length} caches by batch`);
|
| 173 |
+
})
|
| 174 |
+
.catch((err) => {
|
| 175 |
+
this.logger.warn(`Failed to save cache in batch`, { err });
|
| 176 |
+
});
|
| 177 |
+
}, 1000 * 10 + Math.round(1000 * Math.random())).unref();
|
| 178 |
}
|
| 179 |
|
| 180 |
override async init() {
|
|
|
|
| 656 |
cache.pageshotAvailable = true;
|
| 657 |
}
|
| 658 |
await savingOfSnapshot;
|
| 659 |
+
this.batchedCaches.push(cache);
|
| 660 |
+
// const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
|
| 661 |
+
// this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
| 662 |
|
| 663 |
+
// return undefined;
|
| 664 |
+
// });
|
| 665 |
|
| 666 |
+
return cache;
|
| 667 |
}
|
| 668 |
|
| 669 |
async *iterSnapshots(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
src/api/searcher.ts
CHANGED
|
@@ -61,6 +61,8 @@ export class SearcherHost extends RPCHost {
|
|
| 61 |
updateAgeOnHas: false,
|
| 62 |
});
|
| 63 |
|
|
|
|
|
|
|
| 64 |
constructor(
|
| 65 |
protected globalLogger: GlobalLogger,
|
| 66 |
protected rateLimitControl: RateLimitControl,
|
|
@@ -72,6 +74,26 @@ export class SearcherHost extends RPCHost {
|
|
| 72 |
protected jinaSerp: InternalJinaSerpService,
|
| 73 |
) {
|
| 74 |
super(...arguments);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
}
|
| 76 |
|
| 77 |
override async init() {
|
|
@@ -780,9 +802,8 @@ export class SearcherHost extends RPCHost {
|
|
| 780 |
createdAt: nowDate,
|
| 781 |
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
|
| 782 |
});
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
});
|
| 786 |
} else if (lastError) {
|
| 787 |
throw lastError;
|
| 788 |
}
|
|
|
|
| 61 |
updateAgeOnHas: false,
|
| 62 |
});
|
| 63 |
|
| 64 |
+
batchedCaches: SERPResult[] = [];
|
| 65 |
+
|
| 66 |
constructor(
|
| 67 |
protected globalLogger: GlobalLogger,
|
| 68 |
protected rateLimitControl: RateLimitControl,
|
|
|
|
| 74 |
protected jinaSerp: InternalJinaSerpService,
|
| 75 |
) {
|
| 76 |
super(...arguments);
|
| 77 |
+
|
| 78 |
+
setInterval(() => {
|
| 79 |
+
const thisBatch = this.batchedCaches;
|
| 80 |
+
this.batchedCaches = [];
|
| 81 |
+
if (!thisBatch.length) {
|
| 82 |
+
return;
|
| 83 |
+
}
|
| 84 |
+
const batch = SERPResult.DB.batch();
|
| 85 |
+
|
| 86 |
+
for (const x of thisBatch) {
|
| 87 |
+
batch.set(SERPResult.COLLECTION.doc(), x.degradeForFireStore());
|
| 88 |
+
}
|
| 89 |
+
batch.commit()
|
| 90 |
+
.then(() => {
|
| 91 |
+
this.logger.debug(`Saved ${thisBatch.length} caches by batch`);
|
| 92 |
+
})
|
| 93 |
+
.catch((err) => {
|
| 94 |
+
this.logger.warn(`Failed to cache search result in batch`, { err });
|
| 95 |
+
});
|
| 96 |
+
}, 1000 * 60 * 10 + Math.round(1000 * Math.random())).unref();
|
| 97 |
}
|
| 98 |
|
| 99 |
override async init() {
|
|
|
|
| 802 |
createdAt: nowDate,
|
| 803 |
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
|
| 804 |
});
|
| 805 |
+
|
| 806 |
+
this.batchedCaches.push(record);
|
|
|
|
| 807 |
} else if (lastError) {
|
| 808 |
throw lastError;
|
| 809 |
}
|
src/api/serp.ts
CHANGED
|
@@ -63,6 +63,8 @@ export class SerpHost extends RPCHost {
|
|
| 63 |
updateAgeOnHas: false,
|
| 64 |
});
|
| 65 |
|
|
|
|
|
|
|
| 66 |
async getIndex(ctx: Context, auth?: JinaEmbeddingsAuthDTO) {
|
| 67 |
const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
|
| 68 |
Object.assign(indexObject, {
|
|
@@ -92,6 +94,26 @@ export class SerpHost extends RPCHost {
|
|
| 92 |
protected serperBing: SerperBingSearchService,
|
| 93 |
) {
|
| 94 |
super(...arguments);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
}
|
| 96 |
|
| 97 |
override async init() {
|
|
@@ -516,9 +538,7 @@ export class SerpHost extends RPCHost {
|
|
| 516 |
createdAt: nowDate,
|
| 517 |
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
|
| 518 |
});
|
| 519 |
-
|
| 520 |
-
this.logger.warn(`Failed to cache search result`, { err });
|
| 521 |
-
});
|
| 522 |
} else if (lastError) {
|
| 523 |
throw lastError;
|
| 524 |
}
|
|
|
|
| 63 |
updateAgeOnHas: false,
|
| 64 |
});
|
| 65 |
|
| 66 |
+
batchedCaches: SERPResult[] = [];
|
| 67 |
+
|
| 68 |
async getIndex(ctx: Context, auth?: JinaEmbeddingsAuthDTO) {
|
| 69 |
const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
|
| 70 |
Object.assign(indexObject, {
|
|
|
|
| 94 |
protected serperBing: SerperBingSearchService,
|
| 95 |
) {
|
| 96 |
super(...arguments);
|
| 97 |
+
|
| 98 |
+
setInterval(() => {
|
| 99 |
+
const thisBatch = this.batchedCaches;
|
| 100 |
+
this.batchedCaches = [];
|
| 101 |
+
if (!thisBatch.length) {
|
| 102 |
+
return;
|
| 103 |
+
}
|
| 104 |
+
const batch = SERPResult.DB.batch();
|
| 105 |
+
|
| 106 |
+
for (const x of thisBatch) {
|
| 107 |
+
batch.set(SERPResult.COLLECTION.doc(), x.degradeForFireStore());
|
| 108 |
+
}
|
| 109 |
+
batch.commit()
|
| 110 |
+
.then(() => {
|
| 111 |
+
this.logger.debug(`Saved ${thisBatch.length} caches by batch`);
|
| 112 |
+
})
|
| 113 |
+
.catch((err) => {
|
| 114 |
+
this.logger.warn(`Failed to cache search result in batch`, { err });
|
| 115 |
+
});
|
| 116 |
+
}, 1000 * 60 * 10 + Math.round(1000 * Math.random())).unref();
|
| 117 |
}
|
| 118 |
|
| 119 |
override async init() {
|
|
|
|
| 538 |
createdAt: nowDate,
|
| 539 |
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
|
| 540 |
});
|
| 541 |
+
this.batchedCaches.push(record);
|
|
|
|
|
|
|
| 542 |
} else if (lastError) {
|
| 543 |
throw lastError;
|
| 544 |
}
|
thinapps-shared
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit a23636b2161908eefd897b6976c10a5924e2cd57
|