nomagick committed on
Commit
131375b
·
unverified ·
1 Parent(s): 481d1a2

saas: save cache in batch

Browse files
Files changed (4) hide show
  1. src/api/crawler.ts +29 -5
  2. src/api/searcher.ts +24 -3
  3. src/api/serp.ts +23 -3
  4. thinapps-shared +1 -1
src/api/crawler.ts CHANGED
@@ -82,6 +82,8 @@ export class CrawlerHost extends RPCHost {
82
  abuseBlockMs = 1000 * 3600;
83
  domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
84
 
 
 
85
  constructor(
86
  protected globalLogger: GlobalLogger,
87
  protected puppeteerControl: PuppeteerControl,
@@ -152,6 +154,27 @@ export class CrawlerHost extends RPCHost {
152
  });
153
 
154
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  }
156
 
157
  override async init() {
@@ -633,13 +656,14 @@ export class CrawlerHost extends RPCHost {
633
  cache.pageshotAvailable = true;
634
  }
635
  await savingOfSnapshot;
636
- const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
637
- this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
 
638
 
639
- return undefined;
640
- });
641
 
642
- return r;
643
  }
644
 
645
  async *iterSnapshots(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
 
82
  abuseBlockMs = 1000 * 3600;
83
  domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
84
 
85
+ batchedCaches: Crawled[] = [];
86
+
87
  constructor(
88
  protected globalLogger: GlobalLogger,
89
  protected puppeteerControl: PuppeteerControl,
 
154
  });
155
 
156
  });
157
+
158
+ setInterval(() => {
159
+ const thisBatch = this.batchedCaches;
160
+ this.batchedCaches = [];
161
+ if (!thisBatch.length) {
162
+ return;
163
+ }
164
+ const batch = Crawled.DB.batch();
165
+
166
+ for (const x of thisBatch) {
167
+ batch.set(Crawled.COLLECTION.doc(x._id), x.degradeForFireStore(), { merge: true });
168
+ }
169
+
170
+ batch.commit()
171
+ .then(() => {
172
+ this.logger.debug(`Saved ${thisBatch.length} caches by batch`);
173
+ })
174
+ .catch((err) => {
175
+ this.logger.warn(`Failed to save cache in batch`, { err });
176
+ });
177
+ }, 1000 * 10 + Math.round(1000 * Math.random())).unref();
178
  }
179
 
180
  override async init() {
 
656
  cache.pageshotAvailable = true;
657
  }
658
  await savingOfSnapshot;
659
+ this.batchedCaches.push(cache);
660
+ // const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
661
+ // this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
662
 
663
+ // return undefined;
664
+ // });
665
 
666
+ return cache;
667
  }
668
 
669
  async *iterSnapshots(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
src/api/searcher.ts CHANGED
@@ -61,6 +61,8 @@ export class SearcherHost extends RPCHost {
61
  updateAgeOnHas: false,
62
  });
63
 
 
 
64
  constructor(
65
  protected globalLogger: GlobalLogger,
66
  protected rateLimitControl: RateLimitControl,
@@ -72,6 +74,26 @@ export class SearcherHost extends RPCHost {
72
  protected jinaSerp: InternalJinaSerpService,
73
  ) {
74
  super(...arguments);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  }
76
 
77
  override async init() {
@@ -780,9 +802,8 @@ export class SearcherHost extends RPCHost {
780
  createdAt: nowDate,
781
  expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
782
  });
783
- SERPResult.save(record.degradeForFireStore()).catch((err) => {
784
- this.logger.warn(`Failed to cache search result`, { err });
785
- });
786
  } else if (lastError) {
787
  throw lastError;
788
  }
 
61
  updateAgeOnHas: false,
62
  });
63
 
64
+ batchedCaches: SERPResult[] = [];
65
+
66
  constructor(
67
  protected globalLogger: GlobalLogger,
68
  protected rateLimitControl: RateLimitControl,
 
74
  protected jinaSerp: InternalJinaSerpService,
75
  ) {
76
  super(...arguments);
77
+
78
+ setInterval(() => {
79
+ const thisBatch = this.batchedCaches;
80
+ this.batchedCaches = [];
81
+ if (!thisBatch.length) {
82
+ return;
83
+ }
84
+ const batch = SERPResult.DB.batch();
85
+
86
+ for (const x of thisBatch) {
87
+ batch.set(SERPResult.COLLECTION.doc(), x.degradeForFireStore());
88
+ }
89
+ batch.commit()
90
+ .then(() => {
91
+ this.logger.debug(`Saved ${thisBatch.length} caches by batch`);
92
+ })
93
+ .catch((err) => {
94
+ this.logger.warn(`Failed to cache search result in batch`, { err });
95
+ });
96
+ }, 1000 * 60 * 10 + Math.round(1000 * Math.random())).unref();
97
  }
98
 
99
  override async init() {
 
802
  createdAt: nowDate,
803
  expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
804
  });
805
+
806
+ this.batchedCaches.push(record);
 
807
  } else if (lastError) {
808
  throw lastError;
809
  }
src/api/serp.ts CHANGED
@@ -63,6 +63,8 @@ export class SerpHost extends RPCHost {
63
  updateAgeOnHas: false,
64
  });
65
 
 
 
66
  async getIndex(ctx: Context, auth?: JinaEmbeddingsAuthDTO) {
67
  const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
68
  Object.assign(indexObject, {
@@ -92,6 +94,26 @@ export class SerpHost extends RPCHost {
92
  protected serperBing: SerperBingSearchService,
93
  ) {
94
  super(...arguments);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  }
96
 
97
  override async init() {
@@ -516,9 +538,7 @@ export class SerpHost extends RPCHost {
516
  createdAt: nowDate,
517
  expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
518
  });
519
- SERPResult.save(record.degradeForFireStore()).catch((err) => {
520
- this.logger.warn(`Failed to cache search result`, { err });
521
- });
522
  } else if (lastError) {
523
  throw lastError;
524
  }
 
63
  updateAgeOnHas: false,
64
  });
65
 
66
+ batchedCaches: SERPResult[] = [];
67
+
68
  async getIndex(ctx: Context, auth?: JinaEmbeddingsAuthDTO) {
69
  const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
70
  Object.assign(indexObject, {
 
94
  protected serperBing: SerperBingSearchService,
95
  ) {
96
  super(...arguments);
97
+
98
+ setInterval(() => {
99
+ const thisBatch = this.batchedCaches;
100
+ this.batchedCaches = [];
101
+ if (!thisBatch.length) {
102
+ return;
103
+ }
104
+ const batch = SERPResult.DB.batch();
105
+
106
+ for (const x of thisBatch) {
107
+ batch.set(SERPResult.COLLECTION.doc(), x.degradeForFireStore());
108
+ }
109
+ batch.commit()
110
+ .then(() => {
111
+ this.logger.debug(`Saved ${thisBatch.length} caches by batch`);
112
+ })
113
+ .catch((err) => {
114
+ this.logger.warn(`Failed to cache search result in batch`, { err });
115
+ });
116
+ }, 1000 * 60 * 10 + Math.round(1000 * Math.random())).unref();
117
  }
118
 
119
  override async init() {
 
538
  createdAt: nowDate,
539
  expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs)
540
  });
541
+ this.batchedCaches.push(record);
 
 
542
  } else if (lastError) {
543
  throw lastError;
544
  }
thinapps-shared CHANGED
@@ -1 +1 @@
1
- Subproject commit c48c226fbb595773cb08baee26a9fce299dc275e
 
1
+ Subproject commit a23636b2161908eefd897b6976c10a5924e2cd57