nomagick committed on
Commit
8b7af6d
·
unverified ·
1 Parent(s): a082185

fix: ignore match-all target selectors for performance

Browse files
backend/functions/src/dto/scrapping-options.ts CHANGED
@@ -237,6 +237,7 @@ export class CrawlerOptions extends AutoCastable {
237
  instance.targetSelector ??= targetSelector;
238
  const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', ');
239
  instance.waitForSelector ??= waitForSelector || instance.targetSelector;
 
240
  const overrideUserAgent = ctx?.req.get('x-user-agent');
241
  instance.userAgent ??= overrideUserAgent;
242
 
@@ -286,3 +287,20 @@ export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
286
  return instance;
287
  }
288
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  instance.targetSelector ??= targetSelector;
238
  const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', ');
239
  instance.waitForSelector ??= waitForSelector || instance.targetSelector;
240
+ instance.targetSelector = filterSelector(instance.targetSelector);
241
  const overrideUserAgent = ctx?.req.get('x-user-agent');
242
  instance.userAgent ??= overrideUserAgent;
243
 
 
287
  return instance;
288
  }
289
  }
290
+
291
/**
 * Removes selector entries that contain a match-all style component.
 *
 * Each entry may itself be a comma-separated selector list. An entry is
 * dropped entirely when any of its comma-separated parts, after trimming,
 * starts with `*`, starts with `:`, or contains `*:`.
 *
 * @param s - A single selector string, a list of selector strings, or nothing.
 * @returns The input unchanged when it is falsy (`undefined` or `''`);
 *   otherwise always an array (even for a single string input) holding only
 *   the entries that passed the check. The array may be empty.
 */
function filterSelector(s?: string | string[]): string | string[] | undefined {
    if (!s) {
        return s;
    }

    // Normalise to an array so single strings and lists share one code path.
    const candidates = Array.isArray(s) ? s : [s];

    return candidates.filter((entry) => {
        const parts = entry.split(',').map((part) => part.trim());
        // Reject the whole entry as soon as one part is a match-all selector.
        const hasViolation = parts.some(
            (part) => part.startsWith('*') || part.startsWith(':') || part.includes('*:')
        );

        return !hasViolation;
    });
}