nomagick committed on
Commit
234f61d
·
unverified ·
1 Parent(s): 140a6f8

remove more attrs in readerlm preprocessing

Browse files
backend/functions/src/services/jsdom.ts CHANGED
@@ -273,6 +273,24 @@ export class JSDomControl extends AsyncService {
273
  }
274
  x.removeAttribute('style');
275
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
  const dt = Date.now() - t0;
278
  if (dt > 1000) {
 
273
  }
274
  x.removeAttribute('style');
275
  });
276
+ const treeWalker = jsdom.window.document.createTreeWalker(
277
+ jsdom.window.document, // Start from the root document
278
+ 0x80 // Only show comment nodes
279
+ );
280
+
281
+ let currentNode;
282
+ while ((currentNode = treeWalker.nextNode())) {
283
+ currentNode.parentNode?.removeChild(currentNode); // Remove each comment node
284
+ }
285
+
286
+ jsdom.window.document.querySelectorAll('*').forEach((x)=> {
287
+ const attrs = x.getAttributeNames();
288
+ for (const attr of attrs) {
289
+ if (attr.startsWith('data-') || attr.startsWith('aria-')) {
290
+ x.removeAttribute(attr);
291
+ }
292
+ }
293
+ });
294
 
295
  const dt = Date.now() - t0;
296
  if (dt > 1000) {