devusman committed on
Commit
3203abb
·
1 Parent(s): 7a6e7ae
package-lock.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "puppeteer-api",
3
- "version": "1.0.0",
4
  "lockfileVersion": 3,
5
  "requires": true,
6
  "packages": {
 
1
  {
2
  "name": "puppeteer-api",
3
+ "version": "5.3.0",
4
  "lockfileVersion": 3,
5
  "requires": true,
6
  "packages": {
package.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "name": "puppeteer-api",
3
- "version": "1.0.0",
4
  "main": "server.js",
5
  "type": "commonjs",
6
  "dependencies": {
 
1
  {
2
  "name": "puppeteer-api",
3
+ "version": "5.3.0",
4
  "main": "server.js",
5
  "type": "commonjs",
6
  "dependencies": {
server copy.js → server copy 2.js RENAMED
@@ -41,7 +41,116 @@ class ProgressTracker extends EventEmitter {
41
  }
42
  }
43
 
44
- // --- Puppeteer Logic (Updated for Stealth and Reliability) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  const bypassCookiesAndRestrictions = async (page, progressTracker) => {
46
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
47
 
@@ -65,7 +174,7 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
65
  }
66
  }
67
 
68
- // Step 2: Inject CSS to hide cookie banners immediately (Unchanged)
69
  await page.addStyleTag({
70
  content: `
71
  /* Hide all possible cookie banners */
@@ -80,8 +189,9 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
80
  z-index: -9999 !important;
81
  pointer-events: none !important;
82
  }
83
- /* Remove blur and premium overlays */
84
- [class*="blur" i], [class*="premium" i], [class*="paywall" i], [class*="sample-preview-blur" i] {
 
85
  filter: none !important;
86
  backdrop-filter: none !important;
87
  opacity: 1 !important;
@@ -103,6 +213,10 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
103
  overflow: auto !important;
104
  position: static !important;
105
  }
 
 
 
 
106
  `
107
  });
108
 
@@ -173,6 +287,8 @@ const unblurContent = async (page, progressTracker) => {
173
 
174
  removeBySelector("#adbox, .adsbox, .ad-box, .banner-ads, .advert");
175
  removeBySelector(".PremiumBannerBlobWrapper_overflow-wrapper__xsaS8");
 
 
176
 
177
  const removeBlur = (element = document) => {
178
  element.querySelectorAll("*").forEach(el => {
@@ -195,7 +311,6 @@ const unblurContent = async (page, progressTracker) => {
195
  };
196
 
197
  removeBlur();
198
- removeBySelector('[class*="blur" i], [class*="premium" i], [class*="paywall" i]');
199
 
200
  const contentSelectors = [
201
  '.document-content', '.page-content', '.content', '[data-page]', '[data-testid*="document"]',
@@ -213,13 +328,43 @@ const unblurContent = async (page, progressTracker) => {
213
  };
214
 
215
  removeRestrictions();
216
- const intervalId = setInterval(removeRestrictions, 2000);
217
- setTimeout(() => clearInterval(intervalId), 60000);
218
  });
219
 
220
  progressTracker?.updateProgress(20, 'unblurring', 'Content restrictions removed');
221
  };
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  const applyPrintStyles = async (page, progressTracker) => {
224
  progressTracker?.updateProgress(85, 'styling', 'Applying print styles...');
225
 
@@ -243,6 +388,8 @@ const applyPrintStyles = async (page, progressTracker) => {
243
  overflow: visible !important;
244
  background: white !important;
245
  color: black !important;
 
 
246
  }
247
  /* Remove all unwanted elements like headers, footers, sidebars, etc. */
248
  header, footer, nav, aside, .no-print, .ads, .sidebar, .premium-banner,
@@ -271,7 +418,7 @@ const applyPrintStyles = async (page, progressTracker) => {
271
  display: block !important;
272
  width: 100% !important;
273
  max-width: none !important;
274
- margin: 0 !important;
275
  padding: 0 !important;
276
  box-sizing: border-box; /* Include padding in width calculation */
277
  transform: none !important;
@@ -322,7 +469,18 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
322
  '--disable-features=site-per-process',
323
  '--disable-blink-features=AutomationControlled',
324
  '--disable-extensions',
325
- '--ignore-certificate-errors'
 
 
 
 
 
 
 
 
 
 
 
326
  ],
327
  ignoreHTTPSErrors: true,
328
  timeout: 300000,
@@ -332,20 +490,32 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
332
 
333
  progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
334
 
335
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36');
336
- await page.setViewport({ width: 794, height: 1122 }); // A4 size in pixels at 96 DPI
337
 
338
  // NOTE: Stealth plugin handles most of this, but keeping for extra safety
339
  await page.evaluateOnNewDocument(() => {
340
  Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
341
  Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
342
  Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
 
 
 
 
 
 
 
 
 
 
 
 
343
  });
344
 
345
  // Set up cookie and content bypass
346
  await bypassCookiesAndRestrictions(page, progressTracker);
347
 
348
- // Block unnecessary resources (UPDATED: Always continue for 'document' to prevent navigation failures)
349
  await page.setRequestInterception(true);
350
  page.on('request', (req) => {
351
  const resourceType = req.resourceType();
@@ -356,7 +526,16 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
356
  return;
357
  }
358
 
 
 
 
 
 
 
359
  if (
 
 
 
360
  reqUrl.includes('doubleclick') ||
361
  reqUrl.includes('googletagmanager') ||
362
  reqUrl.includes('facebook.com') ||
@@ -367,7 +546,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
367
  reqUrl.includes('mixpanel') ||
368
  reqUrl.includes('onetrust') ||
369
  reqUrl.includes('cookielaw') ||
370
- (resourceType === 'other' && reqUrl.includes('/track/')) // UPDATED: More specific to avoid over-blocking
371
  ) {
372
  req.abort();
373
  } else {
@@ -381,6 +560,8 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
381
 
382
  console.log("πŸ”‘ Logging in to StuDocu...");
383
  await page.goto('https://www.studocu.com/en-us/login', { waitUntil: 'domcontentloaded', timeout: 120000 });
 
 
384
  await page.waitForSelector('#email', { timeout: 15000 });
385
  await page.type('#email', options.email);
386
  await page.type('#password', options.password);
@@ -396,33 +577,32 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
396
  }
397
  }
398
 
399
- progressTracker?.updateProgress(25, 'navigating', 'Navigating to homepage first for session setup...');
400
- console.log(`πŸ“„ Navigating to homepage to simulate natural session...`);
401
- await page.goto('https://www.studocu.com/en-us', { waitUntil: 'domcontentloaded', timeout: 150000 }); // NEW: Preliminary homepage visit
402
- await new Promise(resolve => setTimeout(resolve, 3000)); // Short delay for session stabilization
403
-
404
  progressTracker?.updateProgress(30, 'navigating', 'Navigating to document...');
405
  console.log(`πŸ“„ Navigating to ${url}...`);
406
 
407
  let navigationSuccess = false;
408
  let attempts = 0;
409
- const maxAttempts = 5;
410
  while (!navigationSuccess && attempts < maxAttempts) {
411
  try {
412
  attempts++;
413
  progressTracker?.updateProgress(30 + (attempts * 5), 'navigating', `Navigation attempt ${attempts}/${maxAttempts}`);
414
  console.log(`Navigation attempt ${attempts}/${maxAttempts}`);
415
- await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 150000 }); // UPDATED: Increased timeout to 2.5 min
416
  navigationSuccess = true;
417
  } catch (e) {
418
  console.log(`Navigation attempt ${attempts} failed:`, e.message);
419
  if (attempts >= maxAttempts) throw e;
420
- await new Promise(resolve => setTimeout(resolve, 15000)); // UPDATED: Increased retry delay to 15s
421
  }
422
  }
423
 
 
 
 
424
  progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
425
- await new Promise(resolve => setTimeout(resolve, 5000));
426
 
427
  // Apply content unblurring
428
  await unblurContent(page, progressTracker);
@@ -438,7 +618,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
438
  let contentFound = false;
439
  for (const selector of contentSelectors) {
440
  try {
441
- await page.waitForSelector(selector, { timeout: 20000 });
442
  console.log(`βœ… Found content with selector: ${selector}`);
443
  contentFound = true;
444
  break;
@@ -451,7 +631,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
451
  console.log("⚠️ No specific content selector found, proceeding with page content...");
452
  }
453
 
454
- // Enhanced scrolling to load all content
455
  progressTracker?.updateProgress(50, 'scrolling', 'Loading all document pages...');
456
  console.log("πŸ“œ Loading all document pages with enhanced slow scroll...");
457
 
@@ -460,27 +640,28 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
460
  let scrollHeight = document.body.scrollHeight;
461
  while (true) {
462
  let totalHeight = 0;
463
- const distance = 300;
464
  while (totalHeight < scrollHeight) {
465
  window.scrollBy(0, distance);
466
  totalHeight += distance;
467
- await delay(500);
468
  }
469
- await delay(2000);
470
  const newHeight = document.body.scrollHeight;
471
  if (newHeight === scrollHeight) break;
472
  scrollHeight = newHeight;
473
  }
474
  window.scrollTo({ top: 0, behavior: "smooth" });
475
- await delay(1000);
476
  });
477
 
478
- progressTracker?.updateProgress(70, 'processing', 'Processing loaded content...');
479
-
480
  // Re-apply unblur after loading new content
481
  await unblurContent(page, progressTracker);
482
 
483
- // Wait for all images to load
 
 
 
484
  progressTracker?.updateProgress(75, 'loading_images', 'Loading images...');
485
  console.log("πŸ–ΌοΈ Waiting for all images to load...");
486
 
@@ -491,12 +672,12 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
491
  return new Promise((resolve) => {
492
  img.addEventListener('load', resolve);
493
  img.addEventListener('error', resolve);
494
- setTimeout(resolve, 15000);
495
  });
496
  }));
497
  });
498
 
499
- await new Promise(resolve => setTimeout(resolve, 5000));
500
  progressTracker?.updateProgress(80, 'finalizing', 'Preparing document for PDF generation...');
501
 
502
  // Set exact height
@@ -511,7 +692,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
511
  document.body.style.overflow = 'hidden !important';
512
  });
513
 
514
- // Content verification
515
  const contentCheck = await page.evaluate(() => {
516
  const textContent = document.body.textContent || '';
517
  const images = document.querySelectorAll('img');
@@ -549,7 +730,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
549
  printBackground: true,
550
  preferCSSPageSize: true, // Use the @page size
551
  displayHeaderFooter: false,
552
- timeout: 180000,
553
  scale: 1,
554
  omitBackground: false
555
  });
@@ -674,15 +855,17 @@ app.get('/health', (req, res) => {
674
 
675
  app.get('/', (req, res) => {
676
  res.json({
677
- message: 'πŸš€ Enhanced StuDocu Downloader API v5.2 - Real-time Progress Tracking with Stealth',
678
- version: '5.2.0',
679
  features: [
680
  'πŸͺ Advanced cookie banner bypass',
681
  'πŸ”“ Premium content unblurring',
682
  'πŸ”‘ Login support for full access',
683
  'πŸ“Š Real-time progress tracking via polling',
684
  'πŸ“„ Clean PDF generation with print styles',
685
- 'πŸ•΅οΈ Enhanced stealth to evade bot detection'
 
 
686
  ],
687
  endpoints: {
688
  request: 'POST /api/request-download (body: {url, filename?, email?, password?})',
@@ -704,6 +887,6 @@ process.on('SIGINT', () => {
704
  });
705
 
706
  app.listen(port, () => {
707
- console.log(`πŸš€ Enhanced StuDocu Downloader v5.2.0 running on http://localhost:${port}`);
708
- console.log(`✨ Features: Real-time progress tracking, enhanced stealth, and user feedback`);
709
  });
 
41
  }
42
  }
43
 
44
+ // --- Enhanced Human Behavior Simulation ---
45
/**
 * Simulates human-like activity on the page: five randomized mouse moves with
 * a pause after each, followed by a small random scroll and a final pause.
 *
 * @param {object} page - Puppeteer page; uses `viewport()`, `mouse.move()` and `evaluate()`.
 * @param {object} [progressTracker] - Optional tracker; its `progress` value is read and
 *   `updateProgress()` is called once at the end.
 * @returns {Promise<void>}
 */
const simulateHumanBehavior = async (page, progressTracker) => {
  // page.waitForTimeout() is not available in current Puppeteer releases,
  // so pauses use a plain timer promise (the same pattern used elsewhere in this file).
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  console.log("🧑 Simulating human-like mouse movements and delays...");

  // viewport() returns null when no viewport was configured; fall back to 1920x1080.
  const viewport = page.viewport() ?? {};
  for (let i = 0; i < 5; i++) {
    const x = Math.random() * (viewport.width || 1920);
    const y = Math.random() * (viewport.height || 1080);
    await page.mouse.move(x, y, { steps: 10 });
    await pause(Math.random() * 1000 + 500);
  }

  // Random scroll a bit (between -100px and +100px)
  await page.evaluate(() => {
    window.scrollBy(0, Math.random() * 200 - 100);
  });
  await pause(Math.random() * 2000 + 1000);

  if (progressTracker) {
    // NOTE(review): assumes `progress` is the tracker's current numeric percentage — confirm
    // against ProgressTracker. A missing/non-numeric value is treated as 0 so NaN is never reported.
    const current = Number.isFinite(progressTracker.progress) ? progressTracker.progress : 0;
    progressTracker.updateProgress(current + 1, 'humanizing', 'Human behavior simulated');
  }
};
61
+
62
+ // --- Enhanced Cloudflare Bypass Function ---
63
/**
 * Detects a Cloudflare interstitial on the current page and waits for it to clear.
 *
 * Flow: wait up to 5 s for any known challenge marker; if one appears, run
 * `simulateHumanBehavior`, wait up to 90 s for the markers to leave the DOM, and on
 * failure click the Turnstile widget (if present) and wait up to 60 s more.
 *
 * @param {object} page - Puppeteer page.
 * @param {object} [progressTracker] - Optional tracker; `updateProgress()` is called with 35
 *   at the start and 38 once a detected challenge has cleared.
 * @returns {Promise<void>}
 * @throws {Error} If the markers are still present after the Turnstile fallback; the
 *   underlying failure is attached as `cause`.
 */
const handleCloudflareChallenge = async (page, progressTracker) => {
  progressTracker?.updateProgress(35, 'cloudflare', 'Detecting and bypassing Cloudflare...');

  // page.waitForTimeout() is not available in current Puppeteer releases; use a timer promise.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const randomDelay = (min, max) => Math.floor(Math.random() * (max - min + 1) + min);

  console.log("☁️ Checking for Cloudflare challenge...");
  const cloudflareSelectors = [
    '#challenge-running',
    '.cf-browser-verification',
    '[data-ray]',
    '#cf-challenge-running',
    '.under-attack',
    'iframe[src*="cloudflare"]',
    '#challenge-form', // JS challenge
    '.cf-turnstile' // Turnstile CAPTCHA
  ];
  // The "cleared" check deliberately ignores the iframe selector, as the previous inline lists did.
  const clearanceSelectors = cloudflareSelectors.filter((sel) => !sel.startsWith('iframe'));
  const combinedSelector = cloudflareSelectors.join(', ');

  // Resolves once none of the challenge markers remain in the document.
  const waitUntilCleared = (timeout) => page.waitForFunction(
    (selectors) => !selectors.some((sel) => document.querySelector(sel)),
    { timeout },
    clearanceSelectors
  );

  // One wait on the combined selector list: a page without a challenge costs 5 s in total
  // rather than 5 s per selector.
  let challengeDetected = false;
  let marker = null;
  try {
    marker = await page.waitForSelector(combinedSelector, { timeout: 5000 });
    challengeDetected = true;
  } catch {
    // Nothing matched within the detection window.
  }

  if (!challengeDetected) {
    console.log("✅ No Cloudflare challenge detected.");
    return;
  }

  // Best effort: work out which selector matched, purely for the log line.
  let matchedSelector = combinedSelector;
  try {
    const found = await marker.evaluate(
      (el, selectors) => selectors.find((sel) => el.matches(sel)),
      cloudflareSelectors
    );
    matchedSelector = found ?? matchedSelector;
    await marker.dispose();
  } catch {
    // The element may already be gone (challenge resolved on its own); keep the combined selector.
  }
  console.log(`☁️ Cloudflare challenge detected with selector: ${matchedSelector}`);

  // Simulate human behavior before attempting to solve
  await simulateHumanBehavior(page, progressTracker);

  // Wait for the challenge to resolve (JS execution)
  console.log("⏳ Waiting for Cloudflare challenge to complete...");
  try {
    await waitUntilCleared(90000);
  } catch {
    console.log("⚠️ Standard wait failed, attempting Turnstile click...");
    // Fallback: click the Turnstile widget if present, then wait again.
    try {
      const cfInput = await page.$('[name="cf-turnstile-response"]');
      if (cfInput) {
        const parentItem = await cfInput.evaluateHandle((element) => element.parentElement);
        const coordinates = await parentItem.boundingBox();
        if (coordinates) {
          // Click 25px in from the left edge, vertically centered, as the original code did.
          await page.mouse.click(coordinates.x + 25, coordinates.y + coordinates.height / 2);
          console.log("🖱️ Clicked on Turnstile CAPTCHA");
          await pause(3000);
        }
      }
      await waitUntilCleared(60000);
    } catch (clickError) {
      console.error("❌ Turnstile click failed:", clickError.message);
      throw new Error(
        "Failed to bypass Cloudflare challenge. Try again later or use a proxy.",
        { cause: clickError }
      );
    }
  }

  // Additional wait for page to stabilize post-challenge with random delay
  await pause(randomDelay(3000, 7000));
  console.log("✅ Cloudflare challenge bypassed successfully.");
  progressTracker?.updateProgress(38, 'cloudflare', 'Cloudflare bypassed');
};
152
+
153
+ // --- Puppeteer Logic (Updated for Enhanced Cloudflare Bypass) ---
154
  const bypassCookiesAndRestrictions = async (page, progressTracker) => {
155
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
156
 
 
174
  }
175
  }
176
 
177
+ // Step 2: Inject CSS to hide cookie banners immediately (Updated: Added more selectors for previews and blurred overlays)
178
  await page.addStyleTag({
179
  content: `
180
  /* Hide all possible cookie banners */
 
189
  z-index: -9999 !important;
190
  pointer-events: none !important;
191
  }
192
+ /* Remove blur and premium overlays, including previews */
193
+ [class*="blur" i], [class*="premium" i], [class*="paywall" i], [class*="sample-preview-blur" i], [class*="preview" i], [class*="blurred-container" i], [class*="blurred" i] {
194
+ display: none !important;
195
  filter: none !important;
196
  backdrop-filter: none !important;
197
  opacity: 1 !important;
 
213
  overflow: auto !important;
214
  position: static !important;
215
  }
216
+ /* Hide Cloudflare elements if they persist */
217
+ #challenge-running, .cf-browser-verification, [data-ray], .under-attack {
218
+ display: none !important;
219
+ }
220
  `
221
  });
222
 
 
287
 
288
  removeBySelector("#adbox, .adsbox, .ad-box, .banner-ads, .advert");
289
  removeBySelector(".PremiumBannerBlobWrapper_overflow-wrapper__xsaS8");
290
+ // Added: Remove preview and blurred overlays
291
+ removeBySelector('[class*="preview" i], [class*="blurred-container" i], [class*="blurred" i]:not(img)');
292
 
293
  const removeBlur = (element = document) => {
294
  element.querySelectorAll("*").forEach(el => {
 
311
  };
312
 
313
  removeBlur();
 
314
 
315
  const contentSelectors = [
316
  '.document-content', '.page-content', '.content', '[data-page]', '[data-testid*="document"]',
 
328
  };
329
 
330
  removeRestrictions();
331
+ const intervalId = setInterval(removeRestrictions, 1000);
332
+ setTimeout(() => clearInterval(intervalId), 30000);
333
  });
334
 
335
  progressTracker?.updateProgress(20, 'unblurring', 'Content restrictions removed');
336
  };
337
 
338
+ // New function to fetch clear images by modifying blurred URLs
339
/**
 * Rewrites blurred image URLs in the page so clear versions are requested, then
 * waits for the page's images to finish loading.
 *
 * Every `<img>` whose `src` contains `/blurred/` has that path segment removed.
 *
 * @param {object} page - Puppeteer page; only `evaluate()` is used.
 * @param {object} [progressTracker] - Optional tracker; `updateProgress()` is called with 65 and 70.
 * @returns {Promise<number>} Number of images whose `src` was rewritten.
 */
const fetchClearImages = async (page, progressTracker) => {
  progressTracker?.updateProgress(65, 'unblurring_images', 'Fetching clear page images...');

  console.log("🖼️ Modifying blurred image URLs to fetch clear versions...");
  const rewrittenCount = await page.evaluate(() => {
    const images = document.querySelectorAll('img[src*="/blurred/"]');
    images.forEach(img => {
      img.src = img.src.replace(/\/blurred\//, '/');
      console.log(`Modified image src: ${img.src}`);
    });
    return images.length;
  });

  // Wait for the (possibly modified) images to load; each image gives up after 10 s.
  await page.evaluate(async () => {
    const images = Array.from(document.querySelectorAll('img'));
    await Promise.all(images.map(img => {
      if (img.complete) return Promise.resolve();
      return new Promise((resolve) => {
        // Whichever of load / error / timeout fires first wins; the rest are cleaned up
        // so no stray listeners or timers linger in the page.
        const settle = () => {
          clearTimeout(timer);
          img.removeEventListener('load', settle);
          img.removeEventListener('error', settle);
          resolve();
        };
        const timer = setTimeout(settle, 10000);
        img.addEventListener('load', settle);
        img.addEventListener('error', settle);
      });
    }));
  });

  // The extra stabilization delay only matters when something was actually re-requested.
  if (rewrittenCount > 0) {
    await new Promise(resolve => setTimeout(resolve, 3000));
  }
  progressTracker?.updateProgress(70, 'unblurring_images', 'Clear images loaded');
  return rewrittenCount;
};
367
+
368
  const applyPrintStyles = async (page, progressTracker) => {
369
  progressTracker?.updateProgress(85, 'styling', 'Applying print styles...');
370
 
 
388
  overflow: visible !important;
389
  background: white !important;
390
  color: black !important;
391
+ display: flex;
392
+ justify-content: center;
393
  }
394
  /* Remove all unwanted elements like headers, footers, sidebars, etc. */
395
  header, footer, nav, aside, .no-print, .ads, .sidebar, .premium-banner,
 
418
  display: block !important;
419
  width: 100% !important;
420
  max-width: none !important;
421
+ margin: 0 auto !important; /* Center horizontally */
422
  padding: 0 !important;
423
  box-sizing: border-box; /* Include padding in width calculation */
424
  transform: none !important;
 
469
  '--disable-features=site-per-process',
470
  '--disable-blink-features=AutomationControlled',
471
  '--disable-extensions',
472
+ '--ignore-certificate-errors',
473
+ // NEW: Additional args for better Cloudflare evasion
474
+ '--disable-features=TranslateUI',
475
+ '--disable-ipc-flooding',
476
+ '--disable-backgrounding-occluded-windows',
477
+ '--disable-renderer-backgrounding',
478
+ '--disable-features=TranslateUI,BlinkGenPropertyTrees',
479
+ '--metrics-recording-only',
480
+ '--no-default-browser-check',
481
+ '--safebrowsing-disable-auto-update',
482
+ '--password-store=basic',
483
+ '--use-mock-keychain'
484
  ],
485
  ignoreHTTPSErrors: true,
486
  timeout: 300000,
 
490
 
491
  progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
492
 
493
+ await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
494
+ await page.setViewport({ width: 1920, height: 1080 }); // NEW: Use full HD for more realistic viewport, adjust back if needed for A4
495
 
496
  // NOTE: Stealth plugin handles most of this, but keeping for extra safety
497
  await page.evaluateOnNewDocument(() => {
498
  Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
499
  Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
500
  Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
501
+ // NEW: Additional stealth evasions
502
+ Object.defineProperty(navigator, 'permissions', {
503
+ get: () => ({
504
+ query: () => Promise.resolve({ state: 'granted' })
505
+ })
506
+ });
507
+ window.chrome = {
508
+ runtime: {},
509
+ loadTimes: function () { },
510
+ csi: function () { },
511
+ app: {}
512
+ };
513
  });
514
 
515
  // Set up cookie and content bypass
516
  await bypassCookiesAndRestrictions(page, progressTracker);
517
 
518
+ // Block unnecessary resources (UPDATED: Loosened for Cloudflare - allow cloudflare.com requests)
519
  await page.setRequestInterception(true);
520
  page.on('request', (req) => {
521
  const resourceType = req.resourceType();
 
526
  return;
527
  }
528
 
529
+ // NEW: Always allow Cloudflare-related requests
530
+ if (reqUrl.includes('cloudflare') || reqUrl.includes('cf-')) {
531
+ req.continue();
532
+ return;
533
+ }
534
+
535
  if (
536
+ ['image', 'media', 'font', 'stylesheet'].includes(resourceType) && // Block non-essential images/media/fonts/styles early if not core
537
+ !reqUrl.includes('document') && !reqUrl.includes('page') && !reqUrl.includes('studocu') || // Allow core document images
538
+ resourceType === 'script' && !reqUrl.includes('studocu') && !reqUrl.includes('cloudflare') || // Block third-party scripts except Cloudflare
539
  reqUrl.includes('doubleclick') ||
540
  reqUrl.includes('googletagmanager') ||
541
  reqUrl.includes('facebook.com') ||
 
546
  reqUrl.includes('mixpanel') ||
547
  reqUrl.includes('onetrust') ||
548
  reqUrl.includes('cookielaw') ||
549
+ (resourceType === 'other' && reqUrl.includes('/track/'))
550
  ) {
551
  req.abort();
552
  } else {
 
560
 
561
  console.log("πŸ”‘ Logging in to StuDocu...");
562
  await page.goto('https://www.studocu.com/en-us/login', { waitUntil: 'domcontentloaded', timeout: 120000 });
563
+ // NEW: Handle potential Cloudflare on login page
564
+ await handleCloudflareChallenge(page, progressTracker);
565
  await page.waitForSelector('#email', { timeout: 15000 });
566
  await page.type('#email', options.email);
567
  await page.type('#password', options.password);
 
577
  }
578
  }
579
 
580
+ // Removed homepage visit as it's not strictly necessary for session setup; directly navigate to URL
 
 
 
 
581
  progressTracker?.updateProgress(30, 'navigating', 'Navigating to document...');
582
  console.log(`πŸ“„ Navigating to ${url}...`);
583
 
584
  let navigationSuccess = false;
585
  let attempts = 0;
586
+ const maxAttempts = 3; // Reduced from 5 to minimize retries
587
  while (!navigationSuccess && attempts < maxAttempts) {
588
  try {
589
  attempts++;
590
  progressTracker?.updateProgress(30 + (attempts * 5), 'navigating', `Navigation attempt ${attempts}/${maxAttempts}`);
591
  console.log(`Navigation attempt ${attempts}/${maxAttempts}`);
592
+ await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 120000 }); // Increased from 60000
593
  navigationSuccess = true;
594
  } catch (e) {
595
  console.log(`Navigation attempt ${attempts} failed:`, e.message);
596
  if (attempts >= maxAttempts) throw e;
597
+ await new Promise(resolve => setTimeout(resolve, 10000)); // Increased retry delay to 10s for stability
598
  }
599
  }
600
 
601
+ // NEW: Handle Cloudflare after navigation
602
+ await handleCloudflareChallenge(page, progressTracker);
603
+
604
  progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
605
+ await new Promise(resolve => setTimeout(resolve, 5000)); // Increased from 2000ms for better loading
606
 
607
  // Apply content unblurring
608
  await unblurContent(page, progressTracker);
 
618
  let contentFound = false;
619
  for (const selector of contentSelectors) {
620
  try {
621
+ await page.waitForSelector(selector, { timeout: 20000 }); // Increased from 10000
622
  console.log(`βœ… Found content with selector: ${selector}`);
623
  contentFound = true;
624
  break;
 
631
  console.log("⚠️ No specific content selector found, proceeding with page content...");
632
  }
633
 
634
+ // Enhanced scrolling to load all content (Optimized: Increased scroll distance, reduced delays)
635
  progressTracker?.updateProgress(50, 'scrolling', 'Loading all document pages...');
636
  console.log("πŸ“œ Loading all document pages with enhanced slow scroll...");
637
 
 
640
  let scrollHeight = document.body.scrollHeight;
641
  while (true) {
642
  let totalHeight = 0;
643
+ const distance = 600; // Increased from 300 for faster coverage
644
  while (totalHeight < scrollHeight) {
645
  window.scrollBy(0, distance);
646
  totalHeight += distance;
647
+ await delay(300); // Increased from 200ms for large docs stability
648
  }
649
+ await delay(2000); // Increased from 1000ms
650
  const newHeight = document.body.scrollHeight;
651
  if (newHeight === scrollHeight) break;
652
  scrollHeight = newHeight;
653
  }
654
  window.scrollTo({ top: 0, behavior: "smooth" });
655
+ await delay(1000); // Increased from 500ms
656
  });
657
 
 
 
658
  // Re-apply unblur after loading new content
659
  await unblurContent(page, progressTracker);
660
 
661
+ // New: Fetch clear images for blurred pages
662
+ await fetchClearImages(page, progressTracker);
663
+
664
+ // Wait for all images to load (Optimized: Reduced per-image timeout, parallel wait)
665
  progressTracker?.updateProgress(75, 'loading_images', 'Loading images...');
666
  console.log("πŸ–ΌοΈ Waiting for all images to load...");
667
 
 
672
  return new Promise((resolve) => {
673
  img.addEventListener('load', resolve);
674
  img.addEventListener('error', resolve);
675
+ setTimeout(resolve, 10000); // Increased from 5000ms for large docs
676
  });
677
  }));
678
  });
679
 
680
+ await new Promise(resolve => setTimeout(resolve, 5000)); // Increased from 2000ms
681
  progressTracker?.updateProgress(80, 'finalizing', 'Preparing document for PDF generation...');
682
 
683
  // Set exact height
 
692
  document.body.style.overflow = 'hidden !important';
693
  });
694
 
695
+ // Content verification (Unchanged, as it's quick)
696
  const contentCheck = await page.evaluate(() => {
697
  const textContent = document.body.textContent || '';
698
  const images = document.querySelectorAll('img');
 
730
  printBackground: true,
731
  preferCSSPageSize: true, // Use the @page size
732
  displayHeaderFooter: false,
733
+ timeout: 180000, // Increased back to 180000 for large PDFs
734
  scale: 1,
735
  omitBackground: false
736
  });
 
855
 
856
  app.get('/', (req, res) => {
857
  res.json({
858
+ message: 'πŸš€ Enhanced StuDocu Downloader API v5.3 - Real-time Progress Tracking with Cloudflare Bypass',
859
+ version: '5.3.0',
860
  features: [
861
  'πŸͺ Advanced cookie banner bypass',
862
  'πŸ”“ Premium content unblurring',
863
  'πŸ”‘ Login support for full access',
864
  'πŸ“Š Real-time progress tracking via polling',
865
  'πŸ“„ Clean PDF generation with print styles',
866
+ 'πŸ•΅οΈ Enhanced stealth to evade bot detection',
867
+ '☁️ Automatic Cloudflare challenge handling',
868
+ 'πŸ§‘ Human-like behavior simulation'
869
  ],
870
  endpoints: {
871
  request: 'POST /api/request-download (body: {url, filename?, email?, password?})',
 
887
  });
888
 
889
  app.listen(port, () => {
890
+ console.log(`πŸš€ Enhanced StuDocu Downloader v5.3.0 running on http://localhost:${port}`);
891
+ console.log(`✨ Features: Real-time progress tracking, enhanced stealth, Cloudflare bypass, and user feedback`);
892
  });
server.js CHANGED
@@ -41,7 +41,116 @@ class ProgressTracker extends EventEmitter {
41
  }
42
  }
43
 
44
- // --- Puppeteer Logic (Updated for Stealth and Reliability) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  const bypassCookiesAndRestrictions = async (page, progressTracker) => {
46
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
47
 
@@ -65,7 +174,7 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
65
  }
66
  }
67
 
68
- // Step 2: Inject CSS to hide cookie banners immediately (Unchanged)
69
  await page.addStyleTag({
70
  content: `
71
  /* Hide all possible cookie banners */
@@ -80,8 +189,9 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
80
  z-index: -9999 !important;
81
  pointer-events: none !important;
82
  }
83
- /* Remove blur and premium overlays */
84
- [class*="blur" i], [class*="premium" i], [class*="paywall" i], [class*="sample-preview-blur" i] {
 
85
  filter: none !important;
86
  backdrop-filter: none !important;
87
  opacity: 1 !important;
@@ -103,6 +213,10 @@ const bypassCookiesAndRestrictions = async (page, progressTracker) => {
103
  overflow: auto !important;
104
  position: static !important;
105
  }
 
 
 
 
106
  `
107
  });
108
 
@@ -173,6 +287,8 @@ const unblurContent = async (page, progressTracker) => {
173
 
174
  removeBySelector("#adbox, .adsbox, .ad-box, .banner-ads, .advert");
175
  removeBySelector(".PremiumBannerBlobWrapper_overflow-wrapper__xsaS8");
 
 
176
 
177
  const removeBlur = (element = document) => {
178
  element.querySelectorAll("*").forEach(el => {
@@ -195,7 +311,6 @@ const unblurContent = async (page, progressTracker) => {
195
  };
196
 
197
  removeBlur();
198
- removeBySelector('[class*="blur" i], [class*="premium" i], [class*="paywall" i]');
199
 
200
  const contentSelectors = [
201
  '.document-content', '.page-content', '.content', '[data-page]', '[data-testid*="document"]',
@@ -213,13 +328,43 @@ const unblurContent = async (page, progressTracker) => {
213
  };
214
 
215
  removeRestrictions();
216
- const intervalId = setInterval(removeRestrictions, 1000); // Reduced from 2000ms to 1000ms
217
- setTimeout(() => clearInterval(intervalId), 30000); // Reduced from 60000ms to 30000ms
218
  });
219
 
220
  progressTracker?.updateProgress(20, 'unblurring', 'Content restrictions removed');
221
  };
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  const applyPrintStyles = async (page, progressTracker) => {
224
  progressTracker?.updateProgress(85, 'styling', 'Applying print styles...');
225
 
@@ -243,6 +388,8 @@ const applyPrintStyles = async (page, progressTracker) => {
243
  overflow: visible !important;
244
  background: white !important;
245
  color: black !important;
 
 
246
  }
247
  /* Remove all unwanted elements like headers, footers, sidebars, etc. */
248
  header, footer, nav, aside, .no-print, .ads, .sidebar, .premium-banner,
@@ -271,7 +418,7 @@ const applyPrintStyles = async (page, progressTracker) => {
271
  display: block !important;
272
  width: 100% !important;
273
  max-width: none !important;
274
- margin: 0 !important;
275
  padding: 0 !important;
276
  box-sizing: border-box; /* Include padding in width calculation */
277
  transform: none !important;
@@ -322,7 +469,18 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
322
  '--disable-features=site-per-process',
323
  '--disable-blink-features=AutomationControlled',
324
  '--disable-extensions',
325
- '--ignore-certificate-errors'
 
 
 
 
 
 
 
 
 
 
 
326
  ],
327
  ignoreHTTPSErrors: true,
328
  timeout: 300000,
@@ -332,20 +490,32 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
332
 
333
  progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
334
 
335
- await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36');
336
- await page.setViewport({ width: 794, height: 1122 }); // A4 size in pixels at 96 DPI
337
 
338
  // NOTE: Stealth plugin handles most of this, but keeping for extra safety
339
  await page.evaluateOnNewDocument(() => {
340
  Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
341
  Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
342
  Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
 
 
 
 
 
 
 
 
 
 
 
 
343
  });
344
 
345
  // Set up cookie and content bypass
346
  await bypassCookiesAndRestrictions(page, progressTracker);
347
 
348
- // Block unnecessary resources (UPDATED: Block more aggressively, including scripts, fonts, and stylesheets if not critical)
349
  await page.setRequestInterception(true);
350
  page.on('request', (req) => {
351
  const resourceType = req.resourceType();
@@ -356,10 +526,16 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
356
  return;
357
  }
358
 
 
 
 
 
 
 
359
  if (
360
  ['image', 'media', 'font', 'stylesheet'].includes(resourceType) && // Block non-essential images/media/fonts/styles early if not core
361
  !reqUrl.includes('document') && !reqUrl.includes('page') && !reqUrl.includes('studocu') || // Allow core document images
362
- resourceType === 'script' && !reqUrl.includes('studocu') || // Block third-party scripts
363
  reqUrl.includes('doubleclick') ||
364
  reqUrl.includes('googletagmanager') ||
365
  reqUrl.includes('facebook.com') ||
@@ -383,14 +559,16 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
383
  progressTracker?.updateProgress(12, 'authenticating', 'Logging into StuDocu...');
384
 
385
  console.log("πŸ”‘ Logging in to StuDocu...");
386
- await page.goto('https://www.studocu.com/en-us/login', { waitUntil: 'domcontentloaded', timeout: 60000 }); // Reduced timeout from 120000
387
- await page.waitForSelector('#email', { timeout: 10000 }); // Reduced from 15000
 
 
388
  await page.type('#email', options.email);
389
  await page.type('#password', options.password);
390
  await page.click('button[type="submit"]');
391
  try {
392
- await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 15000 }); // Reduced from 30000
393
- await page.waitForSelector('.user-profile, [data-testid="user-menu"]', { timeout: 5000 }); // Reduced from 10000
394
  console.log("βœ… Login successful.");
395
  progressTracker?.updateProgress(18, 'authenticated', 'Login successful');
396
  } catch (e) {
@@ -411,17 +589,20 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
411
  attempts++;
412
  progressTracker?.updateProgress(30 + (attempts * 5), 'navigating', `Navigation attempt ${attempts}/${maxAttempts}`);
413
  console.log(`Navigation attempt ${attempts}/${maxAttempts}`);
414
- await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 }); // Reduced timeout from 150000
415
  navigationSuccess = true;
416
  } catch (e) {
417
  console.log(`Navigation attempt ${attempts} failed:`, e.message);
418
  if (attempts >= maxAttempts) throw e;
419
- await new Promise(resolve => setTimeout(resolve, 5000)); // Reduced retry delay from 15000 to 5000ms
420
  }
421
  }
422
 
 
 
 
423
  progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
424
- await new Promise(resolve => setTimeout(resolve, 2000)); // Reduced from 5000ms
425
 
426
  // Apply content unblurring
427
  await unblurContent(page, progressTracker);
@@ -437,7 +618,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
437
  let contentFound = false;
438
  for (const selector of contentSelectors) {
439
  try {
440
- await page.waitForSelector(selector, { timeout: 10000 }); // Reduced from 20000
441
  console.log(`βœ… Found content with selector: ${selector}`);
442
  contentFound = true;
443
  break;
@@ -463,22 +644,23 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
463
  while (totalHeight < scrollHeight) {
464
  window.scrollBy(0, distance);
465
  totalHeight += distance;
466
- await delay(200); // Reduced from 500ms
467
  }
468
- await delay(1000); // Reduced from 2000ms
469
  const newHeight = document.body.scrollHeight;
470
  if (newHeight === scrollHeight) break;
471
  scrollHeight = newHeight;
472
  }
473
  window.scrollTo({ top: 0, behavior: "smooth" });
474
- await delay(500); // Reduced from 1000ms
475
  });
476
 
477
- progressTracker?.updateProgress(70, 'processing', 'Processing loaded content...');
478
-
479
  // Re-apply unblur after loading new content
480
  await unblurContent(page, progressTracker);
481
 
 
 
 
482
  // Wait for all images to load (Optimized: Reduced per-image timeout, parallel wait)
483
  progressTracker?.updateProgress(75, 'loading_images', 'Loading images...');
484
  console.log("πŸ–ΌοΈ Waiting for all images to load...");
@@ -490,12 +672,12 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
490
  return new Promise((resolve) => {
491
  img.addEventListener('load', resolve);
492
  img.addEventListener('error', resolve);
493
- setTimeout(resolve, 5000); // Reduced from 15000ms
494
  });
495
  }));
496
  });
497
 
498
- await new Promise(resolve => setTimeout(resolve, 2000)); // Reduced from 5000ms
499
  progressTracker?.updateProgress(80, 'finalizing', 'Preparing document for PDF generation...');
500
 
501
  // Set exact height
@@ -548,7 +730,7 @@ const studocuDownloader = async (url, options = {}, progressTracker = null) => {
548
  printBackground: true,
549
  preferCSSPageSize: true, // Use the @page size
550
  displayHeaderFooter: false,
551
- timeout: 60000, // Reduced from 180000
552
  scale: 1,
553
  omitBackground: false
554
  });
@@ -673,15 +855,17 @@ app.get('/health', (req, res) => {
673
 
674
  app.get('/', (req, res) => {
675
  res.json({
676
- message: 'πŸš€ Enhanced StuDocu Downloader API v5.2 - Real-time Progress Tracking with Stealth',
677
- version: '5.2.0',
678
  features: [
679
  'πŸͺ Advanced cookie banner bypass',
680
  'πŸ”“ Premium content unblurring',
681
  'πŸ”‘ Login support for full access',
682
  'πŸ“Š Real-time progress tracking via polling',
683
  'πŸ“„ Clean PDF generation with print styles',
684
- 'πŸ•΅οΈ Enhanced stealth to evade bot detection'
 
 
685
  ],
686
  endpoints: {
687
  request: 'POST /api/request-download (body: {url, filename?, email?, password?})',
@@ -703,6 +887,6 @@ process.on('SIGINT', () => {
703
  });
704
 
705
  app.listen(port, () => {
706
- console.log(`πŸš€ Enhanced StuDocu Downloader v5.2.0 running on http://localhost:${port}`);
707
- console.log(`✨ Features: Real-time progress tracking, enhanced stealth, and user feedback`);
708
  });
 
41
  }
42
  }
43
 
44
+ // --- Enhanced Human Behavior Simulation ---
45
/**
 * Performs a short burst of human-looking activity on the page: five mouse
 * moves to random viewport coordinates with a random pause after each, then a
 * small random vertical scroll and a longer pause.
 *
 * @param {object} page - Puppeteer page; uses `viewport()`, `mouse.move()` and `evaluate()`.
 * @param {object|null} [progressTracker] - Optional tracker. When present its
 *   `updateProgress` is called with the current `progress` value plus one.
 * @returns {Promise<void>}
 */
const simulateHumanBehavior = async (page, progressTracker) => {
  // Plain timer pause. `page.waitForTimeout` has been removed from current
  // Puppeteer releases, and the rest of this file already pauses this way.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  console.log("🧑 Simulating human-like mouse movements and delays...");

  // `page.viewport()` yields null when no viewport was configured; fall back
  // to the same 1920x1080 defaults the coordinate maths already assumed.
  const { width, height } = page.viewport() ?? {};
  const maxX = width || 1920;
  const maxY = height || 1080;

  for (let i = 0; i < 5; i++) {
    const x = Math.random() * maxX;
    const y = Math.random() * maxY;
    await page.mouse.move(x, y, { steps: 10 });
    await pause(Math.random() * 1000 + 500); // 0.5s - 1.5s between moves
  }

  // Random scroll a bit (between -100px and +100px)
  await page.evaluate(() => {
    window.scrollBy(0, Math.random() * 200 - 100);
  });
  await pause(Math.random() * 2000 + 1000); // 1s - 3s

  // NOTE(review): assumes the tracker exposes its current percentage as
  // `progress`; the fallback only prevents NaN if that field is unset.
  progressTracker?.updateProgress(
    (progressTracker.progress ?? 0) + 1,
    'humanizing',
    'Human behavior simulated'
  );
};
61
+
62
+ // --- Enhanced Cloudflare Bypass Function ---
63
/**
 * Detects a Cloudflare interstitial on the current page and waits for it to
 * clear.
 *
 * Flow:
 *  1. Wait up to 5s for any known Cloudflare marker to appear. If none does,
 *     return immediately.
 *  2. Run `simulateHumanBehavior`, then wait up to 90s for every blocking
 *     marker to disappear from the DOM.
 *  3. If that wait fails, click the Turnstile widget (when its response input
 *     is present) and wait up to 60s more.
 *  4. Pause a random 3-7s before reporting success.
 *
 * @param {object} page - Puppeteer page.
 * @param {object|null} [progressTracker] - Optional tracker; receives 35 on
 *   entry and 38 once a detected challenge has cleared.
 * @returns {Promise<void>}
 * @throws {Error} When the challenge is still present after the Turnstile
 *   fallback; the underlying failure is attached as `cause`.
 */
const handleCloudflareChallenge = async (page, progressTracker) => {
  // Plain timer pause; `page.waitForTimeout` no longer exists in current
  // Puppeteer releases.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  progressTracker?.updateProgress(35, 'cloudflare', 'Detecting and bypassing Cloudflare...');

  console.log("☁️ Checking for Cloudflare challenge...");
  const detectionSelectors = [
    '#challenge-running',
    '.cf-browser-verification',
    '[data-ray]',
    '#cf-challenge-running',
    '.under-attack',
    'iframe[src*="cloudflare"]',
    '#challenge-form', // JS challenge
    '.cf-turnstile' // Turnstile CAPTCHA
  ];
  // The iframe marker is used for detection only; as before, it is not one of
  // the elements that must vanish for the challenge to count as cleared.
  const blockingSelectors = detectionSelectors.filter((sel) => !sel.startsWith('iframe'));
  const anyMarker = detectionSelectors.join(', ');

  // Single source of truth for the "challenge gone" condition; the selector
  // list is shipped into the page as an argument instead of being duplicated.
  const waitUntilCleared = (timeout) => page.waitForFunction(
    (selectors) => !selectors.some((sel) => document.querySelector(sel)),
    { timeout },
    blockingSelectors
  );

  // One wait on the combined selector list. Probing the selectors one after
  // another cost 5s each, i.e. ~40s on every page without a challenge.
  let challengeDetected = false;
  try {
    await page.waitForSelector(anyMarker, { timeout: 5000 });
    challengeDetected = true;
  } catch (e) {
    // Nothing matched within the detection window: treat as "no challenge".
  }

  if (!challengeDetected) {
    console.log("✅ No Cloudflare challenge detected.");
    return;
  }

  // Work out which marker matched, for the log line only.
  let matchedSelector = anyMarker;
  try {
    for (const selector of detectionSelectors) {
      if (await page.$(selector)) {
        matchedSelector = selector;
        break;
      }
    }
  } catch (e) {
    // Informational lookup only; keep the combined selector in the log.
  }
  console.log(`☁️ Cloudflare challenge detected with selector: ${matchedSelector}`);

  // Simulate human behavior before attempting to solve
  await simulateHumanBehavior(page, progressTracker);

  // Wait for the challenge to resolve (JS execution)
  console.log("⏳ Waiting for Cloudflare challenge to complete...");
  try {
    await waitUntilCleared(90000); // generous timeout for slower challenges
  } catch (e) {
    console.log("⚠️ Standard wait failed, attempting Turnstile click...");
    // Fallback: check for and click Turnstile if present
    try {
      const cfInput = await page.$('[name="cf-turnstile-response"]');
      if (cfInput) {
        const parentItem = await cfInput.evaluateHandle((element) => element.parentElement);
        const coordinates = await parentItem.boundingBox();
        if (coordinates) {
          // Click near the left edge of the widget, vertically centred.
          await page.mouse.click(coordinates.x + 25, coordinates.y + coordinates.height / 2);
          console.log("🖱️ Clicked on Turnstile CAPTCHA");
          await pause(3000);
        }
      }
      // Retry wait after click
      await waitUntilCleared(60000);
    } catch (clickError) {
      console.error("❌ Turnstile click failed:", clickError.message);
      throw new Error(
        "Failed to bypass Cloudflare challenge. Try again later or use a proxy.",
        { cause: clickError }
      );
    }
  }

  // Additional wait for page to stabilize post-challenge with random delay
  const randomDelay = (min, max) => Math.floor(Math.random() * (max - min + 1) + min);
  await pause(randomDelay(3000, 7000));
  console.log("✅ Cloudflare challenge bypassed successfully.");
  progressTracker?.updateProgress(38, 'cloudflare', 'Cloudflare bypassed');
};
152
+
153
+ // --- Puppeteer Logic (Updated for Enhanced Cloudflare Bypass) ---
154
  const bypassCookiesAndRestrictions = async (page, progressTracker) => {
155
  progressTracker?.updateProgress(5, 'bypassing', 'Setting up cookie bypass...');
156
 
 
174
  }
175
  }
176
 
177
+ // Step 2: Inject CSS to hide cookie banners immediately (Updated: Added more selectors for previews and blurred overlays)
178
  await page.addStyleTag({
179
  content: `
180
  /* Hide all possible cookie banners */
 
189
  z-index: -9999 !important;
190
  pointer-events: none !important;
191
  }
192
+ /* Remove blur and premium overlays, including previews */
193
+ [class*="blur" i], [class*="premium" i], [class*="paywall" i], [class*="sample-preview-blur" i], [class*="preview" i], [class*="blurred-container" i], [class*="blurred" i] {
194
+ display: none !important;
195
  filter: none !important;
196
  backdrop-filter: none !important;
197
  opacity: 1 !important;
 
213
  overflow: auto !important;
214
  position: static !important;
215
  }
216
+ /* Hide Cloudflare elements if they persist */
217
+ #challenge-running, .cf-browser-verification, [data-ray], .under-attack {
218
+ display: none !important;
219
+ }
220
  `
221
  });
222
 
 
287
 
288
  removeBySelector("#adbox, .adsbox, .ad-box, .banner-ads, .advert");
289
  removeBySelector(".PremiumBannerBlobWrapper_overflow-wrapper__xsaS8");
290
+ // Added: Remove preview and blurred overlays
291
+ removeBySelector('[class*="preview" i], [class*="blurred-container" i], [class*="blurred" i]:not(img)');
292
 
293
  const removeBlur = (element = document) => {
294
  element.querySelectorAll("*").forEach(el => {
 
311
  };
312
 
313
  removeBlur();
 
314
 
315
  const contentSelectors = [
316
  '.document-content', '.page-content', '.content', '[data-page]', '[data-testid*="document"]',
 
328
  };
329
 
330
  removeRestrictions();
331
+ const intervalId = setInterval(removeRestrictions, 1000);
332
+ setTimeout(() => clearInterval(intervalId), 30000);
333
  });
334
 
335
  progressTracker?.updateProgress(20, 'unblurring', 'Content restrictions removed');
336
  };
337
 
338
+ // New function to fetch clear images by modifying blurred URLs
339
/**
 * Rewrites every `<img>` whose URL contains a `/blurred/` path segment so that
 * it points at the same URL without that segment, then waits for the page's
 * images to finish loading.
 *
 * @param {object} page - Puppeteer page; only `evaluate()` is used.
 * @param {object|null} [progressTracker] - Optional tracker; receives 65 on
 *   entry and 70 on completion.
 * @returns {Promise<void>}
 */
const fetchClearImages = async (page, progressTracker) => {
  progressTracker?.updateProgress(65, 'unblurring_images', 'Fetching clear page images...');

  console.log("🖼️ Modifying blurred image URLs to fetch clear versions...");
  // Return the count to Node: a console.log inside evaluate() prints to the
  // browser console and never shows up in the server log.
  const rewrittenCount = await page.evaluate(() => {
    const blurredImages = document.querySelectorAll('img[src*="/blurred/"]');
    blurredImages.forEach((img) => {
      img.src = img.src.replace(/\/blurred\//, '/');
    });
    return blurredImages.length;
  });
  console.log(`Modified ${rewrittenCount ?? 0} image src attribute(s)`);

  // Wait for the images to load; each one is given at most 10s so a single
  // stalled request cannot block the whole run.
  await page.evaluate(async () => {
    const images = Array.from(document.querySelectorAll('img'));
    await Promise.all(images.map((img) => {
      if (img.complete) return Promise.resolve();
      return new Promise((resolve) => {
        img.addEventListener('load', resolve);
        img.addEventListener('error', resolve);
        setTimeout(resolve, 10000);
      });
    }));
  });

  // The extra settling delay is only useful when new image requests were
  // actually triggered above.
  if (rewrittenCount > 0) {
    await new Promise((resolve) => setTimeout(resolve, 3000));
  }
  progressTracker?.updateProgress(70, 'unblurring_images', 'Clear images loaded');
};
367
+
368
  const applyPrintStyles = async (page, progressTracker) => {
369
  progressTracker?.updateProgress(85, 'styling', 'Applying print styles...');
370
 
 
388
  overflow: visible !important;
389
  background: white !important;
390
  color: black !important;
391
+ display: flex;
392
+ justify-content: center;
393
  }
394
  /* Remove all unwanted elements like headers, footers, sidebars, etc. */
395
  header, footer, nav, aside, .no-print, .ads, .sidebar, .premium-banner,
 
418
  display: block !important;
419
  width: 100% !important;
420
  max-width: none !important;
421
+ margin: 0 auto !important; /* Center horizontally */
422
  padding: 0 !important;
423
  box-sizing: border-box; /* Include padding in width calculation */
424
  transform: none !important;
 
469
  '--disable-features=site-per-process',
470
  '--disable-blink-features=AutomationControlled',
471
  '--disable-extensions',
472
+ '--ignore-certificate-errors',
473
+ // NEW: Additional args for better Cloudflare evasion
474
+ '--disable-features=TranslateUI',
475
+ '--disable-ipc-flooding',
476
+ '--disable-backgrounding-occluded-windows',
477
+ '--disable-renderer-backgrounding',
478
+ '--disable-features=TranslateUI,BlinkGenPropertyTrees',
479
+ '--metrics-recording-only',
480
+ '--no-default-browser-check',
481
+ '--safebrowsing-disable-auto-update',
482
+ '--password-store=basic',
483
+ '--use-mock-keychain'
484
  ],
485
  ignoreHTTPSErrors: true,
486
  timeout: 300000,
 
490
 
491
  progressTracker?.updateProgress(2, 'initializing', 'Configuring browser settings...');
492
 
493
+ await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
494
+ await page.setViewport({ width: 1920, height: 1080 }); // NEW: Use full HD for more realistic viewport, adjust back if needed for A4
495
 
496
  // NOTE: Stealth plugin handles most of this, but keeping for extra safety
497
  await page.evaluateOnNewDocument(() => {
498
  Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
499
  Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
500
  Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
501
+ // NEW: Additional stealth evasions
502
+ Object.defineProperty(navigator, 'permissions', {
503
+ get: () => ({
504
+ query: () => Promise.resolve({ state: 'granted' })
505
+ })
506
+ });
507
+ window.chrome = {
508
+ runtime: {},
509
+ loadTimes: function () { },
510
+ csi: function () { },
511
+ app: {}
512
+ };
513
  });
514
 
515
  // Set up cookie and content bypass
516
  await bypassCookiesAndRestrictions(page, progressTracker);
517
 
518
+ // Block unnecessary resources (UPDATED: Loosened for Cloudflare - allow cloudflare.com requests)
519
  await page.setRequestInterception(true);
520
  page.on('request', (req) => {
521
  const resourceType = req.resourceType();
 
526
  return;
527
  }
528
 
529
+ // NEW: Always allow Cloudflare-related requests
530
+ if (reqUrl.includes('cloudflare') || reqUrl.includes('cf-')) {
531
+ req.continue();
532
+ return;
533
+ }
534
+
535
  if (
536
  ['image', 'media', 'font', 'stylesheet'].includes(resourceType) && // Block non-essential images/media/fonts/styles early if not core
537
  !reqUrl.includes('document') && !reqUrl.includes('page') && !reqUrl.includes('studocu') || // Allow core document images
538
+ resourceType === 'script' && !reqUrl.includes('studocu') && !reqUrl.includes('cloudflare') || // Block third-party scripts except Cloudflare
539
  reqUrl.includes('doubleclick') ||
540
  reqUrl.includes('googletagmanager') ||
541
  reqUrl.includes('facebook.com') ||
 
559
  progressTracker?.updateProgress(12, 'authenticating', 'Logging into StuDocu...');
560
 
561
  console.log("πŸ”‘ Logging in to StuDocu...");
562
+ await page.goto('https://www.studocu.com/en-us/login', { waitUntil: 'domcontentloaded', timeout: 120000 });
563
+ // NEW: Handle potential Cloudflare on login page
564
+ await handleCloudflareChallenge(page, progressTracker);
565
+ await page.waitForSelector('#email', { timeout: 15000 });
566
  await page.type('#email', options.email);
567
  await page.type('#password', options.password);
568
  await page.click('button[type="submit"]');
569
  try {
570
+ await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 });
571
+ await page.waitForSelector('.user-profile, [data-testid="user-menu"]', { timeout: 10000 });
572
  console.log("βœ… Login successful.");
573
  progressTracker?.updateProgress(18, 'authenticated', 'Login successful');
574
  } catch (e) {
 
589
  attempts++;
590
  progressTracker?.updateProgress(30 + (attempts * 5), 'navigating', `Navigation attempt ${attempts}/${maxAttempts}`);
591
  console.log(`Navigation attempt ${attempts}/${maxAttempts}`);
592
+ await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 120000 }); // Increased from 60000
593
  navigationSuccess = true;
594
  } catch (e) {
595
  console.log(`Navigation attempt ${attempts} failed:`, e.message);
596
  if (attempts >= maxAttempts) throw e;
597
+ await new Promise(resolve => setTimeout(resolve, 10000)); // Increased retry delay to 10s for stability
598
  }
599
  }
600
 
601
+ // NEW: Handle Cloudflare after navigation
602
+ await handleCloudflareChallenge(page, progressTracker);
603
+
604
  progressTracker?.updateProgress(40, 'loading', 'Page loaded, waiting for content...');
605
+ await new Promise(resolve => setTimeout(resolve, 5000)); // Increased from 2000ms for better loading
606
 
607
  // Apply content unblurring
608
  await unblurContent(page, progressTracker);
 
618
  let contentFound = false;
619
  for (const selector of contentSelectors) {
620
  try {
621
+ await page.waitForSelector(selector, { timeout: 20000 }); // Increased from 10000
622
  console.log(`βœ… Found content with selector: ${selector}`);
623
  contentFound = true;
624
  break;
 
644
  while (totalHeight < scrollHeight) {
645
  window.scrollBy(0, distance);
646
  totalHeight += distance;
647
+ await delay(300); // Increased from 200ms for large docs stability
648
  }
649
+ await delay(2000); // Increased from 1000ms
650
  const newHeight = document.body.scrollHeight;
651
  if (newHeight === scrollHeight) break;
652
  scrollHeight = newHeight;
653
  }
654
  window.scrollTo({ top: 0, behavior: "smooth" });
655
+ await delay(1000); // Increased from 500ms
656
  });
657
 
 
 
658
  // Re-apply unblur after loading new content
659
  await unblurContent(page, progressTracker);
660
 
661
+ // New: Fetch clear images for blurred pages
662
+ await fetchClearImages(page, progressTracker);
663
+
664
  // Wait for all images to load (Optimized: Reduced per-image timeout, parallel wait)
665
  progressTracker?.updateProgress(75, 'loading_images', 'Loading images...');
666
  console.log("πŸ–ΌοΈ Waiting for all images to load...");
 
672
  return new Promise((resolve) => {
673
  img.addEventListener('load', resolve);
674
  img.addEventListener('error', resolve);
675
+ setTimeout(resolve, 10000); // Increased from 5000ms for large docs
676
  });
677
  }));
678
  });
679
 
680
+ await new Promise(resolve => setTimeout(resolve, 5000)); // Increased from 2000ms
681
  progressTracker?.updateProgress(80, 'finalizing', 'Preparing document for PDF generation...');
682
 
683
  // Set exact height
 
730
  printBackground: true,
731
  preferCSSPageSize: true, // Use the @page size
732
  displayHeaderFooter: false,
733
+ timeout: 180000, // Increased back to 180000 for large PDFs
734
  scale: 1,
735
  omitBackground: false
736
  });
 
855
 
856
  app.get('/', (req, res) => {
857
  res.json({
858
+ message: 'πŸš€ Enhanced StuDocu Downloader API v5.3 - Real-time Progress Tracking with Cloudflare Bypass',
859
+ version: '5.3.0',
860
  features: [
861
  'πŸͺ Advanced cookie banner bypass',
862
  'πŸ”“ Premium content unblurring',
863
  'πŸ”‘ Login support for full access',
864
  'πŸ“Š Real-time progress tracking via polling',
865
  'πŸ“„ Clean PDF generation with print styles',
866
+ 'πŸ•΅οΈ Enhanced stealth to evade bot detection',
867
+ '☁️ Automatic Cloudflare challenge handling',
868
+ 'πŸ§‘ Human-like behavior simulation'
869
  ],
870
  endpoints: {
871
  request: 'POST /api/request-download (body: {url, filename?, email?, password?})',
 
887
  });
888
 
889
  app.listen(port, () => {
890
+ console.log(`πŸš€ Enhanced StuDocu Downloader v5.3.0 running on http://localhost:${port}`);
891
+ console.log(`✨ Features: Real-time progress tracking, enhanced stealth, Cloudflare bypass, and user feedback`);
892
  });