Spaces:
Build error
Build error
fix: fail early on special cookie redirects
Browse files
- src/services/curl.ts +10 -3
src/services/curl.ts
CHANGED
|
@@ -294,7 +294,8 @@ export class CurlControl extends AsyncService {
|
|
| 294 |
}
|
| 295 |
|
| 296 |
async urlToFile(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
|
| 297 |
-
let leftRedirection =
|
|
|
|
| 298 |
let opts = { ...crawlOpts };
|
| 299 |
let nextHopUrl = urlToCrawl;
|
| 300 |
const fakeHeaderInfos: HeaderInfo[] = [];
|
|
@@ -312,10 +313,16 @@ export class CurlControl extends AsyncService {
|
|
| 312 |
if (parsed.length) {
|
| 313 |
opts.cookies = [...(opts.cookies || []), ...parsed];
|
| 314 |
}
|
|
|
|
|
|
|
|
|
|
| 315 |
}
|
| 316 |
|
| 317 |
if (!location && !setCookieHeader) {
|
| 318 |
-
throw new
|
|
|
|
|
|
|
|
|
|
| 319 |
}
|
| 320 |
|
| 321 |
nextHopUrl = new URL(location || '', nextHopUrl);
|
|
@@ -331,7 +338,7 @@ export class CurlControl extends AsyncService {
|
|
| 331 |
};
|
| 332 |
} while (leftRedirection > 0);
|
| 333 |
|
| 334 |
-
throw new
|
| 335 |
}
|
| 336 |
|
| 337 |
async sideLoad(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
|
|
|
|
| 294 |
}
|
| 295 |
|
| 296 |
async urlToFile(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
|
| 297 |
+
let leftRedirection = 6;
|
| 298 |
+
let cookieRedirects = 0;
|
| 299 |
let opts = { ...crawlOpts };
|
| 300 |
let nextHopUrl = urlToCrawl;
|
| 301 |
const fakeHeaderInfos: HeaderInfo[] = [];
|
|
|
|
| 313 |
if (parsed.length) {
|
| 314 |
opts.cookies = [...(opts.cookies || []), ...parsed];
|
| 315 |
}
|
| 316 |
+
if (!location) {
|
| 317 |
+
cookieRedirects += 1;
|
| 318 |
+
}
|
| 319 |
}
|
| 320 |
|
| 321 |
if (!location && !setCookieHeader) {
|
| 322 |
+
throw new ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Bad redirection from ${nextHopUrl}`);
|
| 323 |
+
}
|
| 324 |
+
if (!location && cookieRedirects > 1) {
|
| 325 |
+
throw new ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`);
|
| 326 |
}
|
| 327 |
|
| 328 |
nextHopUrl = new URL(location || '', nextHopUrl);
|
|
|
|
| 338 |
};
|
| 339 |
} while (leftRedirection > 0);
|
| 340 |
|
| 341 |
+
throw new ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Too many redirections.`);
|
| 342 |
}
|
| 343 |
|
| 344 |
async sideLoad(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
|