Spaces:
Running
Running
add parallelization for downloads
Browse files: classes.py (+32 -16)
classes.py
CHANGED
|
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
|
|
| 5 |
import os
|
| 6 |
import json
|
| 7 |
from urllib.parse import urljoin
|
|
|
|
| 8 |
|
| 9 |
def _get_proxies() -> dict:
|
| 10 |
"""Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
|
|
@@ -218,8 +219,7 @@ class ETSISpecFinder:
|
|
| 218 |
except (ValueError, IndexError):
|
| 219 |
return []
|
| 220 |
|
| 221 |
-
|
| 222 |
-
for spec_type in ["TS", "TR"]:
|
| 223 |
params = {
|
| 224 |
"option": "com_standardssearch",
|
| 225 |
"view": "data",
|
|
@@ -235,9 +235,15 @@ class ETSISpecFinder:
|
|
| 235 |
proxies=_get_proxies())
|
| 236 |
data = resp.json()
|
| 237 |
if data and isinstance(data, list):
|
| 238 |
-
|
| 239 |
except Exception as e:
|
| 240 |
print(f"Error getting wki_id for {doc_id}: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
return candidates
|
| 242 |
|
| 243 |
def _authenticate_eol(self, wki_id: str) -> requests.Session:
|
|
@@ -273,11 +279,17 @@ class ETSISpecFinder:
|
|
| 273 |
if not candidates:
|
| 274 |
return f"Specification {doc_id} not found"
|
| 275 |
|
| 276 |
-
|
|
|
|
|
|
|
|
|
|
| 277 |
print(f"Trying wki_id={wki_id} for {doc_id}")
|
| 278 |
-
session
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
-
# NTaccount.asp → parse profile_id from meta-refresh
|
| 281 |
r = session.get(
|
| 282 |
f"https://portal.etsi.org/webapp/protect/NTaccount.asp?Wki_Id={wki_id}",
|
| 283 |
verify=False, timeout=15,
|
|
@@ -285,24 +297,22 @@ class ETSISpecFinder:
|
|
| 285 |
meta_match = re.search(r'URL=([^"\'\s>]+)', r.text)
|
| 286 |
if not meta_match:
|
| 287 |
print(f" wki_id={wki_id}: authentication failed, trying next")
|
| 288 |
-
|
| 289 |
|
| 290 |
meta_url = meta_match.group(1)
|
| 291 |
if not meta_url.startswith("http"):
|
| 292 |
meta_url = f"https://portal.etsi.org/webapp/protect/{meta_url}"
|
| 293 |
|
| 294 |
-
# CheckIdentifier → 302 to copy_file
|
| 295 |
r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
|
| 296 |
if r2.status_code != 302:
|
| 297 |
print(f" wki_id={wki_id}: unexpected status {r2.status_code}, trying next")
|
| 298 |
-
|
| 299 |
|
| 300 |
location2 = r2.headers.get("Location", "")
|
| 301 |
if "processError" in location2 or "processErrors" in location2:
|
| 302 |
print(f" wki_id={wki_id}: portal rejected ({location2}), trying next")
|
| 303 |
-
|
| 304 |
|
| 305 |
-
# copy_file (may have a second redirect)
|
| 306 |
copy_url = urljoin("https://portal.etsi.org/", location2)
|
| 307 |
r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)
|
| 308 |
|
|
@@ -313,18 +323,17 @@ class ETSISpecFinder:
|
|
| 313 |
else:
|
| 314 |
r4 = r3
|
| 315 |
|
| 316 |
-
# Extract DOCX link
|
| 317 |
docx_urls = re.findall(r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE)
|
| 318 |
if not docx_urls:
|
| 319 |
print(f" wki_id={wki_id}: DOCX not found in page, trying next")
|
| 320 |
-
|
| 321 |
|
| 322 |
-
# Verify the DOCX belongs to the requested spec (e.g. "102 223" → "102223")
|
| 323 |
spec_num = doc_id.split("-")[0].replace(" ", "")
|
| 324 |
matching_urls = [u for u in docx_urls if spec_num in u.split("/")[-1]]
|
| 325 |
if not matching_urls:
|
| 326 |
-
print(f" wki_id={wki_id}: DOCX spec mismatch (expected {spec_num}
|
| 327 |
-
|
|
|
|
| 328 |
docx_url = matching_urls[0]
|
| 329 |
dl = session.get(docx_url, headers={"Referer": r4.url}, verify=False, timeout=60)
|
| 330 |
filename = docx_url.split("/")[-1]
|
|
@@ -335,4 +344,11 @@ class ETSISpecFinder:
|
|
| 335 |
print(f" wki_id={wki_id}: success")
|
| 336 |
return tmp_path
|
| 337 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
return f"Specification {doc_id}: all {len(candidates)} wki_id candidate(s) rejected by ETSI portal"
|
|
|
|
| 5 |
import os
|
| 6 |
import json
|
| 7 |
from urllib.parse import urljoin
|
| 8 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 9 |
|
| 10 |
def _get_proxies() -> dict:
|
| 11 |
"""Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
|
|
|
|
| 219 |
except (ValueError, IndexError):
|
| 220 |
return []
|
| 221 |
|
| 222 |
+
def fetch_for_type(spec_type):
|
|
|
|
| 223 |
params = {
|
| 224 |
"option": "com_standardssearch",
|
| 225 |
"view": "data",
|
|
|
|
| 235 |
proxies=_get_proxies())
|
| 236 |
data = resp.json()
|
| 237 |
if data and isinstance(data, list):
|
| 238 |
+
return [str(item["wki_id"]) for item in data if "wki_id" in item]
|
| 239 |
except Exception as e:
|
| 240 |
print(f"Error getting wki_id for {doc_id}: {e}")
|
| 241 |
+
return []
|
| 242 |
+
|
| 243 |
+
candidates = []
|
| 244 |
+
with ThreadPoolExecutor(max_workers=2) as executor:
|
| 245 |
+
for result in executor.map(fetch_for_type, ["TS", "TR"]):
|
| 246 |
+
candidates.extend(result)
|
| 247 |
return candidates
|
| 248 |
|
| 249 |
def _authenticate_eol(self, wki_id: str) -> requests.Session:
|
|
|
|
| 279 |
if not candidates:
|
| 280 |
return f"Specification {doc_id} not found"
|
| 281 |
|
| 282 |
+
# Authenticate once — cookies are auth tokens, not wki_id-specific
|
| 283 |
+
auth_session = self._authenticate_eol(candidates[0])
|
| 284 |
+
|
| 285 |
+
def try_wki(wki_id):
|
| 286 |
print(f"Trying wki_id={wki_id} for {doc_id}")
|
| 287 |
+
# Each thread gets its own session pre-loaded with the shared auth cookies
|
| 288 |
+
session = requests.Session()
|
| 289 |
+
session.headers.update({"User-Agent": self.headers["User-Agent"]})
|
| 290 |
+
session.proxies.update(_get_proxies())
|
| 291 |
+
session.cookies.update(auth_session.cookies)
|
| 292 |
|
|
|
|
| 293 |
r = session.get(
|
| 294 |
f"https://portal.etsi.org/webapp/protect/NTaccount.asp?Wki_Id={wki_id}",
|
| 295 |
verify=False, timeout=15,
|
|
|
|
| 297 |
meta_match = re.search(r'URL=([^"\'\s>]+)', r.text)
|
| 298 |
if not meta_match:
|
| 299 |
print(f" wki_id={wki_id}: authentication failed, trying next")
|
| 300 |
+
return None
|
| 301 |
|
| 302 |
meta_url = meta_match.group(1)
|
| 303 |
if not meta_url.startswith("http"):
|
| 304 |
meta_url = f"https://portal.etsi.org/webapp/protect/{meta_url}"
|
| 305 |
|
|
|
|
| 306 |
r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
|
| 307 |
if r2.status_code != 302:
|
| 308 |
print(f" wki_id={wki_id}: unexpected status {r2.status_code}, trying next")
|
| 309 |
+
return None
|
| 310 |
|
| 311 |
location2 = r2.headers.get("Location", "")
|
| 312 |
if "processError" in location2 or "processErrors" in location2:
|
| 313 |
print(f" wki_id={wki_id}: portal rejected ({location2}), trying next")
|
| 314 |
+
return None
|
| 315 |
|
|
|
|
| 316 |
copy_url = urljoin("https://portal.etsi.org/", location2)
|
| 317 |
r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)
|
| 318 |
|
|
|
|
| 323 |
else:
|
| 324 |
r4 = r3
|
| 325 |
|
|
|
|
| 326 |
docx_urls = re.findall(r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE)
|
| 327 |
if not docx_urls:
|
| 328 |
print(f" wki_id={wki_id}: DOCX not found in page, trying next")
|
| 329 |
+
return None
|
| 330 |
|
|
|
|
| 331 |
spec_num = doc_id.split("-")[0].replace(" ", "")
|
| 332 |
matching_urls = [u for u in docx_urls if spec_num in u.split("/")[-1]]
|
| 333 |
if not matching_urls:
|
| 334 |
+
print(f" wki_id={wki_id}: DOCX spec mismatch (expected {spec_num}), trying next")
|
| 335 |
+
return None
|
| 336 |
+
|
| 337 |
docx_url = matching_urls[0]
|
| 338 |
dl = session.get(docx_url, headers={"Referer": r4.url}, verify=False, timeout=60)
|
| 339 |
filename = docx_url.split("/")[-1]
|
|
|
|
| 344 |
print(f" wki_id={wki_id}: success")
|
| 345 |
return tmp_path
|
| 346 |
|
| 347 |
+
with ThreadPoolExecutor(max_workers=min(len(candidates), 4)) as executor:
|
| 348 |
+
future_to_wki = {executor.submit(try_wki, wki_id): wki_id for wki_id in candidates}
|
| 349 |
+
for future in as_completed(future_to_wki):
|
| 350 |
+
result = future.result()
|
| 351 |
+
if result is not None:
|
| 352 |
+
return result
|
| 353 |
+
|
| 354 |
return f"Specification {doc_id}: all {len(candidates)} wki_id candidate(s) rejected by ETSI portal"
|