heymenn committed on
Commit
1cb47b6
·
1 Parent(s): 14e31ef

add parallelization for downloads

Browse files
Files changed (1) hide show
  1. classes.py +32 -16
classes.py CHANGED
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
5
  import os
6
  import json
7
  from urllib.parse import urljoin
 
8
 
9
  def _get_proxies() -> dict:
10
  """Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
@@ -218,8 +219,7 @@ class ETSISpecFinder:
218
  except (ValueError, IndexError):
219
  return []
220
 
221
- candidates = []
222
- for spec_type in ["TS", "TR"]:
223
  params = {
224
  "option": "com_standardssearch",
225
  "view": "data",
@@ -235,9 +235,15 @@ class ETSISpecFinder:
235
  proxies=_get_proxies())
236
  data = resp.json()
237
  if data and isinstance(data, list):
238
- candidates.extend(str(item["wki_id"]) for item in data if "wki_id" in item)
239
  except Exception as e:
240
  print(f"Error getting wki_id for {doc_id}: {e}")
 
 
 
 
 
 
241
  return candidates
242
 
243
  def _authenticate_eol(self, wki_id: str) -> requests.Session:
@@ -273,11 +279,17 @@ class ETSISpecFinder:
273
  if not candidates:
274
  return f"Specification {doc_id} not found"
275
 
276
- for wki_id in candidates:
 
 
 
277
  print(f"Trying wki_id={wki_id} for {doc_id}")
278
- session = self._authenticate_eol(wki_id)
 
 
 
 
279
 
280
- # NTaccount.asp → parse profile_id from meta-refresh
281
  r = session.get(
282
  f"https://portal.etsi.org/webapp/protect/NTaccount.asp?Wki_Id={wki_id}",
283
  verify=False, timeout=15,
@@ -285,24 +297,22 @@ class ETSISpecFinder:
285
  meta_match = re.search(r'URL=([^"\'\s>]+)', r.text)
286
  if not meta_match:
287
  print(f" wki_id={wki_id}: authentication failed, trying next")
288
- continue
289
 
290
  meta_url = meta_match.group(1)
291
  if not meta_url.startswith("http"):
292
  meta_url = f"https://portal.etsi.org/webapp/protect/{meta_url}"
293
 
294
- # CheckIdentifier → 302 to copy_file
295
  r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
296
  if r2.status_code != 302:
297
  print(f" wki_id={wki_id}: unexpected status {r2.status_code}, trying next")
298
- continue
299
 
300
  location2 = r2.headers.get("Location", "")
301
  if "processError" in location2 or "processErrors" in location2:
302
  print(f" wki_id={wki_id}: portal rejected ({location2}), trying next")
303
- continue
304
 
305
- # copy_file (may have a second redirect)
306
  copy_url = urljoin("https://portal.etsi.org/", location2)
307
  r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)
308
 
@@ -313,18 +323,17 @@ class ETSISpecFinder:
313
  else:
314
  r4 = r3
315
 
316
- # Extract DOCX link
317
  docx_urls = re.findall(r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE)
318
  if not docx_urls:
319
  print(f" wki_id={wki_id}: DOCX not found in page, trying next")
320
- continue
321
 
322
- # Verify the DOCX belongs to the requested spec (e.g. "102 223" → "102223")
323
  spec_num = doc_id.split("-")[0].replace(" ", "")
324
  matching_urls = [u for u in docx_urls if spec_num in u.split("/")[-1]]
325
  if not matching_urls:
326
- print(f" wki_id={wki_id}: DOCX spec mismatch (expected {spec_num} in filename), trying next")
327
- continue
 
328
  docx_url = matching_urls[0]
329
  dl = session.get(docx_url, headers={"Referer": r4.url}, verify=False, timeout=60)
330
  filename = docx_url.split("/")[-1]
@@ -335,4 +344,11 @@ class ETSISpecFinder:
335
  print(f" wki_id={wki_id}: success")
336
  return tmp_path
337
 
 
 
 
 
 
 
 
338
  return f"Specification {doc_id}: all {len(candidates)} wki_id candidate(s) rejected by ETSI portal"
 
5
  import os
6
  import json
7
  from urllib.parse import urljoin
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
 
10
  def _get_proxies() -> dict:
11
  """Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
 
219
  except (ValueError, IndexError):
220
  return []
221
 
222
+ def fetch_for_type(spec_type):
 
223
  params = {
224
  "option": "com_standardssearch",
225
  "view": "data",
 
235
  proxies=_get_proxies())
236
  data = resp.json()
237
  if data and isinstance(data, list):
238
+ return [str(item["wki_id"]) for item in data if "wki_id" in item]
239
  except Exception as e:
240
  print(f"Error getting wki_id for {doc_id}: {e}")
241
+ return []
242
+
243
+ candidates = []
244
+ with ThreadPoolExecutor(max_workers=2) as executor:
245
+ for result in executor.map(fetch_for_type, ["TS", "TR"]):
246
+ candidates.extend(result)
247
  return candidates
248
 
249
  def _authenticate_eol(self, wki_id: str) -> requests.Session:
 
279
  if not candidates:
280
  return f"Specification {doc_id} not found"
281
 
282
+ # Authenticate once — cookies are auth tokens, not wki_id-specific
283
+ auth_session = self._authenticate_eol(candidates[0])
284
+
285
+ def try_wki(wki_id):
286
  print(f"Trying wki_id={wki_id} for {doc_id}")
287
+ # Each thread gets its own session pre-loaded with the shared auth cookies
288
+ session = requests.Session()
289
+ session.headers.update({"User-Agent": self.headers["User-Agent"]})
290
+ session.proxies.update(_get_proxies())
291
+ session.cookies.update(auth_session.cookies)
292
 
 
293
  r = session.get(
294
  f"https://portal.etsi.org/webapp/protect/NTaccount.asp?Wki_Id={wki_id}",
295
  verify=False, timeout=15,
 
297
  meta_match = re.search(r'URL=([^"\'\s>]+)', r.text)
298
  if not meta_match:
299
  print(f" wki_id={wki_id}: authentication failed, trying next")
300
+ return None
301
 
302
  meta_url = meta_match.group(1)
303
  if not meta_url.startswith("http"):
304
  meta_url = f"https://portal.etsi.org/webapp/protect/{meta_url}"
305
 
 
306
  r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
307
  if r2.status_code != 302:
308
  print(f" wki_id={wki_id}: unexpected status {r2.status_code}, trying next")
309
+ return None
310
 
311
  location2 = r2.headers.get("Location", "")
312
  if "processError" in location2 or "processErrors" in location2:
313
  print(f" wki_id={wki_id}: portal rejected ({location2}), trying next")
314
+ return None
315
 
 
316
  copy_url = urljoin("https://portal.etsi.org/", location2)
317
  r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)
318
 
 
323
  else:
324
  r4 = r3
325
 
 
326
  docx_urls = re.findall(r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE)
327
  if not docx_urls:
328
  print(f" wki_id={wki_id}: DOCX not found in page, trying next")
329
+ return None
330
 
 
331
  spec_num = doc_id.split("-")[0].replace(" ", "")
332
  matching_urls = [u for u in docx_urls if spec_num in u.split("/")[-1]]
333
  if not matching_urls:
334
+ print(f" wki_id={wki_id}: DOCX spec mismatch (expected {spec_num}), trying next")
335
+ return None
336
+
337
  docx_url = matching_urls[0]
338
  dl = session.get(docx_url, headers={"Referer": r4.url}, verify=False, timeout=60)
339
  filename = docx_url.split("/")[-1]
 
344
  print(f" wki_id={wki_id}: success")
345
  return tmp_path
346
 
347
+ with ThreadPoolExecutor(max_workers=min(len(candidates), 4)) as executor:
348
+ future_to_wki = {executor.submit(try_wki, wki_id): wki_id for wki_id in candidates}
349
+ for future in as_completed(future_to_wki):
350
+ result = future.result()
351
+ if result is not None:
352
+ return result
353
+
354
  return f"Specification {doc_id}: all {len(candidates)} wki_id candidate(s) rejected by ETSI portal"