Spaces:

OrganizedProgrammers
/

DocFinder

Running

App Files Community

heymenn committed 15 days ago

Commit

1cb47b6

1 Parent(s): 14e31ef

add parallelization for downloads

Browse files

Files changed (1) hide show

classes.py +32 -16

classes.py CHANGED Viewed

@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
 import os
 import json
 from urllib.parse import urljoin
 def _get_proxies() -> dict:
     """Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
@@ -218,8 +219,7 @@ class ETSISpecFinder:
             except (ValueError, IndexError):
                 return []
-        candidates = []
-        for spec_type in ["TS", "TR"]:
             params = {
                 "option": "com_standardssearch",
                 "view": "data",
@@ -235,9 +235,15 @@ class ETSISpecFinder:
                                     proxies=_get_proxies())
                 data = resp.json()
                 if data and isinstance(data, list):
-                    candidates.extend(str(item["wki_id"]) for item in data if "wki_id" in item)
             except Exception as e:
                 print(f"Error getting wki_id for {doc_id}: {e}")
         return candidates
     def _authenticate_eol(self, wki_id: str) -> requests.Session:
@@ -273,11 +279,17 @@ class ETSISpecFinder:
         if not candidates:
             return f"Specification {doc_id} not found"
-        for wki_id in candidates:
             print(f"Trying wki_id={wki_id} for {doc_id}")
-            session = self._authenticate_eol(wki_id)
-            # NTaccount.asp → parse profile_id from meta-refresh
             r = session.get(
                 f"https://portal.etsi.org/webapp/protect/NTaccount.asp?Wki_Id={wki_id}",
                 verify=False, timeout=15,
@@ -285,24 +297,22 @@ class ETSISpecFinder:
             meta_match = re.search(r'URL=([^"\'\s>]+)', r.text)
             if not meta_match:
                 print(f"  wki_id={wki_id}: authentication failed, trying next")
-                continue
             meta_url = meta_match.group(1)
             if not meta_url.startswith("http"):
                 meta_url = f"https://portal.etsi.org/webapp/protect/{meta_url}"
-            # CheckIdentifier → 302 to copy_file
             r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
             if r2.status_code != 302:
                 print(f"  wki_id={wki_id}: unexpected status {r2.status_code}, trying next")
-                continue
             location2 = r2.headers.get("Location", "")
             if "processError" in location2 or "processErrors" in location2:
                 print(f"  wki_id={wki_id}: portal rejected ({location2}), trying next")
-                continue
-            # copy_file (may have a second redirect)
             copy_url = urljoin("https://portal.etsi.org/", location2)
             r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)
@@ -313,18 +323,17 @@ class ETSISpecFinder:
             else:
                 r4 = r3
-            # Extract DOCX link
             docx_urls = re.findall(r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE)
             if not docx_urls:
                 print(f"  wki_id={wki_id}: DOCX not found in page, trying next")
-                continue
-            # Verify the DOCX belongs to the requested spec (e.g. "102 223" → "102223")
             spec_num = doc_id.split("-")[0].replace(" ", "")
             matching_urls = [u for u in docx_urls if spec_num in u.split("/")[-1]]
             if not matching_urls:
-                print(f"  wki_id={wki_id}: DOCX spec mismatch (expected {spec_num} in filename), trying next")
-                continue
             docx_url = matching_urls[0]
             dl = session.get(docx_url, headers={"Referer": r4.url}, verify=False, timeout=60)
             filename = docx_url.split("/")[-1]
@@ -335,4 +344,11 @@ class ETSISpecFinder:
             print(f"  wki_id={wki_id}: success")
             return tmp_path
         return f"Specification {doc_id}: all {len(candidates)} wki_id candidate(s) rejected by ETSI portal"

 import os
 import json
 from urllib.parse import urljoin
+from concurrent.futures import ThreadPoolExecutor, as_completed
 def _get_proxies() -> dict:
     """Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
             except (ValueError, IndexError):
                 return []
+        def fetch_for_type(spec_type):
             params = {
                 "option": "com_standardssearch",
                 "view": "data",
                                     proxies=_get_proxies())
                 data = resp.json()
                 if data and isinstance(data, list):
+                    return [str(item["wki_id"]) for item in data if "wki_id" in item]
             except Exception as e:
                 print(f"Error getting wki_id for {doc_id}: {e}")
+            return []
+        candidates = []
+        with ThreadPoolExecutor(max_workers=2) as executor:
+            for result in executor.map(fetch_for_type, ["TS", "TR"]):
+                candidates.extend(result)
         return candidates
     def _authenticate_eol(self, wki_id: str) -> requests.Session:
         if not candidates:
             return f"Specification {doc_id} not found"
+        # Authenticate once — cookies are auth tokens, not wki_id-specific
+        auth_session = self._authenticate_eol(candidates[0])
+        def try_wki(wki_id):
             print(f"Trying wki_id={wki_id} for {doc_id}")
+            # Each thread gets its own session pre-loaded with the shared auth cookies
+            session = requests.Session()
+            session.headers.update({"User-Agent": self.headers["User-Agent"]})
+            session.proxies.update(_get_proxies())
+            session.cookies.update(auth_session.cookies)
             r = session.get(
                 f"https://portal.etsi.org/webapp/protect/NTaccount.asp?Wki_Id={wki_id}",
                 verify=False, timeout=15,
             meta_match = re.search(r'URL=([^"\'\s>]+)', r.text)
             if not meta_match:
                 print(f"  wki_id={wki_id}: authentication failed, trying next")
+                return None
             meta_url = meta_match.group(1)
             if not meta_url.startswith("http"):
                 meta_url = f"https://portal.etsi.org/webapp/protect/{meta_url}"
             r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
             if r2.status_code != 302:
                 print(f"  wki_id={wki_id}: unexpected status {r2.status_code}, trying next")
+                return None
             location2 = r2.headers.get("Location", "")
             if "processError" in location2 or "processErrors" in location2:
                 print(f"  wki_id={wki_id}: portal rejected ({location2}), trying next")
+                return None
             copy_url = urljoin("https://portal.etsi.org/", location2)
             r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)
             else:
                 r4 = r3
             docx_urls = re.findall(r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE)
             if not docx_urls:
                 print(f"  wki_id={wki_id}: DOCX not found in page, trying next")
+                return None
             spec_num = doc_id.split("-")[0].replace(" ", "")
             matching_urls = [u for u in docx_urls if spec_num in u.split("/")[-1]]
             if not matching_urls:
+                print(f"  wki_id={wki_id}: DOCX spec mismatch (expected {spec_num}), trying next")
+                return None
             docx_url = matching_urls[0]
             dl = session.get(docx_url, headers={"Referer": r4.url}, verify=False, timeout=60)
             filename = docx_url.split("/")[-1]
             print(f"  wki_id={wki_id}: success")
             return tmp_path
+        with ThreadPoolExecutor(max_workers=min(len(candidates), 4)) as executor:
+            future_to_wki = {executor.submit(try_wki, wki_id): wki_id for wki_id in candidates}
+            for future in as_completed(future_to_wki):
+                result = future.result()
+                if result is not None:
+                    return result
         return f"Specification {doc_id}: all {len(candidates)} wki_id candidate(s) rejected by ETSI portal"