heymenn committed on
Commit
31bba0e
·
1 Parent(s): 8d410d8

fix CR issue when downloading

Browse files
Files changed (2) hide show
  1. app.py +27 -0
  2. classes.py +49 -4
app.py CHANGED
@@ -347,6 +347,33 @@ def find_document_batch(request: BatchDocRequest):
347
  search_time=time.time()-start_time
348
  )
349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  @app.post("/find/docx", tags=["Document Retrieval"], summary="Download an ETSI specification as DOCX",
351
  responses={
352
  200: {"description": "DOCX file streamed directly"},
 
347
  search_time=time.time()-start_time
348
  )
349
 
350
+ @app.post("/find/tdoc/download", tags=["Document Retrieval"],
351
+ summary="Download an ETSI TDoc (CR, contribution) as DOCX",
352
+ responses={
353
+ 200: {"description": "DOCX file streamed directly"},
354
+ 404: {"description": "TDoc not found"},
355
+ })
356
+ def find_tdoc_download(request: DocRequest):
357
+ document = request.doc_id
358
+ url = etsi_doc_finder.search_document(document)
359
+
360
+ if "not found" in url.lower():
361
+ raise HTTPException(status_code=404, detail=f"TDoc {document} not found")
362
+
363
+ content = etsi_doc_finder.download_document(url)
364
+
365
+ filename = url.split("/")[-1]
366
+ tmp_path = f"/tmp/{filename}"
367
+ with open(tmp_path, "wb") as f:
368
+ f.write(content)
369
+
370
+ return FileResponse(
371
+ tmp_path,
372
+ filename=filename,
373
+ media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
374
+ )
375
+
376
+
377
  @app.post("/find/docx", tags=["Document Retrieval"], summary="Download an ETSI specification as DOCX",
378
  responses={
379
  200: {"description": "DOCX file streamed directly"},
classes.py CHANGED
@@ -5,19 +5,62 @@ from bs4 import BeautifulSoup
5
  import os
6
  import json
7
 
 
 
 
 
 
 
 
8
  class ETSIDocFinder:
 
 
9
  def __init__(self):
10
  self.main_ftp_url = "https://docbox.etsi.org/SET"
11
  req_data = self.connect()
12
  print(req_data['message'])
13
  self.session = req_data['session']
14
-
15
  def connect(self):
16
  session = requests.Session()
17
- req = session.post("https://portal.etsi.org/ETSIPages/LoginEOL.ashx", verify=False, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"}, data=json.dumps({"username": os.environ.get("EOL_USER"), "password": os.environ.get("EOL_PASSWORD")}))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  if req.text == "Failed":
19
  return {"error": True, "session": session, "message": "Login failed ! Check your credentials"}
20
  return {"error": False, "session": session, "message": "Login successful"}
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  def get_workgroup(self, doc: str):
23
  main_tsg = "SET-WG-R" if any(doc.startswith(kw) for kw in ["SETREQ", "SCPREQ"]) else "SET-WG-T" if any(doc.startswith(kw) for kw in ["SETTEC", "SCPTEC"]) else "SET" if any(doc.startswith(kw) for kw in ["SET", "SCP"]) else None
@@ -92,7 +135,7 @@ class ETSISpecFinder:
92
 
93
  def get_docs_from_url(self, url):
94
  try:
95
- response = requests.get(url, verify=False, timeout=15)
96
  soup = BeautifulSoup(response.text, "html.parser")
97
  docs = [item.get_text() for item in soup.find_all("a")][1:]
98
  return docs
@@ -180,7 +223,8 @@ class ETSISpecFinder:
180
  }
181
  try:
182
  resp = requests.get("https://www.etsi.org/", params=params,
183
- headers=self.headers, verify=False, timeout=15)
 
184
  data = resp.json()
185
  if data and isinstance(data, list):
186
  return str(data[0]["wki_id"])
@@ -192,6 +236,7 @@ class ETSISpecFinder:
192
  """Create a requests.Session authenticated to the ETSI EOL portal."""
193
  session = requests.Session()
194
  session.headers.update({"User-Agent": self.headers["User-Agent"]})
 
195
 
196
  login_redir_url = (
197
  f"https://portal.etsi.org/LoginRedirection.aspx"
 
5
  import os
6
  import json
7
 
8
+ def _get_proxies() -> dict:
9
+ """Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
10
+ proxy = os.environ.get("http_proxy") or os.environ.get("HTTP_PROXY") or ""
11
+ if not proxy:
12
+ return {}
13
+ return {"http": proxy, "https": proxy}
14
+
15
  class ETSIDocFinder:
16
+ HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"}
17
+
18
  def __init__(self):
19
  self.main_ftp_url = "https://docbox.etsi.org/SET"
20
  req_data = self.connect()
21
  print(req_data['message'])
22
  self.session = req_data['session']
23
+
24
  def connect(self):
25
  session = requests.Session()
26
+ session.headers.update(self.HEADERS)
27
+ session.proxies.update(_get_proxies())
28
+
29
+ # Seed DNN session cookies — docbox requires the portal session to be
30
+ # initialised with domain=docbox.etsi.org so the .DOTNETNUKE cookie
31
+ # is scoped to .etsi.org and accepted by docbox.etsi.org as well.
32
+ login_redir_url = (
33
+ "https://portal.etsi.org/LoginRedirection.aspx"
34
+ "?domain=docbox.etsi.org&ReturnUrl=/"
35
+ )
36
+ session.get(login_redir_url, verify=False, timeout=15)
37
+
38
+ req = session.post(
39
+ "https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
40
+ data=json.dumps({"username": os.environ.get("EOL_USER"),
41
+ "password": os.environ.get("EOL_PASSWORD")}),
42
+ headers={"Content-Type": "application/json; charset=UTF-8",
43
+ "Referer": login_redir_url},
44
+ verify=False,
45
+ allow_redirects=False,
46
+ timeout=15,
47
+ )
48
  if req.text == "Failed":
49
  return {"error": True, "session": session, "message": "Login failed ! Check your credentials"}
50
  return {"error": False, "session": session, "message": "Login successful"}
51
+
52
+ def download_document(self, url: str) -> bytes:
53
+ """Download a docbox file using the authenticated session.
54
+
55
+ If the session has expired the portal redirects to LoginRedirection —
56
+ we detect this and re-authenticate before retrying.
57
+ """
58
+ resp = self.session.get(url, verify=False, timeout=30, allow_redirects=True)
59
+ # Detect auth redirect (portal login page returned instead of file)
60
+ if resp.url and "LoginRedirection" in resp.url:
61
+ self.connect()
62
+ resp = self.session.get(url, verify=False, timeout=30, allow_redirects=True)
63
+ return resp.content
64
 
65
  def get_workgroup(self, doc: str):
66
  main_tsg = "SET-WG-R" if any(doc.startswith(kw) for kw in ["SETREQ", "SCPREQ"]) else "SET-WG-T" if any(doc.startswith(kw) for kw in ["SETTEC", "SCPTEC"]) else "SET" if any(doc.startswith(kw) for kw in ["SET", "SCP"]) else None
 
135
 
136
  def get_docs_from_url(self, url):
137
  try:
138
+ response = requests.get(url, verify=False, timeout=15, proxies=_get_proxies())
139
  soup = BeautifulSoup(response.text, "html.parser")
140
  docs = [item.get_text() for item in soup.find_all("a")][1:]
141
  return docs
 
223
  }
224
  try:
225
  resp = requests.get("https://www.etsi.org/", params=params,
226
+ headers=self.headers, verify=False, timeout=15,
227
+ proxies=_get_proxies())
228
  data = resp.json()
229
  if data and isinstance(data, list):
230
  return str(data[0]["wki_id"])
 
236
  """Create a requests.Session authenticated to the ETSI EOL portal."""
237
  session = requests.Session()
238
  session.headers.update({"User-Agent": self.headers["User-Agent"]})
239
+ session.proxies.update(_get_proxies())
240
 
241
  login_redir_url = (
242
  f"https://portal.etsi.org/LoginRedirection.aspx"