Voxxium commited on
Commit
2757a58
·
verified ·
1 Parent(s): 4d946b0

Create pcn.py

Browse files
Files changed (1) hide show
  1. pcn.py +917 -0
pcn.py ADDED
@@ -0,0 +1,917 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+
3
+ pcn.py — Module ENT Paris Classe Numérique (sans CLI, pour import API)
4
+
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import hashlib
10
+
11
+ import html as html_mod
12
+
13
+ import json
14
+
15
+ import logging
16
+
17
+ import mimetypes
18
+
19
+ import os
20
+
21
+ import re
22
+
23
+ import sqlite3
24
+
25
+ import time
26
+
27
+ import random
28
+
29
+ from collections import defaultdict
30
+
31
+ from dataclasses import dataclass, field
32
+
33
+ from datetime import datetime, timezone, timedelta
34
+
35
+ from html.parser import HTMLParser
36
+
37
+ from pathlib import Path
38
+
39
+ from typing import Any, Optional
40
+
41
+ from urllib.parse import urlparse, unquote
42
+
43
+ import requests
44
+
45
+ try:
46
+
47
+ import cloudscraper
48
+
49
+ except ImportError:
50
+
51
+ cloudscraper = None
52
+
53
+ _log = logging.getLogger("pcn")
54
+
55
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
56
+
57
+ # Config
58
+
59
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
60
+
61
+
62
+
63
@dataclass
class Config:
    """Runtime configuration for the PCN (Paris Classe Numérique) ENT client.

    Defaults target the public PCN instance; ``login``/``password`` must be
    supplied by the caller.
    """

    # FIX: the three string defaults below used typographic quotes (“…”),
    # which is a Python syntax error; replaced with ASCII quotes.
    base_url: str = "https://ent.parisclassenumerique.fr"  # ENT root, no trailing slash
    login: str = ""
    password: str = ""
    hours_back: int = 24              # look-back window for notifications/messages
    fetch_body: bool = True           # fetch full message bodies
    fetch_attachments: bool = False   # download attachments into attachments_dir
    attachments_dir: Path = field(default_factory=lambda: Path("/tmp/pcn_pj"))
    max_notif_pages: int = 50         # hard cap on notification pagination
    max_msg_pages: int = 30           # hard cap on inbox pagination
    msg_page_size: int = 50
    db_path: Path = field(default_factory=lambda: Path("/tmp/pcn_cache.db"))
    dry_run: bool = False             # when True, downloads are skipped
    notif_types: list[str] = field(default_factory=lambda: [
        "MESSAGERIE", "BLOG", "ACTUALITES", "EXERCIZER",
        "COMMUNITIES", "WIKI", "SCRAPBOOK", "TIMELINEGENERATOR",
    ])
98
+
99
+
100
+
101
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
102
+
103
+ # Data Models
104
+
105
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
106
+
107
+
108
+
109
@dataclass
class Attachment:
    """A file referenced by (or attached to) an ENT message.

    FIX: the ``content_type`` and ``source`` defaults used typographic
    quotes (“”), a syntax error; replaced with ASCII quotes.
    """

    url: str
    filename: str
    size_bytes: int = 0
    content_type: str = ""            # MIME type reported by the server
    local_path: Optional[str] = None  # filesystem path once downloaded
    downloaded: bool = False
    source: str = ""                  # how the file was obtained (cache, download, dry-run, error, …)
    sha256: Optional[str] = None      # checksum of the downloaded content
128
+
129
+
130
+
131
@dataclass
class Message:
    """An inbox message, optionally enriched with body text and attachments.

    FIX: the ``body`` default used typographic quotes (“”), a syntax error;
    replaced with ASCII quotes.
    """

    id: str
    date: str
    sender: str
    role: str
    subject: str
    body: str = ""                    # plain-text body (HTML stripped)
    has_attachments: bool = False
    attachments: list[Attachment] = field(default_factory=list)
150
+
151
+
152
+
153
@dataclass
class Notification:
    """A timeline notification entry.

    FIX: the ``preview`` default used typographic quotes (“”), a syntax
    error; replaced with ASCII quotes.
    """

    date: str
    type: str
    sender: str
    subject: str
    preview: str = ""  # short plain-text excerpt of the notification message
166
+
167
+
168
+
169
@dataclass
class Report:
    """Aggregated result of one collection run: notifications, messages, stats."""

    generated_at: str
    user: str
    hours_back: int  # look-back window the run used, in hours
    notifications: list[Notification] = field(default_factory=list)
    messages: list[Message] = field(default_factory=list)
    stats: dict = field(default_factory=dict)  # counters collected by the client
184
+
185
+
186
+
187
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
188
+
189
+ # Utilities
190
+
191
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
192
+
193
+
194
+
195
+ def _pause(lo=0.5, hi=1.5):
196
+
197
+ time.sleep(random.uniform(lo, hi))
198
+
199
+
200
+
201
# Fallback extension per MIME type, consulted before mimetypes.guess_extension
# (see _resolve_filename).
_MIME_EXT = {
    "image/jpeg": ".jpg", "image/png": ".png", "image/gif": ".gif",
    "application/pdf": ".pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
    "application/msword": ".doc", "application/zip": ".zip",
    "text/plain": ".txt", "audio/mpeg": ".mp3", "video/mp4": ".mp4",
}
218
+
219
+ _FILE_EXTS = frozenset(
220
+
221
+ “.pdf .doc .docx .xls .xlsx .ppt .pptx .odt .ods .odp .rtf .txt .csv ”
222
+
223
+ “.jpg .jpeg .png .gif .bmp .svg .webp .mp3 .mp4 .avi .mkv .mov .wav ”
224
+
225
+ ".zip .rar .7z .html .epub".split()
226
+
227
+ )
228
+
229
# URL path fragments that identify ENT-hosted files even when the URL
# carries no file extension (see _ResourceExtractor._looks_like_file).
_ENT_FILE_PATTERNS = (
    "/workspace/document/", "/workspace/pub/document/", "/workspace/pub/",
    "/conversation/api/messages/", "/infra/file/", "/blog/pub/",
)
236
+
237
+
238
+
239
+ def _safe_name(name: str, maxlen: int = 200) -> str:
240
+
241
+ name = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "_", name)
242
+
243
+ return (name.strip(". ") or "fichier")[:maxlen]
244
+
245
+
246
+
247
+ def _resolve_filename(resp: requests.Response, hint: str) -> str:
248
+
249
+ cd = resp.headers.get("Content-Disposition", "")
250
+
251
+ m = re.search(r"filename\*\s*=\s*(?:UTF-8|utf-8)''([^;\s]+)", cd, re.I)
252
+
253
+ if m:
254
+
255
+ return unquote(m.group(1))
256
+
257
+ m = re.search(r'filename="([^"]+)"', cd, re.I)
258
+
259
+ if m:
260
+
261
+ return m.group(1).strip()
262
+
263
+ name = hint or “fichier”
264
+
265
+ if not Path(name).suffix:
266
+
267
+ ct = resp.headers.get("Content-Type", "").split(";")[0].strip().lower()
268
+
269
+ ext = _MIME_EXT.get(ct, "") or (mimetypes.guess_extension(ct) or "")
270
+
271
+ if ext:
272
+
273
+ name += ext
274
+
275
+ return name
276
+
277
+
278
+
279
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
280
+
281
+ # HTML parsers
282
+
283
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
284
+
285
+
286
+
287
+ class _TextExtractor(HTMLParser):
288
+
289
+ _BLOCK = frozenset("p div br h1 h2 h3 h4 h5 h6 li tr blockquote pre hr".split())
290
+
291
+ _SKIP = frozenset("script style head".split())
292
+
293
+ def __init__(self):
294
+
295
+ super().__init__()
296
+
297
+ self._buf, self._skip = [], 0
298
+
299
+ def handle_starttag(self, tag, _):
300
+
301
+ t = tag.lower()
302
+
303
+ if t in self._SKIP: self._skip += 1
304
+
305
+ elif t in self._BLOCK: self._buf.append("\n")
306
+
307
+ def handle_endtag(self, tag):
308
+
309
+ t = tag.lower()
310
+
311
+ if t in self._SKIP: self._skip = max(0, self._skip - 1)
312
+
313
+ elif t in self._BLOCK: self._buf.append("\n")
314
+
315
+ def handle_data(self, data):
316
+
317
+ if not self._skip: self._buf.append(data)
318
+
319
+ def handle_entityref(self, name):
320
+
321
+ self._buf.append(html_mod.unescape(f"&{name};"))
322
+
323
+ def handle_charref(self, name):
324
+
325
+ self._buf.append(html_mod.unescape(f"&#{name};"))
326
+
327
+ def text(self):
328
+
329
+ t = "".join(self._buf)
330
+
331
+ t = re.sub(r"[ \t]+", " ", t)
332
+
333
+ t = re.sub(r"\n{3,}", "\n\n", t)
334
+
335
+ return t.strip()
336
+
337
+
338
+
339
def html_to_text(raw: str) -> str:
    """Convert an HTML fragment to readable plain text.

    Falls back to a crude regex-based tag strip if the parser raises on
    malformed input. Empty/None input yields "".
    """
    if not raw:
        return ""  # FIX: source used typographic quotes (“”) — syntax error
    parser = _TextExtractor()
    try:
        parser.feed(raw)
        return parser.text()
    except Exception:
        # Degraded path: turn structural tags into newlines, drop the rest.
        t = re.sub(r"<br\s*/?>", "\n", raw, flags=re.I)
        t = re.sub(r"</(?:p|div|h\d|li|tr)>", "\n", t, flags=re.I)
        return re.sub(r"<[^>]+>", "", t).strip()
360
+
361
+
362
+
363
+ class _ResourceExtractor(HTMLParser):
364
+
365
+ _URL_ATTRS = frozenset("href src data-src data-document-href data-download-url poster data-uri data-href".split())
366
+
367
+ def __init__(self, base: str):
368
+
369
+ super().__init__()
370
+
371
+ self.base = base
372
+
373
+ self._host = urlparse(base).netloc
374
+
375
+ self.found: list[dict] = []
376
+
377
+ self._seen: set[str] = set()
378
+
379
+ def _norm(self, url):
380
+
381
+ url = url.strip()
382
+
383
+ if url.startswith("//"): return "https:" + url
384
+
385
+ if url.startswith("/"): return self.base + url
386
+
387
+ return url
388
+
389
+ def _same_domain(self, url):
390
+
391
+ h = urlparse(url).netloc
392
+
393
+ return not h or h == self._host
394
+
395
+ def _looks_like_file(self, url):
396
+
397
+ path = urlparse(url).path.lower()
398
+
399
+ if any(p in path for p in _ENT_FILE_PATTERNS): return True
400
+
401
+ _, ext = os.path.splitext(path)
402
+
403
+ return ext in _FILE_EXTS
404
+
405
+ def _add(self, url, filename, source):
406
+
407
+ url = self._norm(url)
408
+
409
+ if url in self._seen or not self._same_domain(url) or not self._looks_like_file(url):
410
+
411
+ return
412
+
413
+ self._seen.add(url)
414
+
415
+ self.found.append({"url": url, "filename": filename or "fichier", "source": source})
416
+
417
+ def _best_name(self, attrs, url):
418
+
419
+ for a in ("data-filename", "title", "alt", "download"):
420
+
421
+ v = attrs.get(a)
422
+
423
+ if v and isinstance(v, str) and v.strip(): return v.strip()
424
+
425
+ return unquote(urlparse(url).path.rstrip("/").split("/")[-1]) or “fichier”
426
+
427
+ def handle_starttag(self, tag, attrs):
428
+
429
+ ad = dict(attrs)
430
+
431
+ tl = tag.lower()
432
+
433
+ did = (ad.get("data-document-id") or "").strip()
434
+
435
+ if did:
436
+
437
+ url = f"{self.base}/workspace/document/{did}"
438
+
439
+ self._add(url, self._best_name(ad, url), f"data-document-id:{tl}")
440
+
441
+ for attr in self._URL_ATTRS:
442
+
443
+ val = ad.get(attr)
444
+
445
+ if not val or not isinstance(val, str): continue
446
+
447
+ val = val.strip()
448
+
449
+ if val.startswith(("data:", "javascript:", "mailto:", "#")): continue
450
+
451
+ self._add(val, self._best_name(ad, val), f"{attr}:{tl}")
452
+
453
+ if tl == "object":
454
+
455
+ val = ad.get("data")
456
+
457
+ if val and isinstance(val, str) and not val.strip().startswith(("data:", "javascript:")):
458
+
459
+ self._add(val.strip(), self._best_name(ad, val.strip()), f"data:{tl}")
460
+
461
+ style = ad.get("style") or “”
462
+
463
+ if style and isinstance(style, str):
464
+
465
+ for m in re.finditer(r"url\(['\"]?([^'\")\s]+)['\"]?\)", style):
466
+
467
+ self._add(m.group(1), "style_resource", f"style:{tl}")
468
+
469
+
470
+
471
def extract_resources(html_str: str, base: str) -> list[dict]:
    """Extract downloadable resource descriptors from an HTML fragment.

    Combines the structured parser with a regex sweep for workspace document
    URLs the parser may have missed; results are de-duplicated by URL.
    """
    if not html_str:
        return []
    resources, seen = [], set()
    ex = _ResourceExtractor(base)
    try:
        ex.feed(html_str)
    except Exception:
        pass  # best-effort: fall through to the regex sweep below
    for r in ex.found:
        if r["url"] not in seen:
            seen.add(r["url"])
            resources.append(r)  # FIX: source had `resources.append®` — mangled `(r)` call
    for m in re.finditer(r"(/workspace/(?:pub/)?document/[a-f0-9-]+(?:/[^\s\"'<>]*)?)", html_str):
        url = base + m.group(1)
        if url not in seen:
            seen.add(url)
            fn = unquote(urlparse(url).path.rstrip("/").split("/")[-1])
            resources.append({"url": url, "filename": fn or "workspace_doc", "source": "regex"})
    return resources
508
+
509
+
510
+
511
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
512
+
513
+ # Cache SQLite
514
+
515
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
516
+
517
+
518
+
519
class Cache:
    """Tiny SQLite-backed cache of downloaded files and processed messages."""

    def __init__(self, path: Path):
        self._path = path
        self._conn: Optional[sqlite3.Connection] = None

    def _db(self):
        """Open the connection lazily and create the schema on first use."""
        if self._conn is None:
            conn = sqlite3.connect(str(self._path))
            conn.execute("PRAGMA journal_mode=WAL")
            conn.executescript("""
CREATE TABLE IF NOT EXISTS downloads (url TEXT PRIMARY KEY, filename TEXT, local_path TEXT, sha256 TEXT, size_bytes INTEGER, ts TEXT);
CREATE TABLE IF NOT EXISTS messages (id TEXT PRIMARY KEY, ts TEXT, subject TEXT, sender TEXT);
""")
            self._conn = conn
        return self._conn

    def already(self, url):
        """Return the cached local path for *url* if that file still exists, else None."""
        row = self._db().execute("SELECT local_path FROM downloads WHERE url=?", (url,)).fetchone()
        if row and row[0] and Path(row[0]).exists():
            return row[0]
        return None

    def save(self, url, fn, lp, h, sz):
        """Record a completed download (upsert by URL)."""
        db = self._db()
        now = datetime.now(timezone.utc).isoformat()
        db.execute("INSERT OR REPLACE INTO downloads VALUES (?,?,?,?,?,?)",
                   (url, fn, lp, h, sz, now))
        db.commit()

    def mark_msg(self, mid, subj, sender):
        """Record that a message has been processed (upsert by id)."""
        db = self._db()
        db.execute("INSERT OR REPLACE INTO messages VALUES (?,?,?,?)",
                   (mid, datetime.now(timezone.utc).isoformat(), subj, sender))
        db.commit()

    def close(self):
        if self._conn is not None:
            self._conn.close()
            self._conn = None
570
+
571
+
572
+
573
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
574
+
575
+ # Smart Session
576
+
577
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
578
+
579
# Substrings that identify a Cloudflare challenge page (checked on 403/503
# responses by SmartSession._cf_blocked).
_CF_MARKERS = ("cf-browser-verification", "challenge-platform", "cf-challenge", "Just a moment")
# Baseline browser-like headers applied to every session.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
    "Accept-Language": "fr-FR,fr;q=0.9", "DNT": "1",
}
588
+
589
+
590
+
591
class SmartSession:
    """requests.Session wrapper with retries, backoff and a one-time
    cloudscraper upgrade when a Cloudflare challenge is detected."""

    MAX_RETRIES = 3
    BACKOFF = 2.0  # exponential backoff base, in seconds

    def __init__(self):
        self._s = requests.Session()
        self._s.headers.update(_HEADERS)
        self._upgraded = False  # True once the session was swapped for cloudscraper

    @property
    def is_cloudscraper(self): return self._upgraded

    @property
    def cookies(self): return self._s.cookies

    @property
    def headers(self): return self._s.headers

    def _cf_blocked(self, r):
        # Only 403/503 responses can be challenge pages; look for Cloudflare
        # markers in the first 4 kB of the body.
        if self._upgraded or r.status_code not in (403, 503): return False
        return any(m in r.text[:4000] for m in _CF_MARKERS)

    def _upgrade(self):
        # Swap the plain session for a cloudscraper one, carrying cookies over.
        # No-op if already upgraded or cloudscraper is not installed.
        if self._upgraded: return
        if cloudscraper is None: return
        _log.warning("Cloudflare detected → cloudscraper")
        old = dict(self._s.cookies)
        self._s = cloudscraper.create_scraper(browser={"browser": "firefox", "platform": "windows", "mobile": False})
        self._s.headers.update(_HEADERS)
        self._s.cookies.update(old)
        self._upgraded = True

    def _do(self, method, url, **kw):
        # Issue the request with up to MAX_RETRIES attempts:
        # - upgrade to cloudscraper once if a CF challenge is detected,
        # - honour Retry-After on 429,
        # - exponential backoff on 5xx and on connection/timeout errors.
        kw.setdefault("timeout", 30)
        last_exc = None
        for attempt in range(self.MAX_RETRIES):
            try:
                r = getattr(self._s, method)(url, **kw)
                if self._cf_blocked(r): self._upgrade(); r = getattr(self._s, method)(url, **kw)
                if r.status_code == 429:
                    time.sleep(float(r.headers.get("Retry-After", 10))); continue
                if r.status_code >= 500 and attempt < self.MAX_RETRIES - 1:
                    time.sleep(self.BACKOFF ** attempt); continue
                return r
            except (requests.ConnectionError, requests.Timeout) as exc:
                last_exc = exc
                if attempt < self.MAX_RETRIES - 1: time.sleep(self.BACKOFF ** (attempt + 1))
        if last_exc: raise last_exc
        # Only reachable after repeated 429s, in which case r is bound.
        return r

    def get(self, url, **kw): return self._do("get", url, **kw)

    def post(self, url, **kw): return self._do("post", url, **kw)
678
+
679
+
680
+
681
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
682
+
683
+ # ENT Client
684
+
685
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
686
+
687
+
688
+
689
+ class ENTClient:
690
+
691
+ def __init__(self, cfg: Config):
692
+
693
+ self.cfg = cfg
694
+
695
+ self.s = SmartSession()
696
+
697
+ self.cache = Cache(cfg.db_path)
698
+
699
+ self.user: dict = {}
700
+
701
+ self.stats: dict[str, int] = defaultdict(int)
702
+
703
+ def _xhr(self, ref=None):
704
+
705
+ return {
706
+
707
+ "X-XSRF-TOKEN": self.s.cookies.get("XSRF-TOKEN", ""),
708
+
709
+ "Accept": "application/json, text/plain, */*",
710
+
711
+ "Referer": ref or f"{self.cfg.base_url}/conversation/conversation",
712
+
713
+ }
714
+
715
+ def _api(self, path, params=None, ref=None):
716
+
717
+ r = self.s.get(f"{self.cfg.base_url}{path}", params=params, headers=self._xhr(ref), timeout=20)
718
+
719
+ self.stats["api"] += 1
720
+
721
+ if r.status_code != 200: return None
722
+
723
+ try: return r.json()
724
+
725
+ except Exception: return None
726
+
727
    def login(self):
        """Authenticate against the ENT and populate self.user.

        Flow: GET the login page (sets the XSRF cookie), POST the credentials,
        then verify the session. Raises Exception on failure.
        """
        _log.info("Connecting to PCN…")
        self.s.get(f"{self.cfg.base_url}/auth/login", timeout=30)
        _pause(1.0, 2.0)
        xsrf = self.s.cookies.get("XSRF-TOKEN", "")
        self.s.post(f"{self.cfg.base_url}/auth/login",
                    data={"email": self.cfg.login, "password": self.cfg.password},
                    headers={"X-XSRF-TOKEN": xsrf, "Content-Type": "application/x-www-form-urlencoded",
                             "Origin": self.cfg.base_url},
                    timeout=30, allow_redirects=True)
        _pause(1.5, 2.5)
        # The "authenticated" cookie is the cheap success signal; fall back to
        # probing the userinfo endpoint before declaring failure.
        if self.s.cookies.get("authenticated") != "true":
            r = self.s.get(f"{self.cfg.base_url}/auth/oauth2/userinfo", headers=self._xhr(), timeout=15)
            if r.status_code != 200:
                raise Exception("Login failed")
        _pause()
        self.user = self._api("/auth/oauth2/userinfo") or {}
        _log.info("Logged in as: %s %s", self.user.get("firstName", "?"), self.user.get("lastName", "?"))
762
+
763
    def fetch_notifications(self):
        """Collect timeline notifications newer than the configured look-back.

        Pages through /timeline/lastNotifications until an entry older than
        the cutoff appears, a short page signals the end, or max_notif_pages
        is reached. Returns a list of Notification.
        """
        cutoff = datetime.now(timezone.utc) - timedelta(hours=self.cfg.hours_back)
        out, page = [], 0
        while page < self.cfg.max_notif_pages:
            data = self._api("/timeline/lastNotifications",
                             params=[("type", t) for t in self.cfg.notif_types] + [("page", page)])
            if not data: break
            items = data.get("results", [])
            stop = False
            for n in items:
                # Mongo-style {"$date": "..."} timestamps; skip malformed entries.
                try: dt = datetime.fromisoformat(n["date"]["$date"].replace("Z", "+00:00"))
                except Exception: continue
                if dt < cutoff: stop = True; break
                p = n.get("params", {})
                out.append(Notification(
                    date=dt.strftime("%Y-%m-%d %H:%M"), type=n.get("type", ""),
                    sender=p.get("username", ""),
                    subject=p.get("subject") or p.get("postTitle") or p.get("resourceName", ""),
                    preview=re.sub(r"\s+", " ", html_to_text(n.get("message", "")))[:300],
                ))
            # NOTE(review): 25 appears to be the server's page size — confirm.
            if stop or len(items) < 25: break
            page += 1; _pause(0.3, 0.8)
        self.stats["notifs"] = len(out)
        return out
810
+
811
    def fetch_messages(self):
        """List unread inbox message summaries newer than the look-back window.

        Returns the raw API dicts, each augmented with a formatted "_dt"
        timestamp string.
        """
        cutoff = datetime.now(timezone.utc) - timedelta(hours=self.cfg.hours_back)
        out, page = [], 0
        while page < self.cfg.max_msg_pages:
            items = self._api("/conversation/api/folders/inbox/messages",
                              params={"page_size": self.cfg.msg_page_size, "page": page, "unread": "true"})
            if not items: break
            stop = False
            for m in items:
                # API dates are epoch milliseconds; skip malformed entries.
                try: dt = datetime.fromtimestamp(m["date"] / 1000, tz=timezone.utc)
                except Exception: continue
                if dt < cutoff: stop = True; break
                m["_dt"] = dt.strftime("%Y-%m-%d %H:%M")
                out.append(m)
            if stop or len(items) < self.cfg.msg_page_size: break
            page += 1; _pause(0.3, 0.8)
        self.stats["msgs"] = len(out)
        return out
846
+
847
+ def _detail(self, mid):
848
+
849
+ data = self._api(f"/conversation/api/messages/{mid}")
850
+
851
+ if not data: return {"body_text": "", "api_att": [], "html_res": []}
852
+
853
+ body_html = data.get("body", "")
854
+
855
+ return {"body_text": html_to_text(body_html), "api_att": data.get("attachments", []),
856
+
857
+ "html_res": extract_resources(body_html, self.cfg.base_url)}
858
+
859
    def _download(self, url, dest_dir, hint):
        """Download *url* into *dest_dir* and return an Attachment record.

        Honours the SQLite cache and dry-run mode; failures are reported as
        Attachment(downloaded=False, source="error") rather than raised.
        """
        cached = self.cache.already(url)
        if cached: self.stats["cache"] += 1; return Attachment(url=url, filename=hint, local_path=cached, downloaded=True, source="cache")
        if self.cfg.dry_run: return Attachment(url=url, filename=hint, downloaded=False, source="dry-run")
        dest_dir.mkdir(parents=True, exist_ok=True)
        try:
            r = self.s.get(url, headers=self._xhr(), timeout=120, stream=True)
            if r.status_code != 200: self.stats["dl_err"] += 1; return Attachment(url=url, filename=hint, downloaded=False, source="error")
            # Server-provided name wins; de-duplicate with a _N suffix if taken.
            real = _resolve_filename(r, hint); safe = _safe_name(real); dest = dest_dir / safe
            if dest.exists():
                stem, suf = dest.stem, dest.suffix; i = 1
                while dest.exists(): dest = dest_dir / f"{stem}_{i}{suf}"; i += 1
            # Stream to disk while hashing so large files never sit in memory.
            sha = hashlib.sha256(); size = 0
            with open(dest, "wb") as fp:
                for chunk in r.iter_content(65_536): fp.write(chunk); sha.update(chunk); size += len(chunk)
            h = sha.hexdigest()
            self.cache.save(url, safe, str(dest), h, size); self.stats["dl"] += 1; self.stats["dl_bytes"] += size
            return Attachment(url=url, filename=safe, size_bytes=size, content_type=r.headers.get("Content-Type", "").split(";")[0].strip(),
                              local_path=str(dest), downloaded=True, source="download", sha256=h)
        except Exception:
            self.stats["dl_err"] += 1; return Attachment(url=url, filename=hint, downloaded=False, source="error")
900
+
901
+ def _attachments(self, mid, detail):
902
+
903
+ out, seen = [], set()
904
+
905
+ d = self.cfg.attachments_dir / mid
906
+
907
+ for a in detail.get("api_att", []):
908
+
909
+ fid = a.get("id", ""); fn = a.get("filename", f"file_{fid}")
910
+
911
+ url = f"{self.cfg.base_url}/conversation/api/messages/{mid}/attachments/{fid}"
912
+
913
+ if url in seen: continue; seen.add(url); _pause(0.2, 0.6)
914
+
915
+ att = self._download(url, d, fn); out.append(att)
916
+
917
+ for res in detail.get("htm