Almaatla committed on
Commit
53bb3f7
·
verified ·
1 Parent(s): 749121c

Upload classes.py

Browse files
Files changed (1) hide show
  1. classes.py +869 -867
classes.py CHANGED
@@ -1,868 +1,870 @@
1
- import shutil
2
- import bm25s
3
- from bm25s.hf import BM25HF
4
- import threading, re, time, concurrent.futures, requests, os, hashlib, traceback, io, zipfile, subprocess, tempfile, json, fitz
5
- import pandas as pd
6
- import numpy as np
7
-
8
- from bs4 import BeautifulSoup
9
- from datasets import load_dataset, Dataset
10
- from datasets.data_files import EmptyDatasetError
11
- from dotenv import load_dotenv
12
-
13
- load_dotenv()
14
-
15
class TDocIndexer:
    """Crawls the 3GPP FTP mirror and maintains a TDoc-id -> ZIP-URL index
    persisted in the OrganizedProgrammers/3GPPTDocLocation dataset."""

    def __init__(self, max_workers=33):
        # Number of rows loaded from the remote dataset (set by load_indexer).
        self.indexer_length = 0
        # Hugging Face dataset used as the persistent index store.
        self.dataset = "OrganizedProgrammers/3GPPTDocLocation"

        # Mapping doc_id -> zip URL, loaded from the dataset.
        self.indexer = self.load_indexer()
        self.main_ftp_url = "https://3gpp.org/ftp"
        # TDoc file names start like S1-123456 / C3-... / RP-... (case-insensitive).
        self.valid_doc_pattern = re.compile(r'^(S[1-6P]|C[1-6P]|R[1-6P])-\d+', flags=re.IGNORECASE)
        self.max_workers = max_workers

        # One lock serializes console output, the other guards the index dict.
        self.print_lock = threading.Lock()
        self.indexer_lock = threading.Lock()

        # Progress counters shared across worker threads.
        self.total_indexed = 0
        self.processed_count = 0
        self.total_count = 0
32
    def load_indexer(self):
        """Load the doc_id -> URL mapping from the Hugging Face dataset.

        Side effect: resets and recounts self.indexer_length.
        """
        self.indexer_length = 0
        all_docs = {}
        tdoc_locations = load_dataset(self.dataset)
        tdoc_locations = tdoc_locations["train"].to_list()
        for doc in tdoc_locations:
            self.indexer_length += 1
            all_docs[doc["doc_id"]] = doc["url"]

        return all_docs
42
-
43
    def save_indexer(self):
        """Save the updated index"""
        # Flatten the dict back into rows, push to the Hub, then reload so
        # indexer_length stays in sync with the stored dataset.
        data = []
        for doc_id, url in self.indexer.items():
            data.append({"doc_id": doc_id, "url": url})

        dataset = Dataset.from_list(data)
        dataset.push_to_hub(self.dataset, token=os.environ["HF"])
        self.indexer = self.load_indexer()
52
-
53
    def get_docs_from_url(self, url):
        """Return the link texts of the FTP directory listing at `url`;
        empty list on any network or parse error."""
        try:
            # NOTE(review): verify=False disables TLS verification — confirm
            # this is intentional for the 3gpp.org mirror.
            response = requests.get(url, verify=False, timeout=10)
            soup = BeautifulSoup(response.text, "html.parser")
            return [item.get_text() for item in soup.select("tr td a")]
        except Exception as e:
            with self.print_lock:
                print(f"Erreur lors de l'accès à {url}: {e}")
            return []
62
-
63
- def is_valid_document_pattern(self, filename):
64
- return bool(self.valid_doc_pattern.match(filename))
65
-
66
- def is_zip_file(self, filename):
67
- return filename.lower().endswith('.zip')
68
-
69
- def extract_doc_id(self, filename):
70
- if self.is_valid_document_pattern(filename):
71
- match = self.valid_doc_pattern.match(filename)
72
- if match:
73
- # Retourner le motif complet (comme S1-12345)
74
- full_id = filename.split('.')[0] # Enlever l'extension si présente
75
- return full_id.split('_')[0] # Enlever les suffixes après underscore si présents
76
- return None
77
-
78
    def process_zip_files(self, files_list, base_url, workshop=False):
        """Scan a directory listing and index every valid TDoc ZIP file.

        Returns the number of entries added/updated in this call; also bumps
        the shared self.total_indexed counter.
        """
        indexed_count = 0

        for file in files_list:
            if file in ['./', '../', 'ZIP/', 'zip/']:
                continue

            # Keep only ZIP files whose name matches the TDoc pattern
            # (workshop folders accept any ZIP name).
            if self.is_zip_file(file) and (self.is_valid_document_pattern(file) or workshop):
                file_url = f"{base_url}/{file}"

                # Extract the document id; fall back to the bare stem
                # (workshop files may not match the TDoc pattern).
                doc_id = self.extract_doc_id(file)
                if doc_id is None:
                    doc_id = file.split('.')[0]
                if doc_id:
                    # Skip entries already indexed with the same URL.
                    with self.indexer_lock:
                        if doc_id in self.indexer and self.indexer[doc_id] == file_url:
                            continue

                        # Add or update the index entry.
                        # NOTE(review): this reconstruction keeps the updates
                        # inside indexer_lock — confirm against the original
                        # layout, which the diff rendering flattened.
                        self.indexer[doc_id] = file_url
                        indexed_count += 1
                        self.total_indexed += 1

        return indexed_count
106
-
107
    def process_meeting(self, meeting, wg_url, workshop=False):
        """Process one meeting folder: locate its Docs/TDocs directory and
        index the ZIP files it (and its ./zip subfolder) contains.

        Returns 1 on success, 0 on error or when `meeting` is ./ or ../ .
        """
        try:
            if meeting in ['./', '../']:
                return 0

            meeting_url = f"{wg_url}/{meeting}"

            with self.print_lock:
                print(f"Vérification du meeting: {meeting}")

            # List the meeting folder; the documents subfolder name varies
            # across working groups (Docs / TDocs / TDoc).
            meeting_contents = self.get_docs_from_url(meeting_url)

            key = None
            if "docs" in [x.lower() for x in meeting_contents]:
                key = "docs"
            elif "tdocs" in [x.lower() for x in meeting_contents]:
                key = "tdocs"
            elif "tdoc" in [x.lower() for x in meeting_contents]:
                key = "tdoc"

            if key is not None:
                docs_url = f"{meeting_url}/{key}"

                with self.print_lock:
                    print(f"Vérification des documents présent dans {docs_url}")

                # Fetch the file listing of the documents folder.
                docs_files = self.get_docs_from_url(docs_url)

                # 1. Index ZIP files located directly in the Docs folder.
                docs_indexed_count = self.process_zip_files(docs_files, docs_url, workshop)

                if docs_indexed_count > 0:
                    with self.print_lock:
                        print(f"{docs_indexed_count} fichiers trouvés")

                # 2. Also look into the ./zip subfolder when present.
                if "zip" in [x.lower() for x in docs_files]:
                    zip_url = f"{docs_url}/zip"

                    with self.print_lock:
                        print(f"Vérification du dossier ./zip: {zip_url}")

                    # Fetch the files of the ZIP subfolder.
                    zip_files = self.get_docs_from_url(zip_url)

                    # Index the ZIP files found there.
                    zip_indexed_count = self.process_zip_files(zip_files, zip_url, workshop)

                    if zip_indexed_count > 0:
                        with self.print_lock:
                            print(f"{zip_indexed_count} fichiers trouvés")

            # Update the shared progress counter.
            with self.indexer_lock:
                self.processed_count += 1

            # Report progress (guard against division by zero).
            with self.print_lock:
                progress = (self.processed_count / self.total_count) * 100 if self.total_count > 0 else 0
                print(f"\rProgression: {self.processed_count}/{self.total_count} réunions traitées ({progress:.1f}%)")

            return 1  # meeting processed successfully

        except Exception as e:
            with self.print_lock:
                print(f"\nErreur lors du traitement de la réunion {meeting}: {str(e)}")
            return 0
177
-
178
    def process_workgroup(self, wg, main_url):
        """Generator: process one working group, dispatching its meetings to
        a thread pool and yielding SSE-formatted progress events."""
        if wg in ['./', '../']:
            return

        wg_url = f"{main_url}/{wg}"

        with self.print_lock:
            print(f"Vérification du working group: {wg}")

        # List the meeting folders of this working group.
        meeting_folders = self.get_docs_from_url(wg_url)

        # Grow the shared total before dispatching the workers.
        self.total_count += len([m for m in meeting_folders if m not in ['./', '../']])

        # Process the meetings of this working group in parallel.
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.process_meeting, meeting, wg_url)
                       for meeting in meeting_folders if meeting not in ['./', '../']]

            total = len(futures)
            done_count = 0
            yield f"event: get-maximum\ndata: {total}\n\n"

            for future in concurrent.futures.as_completed(futures):
                done_count += 1
                yield f"event: progress\ndata: {done_count}\n\n"
206
-
207
    def index_all_tdocs(self):
        """Generator: walk the whole 3GPP FTP tree (tsg_sa / tsg_ct /
        tsg_ran), yield SSE events, and return the updated index."""
        print("Démarrage de l'indexation des TDocs 3GPP complète")

        start_time = time.time()
        # NOTE(review): "before" uses self.indexer_length while "after" uses
        # len(self.indexer); the abs() below papers over the mismatch when the
        # index shrinks — confirm the intended semantics.
        docs_count_before = self.indexer_length

        # Main TSG groups; extend this list if more are needed.
        main_groups = ["tsg_sa", "tsg_ct", "tsg_ran"]

        for main_tsg in main_groups:
            print(f"Indexation de {main_tsg.upper()}...")

            main_url = f"{self.main_ftp_url}/{main_tsg}"

            # Fetch the working groups of this TSG.
            workgroups = self.get_docs_from_url(main_url)

            # Working groups are handled sequentially, but the meetings
            # inside each one are processed in parallel.
            for wg in workgroups:
                yield f"event: info\ndata: {main_tsg}-{wg}\n\n"
                for content in self.process_workgroup(wg, main_url):
                    yield content

        docs_count_after = len(self.indexer)
        new_docs_count = abs(docs_count_after - docs_count_before)

        print(f"Indexation terminée en {time.time() - start_time:.2f} secondes")
        print(f"Nouveaux documents ZIP indexés: {new_docs_count}")
        print(f"Total des documents dans l'index: {docs_count_after}")

        return self.indexer
240
-
241
    def index_all_workshops(self):
        """Generator: index the /workshop folder of the FTP mirror, yielding
        SSE progress events, and return the updated index."""
        print("Démarrage de l'indexation des workshops ZIP 3GPP...")
        start_time = time.time()
        docs_count_before = len(self.indexer)

        print("\nIndexation du dossier 'workshop'")
        main_url = f"{self.main_ftp_url}/workshop"

        # List the meeting folders of the workshop directory.
        meeting_folders = self.get_docs_from_url(main_url)

        # Grow the shared total before dispatching the workers.
        self.total_count += len([m for m in meeting_folders if m not in ['./', '../']])

        # Process the meetings in parallel; workshop=True relaxes the
        # TDoc file-name pattern in process_zip_files.
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.process_meeting, meeting, main_url, workshop=True)
                       for meeting in meeting_folders if meeting not in ['./', '../']]
            total = len(futures)
            done_count = 0

            yield f"event: get-maximum\ndata: {total}\n\n"

            for future in concurrent.futures.as_completed(futures):
                done_count += 1
                yield f"event: progress\ndata: {done_count}\n\n"

        docs_count_after = len(self.indexer)
        new_docs_count = docs_count_after - docs_count_before

        print(f"\nIndexation terminée en {time.time() - start_time:.2f} secondes")
        print(f"Nouveaux documents ZIP indexés: {new_docs_count}")
        print(f"Total des documents dans l'index: {docs_count_after}")

        return self.indexer
276
-
277
class Spec3GPPIndexer:
    """Downloads 3GPP specifications, extracts their text via LibreOffice and
    indexes section contents and metadata into Hugging Face datasets."""

    def __init__(self, max_workers=16):
        # Previously indexed section rows and a doc_id -> {content, hash} view.
        self.spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent")["train"].to_list()
        self.documents_by_spec_num = self._make_doc_index(self.spec_contents)
        self.indexed_specifications = {}
        self.specifications_passed = set()
        self.processed_count = 0
        self.total_count = 0

        self.DICT_LOCK = threading.Lock()       # guards counters and metadata dict
        self.DOCUMENT_LOCK = threading.Lock()   # guards the document cache
        self.STOP_EVENT = threading.Event()     # cooperative cancellation flag
        self.max_workers = max_workers
        # Caps the number of concurrent LibreOffice conversions.
        self.LIBREOFFICE_SEMAPHORE = threading.Semaphore(self.max_workers)
291
-
292
- def _make_doc_index(self, specs):
293
- doc_index = {}
294
- for section in specs:
295
- if section["doc_id"] not in doc_index:
296
- doc_index[section["doc_id"]] = {"content": {section["section"]: section["content"]}, "hash": section["hash"]}
297
- else:
298
- doc_index[section["doc_id"]]["content"][section["section"]] = section["content"]
299
- return doc_index
300
-
301
- @staticmethod
302
- def version_to_code(version_str):
303
- chars = "0123456789abcdefghijklmnopqrstuvwxyz"
304
- parts = version_str.split('.')
305
- if len(parts) != 3:
306
- return None
307
- try:
308
- x, y, z = [int(p) for p in parts]
309
- except ValueError:
310
- return None
311
- if x < 36 and y < 36 and z < 36:
312
- return f"{chars[x]}{chars[y]}{chars[z]}"
313
- else:
314
- return f"{str(x).zfill(2)}{str(y).zfill(2)}{str(z).zfill(2)}"
315
-
316
- @staticmethod
317
- def hasher(specification, version_code):
318
- return hashlib.md5(f"{specification}{version_code}".encode()).hexdigest()
319
-
320
- @staticmethod
321
- def get_scope(content):
322
- for title, text in content.items():
323
- if title.lower().endswith("scope"):
324
- return text
325
- return ""
326
-
327
    def get_text(self, specification, version_code):
        """Download the spec's archive ZIP, convert its .doc/.docx members to
        text with LibreOffice and return all text lines.

        Returns [] on stop request, download failure, missing documents or
        any conversion error.
        """
        if self.STOP_EVENT.is_set():
            return []

        doc_id = specification
        series = doc_id.split(".")[0]
        url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"

        try:
            response = requests.get(url, verify=False)
            if response.status_code != 200:
                return []

            zip_bytes = io.BytesIO(response.content)
            with zipfile.ZipFile(zip_bytes) as zip_file:
                # Keep only .doc and .docx members.
                docx_files = [f for f in zip_file.namelist() if f.lower().endswith(('.doc', '.docx'))]
                if not docx_files:
                    return []

                full_text = []

                for doc_file in docx_files:
                    with tempfile.TemporaryDirectory() as tmpdir:
                        extracted_path = os.path.join(tmpdir, os.path.basename(doc_file))
                        with open(extracted_path, 'wb') as f:
                            f.write(zip_file.read(doc_file))

                        # Dedicated temporary LibreOffice profile so parallel
                        # conversions do not share state.
                        profile_dir = tempfile.mkdtemp(prefix="libreoffice_profile_")

                        try:
                            # The semaphore caps concurrent soffice processes.
                            with self.LIBREOFFICE_SEMAPHORE:
                                cmd = [
                                    'soffice',
                                    '--headless',
                                    f'-env:UserInstallation=file://{profile_dir}',
                                    '--convert-to', 'txt:Text',
                                    '--outdir', tmpdir,
                                    extracted_path
                                ]
                                subprocess.run(cmd, check=True, timeout=60*5, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

                            txt_file = os.path.splitext(extracted_path)[0] + '.txt'
                            if os.path.exists(txt_file):
                                with open(txt_file, 'r', encoding='utf-8', errors='ignore') as ftxt:
                                    full_text.extend(ftxt.readlines())
                        finally:
                            # Always drop the throwaway LibreOffice profile.
                            shutil.rmtree(profile_dir, ignore_errors=True)

                return full_text

        except Exception as e:
            print(f"Error getting text for {specification} v{version_code}: {e}")
            return []
382
-
383
    def get_spec_content(self, specification, version_code):
        """Split the converted spec text into {chapter title: body}, using
        numbered-heading lines (e.g. "5.1\\tGeneral") as delimiters.

        Returns {} on stop request or when no text could be retrieved.
        """
        if self.STOP_EVENT.is_set():
            return {}

        text = self.get_text(specification, version_code)
        if not text:
            return {}

        # A heading is a section number, a tab, then a title starting with an
        # uppercase letter or digit and not ending with a period.
        chapters = []
        chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+[^\.]$")
        for i, line in enumerate(text):
            if chapter_regex.fullmatch(line):
                chapters.append((i, line))

        # Each chapter's body runs until the next heading (or end of text).
        document = {}
        for i in range(len(chapters)):
            start_index, chapter_title = chapters[i]
            end_index = chapters[i+1][0] if i+1 < len(chapters) else len(text)
            content_lines = text[start_index + 1:end_index]
            document[chapter_title.replace("\t", " ")] = "\n".join(content_lines)

        return document
405
-
406
    def fetch_spec_table(self):
        """Scrape the 3GPP status-report page and return one dict per
        specification row (first five columns of every HTML table)."""
        response = requests.get(
            'https://www.3gpp.org/dynareport?code=status-report.htm',
            headers={"User-Agent": 'Mozilla/5.0'},
            verify=False
        )
        dfs = pd.read_html(io.StringIO(response.text))
        # Normalize missing cells to None for downstream dict handling.
        for x in range(len(dfs)):
            dfs[x] = dfs[x].replace({np.nan: None})
        columns_needed = [0, 1, 2, 3, 4]
        extracted_dfs = [df.iloc[:, columns_needed] for df in dfs]
        # Header cells contain non-breaking spaces; map them to underscores.
        columns = [x.replace("\xa0", "_") for x in extracted_dfs[0].columns]
        specifications = []
        for df in extracted_dfs:
            for index, row in df.iterrows():
                doc = row.to_list()
                doc_dict = dict(zip(columns, doc))
                specifications.append(doc_dict)
        return specifications
425
-
426
    def process_specification(self, spec):
        """Worker: index one specification row — reuse the cached document
        when its (spec, version) hash matches, otherwise download and parse.

        Always bumps processed_count exactly once, including on error or
        unparsable version.
        """
        if self.STOP_EVENT.is_set():
            return
        try:
            doc_id = str(spec['spec_num'])
            version_code = self.version_to_code(str(spec['vers']))
            if not version_code:
                # Unparsable version: count the row as processed and move on.
                with self.DICT_LOCK:
                    self.processed_count += 1
                return

            document = None
            already_indexed = False
            with self.DOCUMENT_LOCK:
                doc_in_cache = doc_id in self.documents_by_spec_num and \
                    self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version_code)

            # NOTE(review): specifications_passed is read/updated outside any
            # lock here — confirm this is acceptable under the thread pool.
            if doc_in_cache and doc_id not in self.specifications_passed:
                document = self.documents_by_spec_num[doc_id]
                self.specifications_passed.add(doc_id)
                already_indexed = True
            elif doc_id not in self.specifications_passed:
                # Cache miss: download and parse the document.
                doc_content = self.get_spec_content(doc_id, version_code)
                if doc_content:
                    document = {"content": doc_content, "hash": self.hasher(doc_id, version_code)}
                    with self.DOCUMENT_LOCK:
                        self.documents_by_spec_num[doc_id] = document
                    self.specifications_passed.add(doc_id)
                    already_indexed = False

            if document:
                url = f"https://www.3gpp.org/ftp/Specs/archive/{doc_id.split('.')[0]}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"
                metadata = {
                    "id": doc_id,
                    "title": spec.get("title", ""),
                    "type": spec.get("type", ""),
                    "version": str(spec.get("vers", "")),
                    "working_group": spec.get("WG", ""),
                    "url": url,
                    "scope": self.get_scope(document["content"])
                }
                key = f"{doc_id}+-+{spec.get('title', '')}+-+{spec.get('type', '')}+-+{spec.get('vers', '')}+-+{spec.get('WG', '')}"
                with self.DICT_LOCK:
                    self.indexed_specifications[key] = metadata

            with self.DICT_LOCK:
                self.processed_count += 1
                status = "already indexed" if already_indexed else "indexed now"
                print(f"Spec {doc_id} ({spec.get('title', '')}): {status} - Progress {self.processed_count}/{self.total_count}")

        except Exception as e:
            traceback.print_exc()
            print(f"Error processing spec {spec.get('spec_num')} v{spec.get('vers')}: {e}")
            with self.DICT_LOCK:
                self.processed_count += 1
                print(f"Progress: {self.processed_count}/{self.total_count} specs processed")
482
-
483
- def get_document(self, spec_id: str, spec_title: str):
484
- text = [f"{spec_id} - {spec_title}\n"]
485
- for section in self.spec_contents:
486
- if spec_id == section["doc_id"]:
487
- text.extend([f"{section['section']}\n\n{section['content']}"])
488
- return text
489
-
490
    def create_bm25_index(self):
        """Build and push two BM25 indexes: one document per section, then
        one document per whole specification."""
        dataset_metadata = self.indexed_specifications.values()
        unique_specs = set()
        corpus_json = []

        # Per-section corpus.
        # NOTE(review): unique_specs is never populated in this first loop, so
        # the `continue` guard can't fire and duplicate ids are all indexed —
        # confirm whether deduplication was intended here.
        for specification in dataset_metadata:
            if specification['id'] in unique_specs: continue
            for section in self.spec_contents:
                if specification['id'] == section['doc_id']:
                    corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
                        "id": specification['id'],
                        "title": specification['title'],
                        "section_title": section['section'],
                        "version": specification['version'],
                        "type": specification['type'],
                        "working_group": specification['working_group'],
                        "url": specification['url'],
                        "scope": specification['scope']
                    }})

        corpus_text = [doc["text"] for doc in corpus_json]
        corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

        print("Indexing BM25")
        retriever = BM25HF(corpus=corpus_json)
        retriever.index(corpus_tokens)

        retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSections", token=os.environ.get("HF"))

        # Whole-document corpus, one entry per unique spec id.
        unique_specs = set()
        corpus_json = []

        for specification in dataset_metadata:
            if specification['id'] in unique_specs: continue
            text_list = self.get_document(specification['id'], specification['title'])
            text = "\n".join(text_list)
            if len(text_list) == 1: continue   # header only: no content rows
            corpus_json.append({"text": text, "metadata": specification})
            unique_specs.add(specification['id'])

        corpus_text = [doc["text"] for doc in corpus_json]
        corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

        print("Indexing BM25")
        retriever = BM25HF(corpus=corpus_json)
        retriever.index(corpus_tokens)

        retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSingle", token=os.environ.get("HF"))
538
-
539
    def run(self):
        """Generator: fetch the spec table, process every spec on a thread
        pool and yield SSE progress events; stops early on STOP_EVENT."""
        print("Fetching specification tables from 3GPP...")
        yield "event: info\ndata: Indexing 3GPP specs ...\n\n"
        specifications = self.fetch_spec_table()
        self.total_count = len(specifications)
        print(f"Processing {self.total_count} specs with {self.max_workers} threads...")
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.process_specification, spec) for spec in specifications]
            total = len(futures)
            done_count = 0
            yield f"event: get-maximum\ndata: {total}\n\n"

            for future in concurrent.futures.as_completed(futures):
                done_count += 1
                yield f"event: progress\ndata: {done_count}\n\n"
                if self.STOP_EVENT.is_set():
                    break
        print("All specs processed.")
557
-
558
    # Persistence (same behaviour as the original script).
    def save(self):
        """Flatten the indexed documents/metadata, push them to the Hub, then
        reload the local caches from the stored dataset."""
        print("Saving indexed data...")
        flat_metadata = [metadata for metadata in self.indexed_specifications.values()]
        flat_docs = []
        print("Flatting doc contents")
        # One row per (doc, section) pair, carrying the document hash.
        for doc_id, data in self.documents_by_spec_num.items():
            for title, content in data["content"].items():
                flat_docs.append({"hash": data["hash"], "doc_id": doc_id, "section": title, "content": content})
        print("Creating datasets ...")
        push_spec_content = Dataset.from_list(flat_docs)
        push_spec_metadata = Dataset.from_list(flat_metadata)
        # Token handling assumed set in environment
        print("Pushing ...")
        push_spec_content.push_to_hub("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF"])
        push_spec_metadata.push_to_hub("OrganizedProgrammers/3GPPSpecMetadata", token=os.environ["HF"])

        # Reload so in-memory caches mirror what was just stored.
        self.spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent")["train"].to_list()
        self.documents_by_spec_num = self._make_doc_index(self.spec_contents)
        print("Save finished.")
578
-
579
class SpecETSIIndexer:
    """Downloads ETSI TS/TR specification PDFs and indexes their section
    contents and metadata into Hugging Face datasets."""

    def __init__(self, max_workers=16):
        # Shared HTTP session; TLS verification disabled for the ETSI portal.
        self.session = requests.Session()
        self.session.verify = False

        # Previously indexed section rows and a doc_id -> {content, hash} view.
        self.spec_contents = load_dataset("OrganizedProgrammers/ETSISpecContent")["train"].to_list()
        self.documents_by_spec_num = self._make_doc_index(self.spec_contents)
        self.indexed_specifications = {}
        self.specifications_passed = set()
        self.processed_count = 0
        self.total_count = 0

        self.DICT_LOCK = threading.Lock()       # guards counters and metadata dict
        self.DOCUMENT_LOCK = threading.Lock()   # guards the document cache
        self.STOP_EVENT = threading.Event()     # cooperative cancellation flag
        self.max_workers = max_workers

        # Table of TS/TR deliverables fetched from the ETSI portal.
        self.df = self._fetch_spec_table()
597
-
598
- def _make_doc_index(self, specs):
599
- doc_index = {}
600
- for section in specs:
601
- if section["doc_id"] not in doc_index:
602
- doc_index[section["doc_id"]] = {"content": {section["section"]: section["content"]}, "hash": section["hash"]}
603
- else:
604
- doc_index[section["doc_id"]]["content"][section["section"]] = section["content"]
605
- return doc_index
606
-
607
    def _fetch_spec_table(self):
        """Log in to the ETSI portal and build a DataFrame of TS/TR
        deliverables (3GPP transpositions filtered out)."""
        # Portal login and TS/TR CSV retrieval.
        print("Connexion login ETSI...")
        # NOTE(review): credentials are sent as a JSON string through `data=`
        # — confirm the endpoint expects this rather than form fields.
        self.session.post(
            "https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
            verify=False,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."},
            data=json.dumps({"username": os.environ.get("EOL_USER"), "password": os.environ.get("EOL_PASSWORD")}),
        )

        print("Récupération des métadonnées TS/TR …")
        url_ts = "https://www.etsi.org/?option=com_standardssearch&view=data&format=csv&includeScope=1&page=1&search=&title=1&etsiNumber=1&content=0&version=0&onApproval=0&published=1&withdrawn=0&historical=0&isCurrent=1&superseded=0&harmonized=0&keyword=&TB=&stdType=TS&frequency=&mandate=&collection=&sort=1"
        url_tr = url_ts.replace("stdType=TS", "stdType=TR")
        data_ts = self.session.get(url_ts, verify=False).content
        data_tr = self.session.get(url_tr, verify=False).content
        df_ts = pd.read_csv(io.StringIO(data_ts.decode('utf-8')), sep=";", skiprows=1, index_col=False)
        df_tr = pd.read_csv(io.StringIO(data_tr.decode('utf-8')), sep=";", skiprows=1, index_col=False)

        # Split "ETSI TS 123 456 ... Vx.y.z" into deliverable number and version.
        backup_ts = df_ts["ETSI deliverable"]
        backup_tr = df_tr["ETSI deliverable"]
        df_ts["ETSI deliverable"] = df_ts["ETSI deliverable"].str.extract(r"\s*ETSI TS (\d+ \d+(?:-\d+(?:-\d+)?)?)")
        df_tr["ETSI deliverable"] = df_tr["ETSI deliverable"].str.extract(r"\s*ETSI TR (\d+ \d+(?:-\d+(?:-\d+)?)?)")
        version1 = backup_ts.str.extract(r"\s*ETSI TS \d+ \d+(?:-\d+(?:-\d+)?)? V(\d+\.\d+\.\d+)")
        version2 = backup_tr.str.extract(r"\s*ETSI TR \d+ \d+(?:-\d+(?:-\d+)?)? V(\d+\.\d+\.\d+)")
        df_ts["Version"] = version1[0]
        df_tr["Version"] = version2[0]

        # Sortable version tuple used to pick the highest version per deliverable.
        def ver_tuple(v):
            return tuple(map(int, v.split(".")))
        df_ts["temp"] = df_ts["Version"].apply(ver_tuple)
        df_tr["temp"] = df_tr["Version"].apply(ver_tuple)
        df_ts["Type"] = "TS"
        df_tr["Type"] = "TR"
        df = pd.concat([df_ts, df_tr])
        # NOTE(review): unique_df (latest version per deliverable) is computed
        # but the unfiltered df is what gets returned — confirm which is intended.
        unique_df = df.loc[df.groupby("ETSI deliverable")["temp"].idxmax()]
        unique_df = unique_df.drop(columns="temp")
        unique_df = unique_df[(~unique_df["title"].str.contains("3GPP", case=True, na=False))]
        df = df.drop(columns="temp")
        df = df[(~df["title"].str.contains("3GPP", case=True, na=False))]
        return df
647
-
648
- @staticmethod
649
- def hasher(specification: str, version: str):
650
- return hashlib.md5(f"{specification}{version}".encode()).hexdigest()
651
-
652
- @staticmethod
653
- def get_scope(content):
654
- for title, text in content.items():
655
- if title.lower().endswith("scope"):
656
- return text
657
- return ""
658
-
659
- def get_document(self, spec_id: str, spec_title: str):
660
- text = [f"{spec_id} - {spec_title}\n"]
661
- for section in self.spec_contents:
662
- if spec_id == section["doc_id"]:
663
- text.extend([f"{section['section']}\n\n{section['content']}"])
664
- return text
665
-
666
    def get_text(self, specification: str):
        """Download the PDF of `specification` and return (fitz document,
        table of contents); (None, []) on stop request or any failure."""
        if self.STOP_EVENT.is_set():
            return None, []
        print(f"\n[INFO] Tentative de récupération de la spécification {specification}", flush=True)
        try:
            # Locate the table row carrying the PDF link.
            row = self.df[self.df["ETSI deliverable"] == specification]
            if row.empty:
                print(f"[WARN] Spécification {specification} absente du tableau")
                return None, []

            pdf_link = row.iloc[0]["PDF link"]
            response = self.session.get(
                pdf_link,
                headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...'}
            )
            if response.status_code != 200:
                print(f"[ERREUR] Echec du téléchargement du PDF pour {specification}.")
                return None, []
            pdf = fitz.open(stream=response.content, filetype="pdf")
            return pdf, pdf.get_toc()
        except Exception as e:
            print(f"[ERROR] Échec get_text pour {specification} : {e}", flush=True)
            return None, []
690
-
691
    def get_spec_content(self, specification: str):
        """Extract {section title: text} from the spec PDF, using the PDF
        table of contents to locate numbered section headings."""
        def extract_sections(text, titles):
            # Slice `text` between successive title occurrences; whitespace
            # runs are collapsed in both keys and values.
            sections = {}
            sorted_titles = sorted(titles, key=lambda t: text.find(t))
            for i, title in enumerate(sorted_titles):
                start = text.find(title)
                if i + 1 < len(sorted_titles):
                    end = text.find(sorted_titles[i + 1])
                    sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:end].replace(title, "").strip().rstrip())
                else:
                    sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:].replace(title, "").strip().rstrip())
            return sections

        if self.STOP_EVENT.is_set():
            return {}
        print(f"[INFO] Extraction du contenu de {specification}", flush=True)
        pdf, doc_toc = self.get_text(specification)
        text = []
        if not pdf or not doc_toc:
            print("[ERREUR] Pas de texte ou table of contents trouvé !")
            return {}
        # Start from the first page actually referenced by the ToC.
        first_page = 0
        for level, title, page in doc_toc:
            first_page = page - 1
            break
        for page in pdf[first_page:]:
            text.append("\n".join([line.strip() for line in page.get_text().splitlines()]))
        text = "\n".join(text)
        if not text or not doc_toc or self.STOP_EVENT.is_set():
            print("[ERREUR] Pas de texte/table of contents récupéré !")
            return {}
        # Keep ToC entries that start with a section number and whose
        # "number\ntitle" form actually appears in the extracted text.
        titles = []
        for level, title, page in doc_toc:
            if self.STOP_EVENT.is_set():
                return {}
            if title and title[0].isnumeric() and '\n'.join(title.strip().split(" ", 1)) in text:
                titles.append('\n'.join(title.strip().split(" ", 1)))
        return extract_sections(text, titles)
730
-
731
- def process_specification(self, spec):
732
- if self.STOP_EVENT.is_set():
733
- return
734
- try:
735
- version = spec.get('Version')
736
- if not version: return
737
- doc_id = str(spec.get("ETSI deliverable"))
738
- document = None
739
- already_indexed = False
740
-
741
- with self.DOCUMENT_LOCK:
742
- if (doc_id in self.documents_by_spec_num
743
- and self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version)
744
- and doc_id not in self.specifications_passed):
745
- document = self.documents_by_spec_num[doc_id]
746
- self.specifications_passed.add(doc_id)
747
- already_indexed = True
748
- elif doc_id in self.specifications_passed:
749
- document = self.documents_by_spec_num[doc_id]
750
- already_indexed = True
751
- else:
752
- document_content = self.get_spec_content(doc_id)
753
- if document_content:
754
- self.documents_by_spec_num[doc_id] = {"content": document_content, "hash": self.hasher(doc_id, version)}
755
- document = {"content": document_content, "hash": self.hasher(doc_id, version)}
756
- self.specifications_passed.add(doc_id)
757
- already_indexed = False
758
-
759
- if document:
760
- string_key = f"{doc_id}+-+{spec['title']}+-+{spec['Type']}+-+{spec['Version']}"
761
- metadata = {
762
- "id": str(doc_id),
763
- "title": spec["title"],
764
- "type": spec["Type"],
765
- "version": version,
766
- "url": spec["PDF link"],
767
- "scope": "" if not document else self.get_scope(document["content"])
768
- }
769
- with self.DICT_LOCK:
770
- self.indexed_specifications[string_key] = metadata
771
-
772
- with self.DICT_LOCK:
773
- self.processed_count += 1
774
- status = "already indexed" if already_indexed else "indexed now"
775
- print(f"Spec {doc_id} ({spec.get('title', '')}): {status} - Progress {self.processed_count}/{self.total_count}")
776
-
777
- except Exception as e:
778
- traceback.print_exc()
779
- print(f"\n[ERREUR] Échec du traitement de {doc_id} {spec.get('Version')}: {e}", flush=True)
780
- with self.DICT_LOCK:
781
- self.processed_count += 1
782
- print(f"Progress: {self.processed_count}/{self.total_count} specs processed")
783
-
784
    def run(self):
        """Generator: process every deliverable row on a thread pool and
        yield SSE progress events; stops early on STOP_EVENT."""
        print("Démarrage indexation ETSI…")
        yield "event: info\ndata: Indexing ETSI specs ...\n\n"
        specifications = self.df.to_dict(orient="records")
        self.total_count = len(specifications)
        print(f"Traitement de {self.total_count} specs avec {self.max_workers} threads...\n")

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.process_specification, spec) for spec in specifications]
            total = len(futures)
            done_count = 0
            yield f"event: get-maximum\ndata: {total}\n\n"

            for future in concurrent.futures.as_completed(futures):
                done_count += 1
                yield f"event: progress\ndata: {done_count}\n\n"
                if self.STOP_EVENT.is_set():
                    break

        print(f"\nAll {self.processed_count}/{self.total_count} specs processed.")
804
-
805
    def save(self):
        """Flatten the indexed documents/metadata, push them to the Hub, then
        reload the local caches from the stored dataset."""
        print("\nSauvegarde en cours...", flush=True)
        flat_metadata = [metadata for metadata in self.indexed_specifications.values()]
        flat_docs = []
        # One row per (doc, section) pair, carrying the document hash.
        for doc_id, data in self.documents_by_spec_num.items():
            for title, content in data["content"].items():
                flat_docs.append({"hash": data["hash"], "doc_id": doc_id, "section": title, "content": content})
        push_spec_content = Dataset.from_list(flat_docs)
        push_spec_metadata = Dataset.from_list(flat_metadata)
        push_spec_content.push_to_hub("OrganizedProgrammers/ETSISpecContent", token=os.environ["HF"])
        push_spec_metadata.push_to_hub("OrganizedProgrammers/ETSISpecMetadata", token=os.environ["HF"])

        # Reload so in-memory caches mirror what was just stored.
        self.spec_contents = load_dataset("OrganizedProgrammers/ETSISpecContent")["train"].to_list()
        self.documents_by_spec_num = self._make_doc_index(self.spec_contents)
        print("Sauvegarde terminée.")
820
-
821
- def create_bm25_index(self):
822
- dataset_metadata = self.indexed_specifications.values()
823
- unique_specs = set()
824
- corpus_json = []
825
-
826
- for specification in dataset_metadata:
827
- if specification['id'] in unique_specs: continue
828
- for section in self.spec_contents:
829
- if specification['id'] == section['doc_id']:
830
- corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
831
- "id": specification['id'],
832
- "title": specification['title'],
833
- "section_title": section['section'],
834
- "version": specification['version'],
835
- "type": specification['type'],
836
- "url": specification['url'],
837
- "scope": specification['scope']
838
- }})
839
-
840
- corpus_text = [doc["text"] for doc in corpus_json]
841
- corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
842
-
843
- print("Indexing BM25")
844
- retriever = BM25HF(corpus=corpus_json)
845
- retriever.index(corpus_tokens)
846
-
847
- retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSections", token=os.environ.get("HF"))
848
-
849
- unique_specs = set()
850
- corpus_json = []
851
-
852
- for specification in dataset_metadata:
853
- if specification['id'] in unique_specs: continue
854
- text_list = self.get_document(specification['id'], specification['title'])
855
- text = "\n".join(text_list)
856
- if len(text_list) == 1: continue
857
- corpus_json.append({"text": text, "metadata": specification})
858
- unique_specs.add(specification['id'])
859
-
860
- corpus_text = [doc["text"] for doc in corpus_json]
861
- corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
862
-
863
- print("Indexing BM25")
864
- retriever = BM25HF(corpus=corpus_json)
865
- retriever.index(corpus_tokens)
866
-
867
- retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSingle", token=os.environ.get("HF"))
 
 
868
 
 
1
+ import shutil
2
+ import bm25s
3
+ from bm25s.hf import BM25HF
4
+ import threading, re, time, concurrent.futures, requests, os, hashlib, traceback, io, zipfile, subprocess, tempfile, json, fitz
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
+ from bs4 import BeautifulSoup
9
+ from datasets import load_dataset, Dataset
10
+ from datasets.data_files import EmptyDatasetError
11
+ from dotenv import load_dotenv
12
+
13
+ load_dotenv()
14
+
15
class TDocIndexer:
    """Crawl the 3GPP FTP server and maintain a TDoc-id -> ZIP-URL index.

    The index is loaded from and persisted to the Hugging Face dataset
    "OrganizedProgrammers/3GPPTDocLocation". Crawling is parallelised with a
    thread pool; the generator methods yield Server-Sent-Events strings so a
    web front-end can display progress.
    """

    def __init__(self, max_workers=33):
        # Number of entries currently known (maintained by load_indexer).
        self.indexer_length = 0
        self.dataset = "OrganizedProgrammers/3GPPTDocLocation"

        # doc_id -> URL mapping, seeded from the published dataset.
        self.indexer = self.load_indexer()
        self.main_ftp_url = "https://3gpp.org/ftp"
        # Valid TDoc names: e.g. S1-123456, C4-0042, RP-991234 (case-insensitive).
        self.valid_doc_pattern = re.compile(r'^(S[1-6P]|C[1-6P]|R[1-6P])-\d+', flags=re.IGNORECASE)
        self.max_workers = max_workers

        # Serialise console output / index mutation across worker threads.
        self.print_lock = threading.Lock()
        self.indexer_lock = threading.Lock()

        self.total_indexed = 0
        self.processed_count = 0
        self.total_count = 0

    def load_indexer(self):
        """Load the doc_id -> URL map from the Hugging Face dataset."""
        self.indexer_length = 0
        all_docs = {}
        tdoc_locations = load_dataset(self.dataset)
        tdoc_locations = tdoc_locations["train"].to_list()
        for doc in tdoc_locations:
            self.indexer_length += 1
            all_docs[doc["doc_id"]] = doc["url"]

        return all_docs

    def save_indexer(self):
        """Save the updated index"""
        data = []
        for doc_id, url in self.indexer.items():
            data.append({"doc_id": doc_id, "url": url})

        dataset = Dataset.from_list(data)
        dataset.push_to_hub(self.dataset, token=os.environ["HF"])
        self.indexer = self.load_indexer()

    def get_docs_from_url(self, url):
        """Return the link texts of an FTP directory-listing page ([] on error)."""
        try:
            response = requests.get(url, verify=False, timeout=10)
            soup = BeautifulSoup(response.text, "html.parser")
            return [item.get_text() for item in soup.select("tr td a")]
        except Exception as e:
            with self.print_lock:
                print(f"Erreur lors de l'accès à {url}: {e}")
            return []

    def is_valid_document_pattern(self, filename):
        # True when the file name starts with a recognised TDoc identifier.
        return bool(self.valid_doc_pattern.match(filename))

    def is_zip_file(self, filename):
        return filename.lower().endswith('.zip')

    def extract_doc_id(self, filename):
        """Extract the bare TDoc id (e.g. "S1-12345") from a file name, or None."""
        if self.is_valid_document_pattern(filename):
            match = self.valid_doc_pattern.match(filename)
            if match:
                # Return the full identifier (such as S1-12345)
                full_id = filename.split('.')[0]  # drop the extension if present
                return full_id.split('_')[0]  # drop any suffix after an underscore
        return None

    def process_zip_files(self, files_list, base_url, workshop=False):
        """Scan a file listing and index every valid ZIP; return the count added."""
        indexed_count = 0

        for file in files_list:
            if file in ['./', '../', 'ZIP/', 'zip/']:
                continue

            # Keep only ZIP archives whose name matches the TDoc pattern
            # (workshop folders are indexed regardless of the pattern).
            if self.is_zip_file(file) and (self.is_valid_document_pattern(file) or workshop):
                file_url = f"{base_url}/{file}"

                # Extract the document id
                doc_id = self.extract_doc_id(file)
                if doc_id is None:
                    doc_id = file.split('.')[0]
                if doc_id:
                    # Skip files already indexed at the same URL
                    with self.indexer_lock:
                        if doc_id in self.indexer and self.indexer[doc_id] == file_url:
                            continue

                        # Add or refresh the index entry
                        self.indexer[doc_id] = file_url
                        indexed_count += 1
                        self.total_indexed += 1

        return indexed_count

    def process_meeting(self, meeting, wg_url, workshop=False):
        """Thread worker: index one meeting folder; return 1 on success, 0 on error."""
        try:
            if meeting in ['./', '../']:
                return 0

            meeting_url = f"{wg_url}/{meeting}"

            with self.print_lock:
                print(f"Vérification du meeting: {meeting}")

            # List the meeting folder
            meeting_contents = self.get_docs_from_url(meeting_url)

            # The documents folder is named Docs, TDocs or TDoc depending on the group.
            key = None
            if "docs" in [x.lower() for x in meeting_contents]:
                key = "docs"
            elif "tdocs" in [x.lower() for x in meeting_contents]:
                key = "tdocs"
            elif "tdoc" in [x.lower() for x in meeting_contents]:
                key = "tdoc"

            if key is not None:
                docs_url = f"{meeting_url}/{key}"

                with self.print_lock:
                    print(f"Vérification des documents présent dans {docs_url}")

                # Fetch the file list of the Docs folder
                docs_files = self.get_docs_from_url(docs_url)

                # 1. Index the ZIP files directly inside the Docs folder
                docs_indexed_count = self.process_zip_files(docs_files, docs_url, workshop)

                if docs_indexed_count > 0:
                    with self.print_lock:
                        print(f"{docs_indexed_count} fichiers trouvés")

                # 2. Also look into the ZIP sub-folder when present
                if "zip" in [x.lower() for x in docs_files]:
                    zip_url = f"{docs_url}/zip"

                    with self.print_lock:
                        print(f"Vérification du dossier ./zip: {zip_url}")

                    # Fetch the files of the ZIP sub-folder
                    zip_files = self.get_docs_from_url(zip_url)

                    # Index the ZIP files of the sub-folder
                    zip_indexed_count = self.process_zip_files(zip_files, zip_url, workshop)

                    if zip_indexed_count > 0:
                        with self.print_lock:
                            print(f"{zip_indexed_count} fichiers trouvés")

            # Update the progress counter
            with self.indexer_lock:
                self.processed_count += 1

            # Display progress
            with self.print_lock:
                progress = (self.processed_count / self.total_count) * 100 if self.total_count > 0 else 0
                print(f"\rProgression: {self.processed_count}/{self.total_count} réunions traitées ({progress:.1f}%)")

            return 1  # meeting processed successfully

        except Exception as e:
            with self.print_lock:
                print(f"\nErreur lors du traitement de la réunion {meeting}: {str(e)}")
            return 0

    def process_workgroup(self, wg, main_url):
        """SSE generator: crawl one working group, its meetings in parallel."""
        if wg in ['./', '../']:
            return

        wg_url = f"{main_url}/{wg}"

        with self.print_lock:
            print(f"Vérification du working group: {wg}")

        # Fetch the meeting folders
        meeting_folders = self.get_docs_from_url(wg_url)

        # Add them to the global counter
        self.total_count += len([m for m in meeting_folders if m not in ['./', '../']])

        # Process the meetings in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.process_meeting, meeting, wg_url)
                       for meeting in meeting_folders if meeting not in ['./', '../']]

            total = len(futures)
            done_count = 0
            yield f"event: get-maximum\ndata: {total}\n\n"

            for future in concurrent.futures.as_completed(futures):
                done_count += 1
                yield f"event: progress\ndata: {done_count}\n\n"

    def index_all_tdocs(self):
        """SSE generator: index every ZIP document of the 3GPP FTP tree."""
        print("Démarrage de l'indexation des TDocs 3GPP complète")

        start_time = time.time()
        docs_count_before = self.indexer_length

        # Main TSG groups
        main_groups = ["tsg_sa", "tsg_ct", "tsg_ran"]  # add more if needed

        for main_tsg in main_groups:
            print(f"Indexation de {main_tsg.upper()}...")

            main_url = f"{self.main_ftp_url}/{main_tsg}"

            # Fetch the working groups
            workgroups = self.get_docs_from_url(main_url)

            # Working groups are processed sequentially
            # (but the meetings inside each one are crawled in parallel)
            for wg in workgroups:
                yield f"event: info\ndata: {main_tsg}-{wg}\n\n"
                for content in self.process_workgroup(wg, main_url):
                    yield content

        docs_count_after = len(self.indexer)
        new_docs_count = abs(docs_count_after - docs_count_before)

        print(f"Indexation terminée en {time.time() - start_time:.2f} secondes")
        print(f"Nouveaux documents ZIP indexés: {new_docs_count}")
        print(f"Total des documents dans l'index: {docs_count_after}")

        return self.indexer

    def index_all_workshops(self):
        """SSE generator: index every workshop ZIP of the 3GPP FTP tree."""
        print("Démarrage de l'indexation des workshops ZIP 3GPP...")
        start_time = time.time()
        docs_count_before = len(self.indexer)

        print("\nIndexation du dossier 'workshop'")
        main_url = f"{self.main_ftp_url}/workshop"

        # Fetch the meeting folders
        meeting_folders = self.get_docs_from_url(main_url)

        # Add them to the global counter
        self.total_count += len([m for m in meeting_folders if m not in ['./', '../']])

        # Process the meetings in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.process_meeting, meeting, main_url, workshop=True)
                       for meeting in meeting_folders if meeting not in ['./', '../']]
            total = len(futures)
            done_count = 0

            yield f"event: get-maximum\ndata: {total}\n\n"

            for future in concurrent.futures.as_completed(futures):
                done_count += 1
                yield f"event: progress\ndata: {done_count}\n\n"

        docs_count_after = len(self.indexer)
        new_docs_count = docs_count_after - docs_count_before

        print(f"\nIndexation terminée en {time.time() - start_time:.2f} secondes")
        print(f"Nouveaux documents ZIP indexés: {new_docs_count}")
        print(f"Total des documents dans l'index: {docs_count_after}")

        return self.indexer
276
+
277
class Spec3GPPIndexer:
    """Download 3GPP specifications, extract their text and publish HF datasets.

    Spec archives are fetched from the 3GPP FTP mirror, converted from
    .doc/.docx to plain text via a headless LibreOffice, split into chapters,
    and pushed to the Hub as content/metadata datasets plus two BM25 indexes.
    """

    def __init__(self, max_workers=16):
        # Previously published section contents, re-grouped by spec number.
        self.spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent")["train"].to_list()
        self.documents_by_spec_num = self._make_doc_index(self.spec_contents)
        self.indexed_specifications = {}
        self.specifications_passed = set()
        self.processed_count = 0
        self.total_count = 0

        self.DICT_LOCK = threading.Lock()
        self.DOCUMENT_LOCK = threading.Lock()
        self.STOP_EVENT = threading.Event()
        self.max_workers = max_workers
        # Caps the number of concurrent LibreOffice conversions.
        self.LIBREOFFICE_SEMAPHORE = threading.Semaphore(self.max_workers)

    def _make_doc_index(self, specs):
        """Group flat (doc_id, section, content) rows into {doc_id: {content, hash}}."""
        doc_index = {}
        for section in specs:
            if section["doc_id"] not in doc_index:
                doc_index[section["doc_id"]] = {"content": {section["section"]: section["content"]}, "hash": section["hash"]}
            else:
                doc_index[section["doc_id"]]["content"][section["section"]] = section["content"]
        return doc_index

    @staticmethod
    def version_to_code(version_str):
        """Convert "x.y.z" to the 3GPP archive version code.

        Versions with all parts < 36 use one base-36 digit per part; larger
        parts fall back to two zero-padded decimal digits each. Returns None
        for malformed input.
        """
        chars = "0123456789abcdefghijklmnopqrstuvwxyz"
        parts = version_str.split('.')
        if len(parts) != 3:
            return None
        try:
            x, y, z = [int(p) for p in parts]
        except ValueError:
            return None
        if x < 36 and y < 36 and z < 36:
            return f"{chars[x]}{chars[y]}{chars[z]}"
        else:
            return f"{str(x).zfill(2)}{str(y).zfill(2)}{str(z).zfill(2)}"

    @staticmethod
    def hasher(specification, version_code):
        # Cache key used to detect whether a spec+version is already indexed.
        return hashlib.md5(f"{specification}{version_code}".encode()).hexdigest()

    @staticmethod
    def get_scope(content):
        """Return the text of the first section whose title ends with "scope"."""
        for title, text in content.items():
            if title.lower().endswith("scope"):
                return text
        return ""

    def get_text(self, specification, version_code):
        """Download a spec ZIP and convert its .doc/.docx files to text lines.

        Uses a headless LibreOffice run per document with a dedicated user
        profile. Returns [] on any failure or when STOP_EVENT is set.
        """
        if self.STOP_EVENT.is_set():
            return []

        doc_id = specification
        series = doc_id.split(".")[0]
        url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"

        try:
            response = requests.get(url, verify=False)
            if response.status_code != 200:
                return []

            zip_bytes = io.BytesIO(response.content)
            with zipfile.ZipFile(zip_bytes) as zip_file:
                # Keep only .doc and .docx files
                docx_files = [f for f in zip_file.namelist() if f.lower().endswith(('.doc', '.docx'))]
                if not docx_files:
                    return []

                full_text = []

                for doc_file in docx_files:
                    with tempfile.TemporaryDirectory() as tmpdir:
                        extracted_path = os.path.join(tmpdir, os.path.basename(doc_file))
                        with open(extracted_path, 'wb') as f:
                            f.write(zip_file.read(doc_file))

                        # Dedicated temporary LibreOffice profile (avoids clashes
                        # between concurrent conversions)
                        profile_dir = tempfile.mkdtemp(prefix="libreoffice_profile_")

                        try:
                            with self.LIBREOFFICE_SEMAPHORE:
                                cmd = [
                                    'soffice',
                                    '--headless',
                                    f'-env:UserInstallation=file://{profile_dir}',
                                    '--convert-to', 'txt:Text',
                                    '--outdir', tmpdir,
                                    extracted_path
                                ]
                                subprocess.run(cmd, check=True, timeout=60*5, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

                            txt_file = os.path.splitext(extracted_path)[0] + '.txt'
                            if os.path.exists(txt_file):
                                with open(txt_file, 'r', encoding='utf-8', errors='ignore') as ftxt:
                                    full_text.extend(ftxt.readlines())
                        finally:
                            shutil.rmtree(profile_dir, ignore_errors=True)

                return full_text

        except Exception as e:
            print(f"Error getting text for {specification} v{version_code}: {e}")
            return []

    def get_spec_content(self, specification, version_code):
        """Split the raw spec text into {chapter title: body} using chapter headings."""
        if self.STOP_EVENT.is_set():
            return {}

        text = self.get_text(specification, version_code)
        if not text:
            return {}

        chapters = []
        # Chapter heading shape: "<number>\t<Title>" not ending with a period.
        chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+[^\.]$")
        for i, line in enumerate(text):
            if chapter_regex.fullmatch(line):
                chapters.append((i, line))

        document = {}
        # Each chapter's body runs until the next heading (or end of file).
        for i in range(len(chapters)):
            start_index, chapter_title = chapters[i]
            end_index = chapters[i+1][0] if i+1 < len(chapters) else len(text)
            content_lines = text[start_index + 1:end_index]
            document[chapter_title.replace("\t", " ")] = "\n".join(content_lines)

        return document

    def fetch_spec_table(self):
        """Scrape the 3GPP status-report page into a list of spec dicts."""
        response = requests.get(
            'https://www.3gpp.org/dynareport?code=status-report.htm',
            headers={"User-Agent": 'Mozilla/5.0'},
            verify=False
        )
        dfs = pd.read_html(io.StringIO(response.text))
        for x in range(len(dfs)):
            dfs[x] = dfs[x].replace({np.nan: None})
        # Only the first five columns carry the fields we need.
        columns_needed = [0, 1, 2, 3, 4]
        extracted_dfs = [df.iloc[:, columns_needed] for df in dfs]
        # Non-breaking spaces in header names become underscores.
        columns = [x.replace("\xa0", "_") for x in extracted_dfs[0].columns]
        specifications = []
        for df in extracted_dfs:
            for index, row in df.iterrows():
                doc = row.to_list()
                doc_dict = dict(zip(columns, doc))
                specifications.append(doc_dict)
        return specifications

    def process_specification(self, spec):
        """Thread worker: index one specification row (download text if needed)."""
        if self.STOP_EVENT.is_set():
            return
        try:
            doc_id = str(spec['spec_num'])
            version_code = self.version_to_code(str(spec['vers']))
            if not version_code:
                with self.DICT_LOCK:
                    self.processed_count += 1
                return

            document = None
            already_indexed = False
            # Cache hit when the stored hash matches this spec+version.
            with self.DOCUMENT_LOCK:
                doc_in_cache = doc_id in self.documents_by_spec_num and \
                    self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version_code)

            if doc_in_cache and doc_id not in self.specifications_passed:
                document = self.documents_by_spec_num[doc_id]
                self.specifications_passed.add(doc_id)
                already_indexed = True
            elif doc_id not in self.specifications_passed:
                # Cache miss: fetch and parse the document (slow path).
                doc_content = self.get_spec_content(doc_id, version_code)
                if doc_content:
                    document = {"content": doc_content, "hash": self.hasher(doc_id, version_code)}
                    with self.DOCUMENT_LOCK:
                        self.documents_by_spec_num[doc_id] = document
                    self.specifications_passed.add(doc_id)
                    already_indexed = False

            if document:
                url = f"https://www.3gpp.org/ftp/Specs/archive/{doc_id.split('.')[0]}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"
                metadata = {
                    "id": doc_id,
                    "title": spec.get("title", ""),
                    "type": spec.get("type", ""),
                    "version": str(spec.get("vers", "")),
                    "working_group": spec.get("WG", ""),
                    "url": url,
                    "scope": self.get_scope(document["content"])
                }
                key = f"{doc_id}+-+{spec.get('title', '')}+-+{spec.get('type', '')}+-+{spec.get('vers', '')}+-+{spec.get('WG', '')}"
                with self.DICT_LOCK:
                    self.indexed_specifications[key] = metadata

            with self.DICT_LOCK:
                self.processed_count += 1
                status = "already indexed" if already_indexed else "indexed now"
                print(f"Spec {doc_id} ({spec.get('title', '')}): {status} - Progress {self.processed_count}/{self.total_count}")

        except Exception as e:
            traceback.print_exc()
            print(f"Error processing spec {spec.get('spec_num')} v{spec.get('vers')}: {e}")
            with self.DICT_LOCK:
                self.processed_count += 1
            print(f"Progress: {self.processed_count}/{self.total_count} specs processed")

    def get_document(self, spec_id: str, spec_title: str):
        """Return the spec as a list: a header line then one string per section."""
        text = [f"{spec_id} - {spec_title}\n"]
        for section in self.spec_contents:
            if spec_id == section["doc_id"]:
                text.extend([f"{section['section']}\n\n{section['content']}"])
        return text

    def create_bm25_index(self):
        """Build and publish the per-section and per-document BM25 indexes."""
        dataset_metadata = self.indexed_specifications.values()
        unique_specs = set()
        corpus_json = []

        # NOTE(review): unique_specs is never filled in this first loop, so the
        # duplicate check below is currently a no-op — confirm whether
        # per-section entries should be deduplicated by spec id.
        for specification in dataset_metadata:
            if specification['id'] in unique_specs: continue
            for section in self.spec_contents:
                if specification['id'] == section['doc_id']:
                    corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
                        "id": specification['id'],
                        "title": specification['title'],
                        "section_title": section['section'],
                        "version": specification['version'],
                        "type": specification['type'],
                        "working_group": specification['working_group'],
                        "url": specification['url'],
                        "scope": specification['scope']
                    }})

        corpus_text = [doc["text"] for doc in corpus_json]
        corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

        print("Indexing BM25")
        retriever = BM25HF(corpus=corpus_json)
        retriever.index(corpus_tokens)

        retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSections", token=os.environ.get("HF"))

        unique_specs = set()
        corpus_json = []

        # Whole-document corpus: one entry per spec.
        for specification in dataset_metadata:
            if specification['id'] in unique_specs: continue
            text_list = self.get_document(specification['id'], specification['title'])
            text = "\n".join(text_list)
            # A single element means only the header line: no content found.
            if len(text_list) == 1: continue
            corpus_json.append({"text": text, "metadata": specification})
            unique_specs.add(specification['id'])

        corpus_text = [doc["text"] for doc in corpus_json]
        corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

        print("Indexing BM25")
        retriever = BM25HF(corpus=corpus_json)
        retriever.index(corpus_tokens)

        retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSingle", token=os.environ.get("HF"))

    def run(self):
        """SSE generator: fetch the spec table and process every spec in parallel."""
        print("Fetching specification tables from 3GPP...")
        yield "event: info\ndata: Indexing 3GPP specs ...\n\n"
        specifications = self.fetch_spec_table()
        self.total_count = len(specifications)
        print(f"Processing {self.total_count} specs with {self.max_workers} threads...")
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [executor.submit(self.process_specification, spec) for spec in specifications]
            total = len(futures)
            done_count = 0
            yield f"event: get-maximum\ndata: {total}\n\n"

            for future in concurrent.futures.as_completed(futures):
                done_count += 1
                yield f"event: progress\ndata: {done_count}\n\n"
                if self.STOP_EVENT.is_set():
                    break
        print("All specs processed.")

    # Save (identical to the original script)
    def save(self):
        """Flatten indexed data and push content/metadata datasets to the Hub."""
        print("Saving indexed data...")
        flat_metadata = [metadata for metadata in self.indexed_specifications.values()]
        flat_docs = []
        print("Flatting doc contents")
        # One flat row per (spec, section) pair, carrying the spec-level hash.
        for doc_id, data in self.documents_by_spec_num.items():
            for title, content in data["content"].items():
                flat_docs.append({"hash": data["hash"], "doc_id": doc_id, "section": title, "content": content})
        print("Creating datasets ...")
        push_spec_content = Dataset.from_list(flat_docs)
        push_spec_metadata = Dataset.from_list(flat_metadata)
        # Token handling assumed set in environment
        print("Pushing ...")
        push_spec_content.push_to_hub("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF"])
        push_spec_metadata.push_to_hub("OrganizedProgrammers/3GPPSpecMetadata", token=os.environ["HF"])

        # Reload from the Hub so in-memory state matches the published data.
        self.spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent")["train"].to_list()
        self.documents_by_spec_num = self._make_doc_index(self.spec_contents)
        print("Save finished.")
578
+
579
+ class SpecETSIIndexer:
580
    def __init__(self, max_workers=16):
        """Set up the ETSI indexer; performs network I/O (login + metadata fetch)."""
        # Authenticated session reused for every ETSI request; TLS verification
        # is disabled for the portal endpoints used below.
        self.session = requests.Session()
        self.session.verify = False

        # Previously published section contents, re-grouped by spec number.
        self.spec_contents = load_dataset("OrganizedProgrammers/ETSISpecContent")["train"].to_list()
        self.documents_by_spec_num = self._make_doc_index(self.spec_contents)
        self.indexed_specifications = {}
        self.specifications_passed = set()
        self.processed_count = 0
        self.total_count = 0

        self.DICT_LOCK = threading.Lock()
        self.DOCUMENT_LOCK = threading.Lock()
        self.STOP_EVENT = threading.Event()
        self.max_workers = max_workers

        # TS/TR metadata table fetched once at construction time.
        self.df = self._fetch_spec_table()
597
+
598
+ def _make_doc_index(self, specs):
599
+ doc_index = {}
600
+ for section in specs:
601
+ if section["doc_id"] not in doc_index:
602
+ doc_index[section["doc_id"]] = {"content": {section["section"]: section["content"]}, "hash": section["hash"]}
603
+ else:
604
+ doc_index[section["doc_id"]]["content"][section["section"]] = section["content"]
605
+ return doc_index
606
+
607
    def _fetch_spec_table(self):
        """Log in to the ETSI portal and build the merged TS/TR metadata DataFrame.

        Returns all (non-3GPP-derived) TS and TR rows with columns including
        "ETSI deliverable" (bare number), "Version", "Type" and "PDF link".
        """
        # Portal login, then retrieval of the TS/TR CSV exports
        print("Connexion login ETSI...")
        self.session.post(
            "https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
            verify=False,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ..."},
            data=json.dumps({"username": os.environ.get("EOL_USER"), "password": os.environ.get("EOL_PASSWORD")}),
        )

        print("Récupération des métadonnées TS/TR …")
        url_ts = "https://www.etsi.org/?option=com_standardssearch&view=data&format=csv&includeScope=1&page=1&search=&title=1&etsiNumber=1&content=0&version=0&onApproval=0&published=1&withdrawn=0&historical=0&isCurrent=1&superseded=0&harmonized=0&keyword=&TB=&stdType=TS&frequency=&mandate=&collection=&sort=1"
        url_tr = url_ts.replace("stdType=TS", "stdType=TR")
        data_ts = self.session.get(url_ts, verify=False).content
        data_tr = self.session.get(url_tr, verify=False).content
        df_ts = pd.read_csv(io.StringIO(data_ts.decode('utf-8')), sep=";", skiprows=1, index_col=False)
        df_tr = pd.read_csv(io.StringIO(data_tr.decode('utf-8')), sep=";", skiprows=1, index_col=False)

        # Keep the raw deliverable strings so the version can be extracted
        # separately below.
        backup_ts = df_ts["ETSI deliverable"]
        backup_tr = df_tr["ETSI deliverable"]
        # Reduce e.g. "ETSI TS 103 666-1 V1.2.3" to the bare number "103 666-1".
        df_ts["ETSI deliverable"] = df_ts["ETSI deliverable"].str.extract(r"\s*ETSI TS (\d+ \d+(?:-\d+(?:-\d+)?)?)")
        df_tr["ETSI deliverable"] = df_tr["ETSI deliverable"].str.extract(r"\s*ETSI TR (\d+ \d+(?:-\d+(?:-\d+)?)?)")
        version1 = backup_ts.str.extract(r"\s*ETSI TS \d+ \d+(?:-\d+(?:-\d+)?)? V(\d+\.\d+\.\d+)")
        version2 = backup_tr.str.extract(r"\s*ETSI TR \d+ \d+(?:-\d+(?:-\d+)?)? V(\d+\.\d+\.\d+)")
        df_ts["Version"] = version1[0]
        df_tr["Version"] = version2[0]

        def ver_tuple(v):
            # "1.2.3" -> (1, 2, 3); non-strings (NaN) sort lowest.
            if not isinstance(v, str):
                return (0, 0, 0)
            return tuple(map(int, v.split(".")))
        df_ts["temp"] = df_ts["Version"].apply(ver_tuple)
        df_tr["temp"] = df_tr["Version"].apply(ver_tuple)
        df_ts["Type"] = "TS"
        df_tr["Type"] = "TR"
        df = pd.concat([df_ts, df_tr])
        # NOTE(review): unique_df (latest version per deliverable) is computed
        # but never returned — the full df below is what callers receive.
        # Confirm whether deduplication was intended here.
        unique_df = df.loc[df.groupby("ETSI deliverable")["temp"].idxmax()]
        unique_df = unique_df.drop(columns="temp")
        unique_df = unique_df[(~unique_df["title"].str.contains("3GPP", case=True, na=False))]
        df = df.drop(columns="temp")
        # 3GPP-derived deliverables are handled by the 3GPP indexer instead.
        df = df[(~df["title"].str.contains("3GPP", case=True, na=False))]
        return df
649
+
650
+ @staticmethod
651
+ def hasher(specification: str, version: str):
652
+ return hashlib.md5(f"{specification}{version}".encode()).hexdigest()
653
+
654
+ @staticmethod
655
+ def get_scope(content):
656
+ for title, text in content.items():
657
+ if title.lower().endswith("scope"):
658
+ return text
659
+ return ""
660
+
661
+ def get_document(self, spec_id: str, spec_title: str):
662
+ text = [f"{spec_id} - {spec_title}\n"]
663
+ for section in self.spec_contents:
664
+ if spec_id == section["doc_id"]:
665
+ text.extend([f"{section['section']}\n\n{section['content']}"])
666
+ return text
667
+
668
    def get_text(self, specification: str):
        """Download the PDF of a specification.

        Returns (fitz document, table of contents); (None, []) when the spec is
        unknown, the download fails, or STOP_EVENT is set.
        """
        if self.STOP_EVENT.is_set():
            return None, []
        print(f"\n[INFO] Tentative de récupération de la spécification {specification}", flush=True)
        try:
            # Find the metadata row holding the right PDF link
            row = self.df[self.df["ETSI deliverable"] == specification]
            if row.empty:
                print(f"[WARN] Spécification {specification} absente du tableau")
                return None, []

            pdf_link = row.iloc[0]["PDF link"]
            response = self.session.get(
                pdf_link,
                headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...'}
            )
            if response.status_code != 200:
                print(f"[ERREUR] Echec du téléchargement du PDF pour {specification}.")
                return None, []
            # Open the PDF directly from memory.
            pdf = fitz.open(stream=response.content, filetype="pdf")
            return pdf, pdf.get_toc()
        except Exception as e:
            print(f"[ERROR] Échec get_text pour {specification} : {e}", flush=True)
            return None, []
692
+
693
    def get_spec_content(self, specification: str):
        """Extract {section title: text} from the spec PDF, driven by its ToC.

        Returns {} when the PDF or its table of contents cannot be obtained,
        or when STOP_EVENT is set.
        """
        def extract_sections(text, titles):
            # Slice `text` between consecutive title occurrences (in order of
            # first appearance) and collapse whitespace in both keys and bodies.
            sections = {}
            sorted_titles = sorted(titles, key=lambda t: text.find(t))
            for i, title in enumerate(sorted_titles):
                start = text.find(title)
                if i + 1 < len(sorted_titles):
                    end = text.find(sorted_titles[i + 1])
                    sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:end].replace(title, "").strip().rstrip())
                else:
                    sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:].replace(title, "").strip().rstrip())
            return sections

        if self.STOP_EVENT.is_set():
            return {}
        print(f"[INFO] Extraction du contenu de {specification}", flush=True)
        pdf, doc_toc = self.get_text(specification)
        text = []
        if not pdf or not doc_toc:
            print("[ERREUR] Pas de texte ou table of contents trouvé !")
            return {}
        # Start extraction from the first page actually referenced by the ToC
        first_page = 0
        for level, title, page in doc_toc:
            first_page = page - 1
            break
        for page in pdf[first_page:]:
            text.append("\n".join([line.strip() for line in page.get_text().splitlines()]))
        text = "\n".join(text)
        if not text or not doc_toc or self.STOP_EVENT.is_set():
            print("[ERREUR] Pas de texte/table of contents récupéré !")
            return {}
        titles = []
        for level, title, page in doc_toc:
            if self.STOP_EVENT.is_set():
                return {}
            # Keep only numbered ToC entries whose "<number>\n<rest>" form
            # actually occurs in the extracted text.
            if title and title[0].isnumeric() and '\n'.join(title.strip().split(" ", 1)) in text:
                titles.append('\n'.join(title.strip().split(" ", 1)))
        return extract_sections(text, titles)
732
+
733
    def process_specification(self, spec):
        """Thread worker: index one ETSI TS/TR row (download & parse its PDF if needed).

        Updates indexed_specifications / documents_by_spec_num under the class
        locks; rows without a Version are skipped silently.
        """
        if self.STOP_EVENT.is_set():
            return
        try:
            version = spec.get('Version')
            if not version: return
            doc_id = str(spec.get("ETSI deliverable"))
            document = None
            already_indexed = False

            with self.DOCUMENT_LOCK:
                # Cache hit: stored hash matches this spec+version and the spec
                # was not consumed yet.
                if (doc_id in self.documents_by_spec_num
                        and self.documents_by_spec_num[doc_id]["hash"] == self.hasher(doc_id, version)
                        and doc_id not in self.specifications_passed):
                    document = self.documents_by_spec_num[doc_id]
                    self.specifications_passed.add(doc_id)
                    already_indexed = True
                elif doc_id in self.specifications_passed:
                    document = self.documents_by_spec_num[doc_id]
                    already_indexed = True
                else:
                    # Cache miss: download and parse the PDF (slow path).
                    document_content = self.get_spec_content(doc_id)
                    if document_content:
                        self.documents_by_spec_num[doc_id] = {"content": document_content, "hash": self.hasher(doc_id, version)}
                        document = {"content": document_content, "hash": self.hasher(doc_id, version)}
                        self.specifications_passed.add(doc_id)
                        already_indexed = False

            if document:
                string_key = f"{doc_id}+-+{spec['title']}+-+{spec['Type']}+-+{spec['Version']}"
                metadata = {
                    "id": str(doc_id),
                    "title": spec["title"],
                    "type": spec["Type"],
                    "version": version,
                    "url": spec["PDF link"],
                    "scope": "" if not document else self.get_scope(document["content"])
                }
                with self.DICT_LOCK:
                    self.indexed_specifications[string_key] = metadata

            with self.DICT_LOCK:
                self.processed_count += 1
                status = "already indexed" if already_indexed else "indexed now"
                print(f"Spec {doc_id} ({spec.get('title', '')}): {status} - Progress {self.processed_count}/{self.total_count}")

        except Exception as e:
            traceback.print_exc()
            print(f"\n[ERREUR] Échec du traitement de {doc_id} {spec.get('Version')}: {e}", flush=True)
            with self.DICT_LOCK:
                self.processed_count += 1
            print(f"Progress: {self.processed_count}/{self.total_count} specs processed")
785
+
786
def run(self):
    """Index every spec row of ``self.df`` concurrently, as an SSE stream.

    Generator: yields server-sent-event strings ('info', 'get-maximum',
    then one 'progress' per completed worker).  Honors ``self.STOP_EVENT``
    between completions for cooperative cancellation.
    """
    print("Démarrage indexation ETSI…")
    yield "event: info\ndata: Indexing ETSI specs ...\n\n"

    specs = self.df.to_dict(orient="records")
    self.total_count = len(specs)
    print(f"Traitement de {self.total_count} specs avec {self.max_workers} threads...\n")

    with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as pool:
        pending = [pool.submit(self.process_specification, row) for row in specs]
        # Announce the maximum so the client can size its progress bar.
        yield f"event: get-maximum\ndata: {len(pending)}\n\n"

        # Emit one progress tick per finished worker, in completion order.
        for finished, _future in enumerate(concurrent.futures.as_completed(pending), start=1):
            yield f"event: progress\ndata: {finished}\n\n"
            if self.STOP_EVENT.is_set():
                break

    print(f"\nAll {self.processed_count}/{self.total_count} specs processed.")
807
def save(self):
    """Persist indexing results to the Hugging Face Hub, then reload.

    Pushes two datasets (flattened section contents and flat metadata),
    then refreshes ``self.spec_contents`` / ``self.documents_by_spec_num``
    from the freshly-published content so in-memory state matches the Hub.
    """
    print("\nSauvegarde en cours...", flush=True)

    metadata_rows = list(self.indexed_specifications.values())
    # One row per (spec, section) pair, carrying the spec's content hash.
    section_rows = [
        {"hash": data["hash"], "doc_id": doc_id, "section": title, "content": content}
        for doc_id, data in self.documents_by_spec_num.items()
        for title, content in data["content"].items()
    ]

    Dataset.from_list(section_rows).push_to_hub("OrganizedProgrammers/ETSISpecContent", token=os.environ["HF"])
    Dataset.from_list(metadata_rows).push_to_hub("OrganizedProgrammers/ETSISpecMetadata", token=os.environ["HF"])

    # Round-trip through the Hub so local state reflects exactly what was saved.
    self.spec_contents = load_dataset("OrganizedProgrammers/ETSISpecContent")["train"].to_list()
    self.documents_by_spec_num = self._make_doc_index(self.spec_contents)
    print("Sauvegarde terminée.")
823
def create_bm25_index(self):
    """Build and publish two BM25 retrieval indexes to the Hugging Face Hub.

    Two indexes are produced from the in-memory indexing results:
      * ETSIBM25IndexSections -- one BM25 entry per specification *section*.
      * ETSIBM25IndexSingle   -- one BM25 entry per whole specification.

    Reads ``self.indexed_specifications`` (metadata dicts) and
    ``self.spec_contents`` (flat {hash, doc_id, section, content} rows).
    """
    dataset_metadata = self.indexed_specifications.values()

    # Group the flat section rows once by spec id so the loop below is
    # O(specs + sections) instead of rescanning every section per spec.
    sections_by_doc = {}
    for section in self.spec_contents:
        sections_by_doc.setdefault(section['doc_id'], []).append(section)

    # --- Index 1: one BM25 entry per section --------------------------------
    unique_specs = set()
    corpus_json = []
    for specification in dataset_metadata:
        if specification['id'] in unique_specs: continue
        # BUGFIX: the id was never recorded here, so the dedup check above
        # could never fire and specs with several metadata rows (e.g.
        # multiple versions) had their sections indexed repeatedly.
        unique_specs.add(specification['id'])
        for section in sections_by_doc.get(specification['id'], []):
            corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
                "id": specification['id'],
                "title": specification['title'],
                "section_title": section['section'],
                "version": specification['version'],
                "type": specification['type'],
                "url": specification['url'],
                "scope": specification['scope']
            }})

    corpus_text = [doc["text"] for doc in corpus_json]
    corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

    print("Indexing BM25")
    retriever = BM25HF(corpus=corpus_json)
    retriever.index(corpus_tokens)
    retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSections", token=os.environ.get("HF"))

    # --- Index 2: one BM25 entry per full document --------------------------
    unique_specs = set()
    corpus_json = []
    for specification in dataset_metadata:
        if specification['id'] in unique_specs: continue
        # Claim the id up front: re-fetching the document for duplicate
        # metadata rows would only repeat the same (skipped or kept) result.
        unique_specs.add(specification['id'])
        text_list = self.get_document(specification['id'], specification['title'])
        text = "\n".join(text_list)
        # A single-element document means only the title was resolved: skip.
        if len(text_list) == 1: continue
        corpus_json.append({"text": text, "metadata": specification})

    corpus_text = [doc["text"] for doc in corpus_json]
    corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

    print("Indexing BM25")
    retriever = BM25HF(corpus=corpus_json)
    retriever.index(corpus_tokens)
    retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSingle", token=os.environ.get("HF"))