initial commit
- Create_db_optimized.py +657 -0
- README.md +3 -9
- annotation.py +486 -0
- app.py +53 -0
- app2.py +116 -0
- correcteur.py +751 -0
- dataset.json +0 -0
- dataset_optimiser_with_finetunning.py +1190 -0
- document_assembler.py +139 -0
- document_validator.py +120 -0
- langchain_medical_agents_refactored.py +335 -0
- main.py +361 -0
- match_transcription.py +749 -0
- medical_template3_mapper.py +714 -0
- medical_transcription_retriever.py +284 -0
- models.py +38 -0
- post_processing.py +373 -0
- rapport_medical_final.txt +103 -0
- requirements.txt +15 -0
- run_test.py +105 -0
- sample.docx +0 -0
- sample.txt +3 -0
- save_matcher.py +1288 -0
- section_generator.py +156 -0
- sftp_agent.py +193 -0
- sftp_config.py +45 -0
- smart_match.py +668 -0
- template_ +0 -0
- template_analyser_llm.py +166 -0
- template_analyser_test.py +425 -0
- template_analyzer.py +86 -0
- template_db_creation.py +896 -0
- template_enrichi_mod.6272.mauberton.MODELE.RADIO_20250903_155139.txt +104 -0
- template_generator.py +348 -0
- template_rempli_mod.6272.mauberton.MODELE.RADIO.txt +56 -0
- test.py +588 -0
- test2.py +84 -0
- test_complete_pipeline.py +219 -0
- test_langfuse.py +32 -0
- testt.py +212 -0
- testt1.py +341 -0
- title_matcher.py +397 -0
- transcription_processor.py +120 -0
- transcription_processor_enhanced.py +533 -0
- type3_extract_entities.py +670 -0
- type3_preprocessing.py +739 -0
Create_db_optimized.py
ADDED
@@ -0,0 +1,657 @@

import os
import re
import json
import numpy as np
from typing import List, Dict, Any, Optional, Tuple, Union
from dataclasses import dataclass
from pathlib import Path

# Core libraries
import torch
from transformers import (
    AutoTokenizer, AutoModel, AutoModelForTokenClassification,
    TrainingArguments, Trainer, pipeline
)
from torch.utils.data import Dataset
import torch.nn.functional as F

# Vector database
import chromadb
from chromadb.config import Settings

# Utilities
import logging
from tqdm import tqdm
import pandas as pd

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class MedicalEntity:
    """Structure for medical entities extracted by NER"""
    exam_types: List[Tuple[str, float]]  # (entity, confidence)
    specialties: List[Tuple[str, float]]
    anatomical_regions: List[Tuple[str, float]]
    pathologies: List[Tuple[str, float]]
    medical_procedures: List[Tuple[str, float]]
    measurements: List[Tuple[str, float]]
    medications: List[Tuple[str, float]]
    symptoms: List[Tuple[str, float]]

class AdvancedMedicalNER:
    """Advanced medical NER based on fine-tuned CamemBERT-Bio"""

    def __init__(self, model_name: str = "auto", cache_dir: str = "./models_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

        # Auto-detect the best available medical NER model
        self.model_name = self._select_best_model(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # BIO labels for medical entities (must exist before the model is created)
        self.entity_labels = [
            "O",                             # Outside
            "B-EXAM", "I-EXAM",              # Exam types
            "B-SPECIALTY", "I-SPECIALTY",    # Medical specialties
            "B-ANATOMY", "I-ANATOMY",        # Anatomical regions
            "B-PATHOLOGY", "I-PATHOLOGY",    # Pathologies
            "B-PROCEDURE", "I-PROCEDURE",    # Medical procedures
            "B-MEASURE", "I-MEASURE",        # Measurements/values
            "B-MEDICATION", "I-MEDICATION",  # Medications
            "B-SYMPTOM", "I-SYMPTOM"         # Symptoms
        ]

        self.id2label = {i: label for i, label in enumerate(self.entity_labels)}
        self.label2id = {label: i for i, label in enumerate(self.entity_labels)}

        # Load the NER model
        self._load_ner_model()

    def _select_best_model(self, model_name: str) -> str:
        """Automatically select the best available medical NER model"""

        if model_name != "auto":
            return model_name

        # Models in order of preference
        preferred_models = [
            "almanach/camembert-bio-base",       # French CamemBERT Bio
            "Dr-BERT/DrBERT-7GB",                # Specialised DrBERT
            "emilyalsentzer/Bio_ClinicalBERT",   # Bio Clinical BERT
            "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
            "dmis-lab/biobert-base-cased-v1.2",  # BioBERT
            "camembert-base"                     # Fallback: standard CamemBERT
        ]

        for model in preferred_models:
            try:
                # Availability check
                AutoTokenizer.from_pretrained(model, cache_dir=self.cache_dir)
                logger.info(f"Selected model: {model}")
                return model
            except Exception:
                continue

        # Ultimate fallback
        logger.warning("Falling back to the base model camembert-base")
        return "camembert-base"

    def _load_ner_model(self):
        """Load or create the fine-tuned NER model"""

        fine_tuned_path = self.cache_dir / "medical_ner_model"

        if fine_tuned_path.exists():
            logger.info("Loading the existing fine-tuned NER model")
            self.tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path)
            self.ner_model = AutoModelForTokenClassification.from_pretrained(fine_tuned_path)
        else:
            logger.info("Creating a new medical NER model")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.cache_dir)

            # Token-classification (NER) model
            self.ner_model = AutoModelForTokenClassification.from_pretrained(
                self.model_name,
                num_labels=len(self.entity_labels),
                id2label=self.id2label,
                label2id=self.label2id,
                cache_dir=self.cache_dir
            )

        self.ner_model.to(self.device)

        # NER pipeline
        self.ner_pipeline = pipeline(
            "token-classification",
            model=self.ner_model,
            tokenizer=self.tokenizer,
            device=0 if torch.cuda.is_available() else -1,
            aggregation_strategy="simple"
        )

    def extract_entities(self, text: str) -> MedicalEntity:
        """Extract entities with the fine-tuned NER model"""

        # NER prediction
        try:
            ner_results = self.ner_pipeline(text)
        except Exception as e:
            logger.error(f"NER error: {e}")
            return MedicalEntity([], [], [], [], [], [], [], [])

        # Group entities by type
        entities = {
            "EXAM": [],
            "SPECIALTY": [],
            "ANATOMY": [],
            "PATHOLOGY": [],
            "PROCEDURE": [],
            "MEASURE": [],
            "MEDICATION": [],
            "SYMPTOM": []
        }

        for result in ner_results:
            entity_type = result['entity_group'].replace('B-', '').replace('I-', '')
            entity_text = result['word']
            confidence = result['score']

            if entity_type in entities and confidence > 0.7:  # Confidence threshold
                entities[entity_type].append((entity_text, confidence))

        return MedicalEntity(
            exam_types=entities["EXAM"],
            specialties=entities["SPECIALTY"],
            anatomical_regions=entities["ANATOMY"],
            pathologies=entities["PATHOLOGY"],
            medical_procedures=entities["PROCEDURE"],
            measurements=entities["MEASURE"],
            medications=entities["MEDICATION"],
            symptoms=entities["SYMPTOM"]
        )

    def fine_tune_on_templates(self, templates_data: List[Dict],
                               output_dir: str = None,
                               epochs: int = 3):
        """Fine-tune the NER model on medical templates"""

        if output_dir is None:
            output_dir = str(self.cache_dir / "medical_ner_model")

        logger.info("Starting NER fine-tuning on medical templates")

        # Prepare the training data
        # (Here one would use annotated templates or auto-annotation)
        train_dataset = self._prepare_training_data(templates_data)

        # Training configuration
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=100,
            weight_decay=0.01,
            logging_dir=f"{output_dir}/logs",
            save_strategy="epoch",
            evaluation_strategy="epoch" if train_dataset.get('eval') else "no",
            load_best_model_at_end=bool(train_dataset.get('eval')),  # only valid with an eval set
            metric_for_best_model="eval_loss" if train_dataset.get('eval') else None,
        )

        # Trainer
        trainer = Trainer(
            model=self.ner_model,
            args=training_args,
            train_dataset=train_dataset['train'],
            eval_dataset=train_dataset.get('eval'),
            tokenizer=self.tokenizer,
        )

        # Training
        trainer.train()

        # Save
        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)

        logger.info(f"Fine-tuning finished, model saved to {output_dir}")

    def _prepare_training_data(self, templates_data: List[Dict]) -> Dict:
        """Prepare the NER training data (smart auto-annotation)"""

        # This function could use auto-annotation techniques
        # or pre-existing medical datasets to create BIO labels.

        # For this example, return an empty dataset.
        # In production, one would use automatic annotation techniques
        # or annotated medical datasets such as QUAERO, CAS, etc.

        class EmptyDataset(Dataset):
            def __len__(self):
                return 0
            def __getitem__(self, idx):
                return {}

        return {'train': EmptyDataset()}

class AdvancedMedicalEmbedding:
    """Advanced medical embedding generator with cross-encoder reranking"""

    def __init__(self,
                 base_model: str = "almanach/camembert-bio-base",
                 cross_encoder_model: str = "auto"):

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.base_model_name = base_model

        # Main embedding model
        self._load_base_model()

        # Cross-encoder for reranking
        self._load_cross_encoder(cross_encoder_model)

    def _load_base_model(self):
        """Load the base embedding model"""
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.base_model_name)
            self.base_model = AutoModel.from_pretrained(self.base_model_name)
            self.base_model.to(self.device)
            logger.info(f"Base model loaded: {self.base_model_name}")
        except Exception as e:
            logger.error(f"Error loading the base model: {e}")
            raise

    def _load_cross_encoder(self, model_name: str):
        """Load the cross-encoder used for reranking"""

        if model_name == "auto":
            # Automatically select the best medical cross-encoder
            cross_encoders = [
                "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
                "emilyalsentzer/Bio_ClinicalBERT",
                self.base_model_name  # Fallback
            ]

            for model in cross_encoders:
                try:
                    self.cross_tokenizer = AutoTokenizer.from_pretrained(model)
                    self.cross_model = AutoModel.from_pretrained(model)
                    self.cross_model.to(self.device)
                    logger.info(f"Cross-encoder loaded: {model}")
                    break
                except Exception:
                    continue
        else:
            self.cross_tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.cross_model = AutoModel.from_pretrained(model_name)
            self.cross_model.to(self.device)

    def generate_embedding(self, text: str, entities: MedicalEntity = None) -> np.ndarray:
        """Generate an enriched embedding for a medical text"""

        # Tokenisation
        inputs = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(self.device)

        # Embedding generation
        with torch.no_grad():
            outputs = self.base_model(**inputs)

        # Mean pooling
        attention_mask = inputs['attention_mask']
        token_embeddings = outputs.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        embedding = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

        # Enrichment with NER entities
        if entities:
            embedding = self._enrich_with_ner_entities(embedding, entities)

        return embedding.cpu().numpy().flatten().astype(np.float32)

    def _enrich_with_ner_entities(self, base_embedding: torch.Tensor, entities: MedicalEntity) -> torch.Tensor:
        """Enrich the embedding with the entities extracted by NER"""

        # Concatenate the important entities with their confidence scores
        entity_texts = []
        confidence_weights = []

        for entity_list in [entities.exam_types, entities.specialties,
                            entities.anatomical_regions, entities.pathologies]:
            for entity_text, confidence in entity_list:
                entity_texts.append(entity_text)
                confidence_weights.append(confidence)

        if not entity_texts:
            return base_embedding

        # Generate embeddings for the entities
        entity_text_combined = " [SEP] ".join(entity_texts)
        entity_inputs = self.tokenizer(
            entity_text_combined,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            entity_outputs = self.base_model(**entity_inputs)
            entity_embedding = torch.mean(entity_outputs.last_hidden_state, dim=1)

        # Weighted fusion based on the confidence scores
        avg_confidence = np.mean(confidence_weights) if confidence_weights else 0.5
        fusion_weight = min(0.4, avg_confidence)  # At most 40% weight for the entities

        enriched_embedding = (1 - fusion_weight) * base_embedding + fusion_weight * entity_embedding

        return enriched_embedding

    def cross_encoder_rerank(self,
                             query: str,
                             candidates: List[Dict],
                             top_k: int = 3) -> List[Dict]:
        """Rerank with the cross-encoder to refine the selection"""

        if len(candidates) <= top_k:
            return candidates

        reranked_candidates = []

        for candidate in candidates:
            # Build the query-candidate pair
            pair_text = f"{query} [SEP] {candidate['document']}"

            # Tokenisation
            inputs = self.cross_tokenizer(
                pair_text,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt"
            ).to(self.device)

            # Cross-encoder similarity score
            with torch.no_grad():
                outputs = self.cross_model(**inputs)
                # Use the [CLS] token for the similarity score
                cls_embedding = outputs.last_hidden_state[:, 0, :]
                similarity_score = torch.sigmoid(torch.mean(cls_embedding)).item()

            candidate_copy = candidate.copy()
            candidate_copy['cross_encoder_score'] = similarity_score
            candidate_copy['final_score'] = (
                0.6 * candidate['similarity_score'] +
                0.4 * similarity_score
            )

            reranked_candidates.append(candidate_copy)

        # Sort by final score
        reranked_candidates.sort(key=lambda x: x['final_score'], reverse=True)

        return reranked_candidates[:top_k]

class MedicalTemplateVectorDB:
    """Vector database optimised for medical templates"""

    def __init__(self, db_path: str = "./medical_vector_db", collection_name: str = "medical_templates"):
        self.db_path = db_path
        self.collection_name = collection_name

        # ChromaDB with an optimised configuration
        self.client = chromadb.PersistentClient(
            path=db_path,
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )

        # Collection with an optimised distance metric
        try:
            self.collection = self.client.get_collection(collection_name)
            logger.info(f"Collection '{collection_name}' loaded")
        except Exception:
            self.collection = self.client.create_collection(
                name=collection_name,
                metadata={
                    "hnsw:space": "cosine",
                    "hnsw:M": 32,                 # Graph connectivity
                    "hnsw:ef_construction": 200,  # Build quality vs speed
                    "hnsw:ef_search": 50          # Search quality vs speed
                }
            )
            logger.info(f"Collection '{collection_name}' created with HNSW optimisations")

    def add_template(self,
                     template_id: str,
                     template_text: str,
                     embedding: np.ndarray,
                     entities: MedicalEntity,
                     metadata: Dict[str, Any] = None):
        """Add a template with NER-enriched metadata"""

        # Automatic metadata derived from NER
        auto_metadata = {
            "exam_types": [entity[0] for entity in entities.exam_types],
            "specialties": [entity[0] for entity in entities.specialties],
            "anatomical_regions": [entity[0] for entity in entities.anatomical_regions],
            "pathologies": [entity[0] for entity in entities.pathologies],
            "procedures": [entity[0] for entity in entities.medical_procedures],
            "text_length": len(template_text),
            "entity_confidence_avg": np.mean([
                entity[1] for entity_list in [
                    entities.exam_types, entities.specialties,
                    entities.anatomical_regions, entities.pathologies
                ] for entity in entity_list
            ]) if any([entities.exam_types, entities.specialties,
                       entities.anatomical_regions, entities.pathologies]) else 0.0
        }

        if metadata:
            auto_metadata.update(metadata)

        self.collection.add(
            embeddings=[embedding.tolist()],
            documents=[template_text],
            metadatas=[auto_metadata],
            ids=[template_id]
        )

        logger.info(f"Template {template_id} added with automatic NER metadata")

    def advanced_search(self,
                        query_embedding: np.ndarray,
                        n_results: int = 10,
                        entity_filters: Dict[str, List[str]] = None,
                        confidence_threshold: float = 0.0) -> List[Dict]:
        """Advanced search with filters based on NER entities"""

        where_clause = {}

        # Filters based on the extracted NER entities
        if entity_filters:
            for entity_type, entity_values in entity_filters.items():
                if entity_values:
                    where_clause[entity_type] = {"$in": entity_values}

        # Filter on the average entity confidence
        if confidence_threshold > 0:
            where_clause["entity_confidence_avg"] = {"$gte": confidence_threshold}

        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=n_results,
            where=where_clause if where_clause else None,
            include=["documents", "metadatas", "distances"]
        )

        # Format the results
        formatted_results = []
        for i in range(len(results['ids'][0])):
            formatted_results.append({
                'id': results['ids'][0][i],
                'document': results['documents'][0][i],
                'metadata': results['metadatas'][0][i],
                'similarity_score': 1 - results['distances'][0][i],
                'distance': results['distances'][0][i]
            })

        return formatted_results

class AdvancedMedicalTemplateProcessor:
    """Advanced processor with fine-tuned NER and cross-encoder reranking"""

    def __init__(self,
                 base_model: str = "almanach/camembert-bio-base",
                 db_path: str = "./advanced_medical_vector_db"):

        self.ner_extractor = AdvancedMedicalNER()
        self.embedding_generator = AdvancedMedicalEmbedding(base_model)
        self.vector_db = MedicalTemplateVectorDB(db_path)

        logger.info("Advanced medical processor initialised with fine-tuned NER and cross-encoder reranking")

    def process_templates_batch(self,
                                templates: List[Dict[str, str]],
                                batch_size: int = 8,
                                fine_tune_ner: bool = False) -> None:
        """Advanced processing with optional NER fine-tuning"""

        if fine_tune_ner:
            logger.info("Fine-tuning the NER model on the templates...")
            self.ner_extractor.fine_tune_on_templates(templates)

        logger.info(f"Advanced processing of {len(templates)} templates")

        for i in tqdm(range(0, len(templates), batch_size), desc="Advanced processing"):
            batch = templates[i:i+batch_size]

            for template in batch:
                try:
                    template_id = template['id']
                    template_text = template['text']
                    metadata = template.get('metadata', {})

                    # Advanced NER
                    entities = self.ner_extractor.extract_entities(template_text)

                    # Enriched embedding
                    embedding = self.embedding_generator.generate_embedding(template_text, entities)

                    # Storage with NER metadata
                    self.vector_db.add_template(
                        template_id=template_id,
                        template_text=template_text,
                        embedding=embedding,
                        entities=entities,
                        metadata=metadata
                    )

                except Exception as e:
                    logger.error(f"Error processing template {template.get('id', 'unknown')}: {e}")
                    continue

    def find_best_template_with_reranking(self,
                                          transcription: str,
                                          initial_candidates: int = 10,
                                          final_results: int = 3) -> List[Dict]:
        """Optimal search with cross-encoder reranking"""

        # 1. NER extraction on the transcription
        query_entities = self.ner_extractor.extract_entities(transcription)

        # 2. Enriched embedding generation
        query_embedding = self.embedding_generator.generate_embedding(transcription, query_entities)

        # 3. Automatic filters based on the extracted entities
        entity_filters = {}
        if query_entities.exam_types:
            entity_filters['exam_types'] = [entity[0] for entity in query_entities.exam_types]
        if query_entities.specialties:
            entity_filters['specialties'] = [entity[0] for entity in query_entities.specialties]
        if query_entities.anatomical_regions:
            entity_filters['anatomical_regions'] = [entity[0] for entity in query_entities.anatomical_regions]

        # 4. Initial vector search
        initial_candidates_results = self.vector_db.advanced_search(
            query_embedding=query_embedding,
            n_results=initial_candidates,
            entity_filters=entity_filters,
            confidence_threshold=0.6
        )

        # 5. Cross-encoder reranking
        if len(initial_candidates_results) > final_results:
            final_results_reranked = self.embedding_generator.cross_encoder_rerank(
                query=transcription,
                candidates=initial_candidates_results,
                top_k=final_results
            )
        else:
            final_results_reranked = initial_candidates_results

        # 6. Enrich the results with NER details
        for result in final_results_reranked:
            result['query_entities'] = {
                'exam_types': query_entities.exam_types,
                'specialties': query_entities.specialties,
                'anatomical_regions': query_entities.anatomical_regions,
                'pathologies': query_entities.pathologies
            }

        return final_results_reranked

# Advanced usage example
def main():
    """Example use of the advanced system"""

    # Initialise the advanced processor
    processor = AdvancedMedicalTemplateProcessor()

    # Process the templates, with optional fine-tuning
    sample_templates = [
        {
            'id': 'angio_001',
            'text': """Échographie et doppler artério-veineux des membres inférieurs.
            Exploration de l'incontinence veineuse superficielle...""",
            'metadata': {'source': 'angiologie', 'version': '2024'}
        }
    ]

    # Processing without NER fine-tuning
    processor.process_templates_batch(sample_templates, fine_tune_ner=False)

    # Search with reranking
    transcription = """madame bacon nicole bilan œdème droit gonalgies ostéophytes
    incontinence veineuse modérée portions surale droite crurale gauche saphéniennes"""

    best_matches = processor.find_best_template_with_reranking(
        transcription=transcription,
        initial_candidates=15,
        final_results=3
    )

    # Display the results
    for i, match in enumerate(best_matches):
        print(f"\n=== Match {i+1} ===")
        print(f"Template ID: {match['id']}")
        print(f"Final score: {match.get('final_score', match['similarity_score']):.4f}")
        print(f"Cross-encoder score: {match.get('cross_encoder_score', 'N/A')}")
        print("Entities detected in the query:")
        for entity_type, entities in match.get('query_entities', {}).items():
            if entities:
                print(f"  - {entity_type}: {[f'{e[0]} ({e[1]:.2f})' for e in entities]}")

if __name__ == "__main__":
    main()
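A note on the `_prepare_training_data` placeholder above: the script ships an empty dataset and defers BIO labelling to auto-annotation. A minimal sketch of what that step could look like, assuming the `{"text": ..., "labels": {...}}` records written by annotation.py below and naive whitespace matching; everything here is illustrative, not part of this commit:

# Hypothetical sketch: derive BIO tags from annotation.py's JSON-lines records
# by substring matching; a real pipeline would align on tokenizer offsets.
import json

FIELD_TO_TAG = {"exam_types": "EXAM", "specialties": "SPECIALTY",
                "anatomical_regions": "ANATOMY", "pathologies": "PATHOLOGY"}

def record_to_bio(record):
    tokens = record["text"].split()
    tags = ["O"] * len(tokens)
    for field, tag in FIELD_TO_TAG.items():
        for entity in record["labels"].get(field, []):
            ent = str(entity).lower().split()
            for i in range(len(tokens) - len(ent) + 1):
                window = [t.lower().strip(".,;:") for t in tokens[i:i + len(ent)]]
                if window == ent and all(t == "O" for t in tags[i:i + len(ent)]):
                    tags[i] = "B-" + tag
                    for j in range(i + 1, i + len(ent)):
                        tags[j] = "I-" + tag
    return tokens, tags

with open("dataset.json", encoding="utf-8") as f:  # JSON-lines, one record per line
    examples = [record_to_bio(json.loads(line)) for line in f if line.strip()]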
README.md
CHANGED
@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: red
-colorTo: indigo
+title: medical-agent
+app_file: app2.py
 sdk: gradio
-sdk_version: 5.
-app_file: app.py
-pinned: false
+sdk_version: 5.47.2
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
annotation.py
ADDED
@@ -0,0 +1,486 @@

import os
from dotenv import load_dotenv
from openai import AzureOpenAI
import json

load_dotenv()

AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")  # deployment name
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")

# Configure OpenAI for Azure
client = AzureOpenAI(
    api_key=AZURE_OPENAI_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT
)

def extract_medical_entities(text: str) -> dict:
    prompt = f"""You are a medical NER expert. Your task is to extract relevant entities from the given medical report text and return them in a JSON object.

Analyze the text carefully and identify the following fields:

- "exam_types": any type of medical test, examination, or diagnostic method performed on the patient.
- "specialties": the branch of medicine or medical discipline relevant to the report.
- "anatomical_regions": specific parts or regions of the body mentioned in the report.
- "pathologies": diagnosed diseases, disorders, or abnormal medical conditions noted in the report.
- "procedures": medical interventions, treatments, or actions performed on the patient.
- "measurements": numerical values or quantities recorded in the report, such as vital signs, lab results, sizes, or pressures.
- "medications": drugs, therapies, or prescribed substances mentioned in the report.
- "symptoms": patient-experienced signs or observable indications of a health issue.

Text to analyze:
\"\"\"
{text}
\"\"\"

Return ONLY a valid JSON object with all fields. If a field has no values, return an empty list.
"""

    response = client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=[{"role": "user", "content": prompt}],
        # temperature=0,
        # max_tokens=1024
    )

    content = response.choices[0].message.content
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        return {
            "exam_types": [],
            "specialties": [],
            "anatomical_regions": [],
            "pathologies": [],
            "procedures": [],
            "measurements": [],
            "medications": [],
            "symptoms": []
        }

def save_annotation(text: str, labels: dict, output_file="dataset.jsonl"):
    record = {
        "text": text,
        "labels": labels
    }
    # Append as one line of JSON
    with open(output_file, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    input_folder = "data_txt"  # 📂 folder containing your .txt files
    output_file = "dataset.json"

    # Ensure the output file is empty before starting
    open(output_file, "w", encoding="utf-8").close()

    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_folder, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                transcription = f.read().strip()

            print(f"\n=== Processing {filename} ===")
            entities = extract_medical_entities(transcription)

            # Save results
            save_annotation(transcription, entities, output_file=output_file)

            print(f"✅ Saved {filename} → {output_file}")
"""


if __name__ == "__main__":
    input_folder = "data_txt"  # 📂 folder containing your .txt files
    output_file = "dataset.json"

    # List of files to exclude
    excluded_files = {
        "template7.txt", "template1167.txt", "template429.txt", "template401.txt", "template367.txt",
        "template415.txt", "template398.txt", "template1198.txt", "template159.txt", "template165.txt",
        "template1107.txt", "template449.txt", "template1113.txt", "template313.txt", "template475.txt",
        "template461.txt", "template307.txt", "template893.txt", "template139.txt", "template887.txt",
        "template677.txt", "template111.txt", "template105.txt", "template663.txt", "template688.txt",
        "template850.txt", "template844.txt", "template878.txt", "template16.txt", "template703.txt",
        "template717.txt", "template924.txt", "template930.txt", "template918.txt", "template1073.txt",
        "template529.txt", "template1067.txt", "template267.txt", "template501.txt", "template515.txt",
        "template273.txt", "template298.txt", "template1098.txt", "template1099.txt", "template299.txt",
        "template514.txt", "template272.txt", "template266.txt", "template500.txt", "template528.txt",
        "template1066.txt", "template1072.txt", "template919.txt", "template931.txt", "template925.txt",
        "template716.txt", "template702.txt", "template879.txt", "template845.txt", "template851.txt",
        "template689.txt", "template104.txt", "template662.txt", "template676.txt", "template110.txt",
        "template138.txt", "template886.txt", "template892.txt", "template460.txt", "template306.txt",
        "template312.txt", "template474.txt", "template1112.txt", "template1106.txt", "template448.txt",
        "template338.txt", "template1110.txt", "template1104.txt", "template304.txt", "template462.txt",
        "template476.txt", "template310.txt", "template1138.txt", "template489.txt", "template884.txt",
        "template890.txt", "template648.txt", "template660.txt", "template106.txt", "template112.txt",
        "template674.txt", "template847.txt", "template853.txt", "template728.txt", "template15.txt",
        "template714.txt", "template29.txt", "template700.txt", "template933.txt", "template927.txt",
        "template1064.txt", "template1070.txt", "template258.txt", "template1058.txt", "template270.txt",
        "template516.txt", "template502.txt", "template264.txt", "template503.txt", "template265.txt",
        "template271.txt", "template1059.txt", "template517.txt", "template259.txt", "template1071.txt",
        "template1065.txt", "template926.txt", "template932.txt", "template701.txt", "template715.txt",
        "template28.txt", "template729.txt", "template14.txt", "template852.txt", "template846.txt",
        "template113.txt", "template675.txt", "template661.txt", "template107.txt", "template649.txt",
        "template891.txt", "template885.txt", "template488.txt", "template477.txt", "template1139.txt",
        "template311.txt", "template305.txt", "template463.txt", "template1105.txt", "template1111.txt",
        "template339.txt", "template467.txt", "template1129.txt", "template301.txt", "template315.txt",
        "template473.txt", "template1115.txt", "template1101.txt", "template329.txt", "template498.txt",
        "template103.txt", "template665.txt", "template671.txt", "template117.txt", "template881.txt",
        "template659.txt", "template895.txt", "template842.txt", "template856.txt", "template711.txt",
        "template705.txt", "template38.txt", "template10.txt", "template739.txt", "template936.txt",
        "template922.txt", "template513.txt", "template275.txt", "template261.txt", "template1049.txt",
        "template507.txt", "template249.txt", "template1061.txt", "template1075.txt", "template1074.txt",
        "template1060.txt", "template248.txt", "template1048.txt", "template260.txt", "template506.txt",
        "template512.txt", "template274.txt", "template923.txt", "template937.txt", "template738.txt",
        "template11.txt", "template704.txt", "template710.txt", "template857.txt", "template843.txt",
        "template894.txt", "template658.txt", "template880.txt", "template670.txt", "template116.txt",
        "template102.txt", "template664.txt", "template499.txt", "template328.txt", "template1100.txt",
        "template1114.txt", "template314.txt", "template472.txt", "template466.txt", "template300.txt",
        "template1128.txt", "template470.txt", "template316.txt", "template302.txt", "template464.txt",
        "template1102.txt", "template1116.txt", "template458.txt", "template114.txt", "template672.txt",
        "template666.txt", "template100.txt", "template128.txt", "template896.txt", "template882.txt",
        "template869.txt", "template855.txt", "template699.txt", "template841.txt", "template706.txt",
        "template712.txt", "template13.txt", "template909.txt", "template921.txt", "template935.txt",
        "template504.txt", "template262.txt", "template276.txt", "template510.txt", "template538.txt",
        "template1076.txt", "template1062.txt", "template1089.txt", "template289.txt", "template288.txt",
        "template1088.txt", "template1063.txt", "template539.txt", "template1077.txt", "template277.txt",
        "template511.txt", "template505.txt", "template263.txt", "template934.txt", "template920.txt",
        "template908.txt", "template12.txt", "template713.txt", "template707.txt", "template840.txt",
        "template698.txt", "template854.txt", "template868.txt", "template883.txt", "template129.txt",
        "template897.txt", "template667.txt", "template101.txt", "template115.txt", "template673.txt",
        "template1117.txt", "template459.txt", "template1103.txt", "template303.txt", "template465.txt",
        "template471.txt", "template317.txt", "template4.txt", "template1164.txt", "template1170.txt",
        "template358.txt", "template416.txt", "template1158.txt", "template370.txt", "template364.txt",
        "template402.txt", "template628.txt", "template172.txt", "template614.txt", "template600.txt",
        "template166.txt", "template833.txt", "template827.txt", "template199.txt", "template61.txt",
        "template1212.txt", "template984.txt", "template748.txt", "template990.txt", "template75.txt",
        "template1206.txt", "template760.txt", "template774.txt", "template49.txt", "template947.txt",
        "template953.txt", "template238.txt", "template1010.txt", "template1004.txt", "template562.txt",
        "template204.txt", "template210.txt", "template1038.txt", "template576.txt", "template589.txt",
        "template588.txt", "template1039.txt", "template211.txt", "template577.txt", "template563.txt",
        "template205.txt", "template1005.txt", "template1011.txt", "template239.txt", "template952.txt",
        "template946.txt", "template775.txt", "template48.txt", "template761.txt", "template991.txt",
        "template749.txt", "template1207.txt", "template74.txt", "template1213.txt", "template60.txt",
        "template985.txt", "template826.txt", "template198.txt", "template832.txt", "template601.txt",
        "template167.txt", "template173.txt", "template615.txt", "template629.txt", "template365.txt",
        "template403.txt", "template417.txt", "template371.txt", "template1159.txt", "template359.txt",
        "template1171.txt", "template1165.txt", "template5.txt", "template1173.txt", "template373.txt"
    }

    # Ensure the output file is empty before starting
    open(output_file, "w", encoding="utf-8").close()

    processed_count = 0
    excluded_count = 0

    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            # Check whether the file is on the exclusion list
            if filename in excluded_files:
                print(f"⏭️ Excluded file: {filename}")
                excluded_count += 1
                continue

            file_path = os.path.join(input_folder, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                transcription = f.read().strip()

            print(f"\n=== Processing {filename} ===")
            entities = extract_medical_entities(transcription)

            # Save results
            save_annotation(transcription, entities, output_file=output_file)

            print(f"✅ Saved {filename} → {output_file}")
            processed_count += 1

    print(f"\n📊 Summary: {processed_count} files processed, {excluded_count} files excluded")
"""
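One fragility in `extract_medical_entities` above: any reply the model wraps in a Markdown fence fails `json.loads` and silently degrades to empty lists. A small illustrative sketch of a more tolerant parser, an assumption rather than what this commit does:

# Hypothetical sketch: unwrap Markdown fences and isolate the outermost
# {...} block before decoding, so chatty or fenced replies still parse.
import json
import re

def parse_entities(content):
    content = re.sub(r"^```(?:json)?\s*|\s*```$", "", content.strip())
    match = re.search(r"\{.*\}", content, re.DOTALL)
    if match:
        content = match.group(0)
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        return None  # let the caller decide how to handle a failed parse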
app.py
ADDED
@@ -0,0 +1,53 @@

#!/usr/bin/env python3
"""
Gradio interface: medical NER agent + mapper
Input transcription → extraction → mapping → report
"""
import gradio as gr
from type3_extract_entities import MedicalNERAgent
from medical_template3_mapper import MedicalTemplateMapper
from type3_preprocessing import MedicalTranscriptionProcessor, AZURE_OPENAI_DEPLOYMENT
from post_processing import post_process_medical_report

def process_transcription(transcription: str):
    try:
        # Step 1: ASR correction
        processor = MedicalTranscriptionProcessor(AZURE_OPENAI_DEPLOYMENT)
        result = processor.process_transcription(transcription)
        corrected_transcription = result.final_corrected_text

        # Step 2: extraction
        agent = MedicalNERAgent()
        extracted_data = agent.extract_medical_entities(corrected_transcription)
        extraction_report = agent.print_extraction_report(extracted_data)

        # Step 3: mapping onto the template
        mapper = MedicalTemplateMapper()
        mapping_result = mapper.map_extracted_data_to_template(extracted_data)
        # mapping_report = mapper.print_mapping_report(mapping_result)
        mapping_report = mapper.template

        # Step 4: filled final report
        rapport_final = mapping_result.filled_template

        # Step 5: report clean-up
        cleaned_report = post_process_medical_report(rapport_final)

        return corrected_transcription, extraction_report, mapping_report, cleaned_report
    except Exception as e:
        # One value per declared output, even on the error path
        return f"Erreur: {e}", "", "", ""

# Gradio interface
demo = gr.Interface(
    fn=process_transcription,
    inputs=gr.Textbox(lines=15, label="Transcription médicale"),
    outputs=[
        gr.Textbox(lines=20, label="🔬 Correction de la transcription"),
        gr.Textbox(lines=20, label="📋 Extraction structurée"),
        gr.Textbox(lines=20, label="📋 Rapport à remplir (Mapping)"),
        gr.Textbox(lines=20, label="✅ Compte-rendu structuré final"),
    ],
    title="🏥 Génération de comptes-rendus structurés",
)

if __name__ == "__main__":
    demo.launch(share=True)
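The wiring in app.py relies on a Gradio contract worth making explicit: `fn` must return exactly one value per declared output, on the error path included. A self-contained stub of the same pattern; the labels and stub logic are invented for illustration:

# Minimal runnable sketch of the app.py pattern with a stub pipeline.
import gradio as gr

def stub_pipeline(transcription: str):
    try:
        corrected = transcription.strip()        # stands in for ASR correction
        extraction = f"{len(corrected)} chars"   # stands in for entity extraction
        report = corrected.upper()               # stands in for the filled report
        return corrected, extraction, report
    except Exception as e:
        return f"Error: {e}", "", ""             # still one value per output

demo = gr.Interface(
    fn=stub_pipeline,
    inputs=gr.Textbox(lines=5, label="Transcription"),
    outputs=[
        gr.Textbox(label="Corrected"),
        gr.Textbox(label="Extraction"),
        gr.Textbox(label="Report"),
    ],
)

if __name__ == "__main__":
    demo.launch()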
app2.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env python3
"""
Gradio interface: medical transcription → correction → matching → report
"""
import gradio as gr
import sys
import os

# Workaround for the pickle issue with TemplateInfo (illustrated after this file)
import template_db_creation
sys.modules['__main__'].TemplateInfo = template_db_creation.TemplateInfo

from template_db_creation import MedicalTemplateParser
from smart_match import TranscriptionMatcher
from correcteur import MedicalTranscriptionProcessor, AZURE_OPENAI_DEPLOYMENT

# Hardcoded path to the template database
DB_PATH = "/Users/macbook/medical-agent/medical-agent/templates/medical_templates.pkl"

# Global state, filled in at startup
parser = None
matcher = None

def initialize_system():
    """Initializes the system at startup"""
    global parser, matcher

    try:
        print(f"📂 Chargement de la base de données: {DB_PATH}")
        parser = MedicalTemplateParser()
        parser.load_database(DB_PATH)

        matcher = TranscriptionMatcher(parser)

        print(f"✅ Système initialisé avec {len(parser.templates)} templates")
        return True
    except Exception as e:
        print(f"❌ Erreur initialisation: {e}")
        return False

def process_transcription(transcription: str):
    """
    Runs the full medical transcription pipeline

    Args:
        transcription: transcription text

    Returns:
        Tuple (corrected transcription, empty template, final report)
    """
    try:
        # Step 1: ASR correction
        print("🔧 Étape 1: Correction de la transcription...")
        processor = MedicalTranscriptionProcessor(AZURE_OPENAI_DEPLOYMENT)
        result = processor.process_transcription(transcription)
        corrected_transcription = result.final_corrected_text

        # Step 2: template matching and filling
        print("🔍 Étape 2: Recherche du template approprié...")
        results = matcher.match_and_fill(corrected_transcription, return_top_k=1)

        if not results:
            return (
                corrected_transcription,
                "❌ Aucun template approprié trouvé",
                "❌ Impossible de générer le rapport"
            )

        best_result = results[0]

        # Build the empty template, with a header naming the template
        template_vide = f"{best_result.template_id}\n"
        template_vide += "=" * len(best_result.template_id) + "\n"
        template_vide += best_result.template_content

        # Build the final filled report with the same header
        rapport_final = f"{best_result.template_id}\n"
        rapport_final += "=" * len(best_result.template_id) + "\n"

        # Append all filled sections
        rapport_final += best_result.filled_template

        print(f"✅ Traitement terminé - Template: {best_result.template_id}")

        return corrected_transcription, template_vide, rapport_final

    except Exception as e:
        error_msg = f"❌ Erreur: {str(e)}"
        print(error_msg)
        return error_msg, "", ""

# Initialize the system at startup
print("🚀 Initialisation du système...")
if not initialize_system():
    print("⚠️ Erreur lors de l'initialisation - vérifiez le chemin de la DB")

# Gradio interface
demo = gr.Interface(
    fn=process_transcription,
    inputs=gr.Textbox(
        lines=15,
        label="📝 Transcription médicale",
        placeholder="Collez ici la transcription de l'examen médical..."
    ),
    outputs=[
        gr.Textbox(lines=20, label="✅ Transcription corrigée", show_copy_button=True),
        gr.Textbox(lines=20, label="📋 Rapport à remplir (Template)", show_copy_button=True),
        gr.Textbox(lines=20, label="📄 Compte-rendu structuré final", show_copy_button=True),
    ],
    title="🏥 Génération de comptes-rendus structurés",
)

if __name__ == "__main__":
    demo.launch(share=True)
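The `sys.modules['__main__'].TemplateInfo = ...` line above exists because the template database was pickled from a script where TemplateInfo was defined at the top level of `__main__`: pickle stores classes by module path, so loading the .pkl from any other entry point fails unless the name is re-exposed. A self-contained illustration of both the failure and the fix, using a hypothetical stand-in class (run as a script):

import pickle
import sys

class TemplateInfo:  # hypothetical stand-in for template_db_creation.TemplateInfo
    def __init__(self, name):
        self.name = name

blob = pickle.dumps(TemplateInfo("mod.6272"))  # recorded as __main__.TemplateInfo

main_module = sys.modules["__main__"]
saved = TemplateInfo            # keep a handle before hiding the global name
del main_module.TemplateInfo    # simulate unpickling from a different entry point
try:
    pickle.loads(blob)
except AttributeError as err:
    print("unpickling fails:", err)

main_module.TemplateInfo = saved  # the app2.py workaround: re-expose the class
print(pickle.loads(blob).name)    # -> mod.6272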
correcteur.py
ADDED
@@ -0,0 +1,751 @@
import spacy
import openai
import re
from typing import Dict, List, Tuple
import json
from dataclasses import dataclass
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import os
from dotenv import load_dotenv
from openai import AzureOpenAI
from medkit.core.text import TextDocument
from medkit.text.ner.hf_entity_matcher import HFEntityMatcher

# Load environment variables from .env before reading any of them
# (NER_MODEL must be read after load_dotenv() or the .env value is ignored)
load_dotenv()

NER_MODEL = os.getenv("NER_MODEL", "medkit/DrBERT-CASM2")

# Azure OpenAI settings
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-05-01-preview")

# Environment-variable validation
def validate_azure_config():
    """Checks that all Azure variables are configured"""
    missing_vars = []
    if not AZURE_OPENAI_KEY:
        missing_vars.append("AZURE_OPENAI_KEY")
    if not AZURE_OPENAI_ENDPOINT:
        missing_vars.append("AZURE_OPENAI_ENDPOINT")
    if not AZURE_OPENAI_DEPLOYMENT:
        missing_vars.append("AZURE_OPENAI_DEPLOYMENT")

    if missing_vars:
        print(f"❌ Variables d'environnement manquantes: {', '.join(missing_vars)}")
        print("📝 Veuillez créer un fichier .env avec:")
        for var in missing_vars:
            print(f"   {var}=votre_valeur")
        return False
    return True

# Azure OpenAI client, created only if the configuration is valid
azure_client = None
if validate_azure_config():
    try:
        azure_client = AzureOpenAI(
            api_key=AZURE_OPENAI_KEY,
            api_version=AZURE_OPENAI_API_VERSION,
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
        )
        print("✅ Client Azure OpenAI initialisé avec succès")
    except Exception as e:
        print(f"❌ Erreur lors de l'initialisation du client Azure OpenAI: {e}")
        azure_client = None

ner_matcher = HFEntityMatcher(model=NER_MODEL)

@dataclass
class CorrectionResult:
    original_text: str
    ner_corrected_text: str
    final_corrected_text: str
    medical_entities: List[Dict]
    confidence_score: float

class MedicalNERCorrector:
    """Spelling corrector backed by a French medical NER model"""

    def __init__(self):
        try:
            # Load the MedKit NER model
            self.matcher = HFEntityMatcher(model=NER_MODEL)
            print(f"✅ Modèle NER '{NER_MODEL}' chargé avec succès")
        except Exception as e:
            print(f"❌ Erreur lors du chargement du modèle NER {NER_MODEL}: {e}")
            self.matcher = None

        # Dictionary used to convert spelled-out ordinals to digits
        self.number_corrections = {
            # Common variants found in voice transcriptions
            "1": "1", "1er": "1", "première": "1", "premier": "1",
            "2ème": "2", "deuxième": "2", "second": "2", "seconde": "2",
            "3ème": "3", "troisième": "3", "4ème": "4", "quatrième": "4",
            "5ème": "5", "cinquième": "5", "6ème": "6", "sixième": "6",
            "7ème": "7", "septième": "7", "8ème": "8", "huitième": "8",
            "9ème": "9", "neuvième": "9", "10ème": "10", "dixième": "10",
        }

        # Correction dictionary for voice transcriptions - ORDER MATTERS
        # (kept for reference; correct_vocal_transcription applies its own ordered list)
        self.vocal_corrections = {
            # Punctuation corrections - must be handled first
            "point à la ligne": ".\n",
            "retour à la ligne": "\n",
            "à la ligne": "\n",
            "nouvelle ligne": "\n",
            "saut de ligne": "\n",
            "point virgule": ";",
            "deux points": ":",
            "point d'interrogation": "?",
            "point d'exclamation": "!",
            "virgule": ",",
            "point": ".",  # Must be handled last to avoid conflicts

            # MRI sequence corrections
            "T un": "T1", "T deux": "T2", "T trois": "T3",
            "t un": "T1", "t deux": "T2", "t trois": "T3",
            "séquence T un": "séquence T1", "séquence T deux": "séquence T2",

            # Vertebral levels - cervical
            "C un": "C1", "C deux": "C2", "C trois": "C3", "C quatre": "C4",
            "C cinq": "C5", "C six": "C6", "C sept": "C7",
            "c un": "C1", "c deux": "C2", "c trois": "C3", "c quatre": "C4",
            "c cinq": "C5", "c six": "C6", "c sept": "C7",

            # Thoracic levels ("T un".."T trois" already listed above)
            "T quatre": "T4",
            "T cinq": "T5", "T six": "T6", "T sept": "T7", "T huit": "T8",
            "T neuf": "T9", "T dix": "T10", "T onze": "T11", "T douze": "T12",

            # Lumbar levels
            "L un": "L1", "L deux": "L2", "L trois": "L3", "L quatre": "L4", "L cinq": "L5",
            "l un": "L1", "l deux": "L2", "l trois": "L3", "l quatre": "L4", "l cinq": "L5",

            # Sacral levels
            "S un": "S1", "S deux": "S2", "S trois": "S3", "S quatre": "S4", "S cinq": "S5",
            "s un": "S1", "s deux": "S2", "s trois": "S3", "s quatre": "S4", "s cinq": "S5",
        }

        # Dictionary of specialised medical spelling corrections
        self.medical_corrections = {
            # Anatomy
            "rachis": ["rachis", "rachi", "rachys", "rahis", "raxis"],
            "cervical": ["cervical", "cervicale", "cervicaux", "servical", "servicale"],
            "vertébraux": ["vertébraux", "vertebraux", "vertébrau", "vertébral", "vertebral"],
            "médullaire": ["médullaire", "medullaire", "medulaire", "médulaire"],
            "foraminal": ["foraminal", "foraminale", "foraminaux", "forraminal"],
            "postérolatéral": ["postérolatéral", "posterolatéral", "postero-latéral", "postero latéral"],
            "antérolatéral": ["antérolatéral", "anterolatéral", "antero-latéral", "antero latéral"],
            "longitudinal": ["longitudinal", "longitudinale", "longitudinaux"],

            # Pathologies
            "uncarthrose": ["uncarthrose", "uncoarthrose", "uncartrose", "unkarthrose"],
            "lordose": ["lordose", "lordoze", "lordosse"],
            "cyphose": ["cyphose", "siphose", "kyphose", "kiphose"],
            "scoliose": ["scoliose", "skoliose", "scholiose"],
            "discopathie": ["discopathie", "disccopathie", "discopatie"],
            "discal": ["discal", "discale", "diskal", "diskale", "disque"],
            "hernie": ["hernie", "herny", "herni"],
            "protrusion": ["protrusion", "protusion", "protruzion"],
            "sténose": ["sténose", "stenose", "sténoze"],
            "arthrose": ["arthrose", "artrose", "arthroze"],
            "ostéophyte": ["ostéophyte", "osteophyte", "ostéofite"],
            "ligamentaire": ["ligamentaire", "ligamentere", "ligamentair"],

            # Techniques and examinations
            "sagittal": ["sagittal", "sagittale", "sagital", "sagittaux"],
            "coronal": ["coronal", "coronale", "coronaux"],
            "axial": ["axial", "axiale", "axiaux"],
            "transversal": ["transversal", "transversale", "transversaux"],
            "pondéré": ["pondéré", "pondere", "pondérée"],
            "séquence": ["séquence", "sequence", "sekence"],
            "contraste": ["contraste", "kontraste"],
            "gadolinium": ["gadolinium", "gadoliniun", "gadoliniom"],

            # Measurements and directions
            "millimètre": ["millimètre", "millimetre", "mm"],
            "centimètre": ["centimètre", "centimetre", "cm"],
            "gauche": ["gauche", "gosh", "goshe", "goche"],
            "droite": ["droite", "droitte", "droithe", "droitr"],
            "antérieur": ["antérieur", "anterieur", "antérieure", "anterieure"],
            "postérieur": ["postérieur", "posterieur", "postérieure", "posterieure"],
            "supérieur": ["supérieur", "superieur", "supérieure", "superieure"],
            "inférieur": ["inférieur", "inferieur", "inférieure", "inferieure"],
            "médian": ["médian", "median", "mediane", "médiane"],
            "latéral": ["latéral", "lateral", "laterale", "latérale"],

            # Signals and appearance
            "signal": ["signal", "signale", "signa", "signaux"],
            "hypersignal": ["hypersignal", "hyper signal", "hypersignale"],
            "hyposignal": ["hyposignal", "hypo signal", "hyposignale"],
            "isosignal": ["isosignal", "iso signal", "isosignale"],
            "hétérogène": ["hétérogène", "heterogene"],
            "homogène": ["homogène", "homogene", "omogene"],

            # Other frequent terms
            "dimension": ["dimension", "dimention", "dimmension"],
            "normale": ["normale", "normal", "normalle"],
            "anomalie": ["anomalie", "annomalie", "anomaly"],
            "décelable": ["décelable", "decelabl", "décellabl"],
            "absence": ["absence", "abscence", "absance"],
            "présence": ["présence", "presence", "presance"],
            "contact": ["contact", "contacte", "kontak"],
            "compression": ["compression", "compresion", "kompression"],
        }

        # Regular expressions for medical patterns
        self.medical_patterns = {
            "vertebral_level": r"[CTLS]\d+[\s-]*[CTLS]\d+",
            "measurement": r"\d+[\s]*[x×]\s*\d+\s*mm",
            "technique": r"T[1-3]",
        }

    def convert_numbers_to_digits(self, text: str) -> str:
        """Converts spelled-out numbers to digits"""
        corrected_text = text

        # STEP 1: special handling of common medical measurements
        medical_measures = {
            # Common uterus measurements
            "sept point huit": "7,8",
            "trois sept": "3,7",
            "soixante douze": "72",
            "soixante treize": "73",
            "soixante quatorze": "74",
            "soixante quinze": "75",
            "soixante seize": "76",
            "soixante dix sept": "77",
            "soixante dix huit": "78",
            "soixante dix neuf": "79",

            # Common ovary measurements
            "vingt six": "26",
            "vingt cinq": "25",
            "dix neuf": "19",
            "vingt deux": "22",

            # Doppler measurements
            "trois vingt quatre": "3,24",  # IP
            "quatre vingt onze": "0,91",   # IR (decimal comma)

            # Other common measurements
            "quinze": "15",
        }

        # Apply the medical measurements first
        for word_measure, digit_measure in medical_measures.items():
            pattern = r'\b' + re.escape(word_measure) + r'\b'
            corrected_text = re.sub(pattern, digit_measure, corrected_text, flags=re.IGNORECASE)

        # STEP 2: remaining compound numbers
        compound_without_dash = {
            "vingt un": "21", "vingt deux": "22", "vingt trois": "23", "vingt quatre": "24",
            "vingt cinq": "25", "vingt six": "26", "vingt sept": "27", "vingt huit": "28",
            "vingt neuf": "29", "trente un": "31", "trente deux": "32", "trente trois": "33",
            "trente quatre": "34", "trente cinq": "35", "trente six": "36", "trente sept": "37",
            "trente huit": "38", "trente neuf": "39", "quarante un": "41", "quarante deux": "42",
            "quarante trois": "43", "quarante quatre": "44", "quarante cinq": "45",
            "quarante six": "46", "quarante sept": "47", "quarante huit": "48", "quarante neuf": "49",
            "cinquante un": "51", "cinquante deux": "52", "cinquante trois": "53",
            "cinquante quatre": "54", "cinquante cinq": "55", "cinquante six": "56",
            "cinquante sept": "57", "cinquante huit": "58", "cinquante neuf": "59",
            "soixante un": "61", "soixante deux": "62", "soixante trois": "63",
            "soixante quatre": "64", "soixante cinq": "65", "soixante six": "66",
            "soixante sept": "67", "soixante huit": "68", "soixante neuf": "69",
            "soixante et onze": "71", "soixante douze": "72", "soixante treize": "73",
            "soixante quatorze": "74", "soixante quinze": "75", "soixante seize": "76",
            "soixante dix sept": "77", "soixante dix huit": "78", "soixante dix neuf": "79",
            "quatre vingt un": "81", "quatre vingt deux": "82", "quatre vingt trois": "83",
            "quatre vingt quatre": "84", "quatre vingt cinq": "85", "quatre vingt six": "86",
            "quatre vingt sept": "87", "quatre vingt huit": "88", "quatre vingt neuf": "89",
            "quatre vingt onze": "91", "quatre vingt douze": "92", "quatre vingt treize": "93",
            "quatre vingt quatorze": "94", "quatre vingt quinze": "95", "quatre vingt seize": "96",
            "quatre vingt dix sept": "97", "quatre vingt dix huit": "98", "quatre vingt dix neuf": "99",
        }

        for word, digit in compound_without_dash.items():
            # Guard: do NOT replace when followed by "fois" plus another number
            pattern = r'\b' + re.escape(word) + r'\b(?!\s+fois\s+\w+)'
            corrected_text = re.sub(pattern, digit, corrected_text, flags=re.IGNORECASE)

        # STEP 3: simple numbers (order chosen to avoid conflicts;
        # "un" is deliberately excluded so articles are not turned into "1")
        simple_numbers = {
            "zéro": "0", "deux": "2", "trois": "3", "quatre": "4",
            "cinq": "5", "six": "6", "sept": "7", "huit": "8", "neuf": "9",
            "dix": "10", "onze": "11", "douze": "12", "treize": "13", "quatorze": "14",
            "quinze": "15", "seize": "16", "dix-sept": "17", "dix-huit": "18",
            "dix-neuf": "19", "vingt": "20", "trente": "30", "quarante": "40",
            "cinquante": "50", "soixante-dix": "70",
            "quatre-vingts": "80", "quatre-vingt": "80", "quatre-vingt-dix": "90",
            "cent": "100", "mille": "1000",
        }

        # Convert the simple numbers
        for word_number, digit in simple_numbers.items():
            pattern = r'\b' + re.escape(word_number) + r'\b'
            corrected_text = re.sub(pattern, digit, corrected_text, flags=re.IGNORECASE)

        corrected_text = re.sub(r'\bpour\s+cent\b', '%', corrected_text, flags=re.IGNORECASE)

        return corrected_text

    def extract_medical_entities(self, text: str):
        """Extracts medical entities with the MedKit HFEntityMatcher"""
        if not self.matcher:
            return []
        doc = TextDocument(text)
        entities = self.matcher.run([doc.raw_segment])
        # Convert to a simple format
        formatted_entities = []
        for ent in entities:
            formatted_entities.append({
                "text": ent.text,
                "label": ent.label,
            })
        return formatted_entities

    def correct_vocal_transcription(self, text: str) -> str:
        """Corrects voice transcriptions with a strict priority order"""
        corrected_text = text

        # STEP 1: number conversion BEFORE everything else
        corrected_text = self.convert_numbers_to_digits(corrected_text)

        # STEP 2: vocal-expression corrections in priority order
        # The order is CRUCIAL to avoid conflicts
        priority_corrections = [
            # Complex punctuation expressions first
            ("point à la ligne", ".\n"),
            ("retour à la ligne", "\n"),
            ("à la ligne", "\n"),
            ("nouvelle ligne", "\n"),
            ("saut de ligne", "\n"),
            ("point virgule", ";"),
            ("deux points", ":"),
            ("point d'interrogation", "?"),
            ("point d'exclamation", "!"),

            # Vertebral levels with digits (produced by STEP 1)
            ("C 1", "C1"), ("C 2", "C2"), ("C 3", "C3"), ("C 4", "C4"),
            ("C 5", "C5"), ("C 6", "C6"), ("C 7", "C7"),
            ("L 1", "L1"), ("L 2", "L2"), ("L 3", "L3"), ("L 4", "L4"), ("L 5", "L5"),
            ("T 1", "T1"), ("T 2", "T2"), ("T 3", "T3"), ("T 4", "T4"),
            ("T 5", "T5"), ("T 6", "T6"), ("T 7", "T7"), ("T 8", "T8"),
            ("T 9", "T9"), ("T 10", "T10"), ("T 11", "T11"), ("T 12", "T12"),

            # MRI sequences
            ("séquence T 1", "séquence T1"), ("séquence T 2", "séquence T2"),

            # Comma last to avoid conflicts
            ("virgule", ","),
        ]

        for vocal_term, replacement in priority_corrections:
            # Word boundaries prevent partial replacements
            pattern = r'\b' + re.escape(vocal_term) + r'\b'
            corrected_text = re.sub(pattern, replacement, corrected_text, flags=re.IGNORECASE)

        # STEP 3: special handling for "point" - replace it only when it is not
        # the start of one of the longer expressions handled above
        corrected_text = re.sub(r'\bpoint(?!\s+(?:à|d\'|virgule))', '.', corrected_text, flags=re.IGNORECASE)

        return corrected_text

    def correct_medical_terms(self, text: str) -> str:
        """Corrects medical terms using the dictionary"""
        corrected_text = text

        for correct_term, variations in self.medical_corrections.items():
            for variation in variations:
                if variation != correct_term:  # Avoid replacing a term by itself
                    # Correction preserving the case of the first character
                    pattern = r'\b' + re.escape(variation) + r'\b'

                    def replace_with_case(match):
                        matched_text = match.group(0)
                        if matched_text[0].isupper():
                            return correct_term.capitalize()
                        return correct_term

                    corrected_text = re.sub(pattern, replace_with_case, corrected_text, flags=re.IGNORECASE)

        return corrected_text

    def normalize_medical_patterns(self, text: str) -> str:
        """Normalizes medical patterns, including measurements"""
        normalized_text = text

        # Special handling of measurements with "fois" (dimensions)
        # Pattern: number fois number -> number x number
        normalized_text = re.sub(r'(\d+(?:[.,]\d+)?)\s+fois\s+(\d+(?:[.,]\d+)?)', r'\1 x \2', normalized_text, flags=re.IGNORECASE)

        # Normalize vertebral levels (e.g. C5 C6 -> C5-C6, C5c6 -> C5-C6).
        # The lookahead requires a space or a second letter after the first
        # digits so that a bare "T12" is not split into "T1-T2".
        normalized_text = re.sub(
            r'([CTLS])(\d+)(?=[\sCTLS])\s*(?:[CTLS])?(\d+)',
            lambda m: f"{m.group(1)}{m.group(2)}-{m.group(1)}{m.group(3)}",
            normalized_text, flags=re.IGNORECASE)

        # Normalize existing measurements (e.g. 72x40mm -> 72 x 40 mm)
        normalized_text = re.sub(r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*mm', r'\1 x \2 mm', normalized_text)

        # Automatically add the mm unit to unit-less measurements (number x number -> number x number mm)
        normalized_text = re.sub(r'(\d+(?:[.,]\d+)?)\s*x\s*(\d+(?:[.,]\d+)?)(?!\s*(?:mm|cm))', r'\1 x \2 mm', normalized_text, flags=re.IGNORECASE)

        # Normalize spelled-out millimetres
        normalized_text = re.sub(r'(\d+(?:[.,]\d+)?)\s*millimètres?', r'\1 mm', normalized_text, flags=re.IGNORECASE)

        # Hysterometry measurements (special format)
        normalized_text = re.sub(r"d['’]?hystérométrie\s+(\d+(?:[.,]\d+)?)", r"d'hystérométrie : \1 mm", normalized_text, flags=re.IGNORECASE)

        # Endometrium measurements
        normalized_text = re.sub(r"d['’]?endomètre\s+(\d+(?:[.,]\d+)?)", r"d'endometre : \1 mm", normalized_text, flags=re.IGNORECASE)

        # CFA (antral follicle count)
        normalized_text = re.sub(r'(\d+)\s+follicules', r'CFA \1 follicules', normalized_text, flags=re.IGNORECASE)

        # Remove duplicated units such as "mm millimètres"
        normalized_text = re.sub(r'\bmm\s+millimètres?\b', 'mm', normalized_text, flags=re.IGNORECASE)
        normalized_text = re.sub(r'\bmillimètres?\s+mm\b', 'mm', normalized_text, flags=re.IGNORECASE)

        return normalized_text

    def clean_spacing_and_formatting(self, text: str) -> str:
        """Cleans spacing and improves formatting, with measurement-specific fixes"""
        # Remove repeated spaces while preserving line breaks
        text = re.sub(r'[ \t]+', ' ', text)

        # Measurement-specific corrections
        # "7. 8" -> "7,8" (decimals)
        text = re.sub(r'(\d+)\.\s+(\d+)(?!\s*(?:mm|cm|fois|x))', r'\1,\2', text)

        # "20 6" -> "26" when it is clearly a single number
        text = re.sub(r'\b20\s+6\b', '26', text)
        text = re.sub(r'\b20\s+5\b', '25', text)
        text = re.sub(r'\b10\s+9\b', '19', text)
        text = re.sub(r'\b20\s+2\b', '22', text)
        text = re.sub(r'\b20\s+7\b', '27', text)
        text = re.sub(r'\b3\s+20\s+4\b', '3,24', text)
        text = re.sub(r'\b4\s+20\s+11\b', '0,91', text)

        # Fix punctuation (remove the space before periods, commas, etc.)
        text = re.sub(r'\s+([.,:;!?])', r'\1', text)

        # Add a space after punctuation when needed (unless followed by a line break)
        text = re.sub(r'([.,:;!?])([A-Za-z])', r'\1 \2', text)

        # Fix apostrophes
        text = re.sub(r'\bl\s+([aeiouAEIOU])', r"l'\1", text)  # l ovaire -> l'ovaire
        text = re.sub(r'\bd\s+([aeiouAEIOU])', r"d'\1", text)  # d hystérométrie -> d'hystérométrie

        # Collapse multiple line breaks (max 2 consecutive)
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)

        # Strip leading and trailing spaces on each line
        lines = text.split('\n')
        lines = [line.strip() for line in lines]
        text = '\n'.join(lines)

        # Capitalize after a period followed by a space
        text = re.sub(r'(\.\s+)([a-z])', lambda m: m.group(1) + m.group(2).upper(), text)

        # Capitalize the start of the text
        if text and text[0].islower():
            text = text[0].upper() + text[1:]

        return text.strip()

    def post_process_gynecology_report(self, text: str) -> str:
        """Specialised post-processing for gynaecology reports"""
        processed_text = text

        # Structure uterus measurements
        processed_text = re.sub(
            r'utérus est (\w+)\s+(\d+,\d+)',
            r'utérus est \1 de taille \2 cm',
            processed_text,
            flags=re.IGNORECASE
        )

        # Structure ovary measurements
        processed_text = re.sub(
            r'ovaire (droit|gauche) (\d+ x \d+ mm)',
            r'ovaire \1 mesure \2,',
            processed_text,
            flags=re.IGNORECASE
        )

        # Improve CFA readability
        processed_text = re.sub(
            r'CFA (\d+) follicules',
            r'CFA : \1 follicules',
            processed_text,
            flags=re.IGNORECASE
        )

        # Format Doppler indices
        processed_text = re.sub(
            r'doppler.*?(\d,\d+).*?(\d,\d+)',
            r'Doppler : IP \1 - IR \2',
            processed_text,
            flags=re.IGNORECASE
        )

        return processed_text

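# Illustrative trace of the NER-stage corrections defined above, as applied by
# process_without_gpt (values chosen for this example):
#   input                   : "uncarthrose C cinq C six point"
#   after number conversion : "uncarthrose C 5 C 6 point"
#   after vocal corrections : "uncarthrose C5 C6 ."
#   after normalization     : "uncarthrose C5-C6 ."
#   after spacing cleanup   : "Uncarthrose C5-C6."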
class GPTMedicalFormatter:
    """Medical report formatter backed by GPT"""

    def __init__(self, model: str = AZURE_OPENAI_DEPLOYMENT):
        self.model = model

        self.system_prompt = """
Tu es un expert en transcription médicale française. Tu dois corriger et formater UNIQUEMENT les erreurs évidentes dans ce texte médical déjà pré-traité.

RÈGLES STRICTES À APPLIQUER :

1. **PONCTUATION** :
   - Supprime les doubles ponctuations : ",." → "."
   - Supprime ".." → "."
   - Corrige ",?" → "?"

2. **PARENTHÈSES** déjà converties mais nettoie si nécessaire

3. **ORTHOGRAPHE MÉDICALE** :
   - "supérieur" au lieu de "supérieure" pour les adjectifs masculins
   - "Discrète" → "Discret" pour les termes masculins
   - Autres termes médicaux mal orthographiés

4. **FORMATAGE** :
   - Assure-toi que chaque phrase se termine par un point
   - Capitalise après les points
   - Supprime les espaces inutiles

5. **CORRECTIONS SPÉCIFIQUES** :
   - Ne transforme JAMAIS "un" en "1" (garde "un utérus" et NON "1 utérus")
   - Supprime les duplications d'unités (ex: "mm millimètre" → "mm")
   - Assure-toi que "pour cent" est remplacé par "%"
   - Vérifie l'accord des adjectifs (masculin/féminin)
   - Corrige uniquement l’orthographe, la grammaire et les accords.

6. **INTERDICTIONS** :
   - NE change PAS le contenu médical
   - NE reformule PAS les phrases
   - NE change PAS l'ordre des informations
   - NE supprime PAS d'informations médicales

OBJECTIF : Rendre le texte médical propre et professionnel en gardant EXACTEMENT le même contenu.

Texte à corriger :
"""

    def format_medical_report(self, text: str) -> str:
        """Formats the medical report with GPT; falls back to the input text on failure"""
        if not azure_client:
            print("❌ Client Azure OpenAI non disponible - utilisation du texte NER seulement")
            return text

        try:
            print("🔄 Appel à l'API Azure OpenAI en cours...")
            response = azure_client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": f"Corrigez et formatez cette transcription médicale en préservant tous les sauts de ligne et le contenu médical:\n\n{text}"}
                ],
                #max_tokens=2000,
                #temperature=0.1
            )
            result = response.choices[0].message.content.strip()
            print("✅ Réponse reçue de l'API Azure OpenAI")
            return result

        except Exception as e:
            print(f"❌ Erreur lors de l'appel à l'API Azure OpenAI: {e}")
            print(f"   Type d'erreur: {type(e).__name__}")
            if hasattr(e, 'response'):
                print(f"   Code de statut: {e.response.status_code if hasattr(e.response, 'status_code') else 'N/A'}")
            print("🔄 Utilisation du texte corrigé par NER seulement")
            return text

class MedicalTranscriptionProcessor:
    """Main processor for medical transcriptions"""

    def __init__(self, deployment: str = AZURE_OPENAI_DEPLOYMENT):
        self.ner_corrector = MedicalNERCorrector()
        self.gpt_formatter = GPTMedicalFormatter(deployment)

    def process_transcription(self, text: str) -> CorrectionResult:
        """Processes a full medical transcription - MANDATORY 2-stage pipeline"""
        print("🏥 Démarrage du traitement de la transcription médicale...")
        print("⚠️ TRAITEMENT EN 2 ÉTAPES OBLIGATOIRES: NER + GPT")

        # =================== STAGE 1: NER CORRECTIONS ===================
        print("\n🔧 ÉTAPE 1/2: CORRECTIONS NER (Nombres, Ponctuation, Orthographe)")
        print("-" * 60)

        # Sub-step 1.1: voice-transcription corrections (includes number conversion)
        print("   🎤 Correction des transcriptions vocales et conversion des nombres...")
        vocal_corrected = self.ner_corrector.correct_vocal_transcription(text)

        # Sub-step 1.2: medical entity extraction
        print("   📋 Extraction des entités médicales...")
        medical_entities = self.ner_corrector.extract_medical_entities(vocal_corrected)
        print(f"   ✅ {len(medical_entities)} entités médicales détectées")

        # Sub-step 1.3: spelling correction of medical terms
        print("   ✏️ Correction orthographique des termes médicaux...")
        ner_corrected = self.ner_corrector.correct_medical_terms(vocal_corrected)

        # Sub-step 1.4: medical-pattern normalization
        print("   🔧 Normalisation des patterns médicaux...")
        ner_corrected = self.ner_corrector.normalize_medical_patterns(ner_corrected)

        # Sub-step 1.5: gynaecology-specific post-processing
        print("   🧹 Nettoyage du formatage...")
        ner_corrected = self.ner_corrector.post_process_gynecology_report(ner_corrected)

        print("✅ ÉTAPE 1 TERMINÉE: Corrections NER appliquées")

        # =================== STAGE 2: GPT FORMATTING ===================
        print("\n🤖 ÉTAPE 2/2: FORMATAGE PROFESSIONNEL AVEC GPT")
        print("-" * 60)
        print("   📝 Structuration du rapport médical...")
        print("   🎯 Amélioration de la lisibilité...")
        print("   📋 Organisation en sections médicales...")

        final_corrected = self.gpt_formatter.format_medical_report(ner_corrected)

        if final_corrected != ner_corrected:
            print("✅ ÉTAPE 2 TERMINÉE: Formatage GPT appliqué avec succès")
        else:
            print("⚠️ ÉTAPE 2: GPT non disponible - utilisation du résultat NER")

        # Confidence-score computation
        confidence_score = self._calculate_confidence_score(text, final_corrected, medical_entities)

        print(f"\n🎯 TRAITEMENT COMPLET TERMINÉ - Score de confiance: {confidence_score:.2%}")

        return CorrectionResult(
            original_text=text,
            ner_corrected_text=ner_corrected,
            final_corrected_text=final_corrected,
            medical_entities=medical_entities,
            confidence_score=confidence_score
        )

    def process_without_gpt(self, text: str) -> str:
        print("⚠️ ATTENTION: Traitement partiel sans GPT (pour tests uniquement)")
        print("💡 Pour un résultat professionnel, utilisez process_transcription() avec une clé API")

        vocal_corrected = self.ner_corrector.correct_vocal_transcription(text)
        medical_corrected = self.ner_corrector.correct_medical_terms(vocal_corrected)
        normalized = self.ner_corrector.normalize_medical_patterns(medical_corrected)
        cleaned = self.ner_corrector.clean_spacing_and_formatting(normalized)
        return cleaned

    def _calculate_confidence_score(self, original: str, corrected: str, entities: List[Dict]) -> float:
        """Computes a confidence score for the correction"""
        entity_score = min(len(entities) / 10, 1.0)
        original_words = set(original.split())
        if not original_words:  # guard against empty input
            return entity_score / 2
        similarity_score = len(original_words & set(corrected.split())) / len(original_words)
        return (entity_score + similarity_score) / 2

def test_azure_connection():
    """Azure OpenAI connection test"""
    if not azure_client:
        print("❌ Client Azure OpenAI non initialisé")
        return False

    try:
        print("🔍 Test de connexion à Azure OpenAI...")
        response = azure_client.chat.completions.create(
            model=AZURE_OPENAI_DEPLOYMENT,
            messages=[{"role": "user", "content": "Test de connexion"}]
            #max_tokens=10
        )
        print("✅ Connexion Azure OpenAI réussie")
        return True
    except Exception as e:
        print(f"❌ Erreur de connexion Azure OpenAI: {e}")
        return False

def main():
    """Main demonstration entry point"""

    # Azure configuration check
    print("=" * 80)
    print("🔧 VÉRIFICATION DE LA CONFIGURATION")
    print("=" * 80)

    print(f"📍 Endpoint Azure: {AZURE_OPENAI_ENDPOINT}")
    print(f"🤖 Deployment: {AZURE_OPENAI_DEPLOYMENT}")
    print(f"🔑 Clé API: {'✅ Configurée' if AZURE_OPENAI_KEY else '❌ Manquante'}")

    # Connection test
    if not test_azure_connection():
        print("\n⚠️ Azure OpenAI non disponible - le traitement continuera avec NER seulement")

    # Sample text exhibiting the known ASR problems
    exemple_transcription = """irm pelvienne indication clinique point technique acquisition sagittale axiale et coronale t deux saturation axiale diffusion axiale t un résultats présence d un utérus antéversé médio pelvien dont le grand axe mesure soixante douze mm sur quarante millimètre sur quarante mm point la zone jonctionnelle apparaît floue point elle est épaissie de façon diffuse asymétrique avec une atteinte de plus de cinquante pour cent de l épaisseur du myomètre et comporte des spots en hypersignal t deux l ensemble traduisant une adénomyose point à la ligne pas d épaississement cervical à noter la présence d un petit kyste liquidien de type naboth point à la ligne les deux ovaires sont repérés porteurs de formations folliculaires communes en hypersignal homogène t deux de petite taille point l ovaire droit mesure trente fois vingt cinq mm l ovaire gauche vingt cinq fois vingt trois mm point pas d épanchement dans le cul de sac de douglas point à la ligne absence de foyer d endométriose profonde point conclusion points à la ligne aspect d adénomyose diffuse symétrique virgule profonde point à la ligne pas d épaississement endométrial point absence d endométriome point absence d épanchement dans le cul de sac de douglas point"""

    # Processor initialisation
    processor = MedicalTranscriptionProcessor(AZURE_OPENAI_DEPLOYMENT)

    print("\n" + "="*80)
    print("🏥 TRAITEMENT COMPLET DE LA TRANSCRIPTION MÉDICALE")
    print("="*80)

    # Full processing with GPT (recommended)
    result = processor.process_transcription(exemple_transcription)

    # Display the full results
    print("\n📄 TEXTE ORIGINAL:")
    print("-" * 50)
    print(result.original_text)

    print(f"\n🔍 ENTITÉS MÉDICALES DÉTECTÉES ({len(result.medical_entities)}):")
    print("-" * 50)
    for entity in result.medical_entities:
        print(f"  • {entity['text']} ({entity['label']})")

    print("\n🎤 APRÈS CORRECTION NER (sans GPT):")
    print("-" * 50)
    print(result.ner_corrected_text)

    print("\n🤖 RAPPORT FINAL FORMATÉ (avec GPT):")
    print("-" * 50)
    if result.final_corrected_text:
        print(result.final_corrected_text)
    else:
        print("❌ Aucun résultat GPT - vérifiez votre configuration Azure")

    print(f"\n📊 SCORE DE CONFIANCE: {result.confidence_score:.2%}")

    # Compare the two results
    if result.final_corrected_text != result.ner_corrected_text:
        print("\n🔄 COMPARAISON NER vs GPT:")
        print("-" * 50)
        print("📈 Améliorations apportées par GPT:")
        ner_lines = result.ner_corrected_text.split('\n')
        gpt_lines = result.final_corrected_text.split('\n')

        for i, (ner_line, gpt_line) in enumerate(zip(ner_lines, gpt_lines)):
            if ner_line.strip() != gpt_line.strip():
                print(f"  Ligne {i+1}:")
                print(f"    NER: {ner_line}")
                print(f"    GPT: {gpt_line}")

    print("\n" + "="*80)
    print("✅ TRAITEMENT TERMINÉ")
    if azure_client:
        print("🎉 Les 2 étapes ont été appliquées avec succès")
    else:
        print("⚠️ Seule l'étape NER a pu être appliquée - configurez Azure OpenAI pour le formatage complet")
    print("="*80)

if __name__ == "__main__":
    print("✅ correcteur.py loaded main")
    main()
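A minimal way to exercise the NER-only path without Azure credentials (a sketch, assuming the file above is importable as correcteur and the medkit model can be downloaded; the exact output may differ slightly):

from correcteur import MedicalTranscriptionProcessor, AZURE_OPENAI_DEPLOYMENT

processor = MedicalTranscriptionProcessor(AZURE_OPENAI_DEPLOYMENT)
brut = "l ovaire droit mesure vingt cinq fois dix neuf millimètres point"
print(processor.process_without_gpt(brut))
# Roughly: "L'ovaire droit mesure 25 x 19 mm."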
dataset.json
ADDED
The diff for this file is too large to render.
dataset_optimiser_with_finetunning.py
ADDED
@@ -0,0 +1,1190 @@
import os
import re
import json
import numpy as np
from typing import List, Dict, Any, Optional, Tuple, Union
from dataclasses import dataclass
from pathlib import Path

# Core libraries
import torch
from transformers import (
    AutoTokenizer, AutoModel, AutoModelForTokenClassification,
    TrainingArguments, Trainer, pipeline, DataCollatorForTokenClassification
)
from torch.utils.data import Dataset
import torch.nn.functional as F

# Vector database
import chromadb
from chromadb.config import Settings

# Utilities
import logging
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class MedicalEntity:
    """Container for the medical entities extracted by NER."""
    exam_types: List[Tuple[str, float]]  # (entity, confidence)
    specialties: List[Tuple[str, float]]
    anatomical_regions: List[Tuple[str, float]]
    pathologies: List[Tuple[str, float]]
    medical_procedures: List[Tuple[str, float]]
    measurements: List[Tuple[str, float]]
    medications: List[Tuple[str, float]]
    symptoms: List[Tuple[str, float]]


class MedicalNERDataset(Dataset):
    """Custom dataset for medical NER training."""

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]

        # Tokenization
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_offsets_mapping=True,
            return_tensors='pt'
        )

        # Align the labels with the tokenizer's tokens
        aligned_labels = self._align_labels_with_tokens(
            labels, encoding.offset_mapping.squeeze().tolist()
        )

        return {
            'input_ids': encoding.input_ids.flatten(),
            'attention_mask': encoding.attention_mask.flatten(),
            'labels': torch.tensor(aligned_labels, dtype=torch.long)
        }

    def _align_labels_with_tokens(self, labels, offset_mapping):
        """Align BIO labels with the tokenizer's tokens."""
        aligned_labels = []
        label_idx = 0

        for start, end in offset_mapping:
            if start == 0 and end == 0:  # Special token: [CLS], [SEP], [PAD]
                aligned_labels.append(-100)  # Ignored by the loss
            else:
                if label_idx < len(labels):
                    aligned_labels.append(labels[label_idx])
                    label_idx += 1
                else:
                    aligned_labels.append(0)  # O (Outside)

        return aligned_labels
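
# Illustration (not executed) of the alignment above: special tokens carry
# offsets (0, 0) and receive -100, which CrossEntropyLoss ignores; every
# other token consumes the next BIO label id in order.
#
#   token:   [CLS]   t1      t2      [SEP]   [PAD]
#   offset:  (0,0)   (0,4)   (5,12)  (0,0)   (0,0)
#   label:   -100    B-...   I-...   -100    -100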

class AdvancedMedicalNER:
    """Advanced medical NER based on a fine-tuned CamemBERT-Bio model."""

    def __init__(self, model_name: str = "auto", cache_dir: str = "./models_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)

        # Auto-detect the best available medical NER model
        self.model_name = self._select_best_model(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # BIO labels for medical entities
        self.entity_labels = [
            "O",  # Outside
            "B-EXAM_TYPES", "I-EXAM_TYPES",                  # Exam types
            "B-SPECIALTIES", "I-SPECIALTIES",                # Medical specialties
            "B-ANATOMICAL_REGIONS", "I-ANATOMICAL_REGIONS",  # Anatomical regions
            "B-PATHOLOGIES", "I-PATHOLOGIES",                # Pathologies
            "B-PROCEDURES", "I-PROCEDURES",                  # Medical procedures
            "B-MEASUREMENTS", "I-MEASUREMENTS",              # Measurements/values
            "B-MEDICATIONS", "I-MEDICATIONS",                # Medications
            "B-SYMPTOMS", "I-SYMPTOMS"                       # Symptoms
        ]

        self.id2label = {i: label for i, label in enumerate(self.entity_labels)}
        self.label2id = {label: i for i, label in enumerate(self.entity_labels)}

        # Load the NER model
        self._load_ner_model()

    def _select_best_model(self, model_name: str) -> str:
        """Automatically select the best available medical NER model."""

        if model_name != "auto":
            return model_name

        # Candidate models, in order of preference
        preferred_models = [
            "almanach/camembert-bio-base",      # French CamemBERT-Bio
            "Dr-BERT/DrBERT-7GB",               # Specialized DrBERT
            "emilyalsentzer/Bio_ClinicalBERT",  # Bio Clinical BERT
            "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
            "dmis-lab/biobert-base-cased-v1.2",  # BioBERT
            "camembert-base"                    # Fallback: standard CamemBERT
        ]

        for model in preferred_models:
            try:
                # Availability check
                AutoTokenizer.from_pretrained(model, cache_dir=self.cache_dir)
                logger.info(f"Selected model: {model}")
                return model
            except Exception:
                continue

        # Last-resort fallback
        logger.warning("Falling back to the base camembert-base model")
        return "camembert-base"

    def _load_ner_model(self):
        """Load the fine-tuned NER model, or create a fresh one."""

        fine_tuned_path = self.cache_dir / "medical_ner_model"

        if fine_tuned_path.exists():
            logger.info("Loading the existing fine-tuned NER model")
            self.tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path)
            self.ner_model = AutoModelForTokenClassification.from_pretrained(fine_tuned_path)
        else:
            logger.info("Creating a new medical NER model")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.cache_dir)

            # Token-classification (NER) head on top of the base model
            self.ner_model = AutoModelForTokenClassification.from_pretrained(
                self.model_name,
                num_labels=len(self.entity_labels),
                id2label=self.id2label,
                label2id=self.label2id,
                cache_dir=self.cache_dir
            )

        self.ner_model.to(self.device)

        # NER pipeline
        self.ner_pipeline = pipeline(
            "token-classification",
            model=self.ner_model,
            tokenizer=self.tokenizer,
            device=0 if torch.cuda.is_available() else -1,
            aggregation_strategy="simple"
        )

    def extract_entities(self, text: str) -> MedicalEntity:
        """Extract entities with the fine-tuned NER model."""

        # NER prediction
        try:
            ner_results = self.ner_pipeline(text)
        except Exception as e:
            logger.error(f"NER error: {e}")
            return MedicalEntity([], [], [], [], [], [], [], [])

        # Group the entities by type
        entities = {
            "EXAM_TYPES": [],
            "SPECIALTIES": [],
            "ANATOMICAL_REGIONS": [],
            "PATHOLOGIES": [],
            "PROCEDURES": [],
            "MEASUREMENTS": [],
            "MEDICATIONS": [],
            "SYMPTOMS": []
        }

        for result in ner_results:
            entity_type = result['entity_group'].replace('B-', '').replace('I-', '')
            entity_text = result['word']
            confidence = result['score']

            if entity_type in entities and confidence > 0.7:  # Confidence threshold
                entities[entity_type].append((entity_text, confidence))

        return MedicalEntity(
            exam_types=entities["EXAM_TYPES"],
            specialties=entities["SPECIALTIES"],
            anatomical_regions=entities["ANATOMICAL_REGIONS"],
            pathologies=entities["PATHOLOGIES"],
            medical_procedures=entities["PROCEDURES"],
            measurements=entities["MEASUREMENTS"],
            medications=entities["MEDICATIONS"],
            symptoms=entities["SYMPTOMS"]
        )

    def load_dataset(self, dataset_path: str) -> List[Dict]:
        """Load the dataset from a JSON Lines file."""
        try:
            with open(dataset_path, 'r', encoding='utf-8') as f:
                # Each line is a separate JSON object
                data = []
                for line in f:
                    if line.strip():
                        data.append(json.loads(line.strip()))
                return data
        except Exception as e:
            logger.error(f"Error while loading the dataset: {e}")
            return []
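
    # Expected dataset layout (a sketch; the field values below are
    # illustrative, the real entries live in dataset.json): one JSON object
    # per line, with the raw text and the entity lists used to derive BIO
    # labels, e.g.
    #   {"text": "echographie doppler du membre inferieur droit",
    #    "labels": {"exam_types": ["echographie doppler"],
    #               "anatomical_regions": ["membre inferieur droit"]}}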

    def _text_to_bio_labels(self, text: str, entities_dict: Dict[str, List[str]]) -> List[int]:
        """Convert text plus entity lists into BIO label ids, using character offsets (robust)."""

        # Encode with character offsets
        encoding = self.tokenizer(
            text,
            return_offsets_mapping=True,
            add_special_tokens=False
        )
        tokens = encoding.tokens()
        offsets = encoding["offset_mapping"]
        labels = [self.label2id["O"]] * len(tokens)  # Initialize everything to "O"

        # Map entity types to their BIO labels
        entity_type_mapping = {
            "exam_types": ("B-EXAM_TYPES", "I-EXAM_TYPES"),
            "specialties": ("B-SPECIALTIES", "I-SPECIALTIES"),
            "anatomical_regions": ("B-ANATOMICAL_REGIONS", "I-ANATOMICAL_REGIONS"),
            "pathologies": ("B-PATHOLOGIES", "I-PATHOLOGIES"),
            "procedures": ("B-PROCEDURES", "I-PROCEDURES"),
            "measurements": ("B-MEASUREMENTS", "I-MEASUREMENTS"),
            "medications": ("B-MEDICATIONS", "I-MEDICATIONS"),
            "symptoms": ("B-SYMPTOMS", "I-SYMPTOMS")
        }

        # Assign the labels
        for entity_type, entity_list in entities_dict.items():
            if entity_type in entity_type_mapping and entity_list:
                b_label, i_label = entity_type_mapping[entity_type]
                b_label_id = self.label2id[b_label]
                i_label_id = self.label2id[i_label]

                for entity in entity_list:
                    start_char = text.lower().find(entity.lower())
                    if start_char == -1:
                        continue
                    end_char = start_char + len(entity)

                    # Find every token that overlaps the entity span
                    entity_token_idxs = [
                        i for i, (tok_start, tok_end) in enumerate(offsets)
                        if tok_start < end_char and tok_end > start_char
                    ]

                    if not entity_token_idxs:
                        continue

                    # BIO assignment: B- on the first token, I- on the rest
                    for j, tok_idx in enumerate(entity_token_idxs):
                        if j == 0:
                            labels[tok_idx] = b_label_id
                        else:
                            labels[tok_idx] = i_label_id

        return labels
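
    # Worked example (a sketch, not executed): for text "gonalgie droite" and
    # entity "gonalgie" (start_char=0, end_char=8), a token with offsets
    # (0, 4) overlaps because 0 < 8 and 4 > 0, while a token at (9, 15) does
    # not (9 < 8 is false); the first overlapping token gets the B- label id,
    # the remaining ones get I-.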

    def _prepare_training_data(self, templates_data: List[Dict]) -> Dict:
        """Prepare the NER training data from the dataset."""

        if not templates_data:
            logger.warning("No template data provided")
            return {'train': MedicalNERDataset([], [], self.tokenizer)}

        texts = []
        labels = []

        logger.info(f"Preparing {len(templates_data)} samples for training")

        for sample in tqdm(templates_data, desc="Converting to BIO format"):
            try:
                text = sample['text']
                entities_dict = sample['labels']

                # Convert to BIO labels
                bio_labels = self._text_to_bio_labels(text, entities_dict)

                texts.append(text)
                labels.append(bio_labels)

            except Exception as e:
                logger.error(f"Error while processing a sample: {e}")
                continue

        if not texts:
            logger.error("No valid sample found for training")
            return {'train': MedicalNERDataset([], [], self.tokenizer)}

        # Train/validation split when there is enough data
        if len(texts) > 10:
            train_texts, val_texts, train_labels, val_labels = train_test_split(
                texts, labels, test_size=0.2, random_state=42
            )

            train_dataset = MedicalNERDataset(train_texts, train_labels, self.tokenizer)
            val_dataset = MedicalNERDataset(val_texts, val_labels, self.tokenizer)

            logger.info(f"Dataset split: {len(train_texts)} train, {len(val_texts)} validation")
            return {'train': train_dataset, 'eval': val_dataset}
        else:
            train_dataset = MedicalNERDataset(texts, labels, self.tokenizer)
            logger.info(f"Training dataset: {len(texts)} samples")
            return {'train': train_dataset}

    def fine_tune_on_templates(self, templates_data: List[Dict] = None,
                               dataset_path: str = "dataset.json",
                               output_dir: str = None,
                               epochs: int = 3):
        """Fine-tune the NER model on medical templates."""

        if output_dir is None:
            output_dir = self.cache_dir / "medical_ner_model"

        # Load the data
        if templates_data is None:
            logger.info(f"Loading the dataset from {dataset_path}")
            templates_data = self.load_dataset(dataset_path)

        if not templates_data:
            logger.error("No data available for training")
            return

        logger.info("Starting NER fine-tuning on medical templates")

        # Prepare the training data
        datasets = self._prepare_training_data(templates_data)

        if len(datasets['train']) == 0:
            logger.error("Empty training dataset")
            return

        # Data collator to handle padding
        data_collator = DataCollatorForTokenClassification(
            tokenizer=self.tokenizer,
            padding=True
        )

        # Training configuration
        training_args = TrainingArguments(
            output_dir=str(output_dir),
            num_train_epochs=epochs,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir=f"{output_dir}/logs",
            logging_steps=50,
            save_strategy="epoch",
            evaluation_strategy="epoch" if 'eval' in datasets else "no",
            load_best_model_at_end=True if 'eval' in datasets else False,
            metric_for_best_model="eval_loss" if 'eval' in datasets else None,
            greater_is_better=False,
            remove_unused_columns=False,
        )

        # Metric computation
        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=2)

            # Accuracy, ignoring the -100 labels
            mask = labels != -100
            accuracy = (predictions[mask] == labels[mask]).mean()

            return {"accuracy": accuracy}

        # Trainer
        trainer = Trainer(
            model=self.ner_model,
            args=training_args,
            train_dataset=datasets['train'],
            eval_dataset=datasets.get('eval'),
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics if 'eval' in datasets else None,
        )

        # Training
        logger.info("Starting training...")
        trainer.train()

        # Save the model
        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)

        # Reload the model and the pipeline
        self._load_ner_model()

        logger.info(f"Fine-tuning finished, model saved to {output_dir}")

        # Log final metrics when an evaluation split is available
        if 'eval' in datasets:
            eval_results = trainer.evaluate()
            logger.info(f"Final metrics: {eval_results}")
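
# Usage sketch (illustrative; the paths assume the repo layout):
#   ner = AdvancedMedicalNER()
#   ner.fine_tune_on_templates(dataset_path="dataset.json", epochs=3)
#   entities = ner.extract_entities("echo doppler veineux du membre inferieur droit")
#   print(entities.exam_types)   # e.g. [("echo doppler veineux", 0.93)]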

class AdvancedMedicalEmbedding:
    """Advanced medical embedding generator with cross-encoder reranking."""

    def __init__(self,
                 base_model: str = "almanach/camembert-bio-base",
                 cross_encoder_model: str = "auto"):

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.base_model_name = base_model

        # Main embedding model
        self._load_base_model()

        # Cross-encoder for reranking
        self._load_cross_encoder(cross_encoder_model)

    def _load_base_model(self):
        """Load the base embedding model."""
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.base_model_name)
            self.base_model = AutoModel.from_pretrained(self.base_model_name)
            self.base_model.to(self.device)
            logger.info(f"Base model loaded: {self.base_model_name}")
        except Exception as e:
            logger.error(f"Error while loading the base model: {e}")
            raise

    def _load_cross_encoder(self, model_name: str):
        """Load the cross-encoder used for reranking."""

        if model_name == "auto":
            # Automatically pick the best available medical cross-encoder
            cross_encoders = [
                "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
                "emilyalsentzer/Bio_ClinicalBERT",
                self.base_model_name  # Fallback
            ]

            for model in cross_encoders:
                try:
                    self.cross_tokenizer = AutoTokenizer.from_pretrained(model)
                    self.cross_model = AutoModel.from_pretrained(model)
                    self.cross_model.to(self.device)
                    logger.info(f"Cross-encoder loaded: {model}")
                    break
                except Exception:
                    continue
        else:
            self.cross_tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.cross_model = AutoModel.from_pretrained(model_name)
            self.cross_model.to(self.device)

    def generate_embedding(self, text: str, entities: MedicalEntity = None) -> np.ndarray:
        """Generate an enriched embedding for a medical text."""

        # Tokenization
        inputs = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(self.device)

        # Embedding generation
        with torch.no_grad():
            outputs = self.base_model(**inputs)

            # Mean pooling over non-padding tokens
            attention_mask = inputs['attention_mask']
            token_embeddings = outputs.last_hidden_state
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            embedding = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

        # Enrichment with NER entities
        if entities:
            embedding = self._enrich_with_ner_entities(embedding, entities)

        return embedding.cpu().numpy().flatten().astype(np.float32)

    def _enrich_with_ner_entities(self, base_embedding: torch.Tensor, entities: MedicalEntity) -> torch.Tensor:
        """Enrich the embedding with the entities extracted by NER."""

        # Collect the salient entities together with their confidence scores
        entity_texts = []
        confidence_weights = []

        for entity_list in [entities.exam_types, entities.specialties,
                            entities.anatomical_regions, entities.pathologies]:
            for entity_text, confidence in entity_list:
                entity_texts.append(entity_text)
                confidence_weights.append(confidence)

        if not entity_texts:
            return base_embedding

        # Embed the concatenated entities
        entity_text_combined = " [SEP] ".join(entity_texts)
        entity_inputs = self.tokenizer(
            entity_text_combined,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            entity_outputs = self.base_model(**entity_inputs)
            entity_embedding = torch.mean(entity_outputs.last_hidden_state, dim=1)

        # Weighted fusion, driven by the confidence scores
        avg_confidence = np.mean(confidence_weights) if confidence_weights else 0.5
        fusion_weight = min(0.4, avg_confidence)  # Entities contribute at most 40%

        enriched_embedding = (1 - fusion_weight) * base_embedding + fusion_weight * entity_embedding

        return enriched_embedding

    def cross_encoder_rerank(self,
                             query: str,
                             candidates: List[Dict],
                             top_k: int = 3) -> List[Dict]:
        """Rerank candidates with the cross-encoder to refine the selection."""

        if len(candidates) <= top_k:
            return candidates

        reranked_candidates = []

        for candidate in candidates:
            # Build the query-candidate pair
            pair_text = f"{query} [SEP] {candidate['document']}"

            # Tokenization
            inputs = self.cross_tokenizer(
                pair_text,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt"
            ).to(self.device)

            # Cross-encoder similarity score
            with torch.no_grad():
                outputs = self.cross_model(**inputs)
                # Heuristic: squash the mean of the [CLS] embedding into (0, 1).
                # This is a proxy score, not a trained cross-encoder head.
                cls_embedding = outputs.last_hidden_state[:, 0, :]
                similarity_score = torch.sigmoid(torch.mean(cls_embedding)).item()

            candidate_copy = candidate.copy()
            candidate_copy['cross_encoder_score'] = similarity_score
            candidate_copy['final_score'] = (
                0.6 * candidate['similarity_score'] +
                0.4 * similarity_score
            )

            reranked_candidates.append(candidate_copy)

        # Sort by final score
        reranked_candidates.sort(key=lambda x: x['final_score'], reverse=True)

        return reranked_candidates[:top_k]
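
# Scoring recap (as implemented above): the reranker blends the vector-search
# similarity with the heuristic cross-encoder score,
#   final_score = 0.6 * similarity_score + 0.4 * cross_encoder_score,
# so the bi-encoder ranking still dominates and the cross-encoder only
# reorders close candidates.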

class MedicalTemplateVectorDB:
    """Vector database optimized for medical templates."""

    def __init__(self, db_path: str = "./medical_vector_db", collection_name: str = "medical_templates"):
        self.db_path = db_path
        self.collection_name = collection_name

        # ChromaDB with an optimized configuration
        self.client = chromadb.PersistentClient(
            path=db_path,
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )

        # Collection with an optimized distance metric
        try:
            self.collection = self.client.get_collection(collection_name)
            logger.info(f"Collection '{collection_name}' loaded")
        except Exception:
            self.collection = self.client.create_collection(
                name=collection_name,
                metadata={
                    "hnsw:space": "cosine",
                    "hnsw:M": 32,                 # Graph connectivity
                    "hnsw:ef_construction": 200,  # Build quality vs speed
                    "hnsw:ef_search": 50          # Search quality vs speed
                }
            )
            logger.info(f"Collection '{collection_name}' created with HNSW optimizations")

    def add_template(self,
                     template_id: str,
                     template_text: str,
                     embedding: np.ndarray,
                     entities: MedicalEntity,
                     metadata: Dict[str, Any] = None):
        """Add a template with NER-enriched metadata."""

        # Automatic metadata derived from NER. ChromaDB metadata values must
        # be scalars (str/int/float/bool), so the entity lists are stored as
        # comma-joined strings.
        auto_metadata = {
            "exam_types": ", ".join(entity[0] for entity in entities.exam_types),
            "specialties": ", ".join(entity[0] for entity in entities.specialties),
            "anatomical_regions": ", ".join(entity[0] for entity in entities.anatomical_regions),
            "pathologies": ", ".join(entity[0] for entity in entities.pathologies),
            "procedures": ", ".join(entity[0] for entity in entities.medical_procedures),
            "text_length": len(template_text),
            "entity_confidence_avg": float(np.mean([
                entity[1] for entity_list in [
                    entities.exam_types, entities.specialties,
                    entities.anatomical_regions, entities.pathologies
                ] for entity in entity_list
            ])) if any([entities.exam_types, entities.specialties,
                        entities.anatomical_regions, entities.pathologies]) else 0.0
        }

        if metadata:
            auto_metadata.update(metadata)

        self.collection.add(
            embeddings=[embedding.tolist()],
            documents=[template_text],
            metadatas=[auto_metadata],
            ids=[template_id]
        )

        logger.info(f"Template {template_id} added with automatic NER metadata")
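
    # Example of the stored metadata for one template (illustrative values):
    #   {"exam_types": "echographie doppler", "specialties": "",
    #    "anatomical_regions": "membre inferieur droit", "pathologies": "",
    #    "procedures": "", "text_length": 842, "entity_confidence_avg": 0.88}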

    def advanced_search(self,
                        query_embedding: np.ndarray,
                        n_results: int = 10,
                        entity_filters: Dict[str, List[str]] = None,
                        confidence_threshold: float = 0.0) -> List[Dict]:
        """Advanced search with filters based on NER entities."""

        where_clause = {}

        # Filters based on the extracted NER entities. Since the entity
        # metadata is stored as joined strings, "$in" performs exact string
        # matching against those joined values.
        if entity_filters:
            for entity_type, entity_values in entity_filters.items():
                if entity_values:
                    where_clause[entity_type] = {"$in": entity_values}

        # Filter on the average entity confidence
        if confidence_threshold > 0:
            where_clause["entity_confidence_avg"] = {"$gte": confidence_threshold}

        results = self.collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=n_results,
            where=where_clause if where_clause else None,
            include=["documents", "metadatas", "distances"]
        )

        # Format the results
        formatted_results = []
        for i in range(len(results['ids'][0])):
            formatted_results.append({
                'id': results['ids'][0][i],
                'document': results['documents'][0][i],
                'metadata': results['metadatas'][0][i],
                'similarity_score': 1 - results['distances'][0][i],
                'distance': results['distances'][0][i]
            })

        return formatted_results
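
    # Example (a sketch): searching with an exam-type filter and a minimum
    # average entity confidence builds
    #   where_clause = {"exam_types": {"$in": ["echographie doppler"]},
    #                   "entity_confidence_avg": {"$gte": 0.6}}
    # and passes it to collection.query() together with the query embedding.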

class AdvancedMedicalTemplateProcessor:
    """Advanced processor with fine-tuned NER and cross-encoder reranking."""

    def __init__(self,
                 base_model: str = "almanach/camembert-bio-base",
                 db_path: str = "./advanced_medical_vector_db"):

        self.ner_extractor = AdvancedMedicalNER()
        self.embedding_generator = AdvancedMedicalEmbedding(base_model)
        self.vector_db = MedicalTemplateVectorDB(db_path)

        logger.info("Advanced medical processor initialized with fine-tuned NER and cross-encoder reranking")

    def process_templates_batch(self,
                                templates: List[Dict[str, str]] = None,
                                dataset_path: str = "dataset.json",
                                batch_size: int = 8,
                                fine_tune_ner: bool = False) -> None:
        """Advanced processing, with optional NER fine-tuning."""

        # Load the data when no templates are provided
        if templates is None:
            logger.info(f"Loading templates from {dataset_path}")
            templates = self.ner_extractor.load_dataset(dataset_path)
            # Convert from the dataset format to the expected format. The
            # labels are serialized to JSON because ChromaDB metadata values
            # must be scalar.
            templates = [
                {
                    'id': f"template_{i:04d}",
                    'text': template['text'],
                    'metadata': {'labels': json.dumps(template.get('labels', {}), ensure_ascii=False)}
                }
                for i, template in enumerate(templates)
            ]

        if fine_tune_ner:
            logger.info("Fine-tuning the NER model on the templates...")
            # Convert back to the fine-tuning format (labels may be either a
            # JSON string or a plain dict, depending on how templates arrived)
            training_data = [
                {
                    'text': template['text'],
                    'labels': json.loads(template['metadata'].get('labels', '{}'))
                    if isinstance(template['metadata'].get('labels'), str)
                    else template['metadata'].get('labels', {})
                }
                for template in templates
            ]
            self.ner_extractor.fine_tune_on_templates(training_data)

        logger.info(f"Advanced processing of {len(templates)} templates")

        for i in tqdm(range(0, len(templates), batch_size), desc="Advanced processing"):
            batch = templates[i:i+batch_size]

            for template in batch:
                try:
                    template_id = template['id']
                    template_text = template['text']
                    metadata = template.get('metadata', {})

                    # Advanced NER
                    entities = self.ner_extractor.extract_entities(template_text)

                    # Enriched embedding
                    embedding = self.embedding_generator.generate_embedding(template_text, entities)

                    # Store together with the NER metadata
                    self.vector_db.add_template(
                        template_id=template_id,
                        template_text=template_text,
                        embedding=embedding,
                        entities=entities,
                        metadata=metadata
                    )

                except Exception as e:
                    logger.error(f"Error while processing template {template.get('id', 'unknown')}: {e}")
                    continue

    def find_best_template_with_reranking(self,
                                          transcription: str,
                                          initial_candidates: int = 10,
                                          final_results: int = 3) -> List[Dict]:
        """Optimal search with cross-encoder reranking."""

        # 1. NER extraction on the transcription
        query_entities = self.ner_extractor.extract_entities(transcription)

        # 2. Enriched embedding generation
        query_embedding = self.embedding_generator.generate_embedding(transcription, query_entities)

        # 3. Automatic filters based on the extracted entities
        entity_filters = {}
        if query_entities.exam_types:
            entity_filters['exam_types'] = [entity[0] for entity in query_entities.exam_types]
        if query_entities.specialties:
            entity_filters['specialties'] = [entity[0] for entity in query_entities.specialties]
        if query_entities.anatomical_regions:
            entity_filters['anatomical_regions'] = [entity[0] for entity in query_entities.anatomical_regions]

        # 4. Initial vector search
        initial_candidates_results = self.vector_db.advanced_search(
            query_embedding=query_embedding,
            n_results=initial_candidates,
            entity_filters=entity_filters,
            confidence_threshold=0.6
        )

        # 5. Cross-encoder reranking
        if len(initial_candidates_results) > final_results:
            final_results_reranked = self.embedding_generator.cross_encoder_rerank(
                query=transcription,
                candidates=initial_candidates_results,
                top_k=final_results
            )
        else:
            final_results_reranked = initial_candidates_results

        # 6. Enrich the results with the NER details
        for result in final_results_reranked:
            result['query_entities'] = {
                'exam_types': query_entities.exam_types,
                'specialties': query_entities.specialties,
                'anatomical_regions': query_entities.anatomical_regions,
                'pathologies': query_entities.pathologies
            }

        return final_results_reranked
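
    # End-to-end usage sketch (illustrative; mirrors main() at the bottom of
    # this file):
    #   processor = AdvancedMedicalTemplateProcessor()
    #   processor.process_templates_batch(dataset_path="dataset.json", fine_tune_ner=True)
    #   matches = processor.find_best_template_with_reranking("bilan gonalgies droites")
    #   best = matches[0]  # dict with 'id', 'document', 'similarity_score',
    #                      # 'query_entities' and, when reranked, 'final_score'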

    def evaluate_ner_performance(self, test_dataset_path: str = None) -> Dict[str, float]:
        """Evaluate the performance of the fine-tuned NER model."""

        if test_dataset_path is None:
            logger.warning("No test dataset provided for evaluation")
            return {}

        test_data = self.ner_extractor.load_dataset(test_dataset_path)
        if not test_data:
            logger.error("Empty test dataset")
            return {}

        correct_predictions = 0
        total_entities = 0
        entity_type_stats = {}

        for sample in tqdm(test_data, desc="NER evaluation"):
            text = sample['text']
            true_entities = sample['labels']

            # Prediction
            predicted_entities = self.ner_extractor.extract_entities(text)

            # Convert to a comparable format
            predicted_dict = {
                'exam_types': [entity[0].lower() for entity in predicted_entities.exam_types],
                'specialties': [entity[0].lower() for entity in predicted_entities.specialties],
                'anatomical_regions': [entity[0].lower() for entity in predicted_entities.anatomical_regions],
                'pathologies': [entity[0].lower() for entity in predicted_entities.pathologies],
                'procedures': [entity[0].lower() for entity in predicted_entities.medical_procedures],
                'measurements': [entity[0].lower() for entity in predicted_entities.measurements],
                'medications': [entity[0].lower() for entity in predicted_entities.medications],
                'symptoms': [entity[0].lower() for entity in predicted_entities.symptoms]
            }

            # Comparison
            for entity_type, true_entities_list in true_entities.items():
                if entity_type in predicted_dict:
                    predicted_entities_list = predicted_dict[entity_type]

                    # Per-entity-type statistics
                    if entity_type not in entity_type_stats:
                        entity_type_stats[entity_type] = {'correct': 0, 'total': 0}

                    true_entities_lower = [entity.lower() for entity in true_entities_list]

                    for true_entity in true_entities_lower:
                        total_entities += 1
                        entity_type_stats[entity_type]['total'] += 1

                        if true_entity in predicted_entities_list:
                            correct_predictions += 1
                            entity_type_stats[entity_type]['correct'] += 1

        # Metric computation (exact-match recall over the gold entities)
        overall_accuracy = correct_predictions / total_entities if total_entities > 0 else 0

        metrics = {
            'overall_accuracy': overall_accuracy,
            'total_entities': total_entities,
            'correct_predictions': correct_predictions
        }

        # Per-entity-type metrics
        for entity_type, stats in entity_type_stats.items():
            if stats['total'] > 0:
                accuracy = stats['correct'] / stats['total']
                metrics[f'{entity_type}_accuracy'] = accuracy
                metrics[f'{entity_type}_total'] = stats['total']

        logger.info(f"NER evaluation finished - overall accuracy: {overall_accuracy:.4f}")

        return metrics

    def export_processed_templates(self, output_path: str = "processed_templates.json"):
        """Export the processed templates with their embeddings and entities."""

        try:
            # Fetch every template from the vector database
            all_results = self.vector_db.collection.get(
                include=["documents", "metadatas", "embeddings"]
            )

            processed_templates = []

            for i in range(len(all_results['ids'])):
                embeddings = all_results.get('embeddings')
                template_data = {
                    'id': all_results['ids'][i],
                    'text': all_results['documents'][i],
                    'metadata': all_results['metadatas'][i],
                    # Embeddings may come back as numpy arrays; make them JSON-safe
                    'embedding': list(embeddings[i]) if embeddings is not None else None
                }
                processed_templates.append(template_data)

            # Save to disk
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(processed_templates, f, ensure_ascii=False, indent=2)

            logger.info(f"Processed templates exported to {output_path}")
            logger.info(f"Number of exported templates: {len(processed_templates)}")

        except Exception as e:
            logger.error(f"Error during export: {e}")


# Analysis and debugging utilities
class MedicalNERAnalyzer:
    """Analysis and debugging tools for the medical NER system."""

    def __init__(self, processor: AdvancedMedicalTemplateProcessor):
        self.processor = processor

    def analyze_text(self, text: str) -> Dict:
        """Full analysis of a medical text."""

        # NER extraction
        entities = self.processor.ner_extractor.extract_entities(text)

        # Embedding generation
        embedding = self.processor.embedding_generator.generate_embedding(text, entities)

        # Statistics
        analysis = {
            'text': text,
            'text_length': len(text),
            'entities': {
                'exam_types': entities.exam_types,
                'specialties': entities.specialties,
                'anatomical_regions': entities.anatomical_regions,
                'pathologies': entities.pathologies,
                'procedures': entities.medical_procedures,
                'measurements': entities.measurements,
                'medications': entities.medications,
                'symptoms': entities.symptoms
            },
            'embedding_shape': embedding.shape,
            'entity_count_total': sum([
                len(entities.exam_types),
                len(entities.specialties),
                len(entities.anatomical_regions),
                len(entities.pathologies),
                len(entities.medical_procedures),
                len(entities.measurements),
                len(entities.medications),
                len(entities.symptoms)
            ]),
            'confidence_scores': {
                'exam_types': [conf for _, conf in entities.exam_types],
                'specialties': [conf for _, conf in entities.specialties],
                'anatomical_regions': [conf for _, conf in entities.anatomical_regions],
                'pathologies': [conf for _, conf in entities.pathologies]
            }
        }

        return analysis

    def compare_entities(self, text1: str, text2: str) -> Dict:
        """Compare the entities extracted from two texts."""

        entities1 = self.processor.ner_extractor.extract_entities(text1)
        entities2 = self.processor.ner_extractor.extract_entities(text2)

        def entities_to_set(entities):
            all_entities = set()
            for entity_list in [entities.exam_types, entities.specialties,
                                entities.anatomical_regions, entities.pathologies]:
                for entity, _ in entity_list:
                    all_entities.add(entity.lower())
            return all_entities

        set1 = entities_to_set(entities1)
        set2 = entities_to_set(entities2)

        return {
            'text1_entities': list(set1),
            'text2_entities': list(set2),
            'common_entities': list(set1.intersection(set2)),
            'unique_to_text1': list(set1.difference(set2)),
            'unique_to_text2': list(set2.difference(set1)),
            # Jaccard similarity between the two entity sets
            'similarity_ratio': len(set1.intersection(set2)) / len(set1.union(set2)) if set1.union(set2) else 0
        }

    def generate_entity_report(self, dataset_path: str) -> Dict:
        """Generate a statistical report on the dataset's entities."""

        dataset = self.processor.ner_extractor.load_dataset(dataset_path)

        entity_stats = {
            'exam_types': {},
            'specialties': {},
            'anatomical_regions': {},
            'pathologies': {},
            'procedures': {},
            'measurements': {},
            'medications': {},
            'symptoms': {}
        }

        total_samples = len(dataset)

        for sample in tqdm(dataset, desc="Dataset analysis"):
            labels = sample.get('labels', {})

            for entity_type, entities in labels.items():
                if entity_type in entity_stats:
                    for entity in entities:
                        entity_lower = entity.lower()
                        if entity_lower not in entity_stats[entity_type]:
                            entity_stats[entity_type][entity_lower] = 0
                        entity_stats[entity_type][entity_lower] += 1

        # Build the report
        report = {
            'total_samples': total_samples,
            'entity_statistics': {}
        }

        for entity_type, entity_counts in entity_stats.items():
            if entity_counts:
                sorted_entities = sorted(entity_counts.items(), key=lambda x: x[1], reverse=True)
                report['entity_statistics'][entity_type] = {
                    'unique_count': len(entity_counts),
                    'total_occurrences': sum(entity_counts.values()),
                    'top_10': sorted_entities[:10],
                    'average_occurrences': sum(entity_counts.values()) / len(entity_counts)
                }

        return report


# Advanced usage example
def main():
    """Example use of the advanced system, including fine-tuning."""

    # Initialize the advanced processor
    processor = AdvancedMedicalTemplateProcessor()

    # 1. Process the templates with NER fine-tuning
    print("=== STEP 1: Processing and fine-tuning ===")
    processor.process_templates_batch(
        dataset_path="dataset.json",
        fine_tune_ner=True,  # Enable fine-tuning
        batch_size=8
    )

    # 2. Evaluate the NER performance (optional, if a test dataset is available)
    print("\n=== STEP 2: Performance evaluation ===")
    # metrics = processor.evaluate_ner_performance("test_dataset.json")
    # print(f"Evaluation metrics: {metrics}")

    # 3. Analyze a medical text
    print("\n=== STEP 3: Text analysis ===")
    analyzer = MedicalNERAnalyzer(processor)

    test_text = """madame bacon nicole bilan œdème droit gonalgies ostéophytes
    incontinence veineuse modérée portions surale droite crurale gauche saphéniennes"""

    analysis = analyzer.analyze_text(test_text)
    print("Text analysis:")
    print(f"- Total number of entities: {analysis['entity_count_total']}")
    print(f"- Detected exam types: {analysis['entities']['exam_types']}")
    print(f"- Anatomical regions: {analysis['entities']['anatomical_regions']}")
    print(f"- Pathologies: {analysis['entities']['pathologies']}")

    # 4. Search with reranking
    print("\n=== STEP 4: Search with reranking ===")
    best_matches = processor.find_best_template_with_reranking(
        transcription=test_text,
        initial_candidates=15,
        final_results=3
    )

    # Display the results
    for i, match in enumerate(best_matches):
        print(f"\n--- Match {i+1} ---")
        print(f"Template ID: {match['id']}")
        print(f"Final score: {match.get('final_score', match['similarity_score']):.4f}")
        print(f"Cross-encoder score: {match.get('cross_encoder_score', 'N/A')}")
        print(f"Text excerpt: {match['document'][:200]}...")

        # Display the entities detected in the query
        query_entities = match.get('query_entities', {})
        for entity_type, entities in query_entities.items():
            if entities:
                print(f"  - {entity_type}: {[f'{e[0]} ({e[1]:.2f})' for e in entities[:3]]}")

    # 5. Export the processed templates
    print("\n=== STEP 5: Export the results ===")
    processor.export_processed_templates("processed_medical_templates.json")

    # 6. Generate a dataset report
    print("\n=== STEP 6: Dataset report ===")
    report = analyzer.generate_entity_report("dataset.json")
    print(f"Report generated for {report['total_samples']} samples")

    for entity_type, stats in report['entity_statistics'].items():
        if stats['unique_count'] > 0:
            print(f"\n{entity_type.upper()}:")
            print(f"  - Unique entities: {stats['unique_count']}")
            print(f"  - Total occurrences: {stats['total_occurrences']}")
            print(f"  - Top 3: {stats['top_10'][:3]}")


if __name__ == "__main__":
    main()
document_assembler.py
ADDED
@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""
Document Assembler
Handles creating medical documents by inserting sections into Word templates
"""

import os
import re
from datetime import datetime
from typing import Dict, Any, List
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import AgentExecutor, create_openai_tools_agent


@tool
def create_medical_document(template_path: str, sections_text: str, title: str, output_path: str) -> str:
    """Create a medical document by inserting sections into a Word template."""
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template file not found: {template_path}")

    doc = Document(template_path)

    # Parse sections from text
    sections = {}
    current_section = None
    current_content = []
    for line in sections_text.split('\n'):
        line = line.strip()
        if not line:
            continue
        line_lower = line.lower().replace('é', 'e').replace('è', 'e').replace('à', 'a')
        if any(keyword in line_lower for keyword in ['technique', 'resultat', 'conclusion', 'indication']):
            if current_section:
                sections[current_section] = '\n'.join(current_content).strip()
            current_section = line
            current_content = []
        elif current_section:
            current_content.append(line)
    if current_section and current_content:
        sections[current_section] = '\n'.join(current_content).strip()

    # First, check if there's a "Titre" section in the template and insert the title there
    title_section_found = False
    for idx, paragraph in enumerate(doc.paragraphs):
        para_text = paragraph.text.strip()
        para_norm = para_text.lower().replace('é', 'e').replace('è', 'e').replace(
            'à', 'a').replace(':', '').replace('\xa0', ' ').strip()

        # Check if this is a title section (case insensitive)
        if 'titre' in para_norm:
            print(f"🎯 Found title section in template: '{para_text}' at index {idx}")
            # Clear the paragraph and insert the generated title
            paragraph.clear()
            paragraph.text = title
            # Apply formatting to make it stand out
            for run in paragraph.runs:
                run.font.bold = True
                run.font.size = Pt(14)
            title_section_found = True
            break

    # If no title section found, add a header with the dynamic title
    if not title_section_found:
        print("📝 No title section found in template, adding header...")
        header_para = doc.paragraphs[0].insert_paragraph_before()
        header_text = f"{title}\nDate: {datetime.now().strftime('%d/%m/%Y')}\nHeure: {datetime.now().strftime('%H:%M')}\n{'='*40}"
        header_para.text = header_text
        for run in header_para.runs:
            run.font.bold = True
            run.font.size = Pt(14)
        header_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Locate section titles in the template
    section_indices = {}
    for idx, paragraph in enumerate(doc.paragraphs):
        para_text = paragraph.text.strip()
        para_norm = para_text.lower().replace('é', 'e').replace('è', 'e').replace(
            'à', 'a').replace(':', '').replace('\xa0', ' ').strip()
        for section_name in sections.keys():
            section_norm = section_name.lower().replace('é', 'e').replace(
                'è', 'e').replace('à', 'a').replace(':', '').strip()
            if (section_norm in para_norm and len(section_norm) > 0 and len(para_norm) > 0):
                section_indices[section_name] = idx
    print("DEBUG section_indices:", section_indices)
    print("DEBUG sections.keys():", list(sections.keys()))

    # For each section found, remove the content between this title and the
    # next title, then insert the generated content
    sorted_sections = sorted(section_indices.items(), key=lambda x: x[1])
    for i, (section_name, idx) in enumerate(sorted_sections):
        # Determine the end of the section (before the next title or end of doc)
        start = idx + 1
        if i + 1 < len(sorted_sections):
            end = sorted_sections[i+1][1]
        else:
            end = len(doc.paragraphs)
        # Remove paragraphs between start and end
        for j in range(end-1, start-1, -1):
            p = doc.paragraphs[j]
            if p.text.strip():
                p.clear()
        # Insert the content right after the title
        if sections[section_name]:
            new_para = doc.paragraphs[idx+1] if (idx+1 < len(doc.paragraphs)) else doc.add_paragraph()
            new_para.text = sections[section_name]

    doc.save(output_path)
    return f"Document created successfully: {output_path}"
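
# Expected shape of sections_text (illustrative): section headings on their
# own lines, detected by the keywords above, each followed by its body, e.g.
#   Indication :
#   Bilan de gonalgies droites.
#   Technique :
#   Echographie du genou droit.
#   Conclusion :
#   Pas d'epanchement articulaire.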

def create_document_assembler_agent(llm):
    """Create the document assembler agent."""
    document_assembler_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a medical document assembler.
        Create medical documents by inserting sections into Word templates.
        Use the provided title for the document header and insert sections in the correct locations."""),
        ("human",
         "Create a medical document with template {template_path}, sections content: {sections_text}, title: {title}, and save to {output_path}"),
        MessagesPlaceholder("agent_scratchpad")
    ])

    document_assembler_agent = create_openai_tools_agent(
        llm=llm,
        tools=[create_medical_document],
        prompt=document_assembler_prompt
    )

    document_assembler_executor = AgentExecutor(
        agent=document_assembler_agent,
        tools=[create_medical_document],
        verbose=True
    )

    return document_assembler_executor
document_validator.py
ADDED
@@ -0,0 +1,120 @@
#!/usr/bin/env python3
"""
Document Validator
Validates generated medical documents against original transcriptions
"""

import re
from typing import Dict, Any, List
from docx import Document
from langchain.prompts import ChatPromptTemplate


def validate_generated_document(template_path: str, transcription_path: str, generated_doc_path: str) -> Dict[str, Any]:
    """Validate that the generated document contains all important content from the transcription."""
    from template_analyzer import analyze_word_template
    from transcription_processor import load_transcription

    # Extract content from the generated document
    doc = Document(generated_doc_path)
    generated_content = []
    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        if text and not text.startswith("Date:") and not text.startswith("Heure:"):
            generated_content.append(text)
    generated_text = "\n".join(generated_content)

    # Load the transcription
    transcription_text = load_transcription(transcription_path)

    # Extract medical entities from both texts
    def extract_medical_entities(text: str) -> List[str]:
        patterns = [
            r'\d+(?:\.\d+)?\s*(?:mm|cm|kg|cc|ml|g|mg)',  # Measurements
            r'\b(?:rein|vessie|foie|rate|poumon|coeur|cerveau|muscle|tendon|os|articulation)\b',
            r'\b(?:lithiase|calcification|tendinopathie|inflammation|dilatation|normal|anormal)\b',
            r'\b(?:échographie|radiographie|scanner|irm|examen)\b',
        ]
        entities = []
        for pattern in patterns:
            matches = re.findall(pattern, text.lower())
            entities.extend(matches)
        return list(set(entities))

    transcription_entities = extract_medical_entities(transcription_text)
    generated_entities = extract_medical_entities(generated_text)

    # Calculate coverage
    missing_entities = [
        entity for entity in transcription_entities if entity not in generated_entities]
    coverage_percentage = ((len(transcription_entities) - len(missing_entities)) /
                           len(transcription_entities) * 100) if transcription_entities else 100

    # Validate structure
    template_analysis = analyze_word_template(template_path)
    template_sections = [section['text']
                         for section in template_analysis.get('sections', [])]

    found_sections = []
    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        for template_section in template_sections:
            template_clean = template_section.lower().replace(
                '\xa0', ' ').replace(':', '').strip()
            text_clean = text.lower().replace(':', '').strip()
            if template_clean in text_clean or text_clean in template_clean:
                found_sections.append(template_section)
                break

    missing_sections = [
        s for s in template_sections if s not in found_sections]
    structure_valid = len(missing_sections) == 0

    # Overall score
    structure_score = 1.0 if structure_valid else 0.5
    entities_score = coverage_percentage / 100
    overall_score = (structure_score + entities_score) / 2

    validation_result = {
        "overall_score": overall_score,
        "structure_valid": structure_valid,
        "entities_coverage": coverage_percentage,
        "missing_sections": missing_sections,
        "missing_entities": missing_entities,
        "transcription_entities_count": len(transcription_entities),
        "generated_entities_count": len(generated_entities),
        "found_sections": found_sections,
        "template_sections": template_sections
    }

    return validation_result
| 92 |
+
|
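The scoring above is simple enough to verify by hand; a small sketch of the same arithmetic, outside the pipeline:

# Worked example of the coverage and overall-score formulas above.
transcription_entities, missing = 10, 2
coverage = (transcription_entities - missing) / transcription_entities * 100   # 80.0
overall = (0.5 + coverage / 100) / 2   # 0.65 when at least one section is missing
print(coverage, overall)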
def create_validation_chain(llm):
    """Create the validation chain."""
    validation_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a medical document validation expert.
Analyze if the generated medical document contains all important medical information from the original transcription.

Provide a brief validation summary with:
- Overall quality assessment
- Missing important information (if any)
- Key recommendations"""),
        ("human", """Validate the content coverage between the original transcription and the generated document.

ORIGINAL TRANSCRIPTION:
{transcription}

GENERATED DOCUMENT CONTENT:
{generated_content}

VALIDATION METRICS:
- Structure Valid: {structure_valid}
- Entities Coverage: {entities_coverage:.1f}%
- Missing Sections: {missing_sections}
- Missing Entities: {missing_entities}

Provide a concise validation summary.""")
    ])

    return validation_prompt | llm
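A minimal sketch of running the validator standalone; all three paths are hypothetical placeholders:

# Hypothetical standalone validation run.
result = validate_generated_document(
    "models/example_template.doc",   # template used for generation
    "transcription.txt",             # original transcription
    "report.docx"                    # generated document
)
print(f"Score: {result['overall_score']:.1%}, coverage: {result['entities_coverage']:.1f}%")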
langchain_medical_agents_refactored.py
ADDED
@@ -0,0 +1,335 @@
#!/usr/bin/env python3
"""
LangChain Medical Agents Architecture - Refactored
A multi-agent system for processing medical transcriptions and documents.
"""

import os
import re
from datetime import datetime
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

# Import modular components
from models import TemplateAnalysis, MedicalTranscription, SectionContent, InsertSectionsInput
from sftp_agent import create_sftp_downloader_agent, download_model_from_sftp
from template_analyzer import create_template_analyzer_agent, analyze_word_template
from transcription_processor import (
    create_transcription_corrector_chain,
    create_medical_analyzer_chain,
    create_title_generator_chain,
    load_transcription_with_user_id
)
from section_generator import create_dynamic_section_prompt, fix_section_names
from document_assembler import create_document_assembler_agent
from document_validator import validate_generated_document, create_validation_chain

# Load environment variables
load_dotenv()

# Initialize LLM with Azure OpenAI
llm = AzureChatOpenAI(
    azure_deployment="gtp-4o-eastus2",
    openai_api_version="2024-02-15-preview",
    azure_endpoint="https://voxist-gpt-eastus2.openai.azure.com/",
    api_key="98db8190a2ff438b904c7e9862a13210",
    temperature=0.1
)


class MedicalDocumentOrchestrator:
    """Main orchestrator that coordinates all agents."""

    def __init__(self, template_path: str = None, transcription_path: str = None, transcriptions_dir: str = "transcriptions"):
        self.template_path = template_path
        self.transcription_path = transcription_path
        self.transcriptions_dir = transcriptions_dir
        self.template_analysis = None
        self.corrected_transcription = None
        self.medical_data = None
        self.generated_sections = None
        self.generated_title = None
        self.downloaded_models = None

    def run_full_pipeline(self, output_path: str = None) -> str:
        """Run the complete medical document processing pipeline."""
        print("🚀 Starting LangChain Medical Document Pipeline...")

        # Step 0: Download only the model corresponding to the transcription
        print("\n📥 Step 0: Downloading model from SFTP for the selected transcription...")
        try:
            transcription_filename = os.path.basename(self.transcription_path)
            match = re.search(r'transcriptions_(.+)\.rtf_',
                              transcription_filename)
            if match:
                model_id = match.group(1)
                model_filename = f"{model_id}.rtf"
                local_filename = f"{model_id}.doc"
                local_template_path = os.path.join("models", local_filename)
                print(f"🔎 Model identifier for this transcription: {model_id}")

                # Download only the required model via a simple agent
                simple_sftp_agent = create_openai_tools_agent(
                    llm=llm,
                    tools=[download_model_from_sftp],
                    prompt=ChatPromptTemplate.from_messages([
                        ("system", "You are an SFTP downloader. Download the specified model file."),
                        ("human", "Download the model file: {model_filename}"),
                        MessagesPlaceholder("agent_scratchpad")
                    ])
                )
                simple_sftp_executor = AgentExecutor(
                    agent=simple_sftp_agent,
                    tools=[download_model_from_sftp],
                    verbose=True
                )

                result = simple_sftp_executor.invoke({
                    "model_filename": model_filename
                })
                print(
                    f"✅ Model downloaded and available as: {local_template_path}")
                self.template_path = local_template_path
                self.downloaded_models = [{
                    'model_id': model_id,
                    'model_filename': model_filename,
                    'local_filename': local_filename,
                    'local_path': local_template_path,
                    'status': 'success'
                }]
            else:
                raise ValueError(
                    "Unable to extract the model identifier from the transcription filename.")
        except Exception as e:
            print(f"❌ Error during SFTP download step: {str(e)}")
            if self.template_path:
                print("⚠️ Continuing with pipeline using the provided template_path...")
            else:
                print(
                    "❌ No template path provided and SFTP download failed. Cannot continue.")
                raise Exception(
                    "Cannot continue without a template. SFTP download failed and no template path was provided.")
            self.downloaded_models = []

        # Step 1: Analyze template
        print("\n📋 Step 1: Analyzing template...")
        if not self.template_path:
            raise ValueError("No template path available for analysis")
        self.template_analysis = analyze_word_template(self.template_path)
        print(
            f"✅ Template analyzed: {len(self.template_analysis.get('sections', []))} sections found")

        # Step 2: Load and correct transcription
        print("\n✏️ Step 2: Correcting transcription...")
        raw_transcription, user_id = load_transcription_with_user_id(
            self.transcription_path)
        transcription_corrector_chain = create_transcription_corrector_chain(
            llm)
        self.corrected_transcription = transcription_corrector_chain.invoke({
            "transcription": raw_transcription
        }).content
        # Debug: print the corrected transcription for inspection
        print("\n===== Transcription after correction =====")
        print(self.corrected_transcription)

        print("✅ Transcription corrected")

        # Step 3: Analyze medical data
        print("\n🔬 Step 3: Analyzing medical data...")
        medical_analyzer_chain = create_medical_analyzer_chain(llm)
        self.medical_data = medical_analyzer_chain.invoke({
            "corrected_transcription": self.corrected_transcription
        }).content
        print("✅ Medical data analyzed")

        # Step 4: Generate title
        print("\n📝 Step 4: Generating title...")
        title_generator_chain = create_title_generator_chain(llm)
        self.generated_title = title_generator_chain.invoke({
            "medical_data": self.medical_data
        }).content
        print(f"✅ Title generated: {self.generated_title}")

        # Step 5: Generate sections
        print("\n📝 Step 5: Generating sections...")

        # Extract sections from template analysis
        template_sections = []

        # Debug: see exactly what template_analysis contains
        print("\n--- DEBUG: Type and content of template_analysis ---")
        print(f"Type: {type(self.template_analysis)}")
        print(f"Content: {self.template_analysis}")
        if hasattr(self.template_analysis, '__dict__'):
            print(f"Attributes: {self.template_analysis.__dict__}")
        print("--- END DEBUG ---\n")

        # Always retrieve the sections list if possible
        try:
            if isinstance(self.template_analysis, dict) and 'sections' in self.template_analysis:
                template_sections = [section['text']
                                     for section in self.template_analysis['sections']]
            elif hasattr(self.template_analysis, 'get') and 'sections' in self.template_analysis:
                template_sections = [section['text']
                                     for section in self.template_analysis['sections']]
            elif hasattr(self.template_analysis, 'output') and isinstance(self.template_analysis.output, dict) and 'sections' in self.template_analysis.output:
                template_sections = [section['text']
                                     for section in self.template_analysis.output['sections']]
        except Exception as e:
            print('Error extracting sections:', e)
        # Fallback: try to extract from the agent response text
        if not template_sections:
            response_text = str(self.template_analysis)
            if 'Technique' in response_text and 'Résultat' in response_text and 'Conclusion' in response_text:
                template_sections = ['Technique\xa0:',
                                     'Résultat\xa0:', 'Conclusion\xa0:']
            elif 'CONCLUSION' in response_text:
                template_sections = ['CONCLUSION\xa0:']

        # Create dynamic prompt based on template sections
        dynamic_section_prompt = create_dynamic_section_prompt(
            template_sections)
        section_generator_chain = dynamic_section_prompt | llm

        generated_content = section_generator_chain.invoke({
            "template_sections": template_sections,
            "medical_data": self.medical_data,
            "corrected_transcription": self.corrected_transcription
        }).content

        # Post-process to ensure exact section names are used
        self.generated_sections = fix_section_names(
            generated_content, template_sections)
        print("\n--- DEBUG: Generated sections ---")
        print(self.generated_sections)
        print("--- END DEBUG ---\n")
        print("\n--- DEBUG: Template sections ---")
        print(template_sections)
        print("--- END DEBUG ---\n")
        print("\n--- DEBUG: Generated title ---")
        print(self.generated_title)
        print("--- END DEBUG ---\n")

        # Step 6: Assemble document
        print("\n📄 Step 6: Assembling document...")
        if output_path is None:
            # Generate output filename based on user_id:
            # replace the last extension with .docx
            if '.' in user_id:
                # Split by dots and replace the last part with docx
                parts = user_id.split('.')
                parts[-1] = 'docx'
                output_filename = '.'.join(parts)
            else:
                # If no extension, just add .docx
                output_filename = f"{user_id}.docx"

            output_path = output_filename

        # Use the agent for assembly
        document_assembler_executor = create_document_assembler_agent(llm)
        result = document_assembler_executor.invoke({
            "template_path": self.template_path,
            "sections_text": self.generated_sections,
            "title": self.generated_title,
            "output_path": output_path
        })

        print(f"🎉 Pipeline completed! Document saved: {output_path}")

        # Step 7: Validate document
        print("\n📋 Step 7: Validating document...")
        validation_result = validate_generated_document(
            self.template_path, self.transcription_path, output_path)

        # Display validation results
        print("\n" + "=" * 60)
        print("📊 VALIDATION RESULTS")
        print("=" * 60)

        # Overall score
        score = validation_result["overall_score"]
        score_emoji = "🟢" if score >= 0.8 else "🟡" if score >= 0.6 else "🔴"
        print(f"{score_emoji} Overall Score: {score:.1%}")

        # Structure validation
        structure_valid = validation_result["structure_valid"]
        structure_emoji = "✅" if structure_valid else "❌"
        print(f"{structure_emoji} Structure Valid: {structure_valid}")

        if not structure_valid:
            missing = validation_result["missing_sections"]
            print(f"   Missing sections: {', '.join(missing)}")

        # Entities validation
        entities_coverage = validation_result["entities_coverage"]
        entities_emoji = "✅" if entities_coverage >= 80 else "⚠️"
        print(f"{entities_emoji} Medical Entities Coverage: {entities_coverage:.1f}%")

        if entities_coverage < 80:
            missing_entities = validation_result["missing_entities"][:5]
            print(f"   Missing entities: {', '.join(missing_entities)}")

        # Generate AI validation report
        print("\n📝 AI Validation Report:")
        print("-" * 40)

        # Extract content for AI validation
        from docx import Document
        doc = Document(output_path)
        generated_content = []
        for paragraph in doc.paragraphs:
            text = paragraph.text.strip()
            if text and not text.startswith("Date:") and not text.startswith("Heure:"):
                generated_content.append(text)
        generated_text = "\n".join(generated_content)

        validation_chain = create_validation_chain(llm)
        ai_validation = validation_chain.invoke({
            "transcription": self.corrected_transcription,
            "generated_content": generated_text,
            "structure_valid": structure_valid,
            "entities_coverage": entities_coverage,
            "missing_sections": validation_result["missing_sections"],
            "missing_entities": validation_result["missing_entities"]
        })

        print(ai_validation.content)
        print("\n" + "=" * 60)

        print("✅ Document validated")

        # Remove the local model after validation
        try:
            if self.template_path and os.path.exists(self.template_path):
                os.remove(self.template_path)
                print(f"🗑️ Deleted local model file: {self.template_path}")
        except Exception as e:
            print(f"⚠️ Could not delete local model file: {e}")

        return output_path


def main():
    """Main function to run the LangChain medical document pipeline."""
    print("🏥 LangChain Medical Document Agents - Refactored")
    print("=" * 60)

    # Initialize orchestrator
    orchestrator = MedicalDocumentOrchestrator(
        template_path="default.528.251014072.doc",
        transcription_path="transciption.txt"
    )

    # Run the complete pipeline
    output_file = orchestrator.run_full_pipeline()

    print(f"\n✅ Final document: {output_file}")
    print("🎉 LangChain pipeline completed successfully!")


if __name__ == "__main__":
    main()
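Both filename conventions used above are easy to sanity-check in isolation. A small sketch follows; the filenames are hypothetical, shaped to match the Step 0 regex and the Step 6 derivation:

# Step 0: extract the model identifier from a transcription filename.
import re
name = "transcriptions_mod.6272.mauberton.MODELE.RADIO.rtf_20250903.txt"  # hypothetical
m = re.search(r'transcriptions_(.+)\.rtf_', name)
print(m.group(1))  # mod.6272.mauberton.MODELE.RADIO

# Step 6: derive the output filename from a user_id by swapping the last extension.
user_id = "mod.6272.mauberton.MODELE.RADIO.txt"  # hypothetical
parts = user_id.split('.')
parts[-1] = 'docx'
print('.'.join(parts))  # mod.6272.mauberton.MODELE.RADIO.docx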
main.py
ADDED
@@ -0,0 +1,361 @@
"""
main.py
Main entry point for the hybrid template matching system
"""

import os
import sys
import logging
from pathlib import Path

# Import the project modules
from template_db_creation import MedicalTemplateParser, TemplateInfo
from smart_match import TranscriptionMatcher
from title_matcher import HybridMatcher

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def load_transcription_file(filepath: str) -> tuple:
    """
    Load a transcription from a file

    Args:
        filepath: Path to the file

    Returns:
        tuple: (content, filename)
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        filename = os.path.basename(filepath)
        logger.info(f"✅ Transcription chargée: {filename}")
        return content, filename
    except Exception as e:
        logger.error(f"❌ Erreur lecture fichier: {e}")
        return None, None


def save_result(result, output_path: str):
    """
    Save the result to a file

    Args:
        result: MatchResult to save
        output_path: Path of the output file
    """
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("="*80 + "\n")
            f.write(f"TEMPLATE: {result.template_id}\n")
            f.write(f"MÉTHODE: {result.match_method}\n")
            f.write(f"CONFIANCE: {result.confidence_score:.2%}\n")
            f.write("="*80 + "\n\n")
            f.write(result.filled_template)
        logger.info(f"✅ Résultat sauvegardé: {output_path}")
    except Exception as e:
        logger.error(f"❌ Erreur sauvegarde: {e}")


def batch_process_directory(hybrid_matcher, input_dir: str, output_dir: str):
    """
    Process every file in a directory

    Args:
        hybrid_matcher: HybridMatcher instance
        input_dir: Directory containing the transcriptions
        output_dir: Output directory
    """
    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)

    # List the input files
    input_path = Path(input_dir)
    transcription_files = list(input_path.glob("*.txt")) + list(input_path.glob("*.rtf"))

    logger.info(f"\n{'='*80}")
    logger.info(f"📁 TRAITEMENT PAR LOT - {len(transcription_files)} fichiers")
    logger.info(f"{'='*80}\n")

    results_summary = []

    for i, filepath in enumerate(transcription_files, 1):
        logger.info(f"\n{'─'*80}")
        logger.info(f"📄 Fichier {i}/{len(transcription_files)}: {filepath.name}")
        logger.info(f"{'─'*80}")

        # Load the transcription
        content, filename = load_transcription_file(str(filepath))
        if not content:
            continue

        # Match and fill
        results = hybrid_matcher.match_and_fill(
            transcription=content,
            transcription_filename=filename
        )

        if results:
            result = results[0]

            # Save the result
            output_filename = f"{filepath.stem}_filled.txt"
            output_path = os.path.join(output_dir, output_filename)
            save_result(result, output_path)

            # Add to the summary
            results_summary.append({
                'filename': filename,
                'template': result.template_id,
                'method': result.match_method,
                'confidence': result.confidence_score
            })
        else:
            logger.warning(f"⚠️ Aucun résultat pour {filename}")
            results_summary.append({
                'filename': filename,
                'template': 'NONE',
                'method': 'FAILED',
                'confidence': 0.0
            })

    # Print the summary
    print("\n" + "="*80)
    print("📊 RÉSUMÉ DU TRAITEMENT PAR LOT")
    print("="*80)
    for item in results_summary:
        print(f"📄 {item['filename']}")
        print(f"   → Template: {item['template']}")
        print(f"   → Méthode: {item['method']}")
        print(f"   → Confiance: {item['confidence']:.2%}")
        print()


def interactive_mode(hybrid_matcher):
    """
    Interactive mode for processing transcriptions one at a time

    Args:
        hybrid_matcher: HybridMatcher instance
    """
    # Sample transcription
    transcription_example = """
IRM pelvienne. Indication clinique. Technique. Acquisition sagittale, axiale et coronale T2, saturation axiale, diffusion axiale T1. Résultats. Présence d'un utérus antéversé médio-pelvien dont le grand axe mesure 72 mm sur 40 mm sur 40 mm. La zone jonctionnelle apparaît floue. Elle est épaissie de façon diffuse, asymétrique, avec une atteinte de plus de 50% de l'épaisseur du myomètre et comporte des spots en hypersignal T2, l'ensemble traduisant une adénomyose.
Pas d'épaississement cervical. À noter la présence d'un petit kyste liquidien de type Naboth.
Les 2 ovaires sont repérés, porteurs de formations folliculaires communes en hypersignal homogène T2 de petite taille. L'ovaire droit mesure 30 x 25 mm. L'ovaire gauche mesure 25 x 23 mm. Pas d'épanchement dans le cul-de-sac de Douglas.
Absence de foyer d'endométriose profonde. Conclusion.
Aspect d'adénomyose diffuse, symétrique, profonde.
Pas d'épaississement endométrial. Absence d'endométriome. Absence d'épanchement dans le cul-de-sac de Douglas.
"""

    while True:
        print("\n" + "="*80)
        print("🔧 MODE INTERACTIF - OPTIONS")
        print("="*80)
        print("1. Charger une transcription depuis un fichier (avec matching par titre)")
        print("2. Entrer une transcription manuellement (matching sémantique uniquement)")
        print("3. Utiliser l'exemple de transcription (matching sémantique)")
        print("4. Traitement par lot d'un répertoire")
        print("5. Quitter")
        print("="*80)

        choice = input("\n👉 Votre choix: ").strip()

        if choice == "1":
            # Load from a file
            filepath = input("📂 Chemin du fichier de transcription: ").strip()

            if not os.path.exists(filepath):
                print(f"❌ Fichier introuvable: {filepath}")
                continue

            content, filename = load_transcription_file(filepath)
            if not content:
                continue

            # Hybrid matching (title-based)
            results = hybrid_matcher.match_and_fill(
                transcription=content,
                transcription_filename=filename
            )

        elif choice == "2":
            # Manual input
            print("\n📝 Entrez la transcription (Ctrl+D ou Ctrl+Z pour terminer):")
            lines = []
            try:
                while True:
                    line = input()
                    lines.append(line)
            except EOFError:
                pass

            content = "\n".join(lines)
            if not content.strip():
                print("❌ Transcription vide")
                continue

            # Semantic matching only
            results = hybrid_matcher.match_and_fill(
                transcription=content,
                transcription_filename=None
            )

        elif choice == "3":
            # Built-in example
            content = transcription_example

            # Semantic matching only
            results = hybrid_matcher.match_and_fill(
                transcription=content,
                transcription_filename=None
            )

        elif choice == "4":
            # Batch processing
            input_dir = input("📂 Répertoire des transcriptions: ").strip()
            if not os.path.exists(input_dir):
                print(f"❌ Répertoire introuvable: {input_dir}")
                continue

            output_dir = input("📂 Répertoire de sortie: ").strip()

            batch_process_directory(hybrid_matcher, input_dir, output_dir)
            continue

        elif choice == "5":
            print("\n👋 Au revoir!")
            break

        else:
            print("❌ Choix invalide")
            continue

        # Display the results
        if results:
            for i, result in enumerate(results, 1):
                print(f"\n{'#'*80}")
                print(f"# RÉSULTAT {i}/{len(results)}")
                print(f"{'#'*80}")
                hybrid_matcher.semantic_matcher.display_result(result)

            # Offer to save the result
            save_choice = input("\n💾 Sauvegarder le résultat? (o/n): ").strip().lower()
            if save_choice == 'o':
                output_file = input("📄 Nom du fichier de sortie: ").strip()
                if output_file:
                    save_result(results[0], output_file)
        else:
            print("❌ Aucun résultat trouvé")


def main():
    """
    Main entry point
    """
    print("\n" + "="*80)
    print("🏥 SYSTÈME DE MATCHING HYBRIDE DE TEMPLATES MÉDICAUX")
    print("="*80)
    print("Version 2.0 - Matching par titre + Matching sémantique")
    print("="*80 + "\n")

    # Step 1: load the database
    db_path = input("📂 Chemin vers la base de données (.pkl): ").strip()

    if not os.path.exists(db_path):
        print(f"❌ Fichier introuvable: {db_path}")
        return

    print("\n🔄 Chargement de la base de données...")
    parser = MedicalTemplateParser()

    try:
        parser.load_database(db_path)
        print(f"✅ Base chargée: {len(parser.templates)} templates disponibles")
    except Exception as e:
        print(f"❌ Erreur lors du chargement: {e}")
        return

    # Step 2: initialize the matchers
    print("\n🔄 Initialisation des matchers...")

    try:
        semantic_matcher = TranscriptionMatcher(parser)
        print("✅ Matcher sémantique initialisé")

        hybrid_matcher = HybridMatcher(parser, semantic_matcher)
        print("✅ Matcher hybride initialisé")
    except Exception as e:
        print(f"❌ Erreur lors de l'initialisation: {e}")
        return

    # Step 3: check GPT availability
    if not semantic_matcher.llm:
        print("\n⚠️ ATTENTION: GPT n'est pas disponible")
        print("⚠️ Vérifiez que OPENAI_API_KEY est définie")
        print("⚠️ Le remplissage sera basique")

        continue_choice = input("\nContinuer quand même? (o/n): ").strip().lower()
        if continue_choice != 'o':
            return

    # Step 4: display database statistics
    print("\n" + "="*80)
    print("📊 STATISTIQUES DE LA BASE")
    print("="*80)

    # Count templates per type
    types_count = {}
    for template_id, template_info in parser.templates.items():
        template_type = template_info.type
        types_count[template_type] = types_count.get(template_type, 0) + 1

    print(f"📋 Total de templates: {len(parser.templates)}")
    print("\n📑 Répartition par type:")
    for template_type, count in sorted(types_count.items(), key=lambda x: x[1], reverse=True):
        print(f"   • {template_type}: {count}")

    print("="*80)

    # Step 5: choose the mode
    print("\n" + "="*80)
    print("🔧 MODE DE FONCTIONNEMENT")
    print("="*80)
    print("1. Mode interactif (traiter des transcriptions une par une)")
    print("2. Traitement par lot (traiter un répertoire entier)")
    print("3. Quitter")
    print("="*80)

    mode_choice = input("\n👉 Votre choix: ").strip()

    if mode_choice == "1":
        interactive_mode(hybrid_matcher)
    elif mode_choice == "2":
        input_dir = input("\n📂 Répertoire des transcriptions: ").strip()
        if not os.path.exists(input_dir):
            print(f"❌ Répertoire introuvable: {input_dir}")
            return

        output_dir = input("📂 Répertoire de sortie: ").strip()
        batch_process_directory(hybrid_matcher, input_dir, output_dir)
    elif mode_choice == "3":
        print("\n👋 Au revoir!")
    else:
        print("❌ Choix invalide")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n\n👋 Interruption par l'utilisateur. Au revoir!")
        sys.exit(0)
    except Exception as e:
        logger.error(f"❌ Erreur fatale: {e}", exc_info=True)
        sys.exit(1)
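A minimal non-interactive sketch of the same flow, assuming the database file and directories below exist; all paths are hypothetical:

# Hypothetical scripted equivalent of main(): load the DB, build the matchers, batch process.
from main import batch_process_directory
from template_db_creation import MedicalTemplateParser
from smart_match import TranscriptionMatcher
from title_matcher import HybridMatcher

parser = MedicalTemplateParser()
parser.load_database("templates/medical_templates.pkl")   # hypothetical .pkl path
matcher = HybridMatcher(parser, TranscriptionMatcher(parser))
batch_process_directory(matcher, "transcriptions_in", "filled_out")  # hypothetical dirs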
match_transcription.py
ADDED
@@ -0,0 +1,749 @@
import os
import json
import logging
import numpy as np
from typing import Dict, List, Optional, Tuple, Set
from dataclasses import dataclass
from pathlib import Path
import pickle
import re
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

DB_PATH = os.getenv("TEMPLATE_DB_PATH", "templates/medical_templates.pkl")
GPT_MODEL = os.getenv("GPT_MODEL", "gpt-5")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Only import these if absolutely necessary and add error handling
try:
    from langchain_openai import ChatOpenAI
    from langchain.prompts import ChatPromptTemplate
    HAS_LANGCHAIN = True
except ImportError:
    HAS_LANGCHAIN = False
    logging.warning("LangChain not available")

# Reuse the classes from the existing code
try:
    from template_db_creation import MedicalTemplateParser, TemplateInfo
except ImportError:
    logging.error("template_db_creation module not found")

@dataclass
class SectionMatch:
    """Represents the match for a single section"""
    section_name: str
    confidence: float
    extracted_content: str
    can_fill: bool
    missing_info: List[str]

@dataclass
class TemplateMatch:
    """Detailed result of matching one template"""
    template_id: str
    template_info: TemplateInfo
    overall_score: float
    type_match_score: float
    physician_match_score: float
    center_match_score: float
    content_match_score: float
    filename_match_score: float
    fillability_score: float
    section_matches: Dict[str, SectionMatch]
    confidence_level: str
    can_be_filled: bool
    filling_percentage: float
    missing_critical_info: List[str]
    extracted_data: Dict[str, str]
    filename_indicators: List[str]

@dataclass
class FilenameAnalysis:
    """Analysis of a medical document filename"""
    original_filename: str
    medical_keywords: List[str]
    document_type_indicators: List[str]
    specialty_indicators: List[str]
    center_indicators: List[str]
    anatomical_regions: List[str]
    procedure_type: Optional[str]
    confidence_score: float

class TemplateMatcher:
    """Matching system between transcriptions and medical templates"""

    def __init__(self, database_path: str = None):
        """Initialize the matcher with an existing database"""
        self.parser = None
        self.llm = None
        self.content_analyzer = None
        self.section_extractor = None
        self.filename_analyzer = None

        self._initialize_filename_keywords()
        self._initialize_gpt()

        if database_path and os.path.exists(database_path):
            self.load_database(database_path)
        else:
            logging.warning("Base de données non trouvée ou non spécifiée")

    def _initialize_filename_keywords(self):
        """Initialize the keyword lists used for filename analysis"""
        self.filename_keywords = {
            # Imaging exam types
            "imagerie": {
                "irm": ["irm", "mri", "resonance"],
                "scanner": ["scanner", "tdm", "ct", "tomodensitometrie"],
                "echographie": ["echo", "echographie", "doppler", "ultrasound"],
                "radiologie": ["radio", "radiologie", "rx", "xray"],
                "pet": ["pet", "tep", "scintigraphie"],
                "mammographie": ["mammo", "mammographie", "breast"]
            },

            # Medical specialties
            "specialites": {
                "cardiologie": ["cardio", "coeur", "heart", "ecg", "holter"],
                "neurologie": ["neuro", "brain", "cerveau", "eeg"],
                "orthopedic": ["ortho", "os", "bone", "fracture"],
                "gynecologie": ["gyneco", "utérus", "ovaire", "pelvien"],
                "urologie": ["uro", "vessie", "rein", "prostate"],
                "pneumologie": ["pneumo", "poumon", "thorax", "resp"],
                "gastro": ["gastro", "abdomen", "foie", "intestin"]
            },

            # Anatomical regions
            "anatomie": {
                "tete": ["tete", "crane", "cerebral", "encephale"],
                "thorax": ["thorax", "poumon", "coeur", "mediastin"],
                "abdomen": ["abdomen", "foie", "rate", "pancreas"],
                "pelvis": ["pelvis", "pelvien", "utérus", "ovaire", "vessie"],
                "membres": ["membre", "bras", "jambe", "genou", "epaule"],
                "rachis": ["rachis", "colonne", "vertebral", "lombaire"]
            },

            # Procedure types
            "procedures": {
                "arteriel": ["arteriel", "artere", "vasculaire"],
                "veineux": ["veineux", "veine", "phlebo"],
                "fonctionnel": ["fonctionnel", "dynamique", "stress"],
                "contraste": ["contraste", "injection", "gadolinium"]
            },

            # Medical centers
            "centres": {
                "roseraie": ["roseraie", "rose"],
                "4villes": ["4villes", "quatre"],
                "mstruk": ["mstruk", "struktur"],
                "radioroseraie": ["radioroseraie"]
            }
        }

    def _initialize_gpt(self):
        """Initialize GPT for content analysis, with improved error handling"""
        if not HAS_LANGCHAIN:
            logging.warning("LangChain non disponible. Utilisation du mode fallback.")
            return

        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            logging.warning("OPENAI_API_KEY non définie. L'analyse GPT ne sera pas disponible.")
            return

        try:
            self.llm = ChatOpenAI(
                model=GPT_MODEL,
                temperature=0,
                max_tokens=4000,
                api_key=api_key
            )

            # Simplified prompts to avoid potential issues
            content_prompt = ChatPromptTemplate.from_messages([
                ("system", "Analyze this medical transcription and return a JSON with document_type, sections, and medical_data."),
                ("human", "Analyze: {transcription}")
            ])

            self.content_analyzer = content_prompt | self.llm
            logging.info("✅ GPT initialisé")

        except Exception as e:
            logging.error(f"❌ Erreur lors de l'initialisation GPT: {e}")
            self.llm = None

    def analyze_filename(self, filename: str) -> FilenameAnalysis:
        """Analyze the filename to extract medical information - fallback mode only"""
        return self._analyze_filename_fallback(filename)

    def _analyze_filename_fallback(self, filename: str) -> FilenameAnalysis:
        """Fallback filename analysis without GPT"""
        clean_filename = os.path.basename(filename).lower()
        clean_filename = clean_filename.replace('.docx', '').replace('.doc', '').replace('.rtf', '')

        medical_keywords = []
        document_type_indicators = []
        specialty_indicators = []
        center_indicators = []
        anatomical_regions = []
        procedure_type = None

        # Look up keywords by category
        for category, subcategories in self.filename_keywords.items():
            for subcat, keywords in subcategories.items():
                for keyword in keywords:
                    if keyword in clean_filename:
                        if category == "imagerie":
                            document_type_indicators.append(subcat)
                            if subcat in ["echographie", "irm", "scanner"]:
                                procedure_type = subcat
                        elif category == "specialites":
                            specialty_indicators.append(subcat)
                        elif category == "anatomie":
                            anatomical_regions.append(subcat)
                        elif category == "centres":
                            center_indicators.append(subcat)
                        medical_keywords.append(keyword)

        # Compute a confidence score
        total_elements = len(medical_keywords) + len(document_type_indicators) + len(specialty_indicators)
        confidence_score = min(1.0, total_elements / 5.0)

        return FilenameAnalysis(
            original_filename=filename,
            medical_keywords=medical_keywords,
            document_type_indicators=document_type_indicators,
            specialty_indicators=specialty_indicators,
            center_indicators=center_indicators,
            anatomical_regions=anatomical_regions,
            procedure_type=procedure_type,
            confidence_score=confidence_score
        )
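To make the keyword-based fallback concrete, a small sketch; the filename is hypothetical, and the constructor tolerates a missing database (it only logs a warning):

# Hypothetical check of the keyword-based filename analysis.
matcher = TemplateMatcher()
info = matcher.analyze_filename("irm.pelvien.20250903.docx")
print(info.document_type_indicators)  # ['irm']
print(info.anatomical_regions)        # ['pelvis']
print(info.procedure_type)            # irm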
    def load_database(self, filepath: str):
        """Load the vector database, with error handling"""
        try:
            if not hasattr(self, 'parser') or self.parser is None:
                self.parser = MedicalTemplateParser()
            self.parser.load_database(filepath)
            logging.info(f"✅ Base de données chargée: {len(self.parser.templates)} templates")
        except Exception as e:
            logging.error(f"Erreur lors du chargement de la base: {e}")
            raise

    def analyze_transcription_detailed(self, transcription: str, transcription_filename: str = "") -> Dict:
        """Simplified analysis without GPT, to avoid errors"""
        return self._fallback_analysis(transcription, transcription_filename)

    def _fallback_analysis(self, transcription: str, transcription_filename: str = "") -> Dict:
        """Improved fallback analysis without GPT"""
        text_lower = transcription.lower()

        # Detect the document type
        document_types = {
            "compte_rendu_imagerie": ["irm", "scanner", "échographie", "radiologie", "t1", "t2", "doppler", "technique", "plans"],
            "rapport_biologique": ["laboratoire", "analyse", "biologie", "sang", "urine", "sérum"],
            "lettre_medicale": ["lettre", "courrier", "correspondance", "cher confrère"],
            "compte_rendu_consultation": ["consultation", "examen clinique", "patient", "antécédents"]
        }

        detected_type = "compte_rendu_imagerie"  # Default for this example

        # Check the filename first
        if transcription_filename:
            filename_lower = transcription_filename.lower()
            for doc_type, keywords in document_types.items():
                if any(kw in filename_lower for kw in keywords):
                    detected_type = doc_type
                    break

        # Then check the content
        for doc_type, keywords in document_types.items():
            if sum(1 for kw in keywords if kw in text_lower) >= 2:
                detected_type = doc_type
                break

        # Extract the sections with an improved regex for markdown formatting
        sections = {}

        # Pattern for sections marked up with **
        markdown_sections = re.findall(r'\*\*(.*?)\s*:\s*\*\*(.*?)(?=\*\*|\Z)', transcription, re.DOTALL | re.IGNORECASE)

        for section_title, section_content in markdown_sections:
            section_title_clean = section_title.strip().lower()
            section_content_clean = section_content.strip()

            # Map section titles to standardized names
            section_mapping = {
                "technique": ["technique", "méthode", "protocole", "acquisition"],
                "résultats": ["résultat", "résultats", "observation", "constatation", "analyse", "description"],
                "conclusion": ["conclusion", "diagnostic", "synthèse", "impression", "avis"],
                "indication": ["indication", "motif", "demande", "contexte"],
                "histoire": ["histoire", "antécédent", "contexte", "clinique"]
            }

            # Find the matching category
            mapped_section = None
            for standard_name, variations in section_mapping.items():
                if any(var in section_title_clean for var in variations):
                    mapped_section = standard_name
                    break

            # Use the standardized name, or the original title
            final_section_name = mapped_section if mapped_section else section_title_clean

            if section_content_clean:
                sections[final_section_name] = {
                    "content": section_content_clean,
                    "confidence": 0.8,
                    "keywords": [section_title_clean]
                }

        # If no markdown section was found, try other patterns
        if not sections:
            # Look for more general patterns
            text_lines = transcription.split('\n')
            current_section = None
            current_content = []

            for line in text_lines:
                line_stripped = line.strip()
                if not line_stripped:
                    continue

                # Check whether this line is a section title (contains section keywords)
                line_lower = line_stripped.lower()
                is_section_title = False

                for section_name, keywords in [
                    ("technique", ["technique", "méthode", "protocole"]),
                    ("résultats", ["résultat", "observation", "constatation"]),
                    ("conclusion", ["conclusion", "diagnostic", "synthèse"])
                ]:
                    if any(kw in line_lower for kw in keywords) and len(line_stripped) < 50:
                        # Save the previous section
                        if current_section and current_content:
                            sections[current_section] = {
                                "content": '\n'.join(current_content),
                                "confidence": 0.7,
                                "keywords": [current_section]
                            }

                        current_section = section_name
                        current_content = []
                        is_section_title = True
                        break

                if not is_section_title and current_section:
                    current_content.append(line_stripped)

            # Save the last section
            if current_section and current_content:
                sections[current_section] = {
                    "content": '\n'.join(current_content),
                    "confidence": 0.7,
                    "keywords": [current_section]
                }

        analysis = {
            "document_type": detected_type,
            "identification": {
                "physician": "Non identifié",
                "center": "Non identifié",
                "service": "Non identifié"
            },
            "sections": sections,
            "medical_data": {
                "procedures": ["IRM pelvienne", "T1 Dixon", "T2"],
                "measurements": re.findall(r'\d+\s*(?:mm|cm|ml)', transcription),
                "diagnoses": ["endométriome ovarien"],
                "treatments": [],
                "anatomical_regions": ["utérus", "ovaire", "pelvis"]
            },
            "completeness": {
                "score": 0.8,
                "transcription_quality": "good"
            }
        }

        # Add the filename analysis
        if transcription_filename:
            filename_analysis = self.analyze_filename(transcription_filename)
            analysis["filename_analysis"] = {
                "medical_keywords": filename_analysis.medical_keywords,
                "document_type_indicators": filename_analysis.document_type_indicators,
                "specialty_indicators": filename_analysis.specialty_indicators,
                "anatomical_regions": filename_analysis.anatomical_regions,
                "procedure_type": filename_analysis.procedure_type
            }

        return analysis
    def calculate_filename_match_score(self, transcription_filename: str, transcription_analysis: Dict,
                                       template_filename: str) -> Tuple[float, List[str]]:
        """Computes the match score based on the file names."""

        trans_filename_analysis = self.analyze_filename(transcription_filename)
        template_filename_analysis = self.analyze_filename(template_filename)

        score_components = []
        matching_indicators = []

        # Document-type overlap
        trans_types = set(trans_filename_analysis.document_type_indicators)
        template_types = set(template_filename_analysis.document_type_indicators)

        if trans_types & template_types:
            type_match_score = len(trans_types & template_types) / max(len(trans_types | template_types), 1)
            score_components.append(type_match_score * 0.4)
            matching_indicators.extend(list(trans_types & template_types))

        # Specialty overlap
        trans_specialties = set(trans_filename_analysis.specialty_indicators)
        template_specialties = set(template_filename_analysis.specialty_indicators)

        if trans_specialties & template_specialties:
            specialty_match_score = len(trans_specialties & template_specialties) / max(len(trans_specialties | template_specialties), 1)
            score_components.append(specialty_match_score * 0.25)
            matching_indicators.extend(list(trans_specialties & template_specialties))

        final_score = sum(score_components) if score_components else 0.0
        return min(1.0, final_score), matching_indicators

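    # A quick, self-contained sanity check of the Jaccard-style component used
    # above (hypothetical indicator sets, illustration only; 0.4 is the
    # document-type weight from calculate_filename_match_score):
    #
    #   >>> trans_types = {"irm", "radiologie"}
    #   >>> template_types = {"irm", "scanner"}
    #   >>> overlap = len(trans_types & template_types) / max(len(trans_types | template_types), 1)
    #   >>> round(overlap * 0.4, 3)
    #   0.133
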
    def calculate_basic_scores(self, transcription_analysis: Dict, template_info: TemplateInfo) -> Tuple[float, float, float]:
        """Computes the base scores without relying on the problematic helper functions."""

        # Simplified type score
        transcription_type = transcription_analysis.get("document_type", "")
        template_type = template_info.type.lower()

        type_mappings = {
            "compte_rendu_imagerie": ["irm", "scanner", "échographie", "imagerie", "radiologie"],
            "rapport_biologique": ["laboratoire", "biologie", "analyse"],
            "lettre_medicale": ["lettre", "courrier", "correspondance"],
            "compte_rendu_consultation": ["consultation", "examen"]
        }

        type_score = 0.3  # Default score
        if transcription_type in type_mappings:
            expected_keywords = type_mappings[transcription_type]
            matches = sum(1 for kw in expected_keywords if kw in template_type)
            type_score = min(1.0, matches / len(expected_keywords) * 2)

        # Simplified scores for physician and center
        physician_score = 0.5  # Neutral by default
        center_score = 0.5  # Neutral by default

        return type_score, physician_score, center_score

    def calculate_simple_section_matches(self, transcription: str, transcription_analysis: Dict, template_info: TemplateInfo) -> Dict[str, SectionMatch]:
        """Improved version of the section matching."""
        section_matches = {}
        transcription_sections = transcription_analysis.get("sections", {})

        # Common section patterns in medical transcriptions
        section_mapping = {
            "technique": ["technique", "méthode", "protocole", "acquisition"],
            "résultats": ["résultat", "observation", "constatation", "description", "analyse"],
            "conclusion": ["conclusion", "diagnostic", "synthèse", "impression"],
            "indication": ["indication", "motif", "demande"],
            "histoire": ["histoire", "antécédent", "contexte", "clinique"],
            "examen": ["examen", "exploration", "investigation"]
        }

        for section_name in template_info.detected_sections:
            section_lower = section_name.lower()
            best_content = ""
            best_confidence = 0.0

            # 1. Look first in the structured sections of the transcription
            for analyzed_section, section_data in transcription_sections.items():
                if isinstance(section_data, dict):
                    content = section_data.get("content", "")
                    confidence = section_data.get("confidence", 0.0)

                    # Direct match
                    if section_lower in analyzed_section.lower() or analyzed_section.lower() in section_lower:
                        best_content = content
                        best_confidence = confidence
                        break

                    # Match through the mapping table
                    if section_lower in section_mapping:
                        expected_keywords = section_mapping[section_lower]
                        if any(kw in analyzed_section.lower() for kw in expected_keywords):
                            best_content = content
                            best_confidence = confidence * 0.9  # Slight penalty for an indirect match
                            break

            # 2. If nothing was found, search the full text with patterns
            if not best_content:
                # Look for markdown/formatting markers
                markdown_patterns = [
                    rf"\*\*{section_lower}[:\s]*\*\*(.*?)(?=\*\*|\n\n|$)",
                    rf"{section_lower}[:\s]+(.*?)(?=\n\*\*|\n\n|$)",
                    rf"#{section_lower}[:\s]+(.*?)(?=\n#|\n\n|$)"
                ]

                for pattern in markdown_patterns:
                    matches = re.findall(pattern, transcription, re.IGNORECASE | re.DOTALL)
                    if matches:
                        best_content = matches[0].strip()
                        best_confidence = 0.8
                        break

                # Still nothing: search by section keywords
                if not best_content and section_lower in section_mapping:
                    keywords = section_mapping[section_lower]
                    for keyword in keywords:
                        if keyword in transcription.lower():
                            # Extract some context around the keyword
                            start_pos = transcription.lower().find(keyword)
                            start = max(0, start_pos - 50)
                            end = min(len(transcription), start_pos + 400)
                            best_content = transcription[start:end].strip()
                            best_confidence = 0.6
                            break

            # 3. Assess whether the section can be filled
            can_fill = bool(best_content) and len(best_content.strip()) > 20
            missing_info = [] if can_fill else [f"Contenu manquant pour {section_name}"]

            section_matches[section_name] = SectionMatch(
                section_name=section_name,
                confidence=best_confidence,
                extracted_content=best_content,
                can_fill=can_fill,
                missing_info=missing_info
            )

        return section_matches

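    # The markdown pattern used here and in analyze_transcription_detailed can
    # be exercised in isolation; a minimal sketch on a made-up snippet:
    #
    #   >>> import re
    #   >>> snippet = "**Technique :** 3 plans T2. **Conclusion :** Examen normal."
    #   >>> re.findall(r'\*\*(.*?)\s*:\s*\*\*(.*?)(?=\*\*|\Z)', snippet, re.DOTALL | re.IGNORECASE)
    #   [('Technique', ' 3 plans T2. '), ('Conclusion', ' Examen normal.')]
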
    def calculate_fillability_score(self, section_matches: Dict[str, SectionMatch], template_info: TemplateInfo) -> Tuple[float, float, List[str]]:
        """Computes how much of the template can be filled - corrected version."""
        total_sections = len(template_info.detected_sections)
        fillable_sections = sum(1 for match in section_matches.values() if match.can_fill)

        if total_sections == 0:
            return 0.0, 0.0, ["Template sans sections"]

        # Fillability score: fraction of sections that can be filled
        fillability_score = fillable_sections / total_sections

        # Actual filling percentage
        filling_percentage = (fillable_sections / total_sections) * 100

        # Missing critical sections
        missing_critical = [
            match.section_name for match in section_matches.values()
            if not match.can_fill
        ]

        return fillability_score, filling_percentage, missing_critical

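    # Worked example with hypothetical counts: a template with 4 detected
    # sections of which 3 are fillable yields fillability_score = 3/4 = 0.75
    # and filling_percentage = 75.0, with the remaining section listed in
    # missing_critical.
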
    def match_templates(self, transcription: str, transcription_filename: str = "", k: int = 3) -> List[TemplateMatch]:
        """
        Main entry point: runs the matching and returns the k best-scoring templates.

        Args:
            transcription: The content of the medical transcription
            transcription_filename: The name of the transcription file
            k: Number of results to return (default: 3)

        Returns:
            List[TemplateMatch]: The k templates with the highest scores
        """
        if not self.parser or not self.parser.templates:
            logging.error("Aucun template chargé")
            return []

        logging.info(f"🔍 Début du matching pour: {transcription_filename}")
        logging.info(f"📄 Contenu de la transcription: {len(transcription.split())} mots")

        # Analyze the transcription
        analysis = self.analyze_transcription_detailed(transcription, transcription_filename)
        logging.info(f"📊 Type de document détecté: {analysis.get('document_type')}")
        logging.info(f"🔧 Sections détectées: {list(analysis.get('sections', {}).keys())}")

        template_matches = []

        for template_id, template_info in self.parser.templates.items():
            try:
                # Compute the base scores
                type_score, physician_score, center_score = self.calculate_basic_scores(analysis, template_info)

                # Filename score
                filename_score, filename_indicators = self.calculate_filename_match_score(
                    transcription_filename, analysis, template_info.filepath
                )

                # Analyze the sections with the improved matcher
                section_matches = self.calculate_simple_section_matches(transcription, analysis, template_info)

                # Corrected fillability score
                fillability_score, filling_percentage, missing_critical = self.calculate_fillability_score(section_matches, template_info)

                # Simplified content score
                content_score = 0.5

                # Overall score with improved weighting
                overall_score = (
                    type_score * 0.25 +
                    fillability_score * 0.35 +  # More weight on fillability
                    filename_score * 0.25 +
                    content_score * 0.1 +
                    physician_score * 0.025 +
                    center_score * 0.025
                )

                # Bonus for templates with several fillable sections
                if len([s for s in section_matches.values() if s.can_fill]) >= 2:
                    overall_score += 0.1

                confidence_level = "excellent" if overall_score > 0.7 else "good" if overall_score > 0.5 else "fair" if overall_score > 0.3 else "poor"

                # Extracted data (only the sections that have content)
                extracted_data = {}
                for section_name, match in section_matches.items():
                    if match.can_fill and match.extracted_content.strip():
                        extracted_data[section_name] = match.extracted_content

                # A template can be filled if it has at least one section with content
                can_be_filled = len(extracted_data) > 0 or fillability_score > 0.3

                template_match = TemplateMatch(
                    template_id=template_id,
                    template_info=template_info,
                    overall_score=overall_score,
                    type_match_score=type_score,
                    physician_match_score=physician_score,
                    center_match_score=center_score,
                    content_match_score=content_score,
                    filename_match_score=filename_score,
                    fillability_score=fillability_score,
                    section_matches=section_matches,
                    confidence_level=confidence_level,
                    can_be_filled=can_be_filled,
                    filling_percentage=filling_percentage,
                    missing_critical_info=missing_critical,
                    extracted_data=extracted_data,
                    filename_indicators=filename_indicators
                )

                template_matches.append(template_match)

            except Exception as e:
                logging.warning(f"Erreur lors de l'analyse du template {template_id}: {e}")
                continue

        # Sort by overall score and keep the k best
        template_matches.sort(key=lambda x: x.overall_score, reverse=True)
        top_matches = template_matches[:k]

        # Log the results
        logging.info(f"✅ Matching terminé - {len(top_matches)} templates sélectionnés")
        for i, match in enumerate(top_matches, 1):
            logging.info(f"🏆 Template #{i}: {match.template_id}")
            logging.info(f"   📊 Score global: {match.overall_score:.3f}")
            logging.info(f"   📋 Sections remplissables: {len(match.extracted_data)}")
            logging.info(f"   🎯 Niveau de confiance: {match.confidence_level}")
            logging.info(f"   📁 Template: {os.path.basename(match.template_info.filepath)}")

        return top_matches

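    # The weights above sum to 1.0 (0.25 + 0.35 + 0.25 + 0.1 + 0.025 + 0.025),
    # so the +0.1 bonus can push overall_score slightly past 1.0. With
    # hypothetical component scores of 0.8, 0.9, 0.7, 0.5, 0.5, 0.5 and the
    # bonus applied (illustration only):
    #
    #   >>> round(0.8*0.25 + 0.9*0.35 + 0.7*0.25 + 0.5*0.1 + 0.5*0.025 + 0.5*0.025 + 0.1, 3)
    #   0.865
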
    def print_matching_results(self, matches: List[TemplateMatch]):
        """Prints the matching results in detail."""
        if not matches:
            print("❌ Aucun résultat trouvé")
            return

        print(f"\n{'='*80}")
        print(f"🎯 RÉSULTATS DE MATCHING - Top {len(matches)} templates")
        print(f"{'='*80}")

        for i, match in enumerate(matches, 1):
            print(f"\n🏆 TEMPLATE #{i}")
            print(f"   🆔 ID: {match.template_id}")
            print(f"   📊 Score global: {match.overall_score:.3f}")
            print(f"   📁 Fichier: {os.path.basename(match.template_info.filepath)}")
            print(f"   👨⚕️ Médecin: {match.template_info.medecin}")
            print(f"   🏥 Centre: {getattr(match.template_info, 'centre_medical', 'Non spécifié')}")
            print(f"   📝 Type: {match.template_info.type}")
            print(f"   🔧 Remplissage possible: {match.filling_percentage:.1f}%")
            print(f"   🎯 Niveau de confiance: {match.confidence_level}")

            print(f"   📈 Détail des scores:")
            print(f"      - Type: {match.type_match_score:.3f}")
            print(f"      - Remplissabilité: {match.fillability_score:.3f}")
            print(f"      - Nom de fichier: {match.filename_match_score:.3f}")
            print(f"      - Contenu: {match.content_match_score:.3f}")

            if match.filename_indicators:
                print(f"   🏷️ Indicateurs fichier: {', '.join(match.filename_indicators)}")

            if match.extracted_data:
                print(f"   📋 Sections extraites ({len(match.extracted_data)}):")
                for section_name, content in match.extracted_data.items():
                    preview = content[:100] + "..." if len(content) > 100 else content
                    print(f"      • {section_name}: {preview}")

            if match.missing_critical_info:
                print(f"   ⚠️ Sections manquantes: {', '.join(match.missing_critical_info)}")


def main():
    """Main function used to test the matching."""

    # Logging configuration
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # Sample transcription
    transcription_filename = "default.73.931915433.rtf_3650535_radiologie.doc"
    transcription_content = """**Technique :** 3 plans T2, diffusion axiale, T2 grand champ et T1 Dixon.
**Résultats :**
* L'utérus est antéversé, antéfléchi, latéralisé à droite, de taille normale pour l'âge.
* L'endomètre est fin, mesurant moins de 2 mm.
* Pas d'adénomyose franche.
* Aspect normal du col utérin et du vagin.
* L'ovaire droit, en position postérieure, mesure 18 x 11 mm avec présence de 4 follicules.
* L'ovaire gauche, en position latéro-utérine, présente un volumineux endométriome de 45 mm, typique en hypersignal T1 Dixon.
* Deuxième endométriome accolé à l'ovaire droit, périphérique, mesurant 13 mm.
* Pas d'épaississement marqué du torus ni des ligaments utéro-sacrés.
* Pas d'autre localisation pelvienne.
* Pas d'épanchement pelvien.
* Pas d'anomalie de la vessie.
* Pas d'adénomégalie pelvienne, pas de dilatation des uretères.
**Conclusion :**
* Endométriome ovarien droit périphérique de 13 mm.
* Endométriome ovarien gauche centro-ovarien de 45 mm."""

    # Path to the database
    db_path = DB_PATH
    if not os.path.exists(db_path):
        print(f"❌ Base de données non trouvée: {db_path}")
        return

    try:
        # Initialize the matcher
        matcher = TemplateMatcher(db_path)

        # Run the matching
        matches = matcher.match_templates(transcription_content, transcription_filename, k=3)

        # Print the results
        matcher.print_matching_results(matches)

        # Return the results for use by the second file
        return matches

    except Exception as e:
        logging.error(f"❌ Erreur: {e}")
        return []

if __name__ == "__main__":
    main()
medical_template3_mapper.py
ADDED
@@ -0,0 +1,714 @@
import re
import logging
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
from enum import Enum

logger = logging.getLogger(__name__)

class FieldType(Enum):
    """Field types in the template."""
    CHECKBOX = "checkbox"        # &x checkboxes
    TEXT = "text"                # &x free text
    MEASUREMENT = "measurement"  # &x numeric values

@dataclass
class TemplateField:
    """Definition of a template field."""
    placeholder: str             # &x marker in the template
    field_type: FieldType
    source_field: str            # Matching field in ExtractedData
    default_value: str = ""
    validation_pattern: Optional[str] = None
    transformation_func: Optional[callable] = None
    context_identifier: Optional[str] = None  # To tell left/right apart

@dataclass
class MappingResult:
    """Result of the mapping."""
    filled_template: str
    mapped_fields: Dict[str, str]
    unmapped_placeholders: List[str]
    mapping_confidence: float
    errors: List[str]

class MedicalTemplateMapper:
    """Engine that maps the extracted data onto the medical template."""

    def __init__(self):
        self.template = self._load_template()
        self.field_mappings = self._define_field_mappings()
        self.checkbox_logic = self._define_checkbox_logic()

    def _load_template(self) -> str:
        """Base medical template with &x placeholders."""
        return """BILAN

L'utérus est &x antéversé, &x rétroversé, &x intermédiaire, &x rétrofléchi, &x antéfléchi, &x fixe de taille normale (&x x &x x &x cm).
Hystérométrie : distance orifice externe du col - fond de la cavité utérine : &x mm.
L'endomètre : mesuré à &x mm.
Myometre : pas de myome.
Zone jonctionnelle : Atteinte de la zone de jonction : &x non &x oui
Adénomyose associée : &x non &x oui : &x diffuse &x focale &x interne &x externe
Col utérin: pas de kyste de Naboth. Absence de pathologies échographiquement décelable à son niveau.
Cavité utérine en 3D: morphologie triangulaire.

&xKISSING OVARIES
L'ovaire droit mesure &x x &x mm, &x est de dimensions supérieures à la normale il mesure &x x &x mm, &xfolliculaire CFA &x follicules: (&x mm). &x Absence d'endométriome. &x Présence d'une formation kystique hypoéchogène, uniloculaire, non vascularisé, à contenu ground glass mesurée à &x mm d'allure endométriome.
Accessibilité : &x rétro-utérin &x fixe &x aisée.
L'ovaire gauche mesure &x x &x mm, &x est de dimensions supérieures à la normale il mesure &x x &x mm, &x folliculaire CFA &x follicules: (&x mm). &x Absence d'endométriome. &x Présence d'une formation kystique hypoéchogène, uniloculaire, non vascularisé, à contenu ground glass mesurée à &x mm d'allure endométriome.
Accessibilité : &x rétro-utérin &x fixe &x aisée.
&x Présence de micro-calcifications sous thécales &x bilatérales &x droites &x gauches pouvant témoigner d'implants endométriosiques superficiels.
L'échostructure des deux ovaires apparait normale, avec une vascularisation artério-veineuse normale au Doppler, sans formation ou image kystique pathologique échographiquement décelable à leur niveau.

Cavité péritonéale
&x- Pas d'épanchement liquidien dans le cul du sac du Douglas. Pas de douleur à l'écho-palpation.
&x- Faible épanchement corpusculé dans le cul du sac du Douglas qui silhouette des adhérences (soft marqueur d'endométriose?). Pas de douleur à l'écho-palpation.
- &xVessie vide pendant l'examen. &x Vessie en semi-réplétion pendant l'examen.
- &x Absence de dilatation pyélo-calicielle.
- Artère utérine : IP : &x - IR : 0,&x - Spectre : type 2 avec notch protodiastolique.
- Pas d'image d'hydrosalpinx visible à ce jour.

RECHERCHE ENDOMETRIOSE PELVIENNE

A-Compartiment antérieur (vessie en semi-réplétion)
- Signe du glissement (sliding) : &xprésent &xdiminué &xabsent
- Présence d'un nodule : &xnon &xoui
- Uretères dans la partie pelvienne vus non dilatés.


B-Compartiment postérieur
- Signe du glissement (sliding) :
- Espace recto-vaginal : &xprésent &xdiminué &xabsent
- Plan sus-péritonéal : &xprésent &xdiminué &xabsent
- Aspect du torus : &x normal &x épaissi
- Aspect des ligaments utéro-sacrés :
- Ligament utéro- sacré droit : &x normal &x épaissi
- Ligament utéro-sacré gauche : &x normal &x épaissi
- Présence d'un nodule hypoéchogène : &x non
- Infiltration digestive: &x non &x oui : &x bas rectum &x moyen rectum &x haut rectum &x jonction recto-sigmoïde

Conclusions
Utérus de taille et de morphologie normales.
Endomètre mesuré à &x mm.
CFA : &x+&x follicules.
Ovaires sans formation ou image kystique pathologique échographiquement décelable à leur niveau.
&x Absence d'image d'endométriose visible ce jour, à confronter éventuellement à une IRM.
&x Endométriose &x superficielle &x et profonde.
Absence d'anomalie échographiquement décelable au niveau des trompes.
--> L'ensemble de ces aspects reste à confronter au contexte clinico-thérapeutique.
"""

    def _define_field_mappings(self) -> Dict[str, TemplateField]:
        """Defines the mappings between the extracted data and the template placeholders."""
        return {
            # Uterus position - checkboxes
            "uterus_position_antéversé": TemplateField(
                placeholder="&x antéversé",
                field_type=FieldType.CHECKBOX,
                source_field="uterus_position",
                transformation_func=lambda x: "X" if x and "antéversé" in x.lower() else ""
            ),
            "uterus_position_rétroversé": TemplateField(
                placeholder="&x rétroversé",
                field_type=FieldType.CHECKBOX,
                source_field="uterus_position",
                transformation_func=lambda x: "X" if x and "rétroversé" in x.lower() else ""
            ),
            "uterus_position_intermédiaire": TemplateField(
                placeholder="&x intermédiaire",
                field_type=FieldType.CHECKBOX,
                source_field="uterus_position",
                transformation_func=lambda x: "X" if x and "intermédiaire" in x.lower() else ""
            ),
            "uterus_position_rétrofléchi": TemplateField(
                placeholder="&x rétrofléchi",
                field_type=FieldType.CHECKBOX,
                source_field="uterus_position",
                transformation_func=lambda x: "X" if x and "rétrofléchi" in x.lower() else ""
            ),
            "uterus_position_antéfléchi": TemplateField(
                placeholder="&x antéfléchi",
                field_type=FieldType.CHECKBOX,
                source_field="uterus_position",
                transformation_func=lambda x: "X" if x and "antéfléchi" in x.lower() else ""
            ),
            "uterus_position_fixe": TemplateField(
                placeholder="&x fixe",
                field_type=FieldType.CHECKBOX,
                source_field="uterus_position",
                transformation_func=lambda x: "X" if x and "fixe" in x.lower() else ""
            ),

            # Uterus size - dimensions (corrected)
            "uterus_size_length": TemplateField(
                placeholder="normale (&x x",
                field_type=FieldType.MEASUREMENT,
                source_field="uterus_size",
                transformation_func=self._extract_first_dimension
            ),
            "uterus_size_width": TemplateField(
                placeholder="x &x x",
                field_type=FieldType.MEASUREMENT,
                source_field="uterus_size",
                transformation_func=self._extract_second_dimension
            ),
            "uterus_size_height": TemplateField(
                placeholder="x &x cm)",
                field_type=FieldType.MEASUREMENT,
                source_field="uterus_size",
                transformation_func=self._extract_third_dimension
            ),

            # Hysterometry
            "hysterometry_value": TemplateField(
                placeholder="fond de la cavité utérine : &x mm",
                field_type=FieldType.MEASUREMENT,
                source_field="hysterometry",
                transformation_func=self._clean_numeric_value
            ),

            # Endometrium
            "endometrium_thickness": TemplateField(
                placeholder="L'endomètre : mesuré à &x mm",
                field_type=FieldType.MEASUREMENT,
                source_field="endometrium_thickness",
                transformation_func=self._clean_numeric_value
            ),

            # Junctional zone
            "junctional_zone_non": TemplateField(
                placeholder="Atteinte de la zone de jonction : &x non",
                field_type=FieldType.CHECKBOX,
                source_field="junctional_zone_status",
                transformation_func=lambda x: "X" if not x or x.lower() in ["normale", "normal"] else ""
            ),
            "junctional_zone_oui": TemplateField(
                placeholder="&x oui",
                field_type=FieldType.CHECKBOX,
                source_field="junctional_zone_status",
                transformation_func=lambda x: "X" if x and x.lower() in ["épaissie", "épaisse", "atteinte"] else ""
            ),

            # Adenomyosis - checkboxes
            "adenomyosis_non": TemplateField(
                placeholder="Adénomyose associée : &x non",
                field_type=FieldType.CHECKBOX,
                source_field="adenomyosis_type",
                transformation_func=lambda x: "X" if not x or x.lower() in ["absente", "non"] else ""
            ),
            "adenomyosis_oui": TemplateField(
                placeholder="&x oui :",
                field_type=FieldType.CHECKBOX,
                source_field="adenomyosis_type",
                transformation_func=lambda x: "X" if x and x.lower() in ["diffuse", "focale"] else ""
            ),
            "adenomyosis_diffuse": TemplateField(
                placeholder="&x diffuse",
                field_type=FieldType.CHECKBOX,
                source_field="adenomyosis_type",
                transformation_func=lambda x: "X" if x and "diffuse" in x.lower() else ""
            ),
            "adenomyosis_focale": TemplateField(
                placeholder="&x focale",
                field_type=FieldType.CHECKBOX,
                source_field="adenomyosis_type",
                transformation_func=lambda x: "X" if x and "focale" in x.lower() else ""
            ),

            # Right ovary - dimensions (corrected, with context)
            "right_ovary_length": TemplateField(
                placeholder="L'ovaire droit mesure &x",
                field_type=FieldType.MEASUREMENT,
                source_field="right_ovary_dimensions",
                context_identifier="ovaire droit",
                transformation_func=self._extract_first_dimension
            ),
            "right_ovary_width_first": TemplateField(
                placeholder="x &x mm,",
                field_type=FieldType.MEASUREMENT,
                source_field="right_ovary_dimensions",
                context_identifier="ovaire droit mesure",
                transformation_func=self._extract_second_dimension
            ),

            # Right ovary - CFA
            "right_ovary_cfa": TemplateField(
                placeholder="folliculaire CFA &x follicules:",
                field_type=FieldType.MEASUREMENT,
                source_field="right_ovary_cfa",
                context_identifier="ovaire droit",
                transformation_func=self._clean_cfa_value
            ),

            # Right ovary - accessibility
            "right_ovary_access_retro": TemplateField(
                placeholder="Accessibilité : &x rétro-utérin",
                field_type=FieldType.CHECKBOX,
                source_field="right_ovary_accessibility",
                context_identifier="ovaire droit",
                transformation_func=lambda x: "X" if x and "rétro" in x.lower() else ""
            ),
            "right_ovary_access_fixe": TemplateField(
                placeholder="rétro-utérin &x fixe",
                field_type=FieldType.CHECKBOX,
                source_field="right_ovary_accessibility",
                context_identifier="ovaire droit",
                transformation_func=lambda x: "X" if x and "fixe" in x.lower() else ""
            ),
            "right_ovary_access_aisee": TemplateField(
                placeholder="fixe &x aisée",
                field_type=FieldType.CHECKBOX,
                source_field="right_ovary_accessibility",
                context_identifier="ovaire droit",
                transformation_func=lambda x: "X" if x and ("aisée" in x.lower() or "normale" in x.lower()) else ""
            ),

            # Left ovary - dimensions (corrected, with context)
            "left_ovary_length": TemplateField(
                placeholder="L'ovaire gauche mesure &x x",
                field_type=FieldType.MEASUREMENT,
                source_field="left_ovary_dimensions",
                context_identifier="ovaire gauche",
                transformation_func=self._extract_first_dimension
            ),
            "left_ovary_width_first": TemplateField(
                placeholder="&x mm,",
                field_type=FieldType.MEASUREMENT,
                source_field="left_ovary_dimensions",
                context_identifier="ovaire gauche mesure",
                transformation_func=self._extract_second_dimension
            ),

            # Left ovary - CFA
            "left_ovary_cfa": TemplateField(
                placeholder="folliculaire CFA &x follicules:",
                field_type=FieldType.MEASUREMENT,
                source_field="left_ovary_cfa",
                context_identifier="ovaire gauche",
                transformation_func=self._clean_cfa_value
            ),

            # Left ovary - accessibility
            "left_ovary_access_retro": TemplateField(
                placeholder="Accessibilité : &x rétro-utérin",
                field_type=FieldType.CHECKBOX,
                source_field="left_ovary_accessibility",
                context_identifier="ovaire gauche",
                transformation_func=lambda x: "X" if x and "rétro" in x.lower() else ""
            ),
            "left_ovary_access_fixe": TemplateField(
                placeholder="rétro-utérin &x fixe",
                field_type=FieldType.CHECKBOX,
                source_field="left_ovary_accessibility",
                context_identifier="ovaire gauche",
                transformation_func=lambda x: "X" if x and "fixe" in x.lower() else ""
            ),
            "left_ovary_access_aisee": TemplateField(
                placeholder="fixe &x aisée",
                field_type=FieldType.CHECKBOX,
                source_field="left_ovary_accessibility",
                context_identifier="ovaire gauche",
                transformation_func=lambda x: "X" if x and ("aisée" in x.lower() or "normale" in x.lower()) else ""
            ),

            # Doppler
            "doppler_ip": TemplateField(
                placeholder="IP : &x",
                field_type=FieldType.MEASUREMENT,
                source_field="doppler_ip",
                transformation_func=self._clean_numeric_value
            ),
            "doppler_ir": TemplateField(
                placeholder="IR : 0,&x",
                field_type=FieldType.MEASUREMENT,
                source_field="doppler_ir",
                transformation_func=self._format_doppler_ir
            ),

            # Conclusions - total CFA
            "conclusion_cfa_right": TemplateField(
                placeholder="CFA : &x+",
                field_type=FieldType.MEASUREMENT,
                source_field="right_ovary_cfa",
                transformation_func=self._clean_cfa_value
            ),
            "conclusion_cfa_left": TemplateField(
                placeholder="+&x follicules",
                field_type=FieldType.MEASUREMENT,
                source_field="left_ovary_cfa",
                transformation_func=self._clean_cfa_value
            ),

            # Conclusion - endometrium
            "conclusion_endometrium": TemplateField(
                placeholder="Endomètre mesuré à &x mm",
                field_type=FieldType.MEASUREMENT,
                source_field="endometrium_thickness",
                transformation_func=self._clean_numeric_value
            ),
        }

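    # A minimal sketch of how one checkbox mapping above fires (hypothetical
    # input, illustration only): the lambda returns "X" when the extracted
    # value contains the option's keyword, and "" otherwise.
    #
    #   >>> transform = lambda x: "X" if x and "antéversé" in x.lower() else ""
    #   >>> transform("Utérus antéversé, antéfléchi")
    #   'X'
    #   >>> transform("rétroversé")
    #   ''
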
    def _define_checkbox_logic(self) -> Dict[str, List[str]]:
        """Defines the mutually exclusive checkbox groups."""
        return {
            "uterus_position": ["antéversé", "rétroversé", "intermédiaire", "rétrofléchi", "antéfléchi"],
            "adenomyosis": ["non", "oui"],
            "adenomyosis_type": ["diffuse", "focale", "interne", "externe"],
            "ovary_accessibility": ["rétro-utérin", "fixe", "aisée"]
        }

    def map_extracted_data_to_template(self, extracted_data) -> MappingResult:
        """
        Main mapping entry point: fills the template from the extracted data.
        """
        logger.info("🔄 Début du mapping vers le template médical")

        filled_template = self.template
        mapped_fields = {}
        unmapped_placeholders = []
        errors = []

        # Step 1: Identify every &x placeholder in the template
        all_placeholders = self._find_all_placeholders(filled_template)
        logger.info(f"📍 {len(all_placeholders)} placeholders trouvés dans le template")

        # Step 2: Apply the defined mappings, handling context
        for mapping_key, template_field in self.field_mappings.items():
            try:
                # Fetch the source value
                source_value = getattr(extracted_data, template_field.source_field, None)

                if source_value:
                    # Apply the transformation
                    if template_field.transformation_func:
                        mapped_value = template_field.transformation_func(source_value)
                    else:
                        mapped_value = str(source_value)

                    # Replace in the template, honoring the context
                    if mapped_value and mapped_value.strip():
                        filled_template = self._replace_placeholder_with_context(
                            filled_template, template_field.placeholder, mapped_value, template_field.context_identifier
                        )
                        mapped_fields[mapping_key] = mapped_value
                        logger.debug(f"✅ {mapping_key}: {mapped_value}")
                    else:
                        logger.debug(f"⚠️ {mapping_key}: Valeur vide après transformation")

            except Exception as e:
                error_msg = f"Erreur mapping {mapping_key}: {e}"
                errors.append(error_msg)
                logger.error(error_msg)

        # Step 3: Handle the placeholders that were not mapped
        remaining_placeholders = self._find_all_placeholders(filled_template)
        unmapped_placeholders = [p for p in remaining_placeholders if "&x" in p]

        # Step 4: Apply the business-logic rules
        filled_template = self._apply_business_logic(filled_template, extracted_data)

        # Step 5: Compute the mapping score
        mapping_confidence = self._calculate_mapping_confidence(
            len(mapped_fields), len(all_placeholders), len(errors)
        )

        logger.info(f"✅ Mapping terminé - {len(mapped_fields)} champs mappés, {len(unmapped_placeholders)} non mappés")

        return MappingResult(
            filled_template=filled_template,
            mapped_fields=mapped_fields,
            unmapped_placeholders=unmapped_placeholders,
            mapping_confidence=mapping_confidence,
            errors=errors
        )

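    # Typical call sequence (sketch; assumes an object exposing the
    # source_field attributes defined in _define_field_mappings, such as the
    # ExtractedData example class at the bottom of this file):
    #
    #   mapper = MedicalTemplateMapper()
    #   result = mapper.map_extracted_data_to_template(ExtractedData())
    #   print(f"{result.mapping_confidence:.1%}", len(result.unmapped_placeholders))
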
    def _find_all_placeholders(self, template: str) -> List[str]:
        """Finds every &x placeholder in the template."""
        # Pattern that also captures the context around each &x
        pattern = r'[^.]*&x[^.]*'
        matches = re.findall(pattern, template)
        return matches

    def _replace_placeholder_with_context(self, template: str, context_pattern: str, value: str, context_identifier: str = None) -> str:
        """Replaces &x within a specific context, with left/right ovary handling."""
        if context_identifier:
            # Find the matching section (right/left ovary)
            lines = template.split('\n')
            in_context = False
            context_found = False

            for i, line in enumerate(lines):
                if context_identifier.lower() in line.lower():
                    in_context = True
                    context_found = True
                elif context_found and (("ovaire" in line.lower() and context_identifier.lower() not in line.lower()) or
                                        line.strip() == "" or
                                        ("Accessibilité" in line and i > 0 and context_identifier.lower() not in lines[i-1].lower())):
                    in_context = False

                if in_context and context_pattern in line:
                    # Escape regex special characters, but keep &x literal
                    escaped_pattern = re.escape(context_pattern).replace(r'\&x', r'&x')
                    lines[i] = re.sub(escaped_pattern, context_pattern.replace('&x', value), line, count=1)
                    break

            return '\n'.join(lines)
        else:
            return self._replace_placeholder_in_context(template, context_pattern, value)

    def _replace_placeholder_in_context(self, template: str, context_pattern: str, value: str) -> str:
        """Replaces &x within a specific context to avoid incorrect replacements."""
        # Escape regex special characters, but keep &x literal
        escaped_pattern = re.escape(context_pattern).replace(r'\&x', r'&x')

        # Replace &x only within this context
        def replace_func(match):
            return match.group(0).replace('&x', value, 1)  # Replace only the first &x

        return re.sub(escaped_pattern, replace_func, template)

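    # Behaviour of the context-limited replacement in isolation (illustration
    # only): only the first &x inside the matched context is substituted, so
    # neighbouring placeholders survive for later mappings.
    #
    #   >>> import re
    #   >>> pattern = re.escape("IP : &x").replace(r'\&x', r'&x')
    #   >>> re.sub(pattern, lambda m: m.group(0).replace('&x', '3,24', 1),
    ...            "Artère utérine : IP : &x - IR : 0,&x")
    #   'Artère utérine : IP : 3,24 - IR : 0,&x'
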
    def _apply_business_logic(self, template: str, extracted_data) -> str:
        """Applies the medical-domain business rules."""

        # Rule 1: If no adenomyosis was detected, tick "non"
        if not extracted_data.adenomyosis_type or extracted_data.adenomyosis_type.lower() == "absente":
            template = template.replace("Adénomyose associée : &x non", "Adénomyose associée : X non")

        # Rule 2: Default accessibility handling for the right ovary
        if not getattr(extracted_data, 'right_ovary_accessibility', None) or getattr(extracted_data, 'right_ovary_accessibility', '').lower() == "normale":
            # Find the right-ovary section and tick "aisée"
            lines = template.split('\n')
            for i, line in enumerate(lines):
                if "ovaire droit" in line.lower() and i < len(lines) - 1:
                    # Look for the next accessibility line
                    for j in range(i+1, min(i+5, len(lines))):
                        if "Accessibilité" in lines[j] and "ovaire droit" in lines[i].lower():
                            lines[j] = lines[j].replace("&x aisée", "X aisée")
                            break
                    break
            template = '\n'.join(lines)

        # Rule 3: Accessibility handling for the left ovary
        if getattr(extracted_data, 'left_ovary_accessibility', None) and "rétro" in getattr(extracted_data, 'left_ovary_accessibility', '').lower():
            lines = template.split('\n')
            for i, line in enumerate(lines):
                if "ovaire gauche" in line.lower() and i < len(lines) - 1:
                    # Look for the next accessibility line
                    for j in range(i+1, min(i+5, len(lines))):
                        if "Accessibilité" in lines[j] and "gauche" in lines[i].lower():
                            lines[j] = lines[j].replace("Accessibilité : &x rétro-utérin", "Accessibilité : X rétro-utérin")
                            break
                    break
            template = '\n'.join(lines)

        # Rule 4: Defaults for the standard examination items
        template = template.replace("- &xVessie vide pendant l'examen", "- XVessie vide pendant l'examen")
        template = template.replace("&x Absence de dilatation pyélo-calicielle", "X Absence de dilatation pyélo-calicielle")

        # Rule 5: Default conclusions
        template = template.replace("&x Absence d'image d'endométriose visible ce jour", "X Absence d'image d'endométriose visible ce jour")

        return template

    def _calculate_mapping_confidence(self, mapped_count: int, total_placeholders: int, error_count: int) -> float:
        """Computes the confidence score of the mapping."""
        if total_placeholders == 0:
            return 1.0

        base_confidence = mapped_count / total_placeholders
        error_penalty = min(error_count * 0.1, 0.3)  # At most a 30% penalty

        return max(0.0, base_confidence - error_penalty)

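    # Worked example with hypothetical counts: 30 fields mapped out of 50
    # placeholders with 2 errors gives 30/50 - min(2 * 0.1, 0.3) = 0.6 - 0.2 = 0.4.
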
    # Data transformation helpers

    def _clean_numeric_value(self, value: str) -> str:
        """Cleans numeric values."""
        if not value:
            return ""

        # Remove redundant units such as "mm mm"
        cleaned = re.sub(r'\s*(mm|cm)\s*(mm|cm)', r' \1', str(value))
        cleaned = re.sub(r'\s*(mm|cm).*', r'', cleaned)  # Drop trailing units
        cleaned = cleaned.replace(',', '.').strip()

        return cleaned

    def _clean_cfa_value(self, value: str) -> str:
        """Cleans CFA values by removing duplicated words."""
        if not value:
            return ""

        cleaned = str(value).replace(' follicules', '').strip()
        # Keep only the number
        match = re.search(r'(\d+)', cleaned)
        return match.group(1) if match else cleaned

    def _extract_first_dimension(self, dimensions: str) -> str:
        """Extracts the first dimension (length)."""
        if not dimensions:
            return ""

        match = re.search(r'(\d+(?:[.,]\d+)?)', dimensions)
        return match.group(1).replace(',', '.') if match else ""

    def _extract_second_dimension(self, dimensions: str) -> str:
        """Extracts the second dimension (width)."""
        if not dimensions:
            return ""

        matches = re.findall(r'(\d+(?:[.,]\d+)?)', dimensions)
        return matches[1].replace(',', '.') if len(matches) > 1 else ""

    def _extract_third_dimension(self, dimensions: str) -> str:
        """Extracts the third dimension (height)."""
        if not dimensions:
            return ""

        matches = re.findall(r'(\d+(?:[.,]\d+)?)', dimensions)
        return matches[2].replace(',', '.') if len(matches) > 2 else ""

    def _format_doppler_ir(self, ir_value: str) -> str:
        """Formats the IR value for the template (0,XX)."""
        if not ir_value:
            return ""

        cleaned = self._clean_numeric_value(ir_value)

        # If the value starts with 0., strip the leading "0."
        if cleaned.startswith('0.'):
            return cleaned[2:]
        elif '.' in cleaned:
            return cleaned.split('.')[1]

        return cleaned

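    # The dimension helpers above share one number-matching pattern; a quick
    # doctest-style check (hypothetical input, illustration only):
    #
    #   >>> import re
    #   >>> re.findall(r'(\d+(?:[.,]\d+)?)', "25 x 19 mm")
    #   ['25', '19']
    #
    # _format_doppler_ir("0,91") would then return "91", which fills the
    # "IR : 0,&x" slot of the template as "IR : 0,91".
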
    def print_mapping_report(self, result: MappingResult) -> str:
        """Generates a formatted mapping report."""
        report = "🔄 RAPPORT DE MAPPING TEMPLATE\n"
        report += "=" * 50 + "\n\n"

        # Overall statistics
        report += f"📊 STATISTIQUES:\n"
        report += f"   Champs mappés: {len(result.mapped_fields)}\n"
        report += f"   Placeholders non mappés: {len(result.unmapped_placeholders)}\n"
        report += f"   Score de confiance: {result.mapping_confidence:.1%}\n"
        report += f"   Erreurs: {len(result.errors)}\n\n"

        # Mapping details
        if result.mapped_fields:
            report += "✅ CHAMPS MAPPÉS:\n"
            for field, value in result.mapped_fields.items():
                report += f"   {field}: {value}\n"
            report += "\n"

        # Unmapped placeholders
        if result.unmapped_placeholders:
            report += "❌ PLACEHOLDERS NON MAPPÉS:\n"
            for placeholder in result.unmapped_placeholders[:10]:  # Limit the output
                report += f"   {placeholder[:50]}...\n"
            if len(result.unmapped_placeholders) > 10:
                report += f"   ... et {len(result.unmapped_placeholders) - 10} autres\n"
            report += "\n"

        # Errors
        if result.errors:
            report += "⚠️ ERREURS:\n"
            for error in result.errors:
                report += f"   {error}\n"

        return report

| 623 |
+
# Fonction utilitaire pour utilisation
|
| 624 |
+
def create_filled_medical_report(extracted_data) -> str:
|
| 625 |
+
"""
|
| 626 |
+
Fonction principale pour créer un rapport médical complet
|
| 627 |
+
à partir des données extraites
|
| 628 |
+
"""
|
| 629 |
+
mapper = MedicalTemplateMapper()
|
| 630 |
+
result = mapper.map_extracted_data_to_template(extracted_data)
|
| 631 |
+
|
| 632 |
+
# Log du rapport
|
| 633 |
+
print(mapper.print_mapping_report(result))
|
| 634 |
+
|
| 635 |
+
return result.filled_template
|
| 636 |
+
|
| 637 |
+
|
| 638 |
+
# Exemple d'utilisation avec correction des problèmes identifiés
|
| 639 |
+
class ExtractedData:
|
| 640 |
+
"""Classe exemple pour les données extraites"""
|
| 641 |
+
def __init__(self):
|
| 642 |
+
# Données exemple basées sur votre extraction
|
| 643 |
+
self.uterus_position = "antéversé"
|
| 644 |
+
self.uterus_size = "7,8 cm"
|
| 645 |
+
self.hysterometry = "60 mm"
|
| 646 |
+
self.endometrium_thickness = "3,7 mm"
|
| 647 |
+
self.junctional_zone_status = "épaissie"
|
| 648 |
+
self.adenomyosis_type = "diffuse"
|
| 649 |
+
|
| 650 |
+
# Données ovaires corrigées
|
| 651 |
+
self.right_ovary_dimensions = "26 x 20 mm"
|
| 652 |
+
self.right_ovary_cfa = "22 follicules"
|
| 653 |
+
self.right_ovary_accessibility = "normale"
|
| 654 |
+
|
| 655 |
+
self.left_ovary_dimensions = "25 x 19 mm" # Correction: 19 au lieu de 20
|
| 656 |
+
self.left_ovary_cfa = "22 follicules"
|
| 657 |
+
self.left_ovary_accessibility = "rétro-utérine"
|
| 658 |
+
|
| 659 |
+
# Données Doppler
|
| 660 |
+
self.doppler_ip = "3,24"
|
| 661 |
+
self.doppler_ir = "0,91"
|
| 662 |
+
|
| 663 |
+
|
| 664 |
+
def test_corrected_mapping():
|
| 665 |
+
"""Test de la correction du mapping"""
|
| 666 |
+
|
| 667 |
+
# Créer des données test
|
| 668 |
+
data = ExtractedData()
|
| 669 |
+
|
| 670 |
+
# Utiliser le mapper corrigé
|
| 671 |
+
mapper = MedicalTemplateMapper()
|
| 672 |
+
result = mapper.map_extracted_data_to_template(data)
|
| 673 |
+
|
| 674 |
+
print("🔧 TEST DU MAPPING CORRIGÉ")
|
| 675 |
+
print("=" * 40)
|
| 676 |
+
print(mapper.print_mapping_report(result))
|
| 677 |
+
|
| 678 |
+
# Vérifications spécifiques pour les ovaires
|
| 679 |
+
print("\n🔍 VÉRIFICATIONS SPÉCIFIQUES:")
|
| 680 |
+
print("-" * 30)
|
| 681 |
+
|
| 682 |
+
# Vérifier ovaire droit
|
| 683 |
+
if "L'ovaire droit mesure 26 x 20 mm" in result.filled_template:
|
| 684 |
+
print("✅ Ovaire droit: dimensions correctes")
|
| 685 |
+
else:
|
| 686 |
+
print("❌ Ovaire droit: problème dimensions")
|
| 687 |
+
|
| 688 |
+
# Vérifier ovaire gauche
|
| 689 |
+
if "L'ovaire gauche mesure 25 x 19 mm" in result.filled_template:
|
| 690 |
+
print("✅ Ovaire gauche: dimensions correctes")
|
| 691 |
+
else:
|
| 692 |
+
print("❌ Ovaire gauche: problème dimensions")
|
| 693 |
+
|
| 694 |
+
# Vérifier CFA dans conclusions
|
| 695 |
+
if "CFA : 22+22 follicules" in result.filled_template:
|
| 696 |
+
print("✅ CFA conclusion: format correct")
|
| 697 |
+
else:
|
| 698 |
+
print("❌ CFA conclusion: problème format")
|
| 699 |
+
|
| 700 |
+
# Vérifier accessibilité
|
| 701 |
+
if "Accessibilité : X rétro-utérin" in result.filled_template and "ovaire gauche" in result.filled_template:
|
| 702 |
+
print("✅ Accessibilité gauche: rétro-utérine correcte")
|
| 703 |
+
else:
|
| 704 |
+
print("❌ Accessibilité gauche: problème")
|
| 705 |
+
|
| 706 |
+
return result.filled_template
|
| 707 |
+
|
| 708 |
+
# Exécuter le test si le script est lancé directement
|
| 709 |
+
if __name__ == "__main__":
|
| 710 |
+
filled_report = test_corrected_mapping()
|
| 711 |
+
print("\n" + "="*50)
|
| 712 |
+
print("RAPPORT FINAL CORRIGÉ:")
|
| 713 |
+
print("="*50)
|
| 714 |
+
print(filled_report)
|
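
Beyond the built-in test above, the module's public entry point can be driven directly with the same sample data; a minimal sketch (the output filename is arbitrary):

    from medical_template3_mapper import create_filled_medical_report, ExtractedData

    data = ExtractedData()                       # sample values defined above
    filled = create_filled_medical_report(data)  # also prints the mapping report
    with open("rapport_test.txt", "w", encoding="utf-8") as f:
        f.write(filled)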

medical_transcription_retriever.py
ADDED
@@ -0,0 +1,284 @@
#!/usr/bin/env python3
"""
Medical Transcription Retriever from Langfuse
Retrieves medical transcriptions from Langfuse traces and saves them locally.
"""

import os
import json
import time
from datetime import datetime, timedelta
from dotenv import load_dotenv
from langfuse import Langfuse

# Load environment variables
load_dotenv()


class MedicalTranscriptionRetriever:
    """Retrieves medical transcriptions from Langfuse traces."""

    def __init__(self):
        """Initialize the retriever with Langfuse credentials."""
        self.public_key = os.getenv('LANGFUSE_PUBLIC_KEY')
        self.secret_key = os.getenv('LANGFUSE_SECRET_KEY')
        self.host = os.getenv('LANGFUSE_HOST', 'https://cloud.langfuse.com')

        if not self.public_key or not self.secret_key:
            raise ValueError("Missing Langfuse keys in .env file")

        self.client = Langfuse(
            public_key=self.public_key,
            secret_key=self.secret_key,
            host=self.host
        )

    def extract_transcription_from_input(self, input_data):
        """Extract transcription from document input data."""
        if isinstance(input_data, str):
            if "Voici le document:" in input_data:
                parts = input_data.split("Voici le document:")
                if len(parts) > 1:
                    return parts[1].strip()

        elif isinstance(input_data, dict):
            # Search in messages if it's a dict with messages
            if 'messages' in input_data:
                for message in input_data['messages']:
                    if isinstance(message, dict) and message.get('role') == 'user':
                        content = message.get('content', '')
                        if isinstance(content, str) and "Voici le document:" in content:
                            parts = content.split("Voici le document:")
                            if len(parts) > 1:
                                return parts[1].strip()

            # Search in other dict keys
            for key, value in input_data.items():
                if isinstance(value, str) and "Voici le document:" in value:
                    parts = value.split("Voici le document:")
                    if len(parts) > 1:
                        return parts[1].strip()

        elif isinstance(input_data, list):
            for message in input_data:
                if isinstance(message, dict):
                    content = message.get('content', '')
                    if isinstance(content, str) and "Voici le document:" in content:
                        parts = content.split("Voici le document:")
                        if len(parts) > 1:
                            return parts[1].strip()

        return None

    def get_traces_with_transcriptions(self, limit=50, days_back=7):
        """Retrieve traces containing medical transcriptions."""
        print(f"🔍 Searching for transcriptions in the last {limit} traces...")

        try:
            # Retrieve traces
            traces = self.client.get_traces(limit=limit)
            print(f"✅ {len(traces.data)} traces retrieved")

            transcriptions = []

            for i, trace in enumerate(traces.data):
                print(
                    f"📋 Analyzing trace {i+1}/{len(traces.data)}: {trace.id}")

                try:
                    # Check if trace.input contains a transcription
                    if hasattr(trace, 'input') and trace.input is not None:
                        transcription = self.extract_transcription_from_input(
                            trace.input)

                        if transcription:
                            trans_info = {
                                'trace_id': trace.id,
                                'trace_name': trace.name,
                                'user_id': trace.user_id,
                                'trace_timestamp': trace.timestamp.isoformat() if trace.timestamp else None,
                                'transcription': transcription,
                                'extracted_at': datetime.now().isoformat()
                            }
                            transcriptions.append(trans_info)
                            print(f"   ✅ Transcription found and extracted!")
                        else:
                            print(f"   ❌ No transcription found in trace.input")
                    else:
                        print(f"   ⚠️ No input available for this trace")

                except Exception as e:
                    print(f"   ⚠️ Error analyzing trace {trace.id}: {e}")
                    continue

                # Delay between requests to avoid rate limiting
                if i < len(traces.data) - 1:  # Don't wait after the last trace
                    time.sleep(1)  # Wait 1 second between each trace

            print(f"\n📊 Summary: {len(transcriptions)} transcriptions found")
            return transcriptions

        except Exception as e:
            print(f"❌ Error retrieving traces: {e}")
            return []

    def save_transcriptions(self, transcriptions, filename=None):
        """Save transcriptions to a JSON file."""
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"medical_transcriptions_{timestamp}.json"

        try:
            # Concatenate all transcriptions into a single string
            transcription_texts = [trans['transcription']
                                   for trans in transcriptions]
            concatenated_transcription = "\n\n".join(transcription_texts)

            # Save as an object with transcription as a single string
            data_to_save = {
                "extracted_at": datetime.now().isoformat(),
                "total_transcriptions": len(transcriptions),
                "transcription": concatenated_transcription
            }

            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data_to_save, f, ensure_ascii=False, indent=2)

            print(f"💾 Transcriptions saved to {filename}")
            return filename

        except Exception as e:
            print(f"❌ Error during save: {e}")
            return None

    def save_transcriptions_by_user(self, transcriptions):
        """Save transcriptions by user in separate files."""
        if not transcriptions:
            print("📭 No transcriptions to save")
            return

        # Create transcriptions directory if it doesn't exist
        transcriptions_dir = "transcriptions"
        if not os.path.exists(transcriptions_dir):
            os.makedirs(transcriptions_dir)
            print(f"📁 Directory '{transcriptions_dir}' created")

        # Group transcriptions by user_id
        user_transcriptions = {}
        for trans in transcriptions:
            user_id = trans.get('user_id', 'unknown')
            if user_id not in user_transcriptions:
                user_transcriptions[user_id] = []
            user_transcriptions[user_id].append(trans)

        # Save one file per user (only if user_id contains .rtf)
        saved_files = []
        for user_id, user_trans in user_transcriptions.items():
            # Check if user_id contains .rtf
            if '.rtf' not in user_id:
                print(f"⏭️ Skipped {user_id} (no .rtf)")
                continue

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"transcriptions_{user_id}_{timestamp}.json"
            filepath = os.path.join(transcriptions_dir, filename)

            try:
                # Concatenate all transcriptions into a single string
                transcription_texts = [trans['transcription']
                                       for trans in user_trans]
                concatenated_transcription = "\n\n".join(transcription_texts)

                # Save as an object with transcription as a single string
                data_to_save = {
                    "user_id": user_id,
                    "extracted_at": datetime.now().isoformat(),
                    "total_transcriptions": len(user_trans),
                    "transcription": concatenated_transcription
                }

                with open(filepath, 'w', encoding='utf-8') as f:
                    json.dump(data_to_save, f, ensure_ascii=False, indent=2)

                saved_files.append(filepath)
                print(f"💾 Saved transcriptions for {user_id}: {filename}")

            except Exception as e:
                print(f"❌ Error saving transcriptions for {user_id}: {e}")

        print(f"\n📊 Summary: {len(saved_files)} files saved")
        return saved_files

    def display_transcriptions_summary(self, transcriptions):
        """Display a summary of retrieved transcriptions."""
        if not transcriptions:
            print("📭 No transcriptions to display")
            return

        print("\n📊 TRANSCRIPTIONS SUMMARY")
        print("=" * 50)
        print(f"Total transcriptions: {len(transcriptions)}")

        # Group by user
        user_counts = {}
        for trans in transcriptions:
            user_id = trans.get('user_id', 'unknown')
            user_counts[user_id] = user_counts.get(user_id, 0) + 1

        print(f"Unique users: {len(user_counts)}")
        for user_id, count in user_counts.items():
            print(f"  - {user_id}: {count} transcriptions")

    def run(self, limit=50, save_to_file=True, save_by_user=True):
        """Run the complete transcription retrieval process."""
        print("🚀 Starting medical transcription retrieval...")
        print("=" * 60)

        # Retrieve transcriptions
        transcriptions = self.get_traces_with_transcriptions(limit=limit)

        if not transcriptions:
            print("❌ No transcriptions found")
            return None

        # Display summary
        self.display_transcriptions_summary(transcriptions)

        # Save transcriptions
        saved_files = []
        if save_to_file:
            saved_file = self.save_transcriptions(transcriptions)
            if saved_file:
                saved_files.append(saved_file)

        if save_by_user:
            user_files = self.save_transcriptions_by_user(transcriptions)
            saved_files.extend(user_files)

        print(f"\n✅ Retrieval completed! {len(saved_files)} files saved")
        return saved_files


def main():
    """Main function to run the transcription retriever."""
    print("🏥 Medical Transcription Retriever")
    print("=" * 40)

    try:
        retriever = MedicalTranscriptionRetriever()
        saved_files = retriever.run(
            limit=50, save_to_file=True, save_by_user=True)

        if saved_files:
            print(f"\n🎉 Success! Files saved: {len(saved_files)}")
        else:
            print("\n❌ No files were saved")

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
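
Since `extract_transcription_from_input` is pure string handling, it can be checked offline without Langfuse credentials; a minimal sketch (the payload is invented, and `__new__` is used deliberately to bypass the key check in `__init__`):

    from medical_transcription_retriever import MedicalTranscriptionRetriever

    retriever = MedicalTranscriptionRetriever.__new__(MedicalTranscriptionRetriever)
    sample_input = {
        "messages": [
            {"role": "user",
             "content": "Voici le document: L'utérus est antéversé de taille normale."}
        ]
    }
    print(retriever.extract_transcription_from_input(sample_input))
    # -> "L'utérus est antéversé de taille normale."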

models.py
ADDED
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
"""
Pydantic models for medical document processing
"""

from pydantic import BaseModel, Field
from typing import Dict, List, Any, Optional


class TemplateAnalysis(BaseModel):
    """Model for template analysis results."""
    sections: List[Dict[str, Any]] = Field(
        description="List of sections found in template")
    formatting: Dict[str, Any] = Field(description="Formatting information")
    document_info: Dict[str, str] = Field(description="Document metadata")


class MedicalTranscription(BaseModel):
    """Model for medical transcription data."""
    raw_text: str = Field(description="Raw transcription text")
    corrected_text: str = Field(description="Corrected and structured text")
    medical_data: Dict[str, Any] = Field(
        description="Extracted medical information")


class SectionContent(BaseModel):
    """Model for section content."""
    technique: str = Field(description="Technique section content")
    result: str = Field(description="Result section content")
    conclusion: str = Field(description="Conclusion section content")


class InsertSectionsInput(BaseModel):
    """Model for inserting sections into documents."""
    template_path: str
    sections: Dict[str, str]
    output_path: str
    title: Optional[str] = None  # was `str = None`; Optional matches the None default
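
These models are plain Pydantic v2 schemas, so they validate and serialize out of the box; a minimal sketch (all field values and paths are made up, for illustration only):

    from models import SectionContent, InsertSectionsInput

    content = SectionContent(
        technique="Échographie pelvienne par voie endovaginale.",
        result="Utérus antéversé de taille normale.",
        conclusion="Examen sans anomalie décelable.",
    )
    print(content.model_dump_json(indent=2))  # Pydantic v2 JSON serialization

    payload = InsertSectionsInput(
        template_path="template.docx",  # hypothetical paths
        sections={"technique": content.technique},
        output_path="out.docx",
    )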

post_processing.py
ADDED
@@ -0,0 +1,373 @@
import re
import logging
from typing import List, Tuple

logger = logging.getLogger(__name__)


class MedicalReportPostProcessor:
    """Post-processing to clean up mapped medical reports"""

    def __init__(self):
        # Patterns that detect lines containing multiple-choice options
        self.choice_patterns = [
            # Uterus position
            r'L\'utérus est(.*?)de taille',
            # Adenomyosis
            r'Adénomyose associée :(.*?)(?:\n|$)',
            r'&x oui :(.*?)(?:\n|Col utérin)',
            # Ovary accessibility
            r'Accessibilité :(.*?)(?:\n|\t|$)',
            # Compartments
            r'Signe du glissement \(sliding\) :(.*?)(?:\n|$)',
            # Nodules
            r'Présence d\'un nodule :(.*?)(?:\n|$)',
            r'Présence d\'un nodule hypoéchogène :(.*?)(?:\n|$)',
            # Aspect
            r'Aspect du torus\s*:(.*?)(?:\n|$)',
            r'Ligament utéro-.*?:(.*?)(?:\n|$)',
            # Infiltration
            r'Infiltration digestive:(.*?)(?:\n|$)',
            # Endometriosis
            r'Endométriose(.*?)(?:\n|Absence)',
            # Effusion
            r'- (Pas d\'épanchement.*?|Faible épanchement.*?)(?:\n|$)',
            # Bladder
            r'- (.*?Vessie.*?)(?:\n|$)',
            # Dilation
            r'- (.*?dilatation.*?)(?:\n|$)',
            # Calcifications
            r'Présence de micro-calcifications(.*?)(?:\n|L\'échostructure)',
            # Ovaries with above-normal dimensions
            r', (.*?est de dimensions supérieures.*?)(?:,|\n)',
            # Endometrioma
            r'\. (.*?endométriome\.)(?:\n|$)',
        ]

    def process_report(self, report: str) -> str:
        """
        Process the full report to clean up multiple-choice lines
        """
        logger.info("🧹 Début du post-traitement du rapport")

        processed_report = report

        # Step 1: clean lines with multiple choices
        processed_report = self._clean_multiple_choices(processed_report)

        # Step 2: clean the remaining isolated placeholders
        processed_report = self._clean_isolated_placeholders(processed_report)

        # Step 3: clean spacing and formatting
        processed_report = self._clean_formatting(processed_report)

        logger.info("✅ Post-traitement terminé")

        return processed_report

    def _clean_multiple_choices(self, text: str) -> str:
        """
        Cleans lines containing several choices (&x or X),
        keeping only the checked options (X)
        """
        lines = text.split('\n')
        cleaned_lines = []

        for line in lines:
            # Check whether the line contains multiple choices
            if self._has_multiple_choices(line):
                cleaned_line = self._extract_checked_choices(line)
                cleaned_lines.append(cleaned_line)
            else:
                cleaned_lines.append(line)

        return '\n'.join(cleaned_lines)

    def _has_multiple_choices(self, line: str) -> bool:
        """
        Detects whether a line contains multiple choices
        (at least 2 occurrences of &x or X followed by a word)
        """
        # Count choice patterns: &x or X followed by a word
        pattern = r'(?:&x|(?<!\w)X(?=\s+\w))\s+\w+'
        matches = re.findall(pattern, line)
        return len(matches) >= 2

    def _extract_checked_choices(self, line: str) -> str:
        """
        Extracts only the checked choices (X) from a line
        """
        # Split the line into the part before the choices and the part after
        parts = self._split_line_by_choices(line)

        if not parts:
            return line

        prefix = parts['prefix']
        choices = parts['choices']
        suffix = parts['suffix']

        # Extract the checked choices
        checked_choices = []
        for choice in choices:
            if choice.strip().startswith('X '):
                # Drop the X and keep the text
                checked_text = choice.strip()[2:].strip()
                checked_choices.append(checked_text)

        # Rebuild the line
        if checked_choices:
            result = prefix
            if len(checked_choices) == 1:
                result += checked_choices[0]
            else:
                result += ', '.join(checked_choices)
            result += suffix
            return result
        else:
            # If no choice is checked, return the original line
            return line

    def _split_line_by_choices(self, line: str) -> dict:
        """
        Splits a line into: prefix, choices, suffix
        """
        # Specific cases with known patterns

        # Uterus position
        match = re.search(r'(L\'utérus est\s+)((?:[X&]x?\s+\w+[,\s]+)+)(de taille.*)', line)
        if match:
            return {
                'prefix': match.group(1),
                'choices': self._parse_choices(match.group(2)),
                'suffix': ' ' + match.group(3)
            }

        # Associated adenomyosis
        match = re.search(r'(Adénomyose associée\s*:\s*)((?:[X&]x?\s+\w+\s*)+)(.*)', line)
        if match:
            return {
                'prefix': match.group(1),
                'choices': self._parse_choices(match.group(2)),
                'suffix': match.group(3)
            }

        # Adenomyosis type
        match = re.search(r'([X&]x?\s+oui\s*:\s*)((?:[X&]x?\s+\w+\s*)+)(.*)', line)
        if match:
            return {
                'prefix': '',
                'choices': self._parse_choices(match.group(2)),
                'suffix': match.group(3)
            }

        # Accessibility
        match = re.search(r'(Accessibilité\s*:\s*)((?:[X&]x?\s+[\w-]+\s*)+)(.*)', line)
        if match:
            return {
                'prefix': match.group(1),
                'choices': self._parse_choices(match.group(2)),
                'suffix': match.group(3)
            }

        # Sliding sign
        match = re.search(r'(.*?Signe du glissement.*?:\s*)((?:[X&]x?\s*\w+\s*)+)(.*)', line)
        if match:
            return {
                'prefix': match.group(1),
                'choices': self._parse_choices(match.group(2)),
                'suffix': match.group(3)
            }

        # Nodule presence
        match = re.search(r'(.*?Présence d\'un nodule.*?:\s*)((?:[X&]x?\s*\w+\s*)+)(.*)', line)
        if match:
            return {
                'prefix': match.group(1),
                'choices': self._parse_choices(match.group(2)),
                'suffix': match.group(3)
            }

        # Aspect
        match = re.search(r'(.*?Aspect.*?:\s*)((?:[X&]x?\s+\w+\s*)+)(.*)', line)
        if match:
            return {
                'prefix': match.group(1),
                'choices': self._parse_choices(match.group(2)),
                'suffix': match.group(3)
            }

        # Digestive infiltration
        match = re.search(r'(.*?Infiltration digestive:\s*)((?:[X&]x?\s+\w+\s*)+:\s*)((?:[X&]x?\s+[\w\s-]+)+)(.*)', line)
        if match:
            # Handle the special case with "non/oui :"
            first_choice = self._parse_choices(match.group(2))
            second_choices = self._parse_choices(match.group(3))
            return {
                'prefix': match.group(1),
                'choices': first_choice + second_choices,
                'suffix': match.group(4)
            }

        # Calcifications
        match = re.search(r'(.*?micro-calcifications.*?)((?:[X&]x?\s+\w+\s*)+)(.*)', line)
        if match:
            return {
                'prefix': match.group(1),
                'choices': self._parse_choices(match.group(2)),
                'suffix': match.group(3)
            }

        # Endometriosis
        match = re.search(r'([X&]x?\s+Endométriose\s+)((?:[X&]x?\s+\w+\s*)+)(.*)', line)
        if match:
            return {
                'prefix': 'Endométriose ',
                'choices': self._parse_choices(match.group(2)),
                'suffix': match.group(3)
            }

        # Effusion (line starting with -)
        match = re.search(r'^(\s*-?\s*)([X&]x?-?\s*(?:Pas|Faible).*?)$', line)
        if match:
            text = match.group(2)
            if text.strip().startswith('X'):
                return {
                    'prefix': match.group(1),
                    'choices': ['X ' + text[1:].strip()],
                    'suffix': ''
                }

        # Bladder
        match = re.search(r'^(\s*-\s*)([X&]x?\s*Vessie.*?)([X&]x?\s*Vessie.*?)$', line)
        if match:
            choices = []
            if match.group(2).strip().startswith('X'):
                choices.append('X ' + match.group(2)[1:].strip())
            if match.group(3).strip().startswith('&x'):
                pass  # Add nothing
            elif match.group(3).strip().startswith('X'):
                choices.append('X ' + match.group(3)[1:].strip())
            return {
                'prefix': match.group(1),
                'choices': choices,
                'suffix': ''
            }

        return None

    def _parse_choices(self, choices_text: str) -> List[str]:
        """
        Parses the choices text to extract each option
        """
        # Split on &x or X at the start of a word
        parts = re.split(r'(?=[X&]x?\s+)', choices_text)
        return [p.strip() for p in parts if p.strip()]

    def _clean_isolated_placeholders(self, text: str) -> str:
        """
        Cleans the remaining isolated &x placeholders
        """
        # Remove &x at the start of a line or after a space
        text = re.sub(r'^\s*&x\s*', '', text, flags=re.MULTILINE)
        text = re.sub(r'\s+&x\s+', ' ', text)
        text = re.sub(r'\s+&x$', '', text, flags=re.MULTILINE)

        # Drop lines that contain nothing but &x
        lines = text.split('\n')
        cleaned_lines = []
        for line in lines:
            # If the line contains only &x and spaces, drop it
            if re.match(r'^\s*(?:&x\s*)+$', line):
                continue
            cleaned_lines.append(line)

        return '\n'.join(cleaned_lines)

    def _clean_formatting(self, text: str) -> str:
        """
        Cleans general formatting
        """
        # Collapse multiple spaces
        text = re.sub(r' +', ' ', text)

        # Collapse multiple blank lines
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)

        # Remove spaces before punctuation
        text = re.sub(r' +([,.])', r'\1', text)

        # Normalize spaces after leading dashes
        text = re.sub(r'^(\s*-)\s+', r'\1 ', text, flags=re.MULTILINE)

        return text.strip()


def post_process_medical_report(filled_template: str) -> str:
    """
    Main entry point for post-processing a medical report
    """
    processor = MedicalReportPostProcessor()
    return processor.process_report(filled_template)


# Usage example
if __name__ == "__main__":
    # Sample report with multiple-choice lines
    sample_report = """L'utérus est X antéversé, &x rétroversé, &x intermédiaire, &x rétrofléchi, &x antéfléchi, &x fixe de taille normale (7.8 x &x x &x cm).
Hystérométrie : distance orifice externe du col - fond de la cavité utérine : 60 mm.
L'endomètre : mesuré à 3.7 mm.
Myometre : pas de myome.
Zone jonctionnelle : Atteinte de la zone de jonction : &x non &x oui
Adénomyose associée : &x non X oui : X diffuse &x focale &x interne &x externe
Col utérin: pas de kyste de Naboth. Absence de pathologies échographiquement décelable à son niveau.
Cavité utérine en 3D: morphologie triangulaire.

L'ovaire droit mesure 26 x 20 mm, &x est de dimensions supérieures à la normale il mesure &x x &x mm, &xfolliculaire CFA 15 follicules: (&x mm). &x Absence d'endométriome. &x Présence d'une formation kystique hypoéchogène, uniloculaire, non vascularisé, à contenu ground glass mesurée à &x mm d'allure endométriome.
Accessibilité : &x rétro-utérin &x fixe X aisée.
L'ovaire gauche mesure 25 x 19 mm, &x est de dimensions supérieures à la normale il mesure &x x &x mm, &x folliculaire CFA 22 follicules: (&x mm). &x Absence d'endométriome. &x Présence d'une formation kystique hypoéchogène, uniloculaire, non vascularisé, à contenu ground glass mesurée à &x mm d'allure endométriome.
Accessibilité : X rétro-utérin &x fixe &x aisée.
&x Présence de micro-calcifications sous thécales &x bilatérales &x droites &x gauches pouvant témoigner d'implants endométriosiques superficiels.
L'échostructure des deux ovaires apparait normale, avec une vascularisation artério-veineuse normale au Doppler, sans formation ou image kystique pathologique échographiquement décelable à leur niveau.

Cavité péritonéale
&x- Pas d'épanchement liquidien dans le cul du sac du Douglas. Pas de douleur à l'écho-palpation.
&x- Faible épanchement corpusculé dans le cul du sac du Douglas qui silhouette des adhérences (soft marqueur d'endométriose?). Pas de douleur à l'écho-palpation.
- XVessie vide pendant l'examen. &x Vessie en semi-réplétion pendant l'examen.
- X Absence de dilatation pyélo-calicielle.
- Artère utérine : IP : 3.24 - IR : 0,91 - Spectre : type 2 avec notch protodiastolique.
- Pas d'image d'hydrosalpinx visible à ce jour.

RECHERCHE ENDOMETRIOSE PELVIENNE

A-Compartiment antérieur (vessie en semi-réplétion)
- Signe du glissement (sliding) : &xprésent &xdiminué &xabsent
- Présence d'un nodule : &xnon &xoui
- Uretères dans la partie pelvienne vus non dilatés.

B-Compartiment postérieur
- Signe du glissement (sliding) :
- Espace recto-vaginal : &xprésent &xdiminué &xabsent
- Plan sus-péritonéal : &xprésent &xdiminué &xabsent
- Aspect du torus : &x normal &x épaissi
- Aspect des ligaments utéro-sacrés :
- Ligament utéro- sacré droit : &x normal &x épaissi
- Ligament utéro-sacré gauche : &x normal &x épaissi
- Présence d'un nodule hypoéchogène : &x non
- Infiltration digestive: &x non X oui : &x bas rectum &x moyen rectum &x haut rectum &x jonction recto-sigmoïde

Conclusions
Utérus de taille et de morphologie normales.
Endomètre mesuré à 3.7 mm.
CFA : 15+22 follicules.
Ovaires sans formation ou image kystique pathologique échographiquement décelable à leur niveau.
X Absence d'image d'endométriose visible ce jour, à confronter éventuellement à une IRM.
&x Endométriose &x superficielle &x et profonde.
Absence d'anomalie échographiquement décelable au niveau des trompes.
--> L'ensemble de ces aspects reste à confronter au contexte clinico-thérapeutique."""

    # Apply the post-processing
    cleaned_report = post_process_medical_report(sample_report)

    print("=" * 60)
    print("RAPPORT APRÈS POST-TRAITEMENT")
    print("=" * 60)
    print(cleaned_report)
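
Because the choice-cleaning logic is regex-driven, it can be probed on a single line; a minimal sketch (the line is adapted from the sample report above):

    from post_processing import MedicalReportPostProcessor

    proc = MedicalReportPostProcessor()
    line = "L'utérus est X antéversé, &x rétroversé, &x fixe de taille normale."
    print(proc._has_multiple_choices(line))     # True: several &x/X options on one line
    print(proc._extract_checked_choices(line))  # -> "L'utérus est antéversé, de taille normale."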

rapport_medical_final.txt
ADDED
@@ -0,0 +1,103 @@
RAPPORT MÉDICAL GÉNÉRÉ AUTOMATIQUEMENT
==================================================

📋 DONNÉES EXTRAITES:
🏥 RAPPORT D'EXTRACTION MÉDICALE
==================================================

🫀 UTÉRUS:
   Position: antéversé
   Taille: 7,8 cm
   Hystérométrie: 60 mm

🔬 ENDOMÈTRE:
   Épaisseur: 3,7 mm

🧬 ZONE JONCTIONNELLE:
   Status: épaissie
   Myomes présents: False
   Adénomyose: diffuse

🥚 OVAIRE DROIT:
   Dimensions: 26 x 20 mm
   CFA: 5 follicules
   Accessibilité: normale

🥚 OVAIRE GAUCHE:
   Dimensions: 25 x 19 mm
   CFA: 22 follicules
   Accessibilité: difficile rétro-utérine

📊 DOPPLER:
   IP: 3,24
   IR: 0,91

📈 STATISTIQUES:
   Score de confiance: 100.0%
   Champs manquants: 0


📄 TEMPLATE REMPLI:
--------------------------------------------------
L'utérus est X antéversé, &x rétroversé, &x intermédiaire, &x rétrofléchi, &x antéfléchi, &x fixe de taille normale (7.8 x &x x &x cm).
Hystérométrie : distance orifice externe du col - fond de la cavité utérine : 60 mm.
L'endomètre : mesuré à 3.7 mm.
Myometre : pas de myome.
Zone jonctionnelle : Atteinte de la zone de jonction : &x non &x oui
Adénomyose associée : &x non X oui : X diffuse &x focale &x interne &x externe
Col utérin: pas de kyste de Naboth. Absence de pathologies échographiquement décelable à son niveau.
Cavité utérine en 3D: morphologie triangulaire.

&xKISSING OVARIES
L'ovaire droit mesure 26 x 20 mm, &x est de dimensions supérieures à la normale il mesure &x x &x mm, &xfolliculaire CFA 5 follicules: (&x mm). &x Absence d'endométriome. &x Présence d'une formation kystique hypoéchogène, uniloculaire, non vascularisé, à contenu ground glass mesurée à &x mm d'allure endométriome.
Accessibilité : &x rétro-utérin &x fixe X aisée.
L'ovaire gauche mesure 25 x 19 mm, &x est de dimensions supérieures à la normale il mesure &x x &x mm, &x folliculaire CFA 22 follicules: (&x mm). &x Absence d'endométriome. &x Présence d'une formation kystique hypoéchogène, uniloculaire, non vascularisé, à contenu ground glass mesurée à &x mm d'allure endométriome.
Accessibilité : X rétro-utérin &x fixe &x aisée.
&x Présence de micro-calcifications sous thécales &x bilatérales &x droites &x gauches pouvant témoigner d'implants endométriosiques superficiels.
L'échostructure des deux ovaires apparait normale, avec une vascularisation artério-veineuse normale au Doppler, sans formation ou image kystique pathologique échographiquement décelable à leur niveau.

Cavité péritonéale
&x- Pas d'épanchement liquidien dans le cul du sac du Douglas. Pas de douleur à l'écho-palpation.
&x- Faible épanchement corpusculé dans le cul du sac du Douglas qui silhouette des adhérences (soft marqueur d'endométriose?). Pas de douleur à l'écho-palpation.
- XVessie vide pendant l'examen. &x Vessie en semi-réplétion pendant l'examen.
- X Absence de dilatation pyélo-calicielle.
- Artère utérine : IP : 3.24 - IR : 0,91 - Spectre : type 2 avec notch protodiastolique.
- Pas d'image d'hydrosalpinx visible à ce jour.

RECHERCHE ENDOMETRIOSE PELVIENNE

A-Compartiment antérieur (vessie en semi-réplétion)
- Signe du glissement (sliding) : &xprésent &xdiminué &xabsent
- Présence d'un nodule : &xnon &xoui
- Uretères dans la partie pelvienne vus non dilatés.


B-Compartiment postérieur
- Signe du glissement (sliding) :
- Espace recto-vaginal : &xprésent &xdiminué &xabsent
- Plan sus-péritonéal : &xprésent &xdiminué &xabsent
- Aspect du torus : &x normal &x épaissi
- Aspect des ligaments utéro-sacrés :
- Ligament utéro- sacré droit : &x normal &x épaissi
- Ligament utéro-sacré gauche : &x normal &x épaissi
- Présence d'un nodule hypoéchogène : &x non
- Infiltration digestive: &x non X oui : &x bas rectum &x moyen rectum &x haut rectum &x jonction recto-sigmoïde

Conclusions
Utérus de taille et de morphologie normales.
Endomètre mesuré à 3.7 mm.
CFA : 5+22 follicules.
Ovaires sans formation ou image kystique pathologique échographiquement décelable à leur niveau.
X Absence d'image d'endométriose visible ce jour, à confronter éventuellement à une IRM.
&x Endométriose &x superficielle &x et profonde.
Absence d'anomalie échographiquement décelable au niveau des trompes.
--> L'ensemble de ces aspects reste à confronter au contexte clinico-thérapeutique.

(qui contient des trous représentés par &x)

📊 STATISTIQUES:
------------------------------
Score d'extraction: 100.0%
Champs mappés: 19
Score de mapping: 73.1%
Erreurs de mapping: 0

requirements.txt
ADDED
@@ -0,0 +1,15 @@
python-docx==0.8.11
lxml==4.9.3
openai>=1.0.0
langchain>=0.1.0
langchain-openai>=0.1.0
langchain-community>=0.1.0
pydantic>=2.0.0
python-dotenv>=1.0.0
langfuse>=2.0.0
pysftp>=0.2.9
sentence-transformers
torch
faiss-cpu
numpy
gradio
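
These pins install in one step; a typical setup, assuming Python 3.10+ and a fresh virtual environment:

    python -m venv .venv && source .venv/bin/activate
    pip install -r requirements.txt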

run_test.py
ADDED
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""
Simplified test of the medical NER agent + template mapper:
extraction → display → mapping → display → TXT file
"""
import os
import sys
from type3_extract_entities import MedicalNERAgent, ExtractedData
from medical_template3_mapper import MedicalTemplateMapper, create_filled_medical_report


def main():
    """Simplified test: extraction + mapping + file generation"""
    print("🏥 TEST AGENT NER MÉDICAL + MAPPER")
    print("=" * 50)

    # Transcription to analyze
    transcription = """Compte rendu classique. L'utérus est antéversé de taille 7,8 cm 60 d'hystérométrie
3,7 d'endomètre triangulaire zone jonctionnelle épaissie focale d'adénomyose diffuse fibromes
myomètre pas de fibromes. Le col voulut le laisser comme il est la morphologie triangulaire.
L'ovaire droit mesure 26 x 20 mm, 5 follicules. L'ovaire gauche accessibilité au maître rétro
thérape par contre l'autre il est normal il mesure 25 x 19 mm siège de CFA : 22 follicules.
Le Doppler : IP 3,24 - IR 0,91 et le reste tout en fait qui est l'ovaire gauche d'accès
difficile à rétro-utérin."""

    try:
        # STEP 1: ENTITY EXTRACTION
        print("🔬 ÉTAPE 1: EXTRACTION DES ENTITÉS MÉDICALES")
        print("-" * 50)

        agent = MedicalNERAgent()
        extracted_data = agent.extract_medical_entities(transcription)

        # Display the extraction results
        print(agent.print_extraction_report(extracted_data))

        # STEP 2: MAPPING ONTO THE TEMPLATE
        print("\n📋 ÉTAPE 2: MAPPING VERS TEMPLATE")
        print("-" * 50)

        mapper = MedicalTemplateMapper()
        mapping_result = mapper.map_extracted_data_to_template(extracted_data)

        # Display the mapping results
        print(mapper.print_mapping_report(mapping_result))

        # STEP 3: FILE GENERATION
        print("\n💾 ÉTAPE 3: GÉNÉRATION DU FICHIER")
        print("-" * 50)

        output_file = "rapport_medical_final.txt"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("RAPPORT MÉDICAL GÉNÉRÉ AUTOMATIQUEMENT\n")
            f.write("=" * 50 + "\n\n")
            f.write("📋 DONNÉES EXTRAITES:\n")
            f.write(agent.print_extraction_report(extracted_data))
            f.write("\n\n📄 TEMPLATE REMPLI:\n")
            f.write("-" * 50 + "\n")
            f.write(mapping_result.filled_template)

            # Append the statistics
            f.write("\n\n📊 STATISTIQUES:\n")
            f.write("-" * 30 + "\n")
            f.write(f"Score d'extraction: {extracted_data.extraction_confidence:.1%}\n")
            f.write(f"Champs mappés: {len(mapping_result.mapped_fields)}\n")
            f.write(f"Score de mapping: {mapping_result.mapping_confidence:.1%}\n")
            f.write(f"Erreurs de mapping: {len(mapping_result.errors)}\n")

        print(f"✅ Rapport médical sauvegardé dans: {output_file}")

        # Display the final summary
        print(f"\n📊 RÉSUMÉ FINAL:")
        print(f"   🎯 Score d'extraction: {extracted_data.extraction_confidence:.1%}")
        print(f"   🎯 Champs mappés: {len(mapping_result.mapped_fields)}")
        print(f"   🎯 Score de mapping: {mapping_result.mapping_confidence:.1%}")
        print(f"   ⚠️ Erreurs: {len(mapping_result.errors)}")
        print(f"   📝 Placeholders non mappés: {len(mapping_result.unmapped_placeholders)}")

        # Show the errors, if any
        if mapping_result.errors:
            print(f"\n⚠️ ERREURS DE MAPPING:")
            for error in mapping_result.errors:
                print(f"   - {error}")

        # Show a few unmapped placeholders
        if mapping_result.unmapped_placeholders:
            print(f"\n📝 QUELQUES PLACEHOLDERS NON MAPPÉS:")
            for placeholder in mapping_result.unmapped_placeholders[:5]:
                print(f"   - {placeholder[:60]}...")

        print("\n🎉 PROCESSUS TERMINÉ AVEC SUCCÈS!")

        # Preview of the filled template (the label now matches the 200-char slice below)
        print(f"\n👁️ APERÇU DU TEMPLATE REMPLI (200 premiers caractères):")
        print("-" * 50)
        preview = mapping_result.filled_template[:200] + "..." if len(mapping_result.filled_template) > 200 else mapping_result.filled_template
        print(preview)

    except Exception as e:
        print(f"\n💥 ERREUR: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
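
One natural extension, not wired into run_test.py itself, would be to pass the filled template through the post-processor before writing it out; a hedged sketch of what that could look like inside main(), once mapping_result exists:

    from post_processing import post_process_medical_report

    cleaned = post_process_medical_report(mapping_result.filled_template)
    with open("rapport_medical_final_clean.txt", "w", encoding="utf-8") as f:  # hypothetical output name
        f.write(cleaned)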

sample.docx
ADDED
Binary file (3.99 kB)

sample.txt
ADDED
@@ -0,0 +1,3 @@
radiographies des genoux droit et gauche face profil face en schuss axial de rotule…
point à la ligne
Le patient présente une perte de hauteur du compartiment interne droit modérée…

save_matcher.py
ADDED
@@ -0,0 +1,1288 @@
import os
import json
import logging
import numpy as np
from typing import Dict, List, Optional, Tuple, Set
from dataclasses import dataclass
from pathlib import Path
import pickle
import re
from sentence_transformers import SentenceTransformer
import faiss
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Reuse the classes from the existing code
from template_db_creation import MedicalTemplateParser, TemplateInfo

@dataclass
class SectionMatch:
    """Represents the matching of a single section"""
    section_name: str
    confidence: float
    extracted_content: str
    can_fill: bool
    missing_info: List[str]

@dataclass
class TemplateMatch:
    """Detailed result of matching one template"""
    template_id: str
    template_info: TemplateInfo
    overall_score: float
    type_match_score: float
    physician_match_score: float
    center_match_score: float
    content_match_score: float
    filename_match_score: float  # New score
    fillability_score: float
    section_matches: Dict[str, SectionMatch]
    confidence_level: str
    can_be_filled: bool
    filling_percentage: float
    missing_critical_info: List[str]
    extracted_data: Dict[str, str]
    filename_indicators: List[str]  # New field

@dataclass
class FilenameAnalysis:
    """Analysis of a medical file name"""
    original_filename: str
    medical_keywords: List[str]
    document_type_indicators: List[str]
    specialty_indicators: List[str]
    center_indicators: List[str]
    anatomical_regions: List[str]
    procedure_type: Optional[str]
    confidence_score: float

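# Illustrative sketch (not executed anywhere): the rough shape a FilenameAnalysis
# takes for the sample radiology file name used later in this module. The exact
# values depend on whether the GPT path or the fallback analyzer produced it, so
# the ones below are assumptions for documentation only.
#
#   FilenameAnalysis(
#       original_filename="default.73.931915433.rtf_3650535_radiologie.doc",
#       medical_keywords=["radio", "radiologie"],
#       document_type_indicators=["radiologie"],
#       specialty_indicators=["radiologie"],
#       center_indicators=[],
#       anatomical_regions=[],
#       procedure_type=None,
#       confidence_score=1.0,
#   )
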
class SmartTranscriptionMatcher:
    """Intelligent matching system between transcriptions and medical templates"""

    def __init__(self, database_path: str = None):
        """Initializes the matcher with an existing database"""
        self.parser = MedicalTemplateParser()
        self.llm = None
        self.content_analyzer = None
        self.section_extractor = None
        self.filename_analyzer = None  # New
        self._initialize_gpt()
        self._initialize_filename_keywords()

        if database_path and os.path.exists(database_path):
            self.load_database(database_path)
        else:
            logging.warning("Base de données non trouvée ou non spécifiée")

    def _initialize_filename_keywords(self):
        """Initializes the keyword lists used for file name analysis"""
        self.filename_keywords = {
            # Imaging exam types
            "imagerie": {
                "irm": ["irm", "mri", "resonance"],
                "scanner": ["scanner", "tdm", "ct", "tomodensitometrie"],
                "echographie": ["echo", "echographie", "doppler", "ultrasound"],
                "radiologie": ["radio", "radiologie", "rx", "xray"],
                "pet": ["pet", "tep", "scintigraphie"],
                "mammographie": ["mammo", "mammographie", "breast"]
            },

            # Medical specialties
            "specialites": {
                "cardiologie": ["cardio", "coeur", "heart", "ecg", "holter"],
                "neurologie": ["neuro", "brain", "cerveau", "eeg"],
                "orthopedic": ["ortho", "os", "bone", "fracture"],
                "gynecologie": ["gyneco", "utérus", "ovaire", "pelvien"],
                "urologie": ["uro", "vessie", "rein", "prostate"],
                "pneumologie": ["pneumo", "poumon", "thorax", "resp"],
                "gastro": ["gastro", "abdomen", "foie", "intestin"]
            },

            # Anatomical regions
            "anatomie": {
                "tete": ["tete", "crane", "cerebral", "encephale"],
                "thorax": ["thorax", "poumon", "coeur", "mediastin"],
                "abdomen": ["abdomen", "foie", "rate", "pancreas"],
                "pelvis": ["pelvis", "pelvien", "utérus", "ovaire", "vessie"],
                "membres": ["membre", "bras", "jambe", "genou", "epaule"],
                "rachis": ["rachis", "colonne", "vertebral", "lombaire"]
            },

            # Procedure types
            "procedures": {
                "arteriel": ["arteriel", "artere", "vasculaire"],
                "veineux": ["veineux", "veine", "phlebo"],
                "fonctionnel": ["fonctionnel", "dynamique", "stress"],
                "contraste": ["contraste", "injection", "gadolinium"]
            },

            # Medical centers (adapt to your own context)
            "centres": {
                "roseraie": ["roseraie", "rose"],
                "4villes": ["4villes", "quatre"],
                "mstruk": ["mstruk", "struktur"],
                "radioroseraie": ["radioroseraie"]
            }
        }

    def _initialize_gpt(self):
        """Initializes GPT for content analysis"""
        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            logging.warning("OPENAI_API_KEY non définie. L'analyse GPT ne sera pas disponible.")
            return

        try:
            self.llm = ChatOpenAI(
                model="gpt-4o",
                temperature=0,
                max_tokens=4000,
                api_key=api_key
            )

            # Prompt used to analyze the transcription content
            content_prompt = ChatPromptTemplate.from_messages([
                ("system", """Vous êtes un expert en analyse de transcriptions médicales. Analysez la transcription fournie et retournez UNIQUEMENT un JSON valide.

Votre tâche est de :

1. **Identifier le type de document précis** :
   - "compte_rendu_imagerie" : IRM, scanner, échographie, radiologie
   - "rapport_biologique" : analyses de laboratoire, résultats biologiques
   - "lettre_medicale" : correspondance entre médecins, lettres de sortie
   - "compte_rendu_consultation" : consultation médicale, examen clinique
   - "rapport_operatoire" : comptes-rendus d'intervention chirurgicale
   - "autre" : si aucun type ne correspond clairement

2. **Extraire les informations d'identification** :
   - Médecin/praticien (nom complet si trouvé)
   - Centre médical/hôpital/clinique
   - Service médical
   - Adresse et contacts si mentionnés

3. **Décomposer en sections structurées** :
   - Identifier toutes les sections présentes (Technique, Résultats, Conclusion, etc.)
   - Extraire le contenu complet de chaque section
   - Identifier les sections manquantes mais attendues pour ce type de document

4. **Extraire les données médicales spécifiques** :
   - Examens/procédures réalisés
   - Mesures et valeurs numériques
   - Diagnostics et observations
   - Traitements ou recommandations
   - Dates et références

5. **Évaluer la complétude** :
   - Score de complétude (0-1)
   - Informations manquantes importantes
   - Qualité de la transcription

Retournez un JSON avec cette structure exacte :
{{
    "document_type": "type identifié",
    "identification": {{
        "physician": "nom complet du médecin ou 'Non identifié'",
        "center": "nom du centre médical ou 'Non identifié'",
        "service": "service médical ou 'Non identifié'",
        "address": "adresse complète si trouvée",
        "phone": "numéro de téléphone si trouvé"
    }},
    "sections": {{
        "nom_section": {{
            "content": "contenu complet de la section",
            "confidence": 0.9,
            "keywords": ["mots", "clés", "identifiés"]
        }}
    }},
    "medical_data": {{
        "procedures": ["liste des procédures/examens"],
        "measurements": ["mesures avec valeurs numériques"],
        "diagnoses": ["diagnostics identifiés"],
        "treatments": ["traitements mentionnés"],
        "dates": ["dates importantes trouvées"],
        "anatomical_regions": ["régions anatomiques concernées"]
    }},
    "completeness": {{
        "score": 0.85,
        "missing_sections": ["sections manquantes attendues"],
        "missing_info": ["informations importantes manquantes"],
        "transcription_quality": "excellent|good|fair|poor"
    }},
    "key_indicators": ["indicateurs clés pour le matching"]
}}"""),
                ("human", "Analysez cette transcription médicale :\n\n{transcription}")
            ])

            # Prompt used to extract the content of one specific section
            section_prompt = ChatPromptTemplate.from_messages([
                ("system", """Vous êtes un expert en extraction d'informations médicales.

On vous donne :
1. Une transcription médicale complète
2. Le nom d'une section spécifique à remplir dans un template
3. La description de ce qui est attendu dans cette section

Votre tâche est d'extraire UNIQUEMENT le contenu pertinent de la transcription pour remplir cette section du template.

Retournez UNIQUEMENT un JSON avec cette structure :
{{
    "extracted_content": "contenu extrait pertinent pour cette section",
    "confidence": 0.85,
    "can_fill": true/false,
    "missing_elements": ["éléments manquants pour compléter la section"],
    "source_indicators": ["mots/phrases de la transcription qui justifient l'extraction"]
}}

Si aucun contenu pertinent n'est trouvé, retournez can_fill: false."""),
                ("human", """Transcription complète :
{transcription}

Section à remplir : {section_name}
Description attendue : {section_description}

Extrayez le contenu pertinent :""")
            ])

            # New prompt used to analyze file names
            filename_prompt = ChatPromptTemplate.from_messages([
                ("system", """Vous êtes un expert en analyse de noms de fichiers médicaux. Analysez le nom de fichier fourni et extrayez les informations médicales qu'il contient.

Retournez UNIQUEMENT un JSON avec cette structure :
{{
    "medical_keywords": ["mots-clés médicaux identifiés"],
    "document_type_indicators": ["indicateurs du type de document"],
    "specialty_indicators": ["indicateurs de spécialité médicale"],
    "center_indicators": ["indicateurs de centre médical"],
    "anatomical_regions": ["régions anatomiques mentionnées"],
    "procedure_type": "type de procédure principal ou null",
    "confidence_score": 0.85
}}

Exemples d'analyse :
- "ECHOGRAPHIE" → document_type_indicators: ["echographie"]
- "ECHODOPPLER" → procedure_type: "echo-doppler"
- "ARTERIEL" → medical_keywords: ["arteriel"]
- "MEMBRES.SUPERIEURS" → anatomical_regions: ["membres supérieurs"]
- "radioroseraie" → center_indicators: ["roseraie"], specialty_indicators: ["radiologie"]"""),
                ("human", "Analysez ce nom de fichier médical : {filename}")
            ])

            self.content_analyzer = content_prompt | self.llm
            self.section_extractor = section_prompt | self.llm
            self.filename_analyzer = filename_prompt | self.llm
            logging.info("✅ GPT initialisé pour l'analyse intelligente avec noms de fichiers")

        except Exception as e:
            logging.error(f"❌ Erreur lors de l'initialisation GPT: {e}")
            self.llm = None

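    # Hedged sketch of how these LCEL pipelines ("prompt | llm") are invoked and
    # how the JSON envelope is parsed. The variable names are assumptions made
    # for illustration; the real calls live in analyze_filename() and
    # analyze_transcription_detailed() below.
    #
    #   response = self.filename_analyzer.invoke({"filename": "ECHODOPPLER_ARTERIEL.doc"})
    #   raw = response.content.strip()
    #   if raw.startswith("```json"):   # the model may wrap its JSON in a fence
    #       raw = raw[7:]
    #   if raw.endswith("```"):
    #       raw = raw[:-3]
    #   data = json.loads(raw)
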
    def analyze_filename(self, filename: str) -> FilenameAnalysis:
        """Analyzes the file name to extract medical information"""

        # Clean up the file name
        clean_filename = os.path.basename(filename)
        clean_filename = clean_filename.replace('.docx', '').replace('.doc', '').replace('.rtf', '')

        # Analyze with GPT when available
        if self.filename_analyzer:
            try:
                response = self.filename_analyzer.invoke({"filename": clean_filename})
                result = response.content.strip()

                if result.startswith("```json"):
                    result = result[7:]
                if result.endswith("```"):
                    result = result[:-3]

                gpt_analysis = json.loads(result)

                return FilenameAnalysis(
                    original_filename=filename,
                    medical_keywords=gpt_analysis.get("medical_keywords", []),
                    document_type_indicators=gpt_analysis.get("document_type_indicators", []),
                    specialty_indicators=gpt_analysis.get("specialty_indicators", []),
                    center_indicators=gpt_analysis.get("center_indicators", []),
                    anatomical_regions=gpt_analysis.get("anatomical_regions", []),
                    procedure_type=gpt_analysis.get("procedure_type"),
                    confidence_score=gpt_analysis.get("confidence_score", 0.0)
                )

            except Exception as e:
                logging.warning(f"Erreur analyse GPT du nom de fichier: {e}")

        # Fallback analysis
        return self._analyze_filename_fallback(filename)

    def _analyze_filename_fallback(self, filename: str) -> FilenameAnalysis:
        """Keyword-based fallback analysis of file names, used when GPT is unavailable"""
        clean_filename = os.path.basename(filename).lower()
        clean_filename = clean_filename.replace('.docx', '').replace('.doc', '').replace('.rtf', '')

        medical_keywords = []
        document_type_indicators = []
        specialty_indicators = []
        center_indicators = []
        anatomical_regions = []
        procedure_type = None

        # Look up the keywords of each category
        for category, subcategories in self.filename_keywords.items():
            for subcat, keywords in subcategories.items():
                for keyword in keywords:
                    if keyword in clean_filename:
                        if category == "imagerie":
                            document_type_indicators.append(subcat)
                            if subcat in ["echographie", "irm", "scanner"]:
                                procedure_type = subcat
                        elif category == "specialites":
                            specialty_indicators.append(subcat)
                        elif category == "anatomie":
                            anatomical_regions.append(subcat)
                        elif category == "centres":
                            center_indicators.append(subcat)
                        medical_keywords.append(keyword)

        # Look for specific patterns
        patterns = {
            "doppler": r"doppler|echo.*doppler",
            "arteriel": r"arteriel|artere",
            "veineux": r"veineux|veine",
            "membres_superieurs": r"membre.*superieur|bras",
            "membres_inferieurs": r"membre.*inferieur|jambe",
            "pelvien": r"pelvi|utérus|ovaire",
            "radiologie": r"radio"
        }

        for pattern_name, pattern in patterns.items():
            if re.search(pattern, clean_filename):
                if pattern_name == "doppler":
                    procedure_type = "echo-doppler"
                elif pattern_name in ["arteriel", "veineux"]:
                    medical_keywords.append(pattern_name)
                elif "membre" in pattern_name:
                    anatomical_regions.append(pattern_name.replace("_", " "))
                elif pattern_name == "pelvien":
                    anatomical_regions.append("pelvis")
                elif pattern_name == "radiologie":
                    specialty_indicators.append("radiologie")

        # Confidence score based on how many elements were found
        total_elements = len(medical_keywords) + len(document_type_indicators) + len(specialty_indicators)
        confidence_score = min(1.0, total_elements / 5.0)  # Normalized over 5 elements max

        return FilenameAnalysis(
            original_filename=filename,
            medical_keywords=medical_keywords,
            document_type_indicators=document_type_indicators,
            specialty_indicators=specialty_indicators,
            center_indicators=center_indicators,
            anatomical_regions=anatomical_regions,
            procedure_type=procedure_type,
            confidence_score=confidence_score
        )

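    # Hedged example of the fallback path; the expected values follow directly
    # from the keyword tables and regex patterns above (documentation sketch,
    # not a test, and the file name is hypothetical):
    #
    #   matcher = SmartTranscriptionMatcher()
    #   fa = matcher._analyze_filename_fallback("ECHODOPPLER.ARTERIEL.MEMBRES.SUPERIEURS.doc")
    #   # fa.document_type_indicators contains "echographie" (via "echo"/"doppler")
    #   # fa.procedure_type == "echo-doppler"        (regex pattern "doppler")
    #   # fa.medical_keywords contains "arteriel"
    #   # fa.anatomical_regions contains "membres superieurs"
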
    def calculate_filename_match_score(self, transcription_filename: str, transcription_analysis: Dict,
                                       template_filename: str) -> Tuple[float, List[str]]:
        """Computes the match score based on the two file names"""

        # Analyze both file names
        trans_filename_analysis = self.analyze_filename(transcription_filename)
        template_filename_analysis = self.analyze_filename(template_filename)

        score_components = []
        matching_indicators = []

        # 1. Document type overlap
        trans_types = set(trans_filename_analysis.document_type_indicators)
        template_types = set(template_filename_analysis.document_type_indicators)

        if trans_types & template_types:
            type_match_score = len(trans_types & template_types) / max(len(trans_types | template_types), 1)
            score_components.append(type_match_score * 0.4)  # Heaviest weight
            matching_indicators.extend(list(trans_types & template_types))

        # 2. Specialty overlap
        trans_specialties = set(trans_filename_analysis.specialty_indicators)
        template_specialties = set(template_filename_analysis.specialty_indicators)

        if trans_specialties & template_specialties:
            specialty_match_score = len(trans_specialties & template_specialties) / max(len(trans_specialties | template_specialties), 1)
            score_components.append(specialty_match_score * 0.25)
            matching_indicators.extend(list(trans_specialties & template_specialties))

        # 3. Anatomical region overlap
        trans_anatomy = set(trans_filename_analysis.anatomical_regions)
        template_anatomy = set(template_filename_analysis.anatomical_regions)

        if trans_anatomy & template_anatomy:
            anatomy_match_score = len(trans_anatomy & template_anatomy) / max(len(trans_anatomy | template_anatomy), 1)
            score_components.append(anatomy_match_score * 0.2)
            matching_indicators.extend(list(trans_anatomy & template_anatomy))

        # 4. Medical center overlap
        trans_centers = set(trans_filename_analysis.center_indicators)
        template_centers = set(template_filename_analysis.center_indicators)

        if trans_centers & template_centers:
            center_match_score = len(trans_centers & template_centers) / max(len(trans_centers | template_centers), 1)
            score_components.append(center_match_score * 0.1)
            matching_indicators.extend(list(trans_centers & template_centers))

        # 5. Procedure type match
        if (trans_filename_analysis.procedure_type and
            template_filename_analysis.procedure_type and
            trans_filename_analysis.procedure_type == template_filename_analysis.procedure_type):
            score_components.append(0.05)
            matching_indicators.append(f"procédure: {trans_filename_analysis.procedure_type}")

        # 6. Bonus for shared general keywords
        trans_keywords = set(trans_filename_analysis.medical_keywords)
        template_keywords = set(template_filename_analysis.medical_keywords)

        common_keywords = trans_keywords & template_keywords
        if common_keywords:
            keyword_bonus = min(0.1, len(common_keywords) * 0.02)
            score_components.append(keyword_bonus)
            matching_indicators.extend(list(common_keywords))

        # Final score
        final_score = sum(score_components)

        # Bonus when the transcription file name mentions "radiologie" and it is consistent
        if ("radiologie" in transcription_filename.lower() and
            any("radio" in indicator for indicator in matching_indicators)):
            final_score += 0.05
            matching_indicators.append("cohérence radiologie")

        return min(1.0, final_score), matching_indicators

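    # Worked example of the weighting above (hypothetical analyses, Jaccard
    # overlap on each indicator set): if both names share {"echographie"} as the
    # only document type (overlap 1/1 -> contributes 0.4), share {"radiologie"}
    # as specialty (1/1 -> 0.25), share no anatomy or center, match on
    # procedure_type (+0.05) and have two common keywords (+0.04), then
    #   final_score = 0.4 + 0.25 + 0.05 + 0.04 = 0.74
    # before the optional "cohérence radiologie" bonus of +0.05, capped at 1.0.
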
+
def load_database(self, filepath: str):
|
| 460 |
+
"""Charge la base de données vectorielle"""
|
| 461 |
+
self.parser.load_database(filepath)
|
| 462 |
+
logging.info(f"✅ Base de données chargée: {len(self.parser.templates)} templates")
|
| 463 |
+
|
| 464 |
+
def analyze_transcription_detailed(self, transcription: str, transcription_filename: str = "") -> Dict:
|
| 465 |
+
"""Analyse détaillée d'une transcription avec GPT, en incluant le nom de fichier"""
|
| 466 |
+
if not self.content_analyzer:
|
| 467 |
+
return self._fallback_analysis(transcription, transcription_filename)
|
| 468 |
+
|
| 469 |
+
try:
|
| 470 |
+
logging.info("🔍 Analyse détaillée de la transcription...")
|
| 471 |
+
|
| 472 |
+
# Inclure l'analyse du nom de fichier dans le contexte
|
| 473 |
+
enhanced_transcription = transcription
|
| 474 |
+
if transcription_filename:
|
| 475 |
+
enhanced_transcription = f"Nom de fichier: {transcription_filename}\n\nContenu:\n{transcription}"
|
| 476 |
+
|
| 477 |
+
response = self.content_analyzer.invoke({"transcription": enhanced_transcription})
|
| 478 |
+
result = response.content.strip()
|
| 479 |
+
|
| 480 |
+
# Nettoyer la réponse
|
| 481 |
+
if result.startswith("```json"):
|
| 482 |
+
result = result[7:]
|
| 483 |
+
if result.endswith("```"):
|
| 484 |
+
result = result[:-3]
|
| 485 |
+
result = result.strip()
|
| 486 |
+
|
| 487 |
+
analysis = json.loads(result)
|
| 488 |
+
|
| 489 |
+
# Ajouter l'analyse du nom de fichier
|
| 490 |
+
if transcription_filename:
|
| 491 |
+
filename_analysis = self.analyze_filename(transcription_filename)
|
| 492 |
+
analysis["filename_analysis"] = {
|
| 493 |
+
"medical_keywords": filename_analysis.medical_keywords,
|
| 494 |
+
"document_type_indicators": filename_analysis.document_type_indicators,
|
| 495 |
+
"specialty_indicators": filename_analysis.specialty_indicators,
|
| 496 |
+
"anatomical_regions": filename_analysis.anatomical_regions,
|
| 497 |
+
"procedure_type": filename_analysis.procedure_type
|
| 498 |
+
}
|
| 499 |
+
|
| 500 |
+
logging.info("✅ Analyse détaillée terminée")
|
| 501 |
+
return analysis
|
| 502 |
+
|
| 503 |
+
except Exception as e:
|
| 504 |
+
logging.error(f"❌ Erreur analyse détaillée: {e}")
|
| 505 |
+
return self._fallback_analysis(transcription, transcription_filename)
|
| 506 |
+
|
| 507 |
+
def _fallback_analysis(self, transcription: str, transcription_filename: str = "") -> Dict:
|
| 508 |
+
"""Analyse de fallback sans GPT"""
|
| 509 |
+
text_lower = transcription.lower()
|
| 510 |
+
|
| 511 |
+
# Détecter le type de document
|
| 512 |
+
document_types = {
|
| 513 |
+
"compte_rendu_imagerie": ["irm", "scanner", "échographie", "radiologie", "t1", "t2", "doppler"],
|
| 514 |
+
"rapport_biologique": ["laboratoire", "analyse", "biologie", "sang", "urine", "sérum"],
|
| 515 |
+
"lettre_medicale": ["lettre", "courrier", "correspondance", "cher confrère"],
|
| 516 |
+
"compte_rendu_consultation": ["consultation", "examen clinique", "patient", "antécédents"]
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
detected_type = "autre"
|
| 520 |
+
|
| 521 |
+
# Vérifier d'abord dans le nom de fichier
|
| 522 |
+
if transcription_filename:
|
| 523 |
+
filename_lower = transcription_filename.lower()
|
| 524 |
+
for doc_type, keywords in document_types.items():
|
| 525 |
+
if sum(1 for kw in keywords if kw in filename_lower) >= 1:
|
| 526 |
+
detected_type = doc_type
|
| 527 |
+
break
|
| 528 |
+
|
| 529 |
+
# Sinon vérifier dans le contenu
|
| 530 |
+
if detected_type == "autre":
|
| 531 |
+
for doc_type, keywords in document_types.items():
|
| 532 |
+
if sum(1 for kw in keywords if kw in text_lower) >= 2:
|
| 533 |
+
detected_type = doc_type
|
| 534 |
+
break
|
| 535 |
+
|
| 536 |
+
# Extraire les sections basiques
|
| 537 |
+
sections = {}
|
| 538 |
+
section_patterns = {
|
| 539 |
+
"technique": ["technique", "méthode", "protocole"],
|
| 540 |
+
"résultats": ["résultat", "observation", "constatation"],
|
| 541 |
+
"conclusion": ["conclusion", "diagnostic", "synthèse"]
|
| 542 |
+
}
|
| 543 |
+
|
| 544 |
+
for section, keywords in section_patterns.items():
|
| 545 |
+
for keyword in keywords:
|
| 546 |
+
if keyword in text_lower:
|
| 547 |
+
start = text_lower.find(keyword)
|
| 548 |
+
end = min(len(transcription), start + 500)
|
| 549 |
+
content = transcription[start:end]
|
| 550 |
+
sections[section] = {
|
| 551 |
+
"content": content,
|
| 552 |
+
"confidence": 0.6,
|
| 553 |
+
"keywords": [keyword]
|
| 554 |
+
}
|
| 555 |
+
break
|
| 556 |
+
|
| 557 |
+
analysis = {
|
| 558 |
+
"document_type": detected_type,
|
| 559 |
+
"identification": {
|
| 560 |
+
"physician": "Non identifié",
|
| 561 |
+
"center": "Non identifié",
|
| 562 |
+
"service": "Non identifié"
|
| 563 |
+
},
|
| 564 |
+
"sections": sections,
|
| 565 |
+
"medical_data": {
|
| 566 |
+
"procedures": [],
|
| 567 |
+
"measurements": re.findall(r'\d+\s*(?:mm|cm|ml)', transcription),
|
| 568 |
+
"diagnoses": [],
|
| 569 |
+
"treatments": []
|
| 570 |
+
},
|
| 571 |
+
"completeness": {
|
| 572 |
+
"score": 0.6,
|
| 573 |
+
"transcription_quality": "fair"
|
| 574 |
+
}
|
| 575 |
+
}
|
| 576 |
+
|
| 577 |
+
# Ajouter l'analyse du nom de fichier en fallback
|
| 578 |
+
if transcription_filename:
|
| 579 |
+
filename_analysis = self.analyze_filename(transcription_filename)
|
| 580 |
+
analysis["filename_analysis"] = {
|
| 581 |
+
"medical_keywords": filename_analysis.medical_keywords,
|
| 582 |
+
"document_type_indicators": filename_analysis.document_type_indicators,
|
| 583 |
+
"specialty_indicators": filename_analysis.specialty_indicators,
|
| 584 |
+
"anatomical_regions": filename_analysis.anatomical_regions,
|
| 585 |
+
"procedure_type": filename_analysis.procedure_type
|
| 586 |
+
}
|
| 587 |
+
|
| 588 |
+
return analysis
|
| 589 |
+
|
    def calculate_type_match_score(self, transcription_analysis: Dict, template_info: TemplateInfo) -> float:
        """Computes the document type match score"""
        transcription_type = transcription_analysis.get("document_type", "")
        template_type = template_info.type.lower()

        # Type mapping
        type_mappings = {
            "compte_rendu_imagerie": ["irm", "scanner", "échographie", "imagerie", "radiologie"],
            "rapport_biologique": ["laboratoire", "biologie", "analyse"],
            "lettre_medicale": ["lettre", "courrier", "correspondance"],
            "compte_rendu_consultation": ["consultation", "examen"]
        }

        if transcription_type in type_mappings:
            expected_keywords = type_mappings[transcription_type]
            matches = sum(1 for kw in expected_keywords if kw in template_type)
            return min(1.0, matches / len(expected_keywords) * 2)

        return 0.3

    def calculate_physician_match_score(self, transcription_analysis: Dict, template_info: TemplateInfo) -> float:
        """Computes the physician match score"""
        transcription_physician = transcription_analysis.get("identification", {}).get("physician", "")
        template_physician = template_info.medecin

        if not transcription_physician or transcription_physician == "Non identifié":
            return 0.5

        if not template_physician:
            return 0.5

        # Compare the names word by word
        trans_words = set(transcription_physician.lower().split())
        temp_words = set(template_physician.lower().split())

        if trans_words & temp_words:
            return 1.0

        return 0.0

    def calculate_center_match_score(self, transcription_analysis: Dict, template_info: TemplateInfo) -> float:
        """Computes the medical center match score"""
        transcription_center = transcription_analysis.get("identification", {}).get("center", "")
        template_center = getattr(template_info, 'centre_medical', '') or getattr(template_info, 'center', '')

        if not transcription_center or transcription_center == "Non identifié":
            return 0.5

        if not template_center:
            return 0.5

        # Substring comparison in both directions
        if transcription_center.lower() in template_center.lower() or template_center.lower() in transcription_center.lower():
            return 1.0

        return 0.0

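    # Note on the neutral 0.5 returned above: when either side is unknown, the
    # score neither rewards nor penalizes the template. A hedged sketch of the
    # word-overlap rule (names are hypothetical):
    #
    #   "Dr Anne MAUBERTON" vs "MAUBERTON A."  -> overlap {"mauberton"} -> 1.0
    #   "Dr Anne MAUBERTON" vs "Dr DUPONT"     -> overlap {"dr"}        -> also 1.0,
    #   so a stricter variant could drop titles such as "dr" before comparing.
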
    def calculate_section_matches(self, transcription: str, transcription_analysis: Dict, template_info: TemplateInfo) -> Dict[str, SectionMatch]:
        """Computes the match for each section of the template"""
        section_matches = {}
        transcription_sections = transcription_analysis.get("sections", {})

        for section_name in template_info.detected_sections:
            section_match = self._match_single_section(
                section_name,
                transcription,
                transcription_sections,
                template_info
            )
            section_matches[section_name] = section_match

        return section_matches

    def _match_single_section(self, section_name: str, transcription: str,
                              transcription_sections: Dict, template_info: TemplateInfo) -> SectionMatch:
        """Analyzes the matching of one specific section"""
        section_lower = section_name.lower()

        # Look for a corresponding section in the analysis
        best_match_content = ""
        best_confidence = 0.0

        for analyzed_section, section_data in transcription_sections.items():
            if isinstance(section_data, dict):
                content = section_data.get("content", "")
                confidence = section_data.get("confidence", 0.0)

                # Check the keyword overlap between the section names
                section_keywords = section_lower.split()
                analyzed_keywords = analyzed_section.lower().split()

                keyword_match = len(set(section_keywords) & set(analyzed_keywords)) / max(len(section_keywords), 1)

                if keyword_match > 0.3 and confidence > best_confidence:
                    best_match_content = content
                    best_confidence = confidence * keyword_match

        # If GPT is available and nothing matched, use the specialized extraction
        if self.section_extractor and not best_match_content:
            try:
                section_description = f"Section {section_name} d'un document médical"
                response = self.section_extractor.invoke({
                    "transcription": transcription,
                    "section_name": section_name,
                    "section_description": section_description
                })

                result = response.content.strip()
                if result.startswith("```json"):
                    result = result[7:]
                if result.endswith("```"):
                    result = result[:-3]

                extraction_result = json.loads(result)
                if extraction_result.get("can_fill", False):
                    best_match_content = extraction_result.get("extracted_content", "")
                    best_confidence = extraction_result.get("confidence", 0.0)

            except Exception as e:
                logging.warning(f"Erreur extraction section {section_name}: {e}")

        # Decide whether the section can be filled
        can_fill = bool(best_match_content) and len(best_match_content.strip()) > 10
        missing_info = [] if can_fill else [f"Contenu manquant pour {section_name}"]

        return SectionMatch(
            section_name=section_name,
            confidence=best_confidence,
            extracted_content=best_match_content,
            can_fill=can_fill,
            missing_info=missing_info
        )

    def calculate_fillability_score(self, section_matches: Dict[str, SectionMatch], template_info: TemplateInfo) -> Tuple[float, float, List[str]]:
        """Computes how much of the template can be filled"""
        total_sections = len(template_info.detected_sections)
        fillable_sections = sum(1 for match in section_matches.values() if match.can_fill)
        critical_sections = sum(1 for match in section_matches.values() if match.can_fill and match.confidence > 0.7)

        if total_sections == 0:
            return 0.0, 0.0, ["Template sans sections"]

        fillability_score = fillable_sections / total_sections
        filling_percentage = (critical_sections / total_sections) * 100

        missing_critical = [
            match.section_name for match in section_matches.values()
            if not match.can_fill
        ]

        return fillability_score, filling_percentage, missing_critical

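    # Worked example (hypothetical counts): a template with 4 detected sections
    # where 3 can be filled and 2 of those have confidence > 0.7 yields
    #   fillability_score  = 3 / 4       = 0.75
    #   filling_percentage = 2 / 4 * 100 = 50.0%
    # and missing_critical lists the single unfillable section.
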
    def smart_match_transcription(self, transcription: str, transcription_filename: str = "", k: int = 10) -> List[TemplateMatch]:
        """Intelligent matching between a transcription and the templates, including file name analysis"""
        if not self.parser.templates:
            logging.error("Aucun template chargé")
            return []

        logging.info("Analyse intelligente de la transcription...")

        # 1. Analyze the transcription in detail (including its file name)
        analysis = self.analyze_transcription_detailed(transcription, transcription_filename)

        # 2. Pre-filter the templates by type and file name
        candidate_templates = self._filter_templates_by_type_and_filename(analysis, transcription_filename)

        if not candidate_templates:
            logging.warning("Aucun template candidat trouvé, utilisation de tous les templates")
            candidate_templates = list(self.parser.templates.keys())

        logging.info(f"{len(candidate_templates)} templates candidats retenus")

        # 3. Score each candidate template
        template_matches = []

        for template_id in candidate_templates:
            template_info = self.parser.get_template_info(template_id)
            if not template_info:
                continue

            # Compute the individual match scores
            type_score = self.calculate_type_match_score(analysis, template_info)
            physician_score = self.calculate_physician_match_score(analysis, template_info)
            center_score = self.calculate_center_match_score(analysis, template_info)

            # New score based on the file names
            filename_score, filename_indicators = self.calculate_filename_match_score(
                transcription_filename, analysis, template_info.filepath
            )

            # Analyze the section correspondences
            section_matches = self.calculate_section_matches(transcription, analysis, template_info)

            # Compute the fillability score
            fillability_score, filling_percentage, missing_critical = self.calculate_fillability_score(section_matches, template_info)

            # Compute the (vector) content score
            content_score = self._calculate_content_similarity(transcription, template_id)

            # Weighted overall score, UPDATED to include the filename score
            overall_score = (
                type_score * 0.25 +         # Document type
                fillability_score * 0.3 +   # Fillability
                filename_score * 0.2 +      # NEW: file name score
                content_score * 0.15 +      # Content similarity
                physician_score * 0.05 +    # Physician
                center_score * 0.05         # Medical center
            )

            # Determine the confidence level
            confidence_level = self._determine_confidence_level(overall_score, fillability_score, analysis)

            # Extract the data used to fill the template
            extracted_data = self._extract_template_data(section_matches)

            template_match = TemplateMatch(
                template_id=template_id,
                template_info=template_info,
                overall_score=overall_score,
                type_match_score=type_score,
                physician_match_score=physician_score,
                center_match_score=center_score,
                content_match_score=content_score,
                filename_match_score=filename_score,  # New
                fillability_score=fillability_score,
                section_matches=section_matches,
                confidence_level=confidence_level,
                can_be_filled=fillability_score > 0.6,
                filling_percentage=filling_percentage,
                missing_critical_info=missing_critical,
                extracted_data=extracted_data,
                filename_indicators=filename_indicators  # New
            )

            template_matches.append(template_match)

        # 4. Sort by overall score
        template_matches.sort(key=lambda x: x.overall_score, reverse=True)

        logging.info(f"{len(template_matches)} templates analysés")
        return template_matches[:k]

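    # Worked example of the weighted sum above (illustrative component scores):
    # with type 0.8, fillability 0.75, filename 0.74, content 0.6, physician 0.5
    # and center 0.5, the weights (0.25 + 0.3 + 0.2 + 0.15 + 0.05 + 0.05 = 1.0)
    # give
    #   overall = 0.8*0.25 + 0.75*0.3 + 0.74*0.2 + 0.6*0.15 + 0.5*0.05 + 0.5*0.05
    #           = 0.20 + 0.225 + 0.148 + 0.09 + 0.025 + 0.025 = 0.713
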
    def _filter_templates_by_type_and_filename(self, analysis: Dict, transcription_filename: str) -> List[str]:
        """Filters the templates by document type and file name"""
        document_type = analysis.get("document_type", "")
        filename_analysis = analysis.get("filename_analysis", {})

        # Broadened filtering criteria
        filter_keywords = set()

        # Add the document type keywords
        if document_type != "autre":
            type_keywords = {
                "compte_rendu_imagerie": ["irm", "scanner", "echo", "radio", "imagerie"],
                "rapport_biologique": ["labo", "biologie", "analyse", "sang"],
                "lettre_medicale": ["lettre", "courrier"],
                "compte_rendu_consultation": ["consultation", "examen", "clinique"]
            }
            filter_keywords.update(type_keywords.get(document_type, []))

        # Add the file name keywords
        if filename_analysis:
            filter_keywords.update(filename_analysis.get("medical_keywords", []))
            filter_keywords.update(filename_analysis.get("document_type_indicators", []))
            filter_keywords.update(filename_analysis.get("specialty_indicators", []))

        # Without specific criteria, keep every template
        if not filter_keywords:
            return list(self.parser.templates.keys())

        # Filter the templates
        matching_templates = []

        for template_id, template_info in self.parser.templates.items():
            template_filepath_lower = template_info.filepath.lower()
            template_type_lower = template_info.type.lower()

            # Check the matches in the template file name and type
            filename_matches = sum(1 for keyword in filter_keywords if keyword in template_filepath_lower)
            type_matches = sum(1 for keyword in filter_keywords if keyword in template_type_lower)

            # Keep the template as soon as there is at least one match
            if filename_matches > 0 or type_matches > 0:
                matching_templates.append(template_id)

        return matching_templates if matching_templates else list(self.parser.templates.keys())

    def _calculate_content_similarity(self, transcription: str, template_id: str) -> float:
        """Computes content similarity through the vector search index"""
        try:
            results = self.parser.search_similar_templates(transcription, k=50)
            for tid, score in results:
                if tid == template_id:
                    return score
            return 0.0
        except Exception as e:
            logging.warning(f"Erreur similarité vectorielle: {e}")
            return 0.0

    def _determine_confidence_level(self, overall_score: float, fillability_score: float, analysis: Dict) -> str:
        """Determines the overall confidence level"""
        transcription_quality = analysis.get("completeness", {}).get("transcription_quality", "fair")

        # Adjust for the transcription quality
        quality_modifier = {
            "excellent": 1.0,
            "good": 0.9,
            "fair": 0.8,
            "poor": 0.6
        }.get(transcription_quality, 0.8)

        adjusted_score = overall_score * quality_modifier

        if adjusted_score > 0.8 and fillability_score > 0.8:
            return "excellent"
        elif adjusted_score > 0.6 and fillability_score > 0.6:
            return "good"
        elif adjusted_score > 0.4 and fillability_score > 0.4:
            return "fair"
        else:
            return "poor"

    def _extract_template_data(self, section_matches: Dict[str, SectionMatch]) -> Dict[str, str]:
        """Extracts the data ready to fill the template"""
        extracted_data = {}

        for section_name, match in section_matches.items():
            if match.can_fill and match.extracted_content:
                # Clean and format the content
                content = match.extracted_content.strip()
                if content:
                    extracted_data[section_name] = content

        return extracted_data

    def print_smart_results(self, matches: List[TemplateMatch]):
        """Detailed display of the intelligent matching results"""
        if not matches:
            print("Aucun résultat trouvé")
            return

        print(f"\n{'='*100}")
        print(f"RÉSULTATS DE MATCHING INTELLIGENT - {len(matches)} templates analysés")
        print(f"{'='*100}")

        for i, match in enumerate(matches, 1):
            print(f"\nTEMPLATE #{i}")
            print(f"{'='*60}")
            print(f"ID: {match.template_id}")
            print(f"Score global: {match.overall_score:.3f}")
            print(f"Confiance: {match.confidence_level}")
            print(f"Template: {os.path.basename(match.template_info.filepath)}")
            print(f"Médecin: {match.template_info.medecin}")

            print(f"\nSCORES DÉTAILLÉS:")
            print(f"  • Type de document: {match.type_match_score:.3f}")
            print(f"  • Nom de fichier: {match.filename_match_score:.3f}")  # New
            print(f"  • Médecin: {match.physician_match_score:.3f}")
            print(f"  • Centre: {match.center_match_score:.3f}")
            print(f"  • Contenu: {match.content_match_score:.3f}")
            print(f"  • Remplissage: {match.fillability_score:.3f}")

            # Show the file name indicators
            if match.filename_indicators:
                print(f"\nINDICATEURS NOM DE FICHIER:")
                print(f"  • Correspondances: {', '.join(match.filename_indicators)}")

            print(f"\nCAPACITÉ DE REMPLISSAGE:")
            print(f"  • Peut être rempli: {'OUI' if match.can_be_filled else 'NON'}")
            print(f"  • Pourcentage: {match.filling_percentage:.1f}%")

            if match.section_matches:
                fillable = [s for s in match.section_matches.values() if s.can_fill]
                missing = [s for s in match.section_matches.values() if not s.can_fill]

                print(f"  • Sections remplissables: {len(fillable)}/{len(match.section_matches)}")

                if fillable:
                    print(f"  • Remplissables: {', '.join([s.section_name for s in fillable])}")

                if missing:
                    print(f"  • Manquantes: {', '.join([s.section_name for s in missing])}")

            if match.extracted_data:
                print(f"\nDONNÉES EXTRAITES:")
                for section, content in match.extracted_data.items():
                    preview = content[:100] + "..." if len(content) > 100 else content
                    print(f"  • {section}: {preview}")

            print(f"{'='*60}")

    def get_best_fillable_match(self, transcription: str, transcription_filename: str = "") -> Optional[TemplateMatch]:
        """Returns the best template that can actually be filled"""
        matches = self.smart_match_transcription(transcription, transcription_filename, k=10)

        # Keep only the templates that can be filled
        fillable_matches = [m for m in matches if m.can_be_filled and m.fillability_score > 0.6]

        return fillable_matches[0] if fillable_matches else None

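    # Minimal end-to-end sketch (assumes a database built by template_db_creation
    # exists at "medical_templates.pkl"; the file names are hypothetical):
    #
    #   matcher = SmartTranscriptionMatcher("medical_templates.pkl")
    #   with open("transcription.txt", encoding="utf-8") as f:
    #       text = f.read()
    #   best = matcher.get_best_fillable_match(text, "cr_irm_pelvienne_radiologie.doc")
    #   if best:
    #       print(best.template_id, f"{best.filling_percentage:.1f}%")
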
    def test_with_provided_example(self):
        """Tests the system with the example provided by the user"""

        # Sample transcription provided with the task
        transcription_filename = "default.73.931915433.rtf_3650535_radiologie.doc"
        transcription_content = """**Technique :** 3 plans T2, diffusion axiale, T2 grand champ et T1 Dixon.
**Résultats :**
* L'utérus est antéversé, antéfléchi, latéralisé à droite, de taille normale pour l'âge.
* L'endomètre est fin, mesurant moins de 2 mm.
* Pas d'adénomyose franche.
* Aspect normal du col utérin et du vagin.
* L'ovaire droit, en position postérieure, mesure 18 x 11 mm avec présence de 4 follicules.
* L'ovaire gauche, en position latéro-utérine, présente un volumineux endométriome de 45 mm, typique en hypersignal T1 Dixon.
* Deuxième endométriome accolé à l'ovaire droit, périphérique, mesurant 13 mm.
* Pas d'épaississement marqué du torus ni des ligaments utéro-sacrés.
* Pas d'autre localisation pelvienne.
* Pas d'épanchement pelvien.
* Pas d'anomalie de la vessie.
* Pas d'adénomégalie pelvienne, pas de dilatation des uretères.
**Conclusion :**
* Endométriome ovarien droit périphérique de 13 mm.
* Endométriome ovarien gauche centro-ovarien de 45 mm."""

        print("ANALYSE DE L'EXEMPLE FOURNI")
        print("="*80)
        print(f"Nom de fichier: {transcription_filename}")
        print(f"Contenu: {len(transcription_content.split())} mots")

        # Analyze the file name
        filename_analysis = self.analyze_filename(transcription_filename)
        print(f"\nANALYSE DU NOM DE FICHIER:")
        print(f"Mots-clés médicaux: {filename_analysis.medical_keywords}")
        print(f"Indicateurs de type: {filename_analysis.document_type_indicators}")
        print(f"Spécialités: {filename_analysis.specialty_indicators}")
        print(f"Centres: {filename_analysis.center_indicators}")
        print(f"Régions anatomiques: {filename_analysis.anatomical_regions}")
        print(f"Type de procédure: {filename_analysis.procedure_type}")
        print(f"Score de confiance: {filename_analysis.confidence_score:.3f}")

        # Run the matching
        print(f"\nMATCHING EN COURS...")
        results = self.smart_match_transcription(transcription_content, transcription_filename, k=5)

        # Display the results
        self.print_smart_results(results)

        # Display the best match
        best_match = self.get_best_fillable_match(transcription_content, transcription_filename)
        if best_match:
            print(f"\nMEILLEUR TEMPLATE REMPLISSABLE:")
            print(f"Template: {best_match.template_id}")
            print(f"Score global: {best_match.overall_score:.3f}")
            print(f"Score nom de fichier: {best_match.filename_match_score:.3f}")
            print(f"Indicateurs nom de fichier: {', '.join(best_match.filename_indicators)}")
            print(f"Capacité de remplissage: {best_match.filling_percentage:.1f}%")

def main():
    """Main entry point for testing the intelligent matching with file names"""

    # Ask for the database path
    db_path = input("Chemin vers la base de données (templates/medical_templates.pkl): ").strip()

    if not db_path:
        db_path = "medical_templates.pkl"

    if not os.path.exists(db_path):
        print(f"Fichier de base de données non trouvé: {db_path}")
        return

    print(f"\nInitialisation du système de matching intelligent...")

    # Initialize the matcher
    matcher = SmartTranscriptionMatcher(db_path)

    # Test options
    print(f"\nOPTIONS DE TEST:")
    print("1. Utiliser l'exemple fourni (radiologie)")
    print("2. Saisie manuelle")
    print("3. Lecture depuis fichier")

    choice = input("\nChoisissez une option (1-3): ").strip()

    if choice == "1":
        # Use the example provided by the user
        matcher.test_with_provided_example()
        return

    elif choice == "2":
        # Manual input
        transcription_filename = input("Nom du fichier de transcription: ").strip()
        print("\nEntrez votre transcription (tapez 'FIN' sur une ligne vide pour terminer):")
        lines = []
        while True:
            line = input()
            if line.strip() == 'FIN':
                break
            lines.append(line)
        transcription = '\n'.join(lines)

    elif choice == "3":
        # Read from a file
        filepath = input("Chemin vers le fichier de transcription: ").strip()
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                transcription = f.read()
            transcription_filename = os.path.basename(filepath)
            print(f"Fichier lu: {len(transcription.split())} mots")
        except Exception as e:
            print(f"Erreur de lecture: {e}")
            return

    else:
        print("Choix invalide")
        return

    if not transcription.strip():
        print("Aucune transcription fournie")
        return

    print(f"\nAnalyse intelligente en cours...")

    # Run the intelligent matching
    results = matcher.smart_match_transcription(transcription, transcription_filename, k=5)

    # Display the results
    matcher.print_smart_results(results)

    # Display the best fillable match
    best_fillable = matcher.get_best_fillable_match(transcription, transcription_filename)

    if best_fillable:
        print(f"\nMEILLEUR TEMPLATE REMPLISSABLE:")
        print(f"{'='*60}")
        print(f"Template: {best_fillable.template_id}")
        print(f"Score global: {best_fillable.overall_score:.3f}")
        print(f"Score nom de fichier: {best_fillable.filename_match_score:.3f}")
        print(f"Indicateurs fichier: {', '.join(best_fillable.filename_indicators)}")
        print(f"Capacité de remplissage: {best_fillable.filling_percentage:.1f}%")
        print(f"Confiance: {best_fillable.confidence_level}")

        if best_fillable.extracted_data:
            print(f"\nTEMPLATE PRÊT À REMPLIR:")
            print(f"Sections avec données extraites:")
            for section, content in best_fillable.extracted_data.items():
                print(f"\n[{section.upper()}]")
                print(f"{content}")

        # Offer to show more details
        show_details = input(f"\nAfficher les détails complets du template? (y/n): ").strip().lower()
        if show_details == 'y':
            matcher.parser.print_template_summary(best_fillable.template_id)

        # Offer to generate the filled template
        generate_filled = input(f"\nGénérer le template rempli? (y/n): ").strip().lower()
        if generate_filled == 'y':
            generate_filled_template(matcher, best_fillable, transcription)
    else:
        print(f"\nAucun template ne peut être suffisamment rempli avec cette transcription")

        if results:
            print(f"\nMeilleurs candidats (mais insuffisamment remplissables):")
            for i, result in enumerate(results[:3], 1):
                print(f"{i}. {result.template_id} - Score: {result.overall_score:.3f}")
                print(f"   Score fichier: {result.filename_match_score:.3f}")
                print(f"   Remplissage: {result.filling_percentage:.1f}%")

def generate_filled_template(matcher: SmartTranscriptionMatcher, best_match: TemplateMatch, transcription: str):
    """Generates a filled template from the extracted data"""

    print(f"\nGÉNÉRATION DU TEMPLATE REMPLI")
    print(f"{'='*80}")

    try:
        # Fetch the original template content
        template_info = best_match.template_info

        # Load the template file content
        if os.path.exists(template_info.filepath):
            with open(template_info.filepath, 'r', encoding='utf-8') as f:
                template_content = f.read()
        else:
            print(f"Fichier template non trouvé: {template_info.filepath}")
            return

        filled_content = template_content
        replacement_count = 0

        # Replace the sections with the extracted data
        for section_name, extracted_content in best_match.extracted_data.items():
            # Placeholder patterns to look for in the template
            patterns = [
                f"[{section_name.upper()}]",
                f"[{section_name}]",
                f"{{{section_name}}}",
                f"__{section_name}__",
                f"<!-- {section_name} -->",
                f"_{section_name}_",
            ]

            # Also look for patterns built from the section keywords
            section_keywords = section_name.lower().split()
            for keyword in section_keywords:
                patterns.extend([
                    f"[{keyword.upper()}]",
                    f"{{{keyword}}}",
                    f"__{keyword}__"
                ])

            # Try each pattern in turn
            for pattern in patterns:
                if pattern in filled_content:
                    filled_content = filled_content.replace(pattern, extracted_content)
                    replacement_count += 1
                    print(f"Section '{section_name}' remplie ({pattern})")
                    break
            else:
                # No pattern found: try to locate the section by similarity
                lines = filled_content.split('\n')
                for i, line in enumerate(lines):
                    if any(keyword in line.lower() for keyword in section_keywords):
                        # Insert the content after this line
                        lines.insert(i + 1, f"\n{extracted_content}\n")
                        filled_content = '\n'.join(lines)
                        replacement_count += 1
                        print(f"Section '{section_name}' insérée après ligne similaire")
                        break
                else:
                    print(f"Section '{section_name}' non intégrée - pattern non trouvé")

        # Save the filled template
        output_filename = f"template_rempli_{best_match.template_id}.txt"
        try:
            with open(output_filename, 'w', encoding='utf-8') as f:
                f.write(filled_content)
            print(f"\nTemplate rempli sauvegardé: {output_filename}")
        except Exception as e:
            print(f"Erreur lors de la sauvegarde: {e}")

        # Offer to show a preview
        show_preview = input(f"\nAfficher un aperçu du template rempli? (y/n): ").strip().lower()
        if show_preview == 'y':
            print(f"\n{'='*80}")
            print(f"APERÇU DU TEMPLATE REMPLI")
            print(f"{'='*80}")

            # Show the first 2000 characters
            preview = filled_content[:2000]
            if len(filled_content) > 2000:
                preview += "\n\n[... Tronqué pour l'aperçu ...]"

            print(preview)
            print(f"\n{'='*80}")

    except Exception as e:
        print(f"Erreur lors de la génération: {e}")
        logging.error(f"Erreur génération template: {e}")

+
def analyze_transcription_quality(transcription: str) -> Dict:
|
| 1248 |
+
"""Analyse rapide de la qualité d'une transcription"""
|
| 1249 |
+
|
| 1250 |
+
words = transcription.split()
|
| 1251 |
+
sentences = transcription.split('.')
|
| 1252 |
+
|
| 1253 |
+
# Métriques de qualité
|
| 1254 |
+
metrics = {
|
| 1255 |
+
"word_count": len(words),
|
| 1256 |
+
"sentence_count": len([s for s in sentences if s.strip()]),
|
| 1257 |
+
"avg_sentence_length": len(words) / max(len(sentences), 1),
|
| 1258 |
+
"has_medical_terms": bool(re.search(r'\b(mm|cm|ml|IRM|scanner|échographie|résultats?|conclusion)\b', transcription.lower())),
|
| 1259 |
+
"has_measurements": bool(re.search(r'\d+\s*(mm|cm|ml)', transcription)),
|
| 1260 |
+
"has_sections": bool(re.search(r'\b(technique|résultats?|conclusion|indication)\b', transcription.lower())),
|
| 1261 |
+
"structure_score": 0
|
| 1262 |
+
}
|
| 1263 |
+
|
| 1264 |
+
# Calculer un score de structure
|
| 1265 |
+
structure_indicators = ['technique', 'résultat', 'conclusion', 'indication', 'observation']
|
| 1266 |
+
structure_count = sum(1 for indicator in structure_indicators if indicator in transcription.lower())
|
| 1267 |
+
metrics["structure_score"] = min(1.0, structure_count / 3.0)
|
| 1268 |
+
|
| 1269 |
+
# Évaluation globale
|
| 1270 |
+
if (metrics["word_count"] > 100 and
|
| 1271 |
+
metrics["has_medical_terms"] and
|
| 1272 |
+
metrics["has_sections"] and
|
| 1273 |
+
metrics["structure_score"] > 0.5):
|
| 1274 |
+
quality = "excellent"
|
| 1275 |
+
elif (metrics["word_count"] > 50 and
|
| 1276 |
+
metrics["has_medical_terms"] and
|
| 1277 |
+
metrics["structure_score"] > 0.3):
|
| 1278 |
+
quality = "good"
|
| 1279 |
+
elif metrics["word_count"] > 20 and metrics["has_medical_terms"]:
|
| 1280 |
+
quality = "fair"
|
| 1281 |
+
else:
|
| 1282 |
+
quality = "poor"
|
| 1283 |
+
|
| 1284 |
+
metrics["overall_quality"] = quality
|
| 1285 |
+
return metrics
|
| 1286 |
+
|
| 1287 |
+
if __name__ == "__main__":
|
| 1288 |
+
main()
|
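A quick driver for analyze_transcription_quality, as a minimal sketch (the import assumes this module is save_matcher, matching the 1288-line file in this commit, and the sample text is invented):

# demo, illustrative only
from save_matcher import analyze_transcription_quality  # assumed module name

sample = ("IRM du genou. Technique. Séquences T2. Résultats. "
          "Épanchement de 12 mm. Conclusion. Aspect normal.")
metrics = analyze_transcription_quality(sample)
# word_count, has_medical_terms, has_sections and structure_score drive the rating
print(metrics["overall_quality"])  # one of: "excellent", "good", "fair", "poor"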
section_generator.py
ADDED
@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""
Section Generator
Handles dynamic section generation based on template sections
"""

from typing import List
from langchain.prompts import ChatPromptTemplate


def create_dynamic_section_prompt(template_sections: List[str]) -> ChatPromptTemplate:
    """Create a dynamic prompt based on the sections found in the template."""

    # Build section instructions dynamically
    section_instructions = []
    for section in template_sections:
        section_clean = section.strip().replace('\xa0', ' ').replace(':', '').strip()
        section_instructions.append(
            f"{section}:\n[Extract and organize content for {section_clean} section from the transcription, maintaining maximum fidelity to the original text]")

    sections_text = "\n\n".join(section_instructions)

    # Clean section names for display
    clean_section_names = [s.strip().replace('\xa0', ' ').replace(
        ':', '').strip() for s in template_sections]
    sections_list = ', '.join(clean_section_names)

    # Special handling for single-section templates
    if len(template_sections) == 1:
        single_section_instruction = f"""CRITICAL: This template has only ONE section: {sections_list}.
You MUST generate content for this section using ALL the information from the transcription.
Do not leave the section empty - extract and organize ALL relevant content from the transcription."""
    else:
        single_section_instruction = ""

    system_prompt = f"""You are a medical document organizer.
Your task is to organize the corrected medical transcription into the required sections while maintaining maximum fidelity to the original text.

You MUST fill ALL sections requested in the template: {sections_list}.
CRITICAL: Use the EXACT section names provided in template_sections (including any punctuation like ':') - DO NOT translate or change them.
DO NOT summarize, interpret, or add information not present in the transcription.
DO NOT use markdown formatting or add extra headers.

{single_section_instruction}

ORGANIZATION RULES:
- Extract relevant content from the transcription for each section
- Maintain the original wording and structure as much as possible
- Do not add medical interpretations or conclusions not present in the text
- Keep all measurements, observations, and findings exactly as stated
- Preserve the original medical terminology
- Use ONLY the exact section names from the template (e.g., 'Technique :', 'Résultat :', 'Conclusion :')
- If there is only one section, put ALL relevant content in that section

Format your response with clear section headers using the EXACT names from the template:

{sections_text}

IMPORTANT:
- Use the corrected transcription content to fill all sections
- Use the EXACT section names from the template - DO NOT translate or modify them
- Do not add markdown formatting or extra headers
- Maintain maximum fidelity to the original transcription content
- Do not summarize or interpret the medical information
- Keep all original medical terms and measurements exactly as they appear
- NEVER leave a section empty - always provide content based on the transcription"""

    human_prompt = """Organize the corrected medical transcription into the required sections:

Template sections: {template_sections}
Medical data: {medical_data}
Corrected transcription: {corrected_transcription}

Generate each section with the exact title from the template, using the corrected transcription content while maintaining maximum fidelity to the original text."""

    return ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", human_prompt)
    ])


def fix_section_names(content: str, template_sections: List[str]) -> str:
    """Post-process the generated content to ensure exact section names are used."""
    import re

    # If content is empty or very short, return the original content
    if not content.strip() or len(content.strip()) < 50:
        return content

    # If there is only one template section, put all content in that section
    if len(template_sections) == 1:
        return f"{template_sections[0]}\n{content.strip()}"

    sections = {}
    current_section = None
    current_content = []

    # Create a pattern to match any section header
    section_pattern = re.compile(r'^([A-Za-zÀ-ÿ\s]+:?)\s*$', re.IGNORECASE)

    for line in content.split('\n'):
        line = line.strip()
        if not line:
            continue

        # Check if this is a section header
        match = section_pattern.match(line)
        if match:
            section_name = match.group(1).strip()
            # Normalize section names for comparison
            section_normalized = section_name.lower().replace('é', 'e').replace(
                'è', 'e').replace('à', 'a').replace(':', '').strip()

            # Check if this section name is similar to any template section
            matched_template_section = None
            for template_section in template_sections:
                template_normalized = template_section.lower().replace('é', 'e').replace(
                    'è', 'e').replace('à', 'a').replace(':', '').replace('\xa0', ' ').strip()

                # Check if they are similar (case-insensitive and accent-insensitive)
                if (section_normalized in template_normalized or
                        template_normalized in section_normalized or
                        any(word in section_normalized for word in template_normalized.split())):
                    matched_template_section = template_section
                    break

            if matched_template_section:
                if current_section:
                    sections[current_section] = '\n'.join(
                        current_content).strip()
                current_section = matched_template_section  # Use exact template section name
                current_content = []
            else:
                # If no match found, treat as content
                if current_section:
                    current_content.append(line)
        elif current_section:
            current_content.append(line)

    # Add last section
    if current_section and current_content:
        sections[current_section] = '\n'.join(current_content).strip()

    # If no sections were found, put all content in the first template section
    if not sections and template_sections:
        sections[template_sections[0]] = content.strip()

    # Reconstruct the content with exact section names
    fixed_content = []
    for section_name, content in sections.items():
        fixed_content.append(f"{section_name}")
        if content:
            fixed_content.append(content)
        fixed_content.append("")

    return "\n".join(fixed_content)
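A minimal sketch of how the two helpers compose (the llm object and the commented chain invocation are assumptions, not part of this file):

# illustrative only; llm is assumed to be any LangChain chat model
from section_generator import create_dynamic_section_prompt, fix_section_names

template_sections = ["Technique :", "Résultat :", "Conclusion :"]
prompt = create_dynamic_section_prompt(template_sections)
# chain = prompt | llm
# raw = chain.invoke({
#     "template_sections": template_sections,
#     "medical_data": "...",
#     "corrected_transcription": "...",
# }).content
# clean = fix_section_names(raw, template_sections)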
sftp_agent.py
ADDED
@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
SFTP Model Downloader Agent
Handles downloading model files from SFTP server
"""

import os
import re
import glob
import pysftp
from typing import List, Dict
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import AgentExecutor, create_openai_tools_agent


@tool
def scan_transcription_files(transcriptions_dir: str = "transcriptions") -> List[Dict[str, str]]:
    """Scan the transcriptions directory and extract model identifiers from filenames."""
    if not os.path.exists(transcriptions_dir):
        raise FileNotFoundError(
            f"Transcriptions directory not found: {transcriptions_dir}")

    transcription_files = glob.glob(os.path.join(transcriptions_dir, "*.json"))
    model_identifiers = []

    for file_path in transcription_files:
        filename = os.path.basename(file_path)
        # Extract model identifier from filename pattern: transcriptions_default.99.019111585.rtf_...
        match = re.search(r'transcriptions_(.+)\.rtf_', filename)
        if match:
            model_id = match.group(1)
            model_identifiers.append({
                'model_id': model_id,
                'filename': filename,
                'file_path': file_path,
                # Keep .rtf for SFTP download
                'model_filename': f"{model_id}.rtf",
                # Use .doc for local storage
                'local_filename': f"{model_id}.doc"
            })

    return model_identifiers
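# Worked example for the pattern above (the suffix is hypothetical): a file named
# "transcriptions_default.99.019111585.rtf_20240101.json" yields
# model_id == "default.99.019111585", so "default.99.019111585.rtf" is fetched
# from SFTP and stored locally as "default.99.019111585.doc".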
@tool
def download_model_from_sftp(model_filename: str, local_download_dir: str = "models", force_download: bool = False) -> str:
    """Download a specific model file from SFTP server and convert extension from .rtf to .doc. If force_download is True, always re-download."""
    # Import configuration
    try:
        from sftp_config import get_sftp_config
        sftp_config = get_sftp_config()
    except ImportError:
        # Fall back to environment variables if the config file is not available
        sftp_config = {
            'host': os.getenv('SFTP_HOST', 'localhost'),
            'port': int(os.getenv('SFTP_PORT', '22')),
            'username': os.getenv('SFTP_USERNAME', 'user'),
            'password': os.getenv('SFTP_PASSWORD', 'password'),
            'remote_path': os.getenv('SFTP_REMOTE_PATH', '/models/')
        }

    # Create local directory if it doesn't exist
    os.makedirs(local_download_dir, exist_ok=True)

    # Convert filename from .rtf to .doc
    doc_filename = model_filename.replace('.rtf', '.doc')
    local_file_path = os.path.join(local_download_dir, doc_filename)

    # If force_download is False and the file already exists, skip the download
    if not force_download and os.path.exists(local_file_path):
        print(f"ℹ️ Model already exists locally: {local_file_path}")
        return local_file_path

    try:
        # Connect to SFTP server
        cnopts = pysftp.CnOpts()
        cnopts.hostkeys = None  # Disable host key checking for development

        print(
            f"🔌 Connecting to SFTP server: {sftp_config['host']}:{sftp_config['port']}")

        with pysftp.Connection(
            host=sftp_config['host'],
            port=sftp_config['port'],
            username=sftp_config['username'],
            password=sftp_config['password'],
            cnopts=cnopts
        ) as sftp:
            remote_file_path = os.path.join(
                sftp_config['remote_path'], model_filename)

            # Check if file exists on server
            if not sftp.exists(remote_file_path):
                raise FileNotFoundError(
                    f"Model file not found on SFTP server: {remote_file_path}")

            # Get file size for progress tracking
            file_size = sftp.stat(remote_file_path).st_size
            print(
                f"📁 Found file on server: {remote_file_path} ({file_size} bytes)")

            # Download the file with the original .rtf extension first
            temp_rtf_path = os.path.join(local_download_dir, model_filename)
            sftp.get(remote_file_path, temp_rtf_path)
            print(f"📥 Downloaded model: {model_filename}")

            # Rename the file from .rtf to .doc
            if os.path.exists(local_file_path):
                os.remove(local_file_path)
            os.rename(temp_rtf_path, local_file_path)
            print(f"✅ Converted extension: {model_filename} -> {doc_filename}")

            return local_file_path

    except pysftp.AuthenticationException:
        error_msg = f"Authentication failed for SFTP server {sftp_config['host']}"
        print(f"❌ {error_msg}")
        raise Exception(error_msg)
    except pysftp.ConnectionException as e:
        error_msg = f"Connection failed to SFTP server {sftp_config['host']}: {str(e)}"
        print(f"❌ {error_msg}")
        raise Exception(error_msg)
    except FileNotFoundError as e:
        error_msg = str(e)
        print(f"❌ {error_msg}")
        raise
    except Exception as e:
        error_msg = f"Error downloading model {model_filename}: {str(e)}"
        print(f"❌ {error_msg}")
        raise Exception(error_msg)


@tool
def batch_download_models(model_identifiers: List[Dict[str, str]], local_download_dir: str = "models") -> List[Dict]:
    """Download multiple model files from SFTP server in batch."""
    downloaded_files = []

    for model_info in model_identifiers:
        model_filename = model_info['model_filename']  # .rtf file for SFTP
        local_filename = model_info.get('local_filename', model_filename.replace(
            '.rtf', '.doc'))  # .doc file for local storage

        try:
            local_path = download_model_from_sftp(
                model_filename, local_download_dir)
            downloaded_files.append({
                'model_id': model_info['model_id'],
                'local_path': local_path,
                'local_filename': local_filename,
                'status': 'success'
            })
        except Exception as e:
            downloaded_files.append({
                'model_id': model_info['model_id'],
                'local_path': None,
                'local_filename': local_filename,
                'status': 'error',
                'error': str(e)
            })

    return downloaded_files


def create_sftp_downloader_agent(llm):
    """Create the SFTP downloader agent."""
    sftp_downloader_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an SFTP model downloader agent. Your task is to:
1. Scan the transcriptions directory to identify which models are needed
2. Download the corresponding model files from the SFTP server
3. Return the list of successfully downloaded models

You should handle errors gracefully and provide detailed feedback about the download process."""),
        ("human",
         "Analyze the transcriptions in {transcriptions_dir} and download the corresponding models from SFTP."),
        MessagesPlaceholder("agent_scratchpad")
    ])

    sftp_downloader_agent = create_openai_tools_agent(
        llm=llm,
        tools=[scan_transcription_files,
               download_model_from_sftp, batch_download_models],
        prompt=sftp_downloader_prompt
    )

    sftp_downloader_executor = AgentExecutor(
        agent=sftp_downloader_agent,
        tools=[scan_transcription_files,
               download_model_from_sftp, batch_download_models],
        verbose=True
    )

    return sftp_downloader_executor
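A minimal wiring sketch (the AzureChatOpenAI model and deployment name are assumptions; any LangChain chat model with tool-calling support should work):

# illustrative only
from langchain_openai import AzureChatOpenAI
from sftp_agent import create_sftp_downloader_agent

llm = AzureChatOpenAI(azure_deployment="my-deployment")  # hypothetical deployment
executor = create_sftp_downloader_agent(llm)
result = executor.invoke({"transcriptions_dir": "transcriptions"})
print(result["output"])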
sftp_config.py
ADDED
@@ -0,0 +1,45 @@
#!/usr/bin/env python3
"""
Configuration file for SFTP settings
"""

import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# SFTP Configuration
SFTP_CONFIG = {
    'host': os.getenv('SFTP_HOST', '163.172.133.70'),
    'port': int(os.getenv('SFTP_PORT', '2224')),
    'username': os.getenv('SFTP_USERNAME', 'simplify'),
    'password': os.getenv('SFTP_PASSWORD', 'V%40xi%24tD%40c2023'),
    'remote_path': os.getenv('SFTP_REMOTE_PATH', '/simplify/model'),
    'local_download_dir': os.getenv('LOCAL_MODELS_DIR', 'models')
}

# Transcriptions directory
TRANSCRIPTIONS_DIR = os.getenv('TRANSCRIPTIONS_DIR', 'transcriptions')


def get_sftp_config():
    """Return SFTP configuration dictionary."""
    return SFTP_CONFIG.copy()


def print_sftp_config():
    """Print current SFTP configuration (without password)."""
    config = get_sftp_config()
    safe_config = config.copy()
    safe_config['password'] = '***' if config['password'] else 'Not set'

    print("🔧 SFTP Configuration:")
    print("=" * 40)
    for key, value in safe_config.items():
        print(f"  {key}: {value}")
    print("=" * 40)


if __name__ == "__main__":
    print_sftp_config()
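Since every value goes through os.getenv with a default, settings can be overridden per environment without code changes; a minimal sketch (the values are placeholders):

# .env, illustrative values only
# SFTP_HOST=sftp.example.org
# SFTP_PORT=22
# SFTP_USERNAME=me
# SFTP_PASSWORD=change-me
from sftp_config import get_sftp_config, print_sftp_config

cfg = get_sftp_config()   # returns a copy, so callers cannot mutate SFTP_CONFIG
print_sftp_config()       # prints the configuration with the password masked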
smart_match.py
ADDED
@@ -0,0 +1,668 @@
import os
import json
import re
import logging
from typing import Dict, List, Tuple, Optional
import numpy as np
from sentence_transformers import SentenceTransformer
#from langchain_openai import ChatOpenAI
#from langchain.prompts import ChatPromptTemplate
from dataclasses import dataclass
from template_db_creation import TemplateInfo, MedicalTemplateParser
from dotenv import load_dotenv
from openai import AzureOpenAI


# IMPORTANT: configuration to avoid segfaults on Mac
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'

# Configuration
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
load_dotenv()

# Azure OpenAI Configuration
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")
AZURE_OPENAI_MODEL = os.getenv("AZURE_OPENAI_MODEL", "gpt-5")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class MatchResult:
    """Matching result."""
    template_id: str
    template_content: str
    similarity_score: float
    filled_template: str
    sections_filled: Dict[str, str]
    confidence_score: float


class TranscriptionMatcher:
    """Automatic template matching and filling system."""

    def __init__(self, parser_instance):
        """
        Initialize the matcher with a MedicalTemplateParser instance.

        Args:
            parser_instance: MedicalTemplateParser instance with the DB loaded
        """
        self.parser = parser_instance
        # Initialize the model with Mac-friendly parameters
        self.model = SentenceTransformer(
            EMBEDDING_MODEL,
            device='cpu'
        )
        # Disable multiprocessing to avoid segfaults
        self.model.encode = self._safe_encode
        self.llm = None
        self.filler = None
        self._initialize_gpt()

    def _safe_encode(self, *args, **kwargs):
        """Wrapper around encode that forces show_progress_bar=False."""
        kwargs['show_progress_bar'] = False
        kwargs['batch_size'] = 1  # Process one at a time for stability
        return SentenceTransformer.encode(self.model, *args, **kwargs)

    def _initialize_gpt(self):
        """Initialize GPT for template filling (Azure OpenAI)."""
        if not AZURE_OPENAI_KEY:
            logger.warning("AZURE_OPENAI_KEY non définie")
            return
        if not AZURE_OPENAI_ENDPOINT:
            logger.warning("AZURE_OPENAI_ENDPOINT non défini")
            return
        if not AZURE_OPENAI_DEPLOYMENT:
            logger.warning("AZURE_OPENAI_DEPLOYMENT non défini")
            return

        try:
            # Initialize the AzureOpenAI client
            self.client = AzureOpenAI(
                api_key=AZURE_OPENAI_KEY,
                api_version=AZURE_OPENAI_API_VERSION,
                azure_endpoint=AZURE_OPENAI_ENDPOINT,
            )
            logger.info("✅ GPT initialisé avec AzureOpenAI client")
        except Exception as e:
            logger.error(f"❌ Erreur initialisation GPT: {e}")
            self.client = None

    def preprocess_transcription(self, transcription: str) -> str:
        """
        Preprocess the transcription to improve matching.

        Args:
            transcription: Raw transcription text

        Returns:
            str: Cleaned transcription
        """
        # Collapse runs of whitespace
        text = re.sub(r'\s+', ' ', transcription)

        # Normalize runs of dots
        text = re.sub(r'\.{2,}', '.', text)

        # Ensure a capital letter after each period
        sentences = text.split('. ')
        text = '. '.join(s.capitalize() for s in sentences if s.strip())

        return text.strip()

    def extract_exam_type(self, transcription: str) -> str:
        """
        Extract the exam type from the transcription.

        Args:
            transcription: Transcription text

        Returns:
            str: Detected exam type
        """
        exam_patterns = [
            (r'\b(IRM|irm)\s+([a-zéèàçê\s]+)', 'IRM'),
            (r'\b(SCANNER|scanner|TDM|tdm)\s+([a-zéèàçê\s]+)', 'Scanner'),
            (r'\b(ECHOGRAPHIE|échographie|écho)\s+([a-zéèàçê\s]+)', 'Échographie'),
            (r'\b(RADIO|radio|radiographie)\s+([a-zéèàçê\s]+)', 'Radiographie'),
        ]

        text_lower = transcription.lower()
        for pattern, exam_type in exam_patterns:
            match = re.search(pattern, transcription, re.IGNORECASE)
            if match:
                return match.group(0).strip()

        # Fallback: look for the exam type alone, without a body location
        for pattern, exam_type in exam_patterns:
            if exam_type.lower() in text_lower:
                return exam_type

        return "Examen médical"

    def create_search_query(self, transcription: str) -> str:
        """
        Build an optimized query for the vector search.

        Args:
            transcription: Transcription text

        Returns:
            str: Optimized query
        """
        # Extract the key terms
        exam_type = self.extract_exam_type(transcription)

        # Take the first sentences as context
        sentences = transcription.split('.')[:3]
        context = '. '.join(sentences).strip()

        # Detect which sections are mentioned
        sections = []
        section_keywords = {
            'technique': ['technique', 'acquisition', 'séquence'],
            'résultats': ['résultats', 'présence', 'mesure', 'aspect'],
            'conclusion': ['conclusion', 'diagnostic', 'aspect de']
        }

        text_lower = transcription.lower()
        for section, keywords in section_keywords.items():
            if any(kw in text_lower for kw in keywords):
                sections.append(section)

        # Build the query
        query_parts = [
            f"Type: {exam_type}",
            f"Sections: {', '.join(sections)}",
            f"Contexte: {context[:200]}"
        ]

        return ' | '.join(query_parts)

    def find_best_template(self, transcription: str, top_k: int = 3) -> List[Tuple[str, float]]:
        """
        Find the templates that best match the transcription.

        Args:
            transcription: Transcription text
            top_k: Number of templates to return

        Returns:
            List[Tuple[str, float]]: List of (template_id, score)
        """
        logger.info("🔍 Recherche du meilleur template...")

        # Preprocess the transcription
        clean_transcription = self.preprocess_transcription(transcription)

        # Build the search query
        search_query = self.create_search_query(clean_transcription)
        logger.info(f"📝 Requête de recherche: {search_query[:100]}...")

        try:
            # Query the vector store, with error handling
            results = self.parser.search_similar_templates(search_query, k=top_k)

            if results:
                logger.info(f"✅ {len(results)} templates trouvés")
                for i, (template_id, score) in enumerate(results, 1):
                    logger.info(f"  {i}. {template_id} (score: {score:.3f})")
            else:
                logger.warning("⚠️ Aucun template trouvé")

            return results
        except Exception as e:
            logger.error(f"❌ Erreur lors de la recherche: {e}")
            logger.info("🔄 Tentative avec recherche simplifiée...")
            return self._fallback_search(clean_transcription, top_k)

    def _fallback_search(self, transcription: str, top_k: int = 3) -> List[Tuple[str, float]]:
        """
        Fallback search used when the vector search fails.

        Args:
            transcription: Transcription text
            top_k: Number of results

        Returns:
            List[Tuple[str, float]]: Search results
        """
        logger.info("⚠️ Utilisation de la recherche par mots-clés (fallback)")

        exam_type = self.extract_exam_type(transcription).lower()
        results = []

        # Simple keyword search over the template IDs
        for template_id in self.parser.templates.keys():
            template_type = template_id.lower()
            score = 0.0

            # Exact match on the exam type
            if exam_type in template_type or any(word in template_type for word in exam_type.split()):
                score = 0.8
                results.append((template_id, score))

        # Sort by score and return top_k
        results.sort(key=lambda x: x[1], reverse=True)
        return results[:top_k] if results else []

    def fill_template_with_gpt(self, template_content: str, transcription: str) -> Dict:
        """Fill the template with GPT (via Azure OpenAI)."""
        if not self.client:
            logger.error("❌ GPT non disponible pour le remplissage")
            return self._fallback_fill(template_content, transcription)

        try:
            logger.info("🤖 Remplissage du template avec GPT...")

            system_prompt = """
Vous êtes un expert médical spécialisé dans le formatage de comptes-rendus médicaux.

Votre tâche est de remplir un template médical avec le contenu d'une transcription.

**RÈGLES ABSOLUES DE FORMATAGE** :
1. JAMAIS commencer le contenu d'une section par son nom
2. JAMAIS écrire "TITRE:", "CLINIQUE:", "TECHNIQUE:", "RESULTATS:", "CONCLUSION:" dans le contenu
3. JAMAIS écrire "Titre.", "Clinique.", "Technique.", "Résultats.", "Conclusion." dans le contenu
4. Le contenu doit commencer DIRECTEMENT par le texte médical, sans label

**INSTRUCTIONS** :
1. Analysez la transcription pour identifier les différentes sections
2. Mappez chaque partie de la transcription aux sections du template
3. CONSERVER ABSOLUMENT la casse exacte des termes médicaux
4. Ne JAMAIS convertir en minuscules les acronymes médicaux
5. Si une section n'a pas de contenu, laissez-la vide

**RÈGLES DE MAPPING** :
- TITRE : première phrase ou mention de l'examen (IRM, Scanner, Échographie, etc.)
- CLINIQUE : informations sur l'indication clinique, motif d'examen
- TECHNIQUE : description de la technique d'acquisition, séquences, protocole
- RESULTATS : toutes les observations et mesures détaillées
- CONCLUSION : synthèse finale, diagnostic, recommandations

**FORMAT DE RÉPONSE** :
Retournez UNIQUEMENT un objet JSON avec cette structure exacte :
{{
"sections": {{
"TITRE": "contenu direct sans label",
"CLINIQUE": "contenu direct sans label",
"TECHNIQUE": "contenu direct sans label",
"RESULTATS": "contenu direct sans label",
"CONCLUSION": "contenu direct sans label"
}},
"confidence": 0.95
}}

**IMPORTANT** :
- Répondez UNIQUEMENT avec du JSON valide
- Pas de texte avant ou après le JSON
- Pas de balises markdown
- Conservez les sauts de ligne pour la lisibilité des RESULTATS
"""

            user_prompt = f"""
Template à remplir :
{template_content}

Transcription médicale :
{transcription}

Remplissez le template en mappant correctement chaque section. Retournez UNIQUEMENT le JSON.
"""

            response = self.client.chat.completions.create(
                model=AZURE_OPENAI_DEPLOYMENT,  # ⚠️ deployment name, not model family
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                #temperature=0,
                #max_tokens=1000,
            )

            result = response.choices[0].message.content.strip()

            # Strip markdown fences around the JSON, if any
            if result.startswith("```json"):
                result = result[7:]
            if result.endswith("```"):
                result = result[:-3]
            result = result.strip()

            filled_data = json.loads(result)

            logger.info("✅ Template rempli avec succès")
            return filled_data

        except json.JSONDecodeError as e:
            logger.error(f"❌ Erreur parsing JSON: {e}")
            return self._fallback_fill(template_content, transcription)
        except Exception as e:
            logger.error(f"❌ Erreur remplissage GPT: {e}")
            return self._fallback_fill(template_content, transcription)

    def _fallback_fill(self, template_content: str, transcription: str) -> Dict:
        """
        Fallback method to fill the template without GPT.

        Args:
            template_content: Empty template
            transcription: Transcription

        Returns:
            Dict: Filled sections (basic)
        """
        logger.info("⚠️ Utilisation du remplissage basique (fallback)")

        sections = {
            "TITRE": self.extract_exam_type(transcription),
            "CLINIQUE": "",
            "TECHNIQUE": "",
            "RESULTATS": "",
            "CONCLUSION": ""
        }

        # Basic keyword-based splitting
        text_lower = transcription.lower()

        # Technique
        if 'technique' in text_lower:
            tech_match = re.search(r'technique[:\.]?\s*([^\.]+(?:\.[^\.]+){0,3})', transcription, re.IGNORECASE)
            if tech_match:
                sections["TECHNIQUE"] = tech_match.group(1).strip()

        # Conclusion
        if 'conclusion' in text_lower:
            concl_match = re.search(r'conclusion[:\.]?\s*(.+)$', transcription, re.IGNORECASE | re.DOTALL)
            if concl_match:
                sections["CONCLUSION"] = concl_match.group(1).strip()

        # Results (everything that remains)
        if not sections["TECHNIQUE"] and not sections["CONCLUSION"]:
            sections["RESULTATS"] = transcription

        return {
            "sections": sections,
            "confidence": 0.5
        }

    def format_filled_template(self, template_structure: str, filled_sections: Dict[str, str]) -> str:
        """
        Format the filled template.

        Args:
            template_structure: Template structure
            filled_sections: Filled sections

        Returns:
            str: Formatted template
        """
        output_lines = []

        for section_name, content in filled_sections.items():
            if content and content.strip():
                # Clean the content to remove any duplicated labels
                cleaned_content = self._clean_section_content(content, section_name)

                # Skip the section if nothing is left after cleaning
                if not cleaned_content:
                    continue

                # Add the section with its name
                output_lines.append(f"{section_name} :")
                output_lines.append(cleaned_content)
                output_lines.append("")  # Blank line between sections

        return "\n".join(output_lines)

    def _clean_section_content(self, content: str, section_name: str) -> str:
        """
        Clean the content by removing repeated section labels.

        Args:
            content: Section content
            section_name: Section name (e.g. "CONCLUSION", "TECHNIQUE")

        Returns:
            str: Cleaned content
        """
        import re

        content = content.strip()

        # 1. Remove "SECTION :" or "SECTION:" at the start (exact uppercase form)
        pattern1 = rf'^{re.escape(section_name)}\s*:\s*'
        content = re.sub(pattern1, '', content, flags=re.IGNORECASE)

        # 2. Remove variations with/without a trailing S (e.g. RESULTAT/RESULTATS)
        section_variations = [
            section_name,
            section_name.rstrip('S'),  # RESULTATS → RESULTAT
        ]

        for variation in section_variations:
            # Remove "Variation :" or "Variation."
            pattern = rf'^{re.escape(variation)}\s*[.:]\s*'
            content = re.sub(pattern, '', content, flags=re.IGNORECASE)

        # 3. Special cases for common French variants
        special_cases = {
            'RESULTATS': ['résultat', 'résultats', 'resultat', 'resultats'],
            'TECHNIQUE': ['technique', 'techniques'],
            'CONCLUSION': ['conclusion', 'conclusions'],
            'CLINIQUE': ['clinique', 'cliniques', 'indication'],
        }

        if section_name in special_cases:
            for variation in special_cases[section_name]:
                pattern = rf'^{re.escape(variation)}\s*[.:]\s*'
                content = re.sub(pattern, '', content, flags=re.IGNORECASE)

        # 4. Collapse any leftover repeated whitespace
        content = re.sub(r'\s+', ' ', content)

        return content.strip()
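    # Example of the cleaning above: with section_name == "CONCLUSION" and
    # content == "Conclusion. Aspect d'adénomyose diffuse.", rule 2 strips the
    # leading "Conclusion." label and "Aspect d'adénomyose diffuse." is returned.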
    def match_and_fill(self, transcription: str, return_top_k: int = 1) -> List[MatchResult]:
        """
        Main entry point: find and fill the best template(s).

        Args:
            transcription: Transcription text
            return_top_k: Number of templates to return

        Returns:
            List[MatchResult]: List of filled templates
        """
        logger.info("\n" + "="*80)
        logger.info("🚀 DÉMARRAGE DU MATCHING ET REMPLISSAGE")
        logger.info("="*80)

        # Preprocess
        clean_transcription = self.preprocess_transcription(transcription)
        logger.info(f"📝 Transcription prétraitée ({len(clean_transcription)} caractères)")

        # Find the best candidate templates
        candidates = self.find_best_template(clean_transcription, top_k=return_top_k * 2)

        if not candidates:
            logger.error("❌ Aucun template trouvé")
            return []

        results = []

        for template_id, similarity_score in candidates[:return_top_k]:
            logger.info(f"\n{'─'*60}")
            logger.info(f"📋 Traitement du template: {template_id}")
            logger.info(f"{'─'*60}")

            # Fetch the template
            template_info = self.parser.get_template_info(template_id)
            if not template_info:
                logger.warning(f"⚠️ Template {template_id} introuvable")
                continue

            logger.info(f"📄 Type: {template_info.type}")
            logger.info(f"📊 Score de similarité: {similarity_score:.3f}")

            # Fill the template
            filled_data = self.fill_template_with_gpt(
                template_info.content,
                clean_transcription
            )

            sections_filled = filled_data.get("sections", {})
            confidence = filled_data.get("confidence", 0.0)

            # Format the result
            filled_template = self.format_filled_template(
                template_info.content,
                sections_filled
            )

            result = MatchResult(
                template_id=template_id,
                template_content=template_info.content,
                similarity_score=similarity_score,
                filled_template=filled_template,
                sections_filled=sections_filled,
                confidence_score=confidence
            )

            results.append(result)

            logger.info(f"✅ Template rempli - Confiance: {confidence:.2%}")

        logger.info("\n" + "="*80)
        logger.info(f"✅ TRAITEMENT TERMINÉ - {len(results)} résultat(s)")
        logger.info("="*80 + "\n")

        return results

    def display_result(self, result: MatchResult):
        """
        Pretty-print one result.

        Args:
            result: Result to display
        """
        print("\n" + "="*80)
        print(f"📋 RÉSULTAT - Template: {result.template_id}")
        print("="*80)
        print(f"📊 Score de similarité: {result.similarity_score:.3f}")
        print(f"🎯 Confiance: {result.confidence_score:.2%}")
        print("\n" + "─"*80)
        print("📄 TEMPLATE REMPLI:")
        print("─"*80)
        print(result.filled_template)
        print("="*80 + "\n")


def main():
    """Main function to test the matching."""

    # Load the database
    db_path = input("Chemin vers la base de données (.pkl): ").strip()

    if not os.path.exists(db_path):
        print(f"❌ Fichier {db_path} introuvable")
        return

    print("📂 Chargement de la base de données...")
    parser = MedicalTemplateParser()
    parser.load_database(db_path)

    print(f"✅ Base chargée: {len(parser.templates)} templates")

    # Initialize the matcher
    matcher = TranscriptionMatcher(parser)

    # Sample transcription
    transcription_example = """
IRM pelvienne. Indication clinique. Technique. Acquisition sagittale, axiale et coronale T2, saturation axiale, diffusion axiale T1. Résultats. Présence d'un utérus antéversé médio-pelvien dont le grand axe mesure 72 mm sur 40 mm sur 40 mm. La zone jonctionnelle apparaît floue. Elle est épaissie de façon diffuse, asymétrique, avec une atteinte de plus de 50% de l'épaisseur du myomètre et comporte des spots en hypersignal T2, l'ensemble traduisant une adénomyose.
Pas d'épaississement cervical. À noter la présence d'un petit kyste liquidien de type Naboth.
Les 2 ovaires sont repérés, porteurs de formations folliculaires communes en hypersignal homogène T2 de petite taille. L'ovaire droit mesure 30 x 25 mm. L'ovaire gauche mesure 25 x 23 mm. Pas d'épanchement dans le cul-de-sac de Douglas.
Absence de foyer d'endométriose profonde. Conclusion.
Aspect d'adénomyose diffuse, symétrique, profonde.
Pas d'épaississement endométrial. Absence d'endométriome. Absence d'épanchement dans le cul-de-sac de Douglas.
"""

    # Interactive mode
    while True:
        print("\n" + "="*80)
        print("OPTIONS:")
        print("1. Utiliser l'exemple de transcription")
        print("2. Entrer une nouvelle transcription")
        print("3. Charger depuis un fichier")
        print("4. Quitter")
        print("="*80)

        choice = input("\nVotre choix: ").strip()

        if choice == "1":
            transcription = transcription_example
        elif choice == "2":
            print("\nEntrez la transcription (Ctrl+D ou Ctrl+Z pour terminer):")
            lines = []
            try:
                while True:
                    line = input()
                    lines.append(line)
            except EOFError:
                transcription = "\n".join(lines)
        elif choice == "3":
            filepath = input("Chemin du fichier: ").strip()
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    transcription = f.read()
            except Exception as e:
                print(f"❌ Erreur lecture fichier: {e}")
                continue
        elif choice == "4":
            print("👋 Au revoir!")
            break
        else:
            print("❌ Choix invalide")
            continue

        # Run the matching
        results = matcher.match_and_fill(transcription, return_top_k=3)

        if results:
            # Display all results
            for i, result in enumerate(results, 1):
                print(f"\n{'#'*80}")
                print(f"# RÉSULTAT {i}/{len(results)}")
                print(f"{'#'*80}")
                matcher.display_result(result)

            # Save the best result
            save_choice = input("\nSauvegarder le meilleur résultat? (o/n): ").strip().lower()
            if save_choice == 'o':
                output_file = input("Nom du fichier de sortie: ").strip()
                try:
                    with open(output_file, 'w', encoding='utf-8') as f:
                        f.write(results[0].filled_template)
                    print(f"✅ Résultat sauvegardé dans {output_file}")
                except Exception as e:
                    print(f"❌ Erreur sauvegarde: {e}")
        else:
            print("❌ Aucun résultat trouvé")


if __name__ == "__main__":
    main()
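For non-interactive use, the same pipeline can be driven directly, as a minimal sketch (the .pkl path is a placeholder):

# illustrative only
from template_db_creation import MedicalTemplateParser
from smart_match import TranscriptionMatcher

parser = MedicalTemplateParser()
parser.load_database("templates_db.pkl")  # hypothetical path
matcher = TranscriptionMatcher(parser)
results = matcher.match_and_fill("IRM pelvienne. Technique...", return_top_k=1)
if results:
    matcher.display_result(results[0])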
template_
ADDED
File without changes
template_analyser_llm.py
ADDED
@@ -0,0 +1,166 @@
#!/usr/bin/env python3
# coding: utf-8

"""
Test file: Section extraction via GPT for medical Word templates
This script reads a .docx document, sends the entire document to GPT for classification
into medical sections, and extracts the structured result.
"""

import os
import json
from docx import Document
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# 1. Configure your OpenAI API key in the environment
# export OPENAI_API_KEY="sk-..."
api_key = os.getenv('OPENAI_API_KEY')

# 2. Define the prompt for section classification
section_prompt = ChatPromptTemplate.from_messages([
    ("system", """
Vous êtes un expert en analyse de documents médicaux. Je vais vous fournir le texte complet d'un rapport médical.

Votre tâche est de :

1. Identifier automatiquement toutes les sections dans le document. Une section est définie par :
   - Un en-tête qui peut être en majuscules ou en titre (ex: "SCANNER", "IRM DU GENOU DROIT")
   - Un titre de section suivi de deux points ":" (ex: "Indication:", "Technique:", "Résultats:", "Conclusions:", etc.)
   - Les sections peuvent aussi être "Indications", "Techniques", "CONCLUSION" (sans deux points)

2. Pour chaque section identifiée, extraire son contenu en collectant toutes les lignes suivantes jusqu'à la prochaine section.

3. Identifier les champs à remplir par l'utilisateur :
   - Les balises <ASR_VOX> indiquent des champs à remplir
   - Les textes génériques comme "xxx", "xxxx" indiquent des champs à remplir
   - Les formules conditionnelles comme "SI(Civilité Nom usuel médecin..." indiquent des champs à remplir

4. Retourner un objet JSON valide avec cette structure exacte :
{{
  "document_type": "type de document détecté",
  "sections": {{
    "nom_section": {{
      "content": "contenu brut de la section",
      "has_user_fields": true,
      "user_fields": ["liste des champs à remplir"]
    }}
  }}
}}

Répondez UNIQUEMENT avec le JSON—aucun commentaire supplémentaire.
"""),
    ("human", "Voici le texte complet du rapport médical :\n\n{document_text}\n\nExtrayez toutes les sections et identifiez les champs à remplir.")
])

# 3. Initialize the LLM with appropriate parameters
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    max_tokens=4000
)

# 4. Create the chain using the new LangChain API
section_classifier = section_prompt | llm


def extract_sections_via_gpt(docx_path: str):
    """Extracts and returns structured sections from the entire document via GPT."""
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"Document not found: {docx_path}")

    # Read the entire document
    doc = Document(docx_path)

    # Combine all paragraphs into one text
    document_text = ""
    for para in doc.paragraphs:
        text = para.text.strip()
        if text:
            document_text += text + "\n"

    if not document_text.strip():
        return {"error": "Document appears to be empty"}

    print(f"Document text preview: {document_text[:200]}...")

    try:
        # Send the entire document to GPT for processing using the new API
        response = section_classifier.invoke({"document_text": document_text})

        # Extract content from the response
        result = response.content.strip()
        print(f"GPT response: {result[:500]}...")

        # Parse the JSON response
        sections_data = json.loads(result)
        return sections_data

    except json.JSONDecodeError as e:
        print(f"Erreur de parsing JSON: {e}")
        print(f"Réponse brute: {result}")
        return {"error": f"Invalid JSON response: {e}", "raw_response": result}
    except Exception as e:
        print(f"Erreur lors de l'appel GPT: {e}")
        return {"error": f"GPT processing error: {e}"}


def print_results(sections_data):
    """Print the extracted sections in a readable format."""
    if "error" in sections_data:
        print(f"Erreur: {sections_data['error']}")
        if "raw_response" in sections_data:
            print(f"Réponse brute: {sections_data['raw_response']}")
        return

    print("=" * 50)
    print("ANALYSE DU DOCUMENT MÉDICAL")
    print("=" * 50)

    if "document_type" in sections_data:
        print(f"Type de document: {sections_data['document_type']}")
        print()

    if "sections" in sections_data:
        for section_name, section_data in sections_data["sections"].items():
            print(f"📋 SECTION: {section_name}")
            print("-" * 30)

            if isinstance(section_data, dict):
                if "content" in section_data:
                    print(f"Contenu: {section_data['content']}")

                if section_data.get("has_user_fields", False):
                    print("⚠️ Cette section contient des champs à remplir par l'utilisateur:")
                    for field in section_data.get("user_fields", []):
                        print(f"  • {field}")
                else:
                    print("✅ Cette section est complète")
            else:
                print(f"Contenu: {section_data}")

            print()


if __name__ == '__main__':
    # Path to your sample .docx
    SAMPLE_DOCX = 'sample.docx'

    try:
        print("Extraction des sections en cours...")
        sections = extract_sections_via_gpt(SAMPLE_DOCX)

        # Print formatted results
        print_results(sections)

        # Also save raw JSON for debugging
        with open('extracted_sections.json', 'w', encoding='utf-8') as f:
            json.dump(sections, f, indent=2, ensure_ascii=False)

        print("\nRésultats sauvegardés dans 'extracted_sections.json'")

    except Exception as e:
        print(f"Erreur principale: {e}")
        import traceback
        traceback.print_exc()
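
One fragility in extract_sections_via_gpt above: json.loads(result) fails whenever the model wraps its reply in markdown fences, which is exactly the case template_db_creation.py later guards against by stripping ```json markers before parsing. A small standalone helper in the same spirit, as a sketch only (the name parse_gpt_json is ours, not from the repository):

import json

def parse_gpt_json(raw: str) -> dict:
    """Strip optional ```json fences from an LLM reply, then parse it.

    Hypothetical helper: mirrors the fence-stripping done inline in
    template_db_creation.py before json.loads().
    """
    cleaned = raw.strip()
    if cleaned.startswith("```json"):
        cleaned = cleaned[7:]          # drop the opening ```json marker
    elif cleaned.startswith("```"):
        cleaned = cleaned[3:]          # drop a bare ``` opening fence
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]         # drop the closing fence
    return json.loads(cleaned.strip())

# Example: a fenced reply and a bare reply parse to the same dict
assert parse_gpt_json('```json\n{"a": 1}\n```') == parse_gpt_json('{"a": 1}')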
template_analyser_test.py
ADDED
@@ -0,0 +1,425 @@
#!/usr/bin/env python3
"""
Improved Template Analyzer - Enhanced section detection
Fixes issues with section detection and provides better analysis
"""

import os
import re
from typing import Dict, Any, List, Tuple
from docx import Document
import json
from datetime import datetime
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()


@tool
def analyze_word_template_tool(template_path: str) -> Dict[str, Any]:
    """Analyze a Word document template to extract structure and sections."""
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template file not found: {template_path}")

    doc = Document(template_path)
    analysis = {
        'sections': [],
        'formatting': {},
        'document_info': {}
    }

    # Improved section detection regex - includes all common medical sections
    section_patterns = [
        r'\b(clinique|examen|observation)\b',
        r'\b(technique|matériel|méthode|procédure)\b',
        r'\b(résultat|resultat|resultats|résultats)\b',
        r'\b(conclusion|diagnostic|impression)\b',
        r'\b(échographie|echographie|imagerie)\b',
        r'\b(recommandation|traitement|suivi)\b',
        r'\b(analyse|commentaire|discussion)\b',
        r'\b(antécédents|histoire|anamnèse)\b',
        r'\b(indication|objectif)\b',
        r'\b(biologie|laboratoire)\b'
    ]

    combined_pattern = '|'.join(section_patterns)

    # Analyze paragraphs and sections
    for i, paragraph in enumerate(doc.paragraphs):
        text = paragraph.text.strip()
        if text:
            # Check if paragraph contains section keywords
            if re.search(combined_pattern, text, re.IGNORECASE):
                analysis['sections'].append({
                    'text': text,
                    'index': i,
                    'style': paragraph.style.name if paragraph.style else 'Normal'
                })

            # Analyze formatting (alignment stringified so the dict stays JSON-serializable)
            if paragraph.runs:
                run = paragraph.runs[0]
                analysis['formatting'][i] = {
                    'bold': run.bold,
                    'italic': run.italic,
                    'font_name': run.font.name,
                    'font_size': run.font.size.pt if run.font.size else None,
                    'alignment': str(paragraph.alignment) if paragraph.alignment else None
                }

    # Analyze document properties
    if doc.core_properties:
        analysis['document_info'] = {
            'title': doc.core_properties.title or 'Word Document',
            'author': doc.core_properties.author or '',
            'subject': doc.core_properties.subject or ''
        }

    return analysis


class ImprovedTemplateAnalyzer:
    """Enhanced template analyzer with better section detection."""

    def __init__(self):
        """Initialize the template analyzer."""
        print("🔍 Improved Template Analyzer initialized")

        # Define comprehensive section patterns
        self.section_patterns = {
            'clinique': r'\b(clinique|examen|observation|examen_clinique)\b',
            'technique': r'\b(technique|matériel|méthode|procédure|protocole)\b',
            'resultats': r'\b(résultat|resultat|resultats|résultats|findings)\b',
            'conclusion': r'\b(conclusion|diagnostic|impression|synthèse)\b',
            'imagerie': r'\b(échographie|echographie|imagerie|radiologie)\b',
            'recommandations': r'\b(recommandation|traitement|suivi|conduite)\b',
            'analyse': r'\b(analyse|commentaire|discussion|interprétation)\b',
            'antecedents': r'\b(antécédents|histoire|anamnèse|contexte)\b',
            'indication': r'\b(indication|objectif|but|demande)\b',
            'biologie': r'\b(biologie|laboratoire|bilan|analyses)\b'
        }

    def analyze_word_template(self, template_path: str) -> Dict[str, Any]:
        """Analyze a Word document template to extract structure and sections."""
        if not os.path.exists(template_path):
            raise FileNotFoundError(f"Template file not found: {template_path}")

        print(f"📄 Analyzing template: {template_path}")

        doc = Document(template_path)
        analysis = {
            'sections': [],
            'formatting': {},
            'document_info': {},
            'all_text': [],
            'structure': {},
            'detected_section_types': []
        }

        # Analyze paragraphs and sections
        for i, paragraph in enumerate(doc.paragraphs):
            text = paragraph.text.strip()

            # Store all text for reference
            if text:
                analysis['all_text'].append({
                    'index': i,
                    'text': text,
                    'length': len(text)
                })

                # Check for sections using improved detection
                section_type = self._detect_section_type(text)
                if section_type:
                    analysis['sections'].append({
                        'text': text,
                        'index': i,
                        'style': paragraph.style.name if paragraph.style else 'Normal',
                        'section_type': section_type,
                        'is_header': self._is_likely_header(text)
                    })

                    if section_type not in analysis['detected_section_types']:
                        analysis['detected_section_types'].append(section_type)

                # Analyze formatting
                if paragraph.runs:
                    run = paragraph.runs[0]
                    analysis['formatting'][i] = {
                        'bold': run.bold,
                        'italic': run.italic,
                        'font_name': run.font.name,
                        'font_size': run.font.size.pt if run.font.size else None,
                        'alignment': str(paragraph.alignment) if paragraph.alignment else None
                    }

        # Analyze document properties
        if doc.core_properties:
            analysis['document_info'] = {
                'title': doc.core_properties.title or 'Word Document',
                'author': doc.core_properties.author or '',
                'subject': doc.core_properties.subject or '',
                'created': doc.core_properties.created.isoformat() if doc.core_properties.created else None,
                'modified': doc.core_properties.modified.isoformat() if doc.core_properties.modified else None
            }

        # Extract document structure
        analysis['structure'] = self._extract_structure(analysis['sections'])

        return analysis

    def _detect_section_type(self, text: str) -> str:
        """Detect the type of section based on improved pattern matching."""
        text_lower = text.lower()

        # Check each pattern
        for section_type, pattern in self.section_patterns.items():
            if re.search(pattern, text_lower):
                return section_type

        # Additional check for common section formats
        if ':' in text and len(text.split()) <= 3:
            # Likely a section header
            first_word = text.split(':')[0].strip().lower()
            if first_word in ['clinique', 'technique', 'resultats', 'résultats', 'conclusion']:
                return first_word if first_word != 'résultats' else 'resultats'

        return None

    def _is_likely_header(self, text: str) -> bool:
        """Determine if text is likely a section header."""
        # Headers are usually short, may end with ':', and often bold
        conditions = [
            len(text) < 100,         # Short text
            text.endswith(':'),      # Ends with colon
            text.isupper(),          # All uppercase
            len(text.split()) <= 3   # Few words
        ]

        return any(conditions)

    def _extract_structure(self, sections: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Extract the document structure from sections."""
        structure = {
            'detected_sections': [],
            'section_types': [],
            'total_sections': len(sections)
        }

        for section in sections:
            structure['detected_sections'].append({
                'text': section['text'],
                'type': section.get('section_type', 'unknown'),
                'index': section['index']
            })

            section_type = section.get('section_type', 'unknown')
            if section_type not in structure['section_types']:
                structure['section_types'].append(section_type)

        return structure

    def save_analysis(self, analysis: Dict[str, Any], output_path: str = None):
        """Save analysis results to a JSON file."""
        if not output_path:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f"improved_template_analysis_{timestamp}.json"

        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(analysis, f, ensure_ascii=False, indent=2)

            print(f"💾 Analysis saved to: {output_path}")
            return output_path

        except Exception as e:
            print(f"❌ Error saving analysis: {e}")
            return None

    def display_analysis_summary(self, analysis: Dict[str, Any]):
        """Display a summary of the template analysis."""
        print("\n📊 IMPROVED TEMPLATE ANALYSIS SUMMARY")
        print("=" * 60)

        print(f"Total paragraphs: {len(analysis['all_text'])}")
        print(f"Detected sections: {len(analysis['sections'])}")

        if analysis['detected_section_types']:
            print(f"Section types found: {', '.join(analysis['detected_section_types'])}")

        print(f"Document title: {analysis['document_info'].get('title', 'N/A')}")
        print(f"Document author: {analysis['document_info'].get('author', 'N/A')}")

        print("\n🔍 DETECTED SECTIONS:")
        for i, section in enumerate(analysis['structure']['detected_sections']):
            print(f"  {i+1}. [{section['type']}] {section['text']}")

        print("\n📄 ALL PARAGRAPHS:")
        for i, text_item in enumerate(analysis['all_text']):
            print(f"  {i+1}. {text_item['text']}")

    def test_with_sample_template(self, template_path: str):
        """Test the analyzer with a sample template."""
        print(f"🚀 Testing Improved Template Analyzer with: {template_path}")
        print("=" * 60)

        try:
            # Analyze the template
            analysis = self.analyze_word_template(template_path)

            # Display summary
            self.display_analysis_summary(analysis)

            # Save analysis
            output_file = self.save_analysis(analysis)

            print("\n✅ Improved analysis completed successfully!")
            print(f"📁 Results saved to: {output_file}")

            return analysis

        except Exception as e:
            print(f"❌ Error during analysis: {e}")
            import traceback
            traceback.print_exc()
            return None

    def create_template_analyzer_agent(self, llm):
        """Create the improved template analyzer agent."""
        template_analyzer_prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an enhanced medical document template analyzer.
            Analyze the provided Word template and extract its structure, sections, and formatting.
            Pay special attention to detecting ALL sections including: CLINIQUE, TECHNIQUE, RESULTATS, and CONCLUSION.
            Provide a detailed analysis that can be used by other agents."""),
            ("human",
             "Analyze the template at {template_path} and provide a comprehensive analysis. Make sure to detect all sections including RESULTATS."),
            MessagesPlaceholder("agent_scratchpad")
        ])

        template_analyzer_agent = create_openai_tools_agent(
            llm=llm,
            tools=[analyze_word_template_tool],
            prompt=template_analyzer_prompt
        )

        template_analyzer_executor = AgentExecutor(
            agent=template_analyzer_agent,
            tools=[analyze_word_template_tool],
            verbose=True
        )

        return template_analyzer_executor

    def test_with_agent(self, template_path: str):
        """Test the template analyzer using the enhanced LangChain agent."""
        print(f"🤖 Testing Improved Template Analyzer AGENT with: {template_path}")
        print("=" * 60)

        try:
            # Initialize OpenAI LLM
            api_key = os.getenv('OPENAI_API_KEY')
            if not api_key:
                print("❌ OpenAI API key not found in environment variables")
                return None

            llm = ChatOpenAI(
                model="gpt-4o-mini",
                temperature=0,
                api_key=api_key
            )

            # Create the agent
            print("🔧 Creating improved template analyzer agent...")
            agent_executor = self.create_template_analyzer_agent(llm)

            # Run the agent
            print("🚀 Running enhanced agent analysis...")
            result = agent_executor.invoke({
                "template_path": template_path
            })

            print("✅ Enhanced agent analysis completed!")
            print("\n📋 AGENT OUTPUT:")
            print("=" * 50)
            print(result['output'])

            # Save agent result
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            agent_output_file = f"improved_agent_analysis_{timestamp}.json"

            with open(agent_output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, ensure_ascii=False, indent=2)

            print(f"\n💾 Enhanced agent result saved to: {agent_output_file}")

            return result

        except Exception as e:
            print(f"❌ Error during enhanced agent analysis: {e}")
            import traceback
            traceback.print_exc()
            return None


def main():
    print("🏥 Improved Template Analyzer - Enhanced Section Detection")
    print("=" * 60)

    # Initialize analyzer
    analyzer = ImprovedTemplateAnalyzer()

    # Test with sample path or interactive mode
    sample_path = "sample.docx"

    # Earlier version that ran both the direct and the agent analysis,
    # kept here disabled for reference:
    """
    if os.path.exists(sample_path):
        print(f"📄 Found sample file: {sample_path}")
        print("🔬 Running enhanced analysis...")

        # Test both methods
        print("\n1️⃣ Testing improved direct analysis...")
        direct_result = analyzer.test_with_sample_template(sample_path)

        print("\n" + "="*60)
        print("2️⃣ Testing improved agent analysis...")
        agent_result = analyzer.test_with_agent(sample_path)

        if direct_result and agent_result:
            print(f"\n🎉 Both enhanced analyses completed successfully!")
            print(f"📊 Direct analysis found {len(direct_result['sections'])} sections")
            print(f"📊 Agent analysis tool was executed successfully")
    """
    if os.path.exists(sample_path):
        print(f"📄 Found sample file: {sample_path}")
        print("🤖 Running enhanced **agent** analysis with GPT...")

        # Now only the LLM agent is run
        agent_result = analyzer.test_with_agent(sample_path)

        if agent_result:
            print("\n🎉 Enhanced agent analysis completed successfully!")
            # For example, display a summary of the detected sections
            #sec = agent_result.get('output', {}).get('structure', {}).get('detected_sections', [])
            #print(f"📊 Sections détectées via GPT : {len(sec)}")
            print("\n=== AGENT RAW OUTPUT ===\n", agent_result)

    else:
        print("❌ sample.docx not found. Please provide the correct path.")
        template_path = input("Enter the path to your Word template file: ").strip()

        if template_path and os.path.exists(template_path):
            analyzer.test_with_sample_template(template_path)
        else:
            print("❌ Invalid file path provided")


if __name__ == "__main__":
    main()
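
Because the improved detection above is plain pattern matching, it can be sanity-checked without any .docx file at all. A minimal sketch (the sample strings are illustrative, not from the repository):

# Quick sanity check of the pattern-based detection, no Word file needed.
# The sample strings below are illustrative, not from the repository.
from template_analyser_test import ImprovedTemplateAnalyzer

analyzer = ImprovedTemplateAnalyzer()
for line in ["TECHNIQUE :", "Résultats:", "Le patient va bien."]:
    print(line, "->", analyzer._detect_section_type(line),
          "| header?", analyzer._is_likely_header(line))
# Expected: the first two map to 'technique' and 'resultats'; the third maps to
# no section type. Note that _is_likely_header uses any(...) over its four
# conditions, so any text shorter than 100 characters already counts as a header.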
template_analyzer.py
ADDED
@@ -0,0 +1,86 @@
#!/usr/bin/env python3
"""
Template Analyzer Agent
Analyzes Word document templates to extract structure and sections
"""

import os
import re
from typing import Dict, Any
from docx import Document
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import AgentExecutor, create_openai_tools_agent


@tool
def analyze_word_template(template_path: str) -> Dict[str, Any]:
    """Analyze a Word document template to extract structure and sections."""
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template file not found: {template_path}")

    doc = Document(template_path)
    analysis = {
        'sections': [],
        'formatting': {},
        'document_info': {}
    }

    # Analyze paragraphs and sections
    for i, paragraph in enumerate(doc.paragraphs):
        text = paragraph.text.strip()
        if text:
            # Detect sections - improved regex to catch all section types
            if re.search(r'\b(examen|observation|conclusion|résultat|resultat|diagnostic|rapport|échographie|echographie|analyse|commentaire|recommandation|technique|matériel|méthode|indication)\b', text, re.IGNORECASE):
                analysis['sections'].append({
                    'text': text,
                    'index': i,
                    'style': paragraph.style.name if paragraph.style else 'Normal'
                })

            # Analyze formatting
            if paragraph.runs:
                run = paragraph.runs[0]
                analysis['formatting'][i] = {
                    'bold': run.bold,
                    'italic': run.italic,
                    'font_name': run.font.name,
                    'font_size': run.font.size.pt if run.font.size else None,
                    'alignment': str(paragraph.alignment) if paragraph.alignment else None
                }

    # Analyze document properties
    if doc.core_properties.title:
        analysis['document_info'] = {
            'title': doc.core_properties.title,
            'author': doc.core_properties.author,
            'subject': doc.core_properties.subject
        }

    return analysis


def create_template_analyzer_agent(llm):
    """Create the template analyzer agent."""
    template_analyzer_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a medical document template analyzer.
        Analyze the provided Word template and extract its structure, sections, and formatting.
        Provide a detailed analysis that can be used by other agents."""),
        ("human",
         "Analyze the template at {template_path} and provide a comprehensive analysis."),
        MessagesPlaceholder("agent_scratchpad")
    ])

    template_analyzer_agent = create_openai_tools_agent(
        llm=llm,
        tools=[analyze_word_template],
        prompt=template_analyzer_prompt
    )

    template_analyzer_executor = AgentExecutor(
        agent=template_analyzer_agent,
        tools=[analyze_word_template],
        verbose=True
    )

    return template_analyzer_executor
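
For reference, a condensed driver for this module, following the same wiring that template_analyser_test.py uses in test_with_agent (the model name and sample path are the same assumptions made there, and OPENAI_API_KEY is expected to be set):

# Minimal driver sketch for create_template_analyzer_agent, condensed from the
# wiring in template_analyser_test.py. Model name and path are assumptions.
import os
from langchain_openai import ChatOpenAI
from template_analyzer import create_template_analyzer_agent

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0,
                 api_key=os.environ["OPENAI_API_KEY"])
executor = create_template_analyzer_agent(llm)

# The prompt's {template_path} placeholder is filled from the invoke() payload.
result = executor.invoke({"template_path": "sample.docx"})
print(result["output"])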
template_db_creation.py
ADDED
@@ -0,0 +1,896 @@
| 1 |
+
import re
|
| 2 |
+
import json
|
| 3 |
+
import numpy as np
|
| 4 |
+
import os
|
| 5 |
+
from typing import Dict, List, Optional, Tuple
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import pickle
|
| 9 |
+
from sentence_transformers import SentenceTransformer
|
| 10 |
+
import faiss
|
| 11 |
+
from docx import Document
|
| 12 |
+
import logging
|
| 13 |
+
from langchain_openai import ChatOpenAI
|
| 14 |
+
from langchain.prompts import ChatPromptTemplate
|
| 15 |
+
from docx import Document
|
| 16 |
+
from docx.shared import RGBColor
|
| 17 |
+
import glob
|
| 18 |
+
import logging
|
| 19 |
+
from datetime import datetime
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
| 24 |
+
GPT_MODEL = os.getenv("GPT_MODEL", "gpt-5")
|
| 25 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 26 |
+
|
| 27 |
+
# Configuration du logging
|
| 28 |
+
def setup_logging(log_file: str = None):
|
| 29 |
+
"""Configure le système de logging"""
|
| 30 |
+
if log_file is None:
|
| 31 |
+
log_file = f"medical_parser_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
|
| 32 |
+
|
| 33 |
+
# Créer le dossier logs s'il n'existe pas
|
| 34 |
+
log_dir = "logs"
|
| 35 |
+
if not os.path.exists(log_dir):
|
| 36 |
+
os.makedirs(log_dir)
|
| 37 |
+
|
| 38 |
+
log_path = os.path.join(log_dir, log_file)
|
| 39 |
+
|
| 40 |
+
# Configuration du logging
|
| 41 |
+
logging.basicConfig(
|
| 42 |
+
level=logging.INFO,
|
| 43 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 44 |
+
handlers=[
|
| 45 |
+
logging.FileHandler(log_path, encoding='utf-8'),
|
| 46 |
+
logging.StreamHandler() # Pour afficher aussi dans la console
|
| 47 |
+
]
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
return log_path
|
| 51 |
+
|
| 52 |
+
# Initialiser le logging
|
| 53 |
+
log_path = setup_logging()
|
| 54 |
+
logger = logging.getLogger(__name__)
|
| 55 |
+
|
| 56 |
+
# Configuration du logging
|
| 57 |
+
#logging.basicConfig(level=logging.INFO)
|
| 58 |
+
#logger = logging.getLogger(__name__)
|
| 59 |
+
|
| 60 |
+
@dataclass
|
| 61 |
+
class TemplateInfo:
|
| 62 |
+
"""Structure pour stocker les informations d'un template"""
|
| 63 |
+
id: str
|
| 64 |
+
type: str
|
| 65 |
+
has_asr_zone: bool
|
| 66 |
+
asr_tag_position: int
|
| 67 |
+
detected_sections: List[str]
|
| 68 |
+
medecin: str
|
| 69 |
+
embedding: np.ndarray
|
| 70 |
+
filepath: str
|
| 71 |
+
content: str
|
| 72 |
+
asr_context: str = ""
|
| 73 |
+
sections_data: Dict = field(default_factory=dict)
|
| 74 |
+
user_fields: List[str] = field(default_factory=list)
|
| 75 |
+
|
| 76 |
+
class MedicalTemplateParser:
|
| 77 |
+
"""Parser pour templates médicaux avec base vectorielle et GPT"""
|
| 78 |
+
|
| 79 |
+
def __init__(self, model_name: str = EMBEDDING_MODEL):
|
| 80 |
+
"""
|
| 81 |
+
Initialise le parser avec un modèle d'embedding et GPT
|
| 82 |
+
|
| 83 |
+
Args:
|
| 84 |
+
model_name: Nom du modèle SentenceTransformer à utiliser
|
| 85 |
+
"""
|
| 86 |
+
self.model = SentenceTransformer(model_name)
|
| 87 |
+
self.templates: Dict[str, TemplateInfo] = {}
|
| 88 |
+
self.vector_index = None
|
| 89 |
+
self.template_ids = []
|
| 90 |
+
|
| 91 |
+
# Initialiser GPT pour l'analyse des sections
|
| 92 |
+
self.llm = None
|
| 93 |
+
self.section_classifier = None
|
| 94 |
+
self._initialize_gpt()
|
| 95 |
+
|
| 96 |
+
# Types de documents médicaux
|
| 97 |
+
self.document_types = {
|
| 98 |
+
"compte_rendu_imagerie": ["imagerie", "scanner", "IRM", "échographie", "radiologie", "TECHNIQUE", "RESULTATS"],
|
| 99 |
+
"lettre_confrere": ["confrère", "cher", "collègue", "salutations", "cordialement"],
|
| 100 |
+
"resultats_laboratoire": ["laboratoire", "analyses", "biologie", "résultats", "valeurs"],
|
| 101 |
+
"demande_examen": ["demande", "prescription", "examen", "bilan"],
|
| 102 |
+
"ordonnance": ["ordonnance", "prescription", "posologie", "traitement"]
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
def _initialize_gpt(self):
|
| 106 |
+
"""Initialise le modèle GPT pour l'analyse des sections"""
|
| 107 |
+
api_key = OPENAI_API_KEY
|
| 108 |
+
if not api_key:
|
| 109 |
+
logger.warning("OPENAI_API_KEY non définie. L'analyse GPT ne sera pas disponible.")
|
| 110 |
+
return
|
| 111 |
+
|
| 112 |
+
try:
|
| 113 |
+
self.llm = ChatOpenAI(
|
| 114 |
+
model=GPT_MODEL,
|
| 115 |
+
temperature=0,
|
| 116 |
+
max_tokens=4000,
|
| 117 |
+
api_key=api_key
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
# Définir le prompt pour l'analyse des sections
|
| 121 |
+
section_prompt = ChatPromptTemplate.from_messages([
|
| 122 |
+
("system", """
|
| 123 |
+
Vous êtes un expert en analyse de documents médicaux. Je vais vous fournir le texte complet d'un rapport médical.
|
| 124 |
+
IMPORTANT : Réponds UNIQUEMENT avec un JSON valide. Aucun texte supplémentaire, aucune explication, aucune balise markdown.
|
| 125 |
+
|
| 126 |
+
Votre tâche est de :
|
| 127 |
+
|
| 128 |
+
1. **Identifier le type de document** :
|
| 129 |
+
- Si le document contient "Destinataire :" → c'est une "lettre médicale"
|
| 130 |
+
- Si le document contient des mots-clés d'examen (SCANNER, IRM, ECHOGRAPHIE, RADIO, etc.) → c'est un "rapport médical"
|
| 131 |
+
- Si le document contient "TITRE" seul → c'est un "rapport médical" (pas type "TITRE")
|
| 132 |
+
- Sinon → "rapport médical" par défaut
|
| 133 |
+
|
| 134 |
+
2. **Extraire les informations du centre médical** (si elles existent) :
|
| 135 |
+
- Chercher le nom du centre/association/hôpital
|
| 136 |
+
- Chercher l'adresse si présente
|
| 137 |
+
- Chercher le téléphone si présent
|
| 138 |
+
- Chercher le service médical (ex: "Service D'IMAGERIE MEDICALE")
|
| 139 |
+
- Chercher l'équipement médical mentionné
|
| 140 |
+
- Ces informations doivent être stockées dans un champ "center_info" séparé
|
| 141 |
+
- Si aucune information de centre n'est trouvée, laisser center_info vide ou null
|
| 142 |
+
|
| 143 |
+
3. **Identifier le médecin** (si mentionné) :
|
| 144 |
+
- Chercher les signatures en fin de document (ex: "Dr Eric AUBERTON")
|
| 145 |
+
- Chercher les mentions "PRESCRIPTEUR :" suivi du nom
|
| 146 |
+
- Chercher les formules comme "DR M NOM Prénom" où NOM/Prénom sont des champs à remplir
|
| 147 |
+
- Si aucun médecin n'est identifié, retourner "Non spécifié"
|
| 148 |
+
|
| 149 |
+
4. **Identifier les sections à remplir** :
|
| 150 |
+
Une section est définie par :
|
| 151 |
+
- Un champ d'information suivi de deux-points : "Patient:", "Initiales:", "Date:", "Médecin:", etc.
|
| 152 |
+
- Un titre de section suivi de deux-points : "Indication:", "Technique:", "Résultats:", "Conclusion:", "Compte-rendu:", etc.
|
| 153 |
+
- Une ligne seule en majuscules représentant un champ à remplir : "TITRE", "CLINIQUE", "TECHNIQUE", "RÉSULTATS", "CONCLUSION"
|
| 154 |
+
- Pour les lettres : "Destinataire:" et le contenu principal de la lettre (zone ASR_VOX)
|
| 155 |
+
|
| 156 |
+
**EXCLUSIONS importantes** :
|
| 157 |
+
- Les titres encadrés de tirets (ex: "- EXAMEN TOMODENSITOMETRIQUE THORACIQUE -") → document_type uniquement
|
| 158 |
+
- Les informations d'en-tête du centre médical → center_info uniquement
|
| 159 |
+
- Les informations administratives fixes → ne pas traiter comme sections
|
| 160 |
+
|
| 161 |
+
5. **Pour chaque section identifiée** :
|
| 162 |
+
- Extraire son contenu en collectant toutes les lignes suivantes jusqu'à la prochaine section
|
| 163 |
+
- Identifier les champs à remplir par l'utilisateur :
|
| 164 |
+
* Les balises <ASR_VOX> indiquent des champs à remplir
|
| 165 |
+
* Les balises <ASR> indiquent des champs à remplir
|
| 166 |
+
* Les textes génériques comme "xxx", "xxxx", "XXX" indiquent des champs à remplir
|
| 167 |
+
* Les formules conditionnelles comme "SI(Civilité Nom usuel médecin..." indiquent des champs à remplir
|
| 168 |
+
* Les balises [NOM_PATIENT], [DATE], [MEDECIN], etc. indiquent des champs à remplir
|
| 169 |
+
* Les champs vides après ":" indiquent des champs à remplir
|
| 170 |
+
* Les mots "NOM", "Prénom" dans le contexte médical indiquent des champs à remplir
|
| 171 |
+
|
| 172 |
+
6. **Gestion spéciale des lettres médicales** :
|
| 173 |
+
- "Destinataire:" est une section à remplir
|
| 174 |
+
- Le contenu principal (zone ASR_VOX) doit être identifié comme "Contenu" ou "Corps de lettre"
|
| 175 |
+
|
| 176 |
+
7. **Identifier les zones ASR** et leur position dans le document
|
| 177 |
+
|
| 178 |
+
8. **Retourner un objet JSON valide** avec cette structure exacte :
|
| 179 |
+
{{
|
| 180 |
+
"document_type": "rapport médical|lettre médicale|autre",
|
| 181 |
+
"center_info": {{
|
| 182 |
+
"name": "nom du centre/association si trouvé, sinon null",
|
| 183 |
+
"address": "adresse complète si trouvée, sinon null",
|
| 184 |
+
"phone": "téléphone si trouvé, sinon null",
|
| 185 |
+
"service": "service médical si trouvé, sinon null",
|
| 186 |
+
"equipment": "équipement mentionné si trouvé, sinon null"
|
| 187 |
+
}},
|
| 188 |
+
"physician": "nom du médecin identifié ou 'Non spécifié'",
|
| 189 |
+
"asr_zones": [
|
| 190 |
+
{{
|
| 191 |
+
"tag": "balise ASR trouvée",
|
| 192 |
+
"position": "position approximative dans le texte",
|
| 193 |
+
"context": "contexte autour de la balise"
|
| 194 |
+
}}
|
| 195 |
+
],
|
| 196 |
+
"sections": {{
|
| 197 |
+
"nom_section": {{
|
| 198 |
+
"content": "contenu brut de la section",
|
| 199 |
+
"has_user_fields": true,
|
| 200 |
+
"user_fields": ["liste des champs à remplir"]
|
| 201 |
+
}}
|
| 202 |
+
}}
|
| 203 |
+
}}
|
| 204 |
+
|
| 205 |
+
**Règles importantes** :
|
| 206 |
+
- Extraire UNIQUEMENT les informations qui existent réellement dans le document
|
| 207 |
+
- Les informations administratives du centre ne sont PAS des sections à remplir (si elles existent)
|
| 208 |
+
- Si aucune information de centre n'est trouvée, laisser les champs center_info à null
|
| 209 |
+
- "TITRE" seul indique un rapport médical, pas un type "TITRE"
|
| 210 |
+
- Les lettres ont un traitement spécial avec Destinataire + Contenu principal
|
| 211 |
+
- Identifier le médecin quand c'est possible, sinon "Non spécifié"
|
| 212 |
+
- Distinguer clairement les champs à remplir des informations fixes
|
| 213 |
+
- Adaptation flexible : tous les documents n'ont pas la même structure
|
| 214 |
+
|
| 215 |
+
Répondez UNIQUEMENT avec le JSON—aucun commentaire supplémentaire.
|
| 216 |
+
"""),
|
| 217 |
+
("human", "Voici le texte complet du rapport médical :\n\n{document_text}\n\nExtrayez toutes les sections, identifiez les champs à remplir et les zones ASR.")
|
| 218 |
+
])
|
| 219 |
+
|
| 220 |
+
self.section_classifier = section_prompt | self.llm
|
| 221 |
+
print("✅ GPT initialisé avec succès")
|
| 222 |
+
logger.info(f"✅ GPT initialisé avec succès")
|
| 223 |
+
|
| 224 |
+
except Exception as e:
|
| 225 |
+
logger.info(f"❌ Erreur lors de l'initialisation de GPT: {e}")
|
| 226 |
+
|
| 227 |
+
self.llm = None
|
| 228 |
+
self.section_classifier = None
|
| 229 |
+
|
| 230 |
+
def extract_text_from_docx(self, filepath: str) -> Tuple[str, Dict]:
|
| 231 |
+
"""
|
| 232 |
+
Extrait le texte d'un fichier Word en préservant la structure
|
| 233 |
+
|
| 234 |
+
Args:
|
| 235 |
+
filepath: Chemin vers le fichier DOCX
|
| 236 |
+
|
| 237 |
+
Returns:
|
| 238 |
+
Tuple[str, Dict]: (texte_complet, informations_structure)
|
| 239 |
+
"""
|
| 240 |
+
logger.info(f"📄 Extraction du texte DOCX: {os.path.basename(filepath)}")
|
| 241 |
+
|
| 242 |
+
try:
|
| 243 |
+
doc = Document(filepath)
|
| 244 |
+
text_content = []
|
| 245 |
+
structure_info = {
|
| 246 |
+
"paragraphs": [],
|
| 247 |
+
"tables": [],
|
| 248 |
+
"headers": [],
|
| 249 |
+
"footers": [],
|
| 250 |
+
"styles": [],
|
| 251 |
+
"formatting": []
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
# Traiter les paragraphes
|
| 255 |
+
for i, paragraph in enumerate(doc.paragraphs):
|
| 256 |
+
para_text = paragraph.text.strip()
|
| 257 |
+
if para_text: # Ignorer les paragraphes vides
|
| 258 |
+
text_content.append(para_text)
|
| 259 |
+
|
| 260 |
+
# Collecter les informations de structure
|
| 261 |
+
para_info = {
|
| 262 |
+
"index": i,
|
| 263 |
+
"text": para_text,
|
| 264 |
+
"style": paragraph.style.name if paragraph.style else "Normal",
|
| 265 |
+
"alignment": str(paragraph.alignment) if paragraph.alignment else "None",
|
| 266 |
+
"runs": []
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
# Analyser les runs (formatage)
|
| 270 |
+
for run in paragraph.runs:
|
| 271 |
+
if run.text.strip():
|
| 272 |
+
run_info = {
|
| 273 |
+
"text": run.text,
|
| 274 |
+
"bold": run.bold,
|
| 275 |
+
"italic": run.italic,
|
| 276 |
+
"underline": run.underline,
|
| 277 |
+
"font_name": run.font.name if run.font.name else "Default",
|
| 278 |
+
"font_size": run.font.size.pt if run.font.size else None,
|
| 279 |
+
"color": self._get_color_info(run.font.color) if run.font.color else None
|
| 280 |
+
}
|
| 281 |
+
para_info["runs"].append(run_info)
|
| 282 |
+
|
| 283 |
+
structure_info["paragraphs"].append(para_info)
|
| 284 |
+
structure_info["styles"].append(paragraph.style.name if paragraph.style else "Normal")
|
| 285 |
+
|
| 286 |
+
# Traiter les tableaux
|
| 287 |
+
for table_idx, table in enumerate(doc.tables):
|
| 288 |
+
table_info = {
|
| 289 |
+
"index": table_idx,
|
| 290 |
+
"rows": len(table.rows),
|
| 291 |
+
"cols": len(table.columns),
|
| 292 |
+
"content": []
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
table_text = []
|
| 296 |
+
for row_idx, row in enumerate(table.rows):
|
| 297 |
+
row_data = []
|
| 298 |
+
row_text = []
|
| 299 |
+
for cell_idx, cell in enumerate(row.cells):
|
| 300 |
+
cell_text = cell.text.strip()
|
| 301 |
+
row_data.append(cell_text)
|
| 302 |
+
row_text.append(cell_text)
|
| 303 |
+
if cell_text:
|
| 304 |
+
text_content.append(cell_text)
|
| 305 |
+
|
| 306 |
+
table_info["content"].append(row_data)
|
| 307 |
+
table_text.append(" | ".join(row_text))
|
| 308 |
+
|
| 309 |
+
structure_info["tables"].append(table_info)
|
| 310 |
+
# Ajouter le contenu du tableau au texte principal
|
| 311 |
+
text_content.extend(table_text)
|
| 312 |
+
|
| 313 |
+
# Traiter les en-têtes et pieds de page
|
| 314 |
+
for section in doc.sections:
|
| 315 |
+
# En-têtes
|
| 316 |
+
if section.header:
|
| 317 |
+
header_text = []
|
| 318 |
+
for paragraph in section.header.paragraphs:
|
| 319 |
+
if paragraph.text.strip():
|
| 320 |
+
header_text.append(paragraph.text.strip())
|
| 321 |
+
text_content.append(paragraph.text.strip())
|
| 322 |
+
|
| 323 |
+
if header_text:
|
| 324 |
+
structure_info["headers"].append({
|
| 325 |
+
"content": header_text,
|
| 326 |
+
"section_index": doc.sections.index(section)
|
| 327 |
+
})
|
| 328 |
+
|
| 329 |
+
# Pieds de page
|
| 330 |
+
if section.footer:
|
| 331 |
+
footer_text = []
|
| 332 |
+
for paragraph in section.footer.paragraphs:
|
| 333 |
+
if paragraph.text.strip():
|
| 334 |
+
footer_text.append(paragraph.text.strip())
|
| 335 |
+
text_content.append(paragraph.text.strip())
|
| 336 |
+
|
| 337 |
+
if footer_text:
|
| 338 |
+
structure_info["footers"].append({
|
| 339 |
+
"content": footer_text,
|
| 340 |
+
"section_index": doc.sections.index(section)
|
| 341 |
+
})
|
| 342 |
+
|
| 343 |
+
# Nettoyer les styles dupliqués
|
| 344 |
+
structure_info["styles"] = list(set(structure_info["styles"]))
|
| 345 |
+
|
| 346 |
+
# Créer le texte final
|
| 347 |
+
final_text = "\n".join(text_content)
|
| 348 |
+
|
| 349 |
+
logger.info(f"✅ Texte extrait avec succès:")
|
| 350 |
+
logger.info(f" - Paragraphes: {len(structure_info['paragraphs'])}")
|
| 351 |
+
logger.info(f" - Tableaux: {len(structure_info['tables'])}")
|
| 352 |
+
logger.info(f" - En-têtes: {len(structure_info['headers'])}")
|
| 353 |
+
logger.info(f" - Pieds de page: {len(structure_info['footers'])}")
|
| 354 |
+
logger.info(f" - Styles utilisés: {len(structure_info['styles'])}")
|
| 355 |
+
|
| 356 |
+
return final_text, structure_info
|
| 357 |
+
|
| 358 |
+
except Exception as e:
|
| 359 |
+
logger.info(f"❌ Erreur lors de l'extraction du texte DOCX de {filepath}: {e}")
|
| 360 |
+
return "", {}
|
| 361 |
+
|
| 362 |
+
def _get_color_info(self, color):
|
| 363 |
+
"""Extrait les informations de couleur d'un run"""
|
| 364 |
+
try:
|
| 365 |
+
if color.rgb:
|
| 366 |
+
return f"rgb({color.rgb.red}, {color.rgb.green}, {color.rgb.blue})"
|
| 367 |
+
elif color.theme_color:
|
| 368 |
+
return f"theme_{color.theme_color}"
|
| 369 |
+
else:
|
| 370 |
+
return "default"
|
| 371 |
+
except:
|
| 372 |
+
return "default"
|
| 373 |
+
|
| 374 |
+
def analyze_document_with_gpt(self, text: str) -> Dict:
|
| 375 |
+
"""
|
| 376 |
+
Analyse le document avec GPT pour extraire sections et zones ASR
|
| 377 |
+
|
| 378 |
+
Args:
|
| 379 |
+
text: Texte complet du document
|
| 380 |
+
|
| 381 |
+
Returns:
|
| 382 |
+
Dict: Résultats de l'analyse GPT
|
| 383 |
+
"""
|
| 384 |
+
if not self.section_classifier:
|
| 385 |
+
logger.info("⚠️ GPT non disponible, utilisation des méthodes classiques")
|
| 386 |
+
return self._fallback_analysis(text)
|
| 387 |
+
|
| 388 |
+
try:
|
| 389 |
+
logger.info("🔍 Analyse du document avec GPT...")
|
| 390 |
+
response = self.section_classifier.invoke({"document_text": text})
|
| 391 |
+
result = response.content.strip()
|
| 392 |
+
# Vérifier si la réponse est vide
|
| 393 |
+
if not result:
|
| 394 |
+
logger.info("❌ Réponse GPT vide")
|
| 395 |
+
return self._fallback_analysis(text)
|
| 396 |
+
logger.info(f"📝 Réponse GPT (premiers 200 caractères): {result[:200]}...")
|
| 397 |
+
if result.startswith("```json"):
|
| 398 |
+
result = result[7:] # Supprimer ```json
|
| 399 |
+
if result.endswith("```"):
|
| 400 |
+
result = result[:-3] # Supprimer ```
|
| 401 |
+
|
| 402 |
+
# Supprimer les espaces en début et fin
|
| 403 |
+
result = result.strip()
|
| 404 |
+
# Vérifier que ça commence par { et finit par }
|
| 405 |
+
if not result.startswith('{') or not result.endswith('}'):
|
| 406 |
+
logger.info(f"❌ Format JSON invalide. Début: '{result[:50]}...' Fin: '...{result[-50:]}'")
|
| 407 |
+
return self._fallback_analysis(text)
|
| 408 |
+
# Parser la réponse JSON
|
| 409 |
+
analysis_data = json.loads(result)
|
| 410 |
+
logger.info("✅ Analyse GPT terminée avec succès")
|
| 411 |
+
return analysis_data
|
| 412 |
+
|
| 413 |
+
except json.JSONDecodeError as e:
|
| 414 |
+
logger.info(f"❌ Erreur de parsing JSON GPT: {e}")
|
| 415 |
+
return self._fallback_analysis(text)
|
| 416 |
+
except Exception as e:
|
| 417 |
+
logger.info(f"❌ Erreur lors de l'analyse GPT: {e}")
|
| 418 |
+
return self._fallback_analysis(text)
|
| 419 |
+
|
| 420 |
+
def _fallback_analysis(self, text: str) -> Dict:
|
| 421 |
+
"""Analyse de fallback sans GPT"""
|
| 422 |
+
logger.info("📊 Utilisation de l'analyse classique...")
|
| 423 |
+
|
| 424 |
+
# Détection ASR classique
|
| 425 |
+
has_asr, asr_pos, asr_context = self.detect_asr_zone_classic(text)
|
| 426 |
+
|
| 427 |
+
# Extraction sections classique
|
| 428 |
+
sections = self.extract_sections_classic(text)
|
| 429 |
+
|
| 430 |
+
# Classification type classique
|
| 431 |
+
doc_type = self.classify_document_type(text, sections)
|
| 432 |
+
|
| 433 |
+
return {
|
| 434 |
+
"document_type": doc_type,
|
| 435 |
+
"asr_zones": [{"tag": "<ASR_VOX>", "position": asr_pos, "context": asr_context}] if has_asr else [],
|
| 436 |
+
"sections": {section: {"content": "", "has_user_fields": False, "user_fields": []} for section in sections}
|
| 437 |
+
}
|
| 438 |
+
|
| 439 |
+
def detect_asr_zone_classic(self, text: str) -> Tuple[bool, int, str]:
|
| 440 |
+
"""
|
| 441 |
+
Détection ASR classique (fallback)
|
| 442 |
+
|
| 443 |
+
Returns:
|
| 444 |
+
(has_asr_zone, position, context_around_asr)
|
| 445 |
+
"""
|
| 446 |
+
asr_patterns = [
|
| 447 |
+
r"<ASR_VOX>",
|
| 448 |
+
r"<ASR>",
|
| 449 |
+
r"\[DICTEE\]",
|
| 450 |
+
r"\[ASR\]",
|
| 451 |
+
r"<!-- ASR -->"
|
| 452 |
+
]
|
| 453 |
+
|
| 454 |
+
for pattern in asr_patterns:
|
| 455 |
+
match = re.search(pattern, text, re.IGNORECASE)
|
| 456 |
+
if match:
|
| 457 |
+
position = match.start()
|
| 458 |
+
start_context = max(0, position - 200)
|
| 459 |
+
end_context = min(len(text), position + 200)
|
| 460 |
+
context = text[start_context:end_context]
|
| 461 |
+
|
| 462 |
+
return True, position, context
|
| 463 |
+
|
| 464 |
+
return False, -1, ""
|
| 465 |
+
|
| 466 |
+
def extract_sections_classic(self, text: str) -> List[str]:
|
| 467 |
+
"""Extraction de sections classique (fallback)"""
|
| 468 |
+
sections = set()
|
| 469 |
+
|
| 470 |
+
section_patterns = [
|
| 471 |
+
r"([A-ZÉÈÀÇÊ][A-ZÉÈÀÇÊ\s]{2,}):",
|
| 472 |
+
r"([A-ZÉÈÀÇÊ][a-zéèàçê\s]{3,}):",
|
| 473 |
+
r"(\d+\.\s*[A-ZÉÈÀÇÊ][a-zéèàçê\s]{3,}):",
|
| 474 |
+
]
|
| 475 |
+
|
| 476 |
+
for pattern in section_patterns:
|
| 477 |
+
matches = re.findall(pattern, text, re.MULTILINE)
|
| 478 |
+
for match in matches:
|
| 479 |
+
section = match.strip().rstrip(':').strip()
|
| 480 |
+
if len(section) > 2 and len(section) < 50:
|
| 481 |
+
sections.add(section)
|
| 482 |
+
|
| 483 |
+
return sorted(list(sections))
|
| 484 |
+
|
| 485 |
+
def classify_document_type(self, text: str, sections: List[str]) -> str:
|
| 486 |
+
"""Classifie le type de document basé sur le contenu et les sections"""
|
| 487 |
+
text_lower = text.lower()
|
| 488 |
+
sections_lower = [s.lower() for s in sections]
|
| 489 |
+
all_text = text_lower + " " + " ".join(sections_lower)
|
| 490 |
+
|
| 491 |
+
max_score = 0
|
| 492 |
+
best_type = "autre"
|
| 493 |
+
|
| 494 |
+
for doc_type, keywords in self.document_types.items():
|
| 495 |
+
score = 0
|
| 496 |
+
for keyword in keywords:
|
| 497 |
+
if keyword.lower() in all_text:
|
| 498 |
+
score += 1
|
| 499 |
+
|
| 500 |
+
if doc_type == "compte_rendu_imagerie" and any("technique" in s for s in sections_lower):
|
| 501 |
+
score += 2
|
| 502 |
+
|
| 503 |
+
if score > max_score:
|
| 504 |
+
max_score = score
|
| 505 |
+
best_type = doc_type
|
| 506 |
+
|
| 507 |
+
return best_type
|
| 508 |
+
|
| 509 |
+
    def extract_doctor_name(self, text: str) -> str:
        """Extracts the physician's name from the template."""
        doctor_patterns = [
            r"Dr\.?\s+([A-ZÉÈÀÇÊ][a-zéèàçê]+\s+[A-ZÉÈÀÇÊ][a-zéèàçê]+)",
            r"Docteur\s+([A-ZÉÈÀÇÊ][a-zéèàçê]+\s+[A-ZÉÈÀÇÊ][a-zéèàçê]+)",
            r"Praticien\s*:\s*([A-ZÉÈÀÇÊ][a-zéèàçê]+\s+[A-ZÉÈÀÇÊ][a-zéèàçê]+)",
        ]

        for pattern in doctor_patterns:
            match = re.search(pattern, text)
            if match:
                return match.group(1).strip()

        return "Non spécifié"

    def parse_template(self, filepath: str, template_id: str = None) -> TemplateInfo:
        """
        Parses a template and extracts all its information with GPT.

        Args:
            filepath: Path to the template file
            template_id: Unique template ID (optional)

        Returns:
            TemplateInfo: Structured template information
        """
        if template_id is None:
            template_id = Path(filepath).stem

        logger.info(f"\n📄 Traitement du fichier: {os.path.basename(filepath)}")

        # Extract the text according to the file type
        if filepath.endswith('.docx'):
            text, _ = self.extract_text_from_docx(filepath)  # reuse the existing method
        else:
            with open(filepath, 'r', encoding='utf-8') as f:
                text = f.read()

        if not text.strip():
            logger.warning("❌ Aucun texte extrait du fichier")
            return None

        # Analyze with GPT
        analysis_data = self.analyze_document_with_gpt(text)

        # Pull the fields out of the GPT analysis
        doc_type = analysis_data.get("document_type", "autre")
        sections_data = analysis_data.get("sections", {})
        asr_zones = analysis_data.get("asr_zones", [])

        logger.info(f"📋 Type de document détecté: {doc_type}")
        logger.info(f"🔍 Zones ASR trouvées: {len(asr_zones)}")
        logger.info(f"📑 Sections détectées: {len(sections_data)}")

        # Derive the ASR information
        has_asr = len(asr_zones) > 0
        asr_pos = asr_zones[0]["position"] if asr_zones else -1
        asr_context = asr_zones[0]["context"] if asr_zones else ""

        # Detected sections
        detected_sections = list(sections_data.keys())

        # Collect every user-fillable field
        user_fields = []
        for section_data in sections_data.values():
            if isinstance(section_data, dict) and section_data.get("has_user_fields"):
                user_fields.extend(section_data.get("user_fields", []))

        # Extract the physician's name
        medecin = self.extract_doctor_name(text)
        logger.info(f"👨⚕️ Médecin détecté: {medecin}")

        # Build the text used for the embedding
        embedding_text = self.create_embedding_text(text, asr_context, detected_sections, doc_type)

        # Generate the embedding
        embedding = self.model.encode([embedding_text])[0]

        # Build the TemplateInfo object
        template_info = TemplateInfo(
            id=template_id,
            type=doc_type,
            has_asr_zone=has_asr,
            asr_tag_position=asr_pos,
            detected_sections=detected_sections,
            medecin=medecin,
            embedding=embedding,
            filepath=filepath,
            content=text,
            asr_context=asr_context,
            sections_data=sections_data,
            user_fields=user_fields
        )

        logger.info(f"✅ Template {template_id} traité avec succès")
        return template_info

    def create_embedding_text(self, text: str, asr_context: str, sections: List[str], doc_type: str) -> str:
        """
        Builds the text optimized for the embedding.

        Args:
            text: Full template text
            asr_context: Context around the ASR zone
            sections: Detected sections
            doc_type: Document type

        Returns:
            str: Text optimized for the embedding
        """
        lines = text.split('\n')
        header = ' '.join(lines[:5])

        embedding_parts = [
            f"Type: {doc_type}",
            f"Sections: {', '.join(sections[:5])}",
            f"Contexte: {header[:200]}",
        ]

        if asr_context:
            embedding_parts.append(f"Zone ASR: {asr_context[:100]}")

        return ' | '.join(embedding_parts)

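    # Editor's note (illustrative): for an imaging template the string handed
    # to the sentence-transformer looks like
    #   "Type: compte_rendu_imagerie | Sections: Technique, Résultats, Conclusion | Contexte: <first 5 lines> | Zone ASR: <first 100 chars>"
    # i.e. the type and section names are deliberately front-loaded so they
    # weigh more in the embedding than boilerplate body text.
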
    def process_docx_folder(self, folder_path: str) -> List[TemplateInfo]:
        """
        Processes every DOCX file in a folder.

        Args:
            folder_path: Path to the folder containing the DOCX files

        Returns:
            List[TemplateInfo]: The processed templates
        """
        logger.info(f"🗂️ Traitement du dossier: {folder_path}")

        # Find every DOCX file
        docx_files = glob.glob(os.path.join(folder_path, "*.docx"))

        if not docx_files:
            logger.warning("❌ Aucun fichier DOCX trouvé dans le dossier")
            return []

        logger.info(f"📁 {len(docx_files)} fichiers DOCX trouvés")

        templates = []
        for i, filepath in enumerate(docx_files, 1):
            logger.info(f"\n{'='*60}")
            logger.info(f"📄 Fichier {i}/{len(docx_files)}: {os.path.basename(filepath)}")
            logger.info(f"{'='*60}")

            try:
                template_info = self.parse_template(filepath)
                if template_info:
                    templates.append(template_info)
                    self.templates[template_info.id] = template_info
            except Exception as e:
                logger.error(f"❌ Erreur lors du traitement de {filepath}: {e}")
                continue

        logger.info(f"\n🎉 Traitement terminé: {len(templates)} templates traités avec succès")
        return templates

    def build_vector_database(self, templates: List[TemplateInfo]):
        """
        Builds the FAISS vector database.

        Args:
            templates: The parsed templates
        """
        if not templates:
            logger.warning("❌ Aucun template fourni pour construire la base vectorielle")
            return

        logger.info(f"🔧 Construction de la base vectorielle avec {len(templates)} templates...")

        # FAISS expects float32 input
        embeddings = np.array([template.embedding for template in templates], dtype='float32')

        dimension = embeddings.shape[1]
        self.vector_index = faiss.IndexFlatIP(dimension)

        # L2-normalizing makes the inner product equal to the cosine similarity
        faiss.normalize_L2(embeddings)
        self.vector_index.add(embeddings)

        self.template_ids = [template.id for template in templates]

        logger.info("✅ Base vectorielle construite avec succès")

    def search_similar_templates(self, query_text: str, k: int = 5) -> List[Tuple[str, float]]:
        """
        Finds the templates most similar to a query.

        Args:
            query_text: Query text
            k: Number of results to return

        Returns:
            List[Tuple[str, float]]: The most similar (template_id, score) pairs
        """
        if self.vector_index is None:
            logger.warning("❌ Base vectorielle non construite")
            return []

        logger.info(f"🔍 Recherche pour: '{query_text}'")

        query_embedding = self.model.encode([query_text])
        faiss.normalize_L2(query_embedding)

        scores, indices = self.vector_index.search(query_embedding, k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < len(self.template_ids):
                results.append((self.template_ids[idx], float(score)))

        return results

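    # Editor's sketch: IndexFlatIP ranks by inner product, so with L2-normalized
    # vectors the returned score is exactly the cosine similarity in [-1, 1].
    # A self-contained illustration with random vectors (no project code needed):
    #
    #   import numpy as np, faiss
    #   vecs = np.random.rand(10, 384).astype('float32')
    #   faiss.normalize_L2(vecs)
    #   index = faiss.IndexFlatIP(384)
    #   index.add(vecs)
    #   scores, ids = index.search(vecs[:1], 3)   # scores[0][0] ≈ 1.0 (self-match)
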
    def save_database(self, filepath: str):
        """Saves the vector database and the templates."""
        logger.info("💾 Sauvegarde de la base de données...")

        database_data = {
            'templates': self.templates,
            'template_ids': self.template_ids,
            # dimension of the sentence-embedding space
            'embedding_dimension': self.model.get_sentence_embedding_dimension()
        }

        with open(filepath, 'wb') as f:
            pickle.dump(database_data, f)

        if self.vector_index is not None:
            faiss.write_index(self.vector_index, filepath.replace('.pkl', '.faiss'))

        logger.info(f"✅ Base de données sauvegardée dans {filepath}")

    def load_database(self, filepath: str):
        """Loads the vector database and the templates."""
        logger.info(f"📂 Chargement de la base de données depuis {filepath}...")

        with open(filepath, 'rb') as f:
            database_data = pickle.load(f)

        self.templates = database_data['templates']
        self.template_ids = database_data['template_ids']

        faiss_path = filepath.replace('.pkl', '.faiss')
        if Path(faiss_path).exists():
            self.vector_index = faiss.read_index(faiss_path)

        logger.info("✅ Base de données chargée avec succès")

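    # Editor's sketch of the intended round trip (paths are illustrative):
    #
    #   parser.save_database("templates/medical_templates.pkl")
    #   # ...later, in a fresh process:
    #   parser2 = MedicalTemplateParser()
    #   parser2.load_database("templates/medical_templates.pkl")
    #   parser2.search_similar_templates("échographie abdominale", k=3)
    #
    # The pickle holds the metadata; the sibling .faiss file holds the index.
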
    def get_template_info(self, template_id: str) -> Optional[TemplateInfo]:
        """Returns a template's information by its ID."""
        return self.templates.get(template_id)

    def print_template_summary(self, template_id: str):
        """Logs a summary of a template's information."""
        template = self.get_template_info(template_id)
        if template:
            logger.info(f"\n{'='*60}")
            logger.info(f"📋 Template: {template.id}")
            logger.info("=" * 60)
            logger.info(f"📄 Type: {template.type}")
            logger.info(f"👨⚕️ Médecin: {template.medecin}")
            logger.info(f"🎤 Zone ASR: {'✅ Oui' if template.has_asr_zone else '❌ Non'}")
            logger.info(f"📑 Sections détectées ({len(template.detected_sections)}): {', '.join(template.detected_sections)}")
            logger.info(f"⚠️ Champs utilisateur ({len(template.user_fields)}): {', '.join(template.user_fields[:3])}{'...' if len(template.user_fields) > 3 else ''}")
            logger.info(f"📁 Fichier: {os.path.basename(template.filepath)}")
            logger.info("=" * 60)

            # Log the per-section details
            for section_name, section_data in template.sections_data.items():
                if isinstance(section_data, dict):
                    logger.info(f"📋 Section: {section_name}")
                    if section_data.get("has_user_fields"):
                        fields = section_data.get('user_fields', [])
                        logger.info(f"   ⚠️ Champs à remplir: {', '.join(fields)}")
                    else:
                        logger.info("   ✅ Section complète")

    def print_global_summary(self):
        """Logs a global summary of all templates."""
        logger.info(f"\n{'='*80}")
        logger.info(f"📊 RÉSUMÉ GLOBAL - {len(self.templates)} TEMPLATES TRAITÉS")
        logger.info("=" * 80)

        # Per-type statistics
        types_count = {}
        asr_count = 0
        total_sections = 0
        total_user_fields = 0

        for template in self.templates.values():
            types_count[template.type] = types_count.get(template.type, 0) + 1
            if template.has_asr_zone:
                asr_count += 1
            total_sections += len(template.detected_sections)
            total_user_fields += len(template.user_fields)

        logger.info("📈 Statistiques générales:")
        logger.info(f"   - Total templates: {len(self.templates)}")
        logger.info(f"   - Templates avec ASR: {asr_count}")
        logger.info(f"   - Total sections: {total_sections}")
        logger.info(f"   - Total champs utilisateur: {total_user_fields}")

        logger.info("\n📊 Répartition par type:")
        for doc_type, count in types_count.items():
            logger.info(f"   - {doc_type}: {count}")

        logger.info("\n📋 Templates individuels:")
        for template_id in sorted(self.templates.keys()):
            template = self.templates[template_id]
            asr_icon = "🎤" if template.has_asr_zone else "❌"
            logger.info(f"   {asr_icon} {template_id} ({template.type}) - {len(template.detected_sections)} sections - {template.medecin}")


def main():
    """Entry point: processes a folder of .docx files."""

    # Folder containing the .docx files
    docx_folder = input("Entrez le chemin vers le dossier contenant les fichiers docx: ").strip()

    if not os.path.exists(docx_folder):
        logger.error(f"❌ Le dossier {docx_folder} n'existe pas")
        return

    logger.info("\n🚀 Démarrage du traitement des fichiers docx...")
    logger.info(f"📁 Dossier source: {docx_folder}")

    # Initialize the parser
    parser = MedicalTemplateParser()

    # FIX: use process_docx_folder instead of extract_text_from_docx
    templates = parser.process_docx_folder(docx_folder)

    if not templates:
        logger.error("❌ Aucun template traité avec succès")
        return

    # Build the vector database
    parser.build_vector_database(templates)

    # Log the global summary
    parser.print_global_summary()

    # Log the details of every template
    logger.info(f"\n{'='*80}")
    logger.info("📄 DÉTAILS DES TEMPLATES")
    logger.info("=" * 80)

    for template_id in sorted(parser.templates.keys()):
        parser.print_template_summary(template_id)

    # Exercise the search
    logger.info(f"\n{'='*80}")
    logger.info("🔍 TEST DE RECHERCHE")
    logger.info("=" * 80)

    test_queries = [
        "échographie abdominale",
        "scanner thoracique",
        "compte rendu imagerie",
        "résultats laboratoire"
    ]

    for query in test_queries:
        logger.info(f"\n🔍 Recherche pour: '{query}'")
        results = parser.search_similar_templates(query, k=3)

        if results:
            logger.info("📊 Résultats:")
            for i, (template_id, score) in enumerate(results, 1):
                template = parser.get_template_info(template_id)
                logger.info(f"   {i}. {template_id} (score: {score:.3f}) - {template.type} - {template.medecin}")
        else:
            logger.info("❌ Aucun résultat trouvé")

    # Save the database
    save_path = os.path.join(docx_folder, 'medical_templates.pkl')
    parser.save_database(save_path)

    logger.info("\n✅ Traitement terminé avec succès!")
    logger.info(f"💾 Base de données sauvegardée: {save_path}")


if __name__ == "__main__":
    main()
template_enrichi_mod.6272.mauberton.MODELE.RADIO_20250903_155139.txt
ADDED
@@ -0,0 +1,104 @@

================================================================================
🏥 TEMPLATE MÉDICAL ENRICHI - REMPLI AUTOMATIQUEMENT
================================================================================

📋 INFORMATIONS DU TEMPLATE:
Template ID: mod.6272.mauberton.MODELE.RADIO
Fichier source: mod.6272.mauberton.MODELE.RADIO.docx
Médecin: Non spécifié
Centre médical: Non spécifié
Type de document: rapport médical

📊 MÉTRIQUES DE CORRESPONDANCE:
Score global: 0.787 (very_good)
Pourcentage de remplissage: 66.7%
Sections remplissables: 2/3
Sections critiques remplies: 2
Similarité sémantique: 0.222

================================================================================
📝 CONTENU MÉDICAL STRUCTURÉ
================================================================================


🟢 [RÉSULTATS] - Confiance: 0.90
────────────────────────────────────────────────────────────
* L'utérus est antéversé, antéfléchi, latéralisé à droite, de taille normale pour l'âge.
* L'endomètre est fin, mesurant moins de 2 mm.
* Pas d'adénomyose franche.
* Aspect normal du col utérin et du vagin.
* L'ovaire droit, en position postérieure, mesure 18 x 11 mm avec présence de 4 follicules.
* L'ovaire gauche, en position latéro-utérine, présente un volumineux endométriome de 45 mm, typique en hypersignal T1 Dixon.
* Deuxième endométriome accolé à l'ovaire droit, périphérique, mesurant 13 mm.
* Pas d'épaississement marqué du torus ni des ligaments utéro-sacrés.
* Pas d'autre localisation pelvienne.
* Pas d'épanchement pelvien.
* Pas d'anomalie de la vessie.
* Pas d'adénomégalie pelvienne, pas de dilatation des uretères.


🟢 [CONCLUSION] - Confiance: 0.90
────────────────────────────────────────────────────────────
* Endométriome ovarien droit périphérique de 13 mm.
* Endométriome ovarien gauche centro-ovarien de 45 mm.


================================================================================
⚠️ SECTIONS NON REMPLIES (nécessitent des informations supplémentaires)
================================================================================

1. INDICATION
Raison: Section 'Indication' non trouvée


================================================================================
📈 ANALYSE DE QUALITÉ
================================================================================

✅ Statistiques de remplissage:
• Sections remplies: 2
• Sections haute qualité (>80% confiance): 2
• Sections manquantes: 1
• Taux de remplissage: 66.7%

🎯 Scores de correspondance:
• Type de document: 0.300
• Nom de fichier: 0.500
• Contenu médical: 1.000
• Médecin: 0.500
• Centre: 0.500

🔍 Indicateurs de correspondance:
• Fichier: radio, radiologie

💡 RECOMMANDATIONS POUR AMÉLIORER LE REMPLISSAGE:
• Vérifier si la transcription contient toutes les sections requises
• Vérifier la correspondance entre le type de transcription et le template


================================================================================
📄 TRANSCRIPTION ORIGINALE (pour référence)
================================================================================
**Technique :** 3 plans T2, diffusion axiale, T2 grand champ et T1 Dixon.
**Résultats :**
* L'utérus est antéversé, antéfléchi, latéralisé à droite, de taille normale pour l'âge.
* L'endomètre est fin, mesurant moins de 2 mm.
* Pas d'adénomyose franche.
* Aspect normal du col utérin et du vagin.
* L'ovaire droit, en position postérieure, mesure 18 x 11 mm avec présence de 4 follicules.
* L'ovaire gauche, en position latéro-utérine, présente un volumineux endométriome de 45 mm, typique en hypersignal T1 Dixon.
* Deuxième endométriome accolé à l'ovaire droit, périphérique, mesurant 13 mm.
* Pas d'épaississement marqué du torus ni des ligaments utéro-sacrés.
* Pas d'autre localisation pelvienne.
* Pas d'épanchement pelvien.
* Pas d'anomalie de la vessie.
* Pas d'adénomégalie pelvienne, pas de dilatation des uretères.
**Conclusion :**
* Endométriome ovarien droit périphérique de 13 mm.
* Endométriome ovarien gauche centro-ovarien de 45 mm.

================================================================================
🏁 FIN DU TEMPLATE ENRICHI
Généré automatiquement par le système de matching médical enrichi
================================================================================
template_generator.py
ADDED
@@ -0,0 +1,348 @@
import os
import logging
from datetime import datetime
from typing import List
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.shared import OxmlElement, qn

# Import the classes from the first file
from template_matcher import TemplateMatcher, TemplateMatch

from dotenv import load_dotenv

# Load the environment variables
load_dotenv()

DB_PATH = os.getenv("TEMPLATE_DB_PATH", "templates/medical_templates.pkl")
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "templates_remplis")

class TemplateGenerator:
    """Generates filled medical templates in .docx format"""

    def __init__(self):
        """Initializes the template generator"""

        self.output_dir = OUTPUT_DIR
        self._create_output_directory()

        # Logging configuration for this module
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - [GENERATOR] %(message)s'
        )

    def _create_output_directory(self):
        """Creates the output directory if it does not exist"""
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
            logging.info(f"📁 Répertoire de sortie créé: {self.output_dir}")

    def _add_custom_styles(self, doc: Document):
        """Adds custom styles to the document"""
        styles = doc.styles

        # Style for section titles
        try:
            section_style = styles.add_style('Section Title', WD_STYLE_TYPE.PARAGRAPH)
            section_style.font.size = Pt(12)  # font sizes take Pt, not Inches
            section_style.font.bold = True
            section_style.font.color.rgb = RGBColor(0, 51, 102)  # dark blue
            section_style.paragraph_format.space_after = Inches(0.1)
            section_style.paragraph_format.keep_with_next = True
        except ValueError:
            logging.warning("Style 'Section Title' déjà existant")

        # Style for section bodies
        try:
            content_style = styles.add_style('Section Content', WD_STYLE_TYPE.PARAGRAPH)
            content_style.font.size = Pt(11)
            content_style.paragraph_format.left_indent = Inches(0.25)
            content_style.paragraph_format.space_after = Inches(0.15)
        except ValueError:
            logging.warning("Style 'Section Content' déjà existant")

        # Style for the header
        try:
            header_style = styles.add_style('Document Header', WD_STYLE_TYPE.PARAGRAPH)
            header_style.font.size = Pt(14)
            header_style.font.bold = True
            header_style.font.color.rgb = RGBColor(0, 0, 0)
            header_style.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
            header_style.paragraph_format.space_after = Inches(0.2)
        except ValueError:
            logging.warning("Style 'Document Header' déjà existant")

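    # Editor's note (illustrative): python-docx sizes and colors apply per run,
    # and every add_run() call creates a new run. To style visible text the run
    # must be captured once and mutated:
    #
    #   run = paragraph.add_run("SECTION")
    #   run.bold = True
    #   run.font.size = Pt(12)
    #   run.font.color.rgb = RGBColor(204, 102, 0)
    #
    # Chained calls like add_run().font.size = ... style a new empty run and
    # leave the visible text untouched; the methods below set run properties
    # the captured way.
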
    def _add_document_header(self, doc: Document, template_match: TemplateMatch, transcription_filename: str):
        """Adds the document header"""
        # Main title
        header = doc.add_paragraph()
        header.style = 'Document Header'
        header.add_run("COMPTE-RENDU MÉDICAL GÉNÉRÉ AUTOMATIQUEMENT")

        # Template information
        info_paragraph = doc.add_paragraph()
        info_paragraph.add_run("Template utilisé: ").bold = True
        info_paragraph.add_run(os.path.basename(template_match.template_info.filepath))

        # Medical information
        if template_match.template_info.medecin and template_match.template_info.medecin != "Non identifié":
            medecin_para = doc.add_paragraph()
            medecin_para.add_run("Médecin: ").bold = True
            medecin_para.add_run(template_match.template_info.medecin)

        centre = getattr(template_match.template_info, 'centre_medical', 'Non spécifié')
        if centre and centre != "Non spécifié":
            centre_para = doc.add_paragraph()
            centre_para.add_run("Centre médical: ").bold = True
            centre_para.add_run(centre)

        # Document type
        type_para = doc.add_paragraph()
        type_para.add_run("Type de document: ").bold = True
        type_para.add_run(template_match.template_info.type)

        # Generation information
        generation_para = doc.add_paragraph()
        generation_para.add_run("Date de génération: ").bold = True
        generation_para.add_run(datetime.now().strftime("%d/%m/%Y à %H:%M"))

        score_para = doc.add_paragraph()
        score_para.add_run("Score de correspondance: ").bold = True
        score_para.add_run(f"{template_match.overall_score:.3f} ({template_match.confidence_level})")

        filling_para = doc.add_paragraph()
        filling_para.add_run("Pourcentage de remplissage: ").bold = True
        filling_para.add_run(f"{template_match.filling_percentage:.1f}%")

        # Separator line
        doc.add_paragraph("_" * 80)

    def _add_filled_sections(self, doc: Document, template_match: TemplateMatch):
        """Adds the filled sections to the document"""
        if not template_match.extracted_data:
            logging.warning("❌ Aucune section à remplir trouvée")
            doc.add_paragraph("Aucune section n'a pu être remplie automatiquement.")
            return

        logging.info(f"📝 Génération de {len(template_match.extracted_data)} sections remplies")

        # Title for the filled sections (size set on the run that carries the text)
        sections_title = doc.add_paragraph()
        title_run = sections_title.add_run("CONTENU EXTRAIT ET STRUCTURÉ")
        title_run.bold = True
        title_run.font.size = Pt(14)

        for section_name, content in template_match.extracted_data.items():
            # Section title
            section_title = doc.add_paragraph()
            section_title.style = 'Section Title'
            section_title.add_run(section_name.upper())

            # Section body
            section_content = doc.add_paragraph()
            section_content.style = 'Section Content'
            section_content.add_run(content)

            logging.info(f"   ✅ Section ajoutée: {section_name} ({len(content)} caractères)")

    def _add_missing_sections(self, doc: Document, template_match: TemplateMatch):
        """Adds the missing sections to the document"""
        missing_sections = [s.section_name for s in template_match.section_matches.values() if not s.can_fill]

        if missing_sections:
            logging.info(f"⚠️ {len(missing_sections)} sections manquantes identifiées")

            # Title for the missing sections
            missing_title = doc.add_paragraph()
            title_run = missing_title.add_run("SECTIONS NON REMPLIES")
            title_run.bold = True
            title_run.font.color.rgb = RGBColor(204, 102, 0)  # orange

            missing_subtitle = doc.add_paragraph()
            subtitle_run = missing_subtitle.add_run("(Informations non trouvées dans la transcription)")
            subtitle_run.font.color.rgb = RGBColor(102, 102, 102)  # grey

            for section in missing_sections:
                missing_para = doc.add_paragraph()
                missing_run = missing_para.add_run(f"• {section}")
                missing_run.font.color.rgb = RGBColor(204, 102, 0)

                # Placeholder left for manual completion
                placeholder = doc.add_paragraph()
                placeholder.style = 'Section Content'
                placeholder_run = placeholder.add_run("[À COMPLÉTER MANUELLEMENT]")
                placeholder_run.font.color.rgb = RGBColor(153, 153, 153)  # light grey
                placeholder_run.italic = True

    def _add_original_transcription(self, doc: Document, transcription: str):
        """Appends the original transcription as an annex"""
        # Page break
        doc.add_page_break()

        # Annex title
        annexe_title = doc.add_paragraph()
        annexe_run = annexe_title.add_run("ANNEXE - TRANSCRIPTION ORIGINALE")
        annexe_run.bold = True
        annexe_run.font.size = Pt(12)
        annexe_run.font.color.rgb = RGBColor(102, 102, 102)

        # Separator line
        doc.add_paragraph("=" * 60)

        # Original transcription
        transcription_para = doc.add_paragraph()
        transcription_run = transcription_para.add_run(transcription)
        transcription_run.font.size = Pt(9)  # smaller text
        transcription_run.font.color.rgb = RGBColor(51, 51, 51)  # dark grey

    def generate_filled_template(self, template_match: TemplateMatch, transcription: str, transcription_filename: str) -> str:
        """
        Generates a filled template and saves it in .docx format.

        Args:
            template_match: The best-scoring template
            transcription: The original transcription
            transcription_filename: The transcription file name

        Returns:
            str: Path of the generated file
        """
        logging.info("🚀 Début de la génération du template rempli")
        logging.info(f"📋 Template sélectionné: {template_match.template_id}")
        logging.info(f"📊 Score: {template_match.overall_score:.3f}")
        logging.info(f"🔧 Remplissage: {template_match.filling_percentage:.1f}%")

        try:
            # Create a new Word document
            doc = Document()

            # Add the custom styles
            self._add_custom_styles(doc)

            # Add the document header
            self._add_document_header(doc, template_match, transcription_filename)

            # Add the filled sections
            self._add_filled_sections(doc, template_match)

            # Add the missing sections
            self._add_missing_sections(doc, template_match)

            # Append the original transcription as an annex
            self._add_original_transcription(doc, transcription)

            # Build the output file name
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            safe_template_id = template_match.template_id.replace('/', '_').replace('\\', '_')
            output_filename = f"template_rempli_{safe_template_id}_{timestamp}.docx"
            output_path = os.path.join(self.output_dir, output_filename)

            # Save the document
            doc.save(output_path)

            logging.info("✅ Template rempli généré avec succès:")
            logging.info(f"   📁 Fichier: {output_path}")
            logging.info(f"   📏 Taille: {os.path.getsize(output_path)} bytes")
            logging.info(f"   📋 Sections remplies: {len(template_match.extracted_data)}")
            logging.info(f"   ⚠️ Sections manquantes: {len([s for s in template_match.section_matches.values() if not s.can_fill])}")

            return output_path

        except Exception as e:
            logging.error(f"❌ Erreur lors de la génération du template: {e}")
            raise

    def display_generation_summary(self, template_match: TemplateMatch, output_path: str):
        """Logs a summary of the generation"""
        logging.info("=" * 80)
        logging.info("📊 RÉSUMÉ DE LA GÉNÉRATION")
        logging.info("=" * 80)
        logging.info(f"🎯 Template utilisé: {template_match.template_id}")
        logging.info(f"📁 Template source: {os.path.basename(template_match.template_info.filepath)}")
        logging.info(f"👨⚕️ Médecin: {template_match.template_info.medecin}")
        logging.info(f"🏥 Centre: {getattr(template_match.template_info, 'centre_medical', 'Non spécifié')}")
        logging.info(f"📝 Type: {template_match.template_info.type}")
        logging.info(f"📊 Score de correspondance: {template_match.overall_score:.3f} ({template_match.confidence_level})")
        logging.info(f"🔧 Pourcentage de remplissage: {template_match.filling_percentage:.1f}%")
        logging.info(f"📋 Sections remplies: {len(template_match.extracted_data)}")
        logging.info(f"⚠️ Sections manquantes: {len([s for s in template_match.section_matches.values() if not s.can_fill])}")
        logging.info(f"💾 Fichier généré: {os.path.basename(output_path)}")
        logging.info(f"📏 Taille du fichier: {os.path.getsize(output_path)} bytes")
        logging.info("=" * 80)


def main():
    """Main function: matches templates with the first module, then generates the filled document"""

    # Logging configuration
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # Database path
    db_path = DB_PATH

    # Sample transcription
    transcription_filename = "default.73.931915433.rtf_3650535_radiologie.doc"
    transcription_content = """ la Technique :** 3 plans T2, diffusion axiale, T2 grand champ et T1 Dixon.
Résultats
L'utérus est antéversé, antéfléchi, latéralisé à droite, de taille normale pour l'âge.
L'endomètre est fin, mesurant moins de 2 mm.
Pas d'adénomyose franche.
Aspect normal du col utérin et du vagin.
L'ovaire droit, en position postérieure, mesure 18 x 11 mm avec présence de 4 follicules.
L'ovaire gauche, en position latéro-utérine, présente un volumineux endométriome de 45 mm, typique en hypersignal T1 Dixon.
Deuxième endométriome accolé à l'ovaire droit, périphérique, mesurant 13 mm.
Pas d'épaississement marqué du torus ni des ligaments utéro-sacrés.
Pas d'autre localisation pelvienne.
Pas d'épanchement pelvien.
Pas d'anomalie de la vessie.
Pas d'adénomégalie pelvienne, pas de dilatation des uretères.
en Conclusion
Endométriome ovarien droit périphérique de 13 mm.
Endométriome ovarien gauche centro-ovarien de 45 mm."""

    if not os.path.exists(db_path):
        logging.error(f"❌ Base de données non trouvée: {db_path}")
        return

    try:
        logging.info("🚀 DÉMARRAGE DU PROCESSUS COMPLET")
        logging.info("=" * 80)

        # STEP 1: matching with the first module
        logging.info("📍 ÉTAPE 1: MATCHING DES TEMPLATES")
        matcher = TemplateMatcher(db_path)
        matches = matcher.match_templates(transcription_content, transcription_filename, k=3)

        if not matches:
            logging.error("❌ Aucun template trouvé")
            return

        # Pick the best template
        best_match = matches[0]
        logging.info(f"✅ Meilleur template sélectionné: {best_match.template_id}")

        # STEP 2: generation with this module
        logging.info("📍 ÉTAPE 2: GÉNÉRATION DU TEMPLATE REMPLI")
        generator = TemplateGenerator()
        output_path = generator.generate_filled_template(
            best_match,
            transcription_content,
            transcription_filename
        )

        # STEP 3: final summary
        logging.info("📍 ÉTAPE 3: RÉSUMÉ FINAL")
        generator.display_generation_summary(best_match, output_path)

        logging.info("🎉 PROCESSUS TERMINÉ AVEC SUCCÈS")

    except Exception as e:
        logging.error(f"❌ Erreur dans le processus principal: {e}")


if __name__ == "__main__":
    main()
template_rempli_mod.6272.mauberton.MODELE.RADIO.txt
ADDED
@@ -0,0 +1,56 @@

=== TEMPLATE MÉDICAL REMPLI ===
Template: mod.6272.mauberton.MODELE.RADIO.docx
Score de correspondance: 0.508
Remplissage: 66.7%


INFORMATIONS GÉNÉRALES:
- Médecin: Non spécifié
- Centre: Non spécifié
- Type de document: rapport médical

CONTENU EXTRAIT ET STRUCTURÉ:

[RÉSULTATS]
* L'utérus est antéversé, antéfléchi, latéralisé à droite, de taille normale pour l'âge.
* L'endomètre est fin, mesurant moins de 2 mm.
* Pas d'adénomyose franche.
* Aspect normal du col utérin et du vagin.
* L'ovaire droit, en position postérieure, mesure 18 x 11 mm avec présence de 4 follicules.
* L'ovaire gauche, en position latéro-utérine, présente un volumineux endométriome de 45 mm, typique en hypersignal T1 Dixon.
* Deuxième endométriome accolé à l'ovaire droit, périphérique, mesurant 13 mm.
* Pas d'épaississement marqué du torus ni des ligaments utéro-sacrés.
* Pas d'autre localisation pelvienne.
* Pas d'épanchement pelvien.
* Pas d'anomalie de la vessie.
* Pas d'adénomégalie pelvienne, pas de dilatation des uretères.

[CONCLUSION]
* Endométriome ovarien droit périphérique de 13 mm.
* Endométriome ovarien gauche centro-ovarien de 45 mm.

SECTIONS NON REMPLIES (informations manquantes):
- Indication


==================================================
TRANSCRIPTION ORIGINALE (pour référence):
==================================================
**Technique :** 3 plans T2, diffusion axiale, T2 grand champ et T1 Dixon.
**Résultats :**
* L'utérus est antéversé, antéfléchi, latéralisé à droite, de taille normale pour l'âge.
* L'endomètre est fin, mesurant moins de 2 mm.
* Pas d'adénomyose franche.
* Aspect normal du col utérin et du vagin.
* L'ovaire droit, en position postérieure, mesure 18 x 11 mm avec présence de 4 follicules.
* L'ovaire gauche, en position latéro-utérine, présente un volumineux endométriome de 45 mm, typique en hypersignal T1 Dixon.
* Deuxième endométriome accolé à l'ovaire droit, périphérique, mesurant 13 mm.
* Pas d'épaississement marqué du torus ni des ligaments utéro-sacrés.
* Pas d'autre localisation pelvienne.
* Pas d'épanchement pelvien.
* Pas d'anomalie de la vessie.
* Pas d'adénomégalie pelvienne, pas de dilatation des uretères.
**Conclusion :**
* Endométriome ovarien droit périphérique de 13 mm.
* Endométriome ovarien gauche centro-ovarien de 45 mm.
test.py
ADDED
@@ -0,0 +1,588 @@
import re
import logging
from typing import Dict, List, Optional, Tuple, Any, Callable
from dataclasses import dataclass
from enum import Enum

logger = logging.getLogger(__name__)

class FieldType(Enum):
    """Field types in the template"""
    CHECKBOX = "checkbox"        # &x checkboxes
    TEXT = "text"                # &x free text
    MEASUREMENT = "measurement"  # &x numeric values

@dataclass
class TemplateField:
    """Definition of one template field"""
    placeholder: str                            # &x slot in the template
    field_type: FieldType
    source_field: str                           # matching field in ExtractedData
    default_value: str = ""
    validation_pattern: Optional[str] = None
    transformation_func: Optional[Callable] = None
    context_identifier: Optional[str] = None    # to tell left from right

@dataclass
class MappingResult:
    """Result of the mapping"""
    filled_template: str
    mapped_fields: Dict[str, str]
    unmapped_placeholders: List[str]
    mapping_confidence: float
    errors: List[str]

class MedicalTemplateMapper:
    """Mapping engine from extracted data to the medical template"""

    def __init__(self):
        self.template = self._load_template()
        self.field_mappings = self._define_field_mappings()
        self.checkbox_logic = self._define_checkbox_logic()

    def _load_template(self) -> str:
        """Base medical template with &x placeholders"""
        return """L'utérus est &x antéversé, &x rétroversé, &x intermédiaire, &x rétrofléchi, &x antéfléchi, &x fixe de taille normale (&x x &x x &x cm).
Hystérométrie : distance orifice externe du col - fond de la cavité utérine : &x mm.
L'endomètre : mesuré à &x mm.
Myometre : pas de myome.
Zone jonctionnelle : Atteinte de la zone de jonction : &x non &x oui
Adénomyose associée : &x non &x oui : &x diffuse &x focale &x interne &x externe
Col utérin: pas de kyste de Naboth. Absence de pathologies échographiquement décelable à son niveau.
Cavité utérine en 3D: morphologie triangulaire.

&xKISSING OVARIES
L'ovaire droit mesure &x x &x mm, &x est de dimensions supérieures à la normale il mesure &x x &x mm, &x folliculaire CFA &x follicules: (&x mm). &x Absence d'endométriome. &x Présence d'une formation kystique hypoéchogène, uniloculaire, non vascularisé, à contenu ground glass mesurée à &x mm d'allure endométriome.
Accessibilité : &x rétro-utérin &x fixe &x aisée.
L'ovaire gauche mesure &x x &x mm, &x est de dimensions supérieures à la normale il mesure &x x &x mm, &x folliculaire CFA &x follicules: (&x mm). &x Absence d'endométriome. &x Présence d'une formation kystique hypoéchogène, uniloculaire, non vascularisé, à contenu ground glass mesurée à &x mm d'allure endométriome.
Accessibilité : &x rétro-utérin &x fixe &x aisée.
&x Présence de micro-calcifications sous thécales &x bilatérales &x droites &x gauches pouvant témoigner d'implants endométriosiques superficiels.
L'échostructure des deux ovaires apparait normale, avec une vascularisation artério-veineuse normale au Doppler, sans formation ou image kystique pathologique échographiquement décelable à leur niveau.

Cavité péritonéale
&x- Pas d'épanchement liquidien dans le cul du sac du Douglas. Pas de douleur à l'écho-palpation.
&x- Faible épanchement corpusculé dans le cul du sac du Douglas qui silhouette des adhérences (soft marqueur d'endométriose?). Pas de douleur à l'écho-palpation.
- &xVessie vide pendant l'examen. &x Vessie en semi-réplétion pendant l'examen.
- &x Absence de dilatation pyélo-calicielle.
- Artère utérine : IP : &x - IR : 0,&x - Spectre : type 2 avec notch protodiastolique.
- Pas d'image d'hydrosalpinx visible à ce jour.

RECHERCHE ENDOMETRIOSE PELVIENNE

A-Compartiment antérieur (vessie en semi-réplétion)
- Signe du glissement (sliding) : &xprésent &xdiminué &xabsent
- Présence d'un nodule : &xnon &xoui
- Uretères dans la partie pelvienne vus non dilatés.


B-Compartiment postérieur
- Signe du glissement (sliding) :
- Espace recto-vaginal : &xprésent &xdiminué &xabsent
- Plan sus-péritonéal : &xprésent &xdiminué &xabsent
- Aspect du torus : &x normal &x épaissi
- Aspect des ligaments utéro-sacrés :
- Ligament utéro- sacré droit : &x normal &x épaissi
- Ligament utéro-sacré gauche : &x normal &x épaissi
- Présence d'un nodule hypoéchogène : &x non
- Infiltration digestive: &x non &x oui : &x bas rectum &x moyen rectum &x haut rectum &x jonction recto-sigmoïde

Conclusions
Utérus de taille et de morphologie normales.
Endomètre mesuré à &x mm.
CFA : &x+&x follicules.
Ovaires sans formation ou image kystique pathologique échographiquement décelable à leur niveau.
&x Absence d'image d'endométriose visible ce jour, à confronter éventuellement à une IRM.
&x Endométriose &x superficielle &x et profonde.
Absence d'anomalie échographiquement décelable au niveau des trompes.
--> L'ensemble de ces aspects reste à confronter au contexte clinico-thérapeutique.

(qui contient des trous représentés par &x)"""

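    # Editor's sketch (illustrative): the template is filled by literal
    # substring replacement around each "&x" slot. A checkbox mapping turns
    # "&x antéversé" into "X antéversé" when the extracted uterus position
    # contains "antéversé", and a measurement mapping turns
    # "L'endomètre : mesuré à &x mm" into "L'endomètre : mesuré à 7 mm".
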
    def _define_field_mappings(self) -> Dict[str, TemplateField]:
        """Defines the mappings between extracted data and template placeholders"""
        return {
            # Uterus position - checkboxes
            "uterus_position_antéversé": TemplateField(
                placeholder="&x antéversé",
                field_type=FieldType.CHECKBOX,
                source_field="uterus_position",
                transformation_func=lambda x: "X" if x and "antéversé" in x.lower() else ""
            ),
            "uterus_position_rétroversé": TemplateField(
                placeholder="&x rétroversé",
                field_type=FieldType.CHECKBOX,
                source_field="uterus_position",
                transformation_func=lambda x: "X" if x and "rétroversé" in x.lower() else ""
            ),

            # Uterus size - dimensions
            "uterus_size_length": TemplateField(
                placeholder="normale (&x x",
                field_type=FieldType.MEASUREMENT,
                source_field="uterus_size",
                transformation_func=self._extract_first_dimension
            ),
            "uterus_size_width": TemplateField(
                placeholder="x &x x",
                field_type=FieldType.MEASUREMENT,
                source_field="uterus_size",
                transformation_func=self._extract_second_dimension
            ),
            "uterus_size_height": TemplateField(
                placeholder="x &x cm)",
                field_type=FieldType.MEASUREMENT,
                source_field="uterus_size",
                transformation_func=self._extract_third_dimension
            ),

            # Hysterometry
            "hysterometry_value": TemplateField(
                placeholder="fond de la cavité utérine : &x mm",
                field_type=FieldType.MEASUREMENT,
                source_field="hysterometry",
                transformation_func=self._clean_numeric_value
            ),

            # Endometrium
            "endometrium_thickness": TemplateField(
                placeholder="L'endomètre : mesuré à &x mm",
                field_type=FieldType.MEASUREMENT,
                source_field="endometrium_thickness",
                transformation_func=self._clean_numeric_value
            ),

            # Junctional zone
            "junctional_zone_non": TemplateField(
                placeholder="Atteinte de la zone de jonction : &x non",
                field_type=FieldType.CHECKBOX,
                source_field="junctional_zone_status",
                transformation_func=lambda x: "X" if not x or x.lower() in ["normale", "normal"] else ""
            ),
            "junctional_zone_oui": TemplateField(
                placeholder="&x oui",
                field_type=FieldType.CHECKBOX,
                source_field="junctional_zone_status",
                transformation_func=lambda x: "X" if x and x.lower() in ["épaissie", "épaisse", "atteinte"] else ""
            ),

            # Adenomyosis
            "adenomyosis_non": TemplateField(
                placeholder="Adénomyose associée : &x non",
                field_type=FieldType.CHECKBOX,
                source_field="adenomyosis_type",
                transformation_func=lambda x: "X" if not x or x.lower() in ["absente", "non"] else ""
            ),
            "adenomyosis_oui": TemplateField(
                placeholder="&x oui :",
                field_type=FieldType.CHECKBOX,
                source_field="adenomyosis_type",
                transformation_func=lambda x: "X" if x and x.lower() in ["diffuse", "focale"] else ""
            ),
            "adenomyosis_diffuse": TemplateField(
                placeholder="&x diffuse",
                field_type=FieldType.CHECKBOX,
                source_field="adenomyosis_type",
                transformation_func=lambda x: "X" if x and "diffuse" in x.lower() else ""
            ),

            # Doppler
            "doppler_ip": TemplateField(
                placeholder="IP : &x",
                field_type=FieldType.MEASUREMENT,
                source_field="doppler_ip",
                transformation_func=self._clean_numeric_value
            ),
            "doppler_ir": TemplateField(
                placeholder="IR : 0,&x",
                field_type=FieldType.MEASUREMENT,
                source_field="doppler_ir",
                transformation_func=self._format_doppler_ir
            ),

            # Conclusion - endometrium
            "conclusion_endometrium": TemplateField(
                placeholder="Endomètre mesuré à &x mm",
                field_type=FieldType.MEASUREMENT,
                source_field="endometrium_thickness",
                transformation_func=self._clean_numeric_value
            ),

            # Conclusions - total CFA (antral follicle count)
            "conclusion_cfa_right": TemplateField(
                placeholder="CFA : &x+",
                field_type=FieldType.MEASUREMENT,
                source_field="right_ovary_cfa",
                transformation_func=self._clean_cfa_value
            ),
            "conclusion_cfa_left": TemplateField(
                placeholder="+&x follicules",
                field_type=FieldType.MEASUREMENT,
                source_field="left_ovary_cfa",
                transformation_func=self._clean_cfa_value
            ),
        }

    def _define_checkbox_logic(self) -> Dict[str, List[str]]:
        """Defines the mutually exclusive checkbox groups."""
        return {
            "uterus_position": ["antéversé", "rétroversé", "intermédiaire", "rétrofléchi", "antéfléchi"],
            "adenomyosis": ["non", "oui"],
            "adenomyosis_type": ["diffuse", "focale", "interne", "externe"],
            "ovary_accessibility": ["rétro-utérin", "fixe", "aisée"]
        }

    def map_extracted_data_to_template(self, extracted_data) -> MappingResult:
        """
        Main entry point: maps the extracted data onto the template.
        """
        logger.info("🔄 Début du mapping vers le template médical")

        filled_template = self.template
        mapped_fields = {}
        unmapped_placeholders = []
        errors = []

        # Step 1: identify every &x placeholder in the template
        all_placeholders = self._find_all_placeholders(filled_template)
        logger.info(f"📍 {len(all_placeholders)} placeholders trouvés dans le template")

        # Step 2: apply the declared mappings
        for mapping_key, template_field in self.field_mappings.items():
            try:
                # Fetch the source value
                source_value = getattr(extracted_data, template_field.source_field, None)

                if source_value:
                    # Apply the transformation
                    if template_field.transformation_func:
                        mapped_value = template_field.transformation_func(source_value)
                    else:
                        mapped_value = str(source_value)

                    # Substitute into the template
                    if mapped_value and mapped_value.strip():
                        filled_template = self._replace_placeholder_in_context(
                            filled_template, template_field.placeholder, mapped_value
                        )
                        mapped_fields[mapping_key] = mapped_value
                        logger.debug(f"✅ {mapping_key}: {mapped_value}")

            except Exception as e:
                error_msg = f"Erreur mapping {mapping_key}: {e}"
                errors.append(error_msg)
                logger.error(error_msg)

        # Step 3: dedicated handling for the ovary section
        filled_template = self._handle_ovary_section(filled_template, extracted_data)

        # Step 4: apply the business-logic rules
        filled_template = self._apply_business_logic(filled_template, extracted_data)

        # Step 5: collect the placeholders that remain unmapped
        remaining_placeholders = self._find_all_placeholders(filled_template)
        unmapped_placeholders = [p for p in remaining_placeholders if "&x" in p]

        # Step 6: compute the mapping confidence score
        mapping_confidence = self._calculate_mapping_confidence(
            len(mapped_fields), len(all_placeholders), len(errors)
        )

        logger.info(f"✅ Mapping terminé - {len(mapped_fields)} champs mappés, {len(unmapped_placeholders)} non mappés")

        return MappingResult(
            filled_template=filled_template,
            mapped_fields=mapped_fields,
            unmapped_placeholders=unmapped_placeholders,
            mapping_confidence=mapping_confidence,
            errors=errors
        )

    def _handle_ovary_section(self, template: str, extracted_data) -> str:
        """Handles the ovary section specifically."""

        # Right ovary dimensions
        if hasattr(extracted_data, 'right_ovary_dimensions') and extracted_data.right_ovary_dimensions:
            dimensions = self._parse_dimensions(extracted_data.right_ovary_dimensions)
            if len(dimensions) >= 2:
                template = self._replace_ovary_dimensions(template, "droit", dimensions[0], dimensions[1])

        # Right ovary CFA
        if hasattr(extracted_data, 'right_ovary_cfa') and extracted_data.right_ovary_cfa:
            cfa_value = self._clean_cfa_value(extracted_data.right_ovary_cfa)
            template = self._replace_ovary_cfa(template, "droit", cfa_value)

        # Right ovary accessibility
        if hasattr(extracted_data, 'right_ovary_accessibility') and extracted_data.right_ovary_accessibility:
            template = self._replace_ovary_accessibility(template, "droit", extracted_data.right_ovary_accessibility)

        # Left ovary dimensions
        if hasattr(extracted_data, 'left_ovary_dimensions') and extracted_data.left_ovary_dimensions:
            dimensions = self._parse_dimensions(extracted_data.left_ovary_dimensions)
            if len(dimensions) >= 2:
                template = self._replace_ovary_dimensions(template, "gauche", dimensions[0], dimensions[1])

        # Left ovary CFA
        if hasattr(extracted_data, 'left_ovary_cfa') and extracted_data.left_ovary_cfa:
            cfa_value = self._clean_cfa_value(extracted_data.left_ovary_cfa)
            template = self._replace_ovary_cfa(template, "gauche", cfa_value)

        # Left ovary accessibility
        if hasattr(extracted_data, 'left_ovary_accessibility') and extracted_data.left_ovary_accessibility:
            template = self._replace_ovary_accessibility(template, "gauche", extracted_data.left_ovary_accessibility)

        return template

    def _replace_ovary_dimensions(self, template: str, side: str, dim1: str, dim2: str) -> str:
        """Fills in the dimensions of one ovary (side = "droit" or "gauche")."""
        lines = template.split('\n')

        for i, line in enumerate(lines):
            if f"ovaire {side} mesure" in line.lower():
                # Replace the first two &x with the main dimensions
                if "&x x &x mm" in line:
                    line = line.replace("&x x &x mm", f"{dim1} x {dim2} mm", 1)
                lines[i] = line
                break

        return '\n'.join(lines)

    def _replace_ovary_cfa(self, template: str, side: str, cfa_value: str) -> str:
        """Fills in the CFA value of one ovary."""
        lines = template.split('\n')

        for i, line in enumerate(lines):
            if f"ovaire {side}" in line.lower() and i < len(lines) - 1:
                # Look for the CFA line among the next few lines
                for j in range(i, min(i + 3, len(lines))):
                    if "folliculaire CFA &x follicules" in lines[j]:
                        lines[j] = lines[j].replace("&x folliculaire CFA &x follicules", f"{cfa_value} folliculaire CFA")
                        break
                break

        return '\n'.join(lines)

    def _replace_ovary_accessibility(self, template: str, side: str, accessibility: str) -> str:
        """Fills in the accessibility checkbox of one ovary."""
        lines = template.split('\n')

        in_ovary_section = False
        for i, line in enumerate(lines):
            if f"ovaire {side}" in line.lower():
                in_ovary_section = True
            elif in_ovary_section and "Accessibilité" in line:
                # Decide which box to tick
                if "rétro" in accessibility.lower():
                    line = line.replace("Accessibilité : &x rétro-utérin", "Accessibilité : X rétro-utérin")
                elif "fixe" in accessibility.lower():
                    line = line.replace("&x fixe", "X fixe")
                else:  # normal or easy access
                    line = line.replace("&x aisée", "X aisée")

                lines[i] = line
                in_ovary_section = False
                break

        return '\n'.join(lines)

    def _parse_dimensions(self, dimensions_str: str) -> List[str]:
        """Parses dimensions out of a free-text string."""
        if not dimensions_str:
            return []

        # Extract every number (integer or decimal, comma or dot)
        matches = re.findall(r'(\d+(?:[.,]\d+)?)', dimensions_str)
        return [match.replace(',', '.') for match in matches]

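    # Illustrative behaviour of _parse_dimensions (assumed inputs, not from the
    # test data): every integer or decimal is captured, commas become dots.
    #   self._parse_dimensions("26 x 20 mm")    -> ["26", "20"]
    #   self._parse_dimensions("7,8 x 4,2 cm")  -> ["7.8", "4.2"]
    #   self._parse_dimensions("")              -> []
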
    def _find_all_placeholders(self, template: str) -> List[str]:
        """Finds every &x placeholder in the template, with its surrounding context."""
        pattern = r'[^.]*&x[^.]*'
        matches = re.findall(pattern, template)
        return matches

    def _replace_placeholder_in_context(self, template: str, context_pattern: str, value: str) -> str:
        """Replaces &x only within a specific surrounding context."""
        escaped_pattern = re.escape(context_pattern).replace(r'\&x', r'&x')

        def replace_func(match):
            return match.group(0).replace('&x', value, 1)

        return re.sub(escaped_pattern, replace_func, template)

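    # Sketch of the contextual replacement above (illustrative values):
    # re.escape() protects the literal context, then '\&x' is un-escaped so the
    # raw placeholder still matches; only the first '&x' inside each matched
    # context is filled.
    #   self._replace_placeholder_in_context("IP : &x ...", "IP : &x", "3.24")
    #   -> "IP : 3.24 ..."
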
    def _apply_business_logic(self, template: str, extracted_data) -> str:
        """Applies business rules specific to the medical domain."""

        # Defaults for standard examinations
        template = template.replace("- &xVessie vide pendant l'examen", "- XVessie vide pendant l'examen")
        template = template.replace("&x Absence de dilatation pyélo-calicielle", "X Absence de dilatation pyélo-calicielle")
        template = template.replace("&x Absence d'image d'endométriose visible ce jour", "X Absence d'image d'endométriose visible ce jour")

        return template

    def _calculate_mapping_confidence(self, mapped_count: int, total_placeholders: int, error_count: int) -> float:
        """Computes the confidence score of the mapping."""
        if total_placeholders == 0:
            return 1.0

        base_confidence = mapped_count / total_placeholders
        error_penalty = min(error_count * 0.1, 0.3)

        return max(0.0, base_confidence - error_penalty)

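    # Worked example of the confidence formula (illustrative numbers): with 20
    # fields mapped out of 40 placeholders and 2 errors,
    #   base_confidence = 20 / 40              = 0.5
    #   error_penalty   = min(2 * 0.1, 0.3)    = 0.2  (penalty is capped at 0.3)
    #   confidence      = max(0.0, 0.5 - 0.2)  = 0.3
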
    # Data transformation helpers

    def _clean_numeric_value(self, value: str) -> str:
        """Cleans numeric values (strips units, converts commas to dots)."""
        if not value:
            return ""

        cleaned = re.sub(r'\s*(mm|cm)\s*(mm|cm)', r' \1', str(value))
        cleaned = re.sub(r'\s*(mm|cm).*', r'', cleaned)
        cleaned = cleaned.replace(',', '.').strip()

        return cleaned

    def _clean_cfa_value(self, value: str) -> str:
        """Cleans CFA values down to the bare follicle count."""
        if not value:
            return ""

        cleaned = str(value).replace(' follicules', '').replace(' follicules follicules', '').strip()
        match = re.search(r'(\d+)', cleaned)
        return match.group(1) if match else cleaned

    def _extract_first_dimension(self, dimensions: str) -> str:
        """Extracts the first dimension."""
        if not dimensions:
            return ""

        match = re.search(r'(\d+(?:[.,]\d+)?)', dimensions)
        return match.group(1).replace(',', '.') if match else ""

    def _extract_second_dimension(self, dimensions: str) -> str:
        """Extracts the second dimension."""
        if not dimensions:
            return ""

        matches = re.findall(r'(\d+(?:[.,]\d+)?)', dimensions)
        return matches[1].replace(',', '.') if len(matches) > 1 else ""

    def _extract_third_dimension(self, dimensions: str) -> str:
        """Extracts the third dimension."""
        if not dimensions:
            return ""

        matches = re.findall(r'(\d+(?:[.,]\d+)?)', dimensions)
        return matches[2].replace(',', '.') if len(matches) > 2 else ""

    def _format_doppler_ir(self, ir_value: str) -> str:
        """Formats the IR value for the template (which already contains "0,")."""
        if not ir_value:
            return ""

        cleaned = self._clean_numeric_value(ir_value)

        if cleaned.startswith('0.'):
            return cleaned[2:]
        elif '.' in cleaned:
            return cleaned.split('.')[1]

        return cleaned

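    # Illustrative trace of _format_doppler_ir: because the template slot is
    # "IR : 0,&x", only the fractional digits are returned.
    #   "0,91" -> cleaned to "0.91" -> "91"  (fills the slot as "IR : 0,91")
    #   "0.75" -> "75"
    #   "91"   -> "91"  (no decimal point, returned unchanged)
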
    def print_mapping_report(self, result: MappingResult) -> str:
        """Builds a formatted mapping report."""
        report = "🔄 RAPPORT DE MAPPING TEMPLATE\n"
        report += "=" * 50 + "\n\n"

        report += f"📊 STATISTIQUES:\n"
        report += f"   Champs mappés: {len(result.mapped_fields)}\n"
        report += f"   Placeholders non mappés: {len(result.unmapped_placeholders)}\n"
        report += f"   Score de confiance: {result.mapping_confidence:.1%}\n"
        report += f"   Erreurs: {len(result.errors)}\n\n"

        if result.mapped_fields:
            report += "✅ CHAMPS MAPPÉS:\n"
            for field, value in result.mapped_fields.items():
                report += f"   {field}: {value}\n"
            report += "\n"

        if result.unmapped_placeholders:
            report += "❌ PLACEHOLDERS NON MAPPÉS:\n"
            for placeholder in result.unmapped_placeholders[:10]:
                report += f"   {placeholder[:50]}...\n"
            if len(result.unmapped_placeholders) > 10:
                report += f"   ... et {len(result.unmapped_placeholders) - 10} autres\n"
            report += "\n"

        if result.errors:
            report += "⚠️ ERREURS:\n"
            for error in result.errors:
                report += f"   {error}\n"

        return report


# Example class holding the extracted sample data
class ExtractedData:
    """Example class for the extracted data."""
    def __init__(self):
        self.uterus_position = "antéversé"
        self.uterus_size = "7,8 cm"
        self.hysterometry = "60 mm"
        self.endometrium_thickness = "3,7 mm"
        self.junctional_zone_status = "épaissie"
        self.adenomyosis_type = "diffuse"

        # Ovary data with the exact sample values
        self.right_ovary_dimensions = "26 x 20 mm"
        self.right_ovary_cfa = "5 follicules"
        self.right_ovary_accessibility = "normale"

        self.left_ovary_dimensions = "25 x 19 mm"
        self.left_ovary_cfa = "22 follicules"
        self.left_ovary_accessibility = "difficile rétro-utérine"

        # Doppler data
        self.doppler_ip = "3,24"
        self.doppler_ir = "0,91"


def test_corrected_ovary_mapping():
    """Tests the corrected ovary mapping."""

    data = ExtractedData()
    mapper = MedicalTemplateMapper()
    result = mapper.map_extracted_data_to_template(data)

    print("🔧 TEST DU MAPPING OVAIRES CORRIGÉ")
    print("=" * 40)
    print(mapper.print_mapping_report(result))

    print("\n🔍 SECTION OVAIRES DANS LE RÉSULTAT:")
    print("-" * 40)

    # Extract and display the ovary section
    lines = result.filled_template.split('\n')
    ovary_section = []
    in_ovary_section = False

    for line in lines:
        if "KISSING OVARIES" in line:
            in_ovary_section = True
        elif in_ovary_section and line.strip() == "":
            if ovary_section:  # Stop once some lines have been collected
                break

        if in_ovary_section:
            ovary_section.append(line)
            if "L'échostructure des deux ovaires" in line:
                break

    print('\n'.join(ovary_section))

    return result.filled_template


if __name__ == "__main__":
    filled_report = test_corrected_ovary_mapping()
test2.py
ADDED
@@ -0,0 +1,84 @@
import os
import glob

def delete_generated_templates(folder_path="data_txt", start_number=419, end_number=1244):
    """
    Deletes the generated template files within a given range.

    Args:
        folder_path: Path to the folder containing the files
        start_number: First file number (inclusive)
        end_number: Last file number (inclusive)
    """

    if not os.path.exists(folder_path):
        print(f"❌ Le dossier '{folder_path}' n'existe pas!")
        return

    print(f"🗑️ Suppression des fichiers de template{start_number}.txt à template{end_number}.txt")
    print(f"📁 Dossier: {folder_path}")

    deleted_count = 0
    not_found_count = 0
    error_count = 0

    # Walk through every number in the range
    for file_number in range(start_number, end_number + 1):
        filename = f"template{file_number}.txt"
        filepath = os.path.join(folder_path, filename)

        try:
            if os.path.exists(filepath):
                os.remove(filepath)
                deleted_count += 1
                if deleted_count % 50 == 0:  # Report progress every 50 files
                    print(f"   ✅ {deleted_count} fichiers supprimés...")
            else:
                not_found_count += 1

        except Exception as e:
            print(f"❌ Erreur lors de la suppression de {filename}: {e}")
            error_count += 1

    print(f"\n📊 RÉSUMÉ DE LA SUPPRESSION:")
    print(f"   ✅ Fichiers supprimés: {deleted_count}")
    print(f"   ⚠️ Fichiers non trouvés: {not_found_count}")
    print(f"   ❌ Erreurs: {error_count}")
    print(f"   📝 Plage traitée: template{start_number}.txt → template{end_number}.txt")

    if deleted_count > 0:
        print(f"\n🎉 Nettoyage terminé! {deleted_count} fichiers supprimés avec succès.")
    else:
        print(f"\n💭 Aucun fichier à supprimer dans cette plage.")

def confirm_deletion(start_number, end_number):
    """
    Asks for confirmation before deleting.
    """
    total_files = end_number - start_number + 1
    print(f"⚠️ ATTENTION: Vous allez supprimer {total_files} fichiers!")
    print(f"📄 De template{start_number}.txt à template{end_number}.txt")

    response = input("Êtes-vous sûr? (oui/non): ").lower().strip()

    if response in ['oui', 'o', 'yes', 'y']:
        return True
    else:
        print("❌ Suppression annulée.")
        return False

if __name__ == "__main__":
    print("=== 🗑️ SUPPRESSION DES FICHIERS GÉNÉRÉS ===")

    start_num = 419
    end_num = 1244

    # Ask for confirmation first
    if confirm_deletion(start_num, end_num):
        delete_generated_templates(
            folder_path="data_txt",
            start_number=start_num,
            end_number=end_num
        )

    print("\n=== Script terminé ===")
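Before confirming a destructive run like the one above, a dry-run can help. A minimal sketch under the same folder layout; the preview_deletion helper is hypothetical, not part of this commit:

import os

def preview_deletion(folder_path="data_txt", start_number=419, end_number=1244):
    # Hypothetical dry-run companion: lists the template files that
    # delete_generated_templates would remove, without deleting anything.
    existing = [
        f"template{n}.txt"
        for n in range(start_number, end_number + 1)
        if os.path.exists(os.path.join(folder_path, f"template{n}.txt"))
    ]
    print(f"{len(existing)} fichier(s) seraient supprimés")
    return existing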
test_complete_pipeline.py
ADDED
@@ -0,0 +1,219 @@
#!/usr/bin/env python3
"""
Complete Pipeline Test
Tests the full pipeline including Langfuse transcription download
"""

import os
import sys
import time
from pathlib import Path
from datetime import datetime

# Add the current directory to Python path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))


def test_complete_pipeline():
    """Test the complete pipeline including Langfuse transcription download."""
    print("🏥 Complete Medical Document Pipeline Test")
    print("=" * 70)
    print("This test will:")
    print("1. Download transcriptions from Langfuse")
    print("2. Run the complete document processing pipeline")
    print("3. Validate the results")
    print("=" * 70)

    # Step 1: Download transcriptions from Langfuse
    print("\n📥 Step 1: Downloading transcriptions from Langfuse...")
    try:
        from medical_transcription_retriever import MedicalTranscriptionRetriever

        retriever = MedicalTranscriptionRetriever()
        saved_files = retriever.run(limit=5, save_to_file=True, save_by_user=True)

        if not saved_files:
            print("❌ No transcriptions downloaded from Langfuse")
            print("Please check your Langfuse configuration and try again")
            return None

        print(f"✅ Successfully downloaded transcriptions: {len(saved_files)} files")

    except Exception as e:
        print(f"❌ Error downloading transcriptions: {e}")
        print("Continuing with existing transcriptions if available...")

    # Step 2: Check if we have transcription files
    transcriptions_dir = "transcriptions"
    if not os.path.exists(transcriptions_dir):
        print(f"❌ Transcriptions directory not found: {transcriptions_dir}")
        return None

    transcription_files = list(Path(transcriptions_dir).glob("*.json"))
    if not transcription_files:
        print(f"❌ No transcription files found in {transcriptions_dir}")
        return None

    print(f"📁 Found {len(transcription_files)} transcription files")

    # Step 3: Test with the first transcription file
    first_transcription = transcription_files[0]
    print(f"📄 Using transcription file: {first_transcription.name}")

    try:
        # Step 4: Initialize the orchestrator
        print("\n🚀 Step 2: Initializing orchestrator with automatic SFTP model detection...")
        from langchain_medical_agents_refactored import MedicalDocumentOrchestrator

        orchestrator = MedicalDocumentOrchestrator(
            template_path=None,  # Let the SFTP agent find the template
            transcription_path=str(first_transcription),
            transcriptions_dir=transcriptions_dir
        )

        # Step 5: Run the complete pipeline
        print("\n🔄 Step 3: Running complete pipeline...")
        print("This will include:")
        print("   📥 Step 0: SFTP Download (.rtf → .doc) - AUTOMATIC MODEL DETECTION")
        print("   📋 Step 1: Template Analysis")
        print("   ✏️ Step 2: Transcription Correction")
        print("   🔬 Step 3: Medical Data Analysis")
        print("   📝 Step 4: Title Generation")
        print("   📝 Step 5: Section Generation")
        print("   📄 Step 6: Document Assembly")
        print("   📋 Step 7: Validation")

        start_time = time.time()
        output_file = orchestrator.run_full_pipeline()
        end_time = time.time()

        execution_time = end_time - start_time
        print(f"\n⏱️ Pipeline execution time: {execution_time:.2f} seconds")

        print(f"\n🎉 Pipeline completed successfully!")
        print(f"📄 Output file: {output_file}")

        # Step 6: Show SFTP download summary
        if orchestrator.downloaded_models:
            successful_downloads = [m for m in orchestrator.downloaded_models if m['status'] == 'success']
            failed_downloads = [m for m in orchestrator.downloaded_models if m['status'] == 'error']

            print(f"\n📥 SFTP Download Summary:")
            print(f"   ✅ Successfully downloaded: {len(successful_downloads)} models")
            print(f"   ❌ Failed downloads: {len(failed_downloads)} models")

            if successful_downloads:
                print("   📁 Downloaded models:")
                for model in successful_downloads[:5]:  # Show first 5
                    print(f"      - {model['model_id']}: {model['local_filename']}")
                if len(successful_downloads) > 5:
                    print(f"      ... and {len(successful_downloads) - 5} more")

        # Step 7: Verify output file exists
        if os.path.exists(output_file):
            file_size = os.path.getsize(output_file)
            print(f"\n✅ Output file verified:")
            print(f"   📄 File: {output_file}")
            print(f"   📏 Size: {file_size} bytes")

            # Check if file is readable
            try:
                from docx import Document
                doc = Document(output_file)
                paragraph_count = len(doc.paragraphs)
                print(f"   📝 Paragraphs: {paragraph_count}")
                print(f"   ✅ Document is readable and valid")
            except Exception as e:
                print(f"   ⚠️ Document validation failed: {e}")
        else:
            print(f"\n❌ Output file not found: {output_file}")

        return output_file

    except Exception as e:
        print(f"❌ Error running pipeline: {str(e)}")
        import traceback
        traceback.print_exc()
        return None


def cleanup_test_files():
    """Clean up test files after testing."""
    print("\n🧹 Cleaning up test files...")

    # Remove generated documents
    for file in Path("./transcriptions").glob("*.json"):
        try:
            os.remove(file)
            print(f"🗑️ Removed: {file}")
        except Exception as e:
            print(f"⚠️ Could not remove {file}: {e}")

    for file in Path("./").glob("*.docx"):
        try:
            os.remove(file)
            print(f"🗑️ Removed: {file}")
        except Exception as e:
            print(f"⚠️ Could not remove {file}: {e}")

    for file in Path("./").glob("*.json"):
        try:
            os.remove(file)
            print(f"🗑️ Removed: {file}")
        except Exception as e:
            print(f"⚠️ Could not remove {file}: {e}")

    # Remove downloaded models
    models_dir = "models"
    if os.path.exists(models_dir):
        for file in Path(models_dir).glob("*.doc"):
            try:
                os.remove(file)
                print(f"🗑️ Removed: {file}")
            except Exception as e:
                print(f"⚠️ Could not remove {file}: {e}")


def main():
    """Main test function."""
    print("🧪 Complete Pipeline Test with Langfuse Integration")
    print("=" * 70)

    # Check if we're in the right directory
    if not os.path.exists("transcriptions"):
        print("❌ Please run this script from the project root directory")
        print("   (where the 'transcriptions' folder is located)")
        return

    # Show current configuration
    try:
        from sftp_config import print_sftp_config
        print_sftp_config()
    except ImportError:
        print("⚠️ SFTP config not available")

    # Run the complete pipeline test
    result = test_complete_pipeline()

    if result:
        print(f"\n🎉 Complete pipeline test completed successfully!")
        print(f"📄 Generated document: {result}")

        # Ask if user wants to clean up
        cleanup = input("\n🧹 Do you want to clean up test files? (y/n): ").lower().strip()
        if cleanup in ['y', 'yes']:
            cleanup_test_files()
    else:
        print(f"\n❌ Complete pipeline test failed")


if __name__ == "__main__":
    main()
test_langfuse.py
ADDED
@@ -0,0 +1,32 @@
from langchain_openai import ChatOpenAI
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.utilities import SerpAPIWrapper  # imported but unused in this demo
from langchain.tools import tool

# 1. Define the LLM (the brain)
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# 2. Define tools the agent can use
@tool
def calculator(expression: str) -> str:
    """Useful for solving math problems."""
    try:
        # Note: eval() executes arbitrary Python; acceptable only for a local demo.
        result = eval(expression)
        return str(result)
    except Exception:
        return "Error in calculation."

tools = [calculator]

# 3. Initialize the Agent
agent = initialize_agent(
    tools,  # the toolbox
    llm,    # the brain
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

# 4. Try it out
print(agent.run("What is 12 * 7 + 5?"))
print(agent.run("Who is the president of France?"))
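Since the calculator tool above passes model-produced text to eval(), a safer variant is worth sketching. A minimal sketch assuming only +, -, *, / arithmetic is needed; the safe_eval helper is hypothetical, not part of this commit:

import ast
import operator

_OPS = {ast.Add: operator.add, ast.Sub: operator.sub,
        ast.Mult: operator.mul, ast.Div: operator.truediv}

def safe_eval(expression: str) -> float:
    # Walk the parsed AST and allow only numeric constants, unary minus,
    # and the four basic operators; everything else raises.
    def walk(node):
        if isinstance(node, ast.BinOp) and type(node.op) in _OPS:
            return _OPS[type(node.op)](walk(node.left), walk(node.right))
        if isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub):
            return -walk(node.operand)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        raise ValueError("unsupported expression")
    return walk(ast.parse(expression, mode="eval").body)

print(safe_eval("12 * 7 + 5"))  # 89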
testt.py
ADDED
@@ -0,0 +1,212 @@
import os

def insert_texts_to_templates(texts_string, start_number=355):
    """
    Inserts texts separated by '*' into the template files.

    Args:
        texts_string: String containing the texts separated by '*'
        start_number: Starting number for the template files
    """

    # Split the texts on '*'
    texts = [text.strip() for text in texts_string.split('*') if text.strip()]

    print(f"Nombre de textes trouvés: {len(texts)}")

    # Check that the folder exists
    if not os.path.exists("data_txt"):
        print("❌ Le dossier 'data_txt' n'existe pas!")
        return

    # Insert each text into its own file
    for i, text in enumerate(texts):
        file_number = start_number + i
        filename = f"template{file_number}.txt"
        filepath = os.path.join("data_txt", filename)

        # Create the file if it does not exist yet
        if not os.path.exists(filepath):
            print(f"⚠️ Le fichier {filename} n'existe pas, création...")
            with open(filepath, 'w', encoding='utf-8') as f:
                pass

        # Write the text into the file
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(text)

        print(f"✅ Texte inséré dans {filename}: '{text[:50]}{'...' if len(text) > 50 else ''}'")

    print(f"\n🎉 {len(texts)} textes insérés avec succès!")
    print(f"De template{start_number}.txt à template{start_number + len(texts) - 1}.txt")

# Example usage
if __name__ == "__main__":
    # Replace this variable with your real texts
    mes_textes = """
Bilan cardiologique
Indication : exploration d’une dyspnée d’effort et palpitations.

Échocardiographie transthoracique : ventricule gauche de dimensions normales (51 mm de diamètre télédiastolique), fonction systolique préservée (FEVG 60%). Hypertrophie légère du septum interventriculaire (13 mm). Oreillette gauche dilatée (volume 42 ml/m²).

Valves : valve mitrale souple, sans prolapsus, insuffisance mitrale minime. Valve aortique tricuspide, calcifications modérées, pas de sténose.

Pression artérielle pulmonaire estimée : 32 mmHg.

Doppler cardiaque : flux transmitral restrictif, compatible avec une dysfonction diastolique grade I.

ECG 12 dérivations : rythme sinusal, fréquence 78/min, repolarisation normale, absence de trouble de conduction.

Test d’effort : bonne tolérance, absence de signes ischémiques, tension artérielle maximale 175/92 mmHg.

Conclusion : cardiopathie hypertensive débutante avec atteinte diastolique. Suivi cardiologique et optimisation du traitement antihypertenseur recommandés.

*
Exploration fonctionnelle respiratoire
Indication : patient tabagique chronique (25 PA) présentant une toux persistante.

Spirométrie : VEMS = 2,1 L (65% de la valeur théorique), CVF = 3,5 L (82%), rapport VEMS/CVF = 59%.

Pléthysmographie : CPT = 6,2 L (110%), VR = 2,8 L (135%), traduisant une distension thoracique.

Diffusion du CO (DLCO) : abaissée à 62%.

Gaz du sang artériel : PaO₂ = 72 mmHg, PaCO₂ = 39 mmHg, pH = 7,41.

Scanner thoracique : zones d’emphysème centrolobulaire prédominant aux lobes supérieurs, pas de nodule suspect.

Conclusion : BPCO stade GOLD II. Sevrage tabagique impératif et mise en route d’un traitement bronchodilatateur de longue durée d’action.

*
Endoscopie digestive haute
Indication : douleurs épigastriques récidivantes et reflux acide.

Œsophage : muqueuse inflammatoire, classification de Los Angeles B. Absence de sténose ou de métaplasie de type Barrett.

Estomac : antrite érythémateuse diffuse, plis gastriques réguliers. Présence d’un ulcère bulbaire de 6 mm, fond fibrineux.

Duodénum : muqueuse normale, pas d’atrophie villositaire.

Test rapide à l’uréase : positif, suggérant une infection à Helicobacter pylori.

Biopsies gastriques : en attente d’analyse histologique.

Conclusion : reflux gastro-œsophagien avec ulcère duodénal associé à Helicobacter pylori. Traitement par IPP et trithérapie antibiotique recommandé.

*
IRM du genou droit
Indication : gonalgies chroniques avec suspicion de pathologie méniscale.

Os : absence de fracture, discrète ostéophytose fémoro-tibiale interne.

Ménisques : fissure longitudinale du corne postérieure du ménisque interne.

Cartilage : amincissement focal du cartilage fémoro-tibial interne.

Ligaments : LCA et LCP intacts. Discrète laxité du ligament collatéral interne.

Épanchement articulaire : modéré, synovite réactionnelle.

Conclusion : méniscopathie interne avec début d’arthrose fémoro-tibiale interne. Indication de viscosupplémentation et kinésithérapie.

*
Bilan thyroïdien
Indication : patient présentant un goitre diffus et tachycardie.

TSH ultrasensible : <0,01 mUI/L (abaissée).

T4 libre : 34 pmol/L (augmentée).

T3 libre : 9 pmol/L (augmentée).

Anticorps anti-récepteurs de la TSH : positifs.

Échographie thyroïdienne : glande augmentée de volume (50 ml), vascularisation diffuse au Doppler. Nodules absents.

Conclusion : hyperthyroïdie de type Basedow. Indication d’un traitement par antithyroïdiens de synthèse et avis spécialisé en endocrinologie.

*
Myélogramme
Indication : anémie macrocytaire avec suspicion de syndrome myélodysplasique.

Moelle osseuse : riche, hypercellulaire.

Lignée érythroïde : anomalies de maturation avec formes mégaloblastiques.

Lignée granulocytaire : dysgranulopoïèse, neutrophiles hypogranulés.

Lignée mégacaryocytaire : micromégacaryocytes nombreux.

Blastes : 4%.

Caryotype : anomalie 5q-.

Conclusion : syndrome myélodysplasique avec délétion 5q. Discussion d’un traitement par lénalidomide.

*
IRM cérébrale
Indication : suspicion de sclérose en plaques.

Substance blanche : multiples hypersignaux en T2 et FLAIR, prédominant en péri-ventriculaire et sous-tentoriel.

Prise de contraste : deux lésions rehaussées par le gadolinium, témoignant d’une activité inflammatoire en cours.

Corps calleux : amincissement diffus.

Nerfs optiques : normalité de signal.

Conclusion : lésions démyélinisantes disséminées dans le temps et l’espace, compatible avec sclérose en plaques. Avis neurologique pour mise en route d’un traitement immunomodulateur.

*
Dermatologie – biopsie cutanée
Indication : plaques érythémato-squameuses chroniques au niveau des coudes et genoux.

Histologie : hyperkératose parakératosique, acanthose régulière, infiltration lymphocytaire du derme superficiel.

Immunofluorescence directe : négative.

Bilan sanguin : CRP légèrement augmentée (8 mg/L).

Conclusion : psoriasis en plaques. Discussion d’un traitement par photothérapie UVB ou biothérapie anti-TNF en cas de résistance.

*
Examen ophtalmologique complet
Indication : baisse progressive de l’acuité visuelle bilatérale.

Acuité visuelle : 6/10 OD, 7/10 OG.

Pression intraoculaire : 17 mmHg bilatérale.

Fond d’œil : dépôts drusénoïdes maculaires, début d’atrophie de l’épithélium pigmentaire.

OCT maculaire : épaississement rétinien central, zones d’hyporéflectivité compatibles avec une DMLA débutante.

Conclusion : dégénérescence maculaire liée à l’âge (DMLA sèche débutante). Suivi régulier avec supplémentation en vitamines antioxydantes.

*

Nasofibroscopie
Indication : obstruction nasale chronique et épistaxis récidivants.

Fosses nasales : muqueuse hypertrophiée, cornet inférieur gauche volumineux.

Septum nasal : déviation droite.

Rhinopharynx : végétations adénoïdes persistantes.

Sinus maxillaires : sécrétion purulente visible à l’orifice du méat moyen.

Biopsie : en attente pour éliminer un processus tumoral.

Conclusion : polypose naso-sinusienne avec sinusite chronique. Indication d’une chirurgie endoscopique naso-sinusienne.
"""
    print("=== Script d'insertion de textes ===")
    print(f"Textes à insérer: {mes_textes}")
    print(f"Démarrage à partir de: template409.txt\n")

    insert_texts_to_templates(mes_textes, start_number=409)

    print("\n=== Pour utiliser avec vos propres textes ===")
    print("Modifiez la variable 'mes_textes' avec vos textes séparés par '*'")
    print("Ou appelez: insert_texts_to_templates('texte1 * texte2 * texte3', 355)")
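A minimal usage sketch for the function above, with illustrative input (it assumes data_txt/ exists; the two report strings are placeholders):

# Two reports separated by '*' are written to template355.txt and template356.txt:
insert_texts_to_templates("Rapport A : examen normal. * Rapport B : examen normal.", start_number=355)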
testt1.py
ADDED
@@ -0,0 +1,341 @@
import os
import random
import spacy
import nltk
from nltk.corpus import wordnet
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from textblob import TextBlob
import requests
import json
from pathlib import Path
import torch

class AdvancedTextAugmenter:
    def __init__(self):
        self.setup_dependencies()
        self.setup_models()

    def setup_dependencies(self):
        """Sets up the required dependencies."""
        try:
            # Download the required NLTK resources
            nltk.download('wordnet', quiet=True)
            nltk.download('averaged_perceptron_tagger', quiet=True)
            nltk.download('punkt', quiet=True)

            # Load the French spaCy model
            try:
                self.nlp = spacy.load("fr_core_news_sm")
            except OSError:
                print("Modèle spaCy français non trouvé. Installation...")
                os.system("python -m spacy download fr_core_news_sm")
                self.nlp = spacy.load("fr_core_news_sm")

        except Exception as e:
            print(f"Erreur lors de la configuration: {e}")
            print("Installez les dépendances avec: pip install spacy nltk transformers textblob torch")

    def setup_models(self):
        """Sets up the transformation models."""
        try:
            # T5-based paraphraser
            self.paraphraser = pipeline(
                "text2text-generation",
                model="plguillou/t5-base-fr-sum-cnndm",
                tokenizer="plguillou/t5-base-fr-sum-cnndm",
                device=0 if torch.cuda.is_available() else -1
            )

            # Translation models for back-translation
            self.translator_fr_en = pipeline(
                "translation_fr_to_en",
                model="Helsinki-NLP/opus-mt-fr-en",
                device=0 if torch.cuda.is_available() else -1
            )

            self.translator_en_fr = pipeline(
                "translation_en_to_fr",
                model="Helsinki-NLP/opus-mt-en-fr",
                device=0 if torch.cuda.is_available() else -1
            )

        except Exception as e:
            print(f"Erreur lors du chargement des modèles: {e}")
            print("Utilisation de méthodes alternatives...")
            self.paraphraser = None
            self.translator_fr_en = None
            self.translator_en_fr = None

    def get_wordnet_synonyms(self, word, pos_tag):
        """Fetches synonyms via WordNet."""
        synonyms = set()

        # Convert NLTK POS tags to the WordNet format
        wordnet_pos = self.get_wordnet_pos(pos_tag)

        if wordnet_pos:
            for syn in wordnet.synsets(word, pos=wordnet_pos, lang='fra'):
                for lemma in syn.lemmas(lang='fra'):
                    synonym = lemma.name().replace('_', ' ')
                    if synonym.lower() != word.lower():
                        synonyms.add(synonym)

        return list(synonyms)

    def get_wordnet_pos(self, treebank_tag):
        """Converts POS tags to the WordNet format."""
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return None

    def synonym_replacement(self, text, replace_ratio=0.3):
        """Method 1: synonym replacement via WordNet and spaCy (fixed version)."""
        doc = self.nlp(text)
        result_tokens = []

        for token in doc:
            # Preserve the whitespace before the token
            if token.i > 0:
                # Re-insert the characters between the previous token and this one
                prev_token = doc[token.i - 1]
                spaces_between = text[prev_token.idx + len(prev_token.text):token.idx]
                result_tokens.append(spaces_between)

            if (not token.is_stop and not token.is_punct and
                    token.pos_ in ['NOUN', 'ADJ', 'VERB', 'ADV'] and
                    random.random() < replace_ratio):

                # Try WordNet first
                synonyms = self.get_wordnet_synonyms(token.lemma_, token.tag_)

                if synonyms:
                    synonym = random.choice(synonyms)
                    # Preserve capitalisation
                    if token.text[0].isupper():
                        synonym = synonym.capitalize()
                    result_tokens.append(synonym)
                else:
                    result_tokens.append(token.text)
            else:
                result_tokens.append(token.text)

        # MAIN FIX: plain join, since inter-token whitespace was preserved above
        return ''.join(result_tokens)

    def back_translation(self, text):
        """Method 2: back-translation FR->EN->FR."""
        if not self.translator_fr_en or not self.translator_en_fr:
            return self.fallback_paraphrase(text)

        try:
            # Translate to English
            english = self.translator_fr_en(text, max_length=512)[0]['translation_text']

            # Translate back to French
            back_translated = self.translator_en_fr(english, max_length=512)[0]['translation_text']

            return back_translated

        except Exception as e:
            print(f"Erreur back-translation: {e}")
            return self.fallback_paraphrase(text)

    def neural_paraphrasing(self, text):
        """Method 3: neural paraphrasing with T5."""
        if not self.paraphraser:
            return self.fallback_paraphrase(text)

        try:
            # Task prefix for paraphrasing
            input_text = f"paraphrase: {text}"

            result = self.paraphraser(
                input_text,
                max_length=len(text.split()) * 2,
                num_return_sequences=1,
                temperature=0.8,
                do_sample=True
            )

            return result[0]['generated_text']

        except Exception as e:
            print(f"Erreur paraphrase neuronale: {e}")
            return self.fallback_paraphrase(text)

    def fallback_paraphrase(self, text):
        """Fallback method using linguistic transformations (fixed version)."""
        doc = self.nlp(text)

        # Reorganise the sentences
        sentences = [sent.text.strip() for sent in doc.sents]

        paraphrased_sentences = []
        for sentence in sentences:
            sent_doc = self.nlp(sentence)

            # Simple syntactic transformations, preserving whitespace
            result_tokens = []
            for token in sent_doc:
                # Preserve whitespace
                if token.i > 0:
                    prev_token = sent_doc[token.i - 1]
                    spaces_between = sentence[prev_token.idx + len(prev_token.text):token.idx]
                    result_tokens.append(spaces_between)

                if token.pos_ == 'ADP':  # Prepositions
                    prep_alternatives = {
                        'dans': 'à travers', 'sur': 'au-dessus de',
                        'avec': 'en compagnie de', 'pour': 'en faveur de'
                    }
                    result_tokens.append(prep_alternatives.get(token.text.lower(), token.text))
                else:
                    result_tokens.append(token.text)

            paraphrased_sentences.append(''.join(result_tokens))

        return ' '.join(paraphrased_sentences)

    def contextual_word_insertion(self, text, insert_ratio=0.1):
        """Method 4: contextual word insertion (fixed version)."""
        doc = self.nlp(text)
        result = ""

        adverb_intensifiers = ['vraiment', 'particulièrement', 'extrêmement', 'assez', 'plutôt']
        conjunctions = ['également', 'aussi', 'de plus', 'par ailleurs']

        for i, token in enumerate(doc):
            # Add the whitespace before the token if needed
            if token.i > 0:
                prev_token = doc[token.i - 1]
                spaces_between = text[prev_token.idx + len(prev_token.text):token.idx]
                result += spaces_between

            # Insert adverbs before adjectives
            if (token.pos_ == 'ADJ' and random.random() < insert_ratio):
                result += random.choice(adverb_intensifiers) + " "

            result += token.text

            # Insert conjunctions at sentence boundaries
            if (token.text in ['.', '!', '?'] and i < len(doc) - 1 and
                    random.random() < insert_ratio):
                result += " " + random.choice(conjunctions) + ","

        return result

    def process_single_file(self, file_path, output_counter):
        """Processes one file and generates its variations."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                original_text = f.read().strip()

            if not original_text:
                return output_counter

            print(f"Traitement de: {file_path.name}")

            # Variation 1: synonyms + contextual insertion
            print("  → Génération variation 1 (synonymes + insertion)...")
            variation_1 = self.synonym_replacement(original_text)
            variation_1 = self.contextual_word_insertion(variation_1)

            # Variation 2: back-translation OR neural paraphrasing
            print("  → Génération variation 2 (back-translation/paraphrase)...")
            if random.choice([True, False]):
                variation_2 = self.back_translation(original_text)
            else:
                variation_2 = self.neural_paraphrasing(original_text)

            # Save the variations
            output_file_1 = f"template{output_counter}.txt"
            with open(output_file_1, 'w', encoding='utf-8') as f:
                f.write(variation_1)

            output_file_2 = f"template{output_counter + 1}.txt"
            with open(output_file_2, 'w', encoding='utf-8') as f:
                f.write(variation_2)

            print(f"  ✓ Créé: {output_file_1}, {output_file_2}")

            return output_counter + 2

        except Exception as e:
            print(f"Erreur lors du traitement de {file_path}: {e}")
            return output_counter

    def augment_dataset(self, input_directory=".", output_prefix="template", start_number=419):
        """Processes every text file in the directory."""

        print("=== AUGMENTATION AVANCÉE DE DONNÉES TEXTUELLES ===\n")

        # Find every .txt file
        text_files = sorted(list(Path(input_directory).glob("*.txt")))

        if not text_files:
            print("❌ Aucun fichier .txt trouvé dans le répertoire.")
            return

        print(f"📁 Trouvé {len(text_files)} fichiers à traiter...")
        print(f"🚀 Démarrage de la génération à partir de {output_prefix}{start_number}.txt\n")

        output_counter = start_number
        processed_files = 0

        for file_path in text_files:
            output_counter = self.process_single_file(file_path, output_counter)
            processed_files += 1

            if processed_files % 50 == 0:
                print(f"📊 Progression: {processed_files}/{len(text_files)} fichiers traités\n")

        total_generated = output_counter - start_number
        print(f"\n🎉 TERMINÉ!")
        print(f"📈 Statistiques:")
        print(f"  • Fichiers originaux: {len(text_files)}")
        print(f"  • Nouveaux fichiers générés: {total_generated}")
        print(f"  • Total final: {len(text_files) + total_generated}")
        print(f"  • Facteur de multiplication: x{(len(text_files) + total_generated) / len(text_files):.1f}")

# Automatic dependency installation
def install_dependencies():
    """Installs the required packages."""
    import subprocess
    import sys

    packages = [
        "spacy", "nltk", "transformers", "textblob", "torch", "sentencepiece"
    ]

    for package in packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        except Exception:
            print(f"Impossible d'installer {package}")

# Usage
if __name__ == "__main__":
    print("Vérification des dépendances...")

    try:
        augmenter = AdvancedTextAugmenter()

        # Run the augmentation
        augmenter.augment_dataset(
            input_directory="data_txt",  # folder containing the source .txt files
|
| 333 |
+
output_prefix="template",
|
| 334 |
+
start_number=419
|
| 335 |
+
)
|
| 336 |
+
|
| 337 |
+
except ImportError as e:
|
| 338 |
+
print(f"Dépendances manquantes: {e}")
|
| 339 |
+
print("Installation automatique...")
|
| 340 |
+
install_dependencies()
|
| 341 |
+
print("Relancez le script après l'installation.")
|
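Note: since each successfully processed input yields exactly two output files, the numbering above is deterministic — a run over N inputs starting at start_number=419 ends at template(419 + 2N - 1).txt. A minimal sketch of that arithmetic (a hypothetical helper, not part of the script; it assumes no input file is empty or fails):

def expected_output_range(n_files: int, start_number: int = 419) -> tuple:
    """(first, last) template numbers written for n_files inputs,
    assuming process_single_file succeeds (two variations) for each."""
    return start_number, start_number + 2 * n_files - 1

# 10 inputs starting at 419 -> template419.txt .. template438.txt
assert expected_output_range(10) == (419, 438)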
title_matcher.py
ADDED
@@ -0,0 +1,397 @@
"""
title_matcher.py
Title-based matching system for medical templates
"""

import re
import logging
from typing import Optional, Tuple, List
from dataclasses import dataclass
from difflib import SequenceMatcher

logger = logging.getLogger(__name__)


@dataclass
class TitleMatchResult:
    """Result of title-based matching"""
    transcription_title: str
    template_id: str
    match_type: str  # 'exact', 'normalized', 'fuzzy', 'none'
    confidence: float
    normalized_transcription: str
    normalized_template: str


class TitleBasedMatcher:
    """
    Matches transcriptions to templates by title.
    Exact matching takes priority over semantic matching.
    """

    def __init__(self, parser_instance):
        """
        Initializes the title-based matcher

        Args:
            parser_instance: MedicalTemplateParser instance with the DB loaded
        """
        self.parser = parser_instance
        self.template_titles = self._extract_all_template_titles()
        logger.info(f"📋 {len(self.template_titles)} titres de templates chargés")

    def _extract_all_template_titles(self) -> List[str]:
        """Extracts all available template titles"""
        titles = []
        for template_id in self.parser.templates.keys():
            titles.append(template_id)
        return titles

    def normalize_title(self, title: str) -> str:
        """
        Normalizes a title for matching

        Args:
            title: Raw title (transcription file or template)

        Returns:
            str: Normalized title
        """
        # Strip file extensions
        title = re.sub(r'\.(txt|rtf|docx|doc)$', '', title, flags=re.IGNORECASE)

        # Strip common prefixes
        title = re.sub(r'^(default\.|mod\.)', '', title, flags=re.IGNORECASE)

        # Strip transcription suffixes
        title = re.sub(r'_\d+_radiologie$', '', title, flags=re.IGNORECASE)
        title = re.sub(r'_radiologie$', '', title, flags=re.IGNORECASE)

        # Normalize separators and case
        title = re.sub(r'[_\-\s]+', '.', title)
        title = title.lower().strip('.')

        return title

    def extract_key_identifier(self, title: str) -> str:
        """
        Extracts the key identifier from a title (e.g. 6260.cherry.EXPERTISE)

        Args:
            title: Title to analyze

        Returns:
            str: Key identifier
        """
        normalized = self.normalize_title(title)

        # Look for a numeric pattern followed by words,
        # e.g. 6260.cherry.EXPERTISE
        match = re.search(r'(\d+\.[a-z]+(?:\.[A-Z]+)?)', normalized, flags=re.IGNORECASE)
        if match:
            return match.group(1).lower()

        # Otherwise return the full normalized title
        return normalized

    def calculate_similarity(self, str1: str, str2: str) -> float:
        """
        Computes the similarity between two strings

        Args:
            str1: First string
            str2: Second string

        Returns:
            float: Similarity score in [0-1]
        """
        return SequenceMatcher(None, str1, str2).ratio()

    def find_exact_match(self, transcription_title: str) -> Optional[str]:
        """
        Looks for an exact match with a template

        Args:
            transcription_title: Transcription title

        Returns:
            Optional[str]: Matching template ID, or None
        """
        normalized_trans = self.normalize_title(transcription_title)

        for template_id in self.template_titles:
            normalized_template = self.normalize_title(template_id)

            if normalized_trans == normalized_template:
                logger.info(f"✅ Match EXACT trouvé: {template_id}")
                return template_id

        return None

    def find_key_match(self, transcription_title: str) -> Optional[Tuple[str, float]]:
        """
        Looks for a match based on the key identifier

        Args:
            transcription_title: Transcription title

        Returns:
            Optional[Tuple[str, float]]: (template_id, confidence) or None
        """
        trans_key = self.extract_key_identifier(transcription_title)

        best_match = None
        best_score = 0.0

        for template_id in self.template_titles:
            template_key = self.extract_key_identifier(template_id)

            # Check whether the keys overlap
            if trans_key in template_key or template_key in trans_key:
                similarity = self.calculate_similarity(trans_key, template_key)

                if similarity > best_score:
                    best_score = similarity
                    best_match = template_id

        if best_match and best_score >= 0.7:
            logger.info(f"✅ Match par CLÉ trouvé: {best_match} (score: {best_score:.3f})")
            return best_match, best_score

        return None

    def find_fuzzy_match(self, transcription_title: str, threshold: float = 0.8) -> Optional[Tuple[str, float]]:
        """
        Looks for a fuzzy (approximate) match

        Args:
            transcription_title: Transcription title
            threshold: Minimum similarity threshold

        Returns:
            Optional[Tuple[str, float]]: (template_id, confidence) or None
        """
        normalized_trans = self.normalize_title(transcription_title)

        best_match = None
        best_score = 0.0

        for template_id in self.template_titles:
            normalized_template = self.normalize_title(template_id)

            similarity = self.calculate_similarity(normalized_trans, normalized_template)

            if similarity > best_score and similarity >= threshold:
                best_score = similarity
                best_match = template_id

        if best_match:
            logger.info(f"✅ Match FUZZY trouvé: {best_match} (score: {best_score:.3f})")
            return best_match, best_score

        return None

    def match_by_title(self, transcription_title: str,
                       fuzzy_threshold: float = 0.8) -> TitleMatchResult:
        """
        Performs title matching with a cascading strategy

        Args:
            transcription_title: Transcription file title
            fuzzy_threshold: Threshold for fuzzy matching

        Returns:
            TitleMatchResult: Matching result
        """
        logger.info(f"\n{'='*80}")
        logger.info(f"🔍 MATCHING PAR TITRE: {transcription_title}")
        logger.info(f"{'='*80}")

        normalized_trans = self.normalize_title(transcription_title)
        logger.info(f"📝 Titre normalisé: {normalized_trans}")

        # Strategy 1: exact match
        exact_match = self.find_exact_match(transcription_title)
        if exact_match:
            return TitleMatchResult(
                transcription_title=transcription_title,
                template_id=exact_match,
                match_type='exact',
                confidence=1.0,
                normalized_transcription=normalized_trans,
                normalized_template=self.normalize_title(exact_match)
            )

        # Strategy 2: key-identifier match
        key_match = self.find_key_match(transcription_title)
        if key_match:
            template_id, confidence = key_match
            return TitleMatchResult(
                transcription_title=transcription_title,
                template_id=template_id,
                match_type='normalized',
                confidence=confidence,
                normalized_transcription=normalized_trans,
                normalized_template=self.normalize_title(template_id)
            )

        # Strategy 3: fuzzy match
        fuzzy_match = self.find_fuzzy_match(transcription_title, fuzzy_threshold)
        if fuzzy_match:
            template_id, confidence = fuzzy_match
            return TitleMatchResult(
                transcription_title=transcription_title,
                template_id=template_id,
                match_type='fuzzy',
                confidence=confidence,
                normalized_transcription=normalized_trans,
                normalized_template=self.normalize_title(template_id)
            )

        # No match found
        logger.warning(f"⚠️ Aucun match par titre trouvé pour: {transcription_title}")
        return TitleMatchResult(
            transcription_title=transcription_title,
            template_id='',
            match_type='none',
            confidence=0.0,
            normalized_transcription=normalized_trans,
            normalized_template=''
        )

    def display_match_result(self, result: TitleMatchResult):
        """
        Pretty-prints a matching result

        Args:
            result: Result to display
        """
        print(f"\n{'='*80}")
        print(f"📋 RÉSULTAT MATCHING PAR TITRE")
        print(f"{'='*80}")
        print(f"📄 Transcription: {result.transcription_title}")
        print(f"📝 Normalisé: {result.normalized_transcription}")
        print(f"\n{'─'*80}")

        if result.match_type != 'none':
            print(f"✅ Template trouvé: {result.template_id}")
            print(f"📝 Normalisé: {result.normalized_template}")
            print(f"🎯 Type de match: {result.match_type.upper()}")
            print(f"📊 Confiance: {result.confidence:.2%}")
        else:
            print(f"❌ Aucun template correspondant trouvé")
            print(f"💡 Le matching sémantique sera utilisé")

        print(f"{'='*80}\n")


class HybridMatcher:
    """
    Hybrid matcher that combines title-based and semantic matching
    """

    def __init__(self, parser_instance, semantic_matcher_instance):
        """
        Initializes the hybrid matcher

        Args:
            parser_instance: MedicalTemplateParser instance
            semantic_matcher_instance: TranscriptionMatcher instance
        """
        self.parser = parser_instance
        self.semantic_matcher = semantic_matcher_instance
        self.title_matcher = TitleBasedMatcher(parser_instance)

        logger.info("🔄 Matcher hybride initialisé")

    def match_and_fill(self, transcription: str, transcription_filename: Optional[str] = None,
                       title_confidence_threshold: float = 0.8):
        """
        Performs matching and filling with the hybrid strategy

        Args:
            transcription: Transcription content
            transcription_filename: File name (optional, enables title matching)
            title_confidence_threshold: Confidence threshold for using the title match

        Returns:
            List[MatchResult]: Matching and filling results
        """
        from transcription_matcher import MatchResult

        logger.info("\n" + "="*80)
        logger.info("🚀 MATCHING HYBRIDE (Titre + Sémantique)")
        logger.info("="*80)

        # Step 1: try title matching if a filename was provided
        template_id = None
        match_method = "semantic"

        if transcription_filename:
            logger.info(f"\n📋 Étape 1: Matching par TITRE")
            logger.info(f"{'─'*80}")

            title_result = self.title_matcher.match_by_title(transcription_filename)
            self.title_matcher.display_match_result(title_result)

            if title_result.match_type != 'none' and title_result.confidence >= title_confidence_threshold:
                template_id = title_result.template_id
                match_method = f"title ({title_result.match_type})"
                logger.info(f"✅ Utilisation du match par titre: {template_id}")
            else:
                logger.info(f"⚠️ Match par titre insuffisant (confiance: {title_result.confidence:.2%})")
                logger.info(f"🔄 Passage au matching sémantique...")

        # Step 2: no usable title match, fall back to semantic matching
        if not template_id:
            logger.info(f"\n🧠 Étape 2: Matching SÉMANTIQUE")
            logger.info(f"{'─'*80}")

            results = self.semantic_matcher.match_and_fill(transcription, return_top_k=1)

            if results:
                # Record which matching method was used
                for result in results:
                    result.match_method = match_method
                return results
            else:
                logger.error("❌ Aucun résultat du matching sémantique")
                return []

        # Step 3: fill the template found by title
        logger.info(f"\n📝 Étape 3: REMPLISSAGE du template")
        logger.info(f"{'─'*80}")

        template_info = self.parser.get_template_info(template_id)
        if not template_info:
            logger.error(f"❌ Template {template_id} introuvable")
            return []

        # Fill with GPT
        filled_data = self.semantic_matcher.fill_template_with_gpt(
            template_info.content,
            transcription
        )

        sections_filled = filled_data.get("sections", {})
        confidence = filled_data.get("confidence", 0.0)

        # Format the result
        filled_template = self.semantic_matcher.format_filled_template(
            template_info.content,
            sections_filled
        )

        result = MatchResult(
            template_id=template_id,
            template_content=template_info.content,
            similarity_score=1.0,  # title match = perfect score
            filled_template=filled_template,
            sections_filled=sections_filled,
            confidence_score=confidence,
            match_method=match_method
        )

        logger.info(f"✅ Template rempli via {match_method}")
        logger.info("="*80 + "\n")

        return [result]
transcription_processor.py
ADDED
@@ -0,0 +1,120 @@
#!/usr/bin/env python3
"""
Transcription Processor
Handles transcription loading, correction, and medical analysis
"""

import os
import json
from typing import Dict, Any, Tuple
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate


@tool
def load_transcription(transcription_path: str) -> str:
    """Load and return the raw transcription text from a file."""
    if not os.path.exists(transcription_path):
        raise FileNotFoundError(
            f"Transcription file not found: {transcription_path}")

    with open(transcription_path, 'r', encoding='utf-8') as f:
        return f.read().strip()


def load_transcription_with_user_id(transcription_path: str) -> Tuple[str, str]:
    """Load transcription text and user_id from a JSON file."""
    if not os.path.exists(transcription_path):
        raise FileNotFoundError(
            f"Transcription file not found: {transcription_path}")

    with open(transcription_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    transcription_text = data.get('transcription', '')
    user_id = data.get('user_id', 'unknown')

    return transcription_text, user_id


def create_transcription_corrector_chain(llm):
    """Create the transcription corrector chain."""
    transcription_corrector_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an experienced medical secretary. You will receive a document that is the output of a speech recognition engine (ASR) applied to medical data.

Your task is to correct the document while maintaining maximum fidelity to the original text:

CORRECTION RULES:
- Correct spelling and grammar errors
- Correct incorrect words, especially medication names or disease names
- Correct incorrect dates or addresses
- Do NOT add synonyms, stay as faithful as possible to the original text
- Add line breaks and punctuation when necessary
- List results and analysis descriptions as bullet points
- Replace "la ligne" and "à la ligne" with line breaks in the text
- Replace "point" with a period followed by a line break in the text

NUMBER FORMATTING:
Write numbers in their correct numerical form:
- "1°" for "un degré"
- "3D" for "trois d"
- "200 kg" for "deux cent kilogrammes" or "deux cent kilo"
- "5.423" for "cinq point quatre cent vingt trois"
- "0.5" for "zéro virgule cinq"
- "3 3/4" for "trois et trois quarts"
- "142,015" for "cent quarante deux mille quinze"
- "06 32 16 15 19" for "zéro six trente deux seize quinze dix-neuf"
- "99.50 €" for "quatre vingt dix neuf euros et cinquante centimes"
- "Friday May 15, 2015" for "quinze mai deux mille quinze"
- "20:30" for "vingt heures trente"
- "12:15" for "midi quinze"
- "5:15" for "cinq heures et quart"
- "2:45" for "trois heures moins le quart"

Also support Swiss and Belgian forms:
- "70" for "septante"
- "80" for "huitante"
- "90" for "nonante"
- "77" for "septante sept"
- "81" for "huitante un"
- "95" for "nonante cinq"

Return the corrected text as simple text without explanations or comments. Maintain the original structure and content as much as possible."""),
        ("human",
         "Correct the following medical transcription while maintaining maximum fidelity to the original text:\n\n{transcription}")
    ])

    return transcription_corrector_prompt | llm


def create_medical_analyzer_chain(llm):
    """Create the medical analyzer chain."""
    medical_analyzer_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a medical information extractor.
Extract and categorize ONLY the medical information that is explicitly mentioned in the transcription.
DO NOT add interpretations, conclusions, or information not present in the text.
DO NOT make assumptions or add medical knowledge.
Simply organize the information that is already there into structured categories.
Focus on measurements, anatomical structures, and findings that are explicitly stated."""),
        ("human",
         "Extract and organize ONLY the medical information explicitly mentioned in this transcription:\n\n{corrected_transcription}")
    ])

    return medical_analyzer_prompt | llm


def create_title_generator_chain(llm):
    """Create the title generator chain."""
    title_generator_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a medical title generator.
Generate a professional medical report title in FRENCH based on the medical data and findings.
The title should be specific to the type of examination and findings.
Return ONLY the title in French, nothing else."""),
        ("human", """Generate a medical report title in FRENCH based on this medical data:

{medical_data}

Generate a professional title in French that reflects the type of examination and key findings.""")
    ])

    return title_generator_prompt | llm
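Note: a minimal sketch of how the three chains compose (assumptions: langchain_openai is installed, an OpenAI key is configured, and the model name and file path are placeholders — the application may wire this up differently):

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

raw = load_transcription.invoke("sample.txt")  # @tool objects are called via .invoke
corrected = create_transcription_corrector_chain(llm).invoke({"transcription": raw})
analysis = create_medical_analyzer_chain(llm).invoke({"corrected_transcription": corrected.content})
title = create_title_generator_chain(llm).invoke({"medical_data": analysis.content})
print(title.content)  # chat models return message objects, hence .content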
transcription_processor_enhanced.py
ADDED
@@ -0,0 +1,533 @@
#!/usr/bin/env python3
"""
Enhanced Medical Transcription Processor with ML-based correction
Uses pretrained models and embeddings for dynamic medical term correction
"""

import os
import json
import re
import numpy as np
from typing import Dict, Any, Tuple, List, Optional
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy.matcher import Matcher
import torch
from transformers import (
    AutoTokenizer, AutoModelForMaskedLM,
    AutoModelForTokenClassification, pipeline
)
from difflib import SequenceMatcher
import pickle
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class MedicalTermCorrector:
    """ML-based medical term corrector using pretrained models and embeddings"""

    def __init__(self, cache_dir: str = "./model_cache"):
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

        # Initialize models
        self.sentence_transformer = None
        self.nlp = None
        self.medical_ner = None
        self.masked_lm_model = None
        self.masked_lm_tokenizer = None
        self.medical_embeddings = None
        self.medical_terms = None

        self._load_models()
        self._load_medical_knowledge()

    def _load_models(self):
        """Load pretrained models for medical text processing"""
        try:
            # Load sentence transformer for semantic similarity
            logger.info("Loading sentence transformer model...")
            self.sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

            # Load spaCy model for NER and linguistic processing
            logger.info("Loading spaCy model...")
            try:
                self.nlp = spacy.load("fr_core_news_sm")
            except OSError:
                logger.warning("French spaCy model not found. Install with: python -m spacy download fr_core_news_sm")
                self.nlp = spacy.load("en_core_web_sm")  # Fallback

            # Load medical NER model
            logger.info("Loading medical NER model...")
            self.medical_ner = pipeline(
                "ner",
                model="samrawal/bert-base-uncased_clinical-ner",
                tokenizer="samrawal/bert-base-uncased_clinical-ner",
                aggregation_strategy="simple"
            )

            # Load masked language model for context-aware corrections
            logger.info("Loading masked language model...")
            self.masked_lm_tokenizer = AutoTokenizer.from_pretrained("camembert-base")
            self.masked_lm_model = AutoModelForMaskedLM.from_pretrained("camembert-base")

        except Exception as e:
            logger.error(f"Error loading models: {e}")
            raise

    def _load_medical_knowledge(self):
        """Load medical terminology from external sources"""
        # Check if cached embeddings exist
        embeddings_path = os.path.join(self.cache_dir, "medical_embeddings.pkl")
        terms_path = os.path.join(self.cache_dir, "medical_terms.pkl")

        if os.path.exists(embeddings_path) and os.path.exists(terms_path):
            logger.info("Loading cached medical knowledge...")
            with open(embeddings_path, 'rb') as f:
                self.medical_embeddings = pickle.load(f)
            with open(terms_path, 'rb') as f:
                self.medical_terms = pickle.load(f)
        else:
            logger.info("Building medical knowledge base...")
            self._build_medical_knowledge()

    def _build_medical_knowledge(self):
        """Build medical knowledge base from various sources"""
        # Medical terms from various domains
        medical_terms = [
            # Radiology
            "mammographie", "échographie", "IRM", "TDM", "radiographie",
            "scintigraphie", "angiographie", "arthrographie",

            # Anatomy
            "utérus", "ovaires", "myomètre", "endomètre", "cervix",
            "ganglions", "axillaires", "mammaire", "pelvien",
            "thyroïde", "pancréas", "foie", "rate", "reins",

            # Pathology
            "adénomyose", "endométriose", "fibrome", "kyste",
            "carcinome", "adénome", "métastase", "tumeur",
            "inflammation", "nécrose", "hémorragie", "œdème",

            # Classifications
            "BI-RADS", "ACR", "TNM", "WHO", "BIRADS",

            # Procedures
            "biopsie", "dépistage", "surveillance", "contrôle",
            "ponction", "drainage", "résection", "ablation",

            # Measurements
            "millimètre", "centimètre", "millilitre", "gramme",
            "pourcentage", "degré", "unité", "concentration"
        ]

        # Generate embeddings for medical terms
        logger.info("Generating embeddings for medical terms...")
        self.medical_terms = medical_terms
        self.medical_embeddings = self.sentence_transformer.encode(medical_terms)

        # Cache the embeddings
        embeddings_path = os.path.join(self.cache_dir, "medical_embeddings.pkl")
        terms_path = os.path.join(self.cache_dir, "medical_terms.pkl")

        with open(embeddings_path, 'wb') as f:
            pickle.dump(self.medical_embeddings, f)
        with open(terms_path, 'wb') as f:
            pickle.dump(self.medical_terms, f)

    def find_similar_medical_term(self, word: str, threshold: float = 0.7) -> Optional[str]:
        """Find the most similar medical term using embeddings"""
        # Guard against an uninitialized or empty knowledge base
        if self.medical_embeddings is None or not len(self.medical_embeddings):
            return None

        try:
            word_embedding = self.sentence_transformer.encode([word])
            similarities = cosine_similarity(word_embedding, self.medical_embeddings)[0]

            max_similarity_idx = np.argmax(similarities)
            max_similarity = similarities[max_similarity_idx]

            if max_similarity > threshold:
                return self.medical_terms[max_similarity_idx]

        except Exception as e:
            logger.warning(f"Error finding similar term for '{word}': {e}")

        return None

    def correct_with_context(self, sentence: str, target_word: str) -> str:
        """Use masked language model for context-aware correction"""
        try:
            # Replace target word with mask token
            masked_sentence = sentence.replace(target_word, self.masked_lm_tokenizer.mask_token)

            # Tokenize and get predictions
            inputs = self.masked_lm_tokenizer(masked_sentence, return_tensors="pt")

            with torch.no_grad():
                outputs = self.masked_lm_model(**inputs)
                predictions = outputs.logits

            # Find mask token position
            mask_token_index = torch.where(inputs["input_ids"] == self.masked_lm_tokenizer.mask_token_id)[1]

            if len(mask_token_index) > 0:
                mask_token_logits = predictions[0, mask_token_index, :]
                top_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

                # Get top predictions
                candidates = [self.masked_lm_tokenizer.decode([token]) for token in top_tokens]

                # Filter for medical relevance
                for candidate in candidates:
                    candidate = candidate.strip()
                    if len(candidate) > 2 and candidate.isalpha():
                        # Check if candidate is medically relevant
                        similar_term = self.find_similar_medical_term(candidate, threshold=0.6)
                        if similar_term:
                            return similar_term

                        # Check string similarity with original
                        if SequenceMatcher(None, target_word.lower(), candidate.lower()).ratio() > 0.6:
                            return candidate

        except Exception as e:
            logger.warning(f"Error in context correction for '{target_word}': {e}")

        return target_word

    def extract_medical_entities(self, text: str) -> List[Dict]:
        """Extract medical entities using NER"""
        try:
            entities = self.medical_ner(text)
            return entities
        except Exception as e:
            logger.warning(f"Error in medical NER: {e}")
            return []

    def correct_medical_text(self, text: str) -> str:
        """Main method to correct medical text using ML models"""
        corrected_text = text

        # Extract potential medical entities
        entities = self.extract_medical_entities(text)

        # Process with spaCy for linguistic analysis
        if self.nlp:
            doc = self.nlp(text)

            # Find words that might be medical terms but are misspelled
            for token in doc:
                if (token.is_alpha and len(token.text) > 3 and
                        not token.is_stop and not token.like_url):

                    # Check if it's a potential medical term
                    similar_term = self.find_similar_medical_term(token.text)

                    if similar_term and similar_term != token.text:
                        # Use context-aware correction
                        context_correction = self.correct_with_context(text, token.text)

                        # Choose the best correction
                        final_correction = context_correction if context_correction != token.text else similar_term

                        # Apply correction with word boundaries
                        pattern = r'\b' + re.escape(token.text) + r'\b'
                        corrected_text = re.sub(pattern, final_correction, corrected_text, flags=re.IGNORECASE)

                        logger.info(f"Corrected: '{token.text}' -> '{final_correction}'")

        return corrected_text


class DateTimeNormalizer:
    """Normalize dates and times in medical texts using regex patterns"""

    def __init__(self):
        self.date_patterns = [
            # French date patterns
            (r'(\d{1,2})\s+(\d{1,2})\s+(\d{4})', r'\1/\2/\3'),
            (r'(\d{1,2})\s+(\d{1,2})\s+(\d{2})\s+(\d{2})', r'\1/\2/\3\4'),
            (r'(\d{1,2})\s+(janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\s+(\d{4})',
             self._convert_french_date),
        ]

        self.time_patterns = [
            (r'(\d{1,2})\s+heures?\s+(\d{1,2})', r'\1:\2'),
            (r'(\d{1,2})\s+h\s+(\d{1,2})', r'\1:\2'),
            (r'midi\s+(\d{1,2})', r'12:\1'),
            (r'minuit\s+(\d{1,2})', r'00:\1'),
        ]

    def _convert_french_date(self, match):
        """Convert French month names to numbers"""
        months = {
            'janvier': '01', 'février': '02', 'mars': '03', 'avril': '04',
            'mai': '05', 'juin': '06', 'juillet': '07', 'août': '08',
            'septembre': '09', 'octobre': '10', 'novembre': '11', 'décembre': '12'
        }
        day, month, year = match.groups()
        return f"{day}/{months.get(month.lower(), month)}/{year}"

    def normalize_dates_times(self, text: str) -> str:
        """Normalize all dates and times in the text"""
        result = text

        for pattern, replacement in self.date_patterns:
            if callable(replacement):
                result = re.sub(pattern, replacement, result, flags=re.IGNORECASE)
            else:
                result = re.sub(pattern, replacement, result)

        for pattern, replacement in self.time_patterns:
            result = re.sub(pattern, replacement, result, flags=re.IGNORECASE)

        return result


# Initialize global corrector instance
medical_corrector = None
datetime_normalizer = DateTimeNormalizer()


def initialize_medical_corrector(cache_dir: str = "./model_cache"):
    """Initialize the medical corrector (call once at startup)"""
    global medical_corrector
    if medical_corrector is None:
        medical_corrector = MedicalTermCorrector(cache_dir)


@tool
def load_transcription(transcription_path: str) -> str:
    """Load and return the raw transcription text from a file."""
    if not os.path.exists(transcription_path):
        raise FileNotFoundError(f"Transcription file not found: {transcription_path}")

    with open(transcription_path, 'r', encoding='utf-8') as f:
        return f.read().strip()


def load_transcription_with_user_id(transcription_path: str) -> Tuple[str, str]:
    """Load transcription text and user_id from a JSON file."""
    if not os.path.exists(transcription_path):
        raise FileNotFoundError(f"Transcription file not found: {transcription_path}")

    with open(transcription_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    transcription_text = data.get('transcription', '')
    user_id = data.get('user_id', 'unknown')

    return transcription_text, user_id


def preprocess_medical_text(text: str) -> str:
    """Preprocess medical text using ML models"""
    global medical_corrector

    if medical_corrector is None:
        initialize_medical_corrector()

    # Apply ML-based medical term correction
    corrected_text = medical_corrector.correct_medical_text(text)

    # Normalize dates and times
    corrected_text = datetime_normalizer.normalize_dates_times(corrected_text)

    # Fix common formatting issues
    corrected_text = re.sub(r'(\d+)\s*x\s*(\d+)\s*mm', r'\1 x \2 mm', corrected_text)
    corrected_text = re.sub(r'(\d+)\s*mm\s*sur\s*(\d+)\s*mm', r'\1 mm x \2 mm', corrected_text)

    return corrected_text


def create_transcription_corrector_chain(llm):
    """Create the enhanced transcription corrector chain with ML preprocessing"""
    transcription_corrector_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert medical transcriptionist with deep knowledge of French medical terminology.
The text you receive has already been preprocessed with ML models for medical term correction.

Your task is to refine the document structure and ensure professional medical report formatting:

FORMATTING AND STRUCTURE:
- Organize content into clear medical report sections:
  * Title/Header
  * Clinical indication
  * Technique/Method
  * Results/Findings (use bullet points for lists)
  * Conclusion
- Ensure proper spacing and line breaks
- Replace "la ligne" and "à la ligne" with appropriate line breaks
- Replace "point" with periods followed by line breaks when contextually appropriate
- Format measurements consistently (e.g., "72 x 40 mm")
- Ensure proper capitalization of medical terms and proper nouns

QUALITY ASSURANCE:
- Verify that medical terminology is accurate and consistent
- Ensure dates are properly formatted (DD/MM/YYYY)
- Check that medical classifications are correct (BI-RADS, ACR, etc.)
- Maintain professional medical language throughout
- Ensure logical flow and coherence

PRESERVATION RULES:
- Maintain all original medical content and findings
- Do not add clinical interpretations not present in the original
- Preserve all measurements, dates, and technical details
- Keep the original meaning and medical context intact

Return the refined text as a properly formatted medical report without explanations."""),
        ("human", """Refine and format the following preprocessed medical transcription:

{transcription}

Focus on professional medical report structure and formatting while preserving all original medical content.""")
    ])

    return transcription_corrector_prompt | llm


def create_medical_analyzer_chain(llm):
    """Create the medical analyzer chain with ML enhancement"""
    medical_analyzer_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a medical information extractor with expertise in French medical terminology.

Extract and categorize ONLY the medical information that is explicitly mentioned in the transcription.
The text has been preprocessed with ML models for better medical term accuracy.

EXTRACTION CATEGORIES:
1. Procedure/Examination type
2. Clinical indication
3. Technique/Method used
4. Anatomical structures examined
5. Measurements and dimensions
6. Pathological findings
7. Normal findings
8. Medical classifications (BI-RADS, ACR, etc.)
9. Recommendations/Follow-up
10. Conclusion stated in the report

EXTRACTION RULES:
- Focus on clinical findings, measurements, and observations
- Extract exact measurements with units
- Identify medical procedures and techniques
- Note anatomical structures and their conditions
- Include any pathological or normal findings
- Preserve medical classifications and scores
- DO NOT add interpretations beyond what's stated
- DO NOT make clinical assumptions

Organize the extracted information clearly under each category."""),
        ("human", """Extract and organize the medical information from this transcription:

{corrected_transcription}

Provide a structured medical analysis with clear categorization.""")
    ])

    return medical_analyzer_prompt | llm


def create_title_generator_chain(llm):
    """Create the title generator chain with medical context"""
    title_generator_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a medical report title generator with expertise in French medical terminology.

Generate a professional medical report title in FRENCH based on the medical data and findings.

TITLE GUIDELINES:
- Be specific to the examination type (IRM, mammographie, échographie, TDM, etc.)
- Include the anatomical region examined
- Use standard French medical terminology
- Keep titles concise but informative (5-10 words)
- Follow French medical report conventions
- Consider the primary purpose (dépistage, surveillance, diagnostic, etc.)

EXAMPLES:
- "Mammographie de dépistage bilatérale"
- "IRM pelvienne - Exploration utérine"
- "Échographie abdominale - Surveillance hépatique"
- "TDM thoracique avec injection de contraste"
- "Radiographie pulmonaire - Contrôle post-opératoire"

Return ONLY the title in French."""),
        ("human", """Generate a professional medical report title in French for:

{medical_data}

Create a concise, specific title that reflects the examination type and focus.""")
    ])

    return title_generator_prompt | llm


def process_medical_transcription(transcription_text: str, llm) -> Dict[str, Any]:
    """Complete ML-enhanced processing pipeline for medical transcription"""

    # Initialize ML corrector if not already done
    if medical_corrector is None:
        initialize_medical_corrector()

    # Step 1: ML-based preprocessing
    logger.info("Applying ML-based medical term correction...")
    preprocessed_text = preprocess_medical_text(transcription_text)

    # Step 2: LLM-based structure and formatting correction
    logger.info("Applying LLM-based formatting and structure correction...")
    corrector_chain = create_transcription_corrector_chain(llm)
    corrected_text = corrector_chain.invoke({"transcription": preprocessed_text})

    # Step 3: Medical content analysis
    logger.info("Extracting medical information...")
    analyzer_chain = create_medical_analyzer_chain(llm)
    medical_analysis = analyzer_chain.invoke({"corrected_transcription": corrected_text})

    # Step 4: Generate appropriate title
    logger.info("Generating medical report title...")
    title_chain = create_title_generator_chain(llm)
    title = title_chain.invoke({"medical_data": medical_analysis})

    # Step 5: Extract entities for validation
    entities = medical_corrector.extract_medical_entities(corrected_text) if medical_corrector else []

    return {
        "original_transcription": transcription_text,
        "preprocessed_transcription": preprocessed_text,
        "corrected_transcription": corrected_text,
        "medical_analysis": medical_analysis,
        "title": title,
        "extracted_entities": entities,
        "processing_info": {
            "ml_corrections_applied": True,
            "entities_extracted": len(entities),
            "preprocessing_successful": True
        }
    }


def validate_medical_transcription(corrected_text: str, entities: List[Dict]) -> List[str]:
    """Validate the corrected medical transcription using ML insights"""
    issues = []

    # Check entity consistency
    medical_entities = [e for e in entities if e.get('entity_group') in ['MEDICATION', 'DISEASE', 'TREATMENT']]
    if len(medical_entities) < 2:
        issues.append("Low medical entity density detected - review for missing medical terms")

    # Check for measurement formatting
    measurements = re.findall(r'\d+\s*[x×]\s*\d+\s*mm', corrected_text)
    if not measurements and ('mm' in corrected_text or 'cm' in corrected_text):
        issues.append("Measurement formatting may need attention")

    # Check for date consistency
    dates = re.findall(r'\d{1,2}/\d{1,2}/\d{4}', corrected_text)
    if not dates and re.search(r'\d{4}', corrected_text):
        issues.append("Date formatting may need standardization")

    return issues
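Note: a minimal usage sketch for the enhanced pipeline (assumptions: the sentence-transformers, spaCy, and Hugging Face models listed above can be downloaded on first run; llm is any LangChain chat model — the model name here is a placeholder; the transcription string is illustrative):

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

initialize_medical_corrector(cache_dir="./model_cache")  # one-off model load + embedding cache
result = process_medical_transcription(
    "échographie pelvienne, utérus mesurant 72 sur 40 mm, point à la ligne", llm)

print(result["title"])                   # chain output (a message object for chat models)
print(result["corrected_transcription"])
print(result["processing_info"])         # {'ml_corrections_applied': True, ...}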
type3_extract_entities.py
ADDED
@@ -0,0 +1,670 @@
import os
import re
import json
import logging
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass, asdict
from openai import AzureOpenAI

# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class MedicalEntity:
    """Extracted medical entity"""
    entity_type: str            # "UTERUS_POSITION", "MEASUREMENT", etc.
    value: str                  # Extracted value
    unit: Optional[str] = None  # mm, cm, etc.
    confidence: float = 0.0     # Confidence score
    context: str = ""           # Extraction context
    start_pos: int = -1         # Position in the text
    end_pos: int = -1


@dataclass
class ExtractedData:
    """Structured medical data extracted from the transcription"""
    # Uterus
    uterus_position: Optional[str] = None
    uterus_size: Optional[str] = None
    hysterometry: Optional[str] = None

    # Endometrium
    endometrium_thickness: Optional[str] = None

    # Myometrium / junctional zone
    myomas_present: Optional[bool] = None
    zone_jonctionnelle_status: Optional[str] = None  # normale / épaissie
    adenomyosis_type: Optional[str] = None           # diffuse / focale / absente

    # Right ovary
    right_ovary_dimensions: Optional[str] = None
    right_ovary_cfa: Optional[str] = None
    right_ovary_accessibility: Optional[str] = None

    # Left ovary
    left_ovary_dimensions: Optional[str] = None
    left_ovary_cfa: Optional[str] = None
    left_ovary_accessibility: Optional[str] = None

    # Doppler
    doppler_ip: Optional[str] = None
    doppler_ir: Optional[str] = None

    # Metadata
    extraction_confidence: float = 0.0
    missing_fields: List[str] = None

    def __post_init__(self):
        if self.missing_fields is None:
            self.missing_fields = []
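
# Editor's illustrative sketch (hypothetical values): thanks to the `asdict`
# import above, an ExtractedData instance serializes to JSON in one line,
# which is convenient for logging or API responses.
#
#     sample = ExtractedData(uterus_position="antéversé", hysterometry="60 mm")
#     json.dumps(asdict(sample), ensure_ascii=False)
#     # -> '{"uterus_position": "antéversé", "uterus_size": null, "hysterometry": "60 mm", ...'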
class MedicalNERAgent:
    """Medical NER agent combining GPT-5 and expert rules"""

    def __init__(self):
        self.client = AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_KEY", "0e1141969928462bbbf342678c01079e"),
            api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT", "https://voxist-gpt-eastus2.openai.azure.com/")
        )
        self.model = os.getenv("AZURE_OPENAI_MODEL", "gpt-5")
        self.deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-5-eastus2")

        # Regex patterns for the expert extraction
        self.medical_patterns = {
            'uterus_position': [
                r'utérus est (\w+)',
                r'L\'utérus .*?est (\w+)',
            ],
            'hysterometry': [
                r'(\d+(?:[.,]\d+)?)\s*(?:d\'|de\s+)?hystérométrie',
                r'hystérométrie\s*:?\s*(\d+(?:[.,]\d+)?)',
            ],
            'endometrium': [
                r'(\d+(?:[.,]\d+)?)\s*(?:d\'|de\s+)?endomètre',
                r'endomètre.*?(\d+(?:[.,]\d+)?)',
            ],
            'zone_jonctionnelle': [
                r'zone jonctionnelle\s+(\w+)',
                r'(\w+)\s+zone jonctionnelle',
            ],
            'myomas_fibrome': [
                r'pas de (fibrome|myome)s?',
                r'(fibrome|myome)s?\s+myomètre\s+pas de (fibrome|myome)s?',
                r'sans (fibrome|myome)s?',
                r'absence.*?(fibrome|myome)s?',
            ],
            'adenomyosis': [
                r'adénomyose\s+(\w+)',
                r'(\w+)\s+d\'adénomyose',
            ],
        }

    def extract_medical_entities(self, transcription: str) -> ExtractedData:
        """
        Main extraction combining GPT-5 and the improved expert NER
        """
        logger.info("🚀 Début de l'extraction d'entités médicales")

        # Step 1: text preprocessing
        cleaned_text = self._preprocess_text(transcription)
        logger.info(f"📝 Texte nettoyé: {cleaned_text[:100]}...")

        # Step 2: structured ovary extraction (new method)
        ovary_data = self._extract_ovaries_structured(cleaned_text)

        # Step 3: extraction with GPT-5
        gpt_entities = self._extract_with_gpt5(cleaned_text)

        # Step 4: extraction with the expert NER (fallback and validation)
        expert_entities = self._extract_with_expert_ner(cleaned_text)

        # Step 5: smart merge, giving priority to the structured ovary data
        final_data = self._merge_extraction_results_improved(gpt_entities, expert_entities, ovary_data, cleaned_text)

        # Step 6: validation and confidence scoring
        final_data.extraction_confidence = self._calculate_confidence(final_data)
        final_data.missing_fields = self._identify_missing_fields(final_data)

        logger.info(f"✅ Extraction terminée - Confiance: {final_data.extraction_confidence:.2f}")

        return final_data
    def _extract_ovaries_structured(self, text: str) -> Dict[str, Dict[str, str]]:
        """
        Structured ovary extraction with improved contextual analysis
        """
        logger.info("🔍 Extraction structurée des ovaires")

        ovary_data = {
            'right': {'dimensions': None, 'cfa': None, 'accessibility': None},
            'left': {'dimensions': None, 'cfa': None, 'accessibility': None}
        }

        # 1. Explicit right-ovary search
        right_match = re.search(
            r'ovaire droit\s+mesure\s+(\d+(?:[.,]\d+)?)\s*(?:fois|x)\s*(\d+(?:[.,]\d+)?)\s*mm.*?(\d+)\s*follicules',
            text, re.IGNORECASE | re.DOTALL
        )

        if right_match:
            ovary_data['right']['dimensions'] = f"{right_match.group(1)} x {right_match.group(2)} mm"
            ovary_data['right']['cfa'] = right_match.group(3)
            logger.info(f"✅ Ovaire droit trouvé: {ovary_data['right']}")

        # 2. Left-ovary search with contextual analysis
        # Main patterns for the dimensions
        left_dim_patterns = [
            r'ovaire gauche.*?mesure\s+(\d+(?:[.,]\d+)?)\s*(?:fois|x)\s*(\d+(?:[.,]\d+)?)',
            r'il mesure\s+(\d+(?:[.,]\d+)?)\s*(?:fois|x)\s*(\d+(?:[.,]\d+)?)',  # contextual reference
        ]

        for pattern in left_dim_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                ovary_data['left']['dimensions'] = f"{match.group(1)} x {match.group(2)} mm"
                logger.info(f"✅ Dimensions ovaire gauche trouvées: {ovary_data['left']['dimensions']}")
                break

        # 3. CFA extraction with contextual analysis
        cfa_matches = list(re.finditer(r'(\d+)\s*follicules', text, re.IGNORECASE))
        cfa_siege_matches = list(re.finditer(r'siège de.*?(\d+)\s*follicules', text, re.IGNORECASE))

        # Combine every CFA hit
        all_cfa = []
        for match in cfa_matches:
            all_cfa.append((match.group(1), match.start(), 'follicules'))
        for match in cfa_siege_matches:
            all_cfa.append((match.group(1), match.start(), 'siège'))

        # Sort by position in the text
        all_cfa.sort(key=lambda x: x[1])

        # Contextual CFA attribution
        if len(all_cfa) >= 2:
            # The first CFA usually belongs to the right ovary
            if not ovary_data['right']['cfa']:
                ovary_data['right']['cfa'] = all_cfa[0][0]

            # The last CFA, or the one introduced by "siège", usually belongs to the left ovary
            for cfa_value, pos, cfa_type in reversed(all_cfa):
                if cfa_type == 'siège' or pos > all_cfa[0][1]:
                    ovary_data['left']['cfa'] = cfa_value
                    break

            logger.info(f"✅ CFA attribués - Droit: {ovary_data['right']['cfa']}, Gauche: {ovary_data['left']['cfa']}")

        elif len(all_cfa) == 1:
            # A single CFA - inspect its context
            cfa_value = all_cfa[0][0]
            cfa_context = text[max(0, all_cfa[0][1]-50):all_cfa[0][1]+50].lower()

            if 'gauche' in cfa_context or 'siège' in cfa_context:
                ovary_data['left']['cfa'] = cfa_value
            else:
                ovary_data['right']['cfa'] = cfa_value

        # 4. Accessibility extraction with improved contextual analysis
        # Left ovary - difficult access
        if re.search(r'ovaire gauche.*?accès difficile', text, re.IGNORECASE) or \
           re.search(r'd\'accès difficile à rétro-utérin', text, re.IGNORECASE):
            ovary_data['left']['accessibility'] = 'difficile rétro-utérine'

        # Handle "par contre l'autre il est normal"
        if re.search(r'par contre l\'autre il est normal', text, re.IGNORECASE):
            # In medical context, if the left ovary is hard to reach, "the other one" is the right ovary
            if ovary_data['left']['accessibility'] == 'difficile rétro-utérine':
                ovary_data['right']['accessibility'] = 'normale'
            else:
                # Inspect the preceding context to decide which ovary "the other one" is
                context_before = text[:text.find('par contre')].lower() if 'par contre' in text.lower() else text.lower()
                if 'gauche' in context_before[-100:]:
                    ovary_data['right']['accessibility'] = 'normale'
                else:
                    ovary_data['left']['accessibility'] = 'normale'

        logger.info(f"🎯 Données ovaires structurées: {ovary_data}")
        return ovary_data
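
    # Editor's illustrative sketch (sample sentence is hypothetical): the
    # right-ovary pattern above captures dimensions and CFA in one pass.
    #
    #     m = re.search(
    #         r'ovaire droit\s+mesure\s+(\d+(?:[.,]\d+)?)\s*(?:fois|x)\s*(\d+(?:[.,]\d+)?)\s*mm.*?(\d+)\s*follicules',
    #         "L'ovaire droit mesure 26 x 20 mm, 5 follicules.",
    #         re.IGNORECASE | re.DOTALL)
    #     m.groups()  # -> ('26', '20', '5')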
    def _preprocess_text(self, text: str) -> str:
        """Preprocess the medical text"""
        # Collapse repeated whitespace
        text = re.sub(r'\s+', ' ', text.strip())

        # Rejoin split numbers (e.g. "7. 8" -> "7.8")
        text = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', text)
        text = re.sub(r'(\d+)\s+(\d+)', lambda m: f"{m.group(1)}.{m.group(2)}" if len(m.group(2)) == 1 else f"{m.group(1)} {m.group(2)}", text)

        # Normalize units
        text = text.replace('fois', 'x')

        return text
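
    # Editor's illustrative sketch (hypothetical fragment): the split-number
    # rules above rejoin dictated decimals without touching real pairs.
    #
    #     s = "taille 7. 8 cm et 26 x 20 mm"
    #     s = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', s)   # "7. 8" -> "7.8"
    #     # the second rule only merges digit pairs whose second part is a
    #     # single digit, so "26 x 20" is left alone
    #     # -> "taille 7.8 cm et 26 x 20 mm"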
    def _extract_with_gpt5(self, text: str) -> ExtractedData:
        """Improved extraction with GPT-5"""
        prompt = self._build_improved_ner_prompt(text)

        try:
            response = self.client.chat.completions.create(
                model=self.deployment,
                messages=[
                    {
                        "role": "system",
                        "content": self._get_improved_medical_system_prompt()
                    },
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                # temperature=0.1,
                response_format={"type": "json_object"}
            )

            json_response = response.choices[0].message.content.strip()
            data_dict = json.loads(json_response)

            return self._dict_to_extracted_data(data_dict)

        except Exception as e:
            logger.error(f"❌ Erreur extraction GPT-5: {e}")
            return ExtractedData()

    def _build_improved_ner_prompt(self, text: str) -> str:
        """Build the improved NER prompt for GPT-5"""
        return f"""
Tu es un expert en analyse d'échographies gynécologiques. Extrais précisément les entités médicales de cette transcription en analysant TRÈS ATTENTIVEMENT le contexte pour les ovaires.

TRANSCRIPTION: "{text}"

RÈGLES CRITIQUES POUR LES OVAIRES:
1. "L'ovaire droit mesure X x Y mm, Z follicules" -> ovaire droit
2. "il mesure X x Y mm siège de Z follicules" après mention de l'ovaire gauche -> ovaire gauche
3. "par contre l'autre il est normal" = généralement l'ovaire droit si le gauche est difficile d'accès
4. "siège de X follicules" = généralement ovaire gauche
5. Premier CFA mentionné = généralement ovaire droit, dernier = ovaire gauche

Extrais au format JSON strict:

{{
    "uterus_position": "antéversé/rétroversé/intermédiaire ou null",
    "uterus_size": "dimension en cm ou null",
    "hysterometry": "valeur en mm ou null",
    "endometrium_thickness": "valeur en mm ou null",
    "myomas_present": true/false/null,
    "zone_jonctionnelle_status": "normale/épaissie ou null",
    "adenomyosis_type": "diffuse/focale/absente ou null",
    "right_ovary_dimensions": "longueur x largeur avec unité ou null",
    "right_ovary_cfa": "nombre follicules ou null",
    "right_ovary_accessibility": "normale/aisée/difficile/rétro-utérine ou null",
    "left_ovary_dimensions": "longueur x largeur avec unité ou null",
    "left_ovary_cfa": "nombre follicules ou null",
    "left_ovary_accessibility": "normale/aisée/difficile/rétro-utérine ou null",
    "doppler_ip": "valeur IP ou null",
    "doppler_ir": "valeur IR ou null"
}}

ANALYSE LE CONTEXTE COMPLET pour différencier ovaire droit/gauche.
Réponds uniquement avec le JSON, sans explication.
"""

    def _get_improved_medical_system_prompt(self) -> str:
        """Improved medical system prompt for GPT-5"""
        return """Tu es un système expert en NER (Named Entity Recognition) médical spécialisé dans les échographies gynécologiques.

MISSION CRITIQUE: Extraire avec une précision maximale les entités médicales, avec une attention particulière à la DISAMBIGUATION DES OVAIRES.

EXPERTISE SPÉCIFIQUE OVAIRES:
- Analyse contextuelle: "L'ovaire droit mesure..." vs "il mesure..." (référence à l'ovaire précédemment mentionné)
- Références croisées: "l'autre", "par contre" nécessitent une analyse du contexte complet
- Attribution CFA: Premier mentionné = généralement droit, "siège de" = généralement gauche
- Accessibilité: "difficile rétro-utérin" est souvent l'ovaire gauche, "normal" l'autre

RÈGLES D'OR:
1. Lis TOUT le texte avant d'attribuer les mesures aux ovaires
2. Utilise les indices contextuels (ordre, proximité, références)
3. Ne mélange JAMAIS les données entre ovaire droit et gauche
4. Si ambiguïté, privilégie l'ordre d'apparition dans le texte médical standard

Tu dois être extrêmement précis dans la différenciation ovaire droit/gauche."""
    def _extract_with_expert_ner(self, text: str) -> ExtractedData:
        """Extraction with the expert NER (regex + rules)"""
        extracted = ExtractedData()

        # Uterus position
        extracted.uterus_position = self._extract_pattern(text, 'uterus_position')

        # Hysterometry
        extracted.hysterometry = self._extract_pattern(text, 'hysterometry')

        # Endometrium
        extracted.endometrium_thickness = self._extract_pattern(text, 'endometrium')

        # Junctional zone
        zone_jonc = self._extract_zone_jonctionnelle(text)
        if zone_jonc:
            extracted.zone_jonctionnelle_status = zone_jonc

        # Myomas/fibroids
        myomas_status = self._extract_myomas_status(text)
        if myomas_status is not None:
            extracted.myomas_present = myomas_status

        # Doppler
        doppler_values = self._extract_doppler(text)
        extracted.doppler_ip = doppler_values.get('ip')
        extracted.doppler_ir = doppler_values.get('ir')

        # Adenomyosis
        extracted.adenomyosis_type = self._extract_pattern(text, 'adenomyosis')

        return extracted

    def _extract_zone_jonctionnelle(self, text: str) -> Optional[str]:
        """Extract the junctional-zone status"""
        if re.search(r'zone jonctionnelle\s+épaissie', text, re.IGNORECASE):
            return "épaissie"
        elif re.search(r'épaissie.*?zone jonctionnelle', text, re.IGNORECASE):
            return "épaissie"
        elif re.search(r'zone jonctionnelle\s+normale', text, re.IGNORECASE):
            return "normale"

        return None

    def _extract_myomas_status(self, text: str) -> Optional[bool]:
        """Extract the myoma/fibroid status"""
        negative_patterns = [
            r'pas de (fibrome|myome)s?',
            r'sans (fibrome|myome)s?',
            r'absence.*?(fibrome|myome)s?',
            r'(fibrome|myome)s?\s+myomètre\s+pas de (fibrome|myome)s?',
        ]

        for pattern in negative_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return False

        positive_patterns = [
            r'présence.*?(fibrome|myome)s?',
            r'(fibrome|myome)s?\s+présents?',
            r'multiples (fibrome|myome)s?',
        ]

        for pattern in positive_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True

        return None

    def _extract_pattern(self, text: str, pattern_key: str) -> Optional[str]:
        """Extraction with regex patterns"""
        patterns = self.medical_patterns.get(pattern_key, [])

        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)

        return None
    def _extract_doppler(self, text: str) -> Dict[str, str]:
        """Extract Doppler values"""
        doppler = {}

        # Pattern for "Doppler : IP X - IR Y"
        doppler_match = re.search(r'Doppler\s*:?\s*IP\s*(\d+(?:[.,]\d+)?)\s*-?\s*IR\s*(\d+(?:[.,]\d+)?)', text, re.IGNORECASE)

        if doppler_match:
            doppler['ip'] = doppler_match.group(1)
            doppler['ir'] = doppler_match.group(2)
        else:
            # Separate lookups
            ip_match = re.search(r'IP\s*:?\s*(\d+(?:[.,]\d+)?)', text, re.IGNORECASE)
            if ip_match:
                doppler['ip'] = ip_match.group(1)

            ir_match = re.search(r'IR\s*:?\s*(\d+(?:[.,]\d+)?)', text, re.IGNORECASE)
            if ir_match:
                doppler['ir'] = ir_match.group(1)

        return doppler
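
    # Editor's illustrative sketch (hypothetical sentence): the combined
    # Doppler pattern on the exact phrasing it targets.
    #
    #     re.search(r'Doppler\s*:?\s*IP\s*(\d+(?:[.,]\d+)?)\s*-?\s*IR\s*(\d+(?:[.,]\d+)?)',
    #               "Le Doppler : IP 3,24 - IR 0,91", re.IGNORECASE).groups()
    #     # -> ('3,24', '0,91')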
    def _merge_extraction_results_improved(self, gpt_data: ExtractedData, expert_data: ExtractedData, ovary_data: Dict, text: str) -> ExtractedData:
        """Smart merge, giving priority to the structured ovary data"""
        merged = ExtractedData()

        # Priority: structured ovary data > GPT-5 > expert NER
        non_ovary_fields = [
            'uterus_position', 'uterus_size', 'hysterometry',
            'endometrium_thickness', 'adenomyosis_type', 'zone_jonctionnelle_status',
            'myomas_present', 'doppler_ip', 'doppler_ir'
        ]

        # Merge the non-ovary fields
        for field in non_ovary_fields:
            gpt_value = getattr(gpt_data, field, None)
            expert_value = getattr(expert_data, field, None)

            if gpt_value is not None and str(gpt_value).strip() and str(gpt_value) != 'null':
                setattr(merged, field, gpt_value)
            elif expert_value is not None and str(expert_value).strip():
                setattr(merged, field, expert_value)

        # Structured ovary data takes precedence
        if ovary_data['right']['dimensions']:
            merged.right_ovary_dimensions = ovary_data['right']['dimensions']
        elif gpt_data.right_ovary_dimensions:
            merged.right_ovary_dimensions = gpt_data.right_ovary_dimensions

        if ovary_data['right']['cfa']:
            merged.right_ovary_cfa = ovary_data['right']['cfa']
        elif gpt_data.right_ovary_cfa:
            merged.right_ovary_cfa = gpt_data.right_ovary_cfa

        if ovary_data['right']['accessibility']:
            merged.right_ovary_accessibility = ovary_data['right']['accessibility']
        elif gpt_data.right_ovary_accessibility:
            merged.right_ovary_accessibility = gpt_data.right_ovary_accessibility

        if ovary_data['left']['dimensions']:
            merged.left_ovary_dimensions = ovary_data['left']['dimensions']
        elif gpt_data.left_ovary_dimensions:
            merged.left_ovary_dimensions = gpt_data.left_ovary_dimensions

        if ovary_data['left']['cfa']:
            merged.left_ovary_cfa = ovary_data['left']['cfa']
        elif gpt_data.left_ovary_cfa:
            merged.left_ovary_cfa = gpt_data.left_ovary_cfa

        if ovary_data['left']['accessibility']:
            merged.left_ovary_accessibility = ovary_data['left']['accessibility']
        elif gpt_data.left_ovary_accessibility:
            merged.left_ovary_accessibility = gpt_data.left_ovary_accessibility

        # Contextual post-processing
        merged = self._post_process_contextual(merged, text)

        return merged
    def _post_process_contextual(self, data: ExtractedData, text: str) -> ExtractedData:
        """Contextual post-processing to fix specific recurring errors"""

        # Fix doubled units
        if data.hysterometry and 'mm mm' in str(data.hysterometry):
            data.hysterometry = str(data.hysterometry).replace(' mm mm', ' mm')

        if data.endometrium_thickness and 'mm mm' in str(data.endometrium_thickness):
            data.endometrium_thickness = str(data.endometrium_thickness).replace(' mm mm', ' mm')

        # Fix doubled CFA wording
        if data.right_ovary_cfa and 'follicules follicules' in str(data.right_ovary_cfa):
            data.right_ovary_cfa = str(data.right_ovary_cfa).replace(' follicules follicules', '')

        if data.left_ovary_cfa and 'follicules follicules' in str(data.left_ovary_cfa):
            data.left_ovary_cfa = str(data.left_ovary_cfa).replace(' follicules follicules', '')

        # Add missing units
        if data.left_ovary_dimensions and not ('mm' in str(data.left_ovary_dimensions) or 'cm' in str(data.left_ovary_dimensions)):
            if re.match(r'\d+\s*x\s*\d+$', str(data.left_ovary_dimensions).strip()):
                data.left_ovary_dimensions = str(data.left_ovary_dimensions) + ' mm'

        if data.right_ovary_dimensions and not ('mm' in str(data.right_ovary_dimensions) or 'cm' in str(data.right_ovary_dimensions)):
            if re.match(r'\d+\s*x\s*\d+$', str(data.right_ovary_dimensions).strip()):
                data.right_ovary_dimensions = str(data.right_ovary_dimensions) + ' mm'

        return data
    def _calculate_confidence(self, data: ExtractedData) -> float:
        """Confidence score based on field completeness"""
        important_fields = [
            'uterus_position', 'hysterometry', 'endometrium_thickness',
            'right_ovary_dimensions', 'left_ovary_dimensions',
            'right_ovary_cfa', 'left_ovary_cfa',
            'doppler_ip', 'doppler_ir'
        ]

        filled_fields = 0
        for field in important_fields:
            value = getattr(data, field, None)
            if value is not None and str(value).strip():
                filled_fields += 1

        return filled_fields / len(important_fields)
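
    # Editor's illustrative arithmetic (hypothetical values): with 6 of the 9
    # important fields filled, the score is 6/9 ≈ 0.67, which the report below
    # renders as "66.7%" via its {:.1%} formatting.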
    def _identify_missing_fields(self, data: ExtractedData) -> List[str]:
        """Identify the missing fields"""
        missing = []

        field_mapping = {
            'uterus_position': 'Position utérus',
            'hysterometry': 'Hystérométrie',
            'endometrium_thickness': 'Épaisseur endomètre',
            'zone_jonctionnelle_status': 'Zone jonctionnelle',
            'myomas_present': 'Présence myomes',
            'right_ovary_dimensions': 'Taille ovaire droit',
            'left_ovary_dimensions': 'Taille ovaire gauche',
            'right_ovary_cfa': 'CFA ovaire droit',
            'left_ovary_cfa': 'CFA ovaire gauche',
            'right_ovary_accessibility': 'Accessibilité ovaire droit',
            'left_ovary_accessibility': 'Accessibilité ovaire gauche',
            'doppler_ip': 'IP Doppler',
            'doppler_ir': 'IR Doppler'
        }

        for field, description in field_mapping.items():
            value = getattr(data, field, None)
            if value is None or (isinstance(value, str) and not value.strip()):
                missing.append(description)

        return missing

    def _dict_to_extracted_data(self, data_dict: Dict[str, Any]) -> ExtractedData:
        """Convert a JSON dictionary into an ExtractedData instance"""
        extracted = ExtractedData()

        field_mapping = {
            'uterus_position': 'uterus_position',
            'uterus_size': 'uterus_size',
            'hysterometry': 'hysterometry',
            'endometrium_thickness': 'endometrium_thickness',
            'myomas_present': 'myomas_present',
            'zone_jonctionnelle_status': 'zone_jonctionnelle_status',
            'adenomyosis_type': 'adenomyosis_type',
            'right_ovary_dimensions': 'right_ovary_dimensions',
            'right_ovary_cfa': 'right_ovary_cfa',
            'right_ovary_accessibility': 'right_ovary_accessibility',
            'left_ovary_dimensions': 'left_ovary_dimensions',
            'left_ovary_cfa': 'left_ovary_cfa',
            'left_ovary_accessibility': 'left_ovary_accessibility',
            'doppler_ip': 'doppler_ip',
            'doppler_ir': 'doppler_ir'
        }

        for json_key, attr_name in field_mapping.items():
            value = data_dict.get(json_key)
            if value is not None and str(value).strip() and str(value) != 'null':
                setattr(extracted, attr_name, value)

        return extracted
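
    # Editor's illustrative sketch (hypothetical payload): how a GPT JSON
    # answer is folded into ExtractedData; note that the literal string
    # "null" is skipped just like a real None.
    #
    #     agent._dict_to_extracted_data(
    #         {"uterus_position": "antéversé", "doppler_ip": "null", "hysterometry": "60"})
    #     # -> ExtractedData(uterus_position='antéversé', hysterometry='60', doppler_ip=None, ...)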
    def print_extraction_report(self, data: ExtractedData) -> str:
        """Build a formatted extraction report"""
        report = " RAPPORT D'EXTRACTION MÉDICALE\n"
        report += "=" * 50 + "\n\n"

        # Uterus
        report += " UTÉRUS:\n"
        report += f" Position: {data.uterus_position or '❌ Non trouvé'}\n"
        report += f" Taille: {data.uterus_size or '❌ Non trouvé'}\n"
        report += f" Hystérométrie: {data.hysterometry or '❌ Non trouvé'}\n"

        # Endometrium
        report += f"\n ENDOMÈTRE:\n"
        report += f" Épaisseur: {data.endometrium_thickness or '❌ Non trouvé'}\n"

        # Junctional zone
        report += f"\n ZONE JONCTIONNELLE:\n"
        report += f" Status: {data.zone_jonctionnelle_status or '❌ Non trouvé'}\n"
        report += f" Myomes présents: {data.myomas_present if data.myomas_present is not None else '❌ Non trouvé'}\n"
        report += f" Adénomyose: {data.adenomyosis_type or '❌ Non trouvé'}\n"

        # Ovaries
        report += f"\n OVAIRE DROIT:\n"
        report += f" Dimensions: {data.right_ovary_dimensions or '❌ Non trouvé'}\n"
        report += f" CFA: {data.right_ovary_cfa or '❌ Non trouvé'} follicules\n"
        report += f" Accessibilité: {data.right_ovary_accessibility or '❌ Non trouvé'}\n"

        report += f"\n OVAIRE GAUCHE:\n"
        report += f" Dimensions: {data.left_ovary_dimensions or '❌ Non trouvé'}\n"
        report += f" CFA: {data.left_ovary_cfa or '❌ Non trouvé'} follicules\n"
        report += f" Accessibilité: {data.left_ovary_accessibility or '❌ Non trouvé'}\n"

        # Doppler
        report += f"\n DOPPLER:\n"
        report += f" IP: {data.doppler_ip or '❌ Non trouvé'}\n"
        report += f" IR: {data.doppler_ir or '❌ Non trouvé'}\n"

        # Statistics
        report += f"\n STATISTIQUES:\n"
        report += f" Score de confiance: {data.extraction_confidence:.1%}\n"
        report += f" Champs manquants: {len(data.missing_fields)}\n"

        if data.missing_fields:
            report += f" Détails manquants: {', '.join(data.missing_fields)}\n"

        return report


def test_extraction():
    """Test the extraction on the problematic case"""
    # Transcription to analyse
    transcription = """Compte rendu classique. L'utérus est antéversé de taille 7,8 cm 60 d'hystérométrie
3,7 d'endomètre triangulaire zone jonctionnelle épaissie focale d'adénomyose diffuse fibromes
myomètre pas de fibromes. Le col voulut le laisser comme il est la morphologie triangulaire.
L'ovaire droit mesure 26 x 20 mm, 5 follicules. L'ovaire gauche accessibilité au maître rétro
thérape par contre l'autre il est normal il mesure 25 x 19 mm siège de CFA : 22 follicules.
Le Doppler : IP 3,24 - IR 0,91 et le reste tout en fait qui est l'ovaire gauche d'accès
difficile à rétro-utérin."""

    # Create the agent
    agent = MedicalNERAgent()

    # Run the extraction
    result = agent.extract_medical_entities(transcription)

    # Print the report
    print(agent.print_extraction_report(result))

    # Targeted check that the ovary disambiguation holds
    print("\n🔍 VÉRIFICATION SPÉCIFIQUE DES OVAIRES:")
    print(f"Ovaire droit - Dimensions: {result.right_ovary_dimensions}")
    print(f"Ovaire droit - CFA: {result.right_ovary_cfa}")
    print(f"Ovaire droit - Accessibilité: {result.right_ovary_accessibility}")
    print(f"Ovaire gauche - Dimensions: {result.left_ovary_dimensions}")
    print(f"Ovaire gauche - CFA: {result.left_ovary_cfa}")
    print(f"Ovaire gauche - Accessibilité: {result.left_ovary_accessibility}")


if __name__ == "__main__":
    test_extraction()
type3_preprocessing.py
ADDED
@@ -0,0 +1,739 @@
import spacy
import openai
import re
from typing import Dict, List, Tuple
import json
from dataclasses import dataclass
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import os
from dotenv import load_dotenv
from openai import AzureOpenAI
from medkit.core.text import TextDocument
from medkit.text.ner.hf_entity_matcher import HFEntityMatcher

NER_MODEL = os.getenv("NER_MODEL", "medkit/DrBERT-CASM2")

# Load the environment variables from .env
load_dotenv()

# Read the settings
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-05-01-preview")

# Environment-variable validation
def validate_azure_config():
    """Check that every Azure variable is configured"""
    missing_vars = []
    if not AZURE_OPENAI_KEY:
        missing_vars.append("AZURE_OPENAI_KEY")
    if not AZURE_OPENAI_ENDPOINT:
        missing_vars.append("AZURE_OPENAI_ENDPOINT")
    if not AZURE_OPENAI_DEPLOYMENT:
        missing_vars.append("AZURE_OPENAI_DEPLOYMENT")

    if missing_vars:
        print(f"❌ Variables d'environnement manquantes: {', '.join(missing_vars)}")
        print("📝 Veuillez créer un fichier .env avec:")
        for var in missing_vars:
            print(f" {var}=votre_valeur")
        return False
    return True

# Azure OpenAI client with validation
azure_client = None
if validate_azure_config():
    try:
        azure_client = AzureOpenAI(
            api_key=AZURE_OPENAI_KEY,
            api_version=AZURE_OPENAI_API_VERSION,
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
        )
        print("✅ Client Azure OpenAI initialisé avec succès")
    except Exception as e:
        print(f"❌ Erreur lors de l'initialisation du client Azure OpenAI: {e}")
        azure_client = None
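
# Editor's note: a hypothetical .env layout matching the variables read above
# (placeholder values, not real credentials):
#
#     AZURE_OPENAI_KEY=<your-key>
#     AZURE_OPENAI_ENDPOINT=https://<resource>.openai.azure.com/
#     AZURE_OPENAI_DEPLOYMENT=<deployment-name>
#     AZURE_OPENAI_API_VERSION=2024-05-01-preview
#     NER_MODEL=medkit/DrBERT-CASM2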
ner_matcher = HFEntityMatcher(model=NER_MODEL)


@dataclass
class CorrectionResult:
    original_text: str
    ner_corrected_text: str
    final_corrected_text: str
    medical_entities: List[Dict]
    confidence_score: float


class MedicalNERCorrector:
    """Spelling corrector built on a French medical NER"""

    def __init__(self):
        try:
            # Load the MedKit NER model
            self.matcher = HFEntityMatcher(model=NER_MODEL)
            print(f"✅ Modèle NER '{NER_MODEL}' chargé avec succès")
        except Exception as e:
            print(f"❌ Erreur lors du chargement du modèle NER {NER_MODEL}: {e}")
            self.matcher = None

        # Full dictionary converting spelled-out numbers to digits
        self.number_corrections = {
            # Base numbers
            "zéro": "0", "un": "1", "deux": "2", "trois": "3", "quatre": "4",
            "cinq": "5", "six": "6", "sept": "7", "huit": "8", "neuf": "9",
            "dix": "10", "onze": "11", "douze": "12", "treize": "13", "quatorze": "14",
            "quinze": "15", "seize": "16", "dix-sept": "17", "dix-huit": "18",
            "dix-neuf": "19", "vingt": "20", "trente": "30", "quarante": "40",
            "cinquante": "50", "soixante": "60", "soixante-dix": "70",
            "quatre-vingts": "80", "quatre-vingt": "80", "quatre-vingt-dix": "90",
            "cent": "100", "mille": "1000",

            # Common variants in voice transcriptions
            "1": "1", "1er": "1", "première": "1", "premier": "1",
            "2ème": "2", "deuxième": "2", "second": "2", "seconde": "2",
            "3ème": "3", "troisième": "3", "4ème": "4", "quatrième": "4",
            "5ème": "5", "cinquième": "5", "6ème": "6", "sixième": "6",
            "7ème": "7", "septième": "7", "8ème": "8", "huitième": "8",
            "9ème": "9", "neuvième": "9", "10ème": "10", "dixième": "10",
        }
        # Voice-transcription corrections dictionary - ORDER MATTERS
        self.vocal_corrections = {
            # Punctuation corrections - must be handled first
            "point à la ligne": ".\n",
            "retour à la ligne": "\n",
            "à la ligne": "\n",
            "nouvelle ligne": "\n",
            "saut de ligne": "\n",
            "point virgule": ";",
            "deux points": ":",
            "point d'interrogation": "?",
            "point d'exclamation": "!",
            "virgule": ",",
            "point": ".",  # Must be handled last to avoid conflicts

            # MRI sequence corrections (uppercase "T un"-"T trois" are covered
            # by the thoracic levels below; the duplicate keys were removed)
            "t un": "T1", "t deux": "T2", "t trois": "T3",
            "séquence T un": "séquence T1", "séquence T deux": "séquence T2",

            # Vertebral-level corrections - cervical
            "C un": "C1", "C deux": "C2", "C trois": "C3", "C quatre": "C4",
            "C cinq": "C5", "C six": "C6", "C sept": "C7",
            "c un": "C1", "c deux": "C2", "c trois": "C3", "c quatre": "C4",
            "c cinq": "C5", "c six": "C6", "c sept": "C7",

            # Thoracic levels
            "T un": "T1", "T deux": "T2", "T trois": "T3", "T quatre": "T4",
            "T cinq": "T5", "T six": "T6", "T sept": "T7", "T huit": "T8",
            "T neuf": "T9", "T dix": "T10", "T onze": "T11", "T douze": "T12",

            # Lumbar levels
            "L un": "L1", "L deux": "L2", "L trois": "L3", "L quatre": "L4", "L cinq": "L5",
            "l un": "L1", "l deux": "L2", "l trois": "L3", "l quatre": "L4", "l cinq": "L5",

            # Sacral levels
            "S un": "S1", "S deux": "S2", "S trois": "S3", "S quatre": "S4", "S cinq": "S5",
            "s un": "S1", "s deux": "S2", "s trois": "S3", "s quatre": "S4", "s cinq": "S5",
        }
        # Specialist medical spelling corrections
        self.medical_corrections = {
            # Anatomy
            "rachis": ["rachis", "rachi", "rachys", "rahis", "raxis"],
            "cervical": ["cervical", "cervicale", "cervicaux", "servical", "servicale"],
            "vertébraux": ["vertébraux", "vertebraux", "vertébrau", "vertébral", "vertebral"],
            "médullaire": ["médullaire", "medullaire", "medulaire", "médulaire"],
            "foraminal": ["foraminal", "foraminale", "foraminaux", "forraminal"],
            "postérolatéral": ["postérolatéral", "posterolatéral", "postero-latéral", "postero latéral"],
            "antérolatéral": ["antérolatéral", "anterolatéral", "antero-latéral", "antero latéral"],
            "longitudinal": ["longitudinal", "longitudinale", "longitudinaux"],

            # Pathologies
            "uncarthrose": ["uncarthrose", "uncoarthrose", "uncartrose", "unkarthrose"],
            "lordose": ["lordose", "lordoze", "lordosse"],
            "cyphose": ["cyphose", "siphose", "kyphose", "kiphose"],
            "scoliose": ["scoliose", "skoliose", "scholiose"],
            "discopathie": ["discopathie", "disccopathie", "discopatie"],
            "discal": ["discal", "discale", "diskal", "diskale", "disque"],
            "hernie": ["hernie", "herny", "herni"],
            "protrusion": ["protrusion", "protusion", "protruzion"],
            "sténose": ["sténose", "stenose", "sténoze"],
            "arthrose": ["arthrose", "artrose", "arthroze"],
            "ostéophyte": ["ostéophyte", "osteophyte", "ostéofite"],
            "ligamentaire": ["ligamentaire", "ligamentere", "ligamentair"],

            # Techniques and examinations
            "sagittal": ["sagittal", "sagittale", "sagital", "sagittaux"],
            "coronal": ["coronal", "coronale", "coronaux"],
            "axial": ["axial", "axiale", "axiaux"],
            "transversal": ["transversal", "transversale", "transversaux"],
            "pondéré": ["pondéré", "pondere", "pondérée"],
            "séquence": ["séquence", "sequence", "sekence"],
            "contraste": ["contraste", "kontraste"],
            "gadolinium": ["gadolinium", "gadoliniun", "gadoliniom"],

            # Measurements and directions
            "millimètre": ["millimètre", "millimetre", "mm"],
            "centimètre": ["centimètre", "centimetre", "cm"],
            "gauche": ["gauche", "gosh", "goshe", "goche"],
            "droite": ["droite", "droitte", "droithe", "droitr"],
            "antérieur": ["antérieur", "anterieur", "antérieure", "anterieure"],
            "postérieur": ["postérieur", "posterieur", "postérieure", "posterieure"],
            "supérieur": ["supérieur", "superieur", "supérieure", "superieure"],
            "inférieur": ["inférieur", "inferieur", "inférieure", "inferieure"],
            "médian": ["médian", "median", "mediane", "médiane"],
            "latéral": ["latéral", "lateral", "laterale", "latérale"],

            # Signals and appearance
            "signal": ["signal", "signale", "signa", "signaux"],
            "hypersignal": ["hypersignal", "hyper signal", "hypersignale"],
            "hyposignal": ["hyposignal", "hypo signal", "hyposignale"],
            "isosignal": ["isosignal", "iso signal", "isosignale"],
            "hétérogène": ["hétérogène", "heterogene"],
            "homogène": ["homogène", "homogene", "omogene"],

            # Other frequent terms
            "dimension": ["dimension", "dimention", "dimmension"],
            "normale": ["normale", "normal", "normalle"],
            "anomalie": ["anomalie", "annomalie", "anomaly"],
            "décelable": ["décelable", "decelabl", "décellabl"],
            "absence": ["absence", "abscence", "absance"],
            "présence": ["présence", "presence", "presance"],
            "contact": ["contact", "contacte", "kontak"],
            "compression": ["compression", "compresion", "kompression"],
        }

        # Regular expressions for medical patterns
        self.medical_patterns = {
            "vertebral_level": r"[CTLS]\d+[\s-]*[CTLS]\d+",
            "measurement": r"\d+[\s]*[x×]\s*\d+\s*mm",
            "technique": r"T[1-3]",
        }
    def convert_numbers_to_digits(self, text: str) -> str:
        """Convert ALL spelled-out numbers to digits"""
        corrected_text = text

        # STEP 1: special handling of common medical measurements
        medical_measures = {
            # Common uterus measurements
            "sept point huit": "7,8",
            "trois sept": "3,7",
            "soixante": "60",

            # Common ovary measurements
            "vingt six": "26",
            "vingt cinq": "25",
            "dix neuf": "19",
            "vingt deux": "22",

            # Doppler measurements
            "trois vingt quatre": "3,24",  # IP
            "quatre vingt onze": "0,91",   # IR (decimal comma)

            # Other common measurements
            "quinze": "15",
        }

        # Apply the medical measurements first
        for word_measure, digit_measure in medical_measures.items():
            pattern = r'\b' + re.escape(word_measure) + r'\b'
            corrected_text = re.sub(pattern, digit_measure, corrected_text, flags=re.IGNORECASE)

        # STEP 2: remaining compound numbers
        compound_without_dash = {
            "vingt un": "21", "vingt deux": "22", "vingt trois": "23", "vingt quatre": "24",
            "vingt cinq": "25", "vingt six": "26", "vingt sept": "27", "vingt huit": "28",
            "vingt neuf": "29", "trente un": "31", "trente deux": "32", "trente trois": "33",
            "trente quatre": "34", "trente cinq": "35", "trente six": "36", "trente sept": "37",
            "trente huit": "38", "trente neuf": "39", "quarante un": "41", "quarante deux": "42",
            "quarante trois": "43", "quarante quatre": "44", "quarante cinq": "45",
            "quarante six": "46", "quarante sept": "47", "quarante huit": "48", "quarante neuf": "49",
            "cinquante un": "51", "cinquante deux": "52", "cinquante trois": "53",
            "cinquante quatre": "54", "cinquante cinq": "55", "cinquante six": "56",
            "cinquante sept": "57", "cinquante huit": "58", "cinquante neuf": "59",
            "soixante un": "61", "soixante deux": "62", "soixante trois": "63",
            "soixante quatre": "64", "soixante cinq": "65", "soixante six": "66",
            "soixante sept": "67", "soixante huit": "68", "soixante neuf": "69",
            "soixante et onze": "71", "soixante douze": "72", "soixante treize": "73",
            "soixante quatorze": "74", "soixante quinze": "75", "soixante seize": "76",
            "soixante dix sept": "77", "soixante dix huit": "78", "soixante dix neuf": "79",
            "quatre vingt un": "81", "quatre vingt deux": "82", "quatre vingt trois": "83",
            "quatre vingt quatre": "84", "quatre vingt cinq": "85", "quatre vingt six": "86",
            "quatre vingt sept": "87", "quatre vingt huit": "88", "quatre vingt neuf": "89",
            "quatre vingt onze": "91", "quatre vingt douze": "92", "quatre vingt treize": "93",
            "quatre vingt quatorze": "94", "quatre vingt quinze": "95", "quatre vingt seize": "96",
            "quatre vingt dix sept": "97", "quatre vingt dix huit": "98", "quatre vingt dix neuf": "99",
        }

        for word, digit in compound_without_dash.items():
            # Guard: do NOT replace when followed by "fois" plus another number
            pattern = r'\b' + re.escape(word) + r'\b(?!\s+fois\s+\w+)'
            corrected_text = re.sub(pattern, digit, corrected_text, flags=re.IGNORECASE)

        # STEP 3: simple numbers (ordering adjusted to avoid conflicts)
        simple_numbers = {
            "zéro": "0", "un": "1", "deux": "2", "trois": "3", "quatre": "4",
            "cinq": "5", "six": "6", "sept": "7", "huit": "8", "neuf": "9",
            "dix": "10", "onze": "11", "douze": "12", "treize": "13", "quatorze": "14",
            "quinze": "15", "seize": "16", "dix-sept": "17", "dix-huit": "18",
            "dix-neuf": "19", "vingt": "20", "trente": "30", "quarante": "40",
            "cinquante": "50", "soixante-dix": "70",
            "quatre-vingts": "80", "quatre-vingt": "80", "quatre-vingt-dix": "90",
            "cent": "100", "mille": "1000",
        }

        # Convert the simple numbers
        for word_number, digit in simple_numbers.items():
            pattern = r'\b' + re.escape(word_number) + r'\b'
            corrected_text = re.sub(pattern, digit, corrected_text, flags=re.IGNORECASE)

        return corrected_text
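
    # Editor's illustrative sketch (hypothetical input): the medical-measure
    # pass must run first, otherwise the generic compound pass would turn
    # "quatre vingt onze" into "91" instead of the Doppler reading "0,91".
    #
    #     corrector.convert_numbers_to_digits("IR quatre vingt onze et vingt six follicules")
    #     # -> "IR 0,91 et 26 follicules"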
    def extract_medical_entities(self, text: str):
        """Extract medical entities with the MedKit HFEntityMatcher"""
        if not self.matcher:
            return []
        doc = TextDocument(text)
        entities = self.matcher.run([doc.raw_segment])
        # Reduce to a simple format
        formatted_entities = []
        for ent in entities:
            formatted_entities.append({
                "text": ent.text,
                "label": ent.label,
            })
        return formatted_entities

    def correct_vocal_transcription(self, text: str) -> str:
        """Correct dictated voice commands with a strict priority order."""
        corrected_text = text

        # STEP 1: convert spelled-out numbers BEFORE anything else
        corrected_text = self.convert_numbers_to_digits(corrected_text)

        # STEP 2: vocal-expression corrections in priority order.
        # The order is CRUCIAL to avoid conflicts (longest expressions first).
        priority_corrections = [
            # Complex punctuation expressions first
            ("point à la ligne", ".\n"),
            ("retour à la ligne", "\n"),
            ("à la ligne", "\n"),
            ("nouvelle ligne", "\n"),
            ("saut de ligne", "\n"),
            ("point virgule", ";"),
            # NOTE: number conversion runs in STEP 1, so a dictated "deux points"
            # may already read "2 points" by this stage and this entry may not match.
            ("deux points", ":"),
            ("point d'interrogation", "?"),
            ("point d'exclamation", "!"),

            # Vertebral levels dictated with a space before the digit
            ("C 1", "C1"), ("C 2", "C2"), ("C 3", "C3"), ("C 4", "C4"),
            ("C 5", "C5"), ("C 6", "C6"), ("C 7", "C7"),
            ("L 1", "L1"), ("L 2", "L2"), ("L 3", "L3"), ("L 4", "L4"), ("L 5", "L5"),
            ("T 1", "T1"), ("T 2", "T2"), ("T 3", "T3"), ("T 4", "T4"),
            ("T 5", "T5"), ("T 6", "T6"), ("T 7", "T7"), ("T 8", "T8"),
            ("T 9", "T9"), ("T 10", "T10"), ("T 11", "T11"), ("T 12", "T12"),

            # MRI sequences (the T-level entries above already cover these)
            ("séquence T 1", "séquence T1"), ("séquence T 2", "séquence T2"),

            # Comma last, to avoid conflicts with the expressions above
            ("virgule", ","),
        ]

        for vocal_term, replacement in priority_corrections:
            # Word boundaries avoid partial replacements
            pattern = r'\b' + re.escape(vocal_term) + r'\b'
            corrected_text = re.sub(pattern, replacement, corrected_text, flags=re.IGNORECASE)

        # STEP 3: replace a bare "point" only when it does not start a longer
        # expression ("point à …", "point d'…", "point virgule"); the trailing \b
        # keeps whole words such as "points" or "pointillé" intact.
        corrected_text = re.sub(r'\bpoint\b(?!\s+(?:à|d\'|virgule))', '.', corrected_text, flags=re.IGNORECASE)

        return corrected_text
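
    # Illustrative trace for this method (a sketch):
    #   "adénomyose point à la ligne pas d'épanchement point"
    #   -> "adénomyose .\n pas d'épanchement ."
    # ("point à la ligne" is consumed by the priority list before the bare-"point"
    #  fallback runs; stray spaces around the punctuation are removed later by
    #  clean_spacing_and_formatting)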

    def correct_medical_terms(self, text: str) -> str:
        """Correct medical terms using the correction dictionary."""
        corrected_text = text

        for correct_term, variations in self.medical_corrections.items():
            for variation in variations:
                if variation != correct_term:  # avoid replacing a term with itself
                    # Replace while preserving the case of the first character
                    pattern = r'\b' + re.escape(variation) + r'\b'

                    def replace_with_case(match):
                        matched_text = match.group(0)
                        if matched_text[0].isupper():
                            return correct_term.capitalize()
                        return correct_term

                    corrected_text = re.sub(pattern, replace_with_case, corrected_text, flags=re.IGNORECASE)

        return corrected_text
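
    # Illustrative (assumes self.medical_corrections maps a canonical term to its
    # observed variants, e.g. "adénomyose" -> ["adenomyose"]):
    #   "Adenomyose diffuse" -> "Adénomyose diffuse"   (leading capital preserved)
    #   "une adenomyose"     -> "une adénomyose"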

    def normalize_medical_patterns(self, text: str) -> str:
        """Normalize medical patterns, including measurements."""
        normalized_text = text

        # Dictated dimensions: "nombre fois nombre" -> "nombre x nombre"
        normalized_text = re.sub(r'(\d+(?:[.,]\d+)?)\s+fois\s+(\d+(?:[.,]\d+)?)', r'\1 x \2', normalized_text, flags=re.IGNORECASE)

        # Vertebral level pairs: "C5 C6" / "C5c6" -> "C5-C6", "T12 L1" -> "T12-L1".
        # A separator (whitespace or a level letter) is required between the two
        # levels, so a lone two-digit level such as "T10" is left intact.
        normalized_text = re.sub(
            r'\b([CTLS])(\d{1,2})(?:\s+|\s*([CTLS])\s*)(\d{1,2})\b',
            lambda m: f"{m.group(1).upper()}{m.group(2)}-{(m.group(3) or m.group(1)).upper()}{m.group(4)}",
            normalized_text,
            flags=re.IGNORECASE,
        )

        # Re-space existing measurements (e.g. "72x40mm" -> "72 x 40 mm")
        normalized_text = re.sub(r'(\d+(?:[.,]\d+)?)\s*[x×]\s*(\d+(?:[.,]\d+)?)\s*mm', r'\1 x \2 mm', normalized_text)

        # Add the missing unit to bare dimensions ("nombre x nombre" -> "nombre x nombre mm")
        normalized_text = re.sub(r'(\d+(?:[.,]\d+)?)\s*x\s*(\d+(?:[.,]\d+)?)(?!\s*(?:mm|cm))', r'\1 x \2 mm', normalized_text, flags=re.IGNORECASE)

        # Spelled-out millimetres
        normalized_text = re.sub(r'(\d+(?:[.,]\d+)?)\s*millimètres?', r'\1 mm', normalized_text, flags=re.IGNORECASE)

        # Hysterometry measurements (special format)
        normalized_text = re.sub(r"d['’]?hystérométrie\s+(\d+(?:[.,]\d+)?)", r"d'hystérométrie : \1 mm", normalized_text, flags=re.IGNORECASE)

        # Endometrium measurements
        normalized_text = re.sub(r"d['’]?endomètre\s+(\d+(?:[.,]\d+)?)", r"d'endomètre : \1 mm", normalized_text, flags=re.IGNORECASE)

        # CFA (antral follicle count)
        normalized_text = re.sub(r'(\d+)\s+follicules', r'CFA \1 follicules', normalized_text, flags=re.IGNORECASE)

        return normalized_text
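
    # Illustrative traces through the rules above (a sketch):
    #   "30 fois 25"      -> "30 x 25"   -> "30 x 25 mm"   (missing unit added)
    #   "72x40mm"         -> "72 x 40 mm"
    #   "40 millimètres"  -> "40 mm"
    #   "C5 C6" / "C5c6"  -> "C5-C6"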

    def clean_spacing_and_formatting(self, text: str) -> str:
        """Clean up spacing and improve formatting, with corpus-specific fixes."""
        # Collapse runs of spaces/tabs but preserve line breaks
        text = re.sub(r'[ \t]+', ' ', text)

        # Measurement fix: "7. 8" -> "7,8" (decimal dictated with a stray period)
        text = re.sub(r'(\d+)\.\s+(\d+)(?!\s*(?:mm|cm|fois|x))', r'\1,\2', text)

        # Re-join numbers the word-by-word conversion split apart
        # (e.g. "vingt six" -> "20 6" -> "26"); these patterns are corpus-specific.
        text = re.sub(r'\b20\s+6\b', '26', text)
        text = re.sub(r'\b20\s+5\b', '25', text)
        text = re.sub(r'\b10\s+9\b', '19', text)
        text = re.sub(r'\b20\s+2\b', '22', text)
        text = re.sub(r'\b20\s+7\b', '27', text)
        text = re.sub(r'\b3\s+20\s+4\b', '3,24', text)
        text = re.sub(r'\b4\s+20\s+11\b', '0,91', text)

        # Punctuation: drop the space before periods, commas, etc.
        text = re.sub(r'\s+([.,:;!?])', r'\1', text)

        # Add a space after punctuation when a letter follows directly
        text = re.sub(r'([.,:;!?])([A-Za-z])', r'\1 \2', text)

        # Restore elided apostrophes; accented vowels and mute "h" are included so
        # that "d hystérométrie" and "l épaisseur" are handled as intended.
        text = re.sub(r"\bl\s+([aeiouhéèêàâîïôûAEIOUHÉÈÊÀÂÎÏÔÛ])", r"l'\1", text)  # "l ovaire" -> "l'ovaire"
        text = re.sub(r"\bd\s+([aeiouhéèêàâîïôûAEIOUHÉÈÊÀÂÎÏÔÛ])", r"d'\1", text)  # "d hystérométrie" -> "d'hystérométrie"

        # Collapse runs of blank lines (max two consecutive newlines)
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)

        # Strip leading/trailing whitespace on each line
        lines = text.split('\n')
        lines = [line.strip() for line in lines]
        text = '\n'.join(lines)

        # Capitalize after a period followed by a space
        text = re.sub(r'(\.\s+)([a-z])', lambda m: m.group(1) + m.group(2).upper(), text)

        # Capitalize the very first character
        if text and text[0].islower():
            text = text[0].upper() + text[1:]

        return text.strip()
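
    # Illustrative (a sketch):
    #   "l ovaire droit mesure 7. 8" -> "L'ovaire droit mesure 7,8"
    # (decimal re-joined, apostrophe restored, leading capital applied)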

    def post_process_gynecology_report(self, text: str) -> str:
        """Specialised post-processing for gynecology reports."""
        processed_text = text

        # Structure uterus measurements
        processed_text = re.sub(
            r'utérus est (\w+)\s+(\d+,\d+)',
            r'utérus est \1 de taille \2 cm',
            processed_text,
            flags=re.IGNORECASE
        )

        # Structure ovary measurements
        processed_text = re.sub(
            r'ovaire (droit|gauche) (\d+ x \d+ mm)',
            r'ovaire \1 mesure \2,',
            processed_text,
            flags=re.IGNORECASE
        )

        # Make the CFA (antral follicle count) more readable
        processed_text = re.sub(
            r'CFA (\d+) follicules',
            r'CFA : \1 follicules',
            processed_text,
            flags=re.IGNORECASE
        )

        # Format Doppler indices
        processed_text = re.sub(
            r'doppler.*?(\d,\d+).*?(\d,\d+)',
            r'Doppler : IP \1 - IR \2',
            processed_text,
            flags=re.IGNORECASE
        )

        return processed_text
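
    # Illustrative traces (a sketch):
    #   "ovaire droit 30 x 25 mm" -> "ovaire droit mesure 30 x 25 mm,"
    #   "CFA 12 follicules"       -> "CFA : 12 follicules"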

class GPTMedicalFormatter:
    """Medical report formatter backed by GPT (Azure OpenAI)."""

    def __init__(self, model: str = AZURE_OPENAI_DEPLOYMENT):
        self.model = model

        self.system_prompt = """
Tu es un expert en transcription médicale française. Tu dois corriger et formater UNIQUEMENT les erreurs évidentes dans ce texte médical déjà pré-traité.

RÈGLES STRICTES À APPLIQUER :

1. **PONCTUATION** :
- Supprime les doubles ponctuations : ",." → "."
- Supprime ".." → "."
- Corrige ",?" → "?"

2. **PARENTHÈSES** déjà converties mais nettoie si nécessaire

3. **ORTHOGRAPHE MÉDICALE** :
- "supérieur" au lieu de "supérieure" pour les adjectifs masculins
- "Discrète" → "Discret" pour les termes masculins
- Autres termes médicaux mal orthographiés

4. **FORMATAGE** :
- Assure-toi que chaque phrase se termine par un point
- Capitalise après les points
- Supprime les espaces inutiles

5. **INTERDICTIONS** :
- NE change PAS le contenu médical
- NE reformule PAS les phrases
- NE change PAS l'ordre des informations
- NE supprime PAS d'informations médicales

OBJECTIF : Rendre le texte médical propre et professionnel en gardant EXACTEMENT le même contenu.

Texte à corriger :
"""

    def format_medical_report(self, text: str) -> str:
        """Format the medical report with GPT."""
        if not azure_client:
            print("❌ Client Azure OpenAI non disponible - utilisation du texte NER seulement")
            return text

        try:
            print("🔄 Appel à l'API Azure OpenAI en cours...")
            response = azure_client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": f"Corrigez et formatez cette transcription médicale en préservant tous les sauts de ligne et le contenu médical:\n\n{text}"}
                ],
                # max_tokens=2000,
                # temperature=0.1
            )
            result = response.choices[0].message.content.strip()
            print("✅ Réponse reçue de l'API Azure OpenAI")
            return result

        except Exception as e:
            print(f"❌ Erreur lors de l'appel à l'API Azure OpenAI: {e}")
            print(f"   Type d'erreur: {type(e).__name__}")
            if hasattr(e, 'response'):
                print(f"   Code de statut: {e.response.status_code if hasattr(e.response, 'status_code') else 'N/A'}")
            print("🔄 Utilisation du texte corrigé par NER seulement")
            return text
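
# Usage sketch (illustrative; requires a configured Azure OpenAI client):
#   formatter = GPTMedicalFormatter()
#   report = formatter.format_medical_report("utérus antéversé ,. mesure 72 x 40 mm")
# On any API failure the formatter deliberately returns its input unchanged, so the
# pipeline still yields the NER-corrected text.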

class MedicalTranscriptionProcessor:
    """Main processor for medical transcriptions"""

    def __init__(self, deployment: str = AZURE_OPENAI_DEPLOYMENT):
        self.ner_corrector = MedicalNERCorrector()
        self.gpt_formatter = GPTMedicalFormatter(deployment)

    def process_transcription(self, text: str) -> CorrectionResult:
        """Process a complete medical transcription - the two stages (NER + GPT) are mandatory."""
        print("🏥 Démarrage du traitement de la transcription médicale...")
        print("⚠️ TRAITEMENT EN 2 ÉTAPES OBLIGATOIRES: NER + GPT")

        # =================== STAGE 1: NER CORRECTIONS ===================
        print("\n🔧 ÉTAPE 1/2: CORRECTIONS NER (Nombres, Ponctuation, Orthographe)")
        print("-" * 60)

        # Sub-step 1.1: vocal-transcription corrections (includes number conversion)
        print("   🎤 Correction des transcriptions vocales et conversion des nombres...")
        vocal_corrected = self.ner_corrector.correct_vocal_transcription(text)

        # Sub-step 1.2: medical entity extraction
        print("   📋 Extraction des entités médicales...")
        medical_entities = self.ner_corrector.extract_medical_entities(vocal_corrected)
        print(f"   ✅ {len(medical_entities)} entités médicales détectées")

        # Sub-step 1.3: spelling correction of medical terms
        print("   ✏️ Correction orthographique des termes médicaux...")
        ner_corrected = self.ner_corrector.correct_medical_terms(vocal_corrected)

        # Sub-step 1.4: normalization of medical patterns
        print("   🔧 Normalisation des patterns médicaux...")
        ner_corrected = self.ner_corrector.normalize_medical_patterns(ner_corrected)

        # Sub-step 1.5: gynecology-specific post-processing
        # (clean_spacing_and_formatting is only invoked by process_without_gpt)
        print("   🧹 Nettoyage du formatage...")
        ner_corrected = self.ner_corrector.post_process_gynecology_report(ner_corrected)

        print("✅ ÉTAPE 1 TERMINÉE: Corrections NER appliquées")

        # =================== STAGE 2: GPT FORMATTING ===================
        print("\n🤖 ÉTAPE 2/2: FORMATAGE PROFESSIONNEL AVEC GPT")
        print("-" * 60)
        print("   📝 Structuration du rapport médical...")
        print("   🎯 Amélioration de la lisibilité...")
        print("   📋 Organisation en sections médicales...")

        final_corrected = self.gpt_formatter.format_medical_report(ner_corrected)

        if final_corrected != ner_corrected:
            print("✅ ÉTAPE 2 TERMINÉE: Formatage GPT appliqué avec succès")
        else:
            print("⚠️ ÉTAPE 2: GPT non disponible - utilisation du résultat NER")

        # Confidence score
        confidence_score = self._calculate_confidence_score(text, final_corrected, medical_entities)

        print(f"\n🎯 TRAITEMENT COMPLET TERMINÉ - Score de confiance: {confidence_score:.2%}")

        return CorrectionResult(
            original_text=text,
            ner_corrected_text=ner_corrected,
            final_corrected_text=final_corrected,
            medical_entities=medical_entities,
            confidence_score=confidence_score
        )
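
    # Usage sketch for the result object (illustrative):
    #   result = processor.process_transcription(raw_text)
    #   result.final_corrected_text   # GPT-formatted report (NER text on fallback)
    #   result.medical_entities       # list of {"text", "label"} dicts
    #   result.confidence_score       # rough heuristic in [0, 1]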

    def process_without_gpt(self, text: str) -> str:
        """NER-only pipeline (for tests); skips the GPT formatting stage."""
        print("⚠️ ATTENTION: Traitement partiel sans GPT (pour tests uniquement)")
        print("💡 Pour un résultat professionnel, utilisez process_transcription() avec une clé API")

        vocal_corrected = self.ner_corrector.correct_vocal_transcription(text)
        medical_corrected = self.ner_corrector.correct_medical_terms(vocal_corrected)
        normalized = self.ner_corrector.normalize_medical_patterns(medical_corrected)
        cleaned = self.ner_corrector.clean_spacing_and_formatting(normalized)
        return cleaned

    def _calculate_confidence_score(self, original: str, corrected: str, entities: List[Dict]) -> float:
        """Compute a rough confidence score for the correction."""
        entity_score = min(len(entities) / 10, 1.0)
        original_words = set(original.split())
        if not original_words:
            # Guard against empty input (avoids a ZeroDivisionError)
            return entity_score
        similarity_score = len(original_words & set(corrected.split())) / len(original_words)
        return (entity_score + similarity_score) / 2
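
# Worked example of the confidence heuristic above (illustrative numbers):
#   8 detected entities -> entity_score     = min(8 / 10, 1.0) = 0.8
#   90 % word overlap   -> similarity_score = 0.9
#   final score         -> (0.8 + 0.9) / 2  = 0.85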

def test_azure_connection():
    """Azure OpenAI connectivity test."""
    if not azure_client:
        print("❌ Client Azure OpenAI non initialisé")
        return False

    try:
        print("🔍 Test de connexion à Azure OpenAI...")
        response = azure_client.chat.completions.create(
            model=AZURE_OPENAI_DEPLOYMENT,
            messages=[{"role": "user", "content": "Test de connexion"}]
            # max_tokens=10
        )
        print("✅ Connexion Azure OpenAI réussie")
        return True
    except Exception as e:
        print(f"❌ Erreur de connexion Azure OpenAI: {e}")
        return False

def main():
    """Demonstration entry point."""

    # Check the Azure configuration
    print("=" * 80)
    print("🔧 VÉRIFICATION DE LA CONFIGURATION")
    print("=" * 80)

    print(f"📍 Endpoint Azure: {AZURE_OPENAI_ENDPOINT}")
    print(f"🤖 Deployment: {AZURE_OPENAI_DEPLOYMENT}")
    print(f"🔑 Clé API: {'✅ Configurée' if AZURE_OPENAI_KEY else '❌ Manquante'}")

    # Connectivity test
    if not test_azure_connection():
        print("\n⚠️ Azure OpenAI non disponible - le traitement continuera avec NER seulement")

    # Sample transcription exhibiting the known problems
    exemple_transcription = """irm pelvienne indication clinique point technique acquisition sagittale axiale et coronale t deux saturation axiale diffusion axiale t un résultats présence d un utérus antéversé médio pelvien dont le grand axe mesure soixante douze mm sur quarante millimètre sur quarante mm point la zone jonctionnelle apparaît floue point elle est épaissie de façon diffuse asymétrique avec une atteinte de plus de cinquante pour cent de l épaisseur du myomètre et comporte des spots en hypersignal t deux l ensemble traduisant une adénomyose point à la ligne pas d épaississement cervical à noter la présence d un petit kyste liquidien de type naboth point à la ligne les deux ovaires sont repérés porteurs de formations folliculaires communes en hypersignal homogène t deux de petite taille point l ovaire droit mesure trente fois vingt cinq mm l ovaire gauche vingt cinq fois vingt trois mm point pas d épanchement dans le cul de sac de douglas point à la ligne absence de foyer d endométriose profonde point conclusion points à la ligne aspect d adénomyose diffuse symétrique virgule profonde point à la ligne pas d épaississement endométrial point absence d endométriome point absence d épanchement dans le cul de sac de douglas point"""

    # Initialise the processor
    processor = MedicalTranscriptionProcessor(AZURE_OPENAI_DEPLOYMENT)

    print("\n" + "=" * 80)
    print("🏥 TRAITEMENT COMPLET DE LA TRANSCRIPTION MÉDICALE")
    print("=" * 80)

    # Full processing with GPT (recommended)
    result = processor.process_transcription(exemple_transcription)

    # Display the full results
    print("\n📄 TEXTE ORIGINAL:")
    print("-" * 50)
    print(result.original_text)

    print(f"\n🔍 ENTITÉS MÉDICALES DÉTECTÉES ({len(result.medical_entities)}):")
    print("-" * 50)
    for entity in result.medical_entities:
        print(f"   • {entity['text']} ({entity['label']})")

    print("\n🎤 APRÈS CORRECTION NER (sans GPT):")
    print("-" * 50)
    print(result.ner_corrected_text)

    print("\n🤖 RAPPORT FINAL FORMATÉ (avec GPT):")
    print("-" * 50)
    if result.final_corrected_text:
        print(result.final_corrected_text)
    else:
        print("❌ Aucun résultat GPT - vérifiez votre configuration Azure")

    print(f"\n📊 SCORE DE CONFIANCE: {result.confidence_score:.2%}")

    # Compare the two outputs line by line
    if result.final_corrected_text != result.ner_corrected_text:
        print("\n🔄 COMPARAISON NER vs GPT:")
        print("-" * 50)
        print("📈 Améliorations apportées par GPT:")
        ner_lines = result.ner_corrected_text.split('\n')
        gpt_lines = result.final_corrected_text.split('\n')

        for i, (ner_line, gpt_line) in enumerate(zip(ner_lines, gpt_lines)):
            if ner_line.strip() != gpt_line.strip():
                print(f"   Ligne {i+1}:")
                print(f"      NER: {ner_line}")
                print(f"      GPT: {gpt_line}")

    print("\n" + "=" * 80)
    print("✅ TRAITEMENT TERMINÉ")
    if azure_client:
        print("🎉 Les 2 étapes ont été appliquées avec succès")
    else:
        print("⚠️ Seule l'étape NER a pu être appliquée - configurez Azure OpenAI pour le formatage complet")
    print("=" * 80)


if __name__ == "__main__":
    print("✅ correcteur.py loaded main")
    main()