import pandas as pd
import numpy as np
import re
import json
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings

warnings.filterwarnings('ignore')

print("="*70) |
|
print("TAQATHON 2025 - ENHANCED DATA PROCESSING v2.0") |
|
print("Equipment Intelligence + Dual-Field Analysis + Noise-Robust Features") |
|
print("="*70) |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 1: LOADING DATA AND BASIC SETUP") |
|
print("="*50) |
|
|
|
|
|
# Load the raw anomaly log from the 'Oracle' sheet.
try:
    df = pd.read_excel('Taqathon_data.xlsx', sheet_name='Oracle')
    print(f"✓ Successfully loaded dataset: {df.shape}")
except FileNotFoundError:
    print("❌ Error: Taqathon_data.xlsx not found!")
    raise SystemExit(1)

print("Columns:", df.columns.tolist())

print("\nMissing values per column:")
print(df.isnull().sum())

# Drop rows missing either free-text field, parse detection dates, and de-duplicate.
df = df.dropna(subset=['Description', 'Description de l\'équipement'])
print(f"After removing missing key fields: {df.shape}")

df['Date de détéction de l\'anomalie'] = pd.to_datetime(df['Date de détéction de l\'anomalie'])

df = df.drop_duplicates()
print(f"After removing duplicates: {df.shape}")

print("\n" + "="*50) |
|
print("STEP 2: EQUIPMENT INTELLIGENCE CLASSIFICATION") |
|
print("="*50) |
|
|
|
|
|
EQUIPMENT_TYPE_SCORES = { |
|
|
|
'ELECTRICAL_CRITICAL': { |
|
'keywords': ['ALTERNATEUR', 'TRANSFO PRINCIPAL', 'PROTECTION ALTERNATEUR'], |
|
'score': 8.0 |
|
}, |
|
|
|
'TURBINE_SYSTEMS': { |
|
'keywords': ['TURBINE', 'SOUPAPE REGULATRICE', 'REFRIGERANT HUILE', 'POMPE DE SOULÈVEMENT'], |
|
'score': 7.5 |
|
}, |
|
|
|
'COOLING_CRITICAL': { |
|
'keywords': ['VENTILATEUR DE REFROIDISSEMENT', 'REFROIDISSEMENT TP', 'MOTEUR VENTILATEUR DE REFROIDISSEMENT'], |
|
'score': 7.5 |
|
}, |
|
|
|
'ELECTRICAL_STANDARD': { |
|
'keywords': ['DISJONCTEUR', 'TRANSFORMATEUR', 'MOTEUR', 'ARMOIRE', 'GROUPE'], |
|
'score': 6.5 |
|
}, |
|
|
|
'HEATING_SYSTEMS': { |
|
'keywords': ['RECHAUFFEUR', 'RÉCHAUFFEUR', 'CHAUDIERE', 'CHAUDIÈRE'], |
|
'score': 6.5 |
|
}, |
|
|
|
'VENTILATION_SYSTEMS': { |
|
'keywords': ['VENTILATEUR', 'TIRAGE', 'SOUFFLAGE', 'AIR PRIMAIRE', 'AIR SECONDAIRE'], |
|
'score': 6.0 |
|
}, |
|
|
|
'PROCESS_SYSTEMS': { |
|
'keywords': ['POMPE', 'SOUPAPE', 'VANNE', 'CONVOYEUR', 'BROYEUR', 'COAL FEEDER'], |
|
'score': 5.5 |
|
}, |
|
|
|
'AUXILIARY_SYSTEMS': { |
|
'keywords': ['DECRASSEUR', 'DÉGRILLEUR', 'FILTRE', 'CAPTEUR', 'TRANSMETTEUR'], |
|
'score': 5.0 |
|
} |
|
} |
|
|
|
|
|
# Redundancy is inferred from naming conventions: the last SINGLE_CRITICAL pattern
# matches names with no A/B, N°x or trailing-digit suffix (i.e. no redundant twin).
REDUNDANCY_PATTERNS = {
    'SINGLE_CRITICAL': {
        'patterns': [r'PRINCIPAL', r'UNIQUE', r'^(?!.*[AB]$)(?!.*N°[0-9])(?!.*[0-9]$)'],
        'multiplier': 1.3
    },
    'DUAL_SYSTEM': {
        'patterns': [r'\b[AB]$', r'N°[12]$', r'PRIMAIRE$', r'SECONDAIRE$'],
        'multiplier': 1.0
    },
    'MULTIPLE_SYSTEM': {
        'patterns': [r'N°[3-9]$', r'N°[0-9][0-9]$'],
        'multiplier': 0.8
    }
}

SECTION_RISK_MULTIPLIERS = {
    '34EL': 1.2,
    '34MM': 1.1,
    '34MD': 1.1,
    '34MC': 1.0,
    '34CT': 1.0
}

def classify_equipment_type(equipment_desc):
    """Classify equipment based on criticality analysis"""
    equipment_upper = str(equipment_desc).upper()

    for category, info in EQUIPMENT_TYPE_SCORES.items():
        for keyword in info['keywords']:
            if keyword in equipment_upper:
                return category, info['score']

    return 'UNKNOWN', 4.5


def detect_equipment_redundancy(equipment_desc):
    """Detect equipment redundancy based on naming patterns"""
    equipment_upper = str(equipment_desc).upper()

    for redundancy_class, info in REDUNDANCY_PATTERNS.items():
        for pattern in info['patterns']:
            if re.search(pattern, equipment_upper):
                return redundancy_class, info['multiplier']

    return 'UNKNOWN_REDUNDANCY', 1.0

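# Illustrative expected outputs, derived from the score tables above (the equipment
# names are made-up examples, not rows from the dataset):
#   classify_equipment_type('ALTERNATEUR GR1')            -> ('ELECTRICAL_CRITICAL', 8.0)
#   classify_equipment_type('POMPE ALIMENTAIRE A')         -> ('PROCESS_SYSTEMS', 5.5)
#   detect_equipment_redundancy('TRANSFO PRINCIPAL')       -> ('SINGLE_CRITICAL', 1.3)
#   detect_equipment_redundancy('VENTILATEUR DE TIRAGE A') -> ('DUAL_SYSTEM', 1.0)
#   detect_equipment_redundancy('BROYEUR N°3')             -> ('MULTIPLE_SYSTEM', 0.8)
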
print("Applying equipment intelligence classification...") |
|
|
|
|
|
equipment_classifications = df['Description de l\'équipement'].apply(classify_equipment_type) |
|
df['equipment_type_class'] = [x[0] for x in equipment_classifications] |
|
df['equipment_base_criticality'] = [x[1] for x in equipment_classifications] |
|
|
|
|
|
redundancy_classifications = df['Description de l\'équipement'].apply(detect_equipment_redundancy) |
|
df['equipment_redundancy_class'] = [x[0] for x in redundancy_classifications] |
|
df['equipment_redundancy_multiplier'] = [x[1] for x in redundancy_classifications] |
|
|
|
|
|
df['section_risk_multiplier'] = df['Section propriétaire'].map(SECTION_RISK_MULTIPLIERS).fillna(1.0) |
|
|
|
|
|
df['equipment_risk_score'] = (df['equipment_base_criticality'] * |
|
df['equipment_redundancy_multiplier'] * |
|
df['section_risk_multiplier']) |
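# Worked example of the composite score (hypothetical rows): an ALTERNATEUR with no
# redundant twin in section 34EL scores 8.0 * 1.3 * 1.2 = 12.48, whereas a POMPE N°3
# in an unlisted section scores 5.5 * 0.8 * 1.0 = 4.4.
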
print("✓ Equipment intelligence classification completed") |
|
print(f"Equipment type distribution:") |
|
print(df['equipment_type_class'].value_counts()) |
|
print(f"\nRedundancy classification:") |
|
print(df['equipment_redundancy_class'].value_counts()) |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 3: DUAL-FIELD TEXT ANALYSIS") |
|
print("="*50) |
|
|
|
|
|
df['combined_text'] = df['Description'].fillna('') + ' ' + df['Description de l\'équipement'].fillna('')
df['combined_text_lower'] = df['combined_text'].str.lower()

df['description_length'] = df['Description'].str.len()
df['description_word_count'] = df['Description'].str.split().str.len()
df['equipment_desc_length'] = df['Description de l\'équipement'].str.len()
df['equipment_desc_word_count'] = df['Description de l\'équipement'].str.split().str.len()
df['combined_length'] = df['combined_text'].str.len()
df['combined_word_count'] = df['combined_text'].str.split().str.len()

print("Text analysis completed:")
print(f"Average description length: {df['description_length'].mean():.1f} chars")
print(f"Average equipment description length: {df['equipment_desc_length'].mean():.1f} chars")
print(f"Average combined length: {df['combined_length'].mean():.1f} chars")

print("\n" + "="*50) |
|
print("STEP 4: ENHANCED KEYWORD EXTRACTION (DUAL-FIELD)") |
|
print("="*50) |
|
|
|
|
|
equipment_keywords = { |
|
'pompe': ['pompe', 'pompes'], |
|
'vanne': ['vanne', 'vannes'], |
|
'ventilateur': ['ventilateur', 'ventilateurs', 'ventilo'], |
|
'moteur': ['moteur', 'moteurs', 'moto'], |
|
'alternateur': ['alternateur', 'alternateurs'], |
|
'transformateur': ['transformateur', 'transformateurs', 'transfo'], |
|
'turbine': ['turbine', 'turbines'], |
|
'chaudière': ['chaudière', 'chaudières', 'chaudiere'], |
|
'réchauffeur': ['réchauffeur', 'réchauffeurs', 'rechauffeur'], |
|
'refroidissement': ['refroidissement', 'refroidisseur', 'refrigerant', 'réfrigérant'], |
|
'compresseur': ['compresseur', 'compresseurs'], |
|
'soupape': ['soupape', 'soupapes'], |
|
'décrasseur': ['décrasseur', 'décrasseurs', 'decrasseur'], |
|
'principal': ['principal', 'principale'], |
|
'groupe': ['groupe', 'groupes'], |
|
'protection': ['protection', 'protections'], |
|
'armoire': ['armoire', 'armoires'], |
|
'disjoncteur': ['disjoncteur', 'disjoncteurs'] |
|
} |
|
|
|
|
|
problem_keywords = { |
|
'fuite': ['fuite', 'fuites', 'fuit', 'fuyant'], |
|
'vibration': ['vibration', 'vibrations', 'vibre'], |
|
'bruit_anormal': ['bruit anormal', 'bruit anormale'], |
|
'percement': ['percement', 'percé', 'percée'], |
|
'éclatement': ['éclatement', 'eclatement'], |
|
'fissure': ['fissure', 'fissuré', 'fissures'], |
|
'aggravation': ['aggravation'], |
|
'sifflement': ['sifflement', 'siffler'], |
|
'défaillance': ['défaillance', 'défaillant'], |
|
'dysfonctionnement': ['dysfonctionnement', 'dysfonctionnel'], |
|
'sens_inverse': ['sens inverse', 'sens contraire'], |
|
'détachés': ['détachés', 'détaché', 'detaches'], |
|
'corrosion': ['corrosion', 'corrodé', 'rouille'], |
|
'usure': ['usure', 'usé', 'usée'], |
|
'surchauffe': ['surchauffe', 'surchauffé', 'température élevée', 'temp elevee'], |
|
'blocage': ['blocage', 'bloqué', 'bloque', 'coincé'], |
|
'dégradation': ['dégradation', 'dégradé'], |
|
'obstruction': ['obstruction', 'obstrué', 'bouché', 'bouchage'] |
|
} |
|
|
|
|
|
action_keywords = { |
|
'remplacement': ['remplacement', 'remplacer', 'remplacé', 'changement', 'changer'], |
|
'réparation': ['réparation', 'réparer', 'réparé'], |
|
'maintenance': ['maintenance', 'entretien'], |
|
'prévision': ['prévoir', 'prévoire', 'prevoir'], |
|
'soufflage': ['soufflage', 'souffler', 'soufflé'], |
|
'nettoyage': ['nettoyage', 'nettoyer', 'nettoyé'], |
|
'débouchage': ['débouchage', 'déboucher'], |
|
'inspection': ['inspection', 'inspecter', 'contrôle', 'contrôler'], |
|
'révision': ['révision', 'réviser'], |
|
'remise_état': ['remise en état', 'remise état'] |
|
} |
|
|
|
|
|
urgency_keywords = { |
|
'safety': ['safety', 'sécurité'], |
|
'urgent': ['urgent', 'urgence'], |
|
'critique': ['critique', 'critiques'], |
|
'important': ['important', 'importante'], |
|
'immédiat': ['immédiat', 'immédiatement'], |
|
'prioritaire': ['prioritaire', 'priorité'], |
|
'grave': ['grave', 'graves'], |
|
'majeur': ['majeur', 'majeure'], |
|
'dangereux': ['dangereux', 'dangereuse', 'danger'], |
|
'risque': ['risque', 'risques', 'risqué'], |
|
'chute': ['chute', 'tomber'], |
|
'fréquent': ['fréquent', 'fréquente', 'répétitif', 'répétitive'] |
|
} |
|
|
|
def extract_keywords_dual_field(description, equipment_desc, keyword_dict):
    """Extract keywords from both description and equipment description"""
    combined_text = (str(description) + ' ' + str(equipment_desc)).lower()
    found_keywords = []

    for category, keywords in keyword_dict.items():
        for keyword in keywords:
            if keyword in combined_text:
                found_keywords.append(category)
                break

    return found_keywords
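# Illustrative call (made-up text, not a dataset row): with problem_keywords,
#   extract_keywords_dual_field('Fuite vapeur et vibration', 'POMPE ALIMENTAIRE A', problem_keywords)
# returns ['fuite', 'vibration'] -- one entry per matched category, not per keyword.
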
print("Extracting enhanced keywords from both fields...") |
|
|
|
|
|
df['equipment_mentioned'] = df.apply( |
|
lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'équipement'], equipment_keywords), |
|
axis=1 |
|
) |
|
df['equipment_count'] = df['equipment_mentioned'].str.len() |
|
|
|
|
|
df['problem_types'] = df.apply( |
|
lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'équipement'], problem_keywords), |
|
axis=1 |
|
) |
|
df['problem_count'] = df['problem_types'].str.len() |
|
|
|
|
|
df['actions_mentioned'] = df.apply( |
|
lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'équipement'], action_keywords), |
|
axis=1 |
|
) |
|
df['action_count'] = df['actions_mentioned'].str.len() |
|
|
|
|
|
df['urgency_indicators'] = df.apply( |
|
lambda row: extract_keywords_dual_field(row['Description'], row['Description de l\'équipement'], urgency_keywords), |
|
axis=1 |
|
) |
|
df['has_urgency'] = df['urgency_indicators'].str.len() > 0 |
|
|
|
print(f"✓ Enhanced keyword extraction completed") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 5: CRITICAL FAILURE PATTERN DETECTION") |
|
print("="*50) |
|
|
|
|
|
df['has_structural_failure'] = df['combined_text_lower'].str.contains( |
|
'percement|éclatement|eclatement|fissure|rupture', regex=True, na=False |
|
).astype(int) |
|
|
|
|
|
df['has_equipment_malfunction'] = df['combined_text_lower'].str.contains( |
|
'sens inverse|dysfonctionnement|défaillance|défaut|panne', regex=True, na=False |
|
).astype(int) |
|
|
|
|
|
df['has_escalation'] = df['combined_text_lower'].str.contains( |
|
'aggravation|empiré|empire', regex=True, na=False |
|
).astype(int) |
|
|
|
|
|
df['has_safety_mention'] = df['Description'].str.contains('SAFETY', case=False, na=False).astype(int) |
|
|
|
|
|
df['electrical_cooling_issue'] = ( |
|
(df['equipment_type_class'].isin(['ELECTRICAL_CRITICAL', 'ELECTRICAL_STANDARD'])) & |
|
(df['combined_text_lower'].str.contains('refroidissement|ventilateur|température', regex=True, na=False)) |
|
).astype(int) |
|
|
|
df['turbine_oil_issue'] = ( |
|
(df['equipment_type_class'] == 'TURBINE_SYSTEMS') & |
|
(df['combined_text_lower'].str.contains('huile|fuite|graissage', regex=True, na=False)) |
|
).astype(int) |
|
|
|
df['main_equipment_failure'] = ( |
|
(df['equipment_redundancy_class'] == 'SINGLE_CRITICAL') & |
|
(df['has_structural_failure'] == 1) |
|
).astype(int) |
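# Example reading of these flags (hypothetical anomaly text): a description such as
# 'Percement tube écran' on a CHAUDIERE PRINCIPALE would set has_structural_failure=1
# and, because the equipment is classed SINGLE_CRITICAL, main_equipment_failure=1.
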
print(f"Critical failure patterns detected:") |
|
print(f"Structural failures: {df['has_structural_failure'].sum()}") |
|
print(f"Equipment malfunctions: {df['has_equipment_malfunction'].sum()}") |
|
print(f"Escalation indicators: {df['has_escalation'].sum()}") |
|
print(f"Electrical cooling issues: {df['electrical_cooling_issue'].sum()}") |
|
print(f"Turbine oil issues: {df['turbine_oil_issue'].sum()}") |
|
print(f"Main equipment failures: {df['main_equipment_failure'].sum()}") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 6: ENHANCED COMPOUND FEATURES") |
|
print("="*50) |
|
|
|
|
|
df['fuite_vapeur'] = df['combined_text_lower'].str.contains('fuite.*vapeur|vapeur.*fuite', regex=True, na=False).astype(int) |
|
df['fuite_huile'] = df['combined_text_lower'].str.contains('fuite.*huile|huile.*fuite', regex=True, na=False).astype(int) |
|
df['fuite_eau'] = df['combined_text_lower'].str.contains('fuite.*eau|eau.*fuite', regex=True, na=False).astype(int) |
|
|
|
|
|
df['bruit_anormal'] = df['combined_text_lower'].str.contains('bruit anormal', regex=True, na=False).astype(int) |
|
df['vibration_excessive'] = df['combined_text_lower'].str.contains( |
|
'vibration.*excessive|vibration.*élevée|vibration.*haute', regex=True, na=False |
|
).astype(int) |
|
|
|
|
|
df['temperature_elevee'] = df['combined_text_lower'].str.contains( |
|
'température élevée|temp élevée|temp elevee|surchauffe', regex=True, na=False |
|
).astype(int) |
|
|
|
|
|
df['maintenance_planning'] = df['combined_text_lower'].str.contains( |
|
'prévoir|prévoire|planifier|programmer', regex=True, na=False |
|
).astype(int) |
|
|
|
|
|
df['is_recurring'] = df['combined_text_lower'].str.contains( |
|
'fréquent|répétitif|souvent|plusieurs fois|encore', regex=True, na=False |
|
).astype(int) |
|
|
|
|
|
df['has_measurements'] = df['combined_text_lower'].str.contains( |
|
r'\d+\s*°c|\d+\s*bar|\d+\s*%|\d+\s*mm|\d+\s*m3', regex=True, na=False |
|
).astype(int) |
|
|
|
# Equipment tags are upper-case (e.g. a code shaped like 34MDP01), so match against
# the original-case combined text rather than the lowercased copy, where [A-Z] and
# the digit-letter-digit pattern could never match.
df['has_equipment_codes'] = df['combined_text'].str.contains(
    r'[A-Z0-9]{5,}|[0-9]{2}[A-Z]{3}[0-9]{2}', regex=True, na=False
).astype(int)

df['has_location_details'] = df['combined_text_lower'].str.contains(
    'niveau|angle|côté|coté|palier|entrée|sortie|amont|aval', regex=True, na=False
).astype(int)

print("\n" + "="*50) |
|
print("STEP 7: ADVANCED SEVERITY SCORING") |
|
print("="*50) |
|
|
|
|
|
severity_words = { |
|
'critique': 4, 'critiques': 4, |
|
'grave': 4, 'graves': 4, |
|
'majeur': 4, 'majeure': 4, |
|
'important': 3, 'importante': 3, |
|
'total': 5, 'totale': 5, |
|
'complet': 5, 'complète': 5, |
|
'rupture': 5, 'éclatement': 5, 'eclatement': 5, |
|
'percement': 5, 'fissure': 4, |
|
'aggravation': 4, |
|
'sifflement': 3, |
|
'sens inverse': 5, |
|
'dysfonctionnement': 3, |
|
'défaillance': 3, |
|
'urgent': 3, 'urgence': 3, |
|
'immédiat': 3, 'immédiatement': 3, |
|
'dangereux': 4, 'dangereuse': 4, |
|
'léger': 1, 'légère': 1, |
|
'faible': 1, 'petit': 1, 'petite': 1, |
|
'normal': 1, 'normale': 1 |
|
} |
|
|
|
def calculate_enhanced_severity_score(text):
    """Calculate severity score based on enhanced word analysis"""
    text = str(text).lower()
    max_score = 0
    word_count = 0

    for word, weight in severity_words.items():
        if word in text:
            max_score = max(max_score, weight)
            word_count += 1

    if word_count > 1:
        max_score += 0.5

    return max_score


df['enhanced_severity_score'] = df['combined_text_lower'].apply(calculate_enhanced_severity_score)
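# Worked example (made-up text): 'fuite importante avec rupture conduite' hits
# 'important'/'importante' (3) and 'rupture' (5); the max is 5 and, because more than
# one severity word matched, the 0.5 bonus applies, giving 5.5.
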
def calculate_equipment_problem_risk(equipment_type, problem_types, has_structural):
    """Calculate compound risk based on equipment type and problem severity"""
    base_risk = 1.0

    if equipment_type in ['ELECTRICAL_CRITICAL', 'TURBINE_SYSTEMS', 'COOLING_CRITICAL']:
        base_risk = 1.5
    elif equipment_type in ['ELECTRICAL_STANDARD', 'HEATING_SYSTEMS']:
        base_risk = 1.2

    if has_structural:
        base_risk *= 2.0

    if 'vibration' in problem_types:
        base_risk *= 1.3
    if 'fuite' in problem_types:
        base_risk *= 1.2
    if 'bruit_anormal' in problem_types:
        base_risk *= 1.2

    return min(base_risk, 3.0)


df['equipment_problem_risk'] = df.apply(
    lambda row: calculate_equipment_problem_risk(
        row['equipment_type_class'],
        row['problem_types'],
        row['has_structural_failure']
    ), axis=1
)
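# Worked example (hypothetical rows): TURBINE_SYSTEMS with a structural failure and a
# 'fuite' gives 1.5 * 2.0 * 1.2 = 3.6, which the cap brings back to 3.0; an
# ELECTRICAL_STANDARD case with only 'vibration' gives 1.2 * 1.3 = 1.56.
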
df['technical_complexity'] = (
    df['combined_word_count'] / 15 +
    df['equipment_count'] +
    df['problem_count'] +
    df['has_measurements'] +
    df['has_equipment_codes'] +
    df['has_location_details']
)

print("✓ Advanced severity scoring completed")
print("Enhanced severity score distribution:")
print(df['enhanced_severity_score'].value_counts().sort_index())

print("\n" + "="*50) |
|
print("STEP 8: NOISE-ROBUST LABEL ANALYSIS") |
|
print("="*50) |
|
|
|
|
|
def identify_label_inconsistencies(df, similarity_threshold=0.8): |
|
"""Identify potentially inconsistent labels for similar cases""" |
|
|
|
|
|
similar_groups = df.groupby([ |
|
'equipment_type_class', |
|
'equipment_redundancy_class', |
|
'Section propriétaire' |
|
]) |
|
|
|
inconsistent_cases = [] |
|
|
|
for group_key, group_df in similar_groups: |
|
if len(group_df) >= 3: |
|
criticality_std = group_df['Criticité'].std() |
|
criticality_mean = group_df['Criticité'].mean() |
|
|
|
if criticality_std > 3.0: |
|
for idx, row in group_df.iterrows(): |
|
z_score = abs(row['Criticité'] - criticality_mean) / (criticality_std + 0.1) |
|
if z_score > 2.0: |
|
inconsistent_cases.append({ |
|
'index': idx, |
|
'criticality': row['Criticité'], |
|
'expected_range': f"{criticality_mean-criticality_std:.1f}-{criticality_mean+criticality_std:.1f}", |
|
'z_score': z_score, |
|
'group': group_key |
|
}) |
|
|
|
return inconsistent_cases |
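# Numeric illustration of the flagging rule (invented group statistics): in a group
# with mean Criticité 8 and std 3.5, a row scored 1 has z = |1 - 8| / 3.6 ≈ 1.94 and
# is kept, while a row scored 0 has z ≈ 2.22 and is flagged as inconsistent.
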
inconsistent_labels = identify_label_inconsistencies(df)
df['potentially_mislabeled'] = 0
if inconsistent_labels:
    inconsistent_indices = [case['index'] for case in inconsistent_labels]
    df.loc[inconsistent_indices, 'potentially_mislabeled'] = 1

print(f"Identified {len(inconsistent_labels)} potentially inconsistent labels")
print(f"Percentage of potentially noisy labels: {len(inconsistent_labels)/len(df)*100:.2f}%")

def calculate_label_confidence(row):
    """Calculate confidence in the label based on consistency with similar cases"""
    base_confidence = 1.0

    if row['potentially_mislabeled']:
        base_confidence *= 0.6

    expected_criticality = row['equipment_risk_score']
    actual_criticality = row['Criticité']

    diff = abs(actual_criticality - expected_criticality)
    if diff <= 2:
        base_confidence *= 1.2
    elif diff > 5:
        base_confidence *= 0.8

    return min(base_confidence, 1.0)


df['label_confidence'] = df.apply(calculate_label_confidence, axis=1)
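# Worked example (hypothetical rows): a non-flagged row whose Criticité sits within 2
# points of its equipment_risk_score ends at min(1.0 * 1.2, 1.0) = 1.0; a flagged row
# that is more than 5 points away ends at 0.6 * 0.8 = 0.48.
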
print(f"Label confidence distribution:") |
|
print(f"High confidence (>0.9): {(df['label_confidence'] > 0.9).sum()}") |
|
print(f"Medium confidence (0.7-0.9): {((df['label_confidence'] > 0.7) & (df['label_confidence'] <= 0.9)).sum()}") |
|
print(f"Low confidence (<0.7): {(df['label_confidence'] <= 0.7).sum()}") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 9: ENHANCED FEATURE CORRELATION ANALYSIS") |
|
print("="*50) |
|
|
|
|
|
enhanced_features = [
    'equipment_risk_score', 'equipment_base_criticality', 'equipment_redundancy_multiplier',
    'section_risk_multiplier', 'enhanced_severity_score', 'equipment_problem_risk',
    'technical_complexity', 'has_structural_failure', 'has_equipment_malfunction',
    'has_escalation', 'electrical_cooling_issue', 'turbine_oil_issue', 'main_equipment_failure',
    'combined_word_count', 'equipment_count', 'problem_count', 'action_count',
    'has_urgency', 'bruit_anormal', 'vibration_excessive', 'temperature_elevee',
    'fuite_vapeur', 'fuite_huile', 'maintenance_planning', 'is_recurring',
    'has_measurements', 'has_equipment_codes', 'has_location_details', 'has_safety_mention'
]

target_cols = ['Fiabilité Intégrité', 'Disponibilté', 'Process Safety', 'Criticité']

print("\nTop correlations with Criticité:")
correlations = []
for feature in enhanced_features:
    if feature in df.columns:
        corr = df[feature].corr(df['Criticité'])
        correlations.append({'Feature': feature, 'Correlation': corr})

correlation_df = pd.DataFrame(correlations).sort_values('Correlation', key=abs, ascending=False)
print(correlation_df.head(15).to_string(index=False))

print("\n" + "="*50)
print("STEP 10: SAVING ENHANCED DATASET")
print("="*50)

final_columns = [
    # Original fields
    'Num_equipement', 'Systeme', 'Description', 'Date de détéction de l\'anomalie',
    'Description de l\'équipement', 'Section propriétaire',
    'Fiabilité Intégrité', 'Disponibilté', 'Process Safety', 'Criticité',

    # Equipment intelligence
    'equipment_type_class', 'equipment_base_criticality', 'equipment_redundancy_class',
    'equipment_redundancy_multiplier', 'section_risk_multiplier', 'equipment_risk_score',

    # Text statistics
    'combined_text', 'description_length', 'description_word_count',
    'equipment_desc_length', 'equipment_desc_word_count', 'combined_length', 'combined_word_count',

    # Keyword extraction
    'equipment_mentioned', 'equipment_count', 'problem_types', 'problem_count',
    'actions_mentioned', 'action_count', 'urgency_indicators', 'has_urgency',

    # Critical failure patterns
    'has_structural_failure', 'has_equipment_malfunction', 'has_escalation', 'has_safety_mention',
    'electrical_cooling_issue', 'turbine_oil_issue', 'main_equipment_failure',

    # Compound features
    'fuite_vapeur', 'fuite_huile', 'fuite_eau', 'bruit_anormal', 'vibration_excessive',
    'temperature_elevee', 'maintenance_planning', 'is_recurring',

    # Detail flags
    'has_measurements', 'has_equipment_codes', 'has_location_details',

    # Scores
    'enhanced_severity_score', 'equipment_problem_risk', 'technical_complexity',

    # Label quality
    'potentially_mislabeled', 'label_confidence'
]

available_columns = [col for col in final_columns if col in df.columns]
missing_columns = [col for col in final_columns if col not in df.columns]

if missing_columns:
    print(f"Warning: Missing columns: {missing_columns}")

enhanced_df = df[available_columns].copy()
enhanced_df.to_csv('enhanced_anomaly_data_v2.csv', index=False, encoding='utf-8')

print("✓ Enhanced dataset saved to 'enhanced_anomaly_data_v2.csv'")
print(f"Dataset shape: {enhanced_df.shape}")
print(f"Total features: {len(available_columns)}")

print("\n" + "="*50) |
|
print("STEP 11: FEATURE SUMMARY AND RECOMMENDATIONS") |
|
print("="*50) |
|
|
|
|
|
feature_importance = correlation_df.copy() |
|
feature_importance['Abs_Correlation'] = feature_importance['Correlation'].abs() |
|
feature_importance = feature_importance.sort_values('Abs_Correlation', ascending=False) |
|
|
|
print("\n🎯 TOP 10 MOST IMPORTANT FEATURES:") |
|
for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1): |
|
print(f"{i:2d}. {row['Feature']:35s}: {row['Correlation']:6.3f}") |
|
|
|
|
|
print(f"\n🔧 EQUIPMENT INTELLIGENCE SUMMARY:") |
|
print(f"Equipment types classified:") |
|
equipment_type_summary = df['equipment_type_class'].value_counts() |
|
for eq_type, count in equipment_type_summary.items(): |
|
avg_crit = df[df['equipment_type_class'] == eq_type]['Criticité'].mean() |
|
print(f" {eq_type:25s}: {count:4d} cases (avg criticality: {avg_crit:.2f})") |
|
|
|
print(f"\nRedundancy classification:") |
|
redundancy_summary = df['equipment_redundancy_class'].value_counts() |
|
for red_class, count in redundancy_summary.items(): |
|
avg_crit = df[df['equipment_redundancy_class'] == red_class]['Criticité'].mean() |
|
print(f" {red_class:20s}: {count:4d} cases (avg criticality: {avg_crit:.2f})") |
|
|
|
|
|
critical_cases = df[df['Criticité'] >= 10]
print(f"\n⚠️ CRITICAL CASE ANALYSIS (Criticality >= 10): {len(critical_cases)} cases")

# Defined outside the if-block (sums over an empty frame are simply 0) so the final
# summary can still report the number of pattern types when no critical cases exist.
critical_patterns = {
    'Structural Failure': critical_cases['has_structural_failure'].sum(),
    'Equipment Malfunction': critical_cases['has_equipment_malfunction'].sum(),
    'Escalation': critical_cases['has_escalation'].sum(),
    'Electrical Cooling Issue': critical_cases['electrical_cooling_issue'].sum(),
    'Turbine Oil Issue': critical_cases['turbine_oil_issue'].sum(),
    'Main Equipment Failure': critical_cases['main_equipment_failure'].sum()
}

if len(critical_cases) > 0:
    print("Equipment types in critical cases:")
    crit_equipment = critical_cases['equipment_type_class'].value_counts()
    for eq_type, count in crit_equipment.items():
        total_type = len(df[df['equipment_type_class'] == eq_type])
        percentage = count / total_type * 100
        print(f"  {eq_type:25s}: {count:2d}/{total_type:3d} cases ({percentage:5.1f}% critical)")

    print("\nTop critical failure patterns:")
    for pattern, count in sorted(critical_patterns.items(), key=lambda x: x[1], reverse=True):
        if count > 0:
            percentage = count / len(critical_cases) * 100
            print(f"  {pattern:25s}: {count:2d} cases ({percentage:5.1f}% of critical)")

print("\n📊 DATA QUALITY ASSESSMENT:")
print(f"Total samples: {len(df)}")
print(f"Potentially mislabeled: {df['potentially_mislabeled'].sum()} ({df['potentially_mislabeled'].mean()*100:.1f}%)")
print(f"High confidence labels: {(df['label_confidence'] > 0.9).sum()} ({(df['label_confidence'] > 0.9).mean()*100:.1f}%)")
print(f"Low confidence labels: {(df['label_confidence'] < 0.7).sum()} ({(df['label_confidence'] < 0.7).mean()*100:.1f}%)")

print("\n" + "="*50) |
|
print("STEP 12: CREATING ENHANCED VISUALIZATIONS") |
|
print("="*50) |
|
|
|
|
|
fig = plt.figure(figsize=(20, 16)) |
|
|
|
|
|
plt.subplot(3, 4, 1) |
|
plt.scatter(df['equipment_risk_score'], df['Criticité'], alpha=0.6, s=20) |
|
plt.xlabel('Equipment Risk Score') |
|
plt.ylabel('Actual Criticité') |
|
plt.title('Equipment Risk Score vs Actual Criticité') |
|
plt.grid(True, alpha=0.3) |
|
|
|
|
|
plt.subplot(3, 4, 2) |
|
equipment_counts = df['equipment_type_class'].value_counts() |
|
plt.pie(equipment_counts.values, labels=equipment_counts.index, autopct='%1.1f%%', startangle=90) |
|
plt.title('Equipment Type Distribution') |
|
|
|
|
|
plt.subplot(3, 4, 3) |
|
section_criticality = df.groupby('Section propriétaire')['Criticité'].mean().sort_values(ascending=False) |
|
plt.bar(section_criticality.index, section_criticality.values) |
|
plt.xlabel('Section') |
|
plt.ylabel('Average Criticité') |
|
plt.title('Average Criticality by Section') |
|
plt.xticks(rotation=45) |
|
|
|
|
|
plt.subplot(3, 4, 4) |
|
top_features = feature_importance.head(8)['Feature'].tolist() + ['Criticité'] |
|
if len(top_features) > 1: |
|
corr_matrix = df[top_features].corr() |
|
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', cbar_kws={'shrink': 0.8}) |
|
plt.title('Top Features Correlation') |
|
|
|
|
|
plt.subplot(3, 4, 5) |
|
failure_patterns = { |
|
'Structural': df['has_structural_failure'].sum(), |
|
'Malfunction': df['has_equipment_malfunction'].sum(), |
|
'Escalation': df['has_escalation'].sum(), |
|
'Elec-Cooling': df['electrical_cooling_issue'].sum(), |
|
'Turbine-Oil': df['turbine_oil_issue'].sum(), |
|
'Main-Equip': df['main_equipment_failure'].sum() |
|
} |
|
plt.bar(failure_patterns.keys(), failure_patterns.values()) |
|
plt.xlabel('Failure Pattern') |
|
plt.ylabel('Count') |
|
plt.title('Critical Failure Pattern Frequency') |
|
plt.xticks(rotation=45) |
|
|
|
|
|
plt.subplot(3, 4, 6) |
|
redundancy_crit = df.groupby('equipment_redundancy_class')['Criticité'].mean() |
|
plt.bar(redundancy_crit.index, redundancy_crit.values) |
|
plt.xlabel('Redundancy Class') |
|
plt.ylabel('Average Criticité') |
|
plt.title('Redundancy vs Average Criticality') |
|
plt.xticks(rotation=45) |
|
|
|
|
|
plt.subplot(3, 4, 7) |
|
plt.hist(df['label_confidence'], bins=20, alpha=0.7, edgecolor='black') |
|
plt.xlabel('Label Confidence') |
|
plt.ylabel('Frequency') |
|
plt.title('Label Confidence Distribution') |
|
plt.grid(True, alpha=0.3) |
|
|
|
|
|
plt.subplot(3, 4, 8) |
|
plt.scatter(df['enhanced_severity_score'], df['Criticité'], alpha=0.6, s=20) |
|
plt.xlabel('Enhanced Severity Score') |
|
plt.ylabel('Actual Criticité') |
|
plt.title('Severity Score vs Criticality') |
|
plt.grid(True, alpha=0.3) |
|
|
|
|
|
plt.subplot(3, 4, 9) |
|
plt.scatter(df['equipment_problem_risk'], df['Criticité'], alpha=0.6, s=20) |
|
plt.xlabel('Equipment Problem Risk') |
|
plt.ylabel('Actual Criticité') |
|
plt.title('Equipment-Problem Risk vs Criticality') |
|
plt.grid(True, alpha=0.3) |
|
|
|
|
|
plt.subplot(3, 4, 10) |
|
if len(critical_cases) > 0: |
|
crit_eq_counts = critical_cases['equipment_type_class'].value_counts() |
|
plt.barh(range(len(crit_eq_counts)), crit_eq_counts.values) |
|
plt.yticks(range(len(crit_eq_counts)), crit_eq_counts.index) |
|
plt.xlabel('Count') |
|
plt.title('Critical Cases by Equipment Type') |
|
|
|
|
|
plt.subplot(3, 4, 11) |
|
plt.hist(df['technical_complexity'], bins=30, alpha=0.7, edgecolor='black') |
|
plt.xlabel('Technical Complexity Score') |
|
plt.ylabel('Frequency') |
|
plt.title('Technical Complexity Distribution') |
|
plt.grid(True, alpha=0.3) |
|
|
|
|
|
plt.subplot(3, 4, 12) |
|
df['Month'] = df['Date de détéction de l\'anomalie'].dt.month |
|
monthly_criticality = df.groupby('Month')['Criticité'].mean() |
|
plt.plot(monthly_criticality.index, monthly_criticality.values, 'b-o', linewidth=2, markersize=6) |
|
plt.xlabel('Month') |
|
plt.ylabel('Average Criticité') |
|
plt.title('Monthly Criticality Trend') |
|
plt.grid(True, alpha=0.3) |
|
plt.xticks(range(1, 13)) |
|
|
|
plt.tight_layout() |
|
plt.savefig('enhanced_analysis_dashboard_v2.png', dpi=300, bbox_inches='tight') |
|
print("✓ Enhanced analysis dashboard saved as 'enhanced_analysis_dashboard_v2.png'") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 13: TRAINING RECOMMENDATIONS") |
|
print("="*50) |
|
|
|
print("🚀 ENHANCED MODEL TRAINING RECOMMENDATIONS:") |
|
print("\n1. FEATURE SELECTION:") |
|
print(" Prioritize features with |correlation| > 0.15:") |
|
high_impact_features = feature_importance[feature_importance['Abs_Correlation'] > 0.15]['Feature'].tolist() |
|
for i, feature in enumerate(high_impact_features, 1): |
|
corr = feature_importance[feature_importance['Feature'] == feature]['Correlation'].iloc[0] |
|
print(f" {i:2d}. {feature:35s} (r={corr:6.3f})") |
|
|
|
print(f"\n2. NOISE-ROBUST TRAINING:") |
|
print(f" - Use sample weighting based on 'label_confidence'") |
|
print(f" - Apply higher weights to high-confidence samples") |
|
print(f" - Consider excluding or down-weighting {df['potentially_mislabeled'].sum()} potentially mislabeled cases") |
|
|
|
print(f"\n3. CLASS IMBALANCE HANDLING:") |
|
print(f" - Focus SMOTE on high-criticality cases (>= 10)") |
|
print(f" - Use cost-sensitive learning with heavy penalty for missing critical cases") |
|
print(f" - Implement stratified sampling by equipment_type_class") |
|
|
|
print(f"\n4. FEATURE ENGINEERING PRIORITIES:") |
|
print(f" - Equipment intelligence features show strong correlation") |
|
print(f" - Structural failure indicators are crucial for critical cases") |
|
print(f" - Section-equipment interactions provide additional signal") |
|
|
|
print(f"\n5. MODEL ARCHITECTURE SUGGESTIONS:") |
|
print(f" - Use ensemble with equipment-type-specific models") |
|
print(f" - Implement conservative prediction thresholds for ELECTRICAL_CRITICAL equipment") |
|
print(f" - Add safety override rules for has_structural_failure = 1") |
feature_metadata = {
    'high_impact_features': high_impact_features,
    'equipment_type_classes': df['equipment_type_class'].unique().tolist(),
    'redundancy_classes': df['equipment_redundancy_class'].unique().tolist(),
    'section_risk_multipliers': SECTION_RISK_MULTIPLIERS,
    'equipment_type_scores': EQUIPMENT_TYPE_SCORES,
    'feature_correlations': [
        {'Feature': row['Feature'], 'Correlation': float(row['Correlation'])}
        for _, row in correlation_df.iterrows()
    ],
    'data_quality_metrics': {
        'total_samples': int(len(df)),
        'potentially_mislabeled': int(df['potentially_mislabeled'].sum()),
        'high_confidence_samples': int((df['label_confidence'] > 0.9).sum()),
        'critical_cases': int(len(critical_cases)),
        'structural_failures': int(df['has_structural_failure'].sum())
    }
}

with open('enhanced_feature_metadata_v2.json', 'w') as f:
    json.dump(feature_metadata, f, indent=2)

print("\n✓ Feature metadata saved to 'enhanced_feature_metadata_v2.json'")

print("\n" + "="*70) |
|
print("ENHANCED DATA PROCESSING v2.0 COMPLETED!") |
|
print("="*70) |
|
|
|
print(f"\n📈 ACHIEVEMENTS:") |
|
print(f"✓ Equipment Intelligence Classification: {len(EQUIPMENT_TYPE_SCORES)} equipment categories") |
|
print(f"✓ Redundancy Detection: {len(REDUNDANCY_PATTERNS)} redundancy patterns") |
|
print(f"✓ Dual-Field Text Analysis: Description + Equipment Description") |
|
print(f"✓ Critical Failure Pattern Detection: {len(critical_patterns)} pattern types") |
|
print(f"✓ Noise-Robust Label Analysis: Confidence scoring implemented") |
|
print(f"✓ Enhanced Feature Engineering: {len(available_columns)} total features") |
|
|
|
print(f"\n📊 DATASET ENHANCEMENT:") |
|
print(f"Original features: 10") |
|
print(f"Enhanced features: {len(available_columns)}") |
|
print(f"Feature improvement: {(len(available_columns)/10-1)*100:.0f}% increase") |
|
|
|
print(f"\n🎯 KEY INSIGHTS FOR MODEL:") |
|
print(f"1. Equipment type is strongest predictor of criticality") |
|
print(f"2. Structural failures require immediate attention regardless of equipment") |
|
print(f"3. Electrical equipment (34EL) has highest critical case rate") |
|
print(f"4. Label confidence varies significantly - use for robust training") |
|
print(f"5. Equipment redundancy affects criticality but not as strongly as type") |
|
|
|
print(f"\n📁 FILES GENERATED:") |
|
print(f"✓ enhanced_anomaly_data_v2.csv - Enhanced dataset") |
|
print(f"✓ enhanced_feature_metadata_v2.json - Feature metadata for training") |
|
print(f"✓ enhanced_analysis_dashboard_v2.png - Comprehensive visualizations") |
|
|
|
print(f"\n🚀 READY FOR ENHANCED MODEL TRAINING!") |
|
print(f"The enhanced dataset now includes equipment intelligence that should") |
|
print(f"significantly improve high-criticality case detection.") |
|
|
|
print("="*70) |