|
|
|
|
|
|
|
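"""Exploratory analysis of equipment naming patterns and anomaly criticality.

Loads the 'Oracle' sheet of Taqathon_data.xlsx and walks through ten steps:
equipment frequency, redundancy-pattern detection, equipment-family grouping,
section/equipment risk, keyword analysis, and export of the results to
CSV and JSON files.
"""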
import pandas as pd |
|
import numpy as np |
|
import re

import json

from collections import Counter, defaultdict
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
print("="*60) |
|
print("EQUIPMENT PATTERN ANALYSIS FOR CRITICALITY UNDERSTANDING") |
|
print("="*60) |
|
|
|
|
|
try: |
|
df = pd.read_excel('Taqathon_data.xlsx', sheet_name='Oracle') |
|
print(f"✓ Loaded full dataset: {df.shape}") |
|
except FileNotFoundError: |
|
print("❌ Error: Taqathon_data.xlsx not found!") |
|
print("Please ensure the file is in the current directory.") |
|
    raise SystemExit(1)
|
|
|
print(f"Columns available: {df.columns.tolist()}") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 1: EQUIPMENT OVERVIEW") |
|
print("="*50) |
|
|
|
|
|
print("\nMissing values check:") |
|
print(f"Description: {df['Description'].isnull().sum()}") |
|
equip_col = "Description de l'équipement"
print(f"{equip_col}: {df[equip_col].isnull().sum()}")
|
print(f"Criticité: {df['Criticité'].isnull().sum()}") |
|
|
|
|
|
df_clean = df.dropna(subset=['Description', 'Description de l\'équipement', 'Criticité']) |
|
print(f"\nClean dataset shape: {df_clean.shape}") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 2: EQUIPMENT TYPE FREQUENCY ANALYSIS") |
|
print("="*50) |
|
|
|
|
|
equipment_types = df_clean['Description de l\'équipement'].value_counts() |
|
print(f"\nTotal unique equipment types: {len(equipment_types)}") |
|
|
|
print(f"\nTop 20 most frequent equipment types:") |
|
for equipment, count in equipment_types.head(20).items(): |
|
avg_criticality = df_clean[df_clean['Description de l\'équipement'] == equipment]['Criticité'].mean() |
|
print(f" {equipment}: {count} cases (avg criticality: {avg_criticality:.2f})") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 3: REDUNDANCY PATTERN DETECTION") |
|
print("="*50) |
|
|
|
|
|
def analyze_redundancy_patterns(equipment_name): |
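    """Return boolean flags describing redundancy hints in an equipment name.

    Flags capture standalone A/B designators, trailing unit numbers
    (optionally prefixed with 'N°'), the French qualifiers PRINCIPAL,
    PRIMAIRE, SECONDAIRE, AUXILIAIRE and UNIQUE, and names containing
    more than one number.
    """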
|
patterns = { |
|
'has_ab_suffix': bool(re.search(r'\b[AB]$|\b[AB]\b', equipment_name, re.IGNORECASE)), |
|
'has_number_suffix': bool(re.search(r'\b[N°]*\s*[0-9]+$|\b[0-9]+$', equipment_name)), |
|
'has_principal': 'PRINCIPAL' in equipment_name.upper(), |
|
'has_primaire': 'PRIMAIRE' in equipment_name.upper(), |
|
'has_secondaire': 'SECONDAIRE' in equipment_name.upper(), |
|
'has_auxiliaire': 'AUXILIAIRE' in equipment_name.upper(), |
|
'has_unique': 'UNIQUE' in equipment_name.upper(), |
|
'multiple_numbers': len(re.findall(r'\d+', equipment_name)) > 1 |
|
} |
|
return patterns |
|
|
|
|
|
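# Build one summary row per unique equipment type: case count, criticality
# statistics, and the redundancy-pattern flags detected above.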
equipment_analysis = [] |
|
for equipment in df_clean['Description de l\'équipement'].unique(): |
|
patterns = analyze_redundancy_patterns(equipment) |
|
equipment_data = df_clean[df_clean['Description de l\'équipement'] == equipment] |
|
|
|
analysis = { |
|
'equipment': equipment, |
|
'count': len(equipment_data), |
|
'avg_criticality': equipment_data['Criticité'].mean(), |
|
'max_criticality': equipment_data['Criticité'].max(), |
|
'min_criticality': equipment_data['Criticité'].min(), |
|
'std_criticality': equipment_data['Criticité'].std(), |
|
**patterns |
|
} |
|
equipment_analysis.append(analysis) |
|
|
|
equipment_df = pd.DataFrame(equipment_analysis) |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 4: EQUIPMENT REDUNDANCY CLASSIFICATION") |
|
print("="*50) |
|
|
|
|
|
def classify_redundancy(row): |
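    """Map one equipment type's pattern flags to a coarse redundancy class.

    PRINCIPAL/UNIQUE -> SINGLE_CRITICAL, PRIMAIRE/SECONDAIRE or an A/B suffix
    -> DUAL_SYSTEM, a numeric suffix -> MULTIPLE_SYSTEM, AUXILIAIRE -> AUXILIARY.
    """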
|
if row['has_principal'] or row['has_unique']: |
|
return 'SINGLE_CRITICAL' |
|
elif row['has_primaire'] or row['has_secondaire']: |
|
return 'DUAL_SYSTEM' |
|
elif row['has_ab_suffix']: |
|
return 'DUAL_SYSTEM' |
|
elif row['has_number_suffix']: |
|
return 'MULTIPLE_SYSTEM' |
|
elif row['has_auxiliaire']: |
|
return 'AUXILIARY' |
|
else: |
|
return 'UNKNOWN' |
|
|
|
equipment_df['redundancy_class'] = equipment_df.apply(classify_redundancy, axis=1) |
|
|
|
|
|
print("\nEquipment distribution by redundancy classification:") |
|
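# Aggregate per redundancy class: total anomaly cases, the (unweighted) mean of
# the per-equipment average criticalities, and the number of equipment types.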
redundancy_stats = equipment_df.groupby('redundancy_class').agg({ |
|
'count': 'sum', |
|
'avg_criticality': 'mean', |
|
'equipment': 'count' |
|
}).round(3) |
|
|
|
for redundancy_class, stats in redundancy_stats.iterrows(): |
|
print(f"\n{redundancy_class}:") |
|
print(f" Number of equipment types: {stats['equipment']}") |
|
print(f" Total anomaly cases: {stats['count']}") |
|
print(f" Average criticality: {stats['avg_criticality']:.3f}") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 5: HIGH CRITICALITY EQUIPMENT IDENTIFICATION") |
|
print("="*50) |
|
|
|
|
|
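# Equipment types whose mean criticality is high, regardless of how many cases they have.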
high_criticality_equipment = equipment_df[equipment_df['avg_criticality'] >= 6.0].sort_values('avg_criticality', ascending=False) |
|
|
|
print(f"\nEquipment types with average criticality >= 6.0:") |
|
for _, row in high_criticality_equipment.iterrows(): |
|
print(f" {row['equipment']}: {row['avg_criticality']:.2f} (n={row['count']}, class={row['redundancy_class']})") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 6: EQUIPMENT NAMING PATTERN ANALYSIS") |
|
print("="*50) |
|
|
|
|
|
def extract_base_equipment_name(equipment_name): |
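    """Strip trailing A/B designators, unit numbers ('N° 1', '2', ...) and the
    qualifiers PRIMAIRE/SECONDAIRE/PRINCIPAL so that variants of the same
    equipment collapse to a common base name."""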
|
|
|
base_name = re.sub(r'\s*[AB]$|\s*[N°]*\s*[0-9]+$', '', equipment_name) |
|
base_name = re.sub(r'\s*PRIMAIRE$|\s*SECONDAIRE$|\s*PRINCIPAL$', '', base_name) |
|
base_name = base_name.strip() |
|
return base_name |
|
|
|
|
|
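# Group equipment variants by their base name; families with more than one
# variant point to redundant units (e.g. pump A / pump B).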
equipment_families = defaultdict(list) |
|
for equipment in df_clean['Description de l\'équipement'].unique(): |
|
base_name = extract_base_equipment_name(equipment) |
|
equipment_families[base_name].append(equipment) |
|
|
|
|
|
print("\nEquipment families with multiple variants (indicating redundancy):") |
|
redundant_families = {k: v for k, v in equipment_families.items() if len(v) > 1} |
|
|
|
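# Show up to 15 of the largest families, skipping any with more than 10
# variants to keep the console output readable.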
for family, variants in sorted(redundant_families.items(), key=lambda x: len(x[1]), reverse=True)[:15]: |
|
if len(variants) <= 10: |
|
print(f"\n{family} ({len(variants)} variants):") |
|
for variant in sorted(variants): |
|
variant_data = df_clean[df_clean['Description de l\'équipement'] == variant] |
|
avg_crit = variant_data['Criticité'].mean() |
|
count = len(variant_data) |
|
print(f" - {variant}: {avg_crit:.2f} avg criticality ({count} cases)") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 7: SECTION-EQUIPMENT CRITICALITY ANALYSIS") |
|
print("="*50) |
|
|
|
|
|
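# Cross the owning section with the equipment type, then keep combinations that
# are both frequent (at least 3 cases) and severe (average criticality >= 7.0).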
section_equipment_analysis = df_clean.groupby(['Section propriétaire', 'Description de l\'équipement']).agg({ |
|
'Criticité': ['mean', 'count', 'max'] |
|
}).round(3) |
|
|
|
section_equipment_analysis.columns = ['avg_criticality', 'count', 'max_criticality'] |
|
section_equipment_analysis = section_equipment_analysis.reset_index() |
|
|
|
|
|
high_risk_combinations = section_equipment_analysis[ |
|
(section_equipment_analysis['avg_criticality'] >= 7.0) & |
|
(section_equipment_analysis['count'] >= 3) |
|
].sort_values('avg_criticality', ascending=False) |
|
|
|
print(f"\nHigh-risk Section-Equipment combinations (avg criticality >= 7.0, min 3 cases):") |
|
for _, row in high_risk_combinations.iterrows(): |
|
    equip_name = row["Description de l'équipement"]
    print(f" {row['Section propriétaire']} - {equip_name}: "
|
f"{row['avg_criticality']:.2f} avg ({row['count']} cases, max: {row['max_criticality']})") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 8: CRITICAL EQUIPMENT KEYWORD ANALYSIS") |
|
print("="*50) |
|
|
|
|
|
equipment_keywords = {} |
|
|
|
|
|
|
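# French domain keywords (listed with and without accents) that typically denote
# major plant equipment or its role in the process.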
important_keywords = [ |
|
'PRINCIPAL', 'TRANSFO', 'TURBINE', 'ALTERNATEUR', 'POMPE', 'VENTILATEUR', |
|
'CHAUDIERE', 'CHAUDIÈRE', 'COMPRESSEUR', 'MOTEUR', 'VANNE', 'SOUPAPE', |
|
'RECHAUFFEUR', 'RÉCHAUFFEUR', 'REFROIDISSEMENT', 'REFRIGERANT', 'RÉFRIGÉRANT', |
|
'PRIMAIRE', 'SECONDAIRE', 'AUXILIAIRE', 'UNITE', 'UNITÉ', 'GROUPE' |
|
] |
|
|
|
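# For each keyword, measure how often it appears in equipment descriptions and
# the average criticality of the matching anomaly records.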
for keyword in important_keywords: |
|
|
|
equipment_with_keyword = df_clean[df_clean['Description de l\'équipement'].str.contains(keyword, case=False, na=False)] |
|
if len(equipment_with_keyword) > 0: |
|
avg_criticality = equipment_with_keyword['Criticité'].mean() |
|
count = len(equipment_with_keyword) |
|
equipment_keywords[keyword] = { |
|
'count': count, |
|
'avg_criticality': avg_criticality, |
|
'percentage': count / len(df_clean) * 100 |
|
} |
|
|
|
print("\nEquipment keywords analysis (sorted by average criticality):") |
|
sorted_keywords = sorted(equipment_keywords.items(), key=lambda x: x[1]['avg_criticality'], reverse=True) |
|
for keyword, stats in sorted_keywords: |
|
print(f" {keyword}: {stats['avg_criticality']:.3f} avg criticality " |
|
f"({stats['count']} cases, {stats['percentage']:.1f}% of dataset)") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 9: PATTERNS IN CRITICAL CASES (CRITICALITY >= 10)") |
|
print("="*50) |
|
|
|
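# Focus on the most severe records and check which equipment types and owning
# sections are over-represented among them.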
critical_cases = df_clean[df_clean['Criticité'] >= 10] |
|
print(f"\nTotal critical cases (criticality >= 10): {len(critical_cases)}") |
|
|
|
if len(critical_cases) > 0: |
|
print(f"\nEquipment types in critical cases:") |
|
critical_equipment_counts = critical_cases['Description de l\'équipement'].value_counts() |
|
for equipment, count in critical_equipment_counts.items(): |
|
total_equipment_cases = len(df_clean[df_clean['Description de l\'équipement'] == equipment]) |
|
percentage = count / total_equipment_cases * 100 |
|
print(f" {equipment}: {count}/{total_equipment_cases} cases ({percentage:.1f}% critical)") |
|
|
|
print(f"\nSections with critical cases:") |
|
critical_section_counts = critical_cases['Section propriétaire'].value_counts() |
|
for section, count in critical_section_counts.items(): |
|
total_section_cases = len(df_clean[df_clean['Section propriétaire'] == section]) |
|
percentage = count / total_section_cases * 100 |
|
print(f" {section}: {count}/{total_section_cases} cases ({percentage:.1f}% critical)") |
|
|
|
|
|
print("\n" + "="*50) |
|
print("STEP 10: EQUIPMENT ANALYSIS RECOMMENDATIONS") |
|
print("="*50) |
|
|
|
print("\n🎯 KEY FINDINGS:") |
|
print("1. Equipment Redundancy Patterns:") |
|
print(f" - {len(equipment_df[equipment_df['redundancy_class'] == 'SINGLE_CRITICAL'])} equipment types classified as SINGLE_CRITICAL") |
|
print(f" - {len(equipment_df[equipment_df['redundancy_class'] == 'DUAL_SYSTEM'])} equipment types classified as DUAL_SYSTEM") |
|
print(f" - {len(equipment_df[equipment_df['redundancy_class'] == 'MULTIPLE_SYSTEM'])} equipment types classified as MULTIPLE_SYSTEM") |
|
|
|
print("\n2. High-Risk Equipment Keywords:") |
|
top_risk_keywords = sorted_keywords[:5] |
|
for keyword, stats in top_risk_keywords: |
|
print(f" - '{keyword}': {stats['avg_criticality']:.3f} avg criticality") |
|
|
|
print("\n3. Equipment Families with Redundancy:") |
|
print(f" - Found {len(redundant_families)} equipment families with multiple variants") |
|
print(f" - This suggests systematic redundancy patterns in the data") |
|
|
|
print("\n🚀 RECOMMENDATIONS FOR FEATURE ENGINEERING:") |
|
print("1. Create 'equipment_redundancy_class' feature based on naming patterns") |
|
print("2. Add 'equipment_base_type' feature by extracting equipment families") |
|
print("3. Implement 'critical_equipment_keywords' scoring system") |
|
print("4. Create 'section_equipment_risk' interaction features") |
|
print("5. Build 'equipment_criticality_history' based on historical data") |
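
# Illustrative sketch (new code, not part of the pipeline above): recommendations
# 1 and 2 could be prototyped by mapping the per-equipment attributes back onto
# the row-level anomaly data. The names redundancy_map and df_features are
# introduced here purely for illustration.
redundancy_map = equipment_df.set_index('equipment')['redundancy_class']
df_features = df_clean.copy()
df_features['equipment_redundancy_class'] = df_features["Description de l'équipement"].map(redundancy_map)
df_features['equipment_base_type'] = df_features["Description de l'équipement"].apply(extract_base_equipment_name)
print("\n(Sketch) Example engineered columns on a copy of the data: "
      "'equipment_redundancy_class', 'equipment_base_type'")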
|
|
|
|
|
print("\n" + "="*50) |
|
print("SAVING ANALYSIS RESULTS") |
|
print("="*50) |
|
|
|
|
|
equipment_df.to_csv('equipment_analysis_results.csv', index=False) |
|
print("✓ Saved equipment analysis to 'equipment_analysis_results.csv'") |
|
|
|
|
|
high_risk_combinations.to_csv('high_risk_equipment_combinations.csv', index=False) |
|
print("✓ Saved high-risk combinations to 'high_risk_equipment_combinations.csv'") |
|
|
|
|
|
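# Compact JSON-serializable summary of the headline numbers.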
summary_stats = { |
|
'total_equipment_types': len(equipment_df), |
|
'single_critical_equipment': len(equipment_df[equipment_df['redundancy_class'] == 'SINGLE_CRITICAL']), |
|
'dual_system_equipment': len(equipment_df[equipment_df['redundancy_class'] == 'DUAL_SYSTEM']), |
|
'multiple_system_equipment': len(equipment_df[equipment_df['redundancy_class'] == 'MULTIPLE_SYSTEM']), |
|
'high_criticality_equipment': len(high_criticality_equipment), |
|
'equipment_families_with_redundancy': len(redundant_families), |
|
'critical_cases_count': len(critical_cases) |
|
} |
|
|
|
|
with open('equipment_analysis_summary.json', 'w') as f: |
|
json.dump(summary_stats, f, indent=2) |
|
print("✓ Saved summary statistics to 'equipment_analysis_summary.json'") |
|
|
|
print("\n" + "="*60) |
|
print("EQUIPMENT ANALYSIS COMPLETED!") |
|
print("="*60) |
|
print("\nFiles generated:") |
|
print("- equipment_analysis_results.csv") |
|
print("- high_risk_equipment_combinations.csv") |
|
print("- equipment_analysis_summary.json") |
|
print("\nPlease review the analysis results and share the key findings!") |
|
print("This will help us design the optimal equipment intelligence features.") |