import os
from typing import Optional

import joblib
import numpy as np
import pandas as pd

from preprocessing import preprocess_pipeline


def verify_data_integrity(
    data_dir: Optional[str] = None,
    models_dir: Optional[str] = None,
) -> None:
    """Run sanity checks on the crime dataset and the saved model artifacts.

    The checks are reported on stdout, in order:

    1. the training CSV exists and loads;
    2. the training data has no fully duplicated rows;
    3. the share of violent vs. non-violent categories;
    4. (only if a test CSV exists) calendar days shared by train and test;
    5. every required model artifact exists and can be loaded.

    Args:
        data_dir: Directory holding ``train.csv`` (and optionally
            ``test.csv``). Defaults to ``../data/crimedataset`` relative to
            this file.
        models_dir: Directory holding the ``.pkl`` artifacts. Defaults to
            ``../models`` relative to this file.

    Returns:
        None. If the training file is missing the function reports it and
        returns immediately, skipping every later check.
    """
    print("=== Starting Deep Verification ===")

    # Resolve the default locations only when the caller did not supply them.
    if data_dir is None:
        data_dir = os.path.join(os.path.dirname(__file__), '../data/crimedataset')
    if models_dir is None:
        models_dir = os.path.join(os.path.dirname(__file__), '../models')

    train_path = os.path.join(data_dir, 'train.csv')
    test_path = os.path.join(data_dir, 'test.csv')

    # --- [1] Load the training data -------------------------------------
    print("\n[1] Loading Data...")
    if not os.path.exists(train_path):
        print("X Train file missing!")
        return

    df_train = pd.read_csv(train_path, parse_dates=['Dates'])
    print(f"OK Train Data Loaded: {df_train.shape}")

    # --- [2] Fully duplicated rows ---------------------------------------
    print("\n[2] Checking for Duplicates...")
    duplicates = df_train.duplicated().sum()
    if duplicates > 0:
        print(f"! Warning: {duplicates} duplicate rows found in training data.")
    else:
        print("OK No duplicates found.")

    # --- [3] Class balance -----------------------------------------------
    print("\n[3] Checking Class Balance...")
    violent_categories = [
        'ASSAULT', 'ROBBERY', 'SEX OFFENSES FORCIBLE', 'KIDNAPPING', 'HOMICIDE', 'ARSON',
    ]
    # Vectorised membership test; missing categories count as non-violent (0).
    is_violent = df_train['Category'].isin(violent_categories).astype(int)
    balance = is_violent.value_counts(normalize=True)
    violent_ratio = balance.get(1, 0)
    print(f"Violent Crime Ratio: {violent_ratio*100:.2f}%")
    print(f"Non-Violent Crime Ratio: {balance.get(0, 0)*100:.2f}%")

    if violent_ratio < 0.1:
        print("! Severe Class Imbalance detected (<10% positive class). Model may struggle with Recall.")

    # --- [4] Train/test date overlap (optional) --------------------------
    if os.path.exists(test_path):
        print("\n[4] Checking for Data Leakage (Train/Test Overlap)...")
        # Only the timestamp column is needed for this check.
        df_test = pd.read_csv(test_path, usecols=['Dates'], parse_dates=['Dates'])

        # Drop missing timestamps first: NaT would otherwise land in both
        # sets and be reported as a shared "day".
        train_dates = set(df_train['Dates'].dropna().dt.date)
        test_dates = set(df_test['Dates'].dropna().dt.date)

        overlap = train_dates & test_dates
        if overlap:
            print(f"! Warning: Found {len(overlap)} days present in BOTH Train and Test sets. Possible leakage if splitting by time.")
        else:
            print("OK No date overlap between Train and Test.")

    # --- [5] Model artifacts ---------------------------------------------
    print("\n[5] Verifying Model Artifacts...")
    required_files = ['best_model.pkl', 'label_encoders.pkl', 'kmeans.pkl']

    artifacts_ok = True
    for fname in required_files:
        fpath = os.path.join(models_dir, fname)
        if not os.path.exists(fpath):
            print(f"X Missing {fname}")
            artifacts_ok = False
            continue

        print(f"OK Found {fname}")
        # NOTE: joblib.load unpickles arbitrary objects, so models_dir must
        # only ever point at trusted, locally produced artifacts.
        try:
            joblib.load(fpath)
        except Exception as e:
            # Any load failure (corrupt file, version mismatch, ...) is
            # reported and marks the run as unhealthy rather than crashing.
            print(f" X Failed to load {fname}: {e}")
            artifacts_ok = False
        else:
            print(f" -> Successfully loaded {fname}")

    # The final verdict reflects the artifact check only; the data checks
    # above emit warnings but do not change it.
    if artifacts_ok:
        print("\n=== Verification Complete: SYSTEM HEALTHY ===")
    else:
        print("\n=== Verification Complete: ISSUES DETECTED ===")


if __name__ == "__main__":
    # Run the checks only when executed as a script, not when imported.
    verify_data_integrity()