Spaces:
Runtime error
Runtime error
| import pandas as pd | |
# Default location of the cleaned dataset, relative to the working directory.
DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"


def _line_counts(code):
    """Return the number of newline-separated lines in each entry of *code*.

    Null entries are treated as an empty string (one empty line) so that a
    float NaN never reaches a string operation.
    """
    text = code.astype(str).where(code.notna(), "")
    # "a\nb" has one newline and two lines, hence the +1.
    return text.str.count("\n") + 1


def main(data_path=None):
    """Print a sanity-check report for the cleaned dataset CSV.

    The report covers: sample/column totals, null counts for the critical
    columns, blank and very short ``normalized_code`` rows, the distinct
    labels, line-count statistics, and a per-language summary.

    Args:
        data_path: Path of the CSV file to inspect. When ``None`` (the
            default), the module-level ``DATA_PATH`` is used.

    Missing ``normalized_code`` or ``Language`` columns are reported in the
    output instead of raising ``KeyError``. Errors from ``pd.read_csv``
    (for example a missing file) are not caught and propagate to the caller.
    """
    if data_path is None:
        # Resolved at call time so a reassigned DATA_PATH is still honoured.
        data_path = DATA_PATH
    df = pd.read_csv(data_path)

    # -------------------------------
    # Handle label column safely
    # -------------------------------
    if "Label" not in df.columns:
        if "Label (0- HUMAN, 1-AI)" in df.columns:
            df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})
        else:
            print("⚠️ Label column not found (labels may be added later).")

    print("\n🧪 SANITY CHECK REPORT\n")
    print(f"Total samples: {len(df)}")
    print(f"Total columns: {len(df.columns)}\n")

    # -------------------------------
    # Critical column null checks
    # -------------------------------
    print("🔍 Null value check (critical columns):")
    critical_cols = ["normalized_code", "Language"]
    if "Label" in df.columns:
        critical_cols.append("Label")
    for col in critical_cols:
        if col in df.columns:
            null_count = df[col].isnull().sum()
            print(f" {col}: {null_count} null values")
        else:
            print(f" {col}: COLUMN NOT FOUND")

    # -------------------------------
    # Code quality checks
    # -------------------------------
    # Line counts are computed once here and reused by the per-language
    # summary below; None marks "normalized_code column is absent".
    line_counts = None
    print("\n🧾 Code quality check:")
    if "normalized_code" in df.columns:
        code = df["normalized_code"]
        line_counts = _line_counts(code)
        # Null rows are already counted by the null check above, so only
        # non-null, whitespace-only rows count as "empty" here.
        empty_code = (code.notna() & code.astype(str).str.strip().eq("")).sum()
        print(f" Empty normalized_code rows: {empty_code}")
        print(f" Very short code (<3 lines): {(line_counts < 3).sum()}")
    else:
        print(" normalized_code column not found; skipping.")

    # -------------------------------
    # Label sanity
    # -------------------------------
    if "Label" in df.columns:
        # Nulls are dropped because NaN has no defined sort position; they
        # are still visible in the null check above.
        unique_labels = df["Label"].dropna().unique().tolist()
        try:
            unique_labels.sort()
        except TypeError:
            # Mixed label types (e.g. ints and strings) have no natural
            # ordering; fall back to ordering by their text form.
            unique_labels.sort(key=str)
        print("\n🏷️ Label check:")
        print(f" Unique labels found: {unique_labels}")

    # -------------------------------
    # Line count statistics
    # -------------------------------
    print("\n📏 Line count statistics:")
    if "original_line_count" in df.columns and "normalized_line_count" in df.columns:
        print(df[["original_line_count", "normalized_line_count"]].describe())
    else:
        print(" Line count columns not found.")

    # -------------------------------
    # Language-wise sanity summary
    # -------------------------------
    print("\n🌐 Language-wise summary:")
    if "Language" in df.columns:
        languages = df["Language"]
        for lang in languages.unique():
            # NaN never compares equal to itself, so rows with a null
            # language must be selected with isna() rather than ==.
            in_lang = languages.isna() if pd.isna(lang) else languages == lang
            print(f"\nLanguage: {lang}")
            print(f" Samples: {in_lang.sum()}")
            if line_counts is not None:
                short = (line_counts[in_lang] < 3).sum()
                print(f" Very short code (<3 lines): {short}")
    else:
        print(" Language column not found; skipping.")

    print("\nSanity check completed successfully ✅")


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()