Spaces:
Runtime error
Runtime error
| import pandas as pd | |
# CSV file loaded by main(); the path is relative, so it is resolved against
# the current working directory.
DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"
def _normalize_language(df):
    """Return the Language column trimmed and lower-cased, keeping gaps missing."""
    raw = df["Language"]
    # astype(str) alone would turn a missing value into the literal text "nan",
    # which would then be counted as a language; mask those rows back to NaN.
    return raw.astype(str).str.strip().str.lower().where(raw.notna())


def _count_label_source_mismatches(df):
    """Count rows whose Source_Type contradicts the 0 -> 'human', 1 -> 'ai' mapping.

    Rows with a missing Source_Type, or with a Label other than 0/1, are not
    counted because no expectation can be derived for them.
    """
    expected = pd.to_numeric(df["Label"], errors="coerce").map({0: "human", 1: "ai"})
    # Compare on a trimmed, lower-cased copy so 'Human ' still matches 'human'.
    actual = df["Source_Type"].astype(str).str.strip().str.lower()
    comparable = expected.notna() & df["Source_Type"].notna()
    return int((comparable & (expected != actual)).sum())


def main(data_path=None):
    """Print a metadata consistency report for the cleaned dataset.

    The report shows the distribution of the (normalized) Language column and
    a Label x Source_Type cross-tabulation, followed by the number of rows that
    violate the expected Label -> Source_Type mapping.

    Args:
        data_path: CSV file to inspect. When None (the default), the
            module-level DATA_PATH is used.

    Raises:
        FileNotFoundError: propagated from pandas.read_csv when the CSV file
            does not exist.
    """
    df = pd.read_csv(DATA_PATH if data_path is None else data_path)

    # -------------------------------
    # Handle Label column safely
    # -------------------------------
    if "Label" not in df.columns:
        if "Label (0- HUMAN, 1-AI)" in df.columns:
            df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})
        else:
            print("⚠️ Label column not found. Skipping label-based checks.")

    # -------------------------------
    # Normalize Language column
    # -------------------------------
    if "Language" in df.columns:
        df["Language"] = _normalize_language(df)

    print("\n🧩 METADATA CONSISTENCY CHECK\n")

    # -------------------------------
    # Language consistency
    # -------------------------------
    print("🌐 Language distribution:")
    if "Language" in df.columns:
        # dropna=False reports missing languages as NaN instead of hiding them.
        print(df["Language"].value_counts(dropna=False))
    else:
        print(" Language column not found")

    # -------------------------------
    # Label vs Source_Type consistency
    # -------------------------------
    print("\n🏷️ Label vs Source_Type consistency:")
    if "Label" in df.columns and "Source_Type" in df.columns:
        # The table keeps the raw Source_Type values so spelling variants stay visible.
        print(pd.crosstab(df["Label"], df["Source_Type"]))
        print("\nExpected behavior:")
        print(" Label 0 (Human) → Source_Type should be 'human'")
        print(" Label 1 (AI) → Source_Type should be 'ai'")
        mismatches = _count_label_source_mismatches(df)
        print(f"\n Rows violating the expected mapping: {mismatches}")
    else:
        print(" Required columns not found for consistency check")

    print("\nMetadata consistency check completed ✅")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()