File size: 4,399 Bytes
6340cf4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from process_xlsx import process_xlsx
import pandas as pd
import re

# Step 1: Convert to DataFrame
def create_course_dataframe(cleaned_column_names, column_names, department_program_courses):
    data = []
    for department, programs in department_program_courses.items():
        for program, courses in programs.items():
            for course in courses:
                row_data = course[:]  # Copy original row
                row_data.append(department)  # Add Department column
                row_data.append(program if program else "N/A")  # Add Program column
                data.append(row_data)
    
    # Add Department and Program to the column names
    extended_column_names = column_names + ['Department', 'Program']
    
    # Create DataFrame
    df = pd.DataFrame(data, columns=extended_column_names)
    # Strip trailing spaces from column names and remove spaces/newlines
    df.columns = df.columns.str.strip().str.replace(' ', '').str.replace('\n', '')
    
    # Ensure all column names are strings before stripping
    df.rename(columns=lambda x: str(x).strip(), inplace=True)
    
    # Clean the cleaned_column_names to match the stripped column names
    cleaned_column_names = [col.strip() for col in cleaned_column_names]
    
    # Select columns based on cleaned_column_names
    df = df[cleaned_column_names]

    return df

# Step 2: Diagnose Inconsistencies in Data
def diagnose_inconsistencies(df):
    # Report missing values
    missing_values = df.isnull().sum()
    print("\nMissing Values Per Column:")
    print(missing_values[missing_values > 0])
    
    # Check unique value counts to spot potential inconsistencies
    print("\nUnique Value Counts Per Column:")
    for column in df.columns:
        unique_vals = df[column].nunique()
        print(f"{column}: {unique_vals} unique values")
    
    # Identify potential misspellings and inconsistent values in key columns
    # Example: Checking for inconsistencies in 'Course Code', 'Instructor', 'Room', etc.
    print("\nInconsistent Patterns and Values:")
    
    # Pattern checks for Course Code (e.g., expecting format like 'MAT101', 'STA421/521')
    inconsistent_course_codes = df[~df['CourseCode'].str.match(r'^[A-Z]{3}\d{3}(/\d{3})?$')]
    if not inconsistent_course_codes.empty:
        print("\nInconsistent Course Codes:")
        print(inconsistent_course_codes[['CourseCode']].drop_duplicates())
    
    # Check for inconsistent capitalization in 'Instructor' column
    df['Instructor'] = df['Instructor'].str.strip().str.title()
    instructor_inconsistencies = df['Instructor'].value_counts()
    print("\nInstructor Inconsistencies:")
    print(instructor_inconsistencies[instructor_inconsistencies > 1])
    
    # Check for possible misspellings or variations in Room
    print("\nRoom Variations:")
    room_variations = df['Room'].value_counts()
    print(room_variations[room_variations > 1])
    
    # Identify rows with missing key fields that should generally be non-null
    key_columns = ['CourseCode', 'CourseTitle', 'Cr', 'Instructor']
    missing_key_fields = df[df[key_columns].isnull().any(axis=1)]
    if not missing_key_fields.empty:
        print("\nRows with Missing Key Fields:")
        print(missing_key_fields[key_columns])
    
    # Display data types and any anomalies in numeric fields
    print("\nData Types and Anomalies in Numeric Fields:")
    for column in df.select_dtypes(include=['number']).columns:
        print(f"{column} - Min: {df[column].min()}, Max: {df[column].max()}, Unique Values: {df[column].nunique()}")
    
    return df


file_path = "data/FTCM_Course_List_Spring2025.xlsx"
result = process_xlsx(file_path)
    
if result:
    column_names, department_program_courses = result
    print(f"Column Names:{column_names}") 
else:
    print(f"Error processing file. {file_path}")

cleaned_column_names = ['CourseCode', 'CourseTitle', 'Cr', 'Prereq(s)', 
    'Instructor', 'Major/GE/Elective', 'Format', 'Mon', 'MonTo',
    'Tue', 'TueTo', 'Wed', 'WedTo', 'Thu', 'ThuTo', 
    'Fri', 'FriTo', 'Sat', 'SatTo', 'Platform', 'New/Repeat', 'Room', 'Department', 'Program']
# Sample usage
# Assuming column_names and department_program_courses are already defined
df = create_course_dataframe(cleaned_column_names, column_names, department_program_courses)
df_cleaned = diagnose_inconsistencies(df)
diagnose_inconsistencies(df_cleaned)