File size: 3,568 Bytes
fb1a11c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
{
    "notebook_title": "Exploratory data analysis (EDA)",
    "notebook_type": "eda",
    "dataset_type": "numeric",
    "notebook_template": [
        {
            "cell_type": "markdown",
            "source": "\n---\n# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**\n---\n"
        },
        {
            "cell_type": "markdown",
            "source": "## 1. Set up necessary libraries and load the dataset"
        },
        {
            "cell_type": "code",
            "source": "\n# Install necessary libraries.\n!pip install pandas matplotlib seaborn\n"
        },
        {
            "cell_type": "code",
            "source": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n"
        },
        {
            "cell_type": "code",
            "source": "\n# Load the dataset as a DataFrame\n{first_code}\n"
        },
        {
            "cell_type": "markdown",
            "source": "## 2. Understanding the Dataset"
        },
        {
            "cell_type": "code",
            "source": "\n# First rows of the dataset and info\nprint(df.head())\nprint(df.info())\n"
        },
        {
            "cell_type": "code",
            "source": "\n# Check for missing values\nprint(df.isnull().sum())\n"
        },
        {
            "cell_type": "code",
            "source": "\n# Identify data types of each column\nprint(df.dtypes)\n"
        },
        {
            "cell_type": "code",
            "source": "\n# Detect duplicated rows\nprint(df.duplicated().sum())\n"
        },
        {
            "cell_type": "code",
            "source": "\n# Generate descriptive statistics\nprint(df.describe())\n"
        },
        {
            "type": "categoric",
            "cell_type": "code",
            "source": "\n# Number of unique values in each categorical column\ndf.select_dtypes(include=['object']).nunique()\n"
        },
        {
            "cell_type": "markdown",
            "source": "## 3. Data Visualization"
        },
        {
            "type": "numeric",
            "cell_type": "code",
            "source": "\n# Correlation matrix for numerical columns\ncorr_matrix = df.corr(numeric_only=True)\nplt.figure(figsize=(10, 8))\nsns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)\nplt.title('Correlation Matrix')\nplt.show()\n"
        },
        {
            "type": "numeric",
            "cell_type": "code",
            "source": "\n# Distribution plots for numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.histplot(df[column], kde=True)\n    plt.title(f'Distribution of {column}')\n    plt.xlabel(column)\n    plt.ylabel('Frequency')\n    plt.show()\n"
        },
        {
            "type": "categoric",
            "cell_type": "code",
            "source": "\n# Count plots for categorical columns\nfor column in df.select_dtypes(include=['object']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.countplot(x=column, data=df)\n    plt.title(f'Count Plot of {column}')\n    plt.xlabel(column)\n    plt.ylabel('Count')\n    plt.show()\n"
        },
        {
            "type": "numeric",
            "cell_type": "code",
            "source": "\n# Box plots for detecting outliers in numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.boxplot(df[column])\n    plt.title(f'Box Plot of {column}')\n    plt.xlabel(column)\n    plt.show()\n"
        }
    ]
}