{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c2f352ed",
   "metadata": {},
   "source": [
    "# Autism screening classifier\n",
    "\n",
    "Loads the autism-screening-on-adults CSV, keeps the ten `A*_Score` columns plus `age`, `gender`, `jundice` and `austim`, encodes the yes/no columns as 0/1, fits a `RandomForestClassifier` on `Class/ASD`, and pickles the fitted model to `autism_clf.pkl`.\n",
    "\n",
    "## Dataset\n",
    "\n",
    "https://www.kaggle.com/datasets/andrewmvd/autism-screening-on-adults"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dff0477d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.ensemble import RandomForestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cfg-constants",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives in this cell.\n",
    "DATA_PATH = 'autism_screening.csv'\n",
    "MODEL_PATH = 'autism_clf.pkl'\n",
    "SEED = 42  # fixed so that Restart & Run All reproduces the same forest\n",
    "MAX_PLAUSIBLE_AGE = 120  # the raw data contains an age of 383, which cannot be real\n",
    "\n",
    "# Model inputs, in the column order the classifier is trained on.\n",
    "FEATURE_COLUMNS = [f'A{i}_Score' for i in range(1, 11)] + ['age', 'gender', 'jundice', 'austim']\n",
    "TARGET_COLUMN = 'Class/ASD'\n",
    "\n",
    "# For each text column: the value encoded as 1. Every other value becomes 0.\n",
    "POSITIVE_VALUE = {'gender': 'm', 'jundice': 'yes', 'austim': 'yes', 'Class/ASD': 'YES'}\n",
    "\n",
    "pd.set_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load-data",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53a8cd69",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_raw = pd.read_csv(DATA_PATH)\n",
    "df_raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "367014f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_raw.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c34f87cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Value ranges of the numeric columns; check `age` in particular.\n",
    "df_raw.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-clean-encode",
   "metadata": {},
   "source": [
    "## Clean and encode\n",
    "\n",
    "- Only the columns the model uses are kept, so missing values in unused columns cannot discard rows.\n",
    "- Rows with a missing value are dropped (when this notebook was written, `age` was missing in 2 of 704 rows).\n",
    "- Rows with `age` above `MAX_PLAUSIBLE_AGE` are dropped: the raw maximum was 383.\n",
    "- Each step builds a new frame instead of overwriting its input, so every cell can be re-run safely."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0da534fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_clean = (\n",
    "    df_raw[FEATURE_COLUMNS + [TARGET_COLUMN]]\n",
    "    .dropna()\n",
    "    .loc[lambda frame: frame['age'] <= MAX_PLAUSIBLE_AGE]\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "assert not df_clean.isna().any().any(), 'unexpected missing values after cleaning'\n",
    "df_clean.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "025d840c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derive the encoded frame from df_clean rather than mutating it: encoding an\n",
    "# already-encoded column would compare integers to 'm'/'yes' and yield all zeros.\n",
    "df = df_clean.assign(\n",
    "    **{\n",
    "        column: np.where(df_clean[column] == positive, 1, 0)\n",
    "        for column, positive in POSITIVE_VALUE.items()\n",
    "    }\n",
    ")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56bf3c6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-train-save",
   "metadata": {},
   "source": [
    "## Train and save the model\n",
    "\n",
    "**TODO:** the classifier is fit on every cleaned row; there is no hold-out set or cross-validation, so this notebook does not measure how well the model generalizes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79794484",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df[FEATURE_COLUMNS]\n",
    "Y = df[TARGET_COLUMN]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d505ff04",
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = RandomForestClassifier(random_state=SEED)\n",
    "clf.fit(X, Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f64b8440",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The context manager closes (and therefore flushes) the file even if dump() raises.\n",
    "with open(MODEL_PATH, 'wb') as model_file:\n",
    "    pickle.dump(clf, model_file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "a07fda4108288eac30d110fb6c0835906d2e1f106803cd94f31ab0c859b7a7bd"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}