# Breast cancer diagnosis: data cleaning, exploratory analysis and a
# comparison of five scikit-learn classifiers on a held-out test split.

# Data preparation and import libraries
import datetime as dt
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
# %matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

# Location of the input CSV; change this when running outside Kaggle.
DATA_PATH = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'

# Two-colour palette shared by the target bar chart and confusion matrices.
PALETTE = ['lightsteelblue', 'navajowhite']

data = pd.read_csv(DATA_PATH)

# Data Exploration and Data Cleaning
print(data.head())
print(data.shape)
data.info()

# Check for duplicated rows.
print(data.duplicated().sum())

# Check for missing values per column.
print(data.isna().sum())

# Check the number of unique values of each column.
print(data.nunique())

# Drop the 'id' and 'Unnamed: 32' columns, which carry no information for
# the model.
data = data.drop(columns=['id', 'Unnamed: 32'])

# Rename the diagnosis column to 'target'.
data = data.rename(columns={'diagnosis': 'target'})

# Encode the target as integers: malignant ('M') -> 1, benign ('B') -> 0.
# Assigning the mapped column back (instead of a chained in-place replace)
# guarantees the frame is actually updated; any label other than 'M'/'B'
# becomes NaN and makes the int64 cast fail loudly instead of slipping through.
data['target'] = data['target'].map({'M': 1, 'B': 0}).astype('int64')

# Data processing result
print(data.head())
print(data.tail())
data.info()

# Summary statistics of the dataset.
print(data.describe().T)

# Analysis & EDA
# Number of benign and malignant samples.
print(data['target'].value_counts())

# Bar chart of the class counts (1 --> Malignant, 0 --> Benign).
data['target'].value_counts().plot(kind='bar', edgecolor='black', color=PALETTE)
plt.title(" Target", fontsize=20)
plt.show()

# Correlation Analysis
cor = data.corr()
print(cor)

plt.figure(figsize=(25, 23))
sns.heatmap(cor, annot=True, linewidths=0.3, linecolor="black", fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

# Features whose absolute correlation with the target exceeds the threshold.
# The absolute value is taken BEFORE the comparison so that strongly negative
# correlations are selected too. 'target' itself (correlation 1.0) is always
# included, which the pairplot below relies on for its hue column.
threshold = 0.75
filtre = np.abs(cor["target"]) > threshold
corr_features = cor.columns[filtre].tolist()

# clustermap creates its own figure, so the size is passed directly rather
# than through a preceding plt.figure() call (which would leave a blank figure).
sns.clustermap(data[corr_features].corr(), annot=True, fmt=".2f", figsize=(10, 8))
plt.title(f"\n Correlation Between Features with Cor Threshold [{threshold}]\n", fontsize=20)
plt.show()

# Pairwise plots of the selected features, coloured by class.
sns.pairplot(data[corr_features], diag_kind="kde", markers="*", hue="target")
plt.show()

# Machine Learning Model Evaluation
# Splitting data
x = data.drop('target', axis=1)
y = data['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=101)

# Fit the scaler on the training split only and reuse those statistics for the
# test split, so both splits are scaled identically and no test-set
# information is used during preprocessing.
s = StandardScaler()
x_train = s.fit_transform(x_train)
x_test = s.transform(x_test)


def _plot_confusion_matrix(matrix, title, fmt):
    """Draw one annotated confusion-matrix heatmap and show it."""
    sns.heatmap(matrix, annot=True, fmt=fmt, cmap=PALETTE)
    plt.title(title)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()


def evaluate_model(model, x_train, y_train, x_test, y_test):
    """Fit *model* on the training split and report its test performance.

    Plots the raw and the row-normalized confusion matrix, then prints the
    raw confusion matrix, the classification report and the accuracy.

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.

    Returns:
        The accuracy of the fitted model on the test split.
    """
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    acc = accuracy_score(y_test, pred)

    # Confusion matrix without normalization (raw counts).
    print('confusion matrix')
    cm = confusion_matrix(y_test, pred)
    _plot_confusion_matrix(cm, 'Confusion matrix', fmt='d')

    # Confusion matrix normalized over the true labels (each row sums to 1).
    print('Normalized confusion matrix')
    cm_normalized = confusion_matrix(y_test, pred, normalize='true')
    _plot_confusion_matrix(cm_normalized, 'Normalized Confusion matrix', fmt='.2g')

    # Print confusion matrix, classification report and accuracy.
    print(cm)
    print(classification_report(y_test, pred))
    print('accuracy_score : ', acc)
    return acc


# Keeping each display name next to its estimator means the names and the
# collected accuracies cannot drift out of alignment.
models = {
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=2),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, random_state=0),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'GaussianNB': GaussianNB(),
    'LogisticRegression': LogisticRegression(),
}

algorithm = []
Accuracy = []
for model_name, model in models.items():
    print(f'\n=== {model_name} ===')
    algorithm.append(model_name)
    Accuracy.append(evaluate_model(model, x_train, y_train, x_test, y_test))

# Accuracy comparison across models.
df = pd.DataFrame({'Algorithm': algorithm, 'Accuracy': Accuracy})
print(df)

fig = plt.figure(figsize=(20, 10))
plt.plot(df.Algorithm, df.Accuracy, label='Accuracy', lw=5, color='peru', marker='o', markersize=15)
plt.legend(fontsize=15)
plt.xlabel('\nModel', fontsize=20)
plt.ylabel('Accuracy\n', fontsize=20)
plt.show()