# Data preparation and import libraries
import datetime as dt
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# %matplotlib inline
warnings.filterwarnings('ignore')
# Load the dataset from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

# Data Exploration and Data Cleaning
# NOTE: the bare expressions in this section (data.head(), data.shape, ...)
# only render output when they are the last statement of a notebook cell;
# they are kept so the notebook workflow is unchanged.
data.head()
data.shape

# Count fully duplicated rows.
data.duplicated().sum()
# Author's note from the original run: no duplicated rows were found.

# Count missing values per column.
data.isna().sum()
# Author's note from the original run: no missing values were found
# (not verifiable from this code alone).

# Number of unique values in each column.
data.nunique()

# Drop the 'id' and 'Unnamed: 32' columns, which carry no information for the model.
data = data.drop(columns=['id', 'Unnamed: 32'])

# Rename the 'diagnosis' column to 'target'.
data = data.rename(columns={'diagnosis': 'target'})

# Encode the target: malignant ('M') -> '1', benign ('B') -> '0'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data['target']: an in-place call on a column
# selection acts on a temporary object under pandas Copy-on-Write and would
# leave `data` unchanged.
data['target'] = data['target'].replace({'M': '1', 'B': '0'})

# Convert the encoded labels to a numeric dtype. The cast is to float64 (the
# earlier comment said int64, but the code used float64); any label other than
# 'M'/'B' makes this cast raise instead of passing through silently.
data['target'] = data['target'].astype(dtype='float64')

### Data processing result
data.head()
data.tail()

# Summary statistics of the dataset, one row per column.
data.describe().T
# Analysis📝 & EDA📊
# Show how many benign (0) and malignant (1) samples the dataset contains.
data['target'].value_counts().plot(kind='bar', edgecolor='black', color=['lightsteelblue', 'navajowhite'])
plt.title(" Target", fontsize=20)
plt.show()  # finish this figure so the next plot does not draw into it
#- 1-->Malignant
#- 0-->Benign

# Correlation Analysis
cor = data.corr()
cor
plt.figure(figsize=(25, 23))
sns.heatmap(cor, annot=True, linewidths=0.3, linecolor="black", fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

#- Features whose absolute correlation with the target is greater than 0.75.
threshold = 0.75
# Take the absolute value of the correlation BEFORE comparing with the
# threshold. The previous form, np.abs(cor["target"] > threshold), applied
# abs() to the boolean mask, so strongly negative correlations were never
# selected.
filtre = np.abs(cor["target"]) > threshold
# 'target' correlates 1.0 with itself, so it stays in the list; the pairplot
# below relies on that for its hue column.
corr_features = cor.columns[filtre].tolist()
# clustermap is a figure-level function that creates its own figure, so the
# size is passed through `figsize` rather than a preceding plt.figure(),
# which would only leave an empty extra figure behind.
sns.clustermap(data[corr_features].corr(), annot=True, fmt=".2f", figsize=(10, 8))
plt.title("\n Correlation Between Features with Cor Thresgold [0.75]\n", fontsize=20)
plt.show()

### Pairwise relationships between the highly correlated features
sns.pairplot(data[corr_features], diag_kind="kde", markers="*", hue="target")
plt.show()
# Machine Learning Model Evaluation
# Splitting data: every column except 'target' is a feature.
x = data.drop('target', axis=1)
y = data['target']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=101)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to the test split with the same mean/variance. Calling
# fit_transform on the test split (as before) re-fits the scaler on test data,
# so train and test end up on different scales and test statistics leak into
# preprocessing.
s = StandardScaler()
x_train = s.fit_transform(x_train)
x_test = s.transform(x_test)

# Display names of the evaluated algorithms, in the order they are run below.
algorithm = ['KNeighborsClassifier', 'RandomForestClassifier', 'DecisionTreeClassifier', 'GaussianNB', 'LogisticRegression']
# Accuracy of each evaluated model; appended to by all().
Accuracy = []
def all(model):
    """Fit ``model`` on the training split and report its test performance.

    The model is fitted on the module-level ``x_train``/``y_train``, used to
    predict ``x_test``, and its accuracy is appended to the module-level
    ``Accuracy`` list. The raw and the row-normalized confusion matrices are
    plotted as heatmaps, and the raw confusion matrix, the classification
    report and the accuracy are printed.

    NOTE: the name shadows the builtin ``all``; it is kept because the rest
    of the file calls the function under this name.

    Args:
        model: an estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    acc = accuracy_score(y_test, pred)
    Accuracy.append(acc)

    # Confusion matrix without normalization (raw counts).
    print('confusion matrix')
    cm = confusion_matrix(y_test, pred)
    # Each heatmap gets its own figure; without this the second heatmap (and
    # its colorbar) is drawn on top of the first one on the same axes.
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap=['lightsteelblue', 'navajowhite'])
    plt.title('Confusion matrix')
    plt.xlabel('Predcted lablel')
    plt.ylabel('True lable')
    plt.show()

    # Confusion matrix normalized over the true labels (each row sums to 1).
    print('Normalized confusion matrix')
    cm1 = confusion_matrix(y_test, pred, normalize='true')
    plt.figure()
    sns.heatmap(cm1, annot=True, cmap=['lightsteelblue', 'navajowhite'])
    plt.title('Normalized Confusion matrix')
    plt.xlabel('Predcted lablel')
    plt.ylabel('True lable')
    plt.show()

    # Print the confusion matrix, the classification report and the accuracy.
    print(cm)
    print(classification_report(y_test, pred))
    print('accuracy_score : ', acc)
### KNN machine learning model Evaluation for breast cancer
model_1 = KNeighborsClassifier(n_neighbors=2)
### RandomForest
model_2 = RandomForestClassifier(n_estimators=100, random_state=0)
### DecisionTree
model_3 = DecisionTreeClassifier(random_state=42)
### Naive_bayes
model_4 = GaussianNB()
### Logistic Regression
model_5 = LogisticRegression()

models = [model_1, model_2, model_3, model_4, model_5]

# Start from an empty accuracy list so that re-running this section (e.g. a
# notebook cell) does not accumulate stale entries, which would make the
# DataFrame below fail with mismatched column lengths.
Accuracy.clear()
for clf in models:
    all(clf)

# Take the algorithm names from the evaluated models themselves so each name
# is guaranteed to line up with its accuracy.
df = pd.DataFrame({
    'Algorithm': [type(clf).__name__ for clf in models],
    'Accuracy': Accuracy,
})
df

# Line plot comparing the accuracy of the evaluated models.
fig = plt.figure(figsize=(20, 10))
plt.plot(df.Algorithm, df.Accuracy, label='Accuracy', lw=5, color='peru', marker='o', markersize=15)
plt.legend(fontsize=15)
plt.xlabel('\nModel', fontsize=20)
plt.ylabel('Accuracy\n', fontsize=20)
plt.show()