#!/usr/bin/env python
# coding: utf-8

# In[28]:
import numpy as np
import pandas as pd
import datetime
import seaborn as sns
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import missingno as msno
import statistics
import plotly
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud, STOPWORDS
import nlplot

# #### Reading the Data set #########
# In[2]:
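# Every function in this module expects an in-memory pandas DataFrame. A
# minimal loading sketch (the path "data.csv" is a hypothetical example):
#
#     data = pd.read_csv("data.csv")
#     report = get_details(data)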
def get_details(data):
    try:
        correlation_matrix_info = {}
        missing_values_info = {}
        print("started")
        s_time = datetime.datetime.now()
        data_columns = data.columns.tolist()
        # #####################################
        #
        # ########## Types of variable ############
        # In[3]:
        num_data = data.select_dtypes(include=np.number)  # numeric data
        num_data_col = data.select_dtypes(include=np.number).columns.tolist()  # numeric column names
        # print("numeric column", len(num_data_col))
        cat_data = data.select_dtypes(include=['object'])  # categorical data
        cat_data_col = data.select_dtypes(include=['object']).columns.tolist()  # categorical column names
        # print("Categorical column", len(cat_data_col))
        bool_data = data.select_dtypes(include=["bool_"])  # boolean data
        bool_data_col = data.select_dtypes(include=["bool_"]).columns.tolist()  # boolean column names
        # print("Boolean column", len(bool_data_col))
        unsupported_data = data.select_dtypes(exclude=["number", "bool_", "object_"])
        # ##########################################################################################
        #
        # ################################### No of columns #########################################
        # In[4]:
        column = data.columns
        col_length = len(column)
        row_length = len(data)
        # print("Number of variables ", col_length)      # number of variables
        # print("Number of observations ", row_length)   # number of observations
        total_cells = col_length * row_length
        # ############################################################################################
        #
        # ################################ Missing cell and % #########################################
        # In[5]:
        missing_values = np.where(pd.isnull(data))
        no_of_missing_values = len(missing_values[0])  # number of missing cells
        missing_value_per = (no_of_missing_values / total_cells) * 100  # missing cell %
        # print("no of missing cells ", no_of_missing_values)
        # print("missing cell(%) ", missing_value_per, "%")
        # #############################################################################################
        #
        # ################################# Duplicate rows and % #######################################
        # In[6]:
        duplicate = data[data.duplicated()]
        duplicate_rows = len(duplicate)
        dup_row_per = (duplicate_rows / row_length) * 100
        # print("Duplicate rows ", duplicate_rows)
        # print("Duplicate rows (%) ", dup_row_per, "%")
        # ###############################################################################################
        #
        # #################################### Memory usage ###############################################
        # In[7]:
        memory_usage = data.memory_usage(deep=True).sum()
        memory_usage_MB = memory_usage / 1024 ** 2
        # print("Total size in memory ", memory_usage_MB, "MiB")
        avg_memory_usage = data.memory_usage(deep=True).mean()
        avg_memory_usage_MB = avg_memory_usage / 1024 ** 2
        # print("Average record size in memory ", avg_memory_usage_MB, "MiB")
        # #################################################################################################
        #
        print("Overview Completed")
        # ####################################### General Insights of Numeric Variable ##########################################
        #
        # In[8]:
        num_variable = {}
        for col in num_data_col:
            val = {}
            distinct_val = data[col].nunique()
            val['distinct'] = int(distinct_val)
            total_count = len(data[col])
            distinct_per = (distinct_val / total_count) * 100
            val['distinct_percent'] = str(distinct_per) + "%"
            null = data[col].isnull().sum()
            val['missing'] = int(null)
            percent_missing = data[col].isnull().sum() * 100 / len(data[col])
            val['missing_percent'] = str(percent_missing) + "%"
            zeros_in_col = (data[col] == 0).sum()
            val['zeros'] = int(zeros_in_col)
            zero_percent = (zeros_in_col / total_count) * 100
            val['zero_percent'] = str(zero_percent) + "%"
            mean = data[col].mean()
            val['mean'] = float(mean)
            mini = data[col].min()
            val['minimum'] = str(mini)
            median = data[col].median()
            val['median'] = str(median)
            maxi = data[col].max()
            val['maximum'] = str(maxi)
            infinite = np.isinf(data[col]).values.sum()
            val['infinite'] = int(infinite)
            infinite_percent = infinite * 100 / len(data[col])
            val['infinite_percent'] = str(infinite_percent) + "%"
            percent5 = np.percentile(data[col], 5)
            val['5th_percentile'] = str(percent5)
            percent95 = np.percentile(data[col], 95)
            val['95th_percentile'] = str(percent95)
            range1 = maxi - mini
            val['range'] = str(range1)
            q1 = np.percentile(data[col], 25)
            val['q1'] = str(q1)
            q3 = np.percentile(data[col], 75)
            val['q3'] = str(q3)
            iqr = q3 - q1
            val['iqr'] = str(iqr)
            standard_deviation = statistics.stdev(data[col])
            val['standard_deviation'] = str(standard_deviation)
            val['skewness'] = str(data[col].skew())
            val['kurtosis'] = str(data[col].kurtosis())
            val['sum'] = str(data[col].sum())
            val['variance'] = str(data[col].var())
            cv = standard_deviation / mean
            # val['co-efficient_variance'] = str(cv)
            # pandas' built-in monotonicity checks replace the original Python
            # loop, which also indexed one element past the end of the column
            val['monotonicity'] = str(data[col].is_monotonic_increasing
                                      or data[col].is_monotonic_decreasing)
            fig = px.histogram(data, x=col)
            fig.update_layout(bargap=0.2)
            val['visual_path'] = fig
            out_fig = px.box(data, x=col)
            val['outlier_img'] = out_fig
            num_variable[col] = val
        #########################################################################################################################
        print("Numeric Variable Completed")
        ####################################### General Insights of Categorical Variable ##########################################
        # In[9]:
        cat_variable = {}
        for col in cat_data_col:
            val = {}
            distinct_val = data[col].nunique()
            total_count = len(data[col])
            distinct_per = (distinct_val / total_count) * 100
            val['distinct'] = int(distinct_val)
            val['distinct_percent'] = str(round(distinct_per, 5)) + "%"
            missing_val = np.where(pd.isnull(data[col]))
            missing_val_count = len(missing_val[0])
            # per-column name, so the dataset-level missing_value_per computed
            # above is not clobbered before it is written into data_statistics
            missing_value_per_col = (missing_val_count / total_count) * 100
            val['missing'] = int(missing_val_count)
            val['missing_percent'] = str(round(missing_value_per_col, 5)) + "%"
            memory_usage_col = data[col].memory_usage(deep=True)
            memory_usage_col_MB = memory_usage_col / 1024 ** 2
            val['memory'] = str(round(memory_usage_col_MB, 5)) + " MiB"
            measurer = np.vectorize(len)
            temp_df1 = data[col].dropna()
            length_result = measurer(temp_df1.values.astype(str))
            val['max_length'] = int(length_result.max())
            val['median_length'] = int(np.median(length_result))
            val['mean_length'] = float(length_result.mean())
            val['min_length'] = int(length_result.min())
            val['total_character'] = int(data[col].str.len().sum())
            # collect the set of distinct characters across all string values
            distinct_chars = set()
            for i in data[col]:
                if type(i) == str:
                    distinct_chars.update(i)
            val['distinct_character'] = int(len(distinct_chars))
            val['distinct_categories'] = ""
            val['distinct_blocks'] = "??"
            val['distinct_scripts'] = "??"
            val['unique'] = "??"
            val['unique_percent'] = "??"
            fig = px.histogram(data, y=col)
            val['visual_path'] = fig
            cat_variable[col] = val
        # ####################################################################################################
        print("Categorical Variable Completed")
        ##### Scatter Plot for dataset ##########
        sc_fig = px.scatter_matrix(data)
        #########################################
        ################# Correlation matrix Visualization #############################
        ################## Pearson #############################
        pearsoncorr = num_data.corr(method='pearson')
        fig = go.Figure(data=[
            go.Heatmap(
                z=pearsoncorr,
                x=pearsoncorr.columns,
                y=pearsoncorr.columns)
        ])
        correlation_matrix_info['pearsons'] = fig
        ##########################################################
        ################## Spearman's #############################
        spearmancorr = num_data.corr(method='spearman')
        fig = go.Figure(data=[
            go.Heatmap(
                z=spearmancorr,
                x=spearmancorr.columns,
                y=spearmancorr.columns)
        ])
        correlation_matrix_info['spearmans'] = fig
        ###########################################################
        # ################# Kendall's #############################
        kendallcorr = num_data.corr(method='kendall')
        fig = go.Figure(data=[
            go.Heatmap(
                z=kendallcorr,
                x=kendallcorr.columns,
                y=kendallcorr.columns)
        ])
        correlation_matrix_info['kendall'] = fig
        #######################################################
        ######################################################################################################################
        ############################################### Missing Values ####################################################
        #################### Count ################
        # missingno draws on its own axes, so the figure is taken from the
        # returned Axes rather than from an empty pre-created pyplot figure
        ax1 = msno.bar(data, figsize=(20, 20), color="dodgerblue")
        fig_1 = ax1.get_figure()
        missing_values_info['count'] = fig_1
        plt.close(fig_1)
        ###########################################
        ################## Matrix ##################
        fig2 = msno.matrix(data, color=(0.27, 0.52, 1.0))
        fig_2 = fig2.get_figure()
        missing_values_info['matrix'] = fig_2
        plt.close()
        #############################################
        ################ Heatmap ################
        fig3 = msno.heatmap(data)
        fig_3 = fig3.get_figure()
        missing_values_info['heatmap'] = fig_3
        plt.close()
        #############################################
        ############## Dendrogram ##################
        fig4 = msno.dendrogram(data)
        fig_4 = fig4.get_figure()
        missing_values_info['dendrogram'] = fig_4
        plt.close()
        ################################################
        ###################################################################
        f_time = datetime.datetime.now()
        duration = f_time - s_time
        final_output = {}
        overview = {}
        reproduction = {}
        numerical_variable_info = {}
        categorical_variable_info = {}
        data_statistics = {}
        variable_type = {}
        data_statistics['number_of_variables'] = int(col_length)
        data_statistics['number_of_observations'] = int(row_length)
        data_statistics['no_of_missing_cells'] = int(no_of_missing_values)
        data_statistics['missing_cell_percent'] = str(round(missing_value_per, 5)) + "%"
        data_statistics['duplicate_rows'] = int(duplicate_rows)
        data_statistics['duplicate_rows_percent'] = str(round(dup_row_per, 5)) + "%"
        data_statistics['total_size_in_memory'] = str(round(memory_usage_MB, 5)) + "MiB"
        data_statistics['average_memory_usage'] = str(round(avg_memory_usage_MB, 5)) + "MiB"
        variable_type['numeric_column'] = int(len(num_data_col))
        variable_type['categorical_column'] = int(len(cat_data_col))
        variable_type['boolean_column'] = int(len(bool_data_col))
        overview['data_statistics'] = data_statistics
        overview['variable_type'] = variable_type
        reproduction['analysis_started'] = str(s_time)
        reproduction['analysis_finished'] = str(f_time)
        reproduction['duration'] = str(duration)
        reproduction['software_version'] = "??"
        reproduction['download_configuration'] = "??"
        numerical_variable_info['variable_info'] = num_variable
        categorical_variable_info['variable_info'] = cat_variable
        ################## Main Functions ######################################
        final_output['overview'] = overview
        final_output['reproduction'] = reproduction
        final_output['numerical_variable_info'] = numerical_variable_info
        final_output['categorical_variable_info'] = categorical_variable_info
        final_output['scatter_chart_matrix'] = sc_fig
        final_output['correlation_matrix_info'] = correlation_matrix_info
        final_output['missing_values_info'] = missing_values_info
        #######################################################################
        return final_output
    except Exception as e:
        # surface the failure instead of swallowing it silently
        print("get_details failed:", e)
        return None
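# A minimal usage sketch for get_details. The CSV path is a hypothetical
# example; any pandas DataFrame works:
#
#     data = pd.read_csv("data.csv")
#     report = get_details(data)
#     if report is not None:
#         print(report['overview']['data_statistics'])
#         report['scatter_chart_matrix'].show()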
############### Returns the imbalance ratio of the dataset ##################
def imbalnce_ratio(dataset, target):
    val = ""
    if dataset[target].nunique() <= 10:
        dt = dataset[target].value_counts()
        ln = len(dt)
        for i in range(0, ln):
            # .iloc is positional; dt[i] would do a label lookup and fail for
            # non-integer category labels
            ir_cal = round(dt.iloc[i] / len(dataset) * 10, 1)
            category = "/" + str(dt.index[i])
            if ir_cal.is_integer():
                val = val + str(int(ir_cal)) + category
            else:
                val = val + str(ir_cal) + category
            if i != (ln - 1):
                val = val + " : "
    return val
###################################################################
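# Usage sketch (the binary "label" column with values "pos"/"neg" is a
# hypothetical example): for 700 positive and 300 negative rows,
#
#     imbalnce_ratio(df, "label")   # -> "7/pos : 3/neg"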
########### Returns a figure which visualizes a text column as a word cloud ############
def word_cloud(dataset, column):
    if column == "Select":
        pass
    else:
        comment_words = ' '
        wc = WordCloud(stopwords=set(STOPWORDS),
                       max_words=200,
                       max_font_size=100)
        for val in dataset[column]:
            # typecast each value to string
            val = str(val)
            # split the value into tokens
            tokens = val.split()
            # convert each token to lowercase
            for i in range(len(tokens)):
                tokens[i] = tokens[i].lower()
            for words in tokens:
                comment_words = comment_words + words + ' '
        # generate the cloud once, after all values have been concatenated
        wc.generate(comment_words)
        word_list = []
        freq_list = []
        fontsize_list = []
        position_list = []
        orientation_list = []
        color_list = []
        for (word, freq), fontsize, position, orientation, color in wc.layout_:
            word_list.append(word)
            freq_list.append(freq)
            fontsize_list.append(fontsize)
            position_list.append(position)
            orientation_list.append(orientation)
            color_list.append(color)
        # get the positions
        x = []
        y = []
        for i in position_list:
            x.append(i[0])
            y.append(i[1])
        # get the relative occurrence frequencies
        new_freq_list = []
        for i in freq_list:
            new_freq_list.append(i * 100)
        trace = go.Scatter(x=x,
                           y=y,
                           textfont=dict(size=new_freq_list,
                                         color=color_list),
                           hoverinfo='text',
                           hovertext=['{0} {1:.2f} %'.format(w, f) for w, f in zip(word_list, new_freq_list)],
                           mode='text',
                           text=word_list
                           )
        layout = go.Layout({'xaxis': {'showgrid': False, 'showticklabels': False, 'zeroline': False},
                            'yaxis': {'showgrid': False, 'showticklabels': False, 'zeroline': False}})
        fig = go.Figure(data=[trace], layout=layout)
        return fig
###############################################################################
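# Usage sketch (the "review_text" column name is a hypothetical example):
#
#     fig = word_cloud(df, "review_text")
#     if fig is not None:
#         fig.show()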
########### Returns a figure which describes the target feature for NLP text classification ############
def plotly_target(dataset, column):
    if column == "Select":
        return None
    else:
        fig = px.histogram(dataset, y=column)
        fig.update_layout(bargap=0.2)
        return fig
############################################################################################################
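# Usage sketch (hypothetical "label" column):
#
#     fig = plotly_target(df, "label")   # class-count histogram, or None for "Select"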
############ Plotting n-grams for the text feature in NLP text classification ###########################
def plot_ngram(dataset, input_col):
    if input_col == 'Select':
        return None
    else:
        train = dataset
        # .astype(str) guards against NaN values, which have no .lower()
        train[input_col] = train[input_col].astype(str).str.lower()
        npt = nlplot.NLPlot(train, target_col=input_col)
        stopwords = npt.get_stopword(top_n=30, min_freq=0)
        fig = npt.bar_ngram(
            title='bi-gram',
            xaxis_label='word_count',
            yaxis_label='word',
            ngram=2,
            top_n=50,
            width=700,
            height=1100,
            stopwords=stopwords,
        )
        return fig
#################################################################################################
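# A minimal end-to-end smoke test, runnable as a script. The CSV path and the
# "label" / "text" column names below are hypothetical examples; substitute
# your own dataset.
if __name__ == "__main__":
    data = pd.read_csv("data.csv")           # hypothetical path
    report = get_details(data)
    if report is not None:
        print(report['overview'])
        print(report['reproduction'])
    print(imbalnce_ratio(data, "label"))     # hypothetical target column
    wc_fig = word_cloud(data, "text")        # hypothetical text column
    if wc_fig is not None:
        wc_fig.show()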