"""Streamlit helpers for running simple scikit-learn ML tasks on a DataFrame."""

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px  # FIX: was missing — px is used for cluster/PCA plots
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


class MachineLearning:
    """Render Streamlit widgets that run classification, clustering, or PCA on a DataFrame."""

    def perform_ml_tasks(self, df):
        """Show a task selector and dispatch to the chosen ML routine.

        Args:
            df: the pandas DataFrame to analyse.
        """
        task_type = st.selectbox("Select ML task", ["Classification", "Clustering", "Dimensionality Reduction"])
        if task_type == "Classification":
            self.perform_classification(df)
        elif task_type == "Clustering":
            self.perform_clustering(df)
        elif task_type == "Dimensionality Reduction":
            self.perform_dimensionality_reduction(df)

    def perform_classification(self, df):
        """Train a user-selected classifier on user-selected columns and display metrics.

        Does nothing until at least one feature column is selected.
        """
        target_column = st.selectbox("Select target column", df.columns)
        feature_columns = st.multiselect("Select feature columns", df.columns.drop(target_column))
        if len(feature_columns) > 0:
            X = df[feature_columns]
            y = df[target_column]
            # NOTE(review): assumes the selected feature columns are numeric —
            # StandardScaler raises on non-numeric data; confirm upstream filtering.
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            # Scale the test set with training-set statistics to avoid leakage.
            X_test_scaled = scaler.transform(X_test)
            model_type = st.selectbox("Select model type", ["Logistic Regression", "Decision Tree", "Random Forest", "SVM"])
            if model_type == "Logistic Regression":
                model = LogisticRegression()
            elif model_type == "Decision Tree":
                model = DecisionTreeClassifier()
            elif model_type == "Random Forest":
                model = RandomForestClassifier()
            elif model_type == "SVM":
                model = SVC()
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
            st.subheader("Classification Results")
            st.write(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
            st.write("Classification Report:")
            st.code(classification_report(y_test, y_pred))

    def perform_clustering(self, df):
        """Run KMeans on user-selected columns, plot the clusters, and show the centers.

        Does nothing until at least one feature column is selected.
        """
        feature_columns = st.multiselect("Select feature columns for clustering", df.columns)
        if len(feature_columns) > 0:
            X = df[feature_columns]
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)
            n_clusters = st.slider("Select number of clusters", min_value=2, max_value=10, value=3)
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            cluster_labels = kmeans.fit_predict(X_scaled)
            # NOTE(review): this mutates the caller's DataFrame in place by
            # adding a 'Cluster' column — confirm callers expect that.
            df['Cluster'] = cluster_labels
            st.subheader("Clustering Results")
            # A 2-D scatter only makes sense with at least two features.
            if len(feature_columns) >= 2:
                fig = px.scatter(df, x=feature_columns[0], y=feature_columns[1], color='Cluster')
                st.plotly_chart(fig)
            st.write("Cluster Centers:")
            # Map centers back from standardized space to original feature units.
            cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
            st.write(pd.DataFrame(cluster_centers, columns=feature_columns))

    def perform_dimensionality_reduction(self, df):
        """Run PCA on user-selected columns and show variance ratios and components.

        Requires at least two feature columns: with one column the component
        slider's max (1) would be below its min (2) and Streamlit would error.
        """
        feature_columns = st.multiselect("Select feature columns for dimensionality reduction", df.columns)
        # FIX: was `> 0`; a single selected column crashed the slider below
        # because min_value=2 exceeded max_value=min(1, 10).
        if len(feature_columns) >= 2:
            X = df[feature_columns]
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)
            n_components = st.slider("Select number of components", min_value=2, max_value=min(len(feature_columns), 10), value=2)
            pca = PCA(n_components=n_components)
            X_pca = pca.fit_transform(X_scaled)
            st.subheader("PCA Results")
            explained_variance_ratio = pca.explained_variance_ratio_
            st.write(f"Explained Variance Ratio: {explained_variance_ratio}")
            if n_components >= 2:
                fig = px.scatter(x=X_pca[:, 0], y=X_pca[:, 1], title="PCA Visualization")
                st.plotly_chart(fig)
            st.write("PCA Components:")
            st.write(pd.DataFrame(pca.components_, columns=feature_columns))