#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 31 13:45:56 2024

@author: Group leaders group
"""
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob


# Page 1: Content from Ml.py
def page_ml():

    def load_data(uploaded_file):
        data = pd.read_csv(uploaded_file)
        return data

    def preprocess_data(data):
        # Drop the leftover index column if present, then remove rows with
        # zero or implausible x/y/z dimensions.
        data = data.drop(columns=["Unnamed: 0"], errors="ignore")
        data = data.drop(data[data["x"] == 0].index)
        data = data.drop(data[data["y"] == 0].index)
        data = data.drop(data[data["z"] == 0].index)
        data = data[data["x"] < 30]
        data = data[data["y"] < 30]
        data = data[(data["z"] < 30) & (data["z"] > 2)]
        return data

    def encode_data(data):
        # Label-encode every object (categorical) column and keep the fitted
        # encoders so user input can be transformed identically later.
        label_data = data.copy()
        s = (data.dtypes == "object")
        object_cols = list(s[s].index)
        label_encoders = {}
        for col in object_cols:
            le = LabelEncoder()
            label_data[col] = le.fit_transform(label_data[col])
            label_encoders[col] = le
        return label_data, label_encoders

    @st.cache_resource
    def train_and_save_models(X_train, y_train):
        # Wrap each candidate regressor in a scaling pipeline, fit it, and
        # persist it to disk so it can be reloaded during evaluation.
        pipeline_lr = Pipeline([("scaler1", StandardScaler()), ("lr_regressor", LinearRegression())])
        pipeline_dt = Pipeline([("scaler2", StandardScaler()), ("dt_regressor", DecisionTreeRegressor())])
        pipeline_rf = Pipeline([("scaler3", StandardScaler()), ("rf_regressor", RandomForestRegressor())])
        pipeline_kn = Pipeline([("scaler4", StandardScaler()), ("kn_regressor", KNeighborsRegressor())])
        pipeline_xgb = Pipeline([("scaler5", StandardScaler()), ("xgb_regressor", XGBRegressor())])
        pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_kn, pipeline_xgb]
        pipe_dict = {0: "LinearRegression", 1: "DecisionTree", 2: "RandomForest",
                     3: "KNeighbors", 4: "XGBRegressor"}
        for i, pipe in enumerate(pipelines):
            pipe.fit(X_train, y_train)
            joblib.dump(pipe, f"{pipe_dict[i]}.pkl")  # Save each model
        return pipe_dict

    @st.cache_resource
    def load_best_model(pipe_dict, X_train, y_train):
        # Score every saved pipeline with 10-fold cross-validated RMSE and
        # return the one with the lowest error.
        cv_results_rms = []
        for i in range(len(pipe_dict)):
            model = joblib.load(f"{pipe_dict[i]}.pkl")
            cv_score = cross_val_score(model, X_train, y_train,
                                       scoring="neg_root_mean_squared_error", cv=10)
            mean_rmse = -cv_score.mean()  # Convert negative RMSE back to positive
            cv_results_rms.append(mean_rmse)
            st.write(f"{pipe_dict[i]}: {mean_rmse}")
        best_model_index = np.argmin(cv_results_rms)  # Use np.argmin to get the model with the smallest RMSE
        best_model_name = pipe_dict[best_model_index]
        best_model = joblib.load(f"{best_model_name}.pkl")
        return best_model, best_model_name

    def main():
        st.title("Diamond Price Prediction")
        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
        if uploaded_file is not None:
            data = load_data(uploaded_file)
            st.write("Data Preview:")
            st.write(data.head())
            data = preprocess_data(data)
            st.write("Preprocessed Data:")
            st.write(data.head())
            label_data, label_encoders = encode_data(data)
            st.write("Encoded Data:")
            st.write(label_data.head())
            X = label_data.drop(["price"], axis=1)
            y = label_data["price"]
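            # NOTE (assumption): the uploaded file is expected to be a
            # diamonds-style CSV with a numeric "price" target; the code above
            # only relies on the price, x, y and z columns, and any other
            # columns (e.g. cut, color, clarity) are handled generically.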
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)
            st.write("Training and saving models...")
            pipe_dict = train_and_save_models(X_train, y_train)
            st.write("Evaluating models...")
            best_model, best_model_name = load_best_model(pipe_dict, X_train, y_train)
            st.write(f"The best model is: {best_model_name}")

            st.write("Model Performance on Test Data:")
            pred = best_model.predict(X_test)
            st.write("R^2:", metrics.r2_score(y_test, pred))
            st.write("Adjusted R^2:",
                     1 - (1 - metrics.r2_score(y_test, pred)) * (len(y_test) - 1)
                     / (len(y_test) - X_test.shape[1] - 1))
            st.write("MAE:", metrics.mean_absolute_error(y_test, pred))
            st.write("MSE:", metrics.mean_squared_error(y_test, pred))
            st.write("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, pred)))

            st.write("Make Predictions:")
            input_data = {}
            for col in X.columns:
                if col in label_encoders:
                    # Offer the original category labels for encoded columns
                    categories = label_encoders[col].classes_
                    input_data[col] = st.selectbox(f"Select {col}", categories)
                else:
                    input_data[col] = st.number_input(f"Input {col}")
            input_df = pd.DataFrame([input_data])
            for col in label_encoders:
                input_df[col] = label_encoders[col].transform(input_df[col])
            prediction = best_model.predict(input_df)
            st.write(f"Predicted Price: {prediction[0]}")

    main()


# Page 2: Content from diamondNLP6.py
def page_diamond_nlp():
    # Title
    st.title('Diamond Comments Analysis')

    # Upload diamond comments dataset
    st.header("Upload Diamond Comments Dataset")
    uploaded_file_1 = st.file_uploader("Choose a CSV file", type="csv", key="comments_file")

    if uploaded_file_1 is not None:
        data = pd.read_csv(uploaded_file_1)
        st.write("Diamond Comments Data Loaded Successfully!")
        if st.checkbox('Show Diamond Comments Data'):
            st.write(data)

        # LDA Topic Modeling
        st.header("LDA Topic Modeling")
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        dtm = vectorizer.fit_transform(data['Comment'])
        n_topics = 5
        lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda_topics = lda_model.fit_transform(dtm)

        # Extracting the top ten words for each topic
        words = vectorizer.get_feature_names_out()
        topic_keywords = {}
        for topic_idx, topic in enumerate(lda_model.components_):
            topic_keywords[topic_idx] = [words[i] for i in topic.argsort()[:-11:-1]]

        # Plotting Topics
        fig, axes = plt.subplots(n_topics, 1, figsize=(10, 2 * n_topics))
        for topic_idx, topic in enumerate(lda_model.components_):
            top_features_ind = topic.argsort()[:-11:-1]
            top_features = [words[i] for i in top_features_ind]
            weights = topic[top_features_ind]
            ax = axes[topic_idx]
            ax.barh(top_features, weights, height=0.7)
            ax.set_title(f'Topic {topic_idx + 1}')
            ax.invert_yaxis()
            ax.tick_params(axis='both', which='major', labelsize=10)
            for bar in ax.patches:
                ax.text(bar.get_width() + 0.1, bar.get_y() + bar.get_height() / 2,
                        str(round(bar.get_width(), 2)), fontsize=10, ha='center', va='center')
        fig.tight_layout()
        st.pyplot(fig)

        # Sentiment Analysis
        st.header("Sentiment Analysis")
        data['Polarity'] = data['Comment'].apply(lambda x: TextBlob(x).sentiment.polarity)
        data['Subjectivity'] = data['Comment'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))
        sns.histplot(data['Polarity'], bins=30, ax=axes[0], kde=True, color='skyblue')
        axes[0].set_title('Polarity Distribution')
        sns.histplot(data['Subjectivity'], bins=30, ax=axes[1], kde=True, color='lightgreen')
        axes[1].set_title('Subjectivity Distribution')
        plt.tight_layout()
        st.pyplot(fig)

        # Common Words Visualization
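        # The block below sums the bag-of-words matrix column-wise to obtain
        # a corpus-wide count for every vocabulary term, then plots the 20
        # most frequent ones.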
        st.header("Common Words in Comments")
        vec = CountVectorizer(stop_words='english').fit(data['Comment'])
        bag_of_words = vec.transform(data['Comment'])
        sum_words = bag_of_words.sum(axis=0)
        words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
        words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
        words, freqs = zip(*words_freq[:20])
        fig = plt.figure(figsize=(10, 8))
        sns.barplot(x=list(freqs), y=list(words), palette="viridis")
        plt.title('Top 20 Most Common Words')
        plt.xlabel('Frequency')
        plt.ylabel('Word')
        st.pyplot(fig)

    # Upload diamond purchase purpose dataset
    st.header("Upload Diamond Purchase Purpose Dataset")
    uploaded_file_2 = st.file_uploader("Choose a CSV file", type="csv", key="purpose_file")

    if uploaded_file_2 is not None:
        dpp_data = pd.read_csv(uploaded_file_2)
        st.write("Diamond Purchase Purpose Data Loaded Successfully!")
        if st.checkbox('Show Diamond Purchase Purpose Data'):
            st.write(dpp_data)

        # Preprocess text data: lowercase, strip non-letters, collapse whitespace
        def preprocess_text(text):
            text = text.lower()
            text = re.sub(r'[^a-z\s]', '', text)
            text = re.sub(r'\s+', ' ', text).strip()
            return text

        # Apply preprocessing to comments
        dpp_data['Processed_Comment'] = dpp_data['Comment'].apply(preprocess_text)

        # Simple English stop words list
        simple_stopwords = ['the', 'a', 'and', 'is', 'in', 'it', 'this', 'that', 'of',
                            'for', 'on', 'with', 'as', 'to', 'at', 'by', 'an']

        # Extract keywords using CountVectorizer
        vectorizer = CountVectorizer(stop_words=simple_stopwords, max_features=100)
        X = vectorizer.fit_transform(dpp_data['Processed_Comment'])
        features = vectorizer.get_feature_names_out()

        # Calculate and display the most frequent keywords
        keyword_counts = X.sum(axis=0)
        keyword_counts_sorted = sorted(zip(features, keyword_counts.tolist()[0]),
                                       key=lambda x: x[1], reverse=True)

        # Collect top 20 frequent keywords
        top_keywords = keyword_counts_sorted[:20]

        # Define keyword categories
        categories = {
            'Gift/Anniversary': ['anniversary', 'gifted', 'happiness', 'joy'],
            'Industrial Use': ['industrial', 'use'],
            'Investment': ['rare', 'investment'],
        }

        # Categorize a comment by counting category-keyword hits; the category
        # with the most hits wins, defaulting to 'Other' when nothing matches.
        def categorize_comment(text):
            category_counts = {category: 0 for category in categories}
            for word in text.split():
                for category, keywords in categories.items():
                    if word in keywords:
                        category_counts[category] += 1
            max_category = 'Other'
            max_count = 0
            for category, count in category_counts.items():
                if count > max_count:
                    max_category = category
                    max_count = count
            return max_category

        # Categorize each comment
        dpp_data['Category'] = dpp_data['Processed_Comment'].apply(categorize_comment)

        # Display category distribution
        category_distribution = dpp_data['Category'].value_counts()

        # Plotting the distribution of top 20 frequent keywords
        keywords, counts = zip(*top_keywords)
        fig = plt.figure(figsize=(12, 8))
        plt.bar(keywords, counts, color='skyblue')
        plt.title('Top 20 Frequent Keywords')
        plt.xlabel('Keywords')
        plt.ylabel('Frequency')
        plt.xticks(rotation=90)
        st.pyplot(fig)

        # Plotting the distribution of comments by purchase category
        fig = plt.figure(figsize=(10, 6))
        category_distribution.plot(kind='bar', color=['skyblue', 'green', 'gold', 'gray'])
        plt.title('Distribution of Comments by Purchase Category')
        plt.xlabel('Category')
        plt.ylabel('Number of Comments')
        plt.xticks(rotation=45)
        st.pyplot(fig)


# Create the main app function
def main():
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", ["Diamond Price Prediction", "Diamond Comments Analysis"])
    if page == "Diamond Price Prediction":
        page_ml()
    elif page == "Diamond Comments Analysis":
        page_diamond_nlp()

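# The sidebar radio above routes both pages from a single script; Streamlit's
# built-in multipage layout (a pages/ directory) would be an alternative way
# to organize the same app.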
"Diamond Comments Analysis": page_diamond_nlp() if __name__ == "__main__": main()