#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 31 13:45:56 2024
@author: Group leaders group
"""
import re

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn import metrics
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from textblob import TextBlob
from xgboost import XGBRegressor
# Page 1: Content from Ml.py
def page_ml():
    def load_data(uploaded_file):
        data = pd.read_csv(uploaded_file)
        return data

    def preprocess_data(data):
        # Drop the exported index column, if present.
        data = data.drop(["Unnamed: 0"], axis=1, errors="ignore")
        # Remove physically impossible rows (zero dimensions) and
        # extreme outliers in the x/y/z measurements.
        data = data.drop(data[data["x"] == 0].index)
        data = data.drop(data[data["y"] == 0].index)
        data = data.drop(data[data["z"] == 0].index)
        data = data[data["x"] < 30]
        data = data[data["y"] < 30]
        data = data[(data["z"] < 30) & (data["z"] > 2)]
        return data

    def encode_data(data):
        # Label-encode every object-typed column, keeping the fitted encoders
        # so user input can be transformed the same way at prediction time.
        label_data = data.copy()
        s = (data.dtypes == "object")
        object_cols = list(s[s].index)
        label_encoders = {}
        for col in object_cols:
            le = LabelEncoder()
            label_data[col] = le.fit_transform(label_data[col])
            label_encoders[col] = le
        return label_data, label_encoders
    def train_and_save_models(X_train, y_train):
        # Each pipeline standardizes the features, then fits one regressor.
        pipeline_lr = Pipeline([("scaler", StandardScaler()), ("regressor", LinearRegression())])
        pipeline_dt = Pipeline([("scaler", StandardScaler()), ("regressor", DecisionTreeRegressor())])
        pipeline_rf = Pipeline([("scaler", StandardScaler()), ("regressor", RandomForestRegressor())])
        pipeline_kn = Pipeline([("scaler", StandardScaler()), ("regressor", KNeighborsRegressor())])
        pipeline_xgb = Pipeline([("scaler", StandardScaler()), ("regressor", XGBRegressor())])
        pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_kn, pipeline_xgb]
        pipe_dict = {0: "LinearRegression", 1: "DecisionTree", 2: "RandomForest", 3: "KNeighbors", 4: "XGBRegressor"}
        for i, pipe in enumerate(pipelines):
            pipe.fit(X_train, y_train)
            joblib.dump(pipe, f"{pipe_dict[i]}.pkl")  # Save each fitted pipeline
        return pipe_dict
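
    # A saved pipeline can also be reloaded outside the app; a minimal sketch
    # (new_X is a hypothetical DataFrame with the same columns as X_train):
    #
    #     model = joblib.load("RandomForest.pkl")
    #     predictions = model.predict(new_X)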
    def load_best_model(pipe_dict, X_train, y_train):
        # cross_val_score clones and refits each loaded pipeline per fold, so
        # the pickles act as model specifications during this comparison.
        cv_results_rms = []
        for i in range(len(pipe_dict)):
            model = joblib.load(f"{pipe_dict[i]}.pkl")
            cv_score = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=10)
            mean_rmse = -cv_score.mean()  # Convert negative RMSE back to positive
            cv_results_rms.append(mean_rmse)
            st.write(f"{pipe_dict[i]}: {mean_rmse}")
        best_model_index = np.argmin(cv_results_rms)  # Model with the smallest CV RMSE
        best_model_name = pipe_dict[best_model_index]
        best_model = joblib.load(f"{best_model_name}.pkl")
        return best_model, best_model_name
    def main():
        st.title("Diamond Price Prediction")
        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
        if uploaded_file is not None:
            data = load_data(uploaded_file)
            st.write("Data Preview:")
            st.write(data.head())
            data = preprocess_data(data)
            st.write("Preprocessed Data:")
            st.write(data.head())
            label_data, label_encoders = encode_data(data)
            st.write("Encoded Data:")
            st.write(label_data.head())
            X = label_data.drop(["price"], axis=1)
            y = label_data["price"]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)
            st.write("Training and saving models...")
            pipe_dict = train_and_save_models(X_train, y_train)
            st.write("Evaluating models...")
            best_model, best_model_name = load_best_model(pipe_dict, X_train, y_train)
            st.write(f"The best model is: {best_model_name}")
            st.write("Model Performance on Test Data:")
            pred = best_model.predict(X_test)
            st.write("R^2:", metrics.r2_score(y_test, pred))
            st.write("Adjusted R^2:",
                     1 - (1 - metrics.r2_score(y_test, pred)) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1))
            st.write("MAE:", metrics.mean_absolute_error(y_test, pred))
            st.write("MSE:", metrics.mean_squared_error(y_test, pred))
            st.write("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, pred)))
            st.write("Make Predictions:")
            # Build one input widget per feature: a selectbox over the original
            # category labels for encoded columns, a numeric input otherwise.
            input_data = {}
            for col in X.columns:
                if col in label_encoders:
                    categories = label_encoders[col].classes_
                    input_data[col] = st.selectbox(f"Select {col}", categories)
                else:
                    input_data[col] = st.number_input(f"Input {col}")
            input_df = pd.DataFrame([input_data])
            for col in label_encoders:
                input_df[col] = label_encoders[col].transform(input_df[col])
            prediction = best_model.predict(input_df)
            st.write(f"Predicted Price: {prediction[0]}")

    # page_ml() is invoked from the app-level main() below, so run this
    # page's workflow directly rather than guarding on __name__.
    main()
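
# Note: Streamlit reruns the whole script on every widget interaction, so the
# models above are retrained whenever a prediction input changes. A minimal
# sketch of one way around that, assuming a Streamlit version that provides
# st.cache_data (arguments are hashed by content, so repeat calls with the
# same training data return the cached result):
#
#     @st.cache_data
#     def cached_train(X_train, y_train):
#         return train_and_save_models(X_train, y_train)
#
# with cached_train(...) then called in place of train_and_save_models(...).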
# Page 2: Content from diamondNLP6.py
def page_diamond_nlp():
    # Title
    st.title('Diamond Comments Analysis')
    # Upload diamond comments dataset
    st.header("Upload Diamond Comments Dataset")
    uploaded_file_1 = st.file_uploader("Choose a CSV file", type="csv", key="comments_file")
    if uploaded_file_1 is not None:
        data = pd.read_csv(uploaded_file_1)
        st.write("Diamond Comments Data Loaded Successfully!")
        if st.checkbox('Show Diamond Comments Data'):
            st.write(data)
        # LDA Topic Modeling
        st.header("LDA Topic Modeling")
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        dtm = vectorizer.fit_transform(data['Comment'])
        n_topics = 5
        lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda_topics = lda_model.fit_transform(dtm)
        # Extracting the ten highest-weighted words for each topic
        words = vectorizer.get_feature_names_out()
        topic_keywords = {}
        for topic_idx, topic in enumerate(lda_model.components_):
            topic_keywords[topic_idx] = [words[i] for i in topic.argsort()[:-11:-1]]
        # Plotting Topics
        fig, axes = plt.subplots(n_topics, 1, figsize=(10, 2 * n_topics))
        for topic_idx, topic in enumerate(lda_model.components_):
            top_features_ind = topic.argsort()[:-11:-1]
            top_features = [words[i] for i in top_features_ind]
            weights = topic[top_features_ind]
            ax = axes[topic_idx]
            ax.barh(top_features, weights, height=0.7)
            ax.set_title(f'Topic {topic_idx + 1}')
            ax.invert_yaxis()
            ax.tick_params(axis='both', which='major', labelsize=10)
            for patch in ax.patches:
                ax.text(patch.get_width() + 0.1, patch.get_y() + patch.get_height() / 2,
                        str(round(patch.get_width(), 2)), fontsize=10, ha='center', va='center')
        fig.tight_layout()
        st.pyplot(fig)
        # Sentiment Analysis
        st.header("Sentiment Analysis")
        # TextBlob polarity ranges over [-1, 1]; subjectivity over [0, 1].
        data['Polarity'] = data['Comment'].apply(lambda x: TextBlob(x).sentiment.polarity)
        data['Subjectivity'] = data['Comment'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))
        sns.histplot(data['Polarity'], bins=30, ax=axes[0], kde=True, color='skyblue')
        axes[0].set_title('Polarity Distribution')
        sns.histplot(data['Subjectivity'], bins=30, ax=axes[1], kde=True, color='lightgreen')
        axes[1].set_title('Subjectivity Distribution')
        fig.tight_layout()
        st.pyplot(fig)
        # Common Words Visualization
        st.header("Common Words in Comments")
        vec = CountVectorizer(stop_words='english').fit(data['Comment'])
        bag_of_words = vec.transform(data['Comment'])
        sum_words = bag_of_words.sum(axis=0)
        words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
        words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
        top_words, freqs = zip(*words_freq[:20])
        fig = plt.figure(figsize=(10, 8))
        sns.barplot(x=list(freqs), y=list(top_words), palette="viridis")
        plt.title('Top 20 Most Common Words')
        plt.xlabel('Frequency')
        plt.ylabel('Word')
        st.pyplot(fig)  # Pass the figure explicitly rather than relying on global pyplot state
    # Upload diamond purchase purpose dataset
    st.header("Upload Diamond Purchase Purpose Dataset")
    uploaded_file_2 = st.file_uploader("Choose a CSV file", type="csv", key="purpose_file")
    if uploaded_file_2 is not None:
        dpp_data = pd.read_csv(uploaded_file_2)
        st.write("Diamond Purchase Purpose Data Loaded Successfully!")
        if st.checkbox('Show Diamond Purchase Purpose Data'):
            st.write(dpp_data)

        # Preprocess text data: lowercase, strip non-letter characters,
        # and collapse runs of whitespace
        def preprocess_text(text):
            text = text.lower()
            text = re.sub(r'[^a-z\s]', '', text)
            text = re.sub(r'\s+', ' ', text).strip()
            return text

        # Apply preprocessing to comments
        dpp_data['Processed_Comment'] = dpp_data['Comment'].apply(preprocess_text)
        # Simple English stop words list
        simple_stopwords = ['the', 'a', 'and', 'is', 'in', 'it', 'this', 'that', 'of',
                            'for', 'on', 'with', 'as', 'to', 'at', 'by', 'an']
        # Extract keywords using CountVectorizer
        vectorizer = CountVectorizer(stop_words=simple_stopwords, max_features=100)
        X = vectorizer.fit_transform(dpp_data['Processed_Comment'])
        features = vectorizer.get_feature_names_out()
        # Calculate the most frequent keywords
        keyword_counts = X.sum(axis=0)
        keyword_counts_sorted = sorted(zip(features, keyword_counts.tolist()[0]), key=lambda x: x[1], reverse=True)
        # Collect the top 20 most frequent keywords
        top_keywords = keyword_counts_sorted[:20]
        # Define keyword categories
        categories = {
            'Gift/Anniversary': ['anniversary', 'gifted', 'happiness', 'joy'],
            'Industrial Use': ['industrial', 'use'],
            'Investment': ['rare', 'investment']
        }

        # Assign each comment to the category whose keywords it mentions most
        # often; comments matching no keywords fall back to 'Other'
        def categorize_comment(text):
            category_counts = {category: 0 for category in categories}
            for word in text.split():
                for category, keywords in categories.items():
                    if word in keywords:
                        category_counts[category] += 1
            max_category = 'Other'
            max_count = 0
            for category, count in category_counts.items():
                if count > max_count:
                    max_category = category
                    max_count = count
            return max_category

        # Categorize each comment
        dpp_data['Category'] = dpp_data['Processed_Comment'].apply(categorize_comment)
        # Category distribution, plotted below
        category_distribution = dpp_data['Category'].value_counts()
        # Plotting the distribution of the top 20 frequent keywords
        keywords, counts = zip(*top_keywords)
        fig = plt.figure(figsize=(12, 8))
        plt.bar(keywords, counts, color='skyblue')
        plt.title('Top 20 Frequent Keywords')
        plt.xlabel('Keywords')
        plt.ylabel('Frequency')
        plt.xticks(rotation=90)
        st.pyplot(fig)

        # Plotting the distribution of comments by purchase category
        fig = plt.figure(figsize=(10, 6))
        category_distribution.plot(kind='bar', color=['skyblue', 'green', 'gold', 'gray'])
        plt.title('Distribution of Comments by Purchase Category')
        plt.xlabel('Category')
        plt.ylabel('Number of Comments')
        plt.xticks(rotation=45)
        st.pyplot(fig)
# Create the main app function
def main():
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", ["Diamond Price Prediction", "Diamond Comments Analysis"])
    if page == "Diamond Price Prediction":
        page_ml()
    elif page == "Diamond Comments Analysis":
        page_diamond_nlp()

if __name__ == "__main__":
    main()
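
# Usage note: launch the app from a terminal (the filename is an assumption;
# use whatever name this script is saved under):
#
#     streamlit run app.py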