#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 31 13:45:56 2024
@author: Group leaders group
"""
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob
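
# Two-page Streamlit app: a diamond price-prediction page (several regression
# pipelines compared by cross-validated RMSE) and a diamond comments-analysis
# page (LDA topic modeling, TextBlob sentiment, and keyword frequency plots).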
# Page 1: Content from Ml.py
def page_ml():
    def load_data(uploaded_file):
        data = pd.read_csv(uploaded_file)
        return data
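
    # preprocess_data drops the unnamed CSV index column and rows whose x/y/z
    # dimensions are zero or implausibly large (bad measurements in the data).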
    def preprocess_data(data):
        data = data.drop(["Unnamed: 0"], axis=1)
        data = data.drop(data[data["x"] == 0].index)
        data = data.drop(data[data["y"] == 0].index)
        data = data.drop(data[data["z"] == 0].index)
        data = data[(data["x"] < 30)]
        data = data[(data["y"] < 30)]
        data = data[(data["z"] < 30) & (data["z"] > 2)]
        return data
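
    # encode_data label-encodes every object-typed column and keeps each fitted
    # encoder so user input on the prediction form can be transformed the same way.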
    def encode_data(data):
        label_data = data.copy()
        s = (data.dtypes == "object")
        object_cols = list(s[s].index)
        label_encoders = {}
        for col in object_cols:
            le = LabelEncoder()
            label_data[col] = le.fit_transform(label_data[col])
            label_encoders[col] = le
        return label_data, label_encoders
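
    # Each candidate model is wrapped in a StandardScaler + regressor pipeline;
    # st.cache_resource keeps the fitted pipelines across Streamlit reruns, so
    # training happens once per session rather than on every widget interaction.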
    @st.cache_resource
    def train_and_save_models(X_train, y_train):
        pipeline_lr = Pipeline([("scaler", StandardScaler()), ("regressor", LinearRegression())])
        pipeline_dt = Pipeline([("scaler", StandardScaler()), ("regressor", DecisionTreeRegressor())])
        pipeline_rf = Pipeline([("scaler", StandardScaler()), ("regressor", RandomForestRegressor())])
        pipeline_kn = Pipeline([("scaler", StandardScaler()), ("regressor", KNeighborsRegressor())])
        pipeline_xgb = Pipeline([("scaler", StandardScaler()), ("regressor", XGBRegressor())])
        pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_kn, pipeline_xgb]
        pipe_dict = {0: "LinearRegression", 1: "DecisionTree", 2: "RandomForest", 3: "KNeighbors", 4: "XGBRegressor"}
        for i, pipe in enumerate(pipelines):
            pipe.fit(X_train, y_train)
            joblib.dump(pipe, f"{pipe_dict[i]}.pkl")  # Save each fitted pipeline to disk
        return pipe_dict
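
    # load_best_model reloads each saved pipeline and scores it with 10-fold CV;
    # sklearn's "neg_root_mean_squared_error" is negated RMSE, so the sign is
    # flipped and the pipeline with the smallest mean RMSE is selected.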
    @st.cache_resource
    def load_best_model(pipe_dict, X_train, y_train):
        cv_results_rms = []
        for i in range(len(pipe_dict)):
            model = joblib.load(f"{pipe_dict[i]}.pkl")
            cv_score = cross_val_score(model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=10)
            mean_rmse = -cv_score.mean()  # Convert negative RMSE to positive
            cv_results_rms.append(mean_rmse)
            st.write(f"{pipe_dict[i]}: {mean_rmse}")
        best_model_index = np.argmin(cv_results_rms)  # Model with the smallest RMSE
        best_model_name = pipe_dict[best_model_index]
        best_model = joblib.load(f"{best_model_name}.pkl")
        return best_model, best_model_name

    def main():
        st.title("Diamond Price Prediction")
        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
        if uploaded_file is not None:
            data = load_data(uploaded_file)
            st.write("Data Preview:")
            st.write(data.head())
            data = preprocess_data(data)
            st.write("Preprocessed Data:")
            st.write(data.head())
            label_data, label_encoders = encode_data(data)
            st.write("Encoded Data:")
            st.write(label_data.head())
            X = label_data.drop(["price"], axis=1)
            y = label_data["price"]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)
            st.write("Training and saving models...")
            pipe_dict = train_and_save_models(X_train, y_train)
            st.write("Evaluating models...")
            best_model, best_model_name = load_best_model(pipe_dict, X_train, y_train)
            st.write(f"The best model is: {best_model_name}")
            st.write("Model Performance on Test Data:")
            pred = best_model.predict(X_test)
            st.write("R^2:", metrics.r2_score(y_test, pred))
            st.write("Adjusted R^2:",
                     1 - (1 - metrics.r2_score(y_test, pred)) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1))
            st.write("MAE:", metrics.mean_absolute_error(y_test, pred))
            st.write("MSE:", metrics.mean_squared_error(y_test, pred))
            st.write("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, pred)))
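
            # Build an input form from the training columns: a selectbox for
            # label-encoded categoricals, a numeric input for everything else;
            # the stored encoders convert the selections before prediction.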
            st.write("Make Predictions:")
            input_data = {}
            for col in X.columns:
                if col in label_encoders:
                    categories = label_encoders[col].classes_
                    input_data[col] = st.selectbox(f"Select {col}", categories)
                else:
                    input_data[col] = st.number_input(f"Input {col}")
            input_df = pd.DataFrame([input_data])
            for col in label_encoders:
                input_df[col] = label_encoders[col].transform(input_df[col])
            prediction = best_model.predict(input_df)
            st.write(f"Predicted Price: {prediction[0]}")

    # Render the page as soon as page_ml() is called from the navigation sidebar.
    main()

# Page 2: Content from diamondNLP6.py
def page_diamond_nlp():
    # Title
    st.title('Diamond Comments Analysis')
    # Upload diamond comments dataset
    st.header("Upload Diamond Comments Dataset")
    uploaded_file_1 = st.file_uploader("Choose a CSV file", type="csv", key="comments_file")
    if uploaded_file_1 is not None:
        data = pd.read_csv(uploaded_file_1)
        st.write("Diamond Comments Data Loaded Successfully!")
        if st.checkbox('Show Diamond Comments Data'):
            st.write(data)
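
        # Topic modeling on the 'Comment' column: CountVectorizer builds the
        # document-term matrix, then LDA factorizes it into five latent topics.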
        # LDA Topic Modeling
        st.header("LDA Topic Modeling")
        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
        dtm = vectorizer.fit_transform(data['Comment'])
        n_topics = 5
        lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda_topics = lda_model.fit_transform(dtm)
        # Extracting words for each topic
        words = vectorizer.get_feature_names_out()
        topic_keywords = {}
        for topic_idx, topic in enumerate(lda_model.components_):
            topic_keywords[topic_idx] = [words[i] for i in topic.argsort()[:-11:-1]]
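
        # topic.argsort()[:-11:-1] takes the indices of the ten largest topic
        # weights in descending order, i.e. the top-10 words per topic.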
        # Plotting Topics
        fig, axes = plt.subplots(n_topics, 1, figsize=(10, 2 * n_topics))
        for topic_idx, topic in enumerate(lda_model.components_):
            top_features_ind = topic.argsort()[:-11:-1]
            top_features = [words[i] for i in top_features_ind]
            weights = topic[top_features_ind]
            ax = axes[topic_idx]
            ax.barh(top_features, weights, height=0.7)
            ax.set_title(f'Topic {topic_idx + 1}')
            ax.invert_yaxis()
            ax.tick_params(axis='both', which='major', labelsize=10)
            for patch in ax.patches:
                ax.text(patch.get_width() + 0.1, patch.get_y() + patch.get_height() / 2,
                        str(round(patch.get_width(), 2)), fontsize=10, ha='center', va='center')
        fig.tight_layout()
        st.pyplot(fig)

        # Sentiment Analysis
        st.header("Sentiment Analysis")
        data['Polarity'] = data['Comment'].apply(lambda x: TextBlob(x).sentiment.polarity)
        data['Subjectivity'] = data['Comment'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
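        # TextBlob polarity ranges over [-1, 1] (negative to positive) and
        # subjectivity over [0, 1] (objective to subjective).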
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))
        sns.histplot(data['Polarity'], bins=30, ax=axes[0], kde=True, color='skyblue')
        axes[0].set_title('Polarity Distribution')
        sns.histplot(data['Subjectivity'], bins=30, ax=axes[1], kde=True, color='lightgreen')
        axes[1].set_title('Subjectivity Distribution')
        plt.tight_layout()
        st.pyplot(fig)

        # Common Words Visualization
        st.header("Common Words in Comments")
        vec = CountVectorizer(stop_words='english').fit(data['Comment'])
        bag_of_words = vec.transform(data['Comment'])
        sum_words = bag_of_words.sum(axis=0)
        words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
        words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
        words, freqs = zip(*words_freq[:20])
        fig = plt.figure(figsize=(10, 8))
        sns.barplot(x=list(freqs), y=list(words), palette="viridis")
        plt.title('Top 20 Most Common Words')
        plt.xlabel('Frequency')
        plt.ylabel('Word')
        st.pyplot(fig)
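
    # Second, independent upload: purchase-purpose comments are analyzed with a
    # lightweight keyword count and rule-based categorization rather than LDA.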
    # Upload diamond purchase purpose dataset
    st.header("Upload Diamond Purchase Purpose Dataset")
    uploaded_file_2 = st.file_uploader("Choose a CSV file", type="csv", key="purpose_file")
    if uploaded_file_2 is not None:
        dpp_data = pd.read_csv(uploaded_file_2)
        st.write("Diamond Purchase Purpose Data Loaded Successfully!")
        if st.checkbox('Show Diamond Purchase Purpose Data'):
            st.write(dpp_data)

        # Preprocess text data: lowercase, strip non-letters, collapse whitespace
        def preprocess_text(text):
            text = text.lower()
            text = re.sub(r'[^a-z\s]', '', text)
            text = re.sub(r'\s+', ' ', text).strip()
            return text

        # Apply preprocessing to comments
        dpp_data['Processed_Comment'] = dpp_data['Comment'].apply(preprocess_text)
        # Simple English stop words list
        simple_stopwords = ['the', 'a', 'and', 'is', 'in', 'it', 'this', 'that', 'of', 'for', 'on', 'with', 'as', 'to', 'at', 'by', 'an']
        # Extract keywords using CountVectorizer (top 100 features)
        vectorizer = CountVectorizer(stop_words=simple_stopwords, max_features=100)
        X = vectorizer.fit_transform(dpp_data['Processed_Comment'])
        features = vectorizer.get_feature_names_out()
        # Calculate and display the most frequent keywords
        keyword_counts = X.sum(axis=0)
        keyword_counts_sorted = sorted(zip(features, keyword_counts.tolist()[0]), key=lambda x: x[1], reverse=True)
        # Collect the top 20 frequent keywords
        top_keywords = keyword_counts_sorted[:20]
        # Define keyword categories
        categories = {
            'Gift/Anniversary': ['anniversary', 'gifted', 'happiness', 'joy'],
            'Industrial Use': ['industrial', 'use'],
            'Investment': ['rare', 'investment']
        }
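
        # Each comment gets the category whose keyword list it matches most often;
        # ties keep the first-listed category and zero matches yield 'Other'.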
        # Function to categorize comments
        def categorize_comment(text):
            category_counts = {category: 0 for category in categories}
            for word in text.split():
                for category, keywords in categories.items():
                    if word in keywords:
                        category_counts[category] += 1
            max_category = 'Other'
            max_count = 0
            for category, count in category_counts.items():
                if count > max_count:
                    max_category = category
                    max_count = count
            return max_category

        # Categorize each comment
        dpp_data['Category'] = dpp_data['Processed_Comment'].apply(categorize_comment)
        # Display category distribution
        category_distribution = dpp_data['Category'].value_counts()
        # Plotting the distribution of top 20 frequent keywords
        keywords, counts = zip(*top_keywords)
        fig = plt.figure(figsize=(12, 8))
        plt.bar(keywords, counts, color='skyblue')
        plt.title('Top 20 Frequent Keywords')
        plt.xlabel('Keywords')
        plt.ylabel('Frequency')
        plt.xticks(rotation=90)
        st.pyplot(fig)
        # Plotting the distribution of comments by purchase category
        fig = plt.figure(figsize=(10, 6))
        category_distribution.plot(kind='bar', color=['skyblue', 'green', 'gold', 'gray'])
        plt.title('Distribution of Comments by Purchase Category')
        plt.xlabel('Category')
        plt.ylabel('Number of Comments')
        plt.xticks(rotation=45)
        st.pyplot(fig)

# Create the main app function
def main():
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", ["Diamond Price Prediction", "Diamond Comments Analysis"])
    if page == "Diamond Price Prediction":
        page_ml()
    elif page == "Diamond Comments Analysis":
        page_diamond_nlp()


if __name__ == "__main__":
    main()
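
# Launch locally with:  streamlit run app.py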