# App/utils/divers_function.py
# Author: COULIBALY Bourahima
# Last update: commit 2c49a88 (~8 kB)
import streamlit as st
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Callable
from App.utils.standadisation import *
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import FrenchStemmer
from nltk.corpus import stopwords
@st.cache_data
def convert_df(df):
    """Serialize a DataFrame to UTF-8 encoded CSV bytes (result is cached by Streamlit)."""
    csv_text = df.to_csv()
    return csv_text.encode("utf-8")
@st.cache_data
def supprime_country(df):
    """Drop the first country column found among known aliases.

    The country column name varies by data source ("Country", "COUNTRY_KEY"
    or "COUNTRY"); at most one of them is removed, mirroring the original
    try/except cascade that stopped after the first successful drop.

    Args:
        df (pd.DataFrame): Input DataFrame, modified in place.

    Returns:
        pd.DataFrame: The same DataFrame, with at most one country column removed.
    """
    # Explicit membership check instead of three nested bare `except:` blocks,
    # which silently swallowed every error (not just missing-column KeyError).
    for col in ("Country", "COUNTRY_KEY", "COUNTRY"):
        if col in df.columns:
            df.drop([col], axis=1, inplace=True)
            break  # only the first alias present is dropped
    return df
def merge_and_update_classification(
    main_df, update_df, product_id_col, classification_col
):
    """
    Merge two DataFrames and update the classification based on the update_df.
    Only rows where the classification has changed are retained.

    Args:
        main_df (pd.DataFrame): The main DataFrame containing original data.
        update_df (pd.DataFrame): DataFrame containing updated classifications.
        product_id_col (str): Name of the column used as the product identifier.
        classification_col (str): Name of the classification column to be updated.

    Returns:
        pd.DataFrame: A DataFrame containing only the rows where classification was updated.
    """
    # Restrict the main DataFrame to products present in the update set.
    update_product_ids = update_df[product_id_col].unique()
    # .copy() — the original assigned a new column to a boolean-mask slice,
    # which raises SettingWithCopyWarning and may silently fail to write.
    filtered_main_df = main_df[
        main_df[product_id_col].isin(update_product_ids)
    ].copy()

    # Preserve the original classification under an unsuffixed name so it
    # survives the merge (the shared column gets _main/_update suffixes).
    original_classification_col = f"original_{classification_col}"
    filtered_main_df[original_classification_col] = filtered_main_df[classification_col]

    merged_df = pd.merge(
        filtered_main_df,
        update_df,
        on=[product_id_col],
        how="inner",
        suffixes=("_main", "_update"),
        indicator=True,
    )

    # Take the updated classification, falling back to the original when the
    # update is NaN.
    merged_df[classification_col] = merged_df[f"{classification_col}_update"].fillna(
        merged_df[original_classification_col]
    )

    # Keep only rows whose classification actually differs.
    # NOTE(review): a NaN update also compares unequal and is therefore
    # retained (with the original value restored) — confirm this is intended.
    updated_df = merged_df[
        merged_df[f"{classification_col}_main"]
        != merged_df[f"{classification_col}_update"]
    ]

    # The merge indicator is an internal detail; drop it from the result.
    return updated_df.drop(columns=["_merge"])
def data_cleaning_func(strings):
    """Lowercase, strip and normalize free text.

    Apostrophes, slashes and all remaining punctuation become spaces; a final
    whitelist pass keeps only latin letters, spaces, commas and a few French
    accented characters (digits are thus replaced by spaces).
    """
    text = strings.lower().strip()
    text = text.replace("'", " ").replace("/", " ")
    # Any remaining non-word / non-space character becomes a space.
    text = re.sub(r"[^\w\s]", " ", text)
    # Whitelist pass: letters (incl. common French accents), spaces, commas.
    return re.sub("[^A-Za-z ,éêèîôœàâ]+", " ", text)
def standardization_func(strings):
    """Replace each space-separated token by its standardized form.

    Tokens are looked up in the module-level ``dictionnaire`` mapping
    (brought in by the wildcard import from App.utils.standadisation);
    unknown tokens pass through unchanged.

    Args:
        strings (str): Space-separated text.

    Returns:
        str: Text with known tokens replaced.
    """
    # Plain membership test instead of `in dictionnaire.keys()` and the
    # manual range(len()) index loop; identical behavior for a dict.
    return " ".join(
        dictionnaire[token] if token in dictionnaire else token
        for token in strings.split(" ")
    )
def remove_stop_words_func(strings):
    """Remove English and French stop words (plus project-specific ones).

    Words are matched case-insensitively. A removed word is replaced by an
    empty token rather than dropped, so the number of space-separated slots
    in the output equals the input — this deliberately mirrors the behavior
    of the historical two-pass (English then French) implementation.

    Args:
        strings (str): Space-separated text.

    Returns:
        str: Text with stop words blanked out.
    """
    custom_stops = [str(item) for item in liste_stopword]
    # Filtering against the union of both NLTK languages plus the custom list
    # is equivalent to the original's two sequential per-language passes.
    all_stops = set(
        stopwords.words("english") + stopwords.words("french") + custom_stops
    )
    return " ".join(
        "" if word.lower() in all_stops else word for word in strings.split(" ")
    )
# Shared stemmer instances, used by english_stemmer / french_stemmer below.
en_stemmer = PorterStemmer()  # English: Porter algorithm
fr_stemmer = FrenchStemmer()  # French: Snowball ("french") algorithm
def stem_sentence(sentence, stemmer):
    """Stem every space-separated word of *sentence* with *stemmer*.

    Args:
        sentence (str): Space-separated words.
        stemmer: Any object exposing a ``stem(word) -> str`` method.

    Returns:
        str: The stemmed words joined back together with single spaces.
    """
    return " ".join(stemmer.stem(token) for token in sentence.split(" "))
def english_stemmer(strings):
    """Apply the module-level Porter stemmer to every space-separated token.

    Args:
        strings (str): Space-separated text.

    Returns:
        str: Space-joined stemmed tokens.
    """
    tokens = strings.split(" ")
    return " ".join(stem_sentence(token, en_stemmer) for token in tokens)
def french_stemmer(strings):
    """Stem comma-separated segments with the module-level French stemmer.

    NOTE(review): unlike ``english_stemmer`` this splits on commas but joins
    the stemmed segments with spaces, so commas disappear from the output —
    confirm this asymmetry is intended.

    Args:
        strings (str): Comma-separated text segments.

    Returns:
        str: Space-joined stemmed segments.
    """
    segments = strings.split(",")
    return " ".join(stem_sentence(segment, fr_stemmer) for segment in segments)
def cosine_similarity_func(expr1, expr2):
    """Cosine similarity between two texts using raw token counts.

    Args:
        expr1 (str): First text.
        expr2 (str): Second text.

    Returns:
        The scalar cosine similarity between the two count vectors.
        (Raises ValueError if neither text yields any token under
        CountVectorizer's default tokenization.)
    """
    counts = CountVectorizer().fit_transform([expr1, expr2])
    pairwise = cosine_similarity(counts[0], counts[1])
    return pairwise[0][0]
def add_text_similarity(
    df: pd.DataFrame,
    data_cleaning_func: Callable = data_cleaning_func,
    remove_stop_words_func: Callable = remove_stop_words_func,
    standardization_func: Callable = standardization_func,
    cosine_similarity_func: Callable = cosine_similarity_func,
) -> pd.DataFrame:
    """
    Add text similarity measures to the DataFrame based on item descriptions.

    Expects the columns ``ITEM_DESC_main`` and ``ITEM_DESC_update`` to exist.
    Adds ``ITEM_DESC_before_clean``, ``ITEM_DESC_after_clean`` and
    ``Cosine_Similarity``; the input DataFrame is modified in place and also
    returned.

    Args:
        df (pd.DataFrame): Input DataFrame containing item descriptions.
        data_cleaning_func (Callable): Function to clean the text data.
        remove_stop_words_func (Callable): Function to remove stop words.
        standardization_func (Callable): Function to standardize text.
        cosine_similarity_func (Callable): Function to calculate cosine similarity.

    Returns:
        pd.DataFrame: DataFrame with added text similarity measures.
    """
    clean_cols = ("ITEM_DESC_before_clean", "ITEM_DESC_after_clean")

    # Clean both description columns.
    df[clean_cols[0]] = df["ITEM_DESC_main"].apply(data_cleaning_func)
    df[clean_cols[1]] = df["ITEM_DESC_update"].apply(data_cleaning_func)

    # Build the NLTK stop-word set once (the original rebuilt a set per
    # language inside a loop); filtering against the union is equivalent to
    # the two sequential per-language passes.
    nltk_stops = set(stopwords.words("french")) | set(stopwords.words("english"))

    def _strip_nltk_stops(text):
        # str.split() with no separator also collapses repeated whitespace,
        # matching the original lambda's behavior.
        return " ".join(w for w in text.split() if w.lower() not in nltk_stops)

    # One pass per column: NLTK stop words, custom stop words, standardization
    # — same order of operations as the original three separate loops.
    for col in clean_cols:
        df[col] = (
            df[col]
            .apply(_strip_nltk_stops)
            .apply(remove_stop_words_func)
            .apply(standardization_func)
        )

    # Pairwise similarity between the cleaned "after" and "before" texts.
    df["Cosine_Similarity"] = df.apply(
        lambda row: cosine_similarity_func(
            row["ITEM_DESC_after_clean"], row["ITEM_DESC_before_clean"]
        ),
        axis=1,
    )
    return df
def display_data_with_download_button(
    df,
    title="Data without decision-making"
) -> None:
    """Render *df* in an editable table with a CSV download button.

    Args:
        df (pd.DataFrame): Data to display. NOTE(review): an "Evaluation"
            column is added to the caller's DataFrame in place — confirm
            this side effect is intended.
        title (str): Subheader text, CSV file name and widget key.
    """
    if df.empty:
        st.write("No result for the above criterion ")
        return

    st.subheader(title)
    df.loc[:, "Evaluation"] = True
    edited_df = st.data_editor(df)
    csv_data = convert_df(edited_df)

    def _download(key):
        # Single place for the button config; only the widget key varies.
        st.download_button(
            label="Download data as CSV",
            data=csv_data,
            file_name=f"{title}.csv",
            mime="text/csv",
            key=key,
        )

    # Streamlit widget keys must be unique per page; if `title` is already
    # used as a key, retry once with a suffixed key. A failure on the retry
    # propagates, as in the original. `except Exception` replaces the bare
    # `except:` so KeyboardInterrupt/SystemExit are not swallowed.
    try:
        _download(title)
    except Exception:
        _download(title + "1")