# App/utils/divers_function.py
# Author: COULIBALY Bourahima
# Last update: commit 2c49a88 (~8 kB)
import streamlit as st
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Callable
from App.utils.standadisation import *
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import FrenchStemmer
from nltk.corpus import stopwords
@st.cache_data
def convert_df(df):
    """Serialize a DataFrame to UTF-8 encoded CSV bytes (result is cached by Streamlit)."""
    csv_text = df.to_csv()
    return csv_text.encode("utf-8")
@st.cache_data
def supprime_country(df):
    """Drop the first country column found among known aliases.

    The country column name varies by data source ("Country", "COUNTRY_KEY"
    or "COUNTRY"); at most one of them is removed, mirroring the original
    try/except cascade that stopped after the first successful drop.

    Args:
        df (pd.DataFrame): Input DataFrame, modified in place.

    Returns:
        pd.DataFrame: The same DataFrame, with at most one country column removed.
    """
    # Explicit membership check instead of three nested bare `except:` blocks,
    # which silently swallowed every error (not just missing-column KeyError).
    for col in ("Country", "COUNTRY_KEY", "COUNTRY"):
        if col in df.columns:
            df.drop([col], axis=1, inplace=True)
            break  # only the first alias present is dropped
    return df
def merge_and_update_classification(
    main_df, update_df, product_id_col, classification_col
):
    """
    Merge two DataFrames and update the classification based on the update_df.
    Only rows where the classification has changed are retained.

    Args:
        main_df (pd.DataFrame): The main DataFrame containing original data.
        update_df (pd.DataFrame): DataFrame containing updated classifications.
        product_id_col (str): Name of the column used as the product identifier.
        classification_col (str): Name of the classification column to be updated.

    Returns:
        pd.DataFrame: A DataFrame containing only the rows where classification was updated.
    """
    # Restrict the main DataFrame to products present in the update set.
    update_product_ids = update_df[product_id_col].unique()
    # .copy() — the original assigned a new column to a boolean-mask slice,
    # which raises SettingWithCopyWarning and may silently fail to write.
    filtered_main_df = main_df[
        main_df[product_id_col].isin(update_product_ids)
    ].copy()

    # Preserve the original classification under an unsuffixed name so it
    # survives the merge (the shared column gets _main/_update suffixes).
    original_classification_col = f"original_{classification_col}"
    filtered_main_df[original_classification_col] = filtered_main_df[classification_col]

    merged_df = pd.merge(
        filtered_main_df,
        update_df,
        on=[product_id_col],
        how="inner",
        suffixes=("_main", "_update"),
        indicator=True,
    )

    # Take the updated classification, falling back to the original when the
    # update is NaN.
    merged_df[classification_col] = merged_df[f"{classification_col}_update"].fillna(
        merged_df[original_classification_col]
    )

    # Keep only rows whose classification actually differs.
    # NOTE(review): a NaN update also compares unequal and is therefore
    # retained (with the original value restored) — confirm this is intended.
    updated_df = merged_df[
        merged_df[f"{classification_col}_main"]
        != merged_df[f"{classification_col}_update"]
    ]

    # The merge indicator is an internal detail; drop it from the result.
    return updated_df.drop(columns=["_merge"])
def data_cleaning_func(strings):
    """Lowercase, strip and normalize free text.

    Apostrophes, slashes and all remaining punctuation become spaces; a final
    whitelist pass keeps only latin letters, spaces, commas and a few French
    accented characters (digits are thus replaced by spaces).
    """
    text = strings.lower().strip()
    text = text.replace("'", " ").replace("/", " ")
    # Any remaining non-word / non-space character becomes a space.
    text = re.sub(r"[^\w\s]", " ", text)
    # Whitelist pass: letters (incl. common French accents), spaces, commas.
    return re.sub("[^A-Za-z ,éêèîôœàâ]+", " ", text)
def standardization_func(strings):
    """Replace each space-separated token by its standardized form.

    Tokens are looked up in the module-level ``dictionnaire`` mapping
    (brought in by the wildcard import from App.utils.standadisation);
    unknown tokens pass through unchanged.

    Args:
        strings (str): Space-separated text.

    Returns:
        str: Text with known tokens replaced.
    """
    # Plain membership test instead of `in dictionnaire.keys()` and the
    # manual range(len()) index loop; identical behavior for a dict.
    return " ".join(
        dictionnaire[token] if token in dictionnaire else token
        for token in strings.split(" ")
    )
def remove_stop_words_func(strings):
    """Remove English and French stop words (plus project-specific ones).

    Words are matched case-insensitively. A removed word is replaced by an
    empty token rather than dropped, so the number of space-separated slots
    in the output equals the input — this deliberately mirrors the behavior
    of the historical two-pass (English then French) implementation.

    Args:
        strings (str): Space-separated text.

    Returns:
        str: Text with stop words blanked out.
    """
    custom_stops = [str(item) for item in liste_stopword]
    # Filtering against the union of both NLTK languages plus the custom list
    # is equivalent to the original's two sequential per-language passes.
    all_stops = set(
        stopwords.words("english") + stopwords.words("french") + custom_stops
    )
    return " ".join(
        "" if word.lower() in all_stops else word for word in strings.split(" ")
    )
# Shared stemmer instances, used by english_stemmer / french_stemmer below.
en_stemmer = PorterStemmer()  # English: Porter algorithm
fr_stemmer = FrenchStemmer()  # French: Snowball ("french") algorithm
def stem_sentence(sentence, stemmer):
    """Stem every space-separated word of *sentence* with *stemmer*.

    Args:
        sentence (str): Space-separated words.
        stemmer: Any object exposing a ``stem(word) -> str`` method.

    Returns:
        str: The stemmed words joined back together with single spaces.
    """
    return " ".join(stemmer.stem(token) for token in sentence.split(" "))
def english_stemmer(strings):
    """Apply the module-level Porter stemmer to every space-separated token.

    Args:
        strings (str): Space-separated text.

    Returns:
        str: Space-joined stemmed tokens.
    """
    tokens = strings.split(" ")
    return " ".join(stem_sentence(token, en_stemmer) for token in tokens)
def french_stemmer(strings):
    """Stem comma-separated segments with the module-level French stemmer.

    NOTE(review): unlike ``english_stemmer`` this splits on commas but joins
    the stemmed segments with spaces, so commas disappear from the output —
    confirm this asymmetry is intended.

    Args:
        strings (str): Comma-separated text segments.

    Returns:
        str: Space-joined stemmed segments.
    """
    segments = strings.split(",")
    return " ".join(stem_sentence(segment, fr_stemmer) for segment in segments)
def cosine_similarity_func(expr1, expr2):
    """Cosine similarity between two texts using raw token counts.

    Args:
        expr1 (str): First text.
        expr2 (str): Second text.

    Returns:
        The scalar cosine similarity between the two count vectors.
        (Raises ValueError if neither text yields any token under
        CountVectorizer's default tokenization.)
    """
    counts = CountVectorizer().fit_transform([expr1, expr2])
    pairwise = cosine_similarity(counts[0], counts[1])
    return pairwise[0][0]
def add_text_similarity(
    df: pd.DataFrame,
    data_cleaning_func: Callable = data_cleaning_func,
    remove_stop_words_func: Callable = remove_stop_words_func,
    standardization_func: Callable = standardization_func,
    cosine_similarity_func: Callable = cosine_similarity_func,
) -> pd.DataFrame:
    """
    Add text similarity measures to the DataFrame based on item descriptions.

    Expects the columns ``ITEM_DESC_main`` and ``ITEM_DESC_update`` to exist.
    Adds ``ITEM_DESC_before_clean``, ``ITEM_DESC_after_clean`` and
    ``Cosine_Similarity``; the input DataFrame is modified in place and also
    returned.

    Args:
        df (pd.DataFrame): Input DataFrame containing item descriptions.
        data_cleaning_func (Callable): Function to clean the text data.
        remove_stop_words_func (Callable): Function to remove stop words.
        standardization_func (Callable): Function to standardize text.
        cosine_similarity_func (Callable): Function to calculate cosine similarity.

    Returns:
        pd.DataFrame: DataFrame with added text similarity measures.
    """
    clean_cols = ("ITEM_DESC_before_clean", "ITEM_DESC_after_clean")

    # Clean both description columns.
    df[clean_cols[0]] = df["ITEM_DESC_main"].apply(data_cleaning_func)
    df[clean_cols[1]] = df["ITEM_DESC_update"].apply(data_cleaning_func)

    # Build the NLTK stop-word set once (the original rebuilt a set per
    # language inside a loop); filtering against the union is equivalent to
    # the two sequential per-language passes.
    nltk_stops = set(stopwords.words("french")) | set(stopwords.words("english"))

    def _strip_nltk_stops(text):
        # str.split() with no separator also collapses repeated whitespace,
        # matching the original lambda's behavior.
        return " ".join(w for w in text.split() if w.lower() not in nltk_stops)

    # One pass per column: NLTK stop words, custom stop words, standardization
    # — same order of operations as the original three separate loops.
    for col in clean_cols:
        df[col] = (
            df[col]
            .apply(_strip_nltk_stops)
            .apply(remove_stop_words_func)
            .apply(standardization_func)
        )

    # Pairwise similarity between the cleaned "after" and "before" texts.
    df["Cosine_Similarity"] = df.apply(
        lambda row: cosine_similarity_func(
            row["ITEM_DESC_after_clean"], row["ITEM_DESC_before_clean"]
        ),
        axis=1,
    )
    return df
def display_data_with_download_button(
    df,
    title="Data without decision-making"
) -> None:
    """Render *df* in an editable table with a CSV download button.

    Args:
        df (pd.DataFrame): Data to display. NOTE(review): an "Evaluation"
            column is added to the caller's DataFrame in place — confirm
            this side effect is intended.
        title (str): Subheader text, CSV file name and widget key.
    """
    if df.empty:
        st.write("No result for the above criterion ")
        return

    st.subheader(title)
    df.loc[:, "Evaluation"] = True
    edited_df = st.data_editor(df)
    csv_data = convert_df(edited_df)

    def _download(key):
        # Single place for the button config; only the widget key varies.
        st.download_button(
            label="Download data as CSV",
            data=csv_data,
            file_name=f"{title}.csv",
            mime="text/csv",
            key=key,
        )

    # Streamlit widget keys must be unique per page; if `title` is already
    # used as a key, retry once with a suffixed key. A failure on the retry
    # propagates, as in the original. `except Exception` replaces the bare
    # `except:` so KeyboardInterrupt/SystemExit are not swallowed.
    try:
        _download(title)
    except Exception:
        _download(title + "1")