Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import re | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from typing import Callable | |
| from App.utils.standadisation import * | |
| from nltk.corpus import stopwords | |
| from nltk.stem import PorterStemmer | |
| from nltk.stem.snowball import FrenchStemmer | |
| from nltk.corpus import stopwords | |
def convert_df(df):
    """Serialize *df* to CSV text and return it as UTF-8 encoded bytes."""
    csv_text = df.to_csv()
    return csv_text.encode("utf-8")
def supprime_country(df):
    """Drop the country column from *df* in place, whatever its spelling.

    The column may be named "Country", "COUNTRY_KEY" or "COUNTRY" depending
    on the data source; only the first matching name is removed (same as the
    original try/except cascade). Frames with no country column are returned
    unchanged.

    Args:
        df (pd.DataFrame): Frame that may contain a country column.

    Returns:
        pd.DataFrame: The same frame, mutated in place.
    """
    for candidate in ("Country", "COUNTRY_KEY", "COUNTRY"):
        if candidate in df.columns:
            df.drop(columns=[candidate], inplace=True)
            break  # the original cascade only ever removed the first match
    return df
def merge_and_update_classification(
    main_df, update_df, product_id_col, classification_col
):
    """
    Merge two DataFrames and update the classification based on the update_df.
    Only rows where the classification has changed are retained.

    Args:
        main_df (pd.DataFrame): The main DataFrame containing original data.
        update_df (pd.DataFrame): DataFrame containing updated classifications.
        product_id_col (str): Name of the column used as the product identifier.
        classification_col (str): Name of the classification column to be updated.

    Returns:
        pd.DataFrame: A DataFrame containing only the rows where classification
        was updated.
    """
    # Restrict the main frame to products that actually appear in the update.
    update_product_ids = update_df[product_id_col].unique()
    # .copy() so the column assignment below cannot trigger a
    # SettingWithCopyWarning or silently write into a view of main_df.
    filtered_main_df = main_df[
        main_df[product_id_col].isin(update_product_ids)
    ].copy()

    # Preserve the original classification for the fillna() fallback below.
    original_classification_col = f"original_{classification_col}"
    filtered_main_df[original_classification_col] = filtered_main_df[
        classification_col
    ]

    merged_df = pd.merge(
        filtered_main_df,
        update_df,
        on=[product_id_col],
        how="inner",
        suffixes=("_main", "_update"),
        indicator=True,
    )

    # Updated classification; fall back to the original when the update is NaN.
    merged_df[classification_col] = merged_df[f"{classification_col}_update"].fillna(
        merged_df[original_classification_col]
    )

    # Keep only rows whose classification differs between main and update.
    # NOTE(review): a NaN update also compares as "different" and is kept,
    # even though its value was just restored to the original above — confirm
    # this is the intended behaviour.
    updated_df = merged_df[
        merged_df[f"{classification_col}_main"]
        != merged_df[f"{classification_col}_update"]
    ]

    # Remove the merge indicator column added by indicator=True.
    final_df = updated_df.drop(columns=["_merge"])
    return final_df
def data_cleaning_func(strings):
    """Normalize a raw description string.

    Lower-cases and strips the input, turns apostrophes, slashes and any
    punctuation into spaces, then keeps only ASCII letters, spaces, commas
    and a handful of French accented characters.
    """
    text = strings.lower().strip()
    for separator in ("'", "/"):
        text = text.replace(separator, " ")
    # Anything that is neither a word character nor whitespace -> space.
    text = re.sub(r"[^\w\s]", " ", text)
    # Final whitelist pass: letters, space, comma and common French accents.
    return re.sub("[^A-Za-z ,éêèîôœàâ]+", " ", text)
def standardization_func(strings):
    """Map each space-separated word of *strings* to its canonical form.

    Uses the shared ``dictionnaire`` lookup (imported from
    App.utils.standadisation); words without an entry are kept as-is.
    """
    words = strings.split(" ")
    canonical = [dictionnaire.get(word, word) for word in words]
    return " ".join(canonical)
def remove_stop_words_func(strings):
    """Remove English then French stop words from a space-separated string.

    Both NLTK stop-word lists are extended with the project-specific
    ``liste_stopword`` (from the wildcard App.utils import). A removed token
    leaves an empty slot behind, so consecutive spaces can appear in the
    output — this matches the original two-pass implementation and is kept
    for compatibility with downstream split(" ") callers.

    Args:
        strings (str): Space-separated description text.

    Returns:
        str: The text with stop words blanked out.
    """
    custom_stops = [str(item) for item in liste_stopword]
    en_stops = set(stopwords.words("english") + custom_stops)
    fr_stops = set(stopwords.words("french") + custom_stops)

    # Two sequential passes (English first, then French), exactly like the
    # original copy-pasted loops — but without the redundant inner split of
    # each single-word token.
    for stops in (en_stops, fr_stops):
        strings = " ".join(
            "" if word.lower() in stops else word for word in strings.split(" ")
        )
    return strings
# Shared stemmer instances, built once at import time and reused by the
# english_stemmer / french_stemmer helpers below.
en_stemmer = PorterStemmer()
fr_stemmer = FrenchStemmer()
def stem_sentence(sentence, stemmer):
    """Stem every space-separated token of *sentence* with *stemmer*.

    Args:
        sentence (str): Space-separated text.
        stemmer: Any object exposing a ``stem(word) -> str`` method.

    Returns:
        str: The stemmed tokens rejoined with single spaces.
    """
    stemmed = (stemmer.stem(token) for token in sentence.split(" "))
    return " ".join(stemmed)
def english_stemmer(strings):
    """Apply the shared Porter stemmer to every space-separated token of
    *strings* and return the rejoined result."""
    tokens = strings.split(" ")
    return " ".join(stem_sentence(token, en_stemmer) for token in tokens)
def french_stemmer(strings):
    """Apply the shared French Snowball stemmer to each comma-separated
    chunk of *strings*.

    NOTE(review): this splits on "," but rejoins with " ", so commas are
    dropped from the output, unlike english_stemmer which splits on spaces —
    confirm the asymmetry is intentional.
    """
    chunks = strings.split(",")
    return " ".join(stem_sentence(chunk, fr_stemmer) for chunk in chunks)
def cosine_similarity_func(expr1, expr2):
    """Cosine similarity between the bag-of-words vectors of two strings.

    Args:
        expr1 (str): First text.
        expr2 (str): Second text.

    Returns:
        float: Similarity in [0, 1]. Returns 0.0 when no vocabulary can be
        built (e.g. both strings are empty after tokenisation) — the
        original code let CountVectorizer raise ValueError in that case,
        which crashed the row-wise apply in add_text_similarity.
    """
    vectorizer = CountVectorizer()
    try:
        vectors = vectorizer.fit_transform([expr1, expr2])
    except ValueError:
        # fit_transform raises "empty vocabulary" for token-free input;
        # treat that as "no overlap" rather than crashing.
        return 0.0
    similarity = cosine_similarity(vectors[0], vectors[1])
    return similarity[0][0]
def add_text_similarity(
    df: pd.DataFrame,
    data_cleaning_func: Callable = data_cleaning_func,
    remove_stop_words_func: Callable = remove_stop_words_func,
    standardization_func: Callable = standardization_func,
    cosine_similarity_func: Callable = cosine_similarity_func,
) -> pd.DataFrame:
    """
    Add text similarity measures to the DataFrame based on item descriptions.

    Args:
        df (pd.DataFrame): Input DataFrame containing item descriptions.
        data_cleaning_func (Callable): Function to clean the text data.
        remove_stop_words_func (Callable): Function to remove stop words.
        standardization_func (Callable): Function to standardize text.
        cosine_similarity_func (Callable): Function to calculate cosine similarity.

    Returns:
        pd.DataFrame: DataFrame with added text similarity measures.
    """
    desc_cols = ["ITEM_DESC_before_clean", "ITEM_DESC_after_clean"]

    # Step 1: clean both description columns.
    df[desc_cols[0]] = df["ITEM_DESC_main"].apply(data_cleaning_func)
    df[desc_cols[1]] = df["ITEM_DESC_update"].apply(data_cleaning_func)

    # Step 2: NLTK stop-word removal, French first then English (same order
    # as the original implementation).
    for language in ("french", "english"):
        language_stops = set(stopwords.words(language))
        for col in desc_cols:
            df[col] = df[col].apply(
                lambda text: " ".join(
                    token
                    for token in text.split()
                    if token.lower() not in language_stops
                )
            )

    # Steps 3 and 4: project-specific stop-word removal, then word
    # standardization (independent per column, so chaining is equivalent).
    for col in desc_cols:
        df[col] = df[col].apply(remove_stop_words_func).apply(standardization_func)

    # Step 5: row-wise cosine similarity between the two cleaned columns.
    df["Cosine_Similarity"] = df.apply(
        lambda row: cosine_similarity_func(
            row["ITEM_DESC_after_clean"], row["ITEM_DESC_before_clean"]
        ),
        axis=1,
    )
    return df
def display_data_with_download_button(
    df,
    title="Data without decision-making"
) -> None:
    """Render *df* as an editable Streamlit table with a CSV download button.

    Args:
        df (pd.DataFrame): Data to display. An "Evaluation" column set to
            True is added on the caller's frame before display.
        title (str): Subheader text, also reused as download file name and
            widget key.

    Returns:
        None. Writes widgets directly to the Streamlit page.
    """
    # Guard clause: nothing to show.
    if df.empty:
        st.write("No result for the above criterion ")
        return

    st.subheader(title)
    df.loc[:, "Evaluation"] = True
    edited_df = st.data_editor(df)
    csv_data = convert_df(edited_df)
    try:
        st.download_button(
            label="Download data as CSV",
            data=csv_data,
            file_name=f"{title}.csv",
            mime="text/csv",
            key=title,
        )
    # Narrowed from a bare except: Streamlit raises DuplicateWidgetID when
    # the same key appears twice on one page; retry once with a suffixed key
    # instead of swallowing every exception (including KeyboardInterrupt).
    except Exception:
        st.download_button(
            label="Download data as CSV",
            data=csv_data,
            file_name=f"{title}.csv",
            mime="text/csv",
            key=title + "1",
        )