"""Classify the rows of an Excel sheet against a set of category descriptions.

Each row's title and text are embedded with a SentenceTransformer model and
compared (cosine similarity) with the embedding of every category description.
Categories scoring above a threshold are attached to the row and the result is
written to a new ``*_classified`` workbook.
"""

import os

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel

# A category counts as a match when its cosine similarity exceeds this value.
DEFAULT_SCORE_THRESHOLD = 0.45


def load_data(file_obj):
    """Read an Excel workbook into a DataFrame.

    Args:
        file_obj: Passed straight to ``pd.read_excel``. The original author
            assumed a file-like object uploaded via Gradio.

    Returns:
        The DataFrame produced by ``pd.read_excel``.
    """
    return pd.read_excel(file_obj)


def initialize_models():
    """Return a ``SentenceTransformer`` loaded with "all-mpnet-base-v2"."""
    return SentenceTransformer("all-mpnet-base-v2")


def generate_embeddings(df, model, Column):
    """Add an ``Embeddings`` column holding one embedding per row.

    A row is embedded only when both its ``Title`` cell and its ``Column``
    cell are strings; the text encoded is ``"<Title>\\n<Column value>"``.
    Every other row receives ``np.nan``.

    Args:
        df: DataFrame with a ``Title`` column and a column named ``Column``.
            Modified in place.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.
        Column: Name of the column whose text is appended to the title.

    Returns:
        The same ``df`` object, with the ``Embeddings`` column set.
    """
    embeddings_list = []
    for index, row in df.iterrows():
        title, body = row["Title"], row[Column]
        if isinstance(title, str) and isinstance(body, str):
            # Progress trace on stdout, kept because encoding is slow.
            print(index)
            embeddings_list.append(
                model.encode(f"{title}\n{body}", convert_to_tensor=True)
            )
        else:
            embeddings_list.append(np.nan)
    df["Embeddings"] = embeddings_list
    return df


def process_categories(categories, model):
    """Build the category table and embed each category description.

    Args:
        categories: Any input accepted by ``pd.DataFrame``; the resulting
            frame must have a ``description`` column.
        model: Object exposing ``encode(text, convert_to_tensor=True)``.

    Returns:
        A new DataFrame built from ``categories`` with an added
        ``Embeddings`` column (one encoded description per row).
    """
    df_cate = pd.DataFrame(categories)
    # Iterating the column directly avoids row-wise DataFrame.apply.
    df_cate["Embeddings"] = [
        model.encode(description, convert_to_tensor=True)
        for description in df_cate["description"]
    ]
    return df_cate


def match_categories(df, category_df, threshold=DEFAULT_SCORE_THRESHOLD):
    """Attach to every row the categories whose similarity exceeds ``threshold``.

    For each tensor in ``df["Embeddings"]`` the cosine similarity with every
    category embedding is computed. Matching categories contribute their
    ``description``, ``experts`` and ``topic`` values plus the score, in
    category order. Rows whose embedding is not a tensor get ``np.nan`` in
    ``Description``/``Expert``/``Topic`` and the string ``'pas interessant'``
    in ``Score``.

    Args:
        df: DataFrame with an ``Embeddings`` column. Modified in place.
        category_df: DataFrame with ``Embeddings``, ``description``,
            ``experts`` and ``topic`` columns.
        threshold: Scores strictly greater than this value count as matches.

    Returns:
        The same ``df`` object with ``Description``, ``Expert``, ``Topic``
        and ``Score`` columns set.
    """
    # The stacked category matrix does not depend on the row: build it once.
    category_embeddings = list(category_df["Embeddings"])
    category_matrix = (
        torch.stack(category_embeddings, dim=0) if category_embeddings else None
    )

    def pick(column, positions):
        # Positions come from the stacked tensor, so they are positional:
        # use iloc, which stays correct when the index is not 0..n-1.
        if not positions:
            return []
        return category_df[column].iloc[positions].tolist()

    categories_list, experts_list, topic_list, scores_list = [], [], [], []
    for ebd_content in df["Embeddings"]:
        if not isinstance(ebd_content, torch.Tensor):
            categories_list.append(np.nan)
            experts_list.append(np.nan)
            topic_list.append(np.nan)
            scores_list.append('pas interessant')
            continue

        if category_matrix is None:
            # No categories at all: nothing can match.
            positions, scores = [], []
        else:
            # cos_sim returns a 2-D score matrix; row 0 belongs to this row's
            # single embedding.
            cos_scores = util.cos_sim(ebd_content, category_matrix)[0]
            positions = torch.nonzero(
                cos_scores > threshold, as_tuple=True
            )[0].tolist()
            scores = [float(cos_scores[position]) for position in positions]

        categories_list.append(pick("description", positions))
        experts_list.append(pick("experts", positions))
        topic_list.append(pick("topic", positions))
        scores_list.append(scores)

    df["Description"] = categories_list
    df["Expert"] = experts_list
    df["Topic"] = topic_list
    df["Score"] = scores_list
    return df


def flatten_nested_lists(nested_list):
    """Flatten a list of potentially nested lists into a single list."""
    flattened_list = []
    for item in nested_list:
        if isinstance(item, list):
            flattened_list.extend(flatten_nested_lists(item))
        else:
            flattened_list.append(item)
    return flattened_list


def _join_cell(value):
    """Render a list cell as a comma-separated string; pass other values through."""
    if isinstance(value, list):
        # str() keeps string items unchanged and prevents a TypeError when a
        # cell holds numbers or other non-string values.
        return ", ".join(str(item) for item in flatten_nested_lists(value))
    return value


def _classified_path(source):
    """Return the output path: ``<root>_classified<ext>`` next to ``source``.

    ``source`` may be a str/bytes/PathLike path or an object exposing the
    path as a string ``name`` attribute.

    Raises:
        TypeError: If no path can be derived from ``source``.
    """
    if isinstance(source, (str, bytes, os.PathLike)):
        path = os.fsdecode(source)
    else:
        path = getattr(source, "name", None)
        if not isinstance(path, str):
            raise TypeError(
                "filename must be a path or an object with a string 'name' "
                f"attribute, got {type(source).__name__}"
            )
    # Only the final extension is touched; dots elsewhere in the path
    # ("./dir", "report.v2.xlsx") must stay as they are.
    root, ext = os.path.splitext(path)
    # to_excel needs an extension to write the file; default to .xlsx.
    return f"{root}_classified{ext or '.xlsx'}"


def save_data(df, filename):
    """Write the classified DataFrame to a ``*_classified`` Excel file.

    The list cells of ``Expert``, ``Description``, ``Topic`` and ``Score``
    are first converted to comma-separated strings (nested lists are
    flattened). ``df`` is modified in place.

    Args:
        df: DataFrame holding the four result columns.
        filename: Path of the input workbook, or an object whose ``name``
            attribute is that path.

    Returns:
        The path of the workbook that was written.
    """
    for column in ("Expert", "Description", "Topic", "Score"):
        df[column] = df[column].apply(_join_cell)
    new_filename = _classified_path(filename)
    df.to_excel(new_filename, index=False)
    return new_filename


def classification(column, file_path, categories, threshold=DEFAULT_SCORE_THRESHOLD):
    """Run the whole pipeline: load, embed, match and save.

    Args:
        column: Name of the text column embedded together with ``Title``.
        file_path: Input workbook, forwarded to ``load_data`` and used to
            derive the output path.
        categories: Category definitions, forwarded to ``process_categories``.
        threshold: Minimum (exclusive) cosine similarity for a match.

    Returns:
        A tuple ``(output_path, df)`` where ``df`` is the classified
        DataFrame as it was written (list cells already joined into strings).
    """
    df = load_data(file_path)
    model_ST = initialize_models()
    df = generate_embeddings(df, model_ST, column)
    category_df = process_categories(categories, model_ST)
    df = match_categories(df, category_df, threshold=threshold)
    # save_data mutates df, so it must run before df is returned.
    output_path = save_data(df, file_path)
    return output_path, df