import os

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

def load_data(file_obj):
    # pd.read_excel accepts either a filesystem path or a file-like object (e.g. a Gradio upload)
    return pd.read_excel(file_obj)


def initialize_models():
    model_ST = SentenceTransformer("all-mpnet-base-v2")
    return model_ST


def generate_embeddings(df, model, Column):
    # Encode each row of `Column` into a sentence embedding; when a Title column
    # exists and holds text, it is prepended to the content. Rows without text
    # get NaN so they can be skipped during matching.
    embeddings_list = []
    for index, row in df.iterrows():
        if isinstance(row[Column], str):
            print(index)  # progress indicator
            if 'Title' in df.columns and isinstance(row["Title"], str):
                content = row["Title"] + "\n" + row[Column]
            else:
                content = row[Column]
            embeddings_list.append(model.encode(content, convert_to_tensor=True))
        else:
            embeddings_list.append(np.nan)
    df['Embeddings'] = embeddings_list
    return df


def process_categories(categories, model):
    # Create a new DataFrame to store category information and embeddings
    df_cate = pd.DataFrame(categories)
    
    # Generate embeddings for each category description
    df_cate['Embeddings'] = df_cate.apply(lambda cat: model.encode(cat['description'], convert_to_tensor=True), axis=1)

    return df_cate



def match_categories(df, category_df, treshold=0.45):
    # For each row embedding, keep every category whose cosine similarity with the
    # row exceeds the threshold; rows without an embedding are marked as not relevant.
    categories_list, experts_list, topic_list, scores_list = [], [], [], []
    category_embeddings = torch.stack(list(category_df['Embeddings']), dim=0)
    for ebd_content in df['Embeddings']:
        if isinstance(ebd_content, torch.Tensor):
            cos_scores = util.cos_sim(ebd_content, category_embeddings)[0]
            high_score_indices = [i for i, score in enumerate(cos_scores) if score > treshold]

            # Append the corresponding categories, experts, and topics for each high-scoring index
            categories_list.append([category_df.loc[index, 'description'] for index in high_score_indices])
            experts_list.append([category_df.loc[index, 'experts'] for index in high_score_indices])
            topic_list.append([category_df.loc[index, 'topic'] for index in high_score_indices])
            scores_list.append([float(cos_scores[index]) for index in high_score_indices])
        else:
            categories_list.append(np.nan)
            experts_list.append(np.nan)
            topic_list.append(np.nan)
            scores_list.append('pas interessant')  # marker (French: "not relevant") for rows without text

    df["Description"] = categories_list
    df["Expert"] = experts_list
    df["Topic"] = topic_list
    df["Score"] = scores_list
    return df

def flatten_nested_lists(nested_list):
    """Flatten a list of potentially nested lists into a single list."""
    flattened_list = []
    for item in nested_list:
        if isinstance(item, list):
            flattened_list.extend(flatten_nested_lists(item))  # Recursively flatten the list
        else:
            flattened_list.append(item)
    return flattened_list

def save_data(df, filename):
    # Flatten list columns into comma-separated strings so they serialize cleanly to Excel
    df['Expert'] = df['Expert'].apply(lambda x: ', '.join(flatten_nested_lists(x)) if isinstance(x, list) else x)
    df['Description'] = df['Description'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    df['Topic'] = df['Topic'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    df['Score'] = df['Score'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)

    df = df.drop(columns=['Embeddings'])
    # Insert "_classified" before the extension; a plain str.replace(".", ...) would
    # also hit dots elsewhere in the path or file name
    root, ext = os.path.splitext(filename)
    new_filename = f"{root}_classified{ext}"
    df.to_excel(new_filename, index=False)
    return new_filename

def classification(column, file_path, categories, treshold):
    # Load data
    df = load_data(file_path)

    # Initialize the embedding model
    model_ST = initialize_models()

    # Generate embeddings for the selected text column
    df = generate_embeddings(df, model_ST, column)

    # Embed the category descriptions
    category_df = process_categories(categories, model_ST)

    # Match each row against the categories
    df = match_categories(df, category_df, treshold=treshold)

    # Save data and return both the output path and the classified DataFrame
    return save_data(df, file_path), df
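

# Minimal usage sketch, kept out of the import path behind the __main__ guard.
# The file name, column name, and category entries below are hypothetical examples;
# the only things taken from this module are the classification() signature and the
# category keys it reads ('description', 'experts', 'topic').
if __name__ == "__main__":
    example_categories = [
        {"description": "Machine learning and data science projects",
         "experts": ["Alice"], "topic": "AI"},
        {"description": "Cloud infrastructure and DevOps work",
         "experts": ["Bob"], "topic": "Infrastructure"},
    ]
    output_path, classified_df = classification(
        column="Summary",           # hypothetical text column in the input workbook
        file_path="projects.xlsx",  # hypothetical input file
        categories=example_categories,
        treshold=0.45,
    )
    print(f"Classified file written to {output_path}")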