File size: 9,733 Bytes
691ae9d
 
 
 
 
3049ae8
691ae9d
eaa7be6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71b98b7
e3477e6
71b98b7
 
 
 
 
 
 
 
 
 
0a298a0
 
71b98b7
 
e3477e6
71b98b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
691ae9d
 
 
 
7fb5b91
 
 
 
 
 
691ae9d
 
 
57bf1c3
691ae9d
57bf1c3
 
 
 
 
 
 
691ae9d
 
 
 
 
 
 
 
 
7fb5b91
 
 
 
 
 
 
 
691ae9d
 
57bf1c3
7fb5b91
 
691ae9d
7fb5b91
691ae9d
57bf1c3
7fb5b91
 
 
 
 
 
691ae9d
 
 
7fb5b91
 
 
 
691ae9d
7fb5b91
691ae9d
 
 
7fb5b91
 
 
 
 
 
 
 
 
691ae9d
 
7fb5b91
 
 
 
 
 
0e439ee
691ae9d
 
 
 
57bf1c3
691ae9d
 
 
 
7fb5b91
691ae9d
 
 
 
7fb5b91
691ae9d
 
 
57bf1c3
691ae9d
 
7fb5b91
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import gradio as gr


# Default categories used for matching.
# Each entry is a dict with three keys:
#   "topic"       - short title of the category
#   "description" - free-text description of the category
#   "experts"     - list of expert names attached to the category
categories = [
{
  "topic": "Confidentiality and Privacy Protection",
  "description": "This topic covers the protection of confidentiality, privacy, and integrity in security systems. It also includes authentication and authorization processes.",
  "experts": ["Mireille"]
},
{
  "topic": "Distributed Trust and End-User Trust Models",
  "description": "This topic focuses on distributed trust models and how end-users establish trust in secure systems.",
  "experts": ["Mireille", "Khawla"]
},
{
  "topic": "Secure Element and Key Provisioning",
  "description": "This topic involves the secure element in systems and the process of key provisioning.",
  "experts": ["Mireille"]
},
{
  "topic": "Residential Gateway Security",
  "description": "This topic covers the security aspects of Residential Gateways.",
  "experts": ["Mireille"]
},
{
  "topic": "Standalone Non-Public Network (SNPN) Inter-Connection and Cybersecurity",
  "description": "This topic focuses on the inter-connection of Standalone Non-Public Networks and related cyber-security topics.",
  "experts": ["Khawla"]
},
{
  "topic": "Distributed Ledger and Blockchain in SNPN",
  "description": "This topic covers the use of distributed ledger technology and blockchain in securing Standalone Non-Public Networks.",
  "experts": ["Khawla"]
},
{
  "topic": "Distributed Networks and Communication",
  "description": "This topic involves distributed networks such as mesh networks, ad-hoc networks, and multi-hop networks, and their cyber-security aspects.",
  "experts": ["Guillaume"]
},
{
  "topic": "Swarm of Drones and Unmanned Aerial Vehicles Network Infrastructure",
  "description": "This topic covers the network infrastructure deployed by Swarm of Drones and Unmanned Aerial Vehicles.",
  "experts": ["Guillaume"]
},
{
  "topic": "USIM and Over-the-Air Services",
  "description": "This topic involves USIM and related over-the-air services such as Steering of Roaming, roaming services, network selection, and UE configuration.",
  "experts": ["Vincent"]
},
{
  "topic": "Eco-Design and Societal Impact of Technology",
  "description": "This topic covers eco-design concepts, including energy saving, energy efficiency, carbon emissions, and the societal impact of technology.",
  "experts": ["Pierre"]
},
{
  "topic": "Service Requirements of New Services",
  "description": "This topic involves defining service requirements for new services, detecting low signals of new trends and technologies, and assessing their impact on USIM services or over-the-air services.",
  "experts": ["Ly-Thanh"]
},
{
  "topic": "Satellite and Non Terrestrial Networks",
  "description": "This topic covers satellite networks, Non Terrestrial Networks, Private Networks, IoT, Inter Satellite communication, and Radio Access Network.",
  "experts": ["Nicolas"]
},
{
  "topic": "Public Safety and Emergency Communication",
  "description": "This topic involves Public Safety Communication, Military Communication, Emergency Calls, Emergency Services, Disaster Communication Access, and other related areas.",
  "experts": ["Dorin"]
},
{
    "topic": "Identifying the Human User of a Subscription",
    "description": "This topic involves methods and processes for identifying the human user associated with a subscription.",
    "experts": ["Kumar"]  # NOTE(review): the original comment said the experts for this category were not specified, yet "Kumar" is listed; confirm
},
{
    "topic": "Authentication and Authorization of Users and Restrictions on Users",
    "description": "This topic covers authentication and authorization processes, as well as restrictions imposed on users.",
    "experts": ["Kumar"]  # NOTE(review): the original comment said the experts for this category were not specified, yet "Kumar" is listed; confirm
},
{
    "topic": "Exposure of User Identity Profile Information",
    "description": "This topic involves the exposure of user identity profile information and its security implications.",
    "experts": ["Kumar"]  # NOTE(review): the original comment said the experts for this category were not specified, yet "Kumar" is listed; confirm
},
{
    "topic": "Identifying non-3GPP Devices Connecting behind a UE or 5G-RG",
    "description": "This topic involves identifying non-3GPP devices connecting behind a UE (User Equipment) or 5G-RG (5G Residential Gateway).",
    "experts": ["Kumar"]  # NOTE(review): the original comment said the experts for this category were not specified, yet "Kumar" is listed; confirm
}
]

def add_categories(df,df_all):
    """Merge the rows of ``df`` whose topic is new into the contents of ``df_all``.

    Args:
        df: DataFrame of candidate categories; rows are read through their
            'topic', 'description' and 'experts' keys.
        df_all: DataFrame of already known categories with the same columns.

    Returns:
        A tuple ``(update, merged_df)`` where ``update`` is a ``gr.update``
        carrying the merged topic list as ``choices`` and ``merged_df`` is a
        new DataFrame built from the merged columns. ``df_all`` itself is not
        modified; the merge happens on a dict-of-lists copy.
    """
    candidates = df.to_dict("records")
    merged = df_all.to_dict("list")
    for record in candidates:
        # A topic that is already present is skipped entirely.
        if record['topic'] in merged['topic']:
            continue
        for key in ('topic', 'description', 'experts'):
            merged[key].append(record[key])
        print(f"AFTER ADDINGS Those are the categories_all : {merged}")

    merged_frame = pd.DataFrame.from_dict(merged)
    return gr.update(choices=merged['topic']), merged_frame

# Default category table built from the module-level `categories` list.
df_cate = pd.DataFrame(categories)
# Topic names of the default table, in table order.
df_cat_filter = df_cate["topic"].tolist()

def filter_by_topics(filters, categories):
    """Build a Gradio table holding only the categories whose topic is selected.

    Args:
        filters: container of topic names to keep (tested with ``in``).
        categories: DataFrame of categories with a 'topic' column.

    Returns:
        An interactive ``gr.DataFrame`` labelled 'categories' whose value is
        a DataFrame of the matching rows, in their original order.
    """
    selected = [row for row in categories.to_dict("records") if row['topic'] in filters]
    return gr.DataFrame(label='categories', value=pd.DataFrame(selected), interactive=True)

### End
    
def reset_cate(df_categories):
    """Toggle the categories table between the defaults and a single blank row.

    Args:
        df_categories: the categories DataFrame currently in use.

    Returns:
        A one-row DataFrame of empty strings when ``df_categories`` equals the
        module-level default table ``df_cate``; otherwise a copy of ``df_cate``.
    """
    if df_categories.equals(df_cate):
        # The blank row must use the same column names as the default table
        # ('experts', plural): the other helpers in this file read and write
        # the 'experts' key, so a differently named column would raise a
        # KeyError as soon as the blank table is filled in and used.
        return pd.DataFrame([['', '', '']], columns=['topic', 'description', 'experts'])
    return df_cate.copy()


def load_data(file_obj):
    """Read an Excel workbook into a DataFrame.

    Args:
        file_obj: whatever ``pd.read_excel`` accepts as its first argument
            (presumably the file uploaded through Gradio; verify against the
            caller).

    Returns:
        The DataFrame produced by ``pd.read_excel``.
    """
    frame = pd.read_excel(file_obj)
    return frame


def initialize_models():
    """Instantiate and return the "all-mpnet-base-v2" SentenceTransformer model."""
    return SentenceTransformer("all-mpnet-base-v2")


def generate_embeddings(df, model, Column):
    """Embed the text of ``Column`` for every row and store it in ``df['Embeddings']``.

    For each row whose ``Column`` value is a string, the text is encoded with
    ``model.encode(..., convert_to_tensor=True)``. When the frame also has a
    'Title' column holding a string for that row, the title is prepended to
    the text, separated by a newline. Rows whose ``Column`` value is not a
    string get ``np.nan`` instead of an embedding.

    Args:
        df: input DataFrame; it is modified in place (an 'Embeddings' column
            is added or overwritten).
        model: object exposing ``encode(text, convert_to_tensor=True)``.
        Column: name of the column holding the text to embed.

    Returns:
        The same ``df`` object, with the 'Embeddings' column filled in.
    """
    # Loop-invariant, so evaluated once. The title prefix is skipped when the
    # embedded column *is* 'Title', otherwise the same text would be embedded
    # twice ("title\ntitle").
    prepend_title = 'Title' in df.columns and Column != 'Title'

    embeddings_list = []
    for index, row in df.iterrows():
        text = row[Column]
        if not isinstance(text, str):
            # Missing / non-textual cell: keep the row aligned with a NaN.
            embeddings_list.append(np.nan)
            continue

        print(index)  # progress trace: index of the row being encoded
        if prepend_title and isinstance(row["Title"], str):
            content = row["Title"] + "\n" + text
        else:
            content = text
        embeddings_list.append(model.encode(content, convert_to_tensor=True))

    df['Embeddings'] = embeddings_list
    return df


def process_categories(categories, model):
    """Build a category table with one embedding per category description.

    Args:
        categories: anything ``pd.DataFrame`` can be built from, providing a
            'description' field for each category.
        model: object exposing ``encode(text, convert_to_tensor=True)``.

    Returns:
        A new DataFrame built from ``categories`` with an extra 'Embeddings'
        column holding ``model.encode`` of each row's description.
    """
    def _embed_description(row):
        return model.encode(row['description'], convert_to_tensor=True)

    category_frame = pd.DataFrame(categories)
    # Row-wise apply: one encode call per category.
    category_frame['Embeddings'] = category_frame.apply(_embed_description, axis=1)
    return category_frame



def match_categories(df, category_df, treshold=0.45):
    """Attach to every row of ``df`` the categories whose similarity exceeds ``treshold``.

    For each value of ``df['Embeddings']`` that is a ``torch.Tensor``, the
    cosine similarity against all ``category_df['Embeddings']`` is computed
    with ``util.cos_sim``; every category scoring strictly above ``treshold``
    contributes its description, experts, topic and score. Rows whose
    embedding is not a tensor get ``np.nan`` for description/expert/topic and
    the string 'pas interessant' as score.

    Args:
        df: DataFrame with an 'Embeddings' column; modified in place.
        category_df: DataFrame with 'description', 'experts', 'topic' and
            'Embeddings' columns.
        treshold: minimum (exclusive) cosine similarity for a match.

    Returns:
        The same ``df`` object with 'Description', 'Expert', 'Topic' and
        'Score' columns added or overwritten.
    """
    categories_list, experts_list, topic_list, scores_list = [], [], [], []

    # The stacked category matrix does not depend on the row, so it is built
    # once, lazily on the first tensor row, to avoid stacking an empty table
    # when no row needs it.
    category_matrix = None

    for ebd_content in df['Embeddings']:
        if not isinstance(ebd_content, torch.Tensor):
            categories_list.append(np.nan)
            experts_list.append(np.nan)
            topic_list.append(np.nan)
            scores_list.append('pas interessant')
            continue

        if category_matrix is None:
            category_matrix = torch.stack(list(category_df['Embeddings']), dim=0)

        cos_scores = util.cos_sim(ebd_content, category_matrix)[0]
        high_score_indices = [i for i, score in enumerate(cos_scores) if score > treshold]

        # The indices are positions in the stacked matrix, so rows are looked
        # up positionally (.iloc); label-based lookup would only be correct
        # for a default 0..n-1 index.
        categories_list.append([category_df['description'].iloc[i] for i in high_score_indices])
        experts_list.append([category_df['experts'].iloc[i] for i in high_score_indices])
        topic_list.append([category_df['topic'].iloc[i] for i in high_score_indices])
        scores_list.append([float(cos_scores[i]) for i in high_score_indices])

    df["Description"] = categories_list
    df["Expert"] = experts_list
    df["Topic"] = topic_list
    df["Score"] = scores_list
    return df

def flatten_nested_lists(nested_list):
    """Return a flat list with the items of ``nested_list``, expanding inner lists.

    Only ``list`` instances are expanded (recursively, at any depth); every
    other item, including tuples and strings, is kept as a single element.
    Item order is preserved.
    """
    flat = []
    for element in nested_list:
        # Wrap scalars in a one-item list so both cases extend the same way.
        flat += flatten_nested_lists(element) if isinstance(element, list) else [element]
    return flat

def save_data(df, filename):
    """Write the classified rows to an Excel file next to ``filename``.

    List-valued cells of the 'Expert', 'Description', 'Topic' and 'Score'
    columns are turned into comma-separated strings (the 'Expert' lists are
    flattened first); non-list cells are left untouched. These four columns
    are rewritten on the ``df`` object passed in. The 'Embeddings' column is
    dropped from the written data only.

    Args:
        df: DataFrame holding 'Expert', 'Description', 'Topic', 'Score' and
            'Embeddings' columns.
        filename: path of the source file; the output path is the same path
            with ``_classified`` inserted before the extension.

    Returns:
        The path of the written Excel file.
    """
    import os  # local import so the block does not rely on a file-level import

    # Apply flattening and then join for the 'Expert' column
    df['Expert'] = df['Expert'].apply(lambda x: ', '.join(flatten_nested_lists(x)) if isinstance(x, list) else x)
    df['Description'] = df['Description'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    df['Topic'] = df['Topic'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    df['Score'] = df['Score'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)

    df = df.drop(columns=['Embeddings'])

    # Only the final extension gets the suffix. A plain str.replace(".", ...)
    # would also rewrite dots in directory names or in multi-dot file names
    # and produce a path that does not exist.
    root, ext = os.path.splitext(filename)
    new_filename = f"{root}_classified{ext}"
    df.to_excel(new_filename, index=False)
    return new_filename

def classification(column, file_path, categories, treshold):
    """Run the whole pipeline: load, embed, match against categories, save.

    Args:
        column: name of the column of the input file holding the text.
        file_path: location of the Excel file to classify; also used to
            derive the output file name.
        categories: category definitions handed to ``process_categories``.
        treshold: similarity threshold forwarded to ``match_categories``.

    Returns:
        A tuple ``(output_path, df)``: the path returned by ``save_data`` and
        the DataFrame returned by ``match_categories``.
    """
    # Input rows first, then the sentence-embedding model.
    records = load_data(file_path)
    encoder = initialize_models()

    # Embed both sides with the same model.
    embedded = generate_embeddings(records, encoder, column)
    category_table = process_categories(categories, encoder)

    # Attach matching categories to every row, then write the result.
    matched = match_categories(embedded, category_table, treshold=treshold)
    output_path = save_data(matched, file_path)
    return output_path, matched