Spaces:
Sleeping
Sleeping
File size: 9,733 Bytes
691ae9d 3049ae8 691ae9d eaa7be6 71b98b7 e3477e6 71b98b7 0a298a0 71b98b7 e3477e6 71b98b7 691ae9d 7fb5b91 691ae9d 57bf1c3 691ae9d 57bf1c3 691ae9d 7fb5b91 691ae9d 57bf1c3 7fb5b91 691ae9d 7fb5b91 691ae9d 57bf1c3 7fb5b91 691ae9d 7fb5b91 691ae9d 7fb5b91 691ae9d 7fb5b91 691ae9d 7fb5b91 0e439ee 691ae9d 57bf1c3 691ae9d 7fb5b91 691ae9d 7fb5b91 691ae9d 57bf1c3 691ae9d 7fb5b91 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 |
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import gradio as gr
# Default catalogue of topics. Each entry carries a short topic title, a
# free-text description (the text that gets embedded for matching) and the
# list of experts attached to that topic.
categories = [
    {
        "topic": "Confidentiality and Privacy Protection",
        "description": "This topic covers the protection of confidentiality, privacy, and integrity in security systems. It also includes authentication and authorization processes.",
        "experts": ["Mireille"],
    },
    {
        "topic": "Distributed Trust and End-User Trust Models",
        "description": "This topic focuses on distributed trust models and how end-users establish trust in secure systems.",
        "experts": ["Mireille", "Khawla"],
    },
    {
        "topic": "Secure Element and Key Provisioning",
        "description": "This topic involves the secure element in systems and the process of key provisioning.",
        "experts": ["Mireille"],
    },
    {
        "topic": "Residential Gateway Security",
        "description": "This topic covers the security aspects of Residential Gateways.",
        "experts": ["Mireille"],
    },
    {
        "topic": "Standalone Non-Public Network (SNPN) Inter-Connection and Cybersecurity",
        "description": "This topic focuses on the inter-connection of Standalone Non-Public Networks and related cyber-security topics.",
        "experts": ["Khawla"],
    },
    {
        "topic": "Distributed Ledger and Blockchain in SNPN",
        "description": "This topic covers the use of distributed ledger technology and blockchain in securing Standalone Non-Public Networks.",
        "experts": ["Khawla"],
    },
    {
        "topic": "Distributed Networks and Communication",
        "description": "This topic involves distributed networks such as mesh networks, ad-hoc networks, and multi-hop networks, and their cyber-security aspects.",
        "experts": ["Guillaume"],
    },
    {
        "topic": "Swarm of Drones and Unmanned Aerial Vehicles Network Infrastructure",
        "description": "This topic covers the network infrastructure deployed by Swarm of Drones and Unmanned Aerial Vehicles.",
        "experts": ["Guillaume"],
    },
    {
        "topic": "USIM and Over-the-Air Services",
        "description": "This topic involves USIM and related over-the-air services such as Steering of Roaming, roaming services, network selection, and UE configuration.",
        "experts": ["Vincent"],
    },
    {
        "topic": "Eco-Design and Societal Impact of Technology",
        "description": "This topic covers eco-design concepts, including energy saving, energy efficiency, carbon emissions, and the societal impact of technology.",
        "experts": ["Pierre"],
    },
    {
        "topic": "Service Requirements of New Services",
        "description": "This topic involves defining service requirements for new services, detecting low signals of new trends and technologies, and assessing their impact on USIM services or over-the-air services.",
        "experts": ["Ly-Thanh"],
    },
    {
        "topic": "Satellite and Non Terrestrial Networks",
        "description": "This topic covers satellite networks, Non Terrestrial Networks, Private Networks, IoT, Inter Satellite communication, and Radio Access Network.",
        "experts": ["Nicolas"],
    },
    {
        "topic": "Public Safety and Emergency Communication",
        "description": "This topic involves Public Safety Communication, Military Communication, Emergency Calls, Emergency Services, Disaster Communication Access, and other related areas.",
        "experts": ["Dorin"],
    },
    # NOTE(review): for the four entries below the original author's comment
    # said the experts for the category were not specified, yet "Kumar" is
    # listed -- confirm that the assignment is intended.
    {
        "topic": "Identifying the Human User of a Subscription",
        "description": "This topic involves methods and processes for identifying the human user associated with a subscription.",
        "experts": ["Kumar"],
    },
    {
        "topic": "Authentication and Authorization of Users and Restrictions on Users",
        "description": "This topic covers authentication and authorization processes, as well as restrictions imposed on users.",
        "experts": ["Kumar"],
    },
    {
        "topic": "Exposure of User Identity Profile Information",
        "description": "This topic involves the exposure of user identity profile information and its security implications.",
        "experts": ["Kumar"],
    },
    {
        "topic": "Identifying non-3GPP Devices Connecting behind a UE or 5G-RG",
        "description": "This topic involves identifying non-3GPP devices connecting behind a UE (User Equipment) or 5G-RG (5G Residential Gateway).",
        "experts": ["Kumar"],
    },
]
def add_categories(df,df_all):
    """Merge the rows of ``df`` into ``df_all``, skipping topics already present.

    Both arguments are DataFrames exposing ``topic``, ``description`` and
    ``experts`` columns. A row of ``df`` is appended only when its topic is
    not yet in the merged topic list (which also de-duplicates within ``df``).

    Returns a tuple ``(gr.update(choices=<merged topics>), <merged DataFrame>)``.
    """
    merged = df_all.to_dict("list")
    known_topics = merged['topic']  # same list object as merged['topic'], grows as rows are added
    for row in df.to_dict("records"):
        if row['topic'] in known_topics:
            continue
        for field in ('topic', 'description', 'experts'):
            merged[field].append(row[field])
    # NOTE(review): the source's indentation was lost; this summary print is
    # placed after the loop, in line with its "AFTER ADDINGS" wording -- confirm.
    print(f"AFTER ADDINGS Those are the categories_all : {merged}")
    return gr.update(choices=known_topics), pd.DataFrame.from_dict(merged)
# Default categories as a DataFrame (one row per entry of `categories`).
df_cate = pd.DataFrame(categories)
# Plain list of the default topic titles, in catalogue order.
df_cat_filter = df_cate["topic"].tolist()
def filter_by_topics(filters, categories):
    """Build a Gradio DataFrame holding only the categories whose topic is selected.

    ``filters`` is the collection of selected topic titles and ``categories``
    a DataFrame with a ``topic`` column. Rows are kept in their original
    order; the result is an interactive ``gr.DataFrame`` labelled
    'categories'.
    """
    selected_rows = [
        record
        for record in categories.to_dict("records")
        if record['topic'] in filters
    ]
    filtered_frame = pd.DataFrame(selected_rows)
    return gr.DataFrame(label='categories', value=filtered_frame, interactive=True)
### End
def reset_cate(df_categories):
    """Toggle the category table between the defaults and a single blank row.

    When ``df_categories`` is equal to the module-level default table
    ``df_cate``, a one-row frame of empty strings is returned so the user can
    start from scratch; otherwise a fresh copy of the defaults is returned.

    The blank frame uses the same column names as the defaults
    (``topic``, ``description``, ``experts``). The previous spelling
    ``expert`` did not match the ``experts`` key read by the rest of the
    file (e.g. ``add_categories``), which raised ``KeyError`` once the blank
    table was fed back in.
    """
    if df_categories.equals(df_cate):
        return pd.DataFrame([['', '', '']], columns=['topic', 'description', 'experts'])
    # Copy so later edits of the returned table never touch the defaults.
    return df_cate.copy()
def load_data(file_obj):
    """Read an Excel workbook into a DataFrame.

    ``file_obj`` is handed to ``pd.read_excel`` unchanged; the original
    author assumed it is a file-like object uploaded via Gradio.
    """
    workbook_frame = pd.read_excel(file_obj)
    return workbook_frame
def initialize_models():
    """Instantiate and return the ``all-mpnet-base-v2`` SentenceTransformer."""
    return SentenceTransformer("all-mpnet-base-v2")
def generate_embeddings(df, model, Column):
    """Encode the text of every row and store it in an ``Embeddings`` column.

    Args:
        df: DataFrame to process; it is modified in place.
        model: object exposing ``encode(text, convert_to_tensor=True)``.
        Column: name of the column holding the text to embed.

    For each row whose ``Column`` value is a string, the text is encoded.
    When the frame has a ``Title`` column and the row's title is a string,
    the title is prepended on its own line. Rows whose ``Column`` value is
    not a string (e.g. NaN for an empty cell) get ``np.nan`` instead.

    Returns:
        The same ``df`` object, with the ``Embeddings`` column added.
    """
    has_title = 'Title' in df.columns  # loop-invariant, so checked once
    embeddings_list = []
    for index, row in df.iterrows():
        text = row[Column]
        # isinstance (rather than an exact type comparison) also accepts
        # str subclasses.
        if not isinstance(text, str):
            embeddings_list.append(np.nan)
            continue
        print(index)  # progress trace: index of the row being encoded
        title = row["Title"] if has_title else None
        content = title + "\n" + text if isinstance(title, str) else text
        embeddings_list.append(model.encode(content, convert_to_tensor=True))
    df['Embeddings'] = embeddings_list
    return df
def process_categories(categories, model):
    """Return a category table with an ``Embeddings`` column added.

    Args:
        categories: anything ``pd.DataFrame`` accepts (list of dicts or a
            DataFrame) that provides a ``description`` column.
        model: object exposing ``encode(text, convert_to_tensor=True)``.

    Returns:
        A new DataFrame holding the category data plus one embedding per
        description. The input is never modified.
    """
    # The explicit copy guarantees the caller's frame stays untouched when
    # `categories` is already a DataFrame.
    df_cate = pd.DataFrame(categories).copy()
    # Built as a plain list (as generate_embeddings does) rather than with a
    # row-wise DataFrame.apply: apply returns a DataFrame on a zero-row frame,
    # which made the column assignment fail, and it has to infer the result
    # type from the returned objects.
    df_cate['Embeddings'] = [
        model.encode(description, convert_to_tensor=True)
        for description in df_cate['description']
    ]
    return df_cate
def match_categories(df, category_df, treshold=0.45):
    """Attach to every row of ``df`` the categories whose similarity exceeds ``treshold``.

    Args:
        df: DataFrame with an ``Embeddings`` column; modified in place.
        category_df: DataFrame with ``topic``, ``description``, ``experts``
            and ``Embeddings`` columns.
        treshold: minimum cosine similarity (exclusive) for a category to be
            kept. The parameter name keeps its historical spelling because
            callers pass it by keyword.

    For a row whose embedding is a ``torch.Tensor``, the columns
    ``Description``, ``Expert``, ``Topic`` and ``Score`` receive lists with
    one item per matching category (possibly empty). For any other row the
    first three receive ``np.nan`` and ``Score`` receives the marker string
    'pas interessant'.

    Returns:
        The same ``df`` object with the four columns added.
    """
    # Read the category columns once into plain lists. The hit indices
    # computed below are positions in the score vector, so the lookups must
    # be positional too; label-based `.loc` is only equivalent when
    # category_df has a default 0..n-1 index.
    descriptions = category_df['description'].tolist()
    experts = category_df['experts'].tolist()
    topics = category_df['topic'].tolist()
    # The stacked category matrix is identical for every row. It is built
    # lazily so frames without any tensor row never trigger the stack.
    category_matrix = None

    categories_list, experts_list, topic_list, scores_list = [], [], [], []
    for ebd_content in df['Embeddings']:
        if not isinstance(ebd_content, torch.Tensor):
            categories_list.append(np.nan)
            experts_list.append(np.nan)
            topic_list.append(np.nan)
            scores_list.append('pas interessant')
            continue
        if category_matrix is None:
            category_matrix = torch.stack(list(category_df['Embeddings']), dim=0)
        cos_scores = util.cos_sim(ebd_content, category_matrix)[0]
        high_score_indices = [i for i, score in enumerate(cos_scores) if score > treshold]
        categories_list.append([descriptions[i] for i in high_score_indices])
        experts_list.append([experts[i] for i in high_score_indices])
        topic_list.append([topics[i] for i in high_score_indices])
        scores_list.append([float(cos_scores[i]) for i in high_score_indices])
    df["Description"] = categories_list
    df["Expert"] = experts_list
    df["Topic"] = topic_list
    df["Score"] = scores_list
    return df
def flatten_nested_lists(nested_list):
    """Return the non-list items of ``nested_list`` in depth-first, left-to-right order.

    Only ``list`` instances are descended into; any other value (tuples and
    strings included) is kept as a single item.
    """
    flat = []
    # Explicit stack of iterators: the top one is the list currently walked.
    pending = [iter(nested_list)]
    while pending:
        for element in pending[-1]:
            if isinstance(element, list):
                # Descend; the parent iterator resumes once this one is exhausted.
                pending.append(iter(element))
                break
            flat.append(element)
        else:
            pending.pop()
    return flat
def save_data(df, filename):
    """Write the classified rows to an Excel file placed next to ``filename``.

    Args:
        df: DataFrame with ``Expert``, ``Description``, ``Topic``, ``Score``
            and ``Embeddings`` columns. The first four are rewritten in
            place: list cells become comma-separated strings, other cells
            are left as they are.
        filename: path of the source file (``str`` or path-like).

    Returns:
        The path of the written file: ``filename`` with ``_classified``
        inserted just before its extension.
    """
    import os  # function-scope import: the file's import block has no `os`

    # Expert cells can be lists of lists (one expert list per matched
    # category), hence the flattening before joining.
    df['Expert'] = df['Expert'].apply(lambda x: ', '.join(flatten_nested_lists(x)) if isinstance(x, list) else x)
    df['Description'] = df['Description'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    df['Topic'] = df['Topic'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    df['Score'] = df['Score'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)

    # Only the final extension is split off. Replacing every "." in the path
    # also rewrote dots in directory names and multi-dot file names, which
    # produced a path that does not exist.
    root, extension = os.path.splitext(os.fspath(filename))
    new_filename = f"{root}_classified{extension}"

    # The embeddings are dropped from the exported copy only; `df` keeps them.
    df.drop(columns=['Embeddings']).to_excel(new_filename, index=False)
    return new_filename
def classification(column, file_path, categories, treshold):
    """Run the whole pipeline: load, embed, match against categories, export.

    Args:
        column: name of the text column to classify.
        file_path: path of the Excel file to read; also used to derive the
            name of the exported file.
        categories: category definitions passed on to ``process_categories``.
        treshold: similarity threshold forwarded to ``match_categories``.

    Returns:
        A tuple ``(path of the exported file, resulting DataFrame)``.
    """
    source_df = load_data(file_path)
    encoder = initialize_models()
    # Embed both sides with the same model so the similarities are comparable.
    embedded_df = generate_embeddings(source_df, encoder, column)
    embedded_categories = process_categories(categories, encoder)
    matched_df = match_categories(embedded_df, embedded_categories, treshold=treshold)
    # save_data runs before the tuple is built, as in the original return.
    output_path = save_data(matched_df, file_path)
    return output_path, matched_df
|