# NOTE: removed Hugging Face Space status banner artifact ("Spaces: Sleeping")
# that was not valid Python and not part of the script.
# This file is used to compute the embedding of the technologies, easily executable on google colab | |
#!pip install sentence-transformers | |
#!pip install nltk | |
import numpy as np | |
from sentence_transformers import SentenceTransformer | |
import pickle | |
import pandas as pd | |
import nltk | |
from nltk.stem import * | |
nltk.download("punkt_tab") | |
# Load the sentence-embedding model once at module import time.
# NOTE(review): this downloads the weights from the Hugging Face hub on
# first run — requires network access; confirm acceptable for deployment.
print("Loading SentenceTransformer model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded.")
def load_technologies():
    """Read the technology database spreadsheet into a pandas DataFrame.

    Returns:
        DataFrame loaded from 'technologies_database.xlsx' in the
        current working directory.
    """
    return pd.read_excel('technologies_database.xlsx')
def tech_to_dict(technologies):
    """Parse raw multi-line technology descriptions into structured dicts.

    Each accepted entry is expected to look like::

        <header line>
        title: ...
        purpose: ...
        key_components: ...
        advantages: ...
        limitations: ...
        <footer line>

    Args:
        technologies: iterable of raw multi-line strings.

    Returns:
        List of dicts with keys title/purpose/key_components/advantages/
        limitations/id, where "id" is the entry's index in the input.
    """
    def _value(line):
        # Everything after the first ": " separator; if the separator is
        # absent, find() returns -1 and this yields line[1:] (behavior
        # preserved from the original slicing).
        return line[line.find(": ") + 2:]

    tech_dict = []
    for index, tech in enumerate(technologies):
        # Original filter preserved: process only entries where "<title>"
        # is absent (find() == -1) or appears at index 0 or 1.
        if tech.find("<title>") <= 1:
            lines = tech.split("\n")
            lines.pop(0)   # drop header line
            lines.pop()    # drop footer line (was pop(len(tab)-1))
            if len(lines) < 5:
                # Robustness: skip malformed entries instead of raising
                # IndexError on the field accesses below.
                continue
            tech_dict.append({
                "title": _value(lines[0]),
                "purpose": _value(lines[1]),
                "key_components": _value(lines[2]),
                "advantages": _value(lines[3]),
                "limitations": _value(lines[4]),
                "id": index,
            })
    return tech_dict
def stem(data, data_type):
    """Stem the text fields of the input with the English Snowball stemmer.

    NOTE(review): SnowballStemmer.stem() is intended for single words;
    applied to a whole string it lowercases it and stems only the
    trailing token. That original behavior is preserved here — confirm
    it is intentional.

    Args:
        data: when data_type == "technologies", a list of dicts with
            title/purpose/key_components/advantages/limitations/id keys;
            otherwise a mapping whose keys are titles and whose values
            are descriptions.
        data_type: "technologies" selects the dict-list branch.

    Returns:
        List of dicts with stemmed text fields.
    """
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item["title"]),
                "purpose": stemmer.stem(t_item["purpose"]),
                "key_components": stemmer.stem(t_item["key_components"]),
                "advantages": stemmer.stem(t_item["advantages"]),
                "limitations": stemmer.stem(t_item["limitations"]),
                "id": t_item["id"],  # id carried through unchanged
            })
    else:
        # Fixed: removed leftover debug print of every item.
        for title in data:
            processed_data.append({
                "title": stemmer.stem(title),
                "description": stemmer.stem(data[title]),
            })
    return processed_data
def preprocess_tech_data(_df):
    """Filter, parse and stem the technology descriptions of a DataFrame.

    Args:
        _df: DataFrame expected to contain a "description" column, or None.

    Returns:
        A 3-tuple ``(processed_tech, keys, original_tech)``:
          processed_tech: stemmed dicts produced by stem();
          keys: key names of one processed entry ([] when empty);
          original_tech: the unstemmed filtered dicts, kept for display.

    Bug fix: the early-exit paths previously returned only two values
    (``[], []``), which broke the three-way unpacking at the call site
    with a ValueError whenever the input was empty or invalid.
    """
    if _df is None or "description" not in _df.columns:
        return [], [], []
    tech_dict_raw = tech_to_dict(_df["description"].to_list())
    # Keep only entries whose important fields are reasonably filled in.
    tech_dict_filtered = [
        t for t in tech_dict_raw
        if (len(t.get("title", "")) >= 5
            and len(t.get("advantages", "")) >= 5
            and len(t.get("key_components", "")) >= 5)
    ]
    if not tech_dict_filtered:
        return [], [], []
    processed_tech_wt = stem(tech_dict_filtered, "technologies")
    for t_item_wt in processed_tech_wt:
        kc = t_item_wt.get("key_components")
        if isinstance(kc, str):
            # Re-join the sentence-tokenized key components without a
            # separator (original behavior preserved — confirm the lost
            # inter-sentence spacing is intentional).
            t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
        else:
            t_item_wt["key_components"] = ""
    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
    return processed_tech_wt, _keys, original_tech_for_display
# --- Script body: build the embeddings and persist them to disk. ---
df = load_technologies()
# Three-way unpack: stemmed tech dicts, their key names, and the
# unstemmed originals kept for display.
global_tech,keys,original_tech = preprocess_tech_data(df)
# The "purpose" field is the text that gets embedded for similarity search.
global_tech_purposes = [t["purpose"] for t in global_tech]
# Encode all global_tech purposes into embeddings
print("Encoding global_tech purposes into embeddings... This might take a while for 1000 elements.")
global_tech_embeddings = model.encode(global_tech_purposes, show_progress_bar=True)
print("Global tech embeddings created.")
# Define the filename for the pickle file
output_filename = 'global_tech_embeddings.pkl'
# Save the embeddings and the global_tech data (optional, but good for context)
# Saving global_tech alongside embeddings ensures you have the original data if needed
data_to_save = {
    'global_tech': global_tech, # The original list of dictionaries
    'global_tech_embeddings': global_tech_embeddings # The numpy array of embeddings
}
print(f"Saving embeddings and global_tech data to {output_filename}...")
with open(output_filename, 'wb') as f:
    pickle.dump(data_to_save, f)
print(f"Data saved successfully to {output_filename}.")
# Print a ready-to-paste snippet showing how to reload the pickle later.
print(f"\nTo load this file later in your API, use: \n"
      f"with open('{output_filename}', 'rb') as f:\n"
      f"    loaded_data = pickle.load(f)\n"
      f"global_tech = loaded_data['global_tech']\n"
      f"global_tech_embeddings = loaded_data['global_tech_embeddings']\n")