import pickle

import nltk
import numpy as np
import pandas as pd
from nltk.stem import SnowballStemmer

nltk.download("punkt_tab")

FILE_PATH = "/app/src/ressources/technologies_database.xlsx"


def set_prompt(problem):
    """Build the constraint-extraction prompt for a given technical problem."""
    prompt = """Task : Find all the constraints in this technical problem, making sure each is premised on the problem only.
Take into account different technical domains to encompass the whole problem.
Output each constraint in a JSON object such as : ({"title of the constraint1":"description1","title of the constraintN":"descriptionN"})
Technical problem :
""" + problem
    return prompt


def load_technologies_excel():
    """Load the technologies database from the Excel file."""
    df = pd.read_excel(FILE_PATH)
    return df


def load_technologies():
    """Load the technologies list and their precomputed embeddings from a pickle."""
    EMBEDDINGS_FILE = "/app/src/ressources/global_tech_embeddings.pkl"
    try:
        with open(EMBEDDINGS_FILE, "rb") as f:
            loaded_data = pickle.load(f)
        global_tech = loaded_data["global_tech"]
        global_tech_embedding = loaded_data["global_tech_embeddings"]
        return global_tech, global_tech_embedding
    except Exception as e:
        print(f"Error: {e}")


def tech_to_dict(technologies):
    """Parse raw multi-line technology descriptions into structured dicts."""
    tech_dict = []
    for index, tech in enumerate(technologies):
        # Note: str.find("") always returns 0, so this condition is always
        # True; kept as-is from the original code.
        if not tech.find("") > 1:
            tab = tech.split("\n")
            tab.pop(0)  # drop the leading line
            tab.pop()   # drop the trailing line
            tech_dict.append({
                "title": tab[0][tab[0].find(": ") + 2:],
                "purpose": tab[1][tab[1].find(": ") + 2:],
                "key_components": tab[2][tab[2].find(": ") + 2:],
                "advantages": tab[3][tab[3].find(": ") + 2:],
                "limitations": tab[4][tab[4].find(": ") + 2:],
                "id": index,
            })
    return tech_dict


def save_dataframe(df, title):
    """Write a DataFrame (or DataFrame-convertible data) to an Excel file."""
    pd.DataFrame(df).to_excel(title)
    return title


def stem(data, data_type):
    """Stem a list of technology dicts or a {title: description} mapping.

    Note: SnowballStemmer.stem() expects a single token, so applying it to a
    whole sentence lowercases the text and only stems its tail; this mirrors
    the original behaviour.
    """
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item["title"]),
                "purpose": stemmer.stem(t_item["purpose"]),
                "key_components": stemmer.stem(t_item["key_components"]),
                "advantages": stemmer.stem(t_item["advantages"]),
                "limitations": stemmer.stem(t_item["limitations"]),
                "id": t_item["id"],
            })
    else:
        for t_item in data:
            print(t_item)  # debug output, kept from the original
            processed_data.append({
                "title": stemmer.stem(t_item),
                "description": stemmer.stem(data[t_item]),
            })
    return processed_data


def get_technologies_by_id(id_list, technologies):
    """Return the technology dicts whose 'id' appears in id_list."""
    result = []
    id_set = set(id_list)
    for tech in technologies:
        if tech.get("id") in id_set:
            result.append(tech)
    return result


def save_to_pickle(result_similarites):
    """Build a constraints-by-technologies similarity matrix and pickle it."""
    constraint_titles = sorted({item["constraint"]["title"] for item in result_similarites})
    max_id2 = max(item["id2"] for item in result_similarites)

    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
    col_labels = list(range(1, max_id2 + 1))

    num_rows = len(constraint_titles)
    num_cols = max_id2
    # NaN marks (constraint, technology) pairs with no similarity score.
    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)

    for item in result_similarites:
        row_idx = row_label_to_index[item["constraint"]["title"]]
        col_idx = item["id2"] - 1  # id2 is 1-based; matrix columns are 0-based
        similarity_value = item["similarity"].item()
        matrix[row_idx, col_idx] = similarity_value

    print(f"Successfully created matrix with shape: {matrix.shape}")
    print(f"Number of rows (unique constraints): {num_rows}")
    print(f"Number of columns (max id2): {num_cols}")
    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
    print(matrix[:5, :5])

    output_filename = "cosine_similarity_matrix_with_labels.pkl"
    data_to_save = {
        "matrix": matrix,
        "row_labels": constraint_titles,
        "col_labels": col_labels,
    }
    with open(output_filename, "wb") as f:
        pickle.dump(data_to_save, f)
    print(f"\nMatrix and labels saved to {output_filename}")
    return output_filename
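

if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not part of the original module).
    # The toy records below only assume the shapes the functions above
    # already expect; no real embeddings, Excel data, or file paths are
    # required beyond the pickle written by save_to_pickle().
    print(set_prompt("Design a compact cooling system for a drone battery."))

    # The non-"technologies" branch of stem() takes a {title: description} dict.
    sample_constraints = {
        "thermal_budget": "The device must dissipate less than 5 W.",
        "mass_limit": "Total mass must stay below 2 kg.",
    }
    print(stem(sample_constraints, "constraints"))

    # save_to_pickle() expects items shaped like:
    #   {"constraint": {"title": ...}, "id2": <1-based id>, "similarity": <numpy scalar>}
    toy_results = [
        {"constraint": {"title": "thermal_budget"}, "id2": 1,
         "similarity": np.float32(0.83)},
        {"constraint": {"title": "mass_limit"}, "id2": 2,
         "similarity": np.float32(0.41)},
    ]
    save_to_pickle(toy_results)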