import json
import pickle
import time

import chromadb
import numpy as np
import pandas as pd
from chromadb.config import Settings
# NOTE(review): this module path may only exist in older chromadb releases;
# confirm against the pinned chromadb version before upgrading.
from chromadb.db.clickhouse import NoDatapointsException
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer, util

# Category used when a unit or datatype is unknown.
_NO_CATEGORY = "NaN"

# Labels reported in the "matching_method" field of a result.
_METHOD_SAME_SEMANTIC_ID = "Semantic equivalent , same semantic Id"
_METHOD_NLP_WITHOUT_METADATA = "Semantically not equivalent, NLP without Metadata"
_METHOD_NLP_WITH_METADATA = "Semantically not equivalent, NLP with Metadata"

# Label reported in the "matching_algorithm" field for embedding-based matches.
_ALGORITHM_NLP = (
    "Semantic search, k-nearest-neighbor with squared L2 distance "
    "(euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
)

# Datatype name -> datatype category. Lookups are done on the lower-cased
# datatype, so every key must be lower-case.
_DATATYPE_CATEGORIES = {
    "boolean": "BOOLEAN",
    "string": "STRING",
    "string_translatable": "STRING",
    "translatable_string": "STRING",
    "non_translatable_string": "STRING",
    "date": "DATE",
    "data_time": "DATE",
    "uri": "URI",
    "int": "INT",
    "int_measure": "INT",
    "int_currency": "INT",
    "integer": "INT",
    "real": "REAL",
    "real_measure": "REAL",
    "real_currency": "REAL",
    "enum_code": "ENUM_CODE",
    "enum_int": "ENUM_CODE",
    "enum_real": "ENUM_CODE",
    "enum_rational": "ENUM_CODE",
    "enum_boolean": "ENUM_CODE",
    "enum_string": "ENUM_CODE",
    "enum_reference": "ENUM_CODE",
    "enum_instance": "ENUM_CODE",
    "set(b1,b2)": "SET",
    "constrained_set(b1,b2,cmn,cmx)": "SET",
    "set [0,?]": "SET",
    "set [1,?]": "SET",
    "set [1, ?]": "SET",
    "nan": "NaN",
    "media_type": "LARGE_OBJECT_TYPE",
}

# Unit symbol/name -> unit category. Lookups are done on the lower-cased
# unit, so every key must be lower-case.
_UNIT_CATEGORIES = {
    "nan": "NaN",
    "hertz": "FREQUENCY",
    "hz": "FREQUENCY",
    "pa": "PRESSURE",
    "pascal": "PRESSURE",
    "n/m²": "PRESSURE",
    "bar": "PRESSURE",
    "%": "SCALARS_PERC",
    "w": "POWER",
    "watt": "POWER",
    "kw": "POWER",
    "kg/m³": "CHEMISTRY",
    "m²/s": "CHEMISTRY",
    "pa*s": "CHEMISTRY",
    "v": "ELECTRICAL",
    "volt": "ELECTRICAL",
    "db": "ACOUSTICS",
    "db(a)": "ACOUSTICS",
    "k": "TEMPERATURE",
    "°c": "TEMPERATURE",
    "n": "MECHANICS",
    "newton": "MECHANICS",
    "kg/s": "FLOW",
    "kg/h": "FLOW",
    "m³/s": "FLOW",
    "m³/h": "FLOW",
    "l/s": "FLOW",
    "l/h": "FLOW",
    "µm": "LENGTH",
    "mm": "LENGTH",
    "cm": "LENGTH",
    "dm": "LENGTH",
    "m": "LENGTH",
    "meter": "LENGTH",
    "m/s": "SPEED",
    "km/h": "SPEED",
    "s^(-1)": "FREQUENCY",
    "1/s": "FREQUENCY",
    "s": "TIME",
    "h": "TIME",
    "min": "TIME",
    "d": "TIME",
    "hours": "TIME",
    "a": "ELECTRICAL",
    "m³": "VOLUME",
    "m²": "AREA",
    "rpm": "FLOW",
    "nm": "MECHANICS",
    "m/m": "MECHANICS",
    "m³/m²s": "MECHANICS",
    "w(m²*k)": "HEAT_TRANSFER",
    "kwh": "ELECTRICAL",
    "kg/(s*m²)": "FLOW",
    "kg": "MASS",
    "w/(m*k)": "HEAT_TRANSFER",
    "m²*k/w": "HEAT_TRANSFER",
    "j/s": "POWER",
}


def _categorize(raw_value, categories):
    """Map a raw unit/datatype value to its category, or ``"NaN"`` if unknown.

    The value is stringified first so that JSON ``null`` or a float NaN does
    not crash the lookup; it simply falls into the ``"NaN"`` category.
    """
    return categories.get(str(raw_value).strip().lower(), _NO_CATEGORY)


def _has_hit(result):
    """Return True if a query result contains at least one document."""
    try:
        return bool(result["documents"][0])
    except (KeyError, IndexError, TypeError):
        return False


def _query_filtered(collection, queries, where):
    """Run a filtered 1-nearest-neighbour query; return None when nothing matches.

    The lookup is deliberately best-effort: any error raised by the filtered
    query is treated as "no match" so the caller can fall back to the next
    strategy. A result that comes back without any document is treated the
    same way.
    """
    try:
        result = collection.query(
            query_embeddings=queries, n_results=1, where=where
        )
    except Exception:
        return None
    return result if _has_hit(result) else None


def _build_result(result, matching_method, matching_algorithm, distance=None):
    """Flatten the first hit of a query result into the returned dict.

    ``distance`` overrides the distance reported by the query when given.
    """
    matched_object = json.loads(result["documents"][0][0])
    hit_metadata = result["metadatas"][0][0]
    if distance is None:
        distance = result["distances"][0][0]
    return {
        "matching_method": matching_method,
        "matching_algorithm": matching_algorithm,
        "matching_distance": distance,
        "aas_id": hit_metadata["AASId"],
        "aas_id_short": hit_metadata["AASIdShort"],
        "submodel_id_short": hit_metadata["SubmodelName"],
        "submodel_id": hit_metadata["SubmodelId"],
        "matched_object": matched_object,
    }


def query_aas(query_json, collection, model, metalabel):
    """Find the best-matching element of one collection for a query.

    Matching strategy, in order:

    1. Homogeneous case: an element whose ``SESemanticId`` metadata equals
       the query's ``SemanticId``. It is reported with distance 0.
    2. Otherwise a nearest-neighbour search on the query embedding, run once
       restricted to the query's metalabel and once unrestricted; the hit
       with the smaller distance wins (the unrestricted hit wins ties).

    Args:
        query_json: JSON string with the keys ``Name``, ``Definition``,
            ``Unit``, ``Datatype`` and ``SemanticId``.
        collection: Object exposing ``query(query_embeddings=..., n_results=...,
            where=...)`` and returning ``documents``/``metadatas``/``distances``
            as lists of lists.
        model: Object exposing ``encode(text)``; the embeddings of definition
            and name are concatenated in that order.
        metalabel: Mapping of metalabel key -> ``(unit_category,
            datatype_category)``.

    Returns:
        Dict with ``matching_method``, ``matching_algorithm``,
        ``matching_distance``, ``aas_id``, ``aas_id_short``,
        ``submodel_id_short``, ``submodel_id`` and ``matched_object``.

    Raises:
        KeyError: If a required key is missing from the query or from the
            metadata of the hit.
        IndexError: If the unrestricted query returns no hit.
    """
    query = json.loads(query_json)
    name = query["Name"]
    definition = query["Definition"]
    semantic_id = query["SemanticId"]

    categories = (
        _categorize(query["Unit"], _UNIT_CATEGORIES),
        _categorize(query["Datatype"], _DATATYPE_CATEGORIES),
    )
    # None when no metalabel is registered for this category pair; the
    # metadata-restricted search is skipped in that case.
    metadata = next(
        (key for key, value in metalabel.items() if value == categories),
        None,
    )

    name_embedding = model.encode(name)
    definition_embedding = model.encode(definition)
    query_vector = np.concatenate(
        (definition_embedding, name_embedding), axis=0
    ).tolist()
    queries = [query_vector]
    print(type(queries))

    # The query is a semantic k-nearest-neighbour search. According to the
    # original author's note, Chroma uses hnswlib
    # (https://github.com/nmslib/hnswlib), which supports cosine, squared L2
    # or inner product, and Chroma is configured for L2, cf.
    # https://github.com/chroma-core/chroma/blob/4463d13f951a4d28ade1f7e777d07302ff09069b/chromadb/db/index/hnswlib.py

    # Homogeneous case: an element with the same semantic id is an exact match.
    same_id = _query_filtered(collection, queries, {"SESemanticId": semantic_id})
    if same_id is not None:
        return _build_result(same_id, _METHOD_SAME_SEMANTIC_ID, "None", distance=0)

    # No matching semantic id: fall back to NLP with and without metadata.
    with_metadata = None
    if metadata is not None:
        with_metadata = _query_filtered(
            collection, queries, {"Metalabel": metadata}
        )
    without_metadata = collection.query(query_embeddings=queries, n_results=1)

    if with_metadata is None:
        return _build_result(
            without_metadata, _METHOD_NLP_WITHOUT_METADATA, _ALGORITHM_NLP
        )

    # Compare the distances of the restricted and the unrestricted hit.
    distance_with_meta = with_metadata["distances"][0][0]
    distance_without_meta = without_metadata["distances"][0][0]
    print(distance_with_meta)
    print(distance_without_meta)
    if distance_without_meta <= distance_with_meta:
        return _build_result(
            without_metadata, _METHOD_NLP_WITHOUT_METADATA, _ALGORITHM_NLP
        )
    return _build_result(with_metadata, _METHOD_NLP_WITH_METADATA, _ALGORITHM_NLP)


def get_best_results(json_query, results):
    """Return the closest results, limited to the requested number.

    Args:
        json_query: JSON string containing the key ``NumberAASReturned``.
        results: Iterable of dicts that each have a ``matching_distance``.

    Returns:
        The first ``NumberAASReturned`` results in ascending order of
        ``matching_distance``.
    """
    number_aas = json.loads(json_query)["NumberAASReturned"]
    ranked = sorted(results, key=lambda aas: aas["matching_distance"])
    return ranked[:number_aas]


def ask_database(query, metalabel, model, collections, client_chroma):
    """Query every collection in turn and return the best matches overall.

    Args:
        query: JSON-serialisable query dict (see ``query_aas`` for the keys,
            plus ``NumberAASReturned``).
        metalabel: Mapping passed through to ``query_aas``.
        model: Embedding model passed through to ``query_aas``.
        collections: Iterable of objects with a ``name`` attribute.
        client_chroma: Object exposing ``get_collection(name)``.

    Returns:
        The list produced by ``get_best_results`` over one result per
        collection.
    """
    json_query = json.dumps(query, indent=4)
    results = []
    for listed_collection in collections:
        print(listed_collection.name)
        collection = client_chroma.get_collection(listed_collection.name)
        results.append(query_aas(json_query, collection, model, metalabel))
    return get_best_results(json_query, results)