# Source: docker-aas / app / predict_different_aas.py
# Uploaded by mboth ("Upload 5 files", commit c2e327f, 10.2 kB).
from sentence_transformers import SentenceTransformer, util
import json
import time
import pandas as pd
import numpy as np
import pickle
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from chromadb.db.clickhouse import NoDatapointsException
# Lower-cased datatype name -> coarse datatype category.
# Keys MUST be lower case: the lookup lower-cases the query value first, so
# any key containing an upper-case letter could never match.
_DATATYPE_MAPPING = {
    "boolean": "BOOLEAN",
    "string": "STRING",
    "string_translatable": "STRING",
    "translatable_string": "STRING",
    "non_translatable_string": "STRING",
    "date": "DATE",
    # NOTE(review): "data_time" looks like a typo for "date_time" - confirm
    # against the data that was used to build the metalabel.
    "data_time": "DATE",
    "uri": "URI",
    "int": "INT",
    "int_measure": "INT",
    "int_currency": "INT",
    "integer": "INT",
    "real": "REAL",
    "real_measure": "REAL",
    "real_currency": "REAL",
    "enum_code": "ENUM_CODE",
    "enum_int": "ENUM_CODE",
    "enum_real": "ENUM_CODE",
    "enum_rational": "ENUM_CODE",
    "enum_boolean": "ENUM_CODE",
    "enum_string": "ENUM_CODE",
    "enum_reference": "ENUM_CODE",
    "enum_instance": "ENUM_CODE",
    "set(b1,b2)": "SET",
    "constrained_set(b1,b2,cmn,cmx)": "SET",
    "set [0,?]": "SET",
    "set [1,?]": "SET",
    "set [1, ?]": "SET",
    "nan": "NaN",
    "media_type": "LARGE_OBJECT_TYPE",
}

# Lower-cased unit symbol/name -> coarse unit category (same lower-case rule).
_UNIT_MAPPING = {
    "nan": "NaN",
    "hertz": "FREQUENCY",
    "hz": "FREQUENCY",
    "pa": "PRESSURE",
    "pascal": "PRESSURE",
    "n/m²": "PRESSURE",
    "bar": "PRESSURE",
    "%": "SCALARS_PERC",
    "w": "POWER",
    "watt": "POWER",
    "kw": "POWER",
    "kg/m³": "CHEMISTRY",
    "m²/s": "CHEMISTRY",
    "pa*s": "CHEMISTRY",
    "v": "ELECTRICAL",
    "volt": "ELECTRICAL",
    "db": "ACOUSTICS",
    "db(a)": "ACOUSTICS",
    "k": "TEMPERATURE",
    "°c": "TEMPERATURE",
    "n": "MECHANICS",
    "newton": "MECHANICS",
    "kg/s": "FLOW",
    "kg/h": "FLOW",
    "m³/s": "FLOW",
    "m³/h": "FLOW",
    "l/s": "FLOW",
    "l/h": "FLOW",
    "µm": "LENGTH",
    "mm": "LENGTH",
    "cm": "LENGTH",
    "dm": "LENGTH",
    "m": "LENGTH",
    "meter": "LENGTH",
    "m/s": "SPEED",
    "km/h": "SPEED",
    "s^(-1)": "FREQUENCY",
    "1/s": "FREQUENCY",
    "s": "TIME",
    "h": "TIME",
    "min": "TIME",
    "d": "TIME",
    "hours": "TIME",
    "a": "ELECTRICAL",
    "m³": "VOLUME",
    "m²": "AREA",
    "rpm": "FLOW",
    "nm": "MECHANICS",
    "m/m": "MECHANICS",
    "m³/m²s": "MECHANICS",
    # NOTE(review): possibly meant "w/(m²*k)" - confirm with the data source.
    "w(m²*k)": "HEAT_TRANSFER",
    "kwh": "ELECTRICAL",
    "kg/(s*m²)": "FLOW",
    "kg": "MASS",
    "w/(m*k)": "HEAT_TRANSFER",
    "m²*k/w": "HEAT_TRANSFER",
    "j/s": "POWER",
}

_METHOD_SAME_SEMANTIC_ID = "Semantic equivalent , same semantic Id"
_METHOD_NLP_WITH_METADATA = "Semantically not equivalent, NLP with Metadata"
_METHOD_NLP_WITHOUT_METADATA = "Semantically not equivalent, NLP without Metadata"
_ALGORITHM_NLP = (
    "Semantic search, k-nearest-neighbor with squared L2 distance "
    "(euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
)


def _categorize(raw_value, mapping):
    """Map a raw unit/datatype value to its category, defaulting to "NaN"."""
    # str() lets non-string JSON values (null, numbers, NaN) fall through to
    # the "NaN" category instead of raising on .lower().
    return mapping.get(str(raw_value).lower(), "NaN")


def _has_match(result):
    """Return True when a query result holds at least one document."""
    try:
        return bool(result["documents"][0])
    except (KeyError, IndexError, TypeError):
        return False


def _filtered_query(collection, queries, where):
    """Run a 1-nearest-neighbour query restricted by *where*.

    Returns the raw query result, or None when the filter yields no hit.
    """
    try:
        result = collection.query(
            query_embeddings=queries, n_results=1, where=where
        )
    except Exception:
        # Deliberate best effort: the exception type raised for "no datapoint
        # matches the filter" depends on the installed Chroma version
        # (presumably NoDatapointsException on the targeted one), so every
        # failure of the filtered query is treated as "no match".
        return None
    # Some versions return empty lists instead of raising; treat alike.
    return result if _has_match(result) else None


def _build_final_result(result, matching_method, matching_algorithm, distance=None):
    """Flatten the top hit of a query result into the response dict."""
    top_metadata = result["metadatas"][0][0]
    if distance is None:
        distance = result["distances"][0][0]
    return {
        "matching_method": matching_method,
        "matching_algorithm": matching_algorithm,
        "matching_distance": distance,
        "aas_id": top_metadata["AASId"],
        "aas_id_short": top_metadata["AASIdShort"],
        "submodel_id_short": top_metadata["SubmodelName"],
        "submodel_id": top_metadata["SubmodelId"],
        # Documents are stored as JSON strings; hand back the decoded object.
        "matched_object": json.loads(result["documents"][0][0]),
    }


def query_aas(query_json, collection, model, metalabel):
    """Find the best matching element of one collection for a query.

    Matching strategy:

    1. Homogeneous case: look for an entry with the same semantic id. A hit
       is reported with distance 0.
    2. Otherwise run a nearest-neighbour search on the concatenated
       definition+name embedding twice - once restricted to entries with
       the same unit/datatype metalabel, once unrestricted - and return
       whichever hit is closer (the unrestricted one wins ties).

    Args:
        query_json: JSON string with the keys "Name", "Definition", "Unit",
            "Datatype" and "SemanticId".
        collection: Collection object exposing
            ``query(query_embeddings=..., n_results=..., where=...)``.
        model: Embedding model exposing ``encode(text)`` (e.g. a
            ``SentenceTransformer``).
        metalabel: Mapping of metalabel key -> ``(unit_category,
            datatype_category)`` tuple.

    Returns:
        Dict with "matching_method", "matching_algorithm",
        "matching_distance", "aas_id", "aas_id_short", "submodel_id_short",
        "submodel_id" and "matched_object".

    Raises:
        KeyError: If a required key is missing from the query.
        Exception: Whatever the unrestricted ``collection.query`` raises.
    """
    query = json.loads(query_json)
    name = query["Name"]
    definition = query["Definition"]
    semantic_id = query["SemanticId"]

    label = (
        _categorize(query["Unit"], _UNIT_MAPPING),
        _categorize(query["Datatype"], _DATATYPE_MAPPING),
    )
    # None when the metalabel table has no entry for this combination; the
    # metadata-restricted search is skipped in that case.
    metadata = next(
        (key for key, value in metalabel.items() if value == label), None
    )

    name_embedding = model.encode(name)
    definition_embedding = model.encode(definition)
    queries = [
        np.concatenate((definition_embedding, name_embedding), axis=0).tolist()
    ]

    # The query is a semantic k-nearest-neighbour search. Chroma uses hnswlib
    # (https://github.com/nmslib/hnswlib) for this, which supports cosine,
    # squared L2 or inner product; Chroma is configured for L2.

    # Homogeneous case: an entry with the same semantic id is an exact match.
    homogen = _filtered_query(collection, queries, {"SESemanticId": semantic_id})
    if homogen is not None:
        return _build_final_result(
            homogen, _METHOD_SAME_SEMANTIC_ID, "None", distance=0
        )

    # No matching semantic id: continue with NLP, with and without metadata.
    with_metadata = None
    if metadata is not None:
        with_metadata = _filtered_query(
            collection, queries, {"Metalabel": metadata}
        )
    without_metadata = collection.query(query_embeddings=queries, n_results=1)

    if with_metadata is None:
        return _build_final_result(
            without_metadata, _METHOD_NLP_WITHOUT_METADATA, _ALGORITHM_NLP
        )

    distance_with_meta = with_metadata["distances"][0][0]
    distance_without_meta = without_metadata["distances"][0][0]
    print(distance_with_meta)
    print(distance_without_meta)

    # Compare the distances of the restricted and the unrestricted search.
    if distance_without_meta <= distance_with_meta:
        return _build_final_result(
            without_metadata, _METHOD_NLP_WITHOUT_METADATA, _ALGORITHM_NLP
        )
    return _build_final_result(
        with_metadata, _METHOD_NLP_WITH_METADATA, _ALGORITHM_NLP
    )
def get_best_results(json_query, results):
    """Return the closest matches, ordered by ascending matching distance.

    Args:
        json_query: JSON string whose "NumberAASReturned" entry gives the
            maximum number of results to return.
        results: Iterable of dicts, each carrying a "matching_distance".

    Returns:
        A list with at most "NumberAASReturned" entries, smallest distance
        first. Zero or a negative count yields an empty list.

    Raises:
        KeyError: If "NumberAASReturned" is missing from the query.
        TypeError: If "NumberAASReturned" is not a number.
    """
    requested = json.loads(json_query)["NumberAASReturned"]
    # A negative slice bound would silently return "all but the last k"
    # results, so clamp the requested count at zero.
    limit = max(requested, 0)
    ranked = sorted(results, key=lambda match: match["matching_distance"])
    return ranked[:limit]
def ask_database(query, metalabel, model, collections, client_chroma):
    """Query every collection in turn and return the overall best matches.

    Args:
        query: JSON-serialisable query object; it is serialised once and the
            resulting string is passed to ``query_aas`` and
            ``get_best_results``.
        metalabel: Passed through to ``query_aas``.
        model: Passed through to ``query_aas``.
        collections: Iterable of objects with a ``name`` attribute.
        client_chroma: Client exposing ``get_collection(name)``.

    Returns:
        Whatever ``get_best_results`` returns for the per-collection matches.
    """
    serialized_query = json.dumps(query, indent=4)

    # All collections are queried one after another.
    per_collection_matches = []
    for listed in collections:
        print(listed.name)
        handle = client_chroma.get_collection(listed.name)
        per_collection_matches.append(
            query_aas(serialized_query, handle, model, metalabel)
        )

    return get_best_results(serialized_query, per_collection_matches)