Spaces:

mboth
/

docker-aas

Runtime error

App Files Files Community

docker-aas / app /predict_different_aas.py

mboth

Upload 5 files

c2e327f over 1 year ago

raw

history blame

No virus

10.2 kB

	from sentence_transformers import SentenceTransformer, util
	import json
	import time
	import pandas as pd
	import numpy as np
	import pickle

	import chromadb
	from chromadb.config import Settings
	from chromadb.utils import embedding_functions
	from chromadb.db.clickhouse import NoDatapointsException


	# Maps a lower-cased datatype name to its coarse datatype category.
	# All keys must be lower case because the lookup key is lower-cased first.
	_DATATYPE_MAPPING = {
	    "boolean": "BOOLEAN",
	    "string": "STRING",
	    "string_translatable": "STRING",
	    "translatable_string": "STRING",
	    "non_translatable_string": "STRING",
	    "date": "DATE",
	    "data_time": "DATE",
	    "uri": "URI",
	    "int": "INT",
	    "int_measure": "INT",
	    "int_currency": "INT",
	    "integer": "INT",
	    "real": "REAL",
	    "real_measure": "REAL",
	    "real_currency": "REAL",
	    "enum_code": "ENUM_CODE",
	    "enum_int": "ENUM_CODE",
	    "enum_real": "ENUM_CODE",
	    "enum_rational": "ENUM_CODE",
	    "enum_boolean": "ENUM_CODE",
	    "enum_string": "ENUM_CODE",
	    "enum_reference": "ENUM_CODE",
	    "enum_instance": "ENUM_CODE",
	    "set(b1,b2)": "SET",
	    "constrained_set(b1,b2,cmn,cmx)": "SET",
	    "set [0,?]": "SET",
	    "set [1,?]": "SET",
	    "set [1, ?]": "SET",
	    "nan": "NaN",
	    "media_type": "LARGE_OBJECT_TYPE",
	}

	# Maps a lower-cased unit symbol/name to its coarse unit category.
	# All keys must be lower case because the lookup key is lower-cased first.
	_UNIT_MAPPING = {
	    "nan": "NaN",
	    "hertz": "FREQUENCY",
	    "hz": "FREQUENCY",
	    "pa": "PRESSURE",
	    "pascal": "PRESSURE",
	    "n/m²": "PRESSURE",
	    "bar": "PRESSURE",
	    "%": "SCALARS_PERC",
	    "w": "POWER",
	    "watt": "POWER",
	    "kw": "POWER",
	    "kg/m³": "CHEMISTRY",
	    "m²/s": "CHEMISTRY",
	    "pa*s": "CHEMISTRY",
	    "v": "ELECTRICAL",
	    "volt": "ELECTRICAL",
	    "db": "ACOUSTICS",
	    "db(a)": "ACOUSTICS",
	    "k": "TEMPERATURE",
	    "°c": "TEMPERATURE",
	    "n": "MECHANICS",
	    "newton": "MECHANICS",
	    "kg/s": "FLOW",
	    "kg/h": "FLOW",
	    "m³/s": "FLOW",
	    "m³/h": "FLOW",
	    "l/s": "FLOW",
	    "l/h": "FLOW",
	    "µm": "LENGTH",
	    "mm": "LENGTH",
	    "cm": "LENGTH",
	    "dm": "LENGTH",
	    "m": "LENGTH",
	    "meter": "LENGTH",
	    "m/s": "SPEED",
	    "km/h": "SPEED",
	    "s^(-1)": "FREQUENCY",
	    "1/s": "FREQUENCY",
	    "s": "TIME",
	    "h": "TIME",
	    "min": "TIME",
	    "d": "TIME",
	    "hours": "TIME",
	    "a": "ELECTRICAL",
	    "m³": "VOLUME",
	    "m²": "AREA",
	    "rpm": "FLOW",
	    "nm": "MECHANICS",
	    "m/m": "MECHANICS",
	    "m³/m²s": "MECHANICS",
	    "w(m²*k)": "HEAT_TRANSFER",
	    "kwh": "ELECTRICAL",
	    "kg/(s*m²)": "FLOW",
	    "kg": "MASS",
	    "w/(m*k)": "HEAT_TRANSFER",
	    "m²*k/w": "HEAT_TRANSFER",
	    "j/s": "POWER",
	}

	# Category used when a unit or datatype is not listed in the mappings.
	_UNKNOWN_CATEGORY = "NaN"

	_METHOD_SAME_SEMANTIC_ID = "Semantic equivalent , same semantic Id"
	_METHOD_NLP_WITH_METADATA = "Semantically not equivalent, NLP with Metadata"
	_METHOD_NLP_WITHOUT_METADATA = "Semantically not equivalent, NLP without Metadata"
	_NLP_ALGORITHM = (
	    "Semantic search, k-nearest-neighbor with squared L2 distance "
	    "(euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
	)


	def _categorize(raw_value, mapping):
	    """Return the category for ``raw_value``, or "NaN" when it is not mapped."""
	    # str() keeps JSON null / non-string values from raising on .lower().
	    return mapping.get(str(raw_value).strip().lower(), _UNKNOWN_CATEGORY)


	def _has_match(result):
	    """Return True when a query result contains at least one document."""
	    if not result:
	        return False
	    documents = result.get("documents")
	    return bool(documents) and bool(documents[0])


	def _build_final_result(result, matching_method, matching_algorithm, matching_distance):
	    """Flatten the top hit of a query result into the response dictionary."""
	    top_metadata = result["metadatas"][0][0]
	    return {
	        "matching_method": matching_method,
	        "matching_algorithm": matching_algorithm,
	        "matching_distance": matching_distance,
	        "aas_id": top_metadata["AASId"],
	        "aas_id_short": top_metadata["AASIdShort"],
	        "submodel_id_short": top_metadata["SubmodelName"],
	        "submodel_id": top_metadata["SubmodelId"],
	        # The stored document is a JSON string; return it decoded.
	        "matched_object": json.loads(result["documents"][0][0]),
	    }


	def query_aas(query_json, collection, model, metalabel):
	    """Find the best matching entry of one collection for a query.

	    The lookup runs in two stages:

	    1. Homogeneous case: query filtered by ``SESemanticId``. A hit is
	       reported with distance 0.
	    2. Otherwise an embedding search is run twice, once filtered by the
	       metalabel derived from the query's unit/datatype categories and once
	       unfiltered. The hit with the smaller distance wins (the unfiltered
	       hit wins ties).

	    Args:
	        query_json: JSON string with the keys "Name", "Definition", "Unit",
	            "Datatype" and "SemanticId".
	        collection: Object exposing ``query(query_embeddings=..., n_results=...,
	            where=...)`` and returning a mapping with "documents", "metadatas"
	            and "distances".
	        model: Object exposing ``encode(text)`` returning a 1-D array.
	        metalabel: Mapping of metalabel key to a ``(unit_category,
	            datatype_category)`` pair.

	    Returns:
	        Dict with the keys "matching_method", "matching_algorithm",
	        "matching_distance", "aas_id", "aas_id_short", "submodel_id_short",
	        "submodel_id" and "matched_object".

	    Raises:
	        KeyError: If a required key is missing from the query or the hit's
	            metadata.
	        IndexError: If the unfiltered search returns no hit at all.
	    """
	    query = json.loads(query_json)
	    name = query["Name"]
	    definition = query["Definition"]
	    semantic_id = query["SemanticId"]

	    unit_categ = _categorize(query["Unit"], _UNIT_MAPPING)
	    datatype_categ = _categorize(query["Datatype"], _DATATYPE_MAPPING)

	    # First metalabel key whose category pair equals the query's pair;
	    # None when the combination is unknown, instead of failing on keys[0].
	    concat = (unit_categ, datatype_categ)
	    metadata = next((k for k, v in metalabel.items() if v == concat), None)

	    name_embedding = model.encode(name)
	    definition_embedding = model.encode(definition)
	    # The query vector is the definition embedding followed by the name embedding.
	    queries = [
	        np.concatenate((definition_embedding, name_embedding), axis=0).tolist()
	    ]

	    # The query is run as a semantic search (k-nearest-neighbor).
	    # Chroma uses hnswlib for this: https://github.com/nmslib/hnswlib
	    # hnswlib supports cosine, squared L2 or inner product as distance.
	    # Chroma is configured with L2, cf. https://github.com/chroma-core/chroma/blob/4463d13f951a4d28ade1f7e777d07302ff09069b/chromadb/db/index/hnswlib.py -> search for l2

	    # Homogeneous case: look for the same semantic id; if an entry is found
	    # the homogeneous match succeeded.
	    try:
	        homogen = collection.query(
	            query_embeddings=queries,
	            n_results=1,
	            where={"SESemanticId": semantic_id},
	        )
	    except Exception:
	        # Deliberate best-effort: a failing filtered query (e.g. no
	        # datapoints for the filter) means "no semantic-id match".
	        homogen = None

	    if _has_match(homogen):
	        return _build_final_result(
	            homogen, _METHOD_SAME_SEMANTIC_ID, "None", 0
	        )

	    # No matching semantic id found: continue with NLP, with and without
	    # the metadata filter.
	    with_metadata = None
	    if metadata is not None:
	        try:
	            with_metadata = collection.query(
	                query_embeddings=queries,
	                n_results=1,
	                where={"Metalabel": metadata},
	            )
	        except Exception:
	            # Same best-effort handling as for the semantic-id query.
	            with_metadata = None

	    without_metadata = collection.query(
	        query_embeddings=queries,
	        n_results=1,
	    )

	    result = without_metadata
	    matching_method = _METHOD_NLP_WITHOUT_METADATA
	    if _has_match(with_metadata):
	        # Compare the distances of the filtered and the unfiltered hit.
	        distance_with_meta = with_metadata["distances"][0][0]
	        distance_without_meta = without_metadata["distances"][0][0]
	        if not distance_without_meta <= distance_with_meta:
	            result = with_metadata
	            matching_method = _METHOD_NLP_WITH_METADATA

	    # Prepare the final result for the selected hit.
	    return _build_final_result(
	        result, matching_method, _NLP_ALGORITHM, result["distances"][0][0]
	    )

	def get_best_results(json_query, results):
	    """Return the closest matches, limited to the requested count.

	    Args:
	        json_query: JSON string containing the key "NumberAASReturned".
	        results: Iterable of dicts that each carry a "matching_distance".

	    Returns:
	        List of the results sorted by ascending "matching_distance",
	        truncated to "NumberAASReturned" entries. A negative count yields an
	        empty list; a null count returns every result.

	    Raises:
	        KeyError: If "NumberAASReturned" is missing from the query or a
	            result has no "matching_distance".
	    """
	    query = json.loads(json_query)
	    limit = query["NumberAASReturned"]
	    # A negative limit would otherwise slice from the end and silently drop
	    # only the worst matches; treat it as "return nothing".
	    if limit is not None and limit < 0:
	        limit = 0

	    sorted_results = sorted(results, key=lambda aas: aas["matching_distance"])
	    return sorted_results[:limit]


	def ask_database(query, metalabel, model, collections, client_chroma):
	    """Query every collection and return the overall best matches.

	    The query is serialized to JSON once. For each entry in ``collections``
	    its name is printed, the collection is fetched again through
	    ``client_chroma.get_collection`` and passed to ``query_aas``. The
	    per-collection matches are then ranked by ``get_best_results``.

	    Args:
	        query: JSON-serializable query object.
	        metalabel: Passed through to ``query_aas``.
	        model: Passed through to ``query_aas``.
	        collections: Iterable of objects exposing a ``name`` attribute.
	        client_chroma: Object exposing ``get_collection(name)``.

	    Returns:
	        Whatever ``get_best_results`` returns for the collected matches.
	    """
	    # All AAS collections are queried one after another.
	    serialized_query = json.dumps(query, indent=4)
	    per_collection_matches = []
	    for listed in collections:
	        print(listed.name)
	        handle = client_chroma.get_collection(listed.name)
	        per_collection_matches.append(
	            query_aas(serialized_query, handle, model, metalabel)
	        )
	    return get_best_results(serialized_query, per_collection_matches)