mboth committed
Commit c2e327f
1 Parent(s): c4ce2cb

Upload 5 files
app/database_build.py ADDED
@@ -0,0 +1,552 @@
+ from sentence_transformers import SentenceTransformer, util
+ import json
+ import time
+ import pandas as pd
+ import numpy as np
+ import pickle
+
+ import chromadb
+ from chromadb.config import Settings
+ from chromadb.utils import embedding_functions
+ from chromadb.db.clickhouse import NoDatapointsException
+
+
+ def prepare_cd(conceptDescriptions):
+     # Read all concept descriptions into an (initially empty) dataframe
+     df_cd = pd.DataFrame(
+         columns=["SemanticId", "Definition", "PreferredName", "Datatype", "Unit"]
+     )
+     for cd in conceptDescriptions:
+         semantic_id = cd["identification"]["id"]
+         data_spec = cd["embeddedDataSpecifications"][0]["dataSpecificationContent"]
+         preferred_name = data_spec["preferredName"]
+         short_name = data_spec["shortName"]
+
+         # Pick the English preferred name; fall back to the short name, then "NaN"
+         name = "NaN"
+         if len(preferred_name) > 1:
+             for name_variant in preferred_name:
+                 if name_variant["language"] in ("EN", "en", "EN?"):
+                     name = name_variant["text"]
+         elif len(preferred_name) == 1:
+             name = preferred_name[0]["text"]
+         elif len(short_name) > 0:
+             name = short_name[0]["text"]
+
+         # Same pattern for the definition
+         definition = data_spec["definition"]
+         chosen_def = "NaN"
+         if len(definition) > 1:
+             for definition_variant in definition:
+                 if definition_variant["language"] in ("EN", "en", "EN?"):
+                     chosen_def = definition_variant["text"]
+         elif len(definition) == 1:
+             chosen_def = definition[0]["text"]
+
+         datatype = data_spec["dataType"] if data_spec["dataType"] != "" else "NaN"
+         unit = data_spec["unit"] if data_spec["unit"] != "" else "NaN"
+
+         new_entry = pd.DataFrame(
+             {
+                 "SemanticId": semantic_id,
+                 "Definition": chosen_def,
+                 "PreferredName": name,
+                 "Datatype": datatype,
+                 "Unit": unit,
+             },
+             index=[0],
+         )
+         df_cd = pd.concat([df_cd, new_entry], ignore_index=True)
+     return df_cd
+
+
+ def get_values(submodel_element):
+     # Read out the values of a submodel element
+     se_type = submodel_element["modelType"]["name"]
+     se_semantic_id = submodel_element["semanticId"]["keys"][0]["value"]
+     se_semantic_id_local = submodel_element["semanticId"]["keys"][0]["local"]
+     se_id_short = submodel_element["idShort"]
+     value = [submodel_element["value"]]
+
+     return se_type, se_semantic_id, se_semantic_id_local, se_id_short, value
+
+
+ def get_concept_description(semantic_id, df_cd):
+     # Look up the concept description for a semantic id; fall back to a "NaN" row
+     cd_content = df_cd.loc[df_cd["SemanticId"] == semantic_id]
+
+     if cd_content.empty:
+         cd_content = pd.DataFrame(
+             {
+                 "SemanticId": semantic_id,
+                 "Definition": "NaN",
+                 "PreferredName": "NaN",
+                 "Datatype": "NaN",
+                 "Unit": "NaN",
+             },
+             index=[0],
+         )
+
+     cd_content = cd_content.iloc[0]
+
+     return cd_content
+
+
+ def get_values_sec(
+     df_cd,
+     content,
+     df,
+     aas_id,
+     aas_name,
+     submodel_id,
+     submodel_name,
+     submodel_semantic_id,
+ ):
+     collection_values = content[0]["value"]
+     for element in collection_values:
+         content = [element]
+
+         se_type, se_semantic_id, se_semantic_id_local, se_id_short, value = get_values(
+             element
+         )
+         if se_semantic_id_local:
+             cd_content = get_concept_description(se_semantic_id, df_cd)
+             definition = cd_content["Definition"]
+             preferred_name = cd_content["PreferredName"]
+             datatype = cd_content["Datatype"]
+             unit = cd_content["Unit"]
+         else:
+             definition = "NaN"
+             preferred_name = "NaN"
+             datatype = "NaN"
+             unit = "NaN"
+
+         new_row = pd.DataFrame(
+             {
+                 "AASId": aas_id,
+                 "AASIdShort": aas_name,
+                 "SubmodelId": submodel_id,
+                 "SubmodelName": submodel_name,
+                 "SubmodelSemanticId": submodel_semantic_id,
+                 "SEContent": content,
+                 "SESemanticId": se_semantic_id,
+                 "SEModelType": se_type,
+                 "SEIdShort": se_id_short,
+                 "SEValue": value,
+                 "Definition": definition,
+                 "PreferredName": preferred_name,
+                 "Datatype": datatype,
+                 "Unit": unit,
+             }
+         )
+         df = pd.concat([df, new_row], ignore_index=True)
+
+         if se_type == "SubmodelElementCollection":
+             content = [element]
+             # Recurse until the lowest level of the collections is reached,
+             # so that nested SECs are read out completely
+             df = get_values_sec(
+                 df_cd,
+                 content,
+                 df,
+                 aas_id,
+                 aas_name,
+                 submodel_id,
+                 submodel_name,
+                 submodel_semantic_id,
+             )
+
+     return df
+
+
+ def set_up_metadata(metalabel, df):
+     # Map raw datatypes and units onto coarse categories; the category pair
+     # is then looked up in the metalabel dict
+     datatype_mapping = {
+         "boolean": "BOOLEAN",
+         "string": "STRING",
+         "string_translatable": "STRING",
+         "translatable_string": "STRING",
+         "non_translatable_string": "STRING",
+         "date": "DATE",
+         "data_time": "DATE",
+         "uri": "URI",
+         "int": "INT",
+         "int_measure": "INT",
+         "int_currency": "INT",
+         "integer": "INT",
+         "real": "REAL",
+         "real_measure": "REAL",
+         "real_currency": "REAL",
+         "enum_code": "ENUM_CODE",
+         "enum_int": "ENUM_CODE",
+         "ENUM_REAL": "ENUM_CODE",
+         "ENUM_RATIONAL": "ENUM_CODE",
+         "ENUM_BOOLEAN": "ENUM_CODE",
+         "ENUM_STRING": "ENUM_CODE",
+         "enum_reference": "ENUM_CODE",
+         "enum_instance": "ENUM_CODE",
+         "set(b1,b2)": "SET",
+         "constrained_set(b1,b2,cmn,cmx)": "SET",
+         "set [0,?]": "SET",
+         "set [1,?]": "SET",
+         "set [1, ?]": "SET",
+         "nan": "NaN",
+         "media_type": "LARGE_OBJECT_TYPE",
+     }
+
+     unit_mapping = {
+         "nan": "NaN",
+         "hertz": "FREQUENCY",
+         "hz": "FREQUENCY",
+         "pa": "PRESSURE",
+         "pascal": "PRESSURE",
+         "n/m²": "PRESSURE",
+         "bar": "PRESSURE",
+         "%": "SCALARS_PERC",
+         "w": "POWER",
+         "watt": "POWER",
+         "kw": "POWER",
+         "kg/m³": "CHEMISTRY",
+         "m²/s": "CHEMISTRY",
+         "pa*s": "CHEMISTRY",
+         "v": "ELECTRICAL",
+         "volt": "ELECTRICAL",
+         "db": "ACOUSTICS",
+         "db(a)": "ACOUSTICS",
+         "k": "TEMPERATURE",
+         "°c": "TEMPERATURE",
+         "n": "MECHANICS",
+         "newton": "MECHANICS",
+         "kg/s": "FLOW",
+         "kg/h": "FLOW",
+         "m³/s": "FLOW",
+         "m³/h": "FLOW",
+         "l/s": "FLOW",
+         "l/h": "FLOW",
+         "µm": "LENGTH",
+         "mm": "LENGTH",
+         "cm": "LENGTH",
+         "dm": "LENGTH",
+         "m": "LENGTH",
+         "meter": "LENGTH",
+         "m/s": "SPEED",
+         "km/h": "SPEED",
+         "s^(-1)": "FREQUENCY",
+         "1/s": "FREQUENCY",
+         "s": "TIME",
+         "h": "TIME",
+         "min": "TIME",
+         "d": "TIME",
+         "hours": "TIME",
+         "a": "ELECTRICAL",
+         "m³": "VOLUME",
+         "m²": "AREA",
+         "rpm": "FLOW",
+         "nm": "MECHANICS",
+         "m/m": "MECHANICS",
+         "m³/m²s": "MECHANICS",
+         "w(m²*K)": "HEAT_TRANSFER",
+         "kwh": "ELECTRICAL",
+         "kg/(s*m²)": "FLOW",
+         "kg": "MASS",
+         "w/(m*k)": "HEAT_TRANSFER",
+         "m²*k/w": "HEAT_TRANSFER",
+         "j/s": "POWER",
+     }
+
+     dataset = df
+     dataset["unit_lowercase"] = dataset["Unit"].str.lower()
+     dataset["unit_categ"] = dataset["unit_lowercase"].map(unit_mapping)
+
+     dataset["datatype_lowercase"] = dataset["Datatype"].str.lower()
+     dataset["datatype_categ"] = dataset["datatype_lowercase"].map(datatype_mapping)
+
+     dataset = dataset.fillna("NaN")
+     dataset["index"] = dataset.index
+
+     unique_labels_set = set()
+
+     dataset["Metalabel"] = ""
+     for i in range(len(dataset)):
+         concat = (str(dataset["unit_categ"][i]), str(dataset["datatype_categ"][i]))
+         keys = [k for k, v in metalabel.items() if v == concat]
+         dataset.loc[i, "Metalabel"] = keys[0]
+         unique_labels_set.add(keys[0])
+     unique_label = list(unique_labels_set)
+     print(unique_label)
+
+     return dataset
+
+
+ def encode(aas_df, model):
+     # Use Sentence-BERT to create the embeddings; name and definition are
+     # embedded separately and the two vectors are concatenated per element
+     aas_df["PreferredName"] = "Name: " + aas_df["PreferredName"].astype(str)
+     aas_df["Definition"] = "Description: " + aas_df["Definition"].astype(str) + "; "
+     corpus_names = aas_df["PreferredName"].tolist()
+     corpus_definitions = aas_df["Definition"].tolist()
+     embeddings_definitions = model.encode(corpus_definitions, show_progress_bar=True)
+     embeddings_names = model.encode(corpus_names, show_progress_bar=True)
+     concat_name_def_emb = np.concatenate(
+         (embeddings_definitions, embeddings_names), axis=1
+     )
+     aas_df["EmbeddingNameDefinition"] = concat_name_def_emb.tolist()
+     return aas_df
+
+
+ def convert_to_list(aas_df):
+     # The database API expects plain Python lists for ids, documents and embeddings
+     aas_index = aas_df.index.tolist()
+     aas_index_str = [str(r) for r in aas_index]
+     se_content = aas_df["SEContent"].tolist()
+     se_embedding_name_definition = aas_df["EmbeddingNameDefinition"].tolist()
+
+     aas_df_dropped = aas_df.drop(
+         ["EmbeddingNameDefinition", "SEContent", "SEValue"], axis=1
+     )
+
+     metadata = aas_df_dropped.to_dict("records")
+
+     return metadata, aas_index_str, se_content, se_embedding_name_definition
+
+
+ def set_up_chroma(
+     metadata, aas_index_str, se_content, se_embedding_name_definition, aas_name, client
+ ):
+     # Collection names must not contain upper-case characters
+     aas_name = aas_name.lower()
+     print(aas_name)
+     emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
+         model_name="gart-labor/eng-distilBERT-se-eclass"
+     )
+     collection = client.get_or_create_collection(
+         name=aas_name, embedding_function=emb_fn
+     )
+
+     # Serialize the submodel element contents to JSON so they can be stored
+     # as documents in the database
+     aas_content_string = []
+     for element in se_content:
+         content = json.dumps(element)
+         aas_content_string.append(content)
+
+     items = collection.count()  # number of items already in the collection
+     print(collection)
+     print("Collection created, number of items:")
+     print(items)
+     if items == 0:
+         # Add the SE contents, the embeddings and further metadata to the collection
+         collection.add(
+             documents=aas_content_string,
+             embeddings=se_embedding_name_definition,
+             metadatas=metadata,
+             ids=aas_index_str,
+         )
+         items = collection.count()
+         print("------------")
+         print("Collection filled, number of items:")
+         print(items)
+     else:
+         print("-----------")
+         print("AAS already in the database")
+
+     return collection
+
+
+ def read_aas(aas, submodels, assets, conceptDescriptions, submodels_ids, metalabel):
+     df = pd.DataFrame(
+         columns=[
+             "AASId",
+             "AASIdShort",
+             "SubmodelId",
+             "SubmodelName",
+             "SubmodelSemanticId",
+             "SEContent",
+             "SESemanticId",
+             "SEModelType",
+             "SEIdShort",
+             "SEValue",
+             "Definition",
+             "PreferredName",
+             "Datatype",
+             "Unit",
+         ]
+     )
+
+     aas_id = aas[0]["identification"]["id"]
+     aas_name = aas[0]["idShort"]
+     # Prepare all concept descriptions as a pandas dataframe so they can be
+     # inspected more easily later on
+     df_cd = prepare_cd(conceptDescriptions)
+     # Read out the submodels
+     for submodel in submodels:
+         submodel_name = submodel["idShort"]
+         submodel_id = submodel["identification"]["id"]
+         # Necessary because the submodels referenced by the AAS and those
+         # contained in the environment do not always match
+         if submodel_id in submodels_ids:
+             semantic_id_existing = submodel["semanticId"]["keys"]
+             if not semantic_id_existing:
+                 submodel_semantic_id = "Not defined"
+             else:
+                 submodel_semantic_id = semantic_id_existing[0]["value"]
+             submodel_elements = submodel["submodelElements"]
+             # Read out the submodel elements
+             for submodel_element in submodel_elements:
+                 content = [submodel_element]
+
+                 (
+                     se_type,
+                     se_semantic_id,
+                     se_semantic_id_local,
+                     se_id_short,
+                     value,
+                 ) = get_values(submodel_element)
+
+                 # If the concept description is local, read it out
+                 if se_semantic_id_local:
+                     cd_content = get_concept_description(se_semantic_id, df_cd)
+                     definition = cd_content["Definition"]
+                     preferred_name = cd_content["PreferredName"]
+                     datatype = cd_content["Datatype"]
+                     unit = cd_content["Unit"]
+                 else:
+                     definition = "NaN"
+                     preferred_name = "NaN"
+                     datatype = "NaN"
+                     unit = "NaN"
+
+                 new_row = pd.DataFrame(
+                     {
+                         "AASId": aas_id,
+                         "AASIdShort": aas_name,
+                         "SubmodelId": submodel_id,
+                         "SubmodelName": submodel_name,
+                         "SubmodelSemanticId": submodel_semantic_id,
+                         "SEContent": content,
+                         "SESemanticId": se_semantic_id,
+                         "SEModelType": se_type,
+                         "SEIdShort": se_id_short,
+                         "SEValue": value,
+                         "Definition": definition,
+                         "PreferredName": preferred_name,
+                         "Datatype": datatype,
+                         "Unit": unit,
+                     }
+                 )
+                 df = pd.concat([df, new_row], ignore_index=True)
+
+                 # Submodel element collections are traversed recursively as well
+                 if se_type == "SubmodelElementCollection":
+                     df = get_values_sec(
+                         df_cd,
+                         content,
+                         df,
+                         aas_id,
+                         aas_name,
+                         submodel_id,
+                         submodel_name,
+                         submodel_semantic_id,
+                     )
+
+     df = set_up_metadata(metalabel, df)
+
+     return df, aas_name
+
+
+ def index_corpus(data, model, metalabel, client_chroma):
+     # Entry point: read the AAS environment, embed it and store it in Chroma
+     aas = data["assetAdministrationShells"]
+     aas_submodels = aas[0]["submodels"]
+     submodels_ids = []
+     for submodel in aas_submodels:
+         submodels_ids.append(submodel["keys"][0]["value"])
+     submodels = data["submodels"]
+     conceptDescriptions = data["conceptDescriptions"]
+     assets = data["assets"]
+
+     aas_df, aas_name = read_aas(
+         aas, submodels, assets, conceptDescriptions, submodels_ids, metalabel
+     )
+     aas_df = encode(aas_df, model)
+     metadata, aas_index_str, se_content, se_embedding_name_definition = convert_to_list(
+         aas_df
+     )
+     collection = set_up_chroma(
+         metadata,
+         aas_index_str,
+         se_content,
+         se_embedding_name_definition,
+         aas_name,
+         client_chroma,
+     )
+
+     return collection
+
+
+ # if __name__ == '__main__':
+ #     create_database = index_corpus(aas = 'festo_switch.json')
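
For reference, a minimal local driver for this module could look like the sketch below. The AAS export name festo_switch.json (borrowed from the commented-out main guard), the metadata.pickle path and the in-process Chroma client are assumptions for illustration, not part of this commit:

    # Hypothetical driver for database_build.index_corpus; file names and the
    # in-process Chroma client are assumptions, not part of this commit.
    import json
    import pickle

    import chromadb
    from sentence_transformers import SentenceTransformer

    from database_build import index_corpus

    with open("festo_switch.json") as f:  # AAS environment export (assumed name)
        data = json.load(f)
    with open("app/metadata.pickle", "rb") as f:  # metalabel mapping from this commit
        metalabel = pickle.load(f)

    model = SentenceTransformer("gart-labor/eng-distilBERT-se-eclass")
    client = chromadb.Client()  # local in-process client instead of the REST server

    collection = index_corpus(data, model, metalabel, client)
    print(collection.count())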
app/main.py ADDED
@@ -0,0 +1,110 @@
+ from sentence_transformers import SentenceTransformer, util
+
+ # from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
+ import time
+ import os
+ import json
+ import pandas as pd
+ import numpy as np
+ import category_encoders as ce
+ import string
+ import pickle
+ import tqdm.autonotebook
+ from fastapi import FastAPI, Request, UploadFile, File
+ from joblib import dump, load
+ from pydantic import BaseModel
+ import sys
+ from database_build import index_corpus
+ from predict_different_aas import ask_database
+ from predict_one_aas import query_specific_aas
+ from typing import Any, Dict, AnyStr, List, Union
+ import chromadb
+ from chromadb.config import Settings
+
+ app = FastAPI(title="Interface Semantic Matching")
+
+ JSONObject = Dict[AnyStr, Any]
+ JSONArray = List[Any]
+ JSONStructure = Union[JSONArray, JSONObject]
+
+
+ class submodelElement(BaseModel):
+     datatype: str
+     definition: str
+     name: str
+     semantic_id: str
+     unit: str
+     return_matches: int
+     aas_id: str
+     number_aas_returned: int
+
+
+ @app.on_event("startup")
+ def load_hf_model():
+     global model
+     # Old model:
+     # model = SentenceTransformer('mboth/distil-eng-quora-sentence')
+
+     # Fine-tuned model
+     model = SentenceTransformer("gart-labor/eng-distilBERT-se-eclass")
+
+     # global model_translate
+     # model_translate = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
+     # global tokenizer_translate
+     # tokenizer_translate = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
+
+     with open("app/metadata.pickle", "rb") as handle:
+         global metalabel
+         metalabel = pickle.load(handle)
+     global client_chroma
+     client_chroma = chromadb.Client(
+         Settings(
+             chroma_api_impl="rest",
+             # chroma_server_host must be updated after every restart of the AWS instance
+             chroma_server_host="3.67.80.82",
+             chroma_server_http_port=8000,
+         )
+     )
+
+
+ @app.post("/PostAssetAdministrationShellEmbeddings")
+ async def index_aas(aas: UploadFile = File(...)):
+     data = json.load(aas.file)
+     print(type(data))
+     collection = index_corpus(data, model, metalabel, client_chroma)
+     ready = "AAS ready"
+     return ready
+
+
+ @app.post("/GetSubmodelElementsFromDifferentAASBySemanticIdAndSemanticInformation")
+ def predict_different_aas(
+     name: str,
+     definition: str,
+     number_aas_returned: Union[int, None] = 1,
+     semantic_id: Union[str, None] = "NaN",
+     unit: Union[str, None] = "NaN",
+     datatype: Union[str, None] = "NaN",
+ ):
+     collections = client_chroma.list_collections()
+     query = {
+         "Name": name,
+         "Definition": definition,
+         "Unit": unit,
+         "Datatype": datatype,
+         "SemanticId": semantic_id,
+         "NumberAASReturned": number_aas_returned,
+     }
+     results = ask_database(query, metalabel, model, collections, client_chroma)
+
+     return results
+
+
+ @app.post("/GetSubmodelElementsFromSpecificAASBySemanticIdAndSemanticInformation")
+ def predict_specific_aas(
+     name: str,
+     definition: str,
+     aas_id: str,
+     return_matches: Union[int, None] = 2,
+     semantic_id: Union[str, None] = "NaN",
+     unit: Union[str, None] = "NaN",
+     datatype: Union[str, None] = "NaN",
+ ):
+     collections = client_chroma.list_collections()
+     query = {
+         "Name": name,
+         "Definition": definition,
+         "Unit": unit,
+         "Datatype": datatype,
+         "SemanticId": semantic_id,
+         "ReturnMatches": return_matches,
+         "AASId": aas_id,
+     }
+     result = query_specific_aas(query, metalabel, model, collections, client_chroma)
+
+     return result
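
The two query endpoints read their arguments from query parameters, and the indexing endpoint expects a multipart file upload. A client-side sketch under an assumed host/port, with illustrative parameter values:

    # Hypothetical client calls against the FastAPI service; host, port and all
    # parameter values are assumptions for illustration.
    import requests

    BASE = "http://localhost:8000"

    # Index an AAS environment JSON file
    with open("festo_switch.json", "rb") as f:
        r = requests.post(
            f"{BASE}/PostAssetAdministrationShellEmbeddings",
            files={"aas": f},
        )
    print(r.json())  # "AAS ready"

    # Search across all indexed AAS
    r = requests.post(
        f"{BASE}/GetSubmodelElementsFromDifferentAASBySemanticIdAndSemanticInformation",
        params={
            "name": "max. rotation speed",
            "definition": "maximum permissible rotation speed of the motor",
            "unit": "1/s",
            "number_aas_returned": 2,
        },
    )
    print(r.json())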
app/metadata.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b4aee0cd2ca534e4af8023bd334db591a0a46b2a37154758aa5e3873b8d4728
+ size 1670
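
metadata.pickle is tracked via Git LFS, so only the pointer file appears in the diff. From the way the code uses it (keys are looked up by (unit_categ, datatype_categ) tuples, e.g. in database_build.set_up_metadata), it is a dict mapping each metalabel to a category pair; an assumed illustration of its shape:

    # Assumed shape of the metalabel mapping; the real keys and values live in
    # the LFS-tracked pickle, the entries below are illustrative only.
    metalabel = {
        0: ("NaN", "NaN"),
        1: ("ELECTRICAL", "REAL"),
        2: ("LENGTH", "INT"),
    }
    # The code inverts the mapping to find the label for a category pair:
    # keys = [k for k, v in metalabel.items() if v == concat]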
app/predict_different_aas.py ADDED
@@ -0,0 +1,291 @@
+ from sentence_transformers import SentenceTransformer, util
+ import json
+ import time
+ import pandas as pd
+ import numpy as np
+ import pickle
+
+ import chromadb
+ from chromadb.config import Settings
+ from chromadb.utils import embedding_functions
+ from chromadb.db.clickhouse import NoDatapointsException
+
+
+ def query_aas(query_json, collection, model, metalabel):
+     query = json.loads(query_json)
+     name = query["Name"]
+     definition = query["Definition"]
+     unit = query["Unit"]
+     datatype = query["Datatype"]
+     semantic_id = query["SemanticId"]
+     numberAAS = query["NumberAASReturned"]
+
+     datatype_mapping = {
+         "boolean": "BOOLEAN",
+         "string": "STRING",
+         "string_translatable": "STRING",
+         "translatable_string": "STRING",
+         "non_translatable_string": "STRING",
+         "date": "DATE",
+         "data_time": "DATE",
+         "uri": "URI",
+         "int": "INT",
+         "int_measure": "INT",
+         "int_currency": "INT",
+         "integer": "INT",
+         "real": "REAL",
+         "real_measure": "REAL",
+         "real_currency": "REAL",
+         "enum_code": "ENUM_CODE",
+         "enum_int": "ENUM_CODE",
+         "ENUM_REAL": "ENUM_CODE",
+         "ENUM_RATIONAL": "ENUM_CODE",
+         "ENUM_BOOLEAN": "ENUM_CODE",
+         "ENUM_STRING": "ENUM_CODE",
+         "enum_reference": "ENUM_CODE",
+         "enum_instance": "ENUM_CODE",
+         "set(b1,b2)": "SET",
+         "constrained_set(b1,b2,cmn,cmx)": "SET",
+         "set [0,?]": "SET",
+         "set [1,?]": "SET",
+         "set [1, ?]": "SET",
+         "nan": "NaN",
+         "media_type": "LARGE_OBJECT_TYPE",
+     }
+
+     unit_mapping = {
+         "nan": "NaN",
+         "hertz": "FREQUENCY",
+         "hz": "FREQUENCY",
+         "pa": "PRESSURE",
+         "pascal": "PRESSURE",
+         "n/m²": "PRESSURE",
+         "bar": "PRESSURE",
+         "%": "SCALARS_PERC",
+         "w": "POWER",
+         "watt": "POWER",
+         "kw": "POWER",
+         "kg/m³": "CHEMISTRY",
+         "m²/s": "CHEMISTRY",
+         "pa*s": "CHEMISTRY",
+         "v": "ELECTRICAL",
+         "volt": "ELECTRICAL",
+         "db": "ACOUSTICS",
+         "db(a)": "ACOUSTICS",
+         "k": "TEMPERATURE",
+         "°c": "TEMPERATURE",
+         "n": "MECHANICS",
+         "newton": "MECHANICS",
+         "kg/s": "FLOW",
+         "kg/h": "FLOW",
+         "m³/s": "FLOW",
+         "m³/h": "FLOW",
+         "l/s": "FLOW",
+         "l/h": "FLOW",
+         "µm": "LENGTH",
+         "mm": "LENGTH",
+         "cm": "LENGTH",
+         "dm": "LENGTH",
+         "m": "LENGTH",
+         "meter": "LENGTH",
+         "m/s": "SPEED",
+         "km/h": "SPEED",
+         "s^(-1)": "FREQUENCY",
+         "1/s": "FREQUENCY",
+         "s": "TIME",
+         "h": "TIME",
+         "min": "TIME",
+         "d": "TIME",
+         "hours": "TIME",
+         "a": "ELECTRICAL",
+         "m³": "VOLUME",
+         "m²": "AREA",
+         "rpm": "FLOW",
+         "nm": "MECHANICS",
+         "m/m": "MECHANICS",
+         "m³/m²s": "MECHANICS",
+         "w(m²*K)": "HEAT_TRANSFER",
+         "kwh": "ELECTRICAL",
+         "kg/(s*m²)": "FLOW",
+         "kg": "MASS",
+         "w/(m*k)": "HEAT_TRANSFER",
+         "m²*k/w": "HEAT_TRANSFER",
+         "j/s": "POWER",
+     }
+
+     # Map unit and datatype onto their categories and look up the metalabel
+     unit_categ = unit_mapping.get(unit.lower(), "NaN")
+     datatype_categ = datatype_mapping.get(datatype.lower(), "NaN")
+
+     concat = (unit_categ, datatype_categ)
+     keys = [k for k, v in metalabel.items() if v == concat]
+     metadata = keys[0]
+
+     # Embed name and definition separately and concatenate, mirroring the index
+     name_embedding = model.encode(name)
+     definition_embedding = model.encode(definition)
+     concat_name_def_query = np.concatenate(
+         (definition_embedding, name_embedding), axis=0
+     )
+     concat_name_def_query = concat_name_def_query.tolist()
+
+     queries = [concat_name_def_query]
+
+     # The query runs as a semantic search (k-nearest-neighbor).
+     # Chroma uses hnswlib for this: https://github.com/nmslib/hnswlib
+     # hnswlib supports cosine, squared L2 or inner product as distance;
+     # Chroma is set to L2, cf. https://github.com/chroma-core/chroma/blob/4463d13f951a4d28ade1f7e777d07302ff09069b/chromadb/db/index/hnswlib.py (search for l2)
+
+     # Homogeneous case: filter on the semantic id; if a hit is found,
+     # the semantic ids match and the lookup is done
+     try:
+         homogen = collection.query(
+             query_embeddings=queries, n_results=1, where={"SESemanticId": semantic_id}
+         )
+     # Older Chroma versions raise NoDatapointsException when the where filter
+     # matches nothing, hence the broad except
+     except Exception:
+         homogen = "Nix"
+
+     if homogen != "Nix":
+         result = homogen
+         result["matching_method"] = "Semantic equivalent, same semantic Id"
+         result["matching_algorithm"] = "None"
+         result["distances"] = [[0]]
+
+         value = result["documents"][0][0]
+         value_dict = json.loads(value)
+
+         final_result = {
+             "matching_method": result["matching_method"],
+             "matching_algorithm": result["matching_algorithm"],
+             "matching_distance": result["distances"][0][0],
+             "aas_id": result["metadatas"][0][0]["AASId"],
+             "aas_id_short": result["metadatas"][0][0]["AASIdShort"],
+             "submodel_id_short": result["metadatas"][0][0]["SubmodelName"],
+             "submodel_id": result["metadatas"][0][0]["SubmodelId"],
+             "matched_object": value_dict,
+         }
+     # If no matching semantic id was found, continue with NLP search
+     # with and without the metadata filter
+     else:
+         try:
+             with_metadata = collection.query(
+                 query_embeddings=queries,
+                 n_results=1,
+                 where={"Metalabel": metadata},
+             )
+         except Exception:
+             with_metadata = "Nix"
+
+         without_metadata = collection.query(
+             query_embeddings=queries,
+             n_results=1,
+         )
+
+         if with_metadata == "Nix":
+             result = without_metadata
+             result["matching_method"] = "Semantically not equivalent, NLP without Metadata"
+             result["matching_algorithm"] = "Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
+         else:
+             distance_with_meta = with_metadata["distances"][0][0]
+             distance_without_meta = without_metadata["distances"][0][0]
+             print(distance_with_meta)
+             print(distance_without_meta)
+             # Compare the distances with and without the metadata filter
+             if distance_without_meta <= distance_with_meta:
+                 result = without_metadata
+                 result["matching_method"] = "Semantically not equivalent, NLP without Metadata"
+                 result["matching_algorithm"] = "Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
+             else:
+                 result = with_metadata
+                 result["matching_method"] = "Semantically not equivalent, NLP with Metadata"
+                 result["matching_algorithm"] = "Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
+
+         # Assemble the final result
+         value = result["documents"][0][0]
+         value_dict = json.loads(value)
+         final_result = {
+             "matching_method": result["matching_method"],
+             "matching_algorithm": result["matching_algorithm"],
+             "matching_distance": result["distances"][0][0],
+             "aas_id": result["metadatas"][0][0]["AASId"],
+             "aas_id_short": result["metadatas"][0][0]["AASIdShort"],
+             "submodel_id_short": result["metadatas"][0][0]["SubmodelName"],
+             "submodel_id": result["metadatas"][0][0]["SubmodelId"],
+             "matched_object": value_dict,
+         }
+     return final_result
+
+
+ def get_best_results(json_query, results):
+     # Sort the per-AAS results by distance and keep the requested number
+     query = json.loads(json_query)
+     numberAAS = query["NumberAASReturned"]
+     sorted_results = sorted(results, key=lambda aas: aas["matching_distance"])
+     best_results = sorted_results[0:numberAAS]
+
+     return best_results
+
+
+ def ask_database(query, metalabel, model, collections, client_chroma):
+     # All AAS collections are queried one after another
+     json_query = json.dumps(query, indent=4)
+     results = []
+     for collection in collections:
+         print(collection.name)
+         collection = client_chroma.get_collection(collection.name)
+         result = query_aas(json_query, collection, model, metalabel)
+         results.append(result)
+     best_results = get_best_results(json_query, results)
+     return best_results
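
The matching cascade above (exact semantic id first, then embedding search with and without the metalabel filter) is driven by a plain dict. An illustrative call, with every value made up and the surrounding objects (metalabel, model, collections, client_chroma) as wired up in main.py:

    # Illustrative query for ask_database; all values are made up.
    query = {
        "Name": "operating voltage",
        "Definition": "voltage range the device can be operated with",
        "Unit": "V",
        "Datatype": "real_measure",
        "SemanticId": "NaN",  # no known semantic id -> NLP fallback
        "NumberAASReturned": 2,
    }
    # best_results = ask_database(query, metalabel, model, collections, client_chroma)
    # -> at most 2 dicts sorted by matching_distance, each containing
    #    matching_method, matching_algorithm, matching_distance, aas_id,
    #    aas_id_short, submodel_id_short, submodel_id and matched_object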
app/predict_one_aas.py ADDED
@@ -0,0 +1,188 @@
+ from sentence_transformers import SentenceTransformer, util
+ import json
+ import time
+ import pandas as pd
+ import numpy as np
+ import pickle
+
+ import chromadb
+ from chromadb.config import Settings
+ from chromadb.utils import embedding_functions
+ from chromadb.db.clickhouse import NoDatapointsException
+
+
+ def query_right_aas(json_query, collection, metalabel, model):
+     query = json.loads(json_query)
+     name = query['Name']
+     definition = query['Definition']
+     unit = query['Unit']
+     datatype = query['Datatype']
+     semantic_id = query['SemanticId']
+     return_matches = query['ReturnMatches']
+
+     datatype_mapping = {
+         'boolean': 'BOOLEAN', 'string': 'STRING', 'string_translatable': 'STRING',
+         'translatable_string': 'STRING', 'non_translatable_string': 'STRING',
+         'date': 'DATE', 'data_time': 'DATE', 'uri': 'URI', 'int': 'INT',
+         'int_measure': 'INT', 'int_currency': 'INT', 'integer': 'INT',
+         'real': 'REAL', 'real_measure': 'REAL', 'real_currency': 'REAL',
+         'enum_code': 'ENUM_CODE', 'enum_int': 'ENUM_CODE', 'ENUM_REAL': 'ENUM_CODE',
+         'ENUM_RATIONAL': 'ENUM_CODE', 'ENUM_BOOLEAN': 'ENUM_CODE',
+         'ENUM_STRING': 'ENUM_CODE', 'enum_reference': 'ENUM_CODE',
+         'enum_instance': 'ENUM_CODE', 'set(b1,b2)': 'SET',
+         'constrained_set(b1,b2,cmn,cmx)': 'SET', 'set [0,?]': 'SET',
+         'set [1,?]': 'SET', 'set [1, ?]': 'SET', 'nan': 'NaN',
+         'media_type': 'LARGE_OBJECT_TYPE'}
+
+     unit_mapping = {
+         'nan': 'NaN', 'hertz': 'FREQUENCY', 'hz': 'FREQUENCY', 'pa': 'PRESSURE',
+         'pascal': 'PRESSURE', 'n/m²': 'PRESSURE', 'bar': 'PRESSURE',
+         '%': 'SCALARS_PERC', 'w': 'POWER', 'watt': 'POWER', 'kw': 'POWER',
+         'kg/m³': 'CHEMISTRY', 'm²/s': 'CHEMISTRY', 'pa*s': 'CHEMISTRY',
+         'v': 'ELECTRICAL', 'volt': 'ELECTRICAL', 'db': 'ACOUSTICS',
+         'db(a)': 'ACOUSTICS', 'k': 'TEMPERATURE', '°c': 'TEMPERATURE',
+         'n': 'MECHANICS', 'newton': 'MECHANICS', 'kg/s': 'FLOW', 'kg/h': 'FLOW',
+         'm³/s': 'FLOW', 'm³/h': 'FLOW', 'l/s': 'FLOW', 'l/h': 'FLOW',
+         'µm': 'LENGTH', 'mm': 'LENGTH', 'cm': 'LENGTH', 'dm': 'LENGTH',
+         'm': 'LENGTH', 'meter': 'LENGTH', 'm/s': 'SPEED', 'km/h': 'SPEED',
+         's^(-1)': 'FREQUENCY', '1/s': 'FREQUENCY', 's': 'TIME', 'h': 'TIME',
+         'min': 'TIME', 'd': 'TIME', 'hours': 'TIME', 'a': 'ELECTRICAL',
+         'm³': 'VOLUME', 'm²': 'AREA', 'rpm': 'FLOW', 'nm': 'MECHANICS',
+         'm/m': 'MECHANICS', 'm³/m²s': 'MECHANICS', 'w(m²*K)': 'HEAT_TRANSFER',
+         'kwh': 'ELECTRICAL', 'kg/(s*m²)': 'FLOW', 'kg': 'MASS',
+         'w/(m*k)': 'HEAT_TRANSFER', 'm²*k/w': 'HEAT_TRANSFER', 'j/s': 'POWER'}
+
+     # Map unit and datatype onto their categories and look up the metalabel
+     unit_categ = unit_mapping.get(unit.lower(), 'NaN')
+     datatype_categ = datatype_mapping.get(datatype.lower(), 'NaN')
+
+     concat = (unit_categ, datatype_categ)
+     keys = [k for k, v in metalabel.items() if v == concat]
+     metadata = keys[0]
+
+     name_embedding = model.encode(name)
+     definition_embedding = model.encode(definition)
+     concat_name_def_query = np.concatenate((definition_embedding, name_embedding), axis=0)
+     concat_name_def_query = concat_name_def_query.tolist()
+
+     queries = [concat_name_def_query]
+
+     # The query runs as a semantic search (k-nearest-neighbor).
+     # Chroma uses hnswlib for this: https://github.com/nmslib/hnswlib
+     # hnswlib supports cosine, squared L2 or inner product as distance;
+     # Chroma is set to L2, cf. https://github.com/chroma-core/chroma/blob/4463d13f951a4d28ade1f7e777d07302ff09069b/chromadb/db/index/hnswlib.py (search for l2)
+
+     # Homogeneous case: filter on the semantic id
+     try:
+         homogen = collection.query(
+             query_embeddings=queries,
+             n_results=1,
+             where={"SESemanticId": semantic_id}
+         )
+     # Older Chroma versions raise NoDatapointsException when the where filter
+     # matches nothing, hence the broad except
+     except Exception:
+         homogen = 'Nix'
+
+     if homogen != 'Nix':
+         result = homogen
+         result['matching_method'] = 'Semantic equivalent, same semantic Id'
+         result['matching_algorithm'] = 'None'
+         result['distances'] = [[0]]
+         value = result['documents'][0][0]
+         value_dict = json.loads(value)
+
+         final_result = {
+             "matching_method": result['matching_method'],
+             "matching_algorithm": result['matching_algorithm'],
+             "matching_distance": result['distances'][0][0],
+             "aas_id": result['metadatas'][0][0]['AASId'],
+             "aas_id_short": result['metadatas'][0][0]['AASIdShort'],
+             "submodel_id_short": result['metadatas'][0][0]['SubmodelName'],
+             "submodel_id": result['metadatas'][0][0]['SubmodelId'],
+             "matched_object": value_dict,
+         }
+         final_results = [final_result]
+     # If no matching semantic id was found, continue with NLP search
+     # with and without the metadata filter
+     else:
+         try:
+             with_metadata = collection.query(
+                 query_embeddings=queries,
+                 n_results=return_matches,
+                 where={"Metalabel": metadata},
+             )
+         except Exception:
+             with_metadata = 'Nix'
+
+         without_metadata = collection.query(
+             query_embeddings=queries,
+             n_results=return_matches,
+         )
+         print(without_metadata)
+
+         if with_metadata == 'Nix':
+             result = without_metadata
+             result['matching_method'] = 'Semantically not equivalent, NLP without Metadata'
+             result['matching_algorithm'] = 'Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), with model gart-labor/eng-distilBERT-se-eclass'
+         else:
+             distance_with_meta = with_metadata['distances'][0][0]
+             distance_without_meta = without_metadata['distances'][0][0]
+             # Compare the distances with and without the metadata filter
+             if distance_without_meta <= distance_with_meta:
+                 result = without_metadata
+                 result['matching_method'] = 'Semantically not equivalent, NLP without Metadata'
+                 result['matching_algorithm'] = 'Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), with model gart-labor/eng-distilBERT-se-eclass'
+             else:
+                 result = with_metadata
+                 result['matching_method'] = 'Semantically not equivalent, NLP with Metadata'
+                 result['matching_algorithm'] = 'Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), with model gart-labor/eng-distilBERT-se-eclass'
+         # Assemble the final results
+         final_results = []
+         print(result)
+         for i in range(0, return_matches):
+             value = result['documents'][0][i]
+             value_dict = json.loads(value)
+             final_result = {
+                 "matching_method": result['matching_method'],
+                 "matching_algorithm": result['matching_algorithm'],
+                 "matching_distance": result['distances'][0][i],
+                 "submodel_id_short": result['metadatas'][0][i]['SubmodelName'],
+                 "submodel_id": result['metadatas'][0][i]['SubmodelId'],
+                 "matched_object": value_dict
+             }
+             final_results.append(final_result)
+
+     return final_results
+
+
+ def get_right_collection(collections, aas_id):
+     # Find the collection that contains the requested AAS id
+     right_collection = []
+     for collection in collections:
+         try_collection = collection.get(where={'AASId': aas_id})
+         try:
+             collection_aas_id = try_collection['metadatas'][0]['AASId']
+             right_collection.append(collection)
+         except Exception:
+             print('Nix')
+     if right_collection == []:
+         right_collection = ['AAS not in database']
+
+     return right_collection
+
+
+ # Query one specific AAS
+ def query_specific_aas(query, metalabel, model, collections, client_chroma):
+     json_query = json.dumps(query, indent=4)
+     aas_id = query['AASId']
+     right_collection = get_right_collection(collections, aas_id)
+     if right_collection == ['AAS not in database']:
+         result = right_collection
+     else:
+         collection = client_chroma.get_collection(right_collection[0].name)
+         result = query_right_aas(json_query, collection, metalabel, model)
+
+     return result
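
query_specific_aas expects the same semantic fields plus the target AAS id and the number of matches to return; a hedged example with made-up values:

    # Illustrative query for query_specific_aas; all values are made up.
    query = {
        "Name": "max. rotation speed",
        "Definition": "maximum permissible rotation speed",
        "Unit": "1/s",
        "Datatype": "real_measure",
        "SemanticId": "NaN",
        "ReturnMatches": 2,
        "AASId": "https://example.com/ids/aas/demo",  # must match an indexed AAS
    }
    # result = query_specific_aas(query, metalabel, model, collections, client_chroma)
    # -> list of up to ReturnMatches match dicts, or ['AAS not in database']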