mboth committed
Commit 495245d
1 Parent(s): a4cf025

Upload 6 files

Files changed (6)
  1. Dockerfile +15 -0
  2. app/database_build.py +552 -0
  3. app/main.py +90 -0
  4. app/metadata.pickle +3 -0
  5. app/predict_se.py +264 -0
  6. requirements.txt +0 -0
Dockerfile ADDED
@@ -0,0 +1,15 @@
+ FROM python:3.8
+
+ COPY requirements.txt .
+
+ RUN pip install -r requirements.txt && rm requirements.txt
+
+ EXPOSE 80
+
+ COPY ./app /app
+
+ # database_build.py and predict_se.py are also placed in the working directory
+ # so that main.py's top-level imports (from database_build import ...) resolve
+ ADD ./app/database_build.py .
+
+ ADD ./app/predict_se.py .
+
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"]
app/database_build.py ADDED
@@ -0,0 +1,552 @@
+ from sentence_transformers import SentenceTransformer, util
+ import json
+ import time
+ import pandas as pd
+ import numpy as np
+ import pickle
+
+ import chromadb
+ from chromadb.config import Settings
+ from chromadb.utils import embedding_functions
+ from chromadb.db.clickhouse import NoDatapointsException
+
+
+ def prepare_cd(conceptDescriptions):
+     df_cd = pd.DataFrame(
+         columns=["SemanticId", "Definition", "PreferredName", "Datatype", "Unit"]
+     )
+     # Read all concept descriptions into the empty dataframe
+     for cd in conceptDescriptions:
+         semantic_id = cd["identification"]["id"]
+         data_spec = cd["embeddedDataSpecifications"][0]["dataSpecificationContent"]
+         preferred_name = data_spec["preferredName"]
+         short_name = data_spec["shortName"]
+         if len(preferred_name) > 1:
+             # Prefer the English name variant if several languages are present
+             for name_variant in preferred_name:
+                 if name_variant["language"] in ("EN", "en", "EN?"):
+                     name = name_variant["text"]
+         elif len(preferred_name) == 1:
+             name = preferred_name[0]["text"]
+         elif len(preferred_name) == 0:
+             # Fall back to the short name if no preferred name is given
+             if len(short_name) == 0:
+                 name = "NaN"
+             else:
+                 name = short_name[0]["text"]
+
+         definition = data_spec["definition"]
+         if len(definition) > 1:
+             for definition_variant in definition:
+                 if definition_variant["language"] in ("EN", "en", "EN?"):
+                     chosen_def = definition_variant["text"]
+         elif len(definition) == 1:
+             chosen_def = definition[0]["text"]
+         elif len(definition) == 0:
+             chosen_def = "NaN"
+
+         if data_spec["dataType"] == "":
+             datatype = "NaN"
+         else:
+             datatype = data_spec["dataType"]
+
+         if data_spec["unit"] == "":
+             unit = "NaN"
+         else:
+             unit = data_spec["unit"]
+
+         new_entry = pd.DataFrame(
+             {
+                 "SemanticId": semantic_id,
+                 "Definition": chosen_def,
+                 "PreferredName": name,
+                 "Datatype": datatype,
+                 "Unit": unit,
+             },
+             index=[0],
+         )
+         df_cd = pd.concat([df_cd, new_entry], ignore_index=True)
+     return df_cd
+
+
+ def get_values(submodel_element):
+     # Read the values of a submodel element
+     se_type = submodel_element["modelType"]["name"]
+     se_semantic_id = submodel_element["semanticId"]["keys"][0]["value"]
+     se_semantic_id_local = submodel_element["semanticId"]["keys"][0]["local"]
+     se_id_short = submodel_element["idShort"]
+     value = []
+     se_value = submodel_element["value"]
+     value.append(se_value)
+
+     return se_type, se_semantic_id, se_semantic_id_local, se_id_short, value
+
+
+ def get_concept_description(semantic_id, df_cd):
+     cd_content = df_cd.loc[df_cd["SemanticId"] == semantic_id]
+
+     if cd_content.empty:
+         cd_content = pd.DataFrame(
+             {
+                 "SemanticId": semantic_id,
+                 "Definition": "NaN",
+                 "PreferredName": "NaN",
+                 "Datatype": "NaN",
+                 "Unit": "NaN",
+             },
+             index=[0],
+         )
+
+     cd_content = cd_content.iloc[0]
+
+     return cd_content
+
+
+ def get_values_sec(
+     df_cd,
+     content,
+     df,
+     aas_id,
+     aas_name,
+     submodel_id,
+     submodel_name,
+     submodel_semantic_id,
+ ):
+     collection_values = content[0]["value"]
+     for element in collection_values:
+         content = []
+         content.append(element)
+
+         se_type, se_semantic_id, se_semantic_id_local, se_id_short, value = get_values(
+             element
+         )
+         if se_type == "SubmodelElementCollection":
+             if se_semantic_id_local == True:
+                 cd_content = get_concept_description(se_semantic_id, df_cd)
+                 definition = cd_content["Definition"]
+                 preferred_name = cd_content["PreferredName"]
+                 datatype = cd_content["Datatype"]
+                 unit = cd_content["Unit"]
+             else:
+                 definition = "NaN"
+                 preferred_name = "NaN"
+                 datatype = "NaN"
+                 unit = "NaN"
+
+             new_row = pd.DataFrame(
+                 {
+                     "AASId": aas_id,
+                     "AASIdShort": aas_name,
+                     "SubmodelId": submodel_id,
+                     "SubmodelName": submodel_name,
+                     "SubmodelSemanticId": submodel_semantic_id,
+                     "SEContent": content,
+                     "SESemanticId": se_semantic_id,
+                     "SEModelType": se_type,
+                     "SEIdShort": se_id_short,
+                     "SEValue": value,
+                     "Definition": definition,
+                     "PreferredName": preferred_name,
+                     "Datatype": datatype,
+                     "Unit": unit,
+                 }
+             )
+             df = pd.concat([df, new_row], ignore_index=True)
+
+             content = []
+             content.append(element)
+             # Recursive call: descend until the lowest collection level is reached,
+             # so nested submodel element collections are read all the way down
+             df = get_values_sec(
+                 df_cd,
+                 content,
+                 df,
+                 aas_id,
+                 aas_name,
+                 submodel_id,
+                 submodel_name,
+                 submodel_semantic_id,
+             )
+
+         else:
+             if se_semantic_id_local == True:
+                 cd_content = get_concept_description(se_semantic_id, df_cd)
+                 definition = cd_content["Definition"]
+                 preferred_name = cd_content["PreferredName"]
+                 datatype = cd_content["Datatype"]
+                 unit = cd_content["Unit"]
+             else:
+                 definition = "NaN"
+                 preferred_name = "NaN"
+                 datatype = "NaN"
+                 unit = "NaN"
+
+             new_row = pd.DataFrame(
+                 {
+                     "AASId": aas_id,
+                     "AASIdShort": aas_name,
+                     "SubmodelId": submodel_id,
+                     "SubmodelName": submodel_name,
+                     "SubmodelSemanticId": submodel_semantic_id,
+                     "SEContent": content,
+                     "SESemanticId": se_semantic_id,
+                     "SEModelType": se_type,
+                     "SEIdShort": se_id_short,
+                     "SEValue": value,
+                     "Definition": definition,
+                     "PreferredName": preferred_name,
+                     "Datatype": datatype,
+                     "Unit": unit,
+                 }
+             )
+             df = pd.concat([df, new_row], ignore_index=True)
+
+     return df
+
+
+ def set_up_metadata(metalabel, df):
+     datatype_mapping = {
+         "boolean": "BOOLEAN",
+         "string": "STRING",
+         "string_translatable": "STRING",
+         "translatable_string": "STRING",
+         "non_translatable_string": "STRING",
+         "date": "DATE",
+         "data_time": "DATE",
+         "uri": "URI",
+         "int": "INT",
+         "int_measure": "INT",
+         "int_currency": "INT",
+         "integer": "INT",
+         "real": "REAL",
+         "real_measure": "REAL",
+         "real_currency": "REAL",
+         "enum_code": "ENUM_CODE",
+         "enum_int": "ENUM_CODE",
+         "ENUM_REAL": "ENUM_CODE",
+         "ENUM_RATIONAL": "ENUM_CODE",
+         "ENUM_BOOLEAN": "ENUM_CODE",
+         "ENUM_STRING": "ENUM_CODE",
+         "enum_reference": "ENUM_CODE",
+         "enum_instance": "ENUM_CODE",
+         "set(b1,b2)": "SET",
+         "constrained_set(b1,b2,cmn,cmx)": "SET",
+         "set [0,?]": "SET",
+         "set [1,?]": "SET",
+         "set [1, ?]": "SET",
+         "nan": "NaN",
+         "media_type": "LARGE_OBJECT_TYPE",
+     }
+
+     unit_mapping = {
+         "nan": "NaN",
+         "hertz": "FREQUENCY",
+         "hz": "FREQUENCY",
+         "pa": "PRESSURE",
+         "pascal": "PRESSURE",
+         "n/m²": "PRESSURE",
+         "bar": "PRESSURE",
+         "%": "SCALARS_PERC",
+         "w": "POWER",
+         "watt": "POWER",
+         "kw": "POWER",
+         "kg/m³": "CHEMISTRY",
+         "m²/s": "CHEMISTRY",
+         "pa*s": "CHEMISTRY",
+         "v": "ELECTRICAL",
+         "volt": "ELECTRICAL",
+         "db": "ACOUSTICS",
+         "db(a)": "ACOUSTICS",
+         "k": "TEMPERATURE",
+         "°c": "TEMPERATURE",
+         "n": "MECHANICS",
+         "newton": "MECHANICS",
+         "kg/s": "FLOW",
+         "kg/h": "FLOW",
+         "m³/s": "FLOW",
+         "m³/h": "FLOW",
+         "l/s": "FLOW",
+         "l/h": "FLOW",
+         "µm": "LENGTH",
+         "mm": "LENGTH",
+         "cm": "LENGTH",
+         "dm": "LENGTH",
+         "m": "LENGTH",
+         "meter": "LENGTH",
+         "m/s": "SPEED",
+         "km/h": "SPEED",
+         "s^(-1)": "FREQUENCY",
+         "1/s": "FREQUENCY",
+         "s": "TIME",
+         "h": "TIME",
+         "min": "TIME",
+         "d": "TIME",
+         "hours": "TIME",
+         "a": "ELECTRICAL",
+         "m³": "VOLUME",
+         "m²": "AREA",
+         "rpm": "FLOW",
+         "nm": "MECHANICS",
+         "m/m": "MECHANICS",
+         "m³/m²s": "MECHANICS",
+         "w(m²*K)": "HEAT_TRANSFER",
+         "kwh": "ELECTRICAL",
+         "kg/(s*m²)": "FLOW",
+         "kg": "MASS",
+         "w/(m*k)": "HEAT_TRANSFER",
+         "m²*k/w": "HEAT_TRANSFER",
+         "j/s": "POWER",
+     }
+
+     dataset = df
+     dataset["unit_lowercase"] = dataset["Unit"]
+     dataset["unit_lowercase"] = dataset["unit_lowercase"].str.lower()
+     dataset["unit_categ"] = dataset["unit_lowercase"].map(unit_mapping)
+
+     dataset["datatype_lowercase"] = dataset["Datatype"]
+     dataset["datatype_lowercase"] = dataset["datatype_lowercase"].str.lower()
+     dataset["datatype_categ"] = dataset["datatype_lowercase"].map(datatype_mapping)
+
+     dataset = dataset.fillna("NaN")
+     dataset["index"] = dataset.index
+
+     # uni_datatype = dataset['datatype_categ'].unique()
+     # uni_unit = dataset['unit_categ'].unique()
+     unique_labels_set = set()
+
+     dataset["Metalabel"] = ""
+     for i in range(0, len(dataset["Metalabel"])):
+         # Reverse lookup: find the metalabel key whose value matches the
+         # (unit category, datatype category) pair
+         concat = (str(dataset["unit_categ"][i]), str(dataset["datatype_categ"][i]))
+         keys = [k for k, v in metalabel.items() if v == concat]
+         dataset["Metalabel"][i] = keys[0]
+         unique_labels_set.add(keys[0])
+     unique_label = list(unique_labels_set)
+     print(unique_label)
+
+     return dataset
+
+
+ def encode(aas_df, model):
+     # Use Sentence-BERT to create the embeddings
+     aas_df["PreferredName"] = "Name: " + aas_df["PreferredName"].astype(str)
+     aas_df["Definition"] = "Description: " + aas_df["Definition"].astype(str) + "; "
+     corpus_names = aas_df.loc[:, "PreferredName"]
+     corpus_definitions = aas_df.loc[:, "Definition"]
+     embeddings_definitions = model.encode(corpus_definitions, show_progress_bar=True)
+     embeddings_names = model.encode(corpus_names, show_progress_bar=True)
+     concat_name_def_emb = np.concatenate(
+         (embeddings_definitions, embeddings_names), axis=1
+     )
+     # aas_df['EmbeddingDefinition'] = embeddings_definitions.tolist()
+     # aas_df['EmbeddingName'] = embeddings_names.tolist()
+     aas_df["EmbeddingNameDefinition"] = concat_name_def_emb.tolist()
+     return aas_df
+
+
+ def convert_to_list(aas_df):
+     # The database needs some of the columns as plain lists
+     aas_index = aas_df.index.tolist()
+     aas_index_str = [str(r) for r in aas_index]
+     se_content = aas_df["SEContent"].tolist()
+     se_embedding_name_definition = aas_df["EmbeddingNameDefinition"].tolist()
+
+     aas_df_dropped = aas_df.drop(
+         ["EmbeddingNameDefinition", "SEContent", "SEValue"], axis=1
+     )
+
+     metadata = aas_df_dropped.to_dict("records")
+
+     return metadata, aas_index_str, se_content, se_embedding_name_definition
+
+
+ def set_up_chroma(
+     metadata, aas_index_str, se_content, se_embedding_name_definition, aas_name, client
+ ):
+     # Upper-case letters are not allowed in database collection names
+     aas_name = aas_name.lower()
+     print(aas_name)
+     # client = chromadb.Client(Settings(
+     #     chroma_db_impl="duckdb+parquet",
+     #     persist_directory="./drive/My Drive/Colab/NLP/SemantischeInteroperabilität/Deployment"  # Optional, defaults to .chromadb/ in the current directory
+     # ))
+     emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
+         model_name="gart-labor/eng-distilBERT-se-eclass"
+     )
+     collection = client.get_or_create_collection(
+         name=aas_name, embedding_function=emb_fn
+     )
+
+     aas_content_string = []
+     # Convert to JSON strings so the contents can be written to the database
+     for element in se_content:
+         content = json.dumps(element)
+         aas_content_string.append(content)
+
+     items = collection.count()  # returns the number of items in the collection
+     print(collection)
+     print("Database created, number of items:")
+     print(items)
+     if items == 0:
+         # Add the SE contents, the embeddings and further metadata to the collection
+         collection.add(
+             documents=aas_content_string,
+             embeddings=se_embedding_name_definition,
+             metadatas=metadata,
+             ids=aas_index_str,
+         )
+         items = collection.count()  # returns the number of items in the collection
+         print("------------")
+         print("Database filled, number of items:")
+         print(items)
+     else:
+         print("-----------")
+         print("AAS already present")
+
+     return collection
+
+
+ def read_aas(aas, submodels, assets, conceptDescriptions, submodels_ids, metalabel):
+     df = pd.DataFrame(
+         columns=[
+             "AASId",
+             "AASIdShort",
+             "SubmodelId",
+             "SubmodelName",
+             "SubmodelSemanticId",
+             "SEContent",
+             "SESemanticId",
+             "SEModelType",
+             "SEIdShort",
+             "SEValue",
+             "Definition",
+             "PreferredName",
+             "Datatype",
+             "Unit",
+         ]
+     )
+
+     aas_id = aas[0]["identification"]["id"]
+     aas_name = aas[0]["idShort"]
+     # Prepare all concept descriptions as a pandas dataframe so they can be
+     # looked up more easily later
+     df_cd = prepare_cd(conceptDescriptions)
+     # Read the submodels
+     for submodel in submodels:
+         submodel_name = submodel["idShort"]
+         submodel_id = submodel["identification"]["id"]
+         # Necessary because the number of submodels inside the AAS and inside the
+         # environment does not always match
+         if submodel_id in submodels_ids:
+             semantic_id_existing = submodel["semanticId"]["keys"]
+             if not semantic_id_existing:
+                 submodel_semantic_id = "Not defined"
+             else:
+                 submodel_semantic_id = semantic_id_existing[0]["value"]
+             submodel_elements = submodel["submodelElements"]
+             # Read the submodel elements
+             for submodel_element in submodel_elements:
+                 content = []
+                 content.append(submodel_element)
+
+                 (
+                     se_type,
+                     se_semantic_id,
+                     se_semantic_id_local,
+                     se_id_short,
+                     value,
+                 ) = get_values(submodel_element)
+
+                 # If the concept description is local, read it
+                 if se_semantic_id_local == True:
+                     cd_content = get_concept_description(se_semantic_id, df_cd)
+                     definition = cd_content["Definition"]
+                     preferred_name = cd_content["PreferredName"]
+                     datatype = cd_content["Datatype"]
+                     unit = cd_content["Unit"]
+                 else:
+                     definition = "NaN"
+                     preferred_name = "NaN"
+                     datatype = "NaN"
+                     unit = "NaN"
+
+                 new_row = pd.DataFrame(
+                     {
+                         "AASId": aas_id,
+                         "AASIdShort": aas_name,
+                         "SubmodelId": submodel_id,
+                         "SubmodelName": submodel_name,
+                         "SubmodelSemanticId": submodel_semantic_id,
+                         "SEContent": content,
+                         "SESemanticId": se_semantic_id,
+                         "SEModelType": se_type,
+                         "SEIdShort": se_id_short,
+                         "SEValue": value,
+                         "Definition": definition,
+                         "PreferredName": preferred_name,
+                         "Datatype": datatype,
+                         "Unit": unit,
+                     }
+                 )
+                 df = pd.concat([df, new_row], ignore_index=True)
+
+                 # If the element is a submodel element collection, read its values too
+                 if se_type == "SubmodelElementCollection":
+                     df = get_values_sec(
+                         df_cd,
+                         content,
+                         df,
+                         aas_id,
+                         aas_name,
+                         submodel_id,
+                         submodel_name,
+                         submodel_semantic_id,
+                     )
+                 else:
+                     continue
+
+     df = set_up_metadata(metalabel, df)
+
+     return df, aas_name
+
+
+ def index_corpus(data, model, metalabel, client_chroma):
+     # Entry point
+     aas = data["assetAdministrationShells"]
+     aas_submodels = aas[0]["submodels"]
+     submodels_ids = []
+     for submodel in aas_submodels:
+         submodels_ids.append(submodel["keys"][0]["value"])
+     submodels = data["submodels"]
+     conceptDescriptions = data["conceptDescriptions"]
+     assets = data["assets"]
+
+     aas_df, aas_name = read_aas(
+         aas, submodels, assets, conceptDescriptions, submodels_ids, metalabel
+     )
+     # aas_df_embeddings = encode(aas_df, model)
+     aas_df = encode(aas_df, model)
+     metadata, aas_index_str, se_content, se_embedding_name_definition = convert_to_list(
+         aas_df
+     )
+     collection = set_up_chroma(
+         metadata,
+         aas_index_str,
+         se_content,
+         se_embedding_name_definition,
+         aas_name,
+         client_chroma,
+     )
+
+     return collection
+
+
+ # if __name__ == '__main__':
+ #     create_database = index_corpus(aas='festo_switch.json')
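
The commented-out entry point above hints at standalone use. Below is a minimal sketch of indexing one AAS export, assuming a local JSON file named festo_switch.json (the name comes from that comment) and an in-process Chroma client whose settings mirror the commented-out block in set_up_chroma; it relies on the 0.3-era chromadb API used throughout this commit.

import json
import pickle

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

from database_build import index_corpus

# Load an AAS environment export (file name assumed for illustration)
with open("festo_switch.json") as f:
    data = json.load(f)

# metadata.pickle maps each metalabel key to a (unit_categ, datatype_categ) pair
with open("app/metadata.pickle", "rb") as handle:
    metalabel = pickle.load(handle)

model = SentenceTransformer("gart-labor/eng-distilBERT-se-eclass")
client = chromadb.Client(
    Settings(chroma_db_impl="duckdb+parquet", persist_directory="./chromadb")
)

collection = index_corpus(data, model, metalabel, client)
print(collection.count())  # number of indexed submodel elements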
app/main.py ADDED
@@ -0,0 +1,90 @@
+ from sentence_transformers import SentenceTransformer, util
+
+ # from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
+ import time
+ import os
+ import json
+ import pandas as pd
+ import numpy as np
+ import category_encoders as ce
+ import string
+ import pickle
+ import tqdm.autonotebook
+ from fastapi import FastAPI, Request, UploadFile, File
+ from joblib import dump, load
+ from pydantic import BaseModel
+ import sys
+ from database_build import index_corpus
+ from predict_se import ask_database
+ from typing import Any, Dict, AnyStr, List, Union
+ import chromadb
+ from chromadb.config import Settings
+
+ app = FastAPI(title="Interface Semantic Matching")
+
+ JSONObject = Dict[AnyStr, Any]
+ JSONArray = List[Any]
+ JSONStructure = Union[JSONArray, JSONObject]
+
+
+ class submodelElement(BaseModel):
+     datatype: str = "NaN"
+     definition: str
+     name: str
+     semantic_id: str
+     unit: str = "NaN"
+     return_matches: int = 3
+
+
+ @app.on_event("startup")
+ def load_hf_model():
+     global model
+     # Old model
+     # model = SentenceTransformer('mboth/distil-eng-quora-sentence')
+
+     # Fine-tuned model
+     model = SentenceTransformer("gart-labor/eng-distilBERT-se-eclass")
+
+     # global model_translate
+     # model_translate = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
+     # global tokenizer_translate
+     # tokenizer_translate = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
+
+     with open("app/metadata.pickle", "rb") as handle:
+         global metalabel
+         metalabel = pickle.load(handle)
+
+     global client_chroma
+     client_chroma = chromadb.Client(
+         Settings(
+             chroma_api_impl="rest",
+             # chroma_server_host must be updated after every AWS restart
+             chroma_server_host="3.67.80.82",
+             chroma_server_http_port=8000,
+         )
+     )
+
+
+ @app.post("/PostAssetAdministrationShellEmbeddings")
+ async def index_aas(aas: UploadFile = File(...)):
+     data = json.load(aas.file)
+     print(type(data))
+     # aas = new_file
+     # aas, submodels, conceptDescriptions, assets, aas_df, collection, aas_name = index_corpus(data, model, metalabel, client_chroma)
+     collection = index_corpus(data, model, metalabel, client_chroma)
+     ready = "AAS ready"
+     return ready
+
+
+ @app.post("/GetAllSubmodelElementsBySemanticIdAndSemanticInformation")
+ def predict(
+     name: str,
+     definition: str,
+     semantic_id: str,
+     unit: str,
+     datatype: str,
+     return_matches: int,
+ ):
+     collections = client_chroma.list_collections()
+     query = {
+         "Name": name,
+         "Definition": definition,
+         "Unit": unit,
+         "Datatype": datatype,
+         "SemanticId": semantic_id,
+         "ReturnMatches": return_matches,
+     }
+     results = ask_database(query, metalabel, model, collections, client_chroma)
+
+     return results
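
Because predict declares plain scalar parameters, FastAPI exposes them as query parameters, while the upload endpoint expects a multipart file field named aas. A minimal client sketch using requests; the host, file name and all query values below are illustrative, not taken from the commit.

import requests

BASE = "http://localhost:80"  # host/port assumed from the Dockerfile

# Index an AAS: the endpoint expects a multipart file field named "aas"
with open("festo_switch.json", "rb") as f:
    r = requests.post(f"{BASE}/PostAssetAdministrationShellEmbeddings", files={"aas": f})
print(r.json())  # "AAS ready"

# Match a submodel element; the values here are purely illustrative
params = {
    "name": "MaxRotationSpeed",
    "definition": "Maximum permissible rotation speed of the motor",
    "semantic_id": "0173-1#02-BAA120#008",
    "unit": "1/min",
    "datatype": "int",
    "return_matches": 3,
}
r = requests.post(
    f"{BASE}/GetAllSubmodelElementsBySemanticIdAndSemanticInformation", params=params
)
print(r.json())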
app/metadata.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b4aee0cd2ca534e4af8023bd334db591a0a46b2a37154758aa5e3873b8d4728
+ size 1670
app/predict_se.py ADDED
@@ -0,0 +1,264 @@
+ from sentence_transformers import SentenceTransformer, util
+ import json
+ import time
+ import pandas as pd
+ import numpy as np
+ import pickle
+
+ import chromadb
+ from chromadb.config import Settings
+ from chromadb.utils import embedding_functions
+ from chromadb.db.clickhouse import NoDatapointsException
+
+
+ def query_aas(query_json, collection, model, metalabel):
+     query = json.loads(query_json)
+     name = query["Name"]
+     definition = query["Definition"]
+     unit = query["Unit"]
+     datatype = query["Datatype"]
+     semantic_id = query["SemanticId"]
+     return_matches = query["ReturnMatches"]
+
+     # model = SentenceTransformer("gart-labor/eng-distilBERT-se-eclass")
+
+     datatype_mapping = {
+         "boolean": "BOOLEAN",
+         "string": "STRING",
+         "string_translatable": "STRING",
+         "translatable_string": "STRING",
+         "non_translatable_string": "STRING",
+         "date": "DATE",
+         "data_time": "DATE",
+         "uri": "URI",
+         "int": "INT",
+         "int_measure": "INT",
+         "int_currency": "INT",
+         "integer": "INT",
+         "real": "REAL",
+         "real_measure": "REAL",
+         "real_currency": "REAL",
+         "enum_code": "ENUM_CODE",
+         "enum_int": "ENUM_CODE",
+         "ENUM_REAL": "ENUM_CODE",
+         "ENUM_RATIONAL": "ENUM_CODE",
+         "ENUM_BOOLEAN": "ENUM_CODE",
+         "ENUM_STRING": "ENUM_CODE",
+         "enum_reference": "ENUM_CODE",
+         "enum_instance": "ENUM_CODE",
+         "set(b1,b2)": "SET",
+         "constrained_set(b1,b2,cmn,cmx)": "SET",
+         "set [0,?]": "SET",
+         "set [1,?]": "SET",
+         "set [1, ?]": "SET",
+         "nan": "NaN",
+         "media_type": "LARGE_OBJECT_TYPE",
+     }
+
+     unit_mapping = {
+         "nan": "NaN",
+         "hertz": "FREQUENCY",
+         "hz": "FREQUENCY",
+         "pa": "PRESSURE",
+         "pascal": "PRESSURE",
+         "n/m²": "PRESSURE",
+         "bar": "PRESSURE",
+         "%": "SCALARS_PERC",
+         "w": "POWER",
+         "watt": "POWER",
+         "kw": "POWER",
+         "kg/m³": "CHEMISTRY",
+         "m²/s": "CHEMISTRY",
+         "pa*s": "CHEMISTRY",
+         "v": "ELECTRICAL",
+         "volt": "ELECTRICAL",
+         "db": "ACOUSTICS",
+         "db(a)": "ACOUSTICS",
+         "k": "TEMPERATURE",
+         "°c": "TEMPERATURE",
+         "n": "MECHANICS",
+         "newton": "MECHANICS",
+         "kg/s": "FLOW",
+         "kg/h": "FLOW",
+         "m³/s": "FLOW",
+         "m³/h": "FLOW",
+         "l/s": "FLOW",
+         "l/h": "FLOW",
+         "µm": "LENGTH",
+         "mm": "LENGTH",
+         "cm": "LENGTH",
+         "dm": "LENGTH",
+         "m": "LENGTH",
+         "meter": "LENGTH",
+         "m/s": "SPEED",
+         "km/h": "SPEED",
+         "s^(-1)": "FREQUENCY",
+         "1/s": "FREQUENCY",
+         "s": "TIME",
+         "h": "TIME",
+         "min": "TIME",
+         "d": "TIME",
+         "hours": "TIME",
+         "a": "ELECTRICAL",
+         "m³": "VOLUME",
+         "m²": "AREA",
+         "rpm": "FLOW",
+         "nm": "MECHANICS",
+         "m/m": "MECHANICS",
+         "m³/m²s": "MECHANICS",
+         "w(m²*K)": "HEAT_TRANSFER",
+         "kwh": "ELECTRICAL",
+         "kg/(s*m²)": "FLOW",
+         "kg": "MASS",
+         "w/(m*k)": "HEAT_TRANSFER",
+         "m²*k/w": "HEAT_TRANSFER",
+         "j/s": "POWER",
+     }
+
+     # with open(
+     #     "./drive/My Drive/Colab/NLP/SemantischeInteroperabilität/Deployment/metadata.pickle",
+     #     "rb",
+     # ) as handle:
+     #     metalabel = pickle.load(handle)
+
+     unit_lower = unit.lower()
+     datatype_lower = datatype.lower()
+
+     unit_categ = unit_mapping.get(unit_lower)
+     datatype_categ = datatype_mapping.get(datatype_lower)
+
+     if unit_categ is None:
+         unit_categ = "NaN"
+     if datatype_categ is None:
+         datatype_categ = "NaN"
+
+     concat = (unit_categ, datatype_categ)
+     keys = [k for k, v in metalabel.items() if v == concat]
+     metadata = keys[0]
+
+     name_embedding = model.encode(name)
+     definition_embedding = model.encode(definition)
+     concat_name_def_query = np.concatenate(
+         (definition_embedding, name_embedding), axis=0
+     )
+     concat_name_def_query = concat_name_def_query.tolist()
+
+     queries = [concat_name_def_query]
+     print(type(queries))
+
+     # The query runs as a semantic search (k-nearest-neighbour).
+     # Chroma uses hnswlib for this: https://github.com/nmslib/hnswlib
+     # There, cosine, squared L2 or inner product can be configured as the distance.
+     # Chroma is set to L2, cf. https://github.com/chroma-core/chroma/blob/4463d13f951a4d28ade1f7e777d07302ff09069b/chromadb/db/index/hnswlib.py -> search for l2
+
+     # Homogeneous case: filter on the semantic ID; if a match is found, the
+     # homogeneous matching succeeds
+     try:
+         homogen = collection.query(
+             query_embeddings=queries, n_results=1, where={"SESemanticId": semantic_id}
+         )
+     # except NoDatapointsException:
+     #     homogen = 'Nix'
+     except Exception:
+         homogen = "Nix"
+
+     if homogen != "Nix":
+         result = homogen
+         result["matching_method"] = "Semantic equivalent, same semantic Id"
+         result["matching_algorithm"] = "None"
+         result["distances"] = [[0]]
+
+         final_result = {
+             "matching_method": result["matching_method"],
+             "matching_algorithm": result["matching_algorithm"],
+             "matching_distance": result["distances"][0][0],
+             "aas_id": result["metadatas"][0][0]["AASId"],
+             "aas_id_short": result["metadatas"][0][0]["AASIdShort"],
+             "submodel_id_short": result["metadatas"][0][0]["SubmodelName"],
+             "submodel_id": result["metadatas"][0][0]["SubmodelId"],
+             "matched_object": result["documents"][0][0],
+         }
+         final_results = [final_result]
+     # If no matching semantic ID is found, continue with NLP, with and without
+     # metadata
+     elif homogen == "Nix":
+         try:
+             with_metadata = collection.query(
+                 query_embeddings=queries,
+                 n_results=return_matches,
+                 where={"Metalabel": metadata},
+             )
+         # except NoDatapointsException:
+         #     with_metadata = 'Nix'
+         except Exception:
+             with_metadata = "Nix"
+
+         without_metadata = collection.query(
+             query_embeddings=queries,
+             n_results=return_matches,
+         )
+
+         if with_metadata == "Nix":
+             result = without_metadata
+             result[
+                 "matching_method"
+             ] = "Semantically not equivalent, NLP without Metadata"
+             result[
+                 "matching_algorithm"
+             ] = "Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
+
+         elif with_metadata != "Nix":
+             distance_with_meta = with_metadata["distances"][0][0]
+             distance_without_meta = without_metadata["distances"][0][0]
+             print(distance_with_meta)
+             print(distance_without_meta)
+             # Compare the best distances with and without metadata
+             if distance_without_meta <= distance_with_meta:
+                 result = without_metadata
+                 result[
+                     "matching_method"
+                 ] = "Semantically not equivalent, NLP without Metadata"
+                 result[
+                     "matching_algorithm"
+                 ] = "Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
+             else:
+                 result = with_metadata
+                 result[
+                     "matching_method"
+                 ] = "Semantically not equivalent, NLP with Metadata"
+                 result[
+                     "matching_algorithm"
+                 ] = "Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
+         # Assemble the final results
+         final_results = []
+         for i in range(0, return_matches):
+             value = result["documents"][0][i]
+             value_dict = json.loads(value)
+             final_result = {
+                 "matching_method": result["matching_method"],
+                 "matching_algorithm": result["matching_algorithm"],
+                 "matching_distance": result["distances"][0][i],
+                 "aas_id": result["metadatas"][0][i]["AASId"],
+                 "aas_id_short": result["metadatas"][0][i]["AASIdShort"],
+                 "submodel_id_short": result["metadatas"][0][i]["SubmodelName"],
+                 "submodel_id": result["metadatas"][0][i]["SubmodelId"],
+                 # "matched_object": result['documents'][0][i]
+                 "matched_object": value_dict,
+             }
+             final_results.append(final_result)
+     return final_results
+
+
+ def ask_database(query, metalabel, model, collections, client_chroma):
+     # Query all AAS (collections) one after another
+     json_query = json.dumps(query, indent=4)
+     results = []
+     for collection in collections:
+         print(collection.name)
+         collection = client_chroma.get_collection(collection.name)
+         result = query_aas(json_query, collection, model, metalabel)
+         results.append(result)
+     # results_json = json.dumps(results)
+     return results
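
Both query_aas and set_up_metadata resolve the metalabel by a reverse lookup over the pickled dictionary, which therefore has to map each label key to a (unit_categ, datatype_categ) tuple. A small sketch of that lookup, with invented example entries (the real mapping ships in app/metadata.pickle):

# Invented example entries for illustration only
metalabel = {
    0: ("NaN", "NaN"),
    7: ("FREQUENCY", "INT"),
    8: ("PRESSURE", "REAL"),
}

unit_categ, datatype_categ = "FREQUENCY", "INT"
concat = (unit_categ, datatype_categ)

# Reverse lookup: find the key whose value equals the category pair
keys = [k for k, v in metalabel.items() if v == concat]
print(keys[0])  # -> 7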
requirements.txt ADDED
Binary file (1.41 kB).