Carlos Salgado committed on
Commit
16c1bbd
1 Parent(s): f3f79d9

add notebooks, rename and polish generate_metadata.py

Browse files
backend/generate_metadata.py CHANGED
@@ -9,29 +9,8 @@ from langchain_community.document_loaders import UnstructuredPDFLoader
9
  from langchain_community.embeddings.fake import FakeEmbeddings
10
  from langchain_text_splitters import RecursiveCharacterTextSplitter
11
 
12
- from langchain_community.vectorstores import Vectara
13
-
14
- from backend.schema import Metadata, BimDiscipline
15
-
16
  load_dotenv()
17
 
18
- vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']
19
- vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']
20
- vectara_api_key = os.environ['VECTARA_API_KEY']
21
-
22
- vectorstore = Vectara(vectara_customer_id=vectara_customer_id,
23
- vectara_corpus_id=vectara_corpus_id,
24
- vectara_api_key=vectara_api_key)
25
-
26
- prompt_template = """
27
- BimDiscipline = ['plumbing', 'network', 'heating', 'electrical', 'ventilation', 'architecture']
28
-
29
- You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the title, a brief one-sentence summary, and the discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
30
-
31
- Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline. Document:
32
- context="
33
- """
34
-
35
 
36
  def ingest(file_path):
37
  extension = file_path.split('.')[-1]
@@ -63,18 +42,29 @@ def ingest(file_path):
63
  return docs
64
 
65
 
 
 
 
66
 
67
- def extract_metadata(docs):
 
 
 
 
68
  # plain text
 
69
  context = "".join(
70
  [doc.page_content.replace('\n\n','').replace('..','') for doc in docs])
71
 
72
- prompt = f'{prompt_template}{context}"'
73
 
 
 
74
  # Create client
75
  client = openai.OpenAI(
76
  base_url="https://api.together.xyz/v1",
77
  api_key=os.environ["TOGETHER_API_KEY"],
 
78
  )
79
 
80
  # Call the LLM with the JSON schema
@@ -91,8 +81,8 @@ def extract_metadata(docs):
91
  }
92
  ]
93
  )
94
- # returns a dictionary
95
- return json.loads(chat_completion.choices[0].message.content)
96
 
97
 
98
  if __name__ == "__main__":
@@ -107,5 +97,5 @@ if __name__ == "__main__":
107
  sys.exit(-1)
108
 
109
  docs = ingest(args.document)
110
- metadata = extract_metadata(docs)
111
  print(metadata)
 
9
  from langchain_community.embeddings.fake import FakeEmbeddings
10
  from langchain_text_splitters import RecursiveCharacterTextSplitter
11
 
 
 
 
 
12
  load_dotenv()
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def ingest(file_path):
16
  extension = file_path.split('.')[-1]
 
42
  return docs
43
 
44
 
45
+ def generate_metadata(docs):
46
+ prompt_template = """
47
+ BimDiscipline = ['plumbing', 'network', 'heating', 'electrical', 'ventilation', 'architecture']
48
 
49
+ You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the filename, a short description, and the engineering discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
50
+
51
+ Analyze the provided document, which could be in either German or English. Extract the filename, its description, and infer the engineering discipline it belongs to. Document:
52
+ context="
53
+ """
54
  # plain text
55
+ filepath = [doc.metadata for doc in docs][0]['source']
56
  context = "".join(
57
  [doc.page_content.replace('\n\n','').replace('..','') for doc in docs])
58
 
59
+ prompt = f'{prompt_template}{context}"\nFilepath:{filepath}'
60
 
61
+ #print(prompt)
62
+
63
  # Create client
64
  client = openai.OpenAI(
65
  base_url="https://api.together.xyz/v1",
66
  api_key=os.environ["TOGETHER_API_KEY"],
67
+ #api_key=userdata.get('TOGETHER_API_KEY'),
68
  )
69
 
70
  # Call the LLM with the JSON schema
 
81
  }
82
  ]
83
  )
84
+
85
+ return json.loads(chat_completion.choices[0].message.content)
86
 
87
 
88
  if __name__ == "__main__":
 
97
  sys.exit(-1)
98
 
99
  docs = ingest(args.document)
100
+ metadata = generate_metadata(docs)
101
  print(metadata)
notebooks/preprocess_dataset.ipynb ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 106,
6
+ "metadata": {
7
+ "id": "f-ERaM64ONeC"
8
+ },
9
+ "outputs": [],
10
+ "source": [
11
+ "# preprocess csv\n",
12
+ "import pandas as pd\n",
13
+ "filename = '/content/U3_Metadaten.csv'\n",
14
+ "df = pd.read_csv(filename, on_bad_lines='skip')"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 118,
20
+ "metadata": {
21
+ "colab": {
22
+ "base_uri": "https://localhost:8080/",
23
+ "height": 424
24
+ },
25
+ "id": "AYxRURTvQiFb",
26
+ "outputId": "18bf4139-47ac-4939-e635-9f09f560200c"
27
+ },
28
+ "outputs": [
29
+ {
30
+ "data": {
31
+ "application/vnd.google.colaboratory.intrinsic+json": {
32
+ "summary": "{\n \"name\": \"clean_df\",\n \"rows\": 158,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 158,\n \"samples\": [\n \"ISB-020-U3-W-R-01-B17012-028-000\",\n \"ISB-020-U3-W-L-01-B15100-018-000\",\n \"ISB-020-U3-W-R-01-B17012-034-000\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Beschreibung\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 40,\n \"samples\": [\n \"Foto\",\n \"Bodenheizung / Ventileinstellung / FBH AB PM\",\n \"Foto - Novocon S demontiert und Stellenantriebe montiert!\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Disziplin\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"D - Datennetz\",\n \"E - Elektroanlagen\",\n \"S - Sanitaer\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
33
+ "type": "dataframe",
34
+ "variable_name": "clean_df"
35
+ },
36
+ "text/html": [
37
+ "\n",
38
+ " <div id=\"df-3f4ad131-d55b-46a5-8dff-6fa3e12c15b0\" class=\"colab-df-container\">\n",
39
+ " <div>\n",
40
+ "<style scoped>\n",
41
+ " .dataframe tbody tr th:only-of-type {\n",
42
+ " vertical-align: middle;\n",
43
+ " }\n",
44
+ "\n",
45
+ " .dataframe tbody tr th {\n",
46
+ " vertical-align: top;\n",
47
+ " }\n",
48
+ "\n",
49
+ " .dataframe thead th {\n",
50
+ " text-align: right;\n",
51
+ " }\n",
52
+ "</style>\n",
53
+ "<table border=\"1\" class=\"dataframe\">\n",
54
+ " <thead>\n",
55
+ " <tr style=\"text-align: right;\">\n",
56
+ " <th></th>\n",
57
+ " <th>Name</th>\n",
58
+ " <th>Beschreibung</th>\n",
59
+ " <th>Disziplin</th>\n",
60
+ " </tr>\n",
61
+ " </thead>\n",
62
+ " <tbody>\n",
63
+ " <tr>\n",
64
+ " <th>0</th>\n",
65
+ " <td>ISB-020-U3-W-D-01-B07005-001-000</td>\n",
66
+ " <td>Bauarten und Stuecknachweis SGK</td>\n",
67
+ " <td>D - Datennetz</td>\n",
68
+ " </tr>\n",
69
+ " <tr>\n",
70
+ " <th>1</th>\n",
71
+ " <td>ISB-020-U3-W-D-01-B07005-002-000</td>\n",
72
+ " <td>Bauarten und Stuecknachweis SGK</td>\n",
73
+ " <td>D - Datennetz</td>\n",
74
+ " </tr>\n",
75
+ " <tr>\n",
76
+ " <th>2</th>\n",
77
+ " <td>ISB-020-U3-W-D-01-B07005-003-000</td>\n",
78
+ " <td>Pruefprotokoll nach DIN EN 61439-1/3</td>\n",
79
+ " <td>D - Datennetz</td>\n",
80
+ " </tr>\n",
81
+ " <tr>\n",
82
+ " <th>3</th>\n",
83
+ " <td>ISB-020-U3-W-D-01-B07005-004-000</td>\n",
84
+ " <td>Pruefprotokoll nach DIN EN 61439-1/3</td>\n",
85
+ " <td>D - Datennetz</td>\n",
86
+ " </tr>\n",
87
+ " <tr>\n",
88
+ " <th>4</th>\n",
89
+ " <td>ISB-020-U3-W-D-01-B18012-001-000</td>\n",
90
+ " <td>Sicherungslegende G-020 U3 779-AS 1</td>\n",
91
+ " <td>D - Datennetz</td>\n",
92
+ " </tr>\n",
93
+ " <tr>\n",
94
+ " <th>...</th>\n",
95
+ " <td>...</td>\n",
96
+ " <td>...</td>\n",
97
+ " <td>...</td>\n",
98
+ " </tr>\n",
99
+ " <tr>\n",
100
+ " <th>153</th>\n",
101
+ " <td>ISB-020-U3-W-S-01-B17012-008-000</td>\n",
102
+ " <td>Foto</td>\n",
103
+ " <td>S - Sanitaer</td>\n",
104
+ " </tr>\n",
105
+ " <tr>\n",
106
+ " <th>159</th>\n",
107
+ " <td>ISB-020-U3-W-S-01-B17012-010-000</td>\n",
108
+ " <td>Foto</td>\n",
109
+ " <td>S - Sanitaer</td>\n",
110
+ " </tr>\n",
111
+ " <tr>\n",
112
+ " <th>160</th>\n",
113
+ " <td>ISB-020-U3-W-S-01-B17012-011-000</td>\n",
114
+ " <td>Foto</td>\n",
115
+ " <td>S - Sanitaer</td>\n",
116
+ " </tr>\n",
117
+ " <tr>\n",
118
+ " <th>161</th>\n",
119
+ " <td>ISB-020-U3-W-S-01-B18003-001-020</td>\n",
120
+ " <td>Schieber / Hawle / Schieber 4000 + Handrad 780...</td>\n",
121
+ " <td>S - Sanitaer</td>\n",
122
+ " </tr>\n",
123
+ " <tr>\n",
124
+ " <th>162</th>\n",
125
+ " <td>ISB-020-U3-W-S-01-B19009-001-020</td>\n",
126
+ " <td>Schieber / Hawle / 4000 Schutzraum</td>\n",
127
+ " <td>S - Sanitaer</td>\n",
128
+ " </tr>\n",
129
+ " </tbody>\n",
130
+ "</table>\n",
131
+ "<p>158 rows × 3 columns</p>\n",
132
+ "</div>\n",
133
+ " <div class=\"colab-df-buttons\">\n",
134
+ "\n",
135
+ " <div class=\"colab-df-container\">\n",
136
+ " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-3f4ad131-d55b-46a5-8dff-6fa3e12c15b0')\"\n",
137
+ " title=\"Convert this dataframe to an interactive table.\"\n",
138
+ " style=\"display:none;\">\n",
139
+ "\n",
140
+ " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
141
+ " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
142
+ " </svg>\n",
143
+ " </button>\n",
144
+ "\n",
145
+ " <style>\n",
146
+ " .colab-df-container {\n",
147
+ " display:flex;\n",
148
+ " gap: 12px;\n",
149
+ " }\n",
150
+ "\n",
151
+ " .colab-df-convert {\n",
152
+ " background-color: #E8F0FE;\n",
153
+ " border: none;\n",
154
+ " border-radius: 50%;\n",
155
+ " cursor: pointer;\n",
156
+ " display: none;\n",
157
+ " fill: #1967D2;\n",
158
+ " height: 32px;\n",
159
+ " padding: 0 0 0 0;\n",
160
+ " width: 32px;\n",
161
+ " }\n",
162
+ "\n",
163
+ " .colab-df-convert:hover {\n",
164
+ " background-color: #E2EBFA;\n",
165
+ " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
166
+ " fill: #174EA6;\n",
167
+ " }\n",
168
+ "\n",
169
+ " .colab-df-buttons div {\n",
170
+ " margin-bottom: 4px;\n",
171
+ " }\n",
172
+ "\n",
173
+ " [theme=dark] .colab-df-convert {\n",
174
+ " background-color: #3B4455;\n",
175
+ " fill: #D2E3FC;\n",
176
+ " }\n",
177
+ "\n",
178
+ " [theme=dark] .colab-df-convert:hover {\n",
179
+ " background-color: #434B5C;\n",
180
+ " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
181
+ " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
182
+ " fill: #FFFFFF;\n",
183
+ " }\n",
184
+ " </style>\n",
185
+ "\n",
186
+ " <script>\n",
187
+ " const buttonEl =\n",
188
+ " document.querySelector('#df-3f4ad131-d55b-46a5-8dff-6fa3e12c15b0 button.colab-df-convert');\n",
189
+ " buttonEl.style.display =\n",
190
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
191
+ "\n",
192
+ " async function convertToInteractive(key) {\n",
193
+ " const element = document.querySelector('#df-3f4ad131-d55b-46a5-8dff-6fa3e12c15b0');\n",
194
+ " const dataTable =\n",
195
+ " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
196
+ " [key], {});\n",
197
+ " if (!dataTable) return;\n",
198
+ "\n",
199
+ " const docLinkHtml = 'Like what you see? Visit the ' +\n",
200
+ " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
201
+ " + ' to learn more about interactive tables.';\n",
202
+ " element.innerHTML = '';\n",
203
+ " dataTable['output_type'] = 'display_data';\n",
204
+ " await google.colab.output.renderOutput(dataTable, element);\n",
205
+ " const docLink = document.createElement('div');\n",
206
+ " docLink.innerHTML = docLinkHtml;\n",
207
+ " element.appendChild(docLink);\n",
208
+ " }\n",
209
+ " </script>\n",
210
+ " </div>\n",
211
+ "\n",
212
+ "\n",
213
+ "<div id=\"df-518b8ddb-11a0-49a2-8903-71e4063ca189\">\n",
214
+ " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-518b8ddb-11a0-49a2-8903-71e4063ca189')\"\n",
215
+ " title=\"Suggest charts\"\n",
216
+ " style=\"display:none;\">\n",
217
+ "\n",
218
+ "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
219
+ " width=\"24px\">\n",
220
+ " <g>\n",
221
+ " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
222
+ " </g>\n",
223
+ "</svg>\n",
224
+ " </button>\n",
225
+ "\n",
226
+ "<style>\n",
227
+ " .colab-df-quickchart {\n",
228
+ " --bg-color: #E8F0FE;\n",
229
+ " --fill-color: #1967D2;\n",
230
+ " --hover-bg-color: #E2EBFA;\n",
231
+ " --hover-fill-color: #174EA6;\n",
232
+ " --disabled-fill-color: #AAA;\n",
233
+ " --disabled-bg-color: #DDD;\n",
234
+ " }\n",
235
+ "\n",
236
+ " [theme=dark] .colab-df-quickchart {\n",
237
+ " --bg-color: #3B4455;\n",
238
+ " --fill-color: #D2E3FC;\n",
239
+ " --hover-bg-color: #434B5C;\n",
240
+ " --hover-fill-color: #FFFFFF;\n",
241
+ " --disabled-bg-color: #3B4455;\n",
242
+ " --disabled-fill-color: #666;\n",
243
+ " }\n",
244
+ "\n",
245
+ " .colab-df-quickchart {\n",
246
+ " background-color: var(--bg-color);\n",
247
+ " border: none;\n",
248
+ " border-radius: 50%;\n",
249
+ " cursor: pointer;\n",
250
+ " display: none;\n",
251
+ " fill: var(--fill-color);\n",
252
+ " height: 32px;\n",
253
+ " padding: 0;\n",
254
+ " width: 32px;\n",
255
+ " }\n",
256
+ "\n",
257
+ " .colab-df-quickchart:hover {\n",
258
+ " background-color: var(--hover-bg-color);\n",
259
+ " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
260
+ " fill: var(--button-hover-fill-color);\n",
261
+ " }\n",
262
+ "\n",
263
+ " .colab-df-quickchart-complete:disabled,\n",
264
+ " .colab-df-quickchart-complete:disabled:hover {\n",
265
+ " background-color: var(--disabled-bg-color);\n",
266
+ " fill: var(--disabled-fill-color);\n",
267
+ " box-shadow: none;\n",
268
+ " }\n",
269
+ "\n",
270
+ " .colab-df-spinner {\n",
271
+ " border: 2px solid var(--fill-color);\n",
272
+ " border-color: transparent;\n",
273
+ " border-bottom-color: var(--fill-color);\n",
274
+ " animation:\n",
275
+ " spin 1s steps(1) infinite;\n",
276
+ " }\n",
277
+ "\n",
278
+ " @keyframes spin {\n",
279
+ " 0% {\n",
280
+ " border-color: transparent;\n",
281
+ " border-bottom-color: var(--fill-color);\n",
282
+ " border-left-color: var(--fill-color);\n",
283
+ " }\n",
284
+ " 20% {\n",
285
+ " border-color: transparent;\n",
286
+ " border-left-color: var(--fill-color);\n",
287
+ " border-top-color: var(--fill-color);\n",
288
+ " }\n",
289
+ " 30% {\n",
290
+ " border-color: transparent;\n",
291
+ " border-left-color: var(--fill-color);\n",
292
+ " border-top-color: var(--fill-color);\n",
293
+ " border-right-color: var(--fill-color);\n",
294
+ " }\n",
295
+ " 40% {\n",
296
+ " border-color: transparent;\n",
297
+ " border-right-color: var(--fill-color);\n",
298
+ " border-top-color: var(--fill-color);\n",
299
+ " }\n",
300
+ " 60% {\n",
301
+ " border-color: transparent;\n",
302
+ " border-right-color: var(--fill-color);\n",
303
+ " }\n",
304
+ " 80% {\n",
305
+ " border-color: transparent;\n",
306
+ " border-right-color: var(--fill-color);\n",
307
+ " border-bottom-color: var(--fill-color);\n",
308
+ " }\n",
309
+ " 90% {\n",
310
+ " border-color: transparent;\n",
311
+ " border-bottom-color: var(--fill-color);\n",
312
+ " }\n",
313
+ " }\n",
314
+ "</style>\n",
315
+ "\n",
316
+ " <script>\n",
317
+ " async function quickchart(key) {\n",
318
+ " const quickchartButtonEl =\n",
319
+ " document.querySelector('#' + key + ' button');\n",
320
+ " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
321
+ " quickchartButtonEl.classList.add('colab-df-spinner');\n",
322
+ " try {\n",
323
+ " const charts = await google.colab.kernel.invokeFunction(\n",
324
+ " 'suggestCharts', [key], {});\n",
325
+ " } catch (error) {\n",
326
+ " console.error('Error during call to suggestCharts:', error);\n",
327
+ " }\n",
328
+ " quickchartButtonEl.classList.remove('colab-df-spinner');\n",
329
+ " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
330
+ " }\n",
331
+ " (() => {\n",
332
+ " let quickchartButtonEl =\n",
333
+ " document.querySelector('#df-518b8ddb-11a0-49a2-8903-71e4063ca189 button');\n",
334
+ " quickchartButtonEl.style.display =\n",
335
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
336
+ " })();\n",
337
+ " </script>\n",
338
+ "</div>\n",
339
+ "\n",
340
+ " <div id=\"id_5f410c26-0cce-4d03-86e0-353ac70a1d74\">\n",
341
+ " <style>\n",
342
+ " .colab-df-generate {\n",
343
+ " background-color: #E8F0FE;\n",
344
+ " border: none;\n",
345
+ " border-radius: 50%;\n",
346
+ " cursor: pointer;\n",
347
+ " display: none;\n",
348
+ " fill: #1967D2;\n",
349
+ " height: 32px;\n",
350
+ " padding: 0 0 0 0;\n",
351
+ " width: 32px;\n",
352
+ " }\n",
353
+ "\n",
354
+ " .colab-df-generate:hover {\n",
355
+ " background-color: #E2EBFA;\n",
356
+ " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
357
+ " fill: #174EA6;\n",
358
+ " }\n",
359
+ "\n",
360
+ " [theme=dark] .colab-df-generate {\n",
361
+ " background-color: #3B4455;\n",
362
+ " fill: #D2E3FC;\n",
363
+ " }\n",
364
+ "\n",
365
+ " [theme=dark] .colab-df-generate:hover {\n",
366
+ " background-color: #434B5C;\n",
367
+ " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
368
+ " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
369
+ " fill: #FFFFFF;\n",
370
+ " }\n",
371
+ " </style>\n",
372
+ " <button class=\"colab-df-generate\" onclick=\"generateWithVariable('clean_df')\"\n",
373
+ " title=\"Generate code using this dataframe.\"\n",
374
+ " style=\"display:none;\">\n",
375
+ "\n",
376
+ " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
377
+ " width=\"24px\">\n",
378
+ " <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n",
379
+ " </svg>\n",
380
+ " </button>\n",
381
+ " <script>\n",
382
+ " (() => {\n",
383
+ " const buttonEl =\n",
384
+ " document.querySelector('#id_5f410c26-0cce-4d03-86e0-353ac70a1d74 button.colab-df-generate');\n",
385
+ " buttonEl.style.display =\n",
386
+ " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
387
+ "\n",
388
+ " buttonEl.onclick = () => {\n",
389
+ " google.colab.notebook.generateWithVariable('clean_df');\n",
390
+ " }\n",
391
+ " })();\n",
392
+ " </script>\n",
393
+ " </div>\n",
394
+ "\n",
395
+ " </div>\n",
396
+ " </div>\n"
397
+ ],
398
+ "text/plain": [
399
+ " Name \\\n",
400
+ "0 ISB-020-U3-W-D-01-B07005-001-000 \n",
401
+ "1 ISB-020-U3-W-D-01-B07005-002-000 \n",
402
+ "2 ISB-020-U3-W-D-01-B07005-003-000 \n",
403
+ "3 ISB-020-U3-W-D-01-B07005-004-000 \n",
404
+ "4 ISB-020-U3-W-D-01-B18012-001-000 \n",
405
+ ".. ... \n",
406
+ "153 ISB-020-U3-W-S-01-B17012-008-000 \n",
407
+ "159 ISB-020-U3-W-S-01-B17012-010-000 \n",
408
+ "160 ISB-020-U3-W-S-01-B17012-011-000 \n",
409
+ "161 ISB-020-U3-W-S-01-B18003-001-020 \n",
410
+ "162 ISB-020-U3-W-S-01-B19009-001-020 \n",
411
+ "\n",
412
+ " Beschreibung Disziplin \n",
413
+ "0 Bauarten und Stuecknachweis SGK D - Datennetz \n",
414
+ "1 Bauarten und Stuecknachweis SGK D - Datennetz \n",
415
+ "2 Pruefprotokoll nach DIN EN 61439-1/3 D - Datennetz \n",
416
+ "3 Pruefprotokoll nach DIN EN 61439-1/3 D - Datennetz \n",
417
+ "4 Sicherungslegende G-020 U3 779-AS 1 D - Datennetz \n",
418
+ ".. ... ... \n",
419
+ "153 Foto S - Sanitaer \n",
420
+ "159 Foto S - Sanitaer \n",
421
+ "160 Foto S - Sanitaer \n",
422
+ "161 Schieber / Hawle / Schieber 4000 + Handrad 780... S - Sanitaer \n",
423
+ "162 Schieber / Hawle / 4000 Schutzraum S - Sanitaer \n",
424
+ "\n",
425
+ "[158 rows x 3 columns]"
426
+ ]
427
+ },
428
+ "execution_count": 118,
429
+ "metadata": {},
430
+ "output_type": "execute_result"
431
+ }
432
+ ],
433
+ "source": [
434
+ "# drop all columns except name, description, discipline\n",
435
+ "features = ['Name', 'Beschreibung', 'Disziplin']\n",
436
+ "# Remove rows with NaN values\n",
437
+ "clean_df = df[features].dropna()\n",
438
+ "clean_df"
439
+ ]
440
+ },
441
+ {
442
+ "cell_type": "code",
443
+ "execution_count": 143,
444
+ "metadata": {
445
+ "id": "_PtvbAskQa72"
446
+ },
447
+ "outputs": [],
448
+ "source": [
449
+ "clean_df.to_csv('name-description-discipline-data.csv')"
450
+ ]
451
+ }
452
+ ],
453
+ "metadata": {
454
+ "colab": {
455
+ "provenance": []
456
+ },
457
+ "kernelspec": {
458
+ "display_name": "Python 3",
459
+ "name": "python3"
460
+ },
461
+ "language_info": {
462
+ "name": "python"
463
+ }
464
+ },
465
+ "nbformat": 4,
466
+ "nbformat_minor": 0
467
+ }
notebooks/vectarize.ipynb ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/plain": [
11
+ "True"
12
+ ]
13
+ },
14
+ "execution_count": 1,
15
+ "metadata": {},
16
+ "output_type": "execute_result"
17
+ }
18
+ ],
19
+ "source": [
20
+ "import os \n",
21
+ "from dotenv import load_dotenv\n",
22
+ "\n",
23
+ "from langchain_community.document_loaders.csv_loader import CSVLoader\n",
24
+ "\n",
25
+ "from langchain_community.vectorstores import Vectara\n",
26
+ "load_dotenv()"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 2,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "loader = CSVLoader(file_path='/home/salgadev/code/DocVerifyRAG/name-description-discipline-data.csv')\n",
36
+ "data = loader.load()\n",
37
+ "\n",
38
+ "vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']\n",
39
+ "vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']\n",
40
+ "vectara_api_key = os.environ['VECTARA_API_KEY']\n",
41
+ "#hf_token = os.environ['HF_API_TOKEN']\n",
42
+ "\n",
43
+ "vectorstore = Vectara(vectara_customer_id=vectara_customer_id,\n",
44
+ " vectara_corpus_id=vectara_corpus_id,\n",
45
+ " vectara_api_key=vectara_api_key)"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 3,
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
55
+ "embeddings = HuggingFaceEmbeddings(model_name=\"intfloat/multilingual-e5-large\")"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 4,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "vectara = Vectara.from_documents(data, embedding=embeddings)"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 5,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "from langchain.chains.qa_with_sources import load_qa_with_sources_chain\n",
74
+ "\n"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 7,
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "summary_config = {\"is_enabled\": True, \"max_results\": 5, \"response_lang\": \"eng\"}\n",
84
+ "retriever = vectara.as_retriever(\n",
85
+ " search_kwargs={\"k\": 3, \"summary_config\": summary_config}\n",
86
+ ")"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 8,
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "def get_sources(documents):\n",
96
+ " return documents[:-1]\n",
97
+ "\n",
98
+ "\n",
99
+ "def get_summary(documents):\n",
100
+ " return documents[-1].page_content"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 9,
106
+ "metadata": {},
107
+ "outputs": [
108
+ {
109
+ "data": {
110
+ "text/plain": [
111
+ "'The documents related to the electrical discipline include items like ISB-020-U3-W-E-01-B07005-002-020, which pertains to U3 740KV 2 USV, and ISB-020-U3-W-E-01-B07005-002-040 for U3 780KV 4 equipment. These documents are part of the E - Elektroanlagen discipline, focusing on electrical systems and installations [7][11]. Additionally, there are documents specifying different aspects such as AS 1_G010, AS 2_G011, and AS 1_G009, highlighting specific details within the electrical discipline documentation [7][11]. These documents are crucial for ensuring proper electrical planning, design, and implementation within various systems and structures.'"
112
+ ]
113
+ },
114
+ "execution_count": 9,
115
+ "metadata": {},
116
+ "output_type": "execute_result"
117
+ }
118
+ ],
119
+ "source": [
120
+ "query_str = \"Describe document related to the electrical discipline\"\n",
121
+ "\n",
122
+ "(retriever | get_summary).invoke(query_str)"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": 10,
128
+ "metadata": {},
129
+ "outputs": [
130
+ {
131
+ "data": {
132
+ "text/plain": [
133
+ "[Document(page_content=': 12\\nName: ISB-020-U3-W-E-01-B07005-002-020\\nBeschreibung: E_020 U3 740_KV 2_USV\\nDisziplin: E - Elektroanlagen : 13\\nName: ISB-020-U3-W-E-01-B07005-002-040\\nBeschreibung: E_020 U3 780_KV 4\\nDisziplin: E - Elektroanlagen : 14\\nName: ISB-020-U3-W-E-01-B07005-003-010\\nBeschreibung: G_020 U3 711_AS 2_G011\\nDisziplin: E - Elektroanlagen : 15\\nName: ISB-020-U3-W-E-01-B15100-035-000\\nBeschreibung: Luftmengen Protokoll\\nDisziplin: L - Lueftung : 16\\nName: ISB-020-U3-W-E-01-B15100-036-000\\nBeschreibung: Luftmengen Protokoll\\nDisziplin: L - Lueftung', metadata={'source': 'langchain', 'row': '14', 'lang': 'deu', 'offset': '0', 'len': '110'}),\n",
134
+ " Document(page_content=': 7\\nName: ISB-020-U3-W-E-01-B07005-001-010\\nBeschreibung: E_020 U3 780_KV 4_E031 E_Ladestationen\\nDisziplin: E - Elektroanlagen : 8\\nName: ISB-020-U3-W-E-01-B07005-001-020\\nBeschreibung: E_020 U3 740_KV 2\\nDisziplin: E - Elektroanlagen : 9\\nName: ISB-020-U3-W-E-01-B07005-001-040\\nBeschreibung: G_020 U3 779_AS 1_G009\\nDisziplin: E - Elektroanlagen : 10\\nName: ISB-020-U3-W-E-01-B07005-001-999\\nBeschreibung: 772 UV 1 G022 / WW 218057\\nDisziplin: E - Elektroanlagen : 11\\nName: ISB-020-U3-W-E-01-B07005-002-010\\nBeschreibung: G_020 U3 711_AS 1_G010\\nDisziplin: E - Elektroanlagen', metadata={'source': 'langchain', 'row': '9', 'lang': 'deu', 'offset': '0', 'len': '109'}),\n",
135
+ " Document(page_content=': 11\\nName: ISB-020-U3-W-E-01-B07005-002-010\\nBeschreibung: G_020 U3 711_AS 1_G010\\nDisziplin: E - Elektroanlagen : 12\\nName: ISB-020-U3-W-E-01-B07005-002-020\\nBeschreibung: E_020 U3 740_KV 2_USV\\nDisziplin: E - Elektroanlagen : 13\\nName: ISB-020-U3-W-E-01-B07005-002-040\\nBeschreibung: E_020 U3 780_KV 4\\nDisziplin: E - Elektroanlagen : 14\\nName: ISB-020-U3-W-E-01-B07005-003-010\\nBeschreibung: G_020 U3 711_AS 2_G011\\nDisziplin: E - Elektroanlagen : 15\\nName: ISB-020-U3-W-E-01-B15100-035-000\\nBeschreibung: Luftmengen Protokoll\\nDisziplin: L - Lueftung', metadata={'source': 'langchain', 'row': '13', 'lang': 'deu', 'offset': '0', 'len': '105'})]"
136
+ ]
137
+ },
138
+ "execution_count": 10,
139
+ "metadata": {},
140
+ "output_type": "execute_result"
141
+ }
142
+ ],
143
+ "source": [
144
+ "(retriever | get_sources).invoke(query_str)\n",
145
+ "\n"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": 11,
151
+ "metadata": {},
152
+ "outputs": [],
153
+ "source": [
154
+ "madeup_metadata = {'filename': 'school_plumbing.txt', 'description': 'This document describes the plumbing system for a typical school building, including potable water supply, fixtures and appliances, drainage waste and vent (DWV) systems, and stormwater management.', 'discipline': 'plumbing'}"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": 12,
160
+ "metadata": {},
161
+ "outputs": [],
162
+ "source": [
163
+ "prompt_template = \"\"\"Compare the following metadata and return a confidence interval measuring how much the metadata is similar to your available information \n",
164
+ "\"\"\""
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": 13,
170
+ "metadata": {},
171
+ "outputs": [
172
+ {
173
+ "data": {
174
+ "text/plain": [
175
+ "'The returned results did not contain sufficient information to be summarized into a useful answer for your query. Please try a different search or restate your query differently.'"
176
+ ]
177
+ },
178
+ "execution_count": 13,
179
+ "metadata": {},
180
+ "output_type": "execute_result"
181
+ }
182
+ ],
183
+ "source": [
184
+ "query_str = f'{prompt_template}\\nmetadata:{madeup_metadata}'\n",
185
+ "(retriever | get_summary).invoke(query_str)"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": 15,
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": [
194
+ "query_str = 'What discipline does this description belong to? Description: This document provides instructions for handling, assembly, maintenance, and troubleshooting of Hawle Flanschen-Schieber, primarily used in water supply systems with a maximum operating pressure of 25 bar and temperature of 40°C.'\n"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 16,
200
+ "metadata": {},
201
+ "outputs": [
202
+ {
203
+ "data": {
204
+ "text/plain": [
205
+ "'The description provided pertains to the discipline of Sanitaer (Sanitary), as indicated by search results [159] and [160]. These instructions are related to handling, assembly, maintenance, and troubleshooting of Hawle Flanschen-Schieber, commonly utilized in water supply systems with a maximum operating pressure of 25 bar and temperature of 40°C. The document likely focuses on the proper procedures for managing and servicing these components within sanitary systems.'"
206
+ ]
207
+ },
208
+ "execution_count": 16,
209
+ "metadata": {},
210
+ "output_type": "execute_result"
211
+ }
212
+ ],
213
+ "source": [
214
+ "(retriever | get_summary).invoke(query_str)"
215
+ ]
216
+ }
217
+ ],
218
+ "metadata": {
219
+ "kernelspec": {
220
+ "display_name": "Python 3",
221
+ "language": "python",
222
+ "name": "python3"
223
+ },
224
+ "language_info": {
225
+ "codemirror_mode": {
226
+ "name": "ipython",
227
+ "version": 3
228
+ },
229
+ "file_extension": ".py",
230
+ "mimetype": "text/x-python",
231
+ "name": "python",
232
+ "nbconvert_exporter": "python",
233
+ "pygments_lexer": "ipython3",
234
+ "version": "3.11.8"
235
+ }
236
+ },
237
+ "nbformat": 4,
238
+ "nbformat_minor": 2
239
+ }