pwc-india committed
Commit 946a21d · verified · 1 Parent(s): ca8d5a2

Update app.py

Files changed (1): app.py +447 -422
app.py CHANGED
@@ -1,422 +1,447 @@
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import streamlit as st
import sdv
from sdv.datasets.local import load_csvs
from sdv.metadata import MultiTableMetadata
from sdv.multi_table import HMASynthesizer
import time
import os
import gc
import warnings
from PIL import Image
from sdv.metadata import SingleTableMetadata
import pyodbc
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
import textwrap
from streamlit_extras.stylable_container import stylable_container
from streamlit_extras.stateful_button import button
import json
from io import BytesIO
+######
+import os
+import subprocess
+
+# Run the commands to install ODBC driver
+subprocess.run([
+    "apt-get", "update"
+])
+subprocess.run([
+    "apt-get", "install", "-y", "curl", "apt-transport-https"
+])
+subprocess.run([
+    "curl", "https://packages.microsoft.com/keys/microsoft.asc", "|", "apt-key", "add", "-"
+])
+subprocess.run([
+    "curl", "https://packages.microsoft.com/config/ubuntu/$(lsb_release -rs)/prod.list",
+    "|", "tee", "/etc/apt/sources.list.d/mssql-release.list"
+])
+subprocess.run([
+    "apt-get", "update"
+])
+subprocess.run([
+    "ACCEPT_EULA=Y", "apt-get", "install", "-y", "msodbcsql17", "unixodbc-dev"
+])
+
genai.configure(api_key='AIzaSyCeY8jSHKW6t0OSDRjc2VAfBvMunVrff2w')
genai_mod = genai.GenerativeModel(
    model_name='models/gemini-pro'
)
-
+###########
st.set_page_config(page_title='DATA DISCOVERY', layout= 'wide')
st.markdown("""
<style>

/* Remove blank space at top and bottom */
.block-container {
    padding-top: 2rem;
}

/* Remove blank space at the center canvas */
.st-emotion-cache-z5fcl4 {
    position: relative;
    top: -62px;
}

/* Make the toolbar transparent and the content below it clickable */
.st-emotion-cache-18ni7ap {
    pointer-events: none;
    background: rgb(255 255 255 / 0%)
}
.st-emotion-cache-zq5wmm {
    pointer-events: auto;
    background: rgb(255 255 255);
    border-radius: 5px;
}
</style>
""", unsafe_allow_html=True)
def clear_cache():
    if 'rdf' in st.session_state:
        st.session_state.pop('rdf')

def create_er_diagram(df):
    G = nx.DiGraph()  # Directed graph

    # Dictionary to hold table columns
    table_columns = {}

    # Add nodes and edges to the graph
    for _, row in df.iterrows():
        parent_table = row['PARENT TABLE']
        child_table = row['CHILD TABLE']
        parent_pk = row['PARENT TABLE RELATIONSHIP COLUMN']
        child_fk = row['CHILD TABLE RELATIONSHIP COLUMN']
        cardinality = row.get('CARDINALITY', '1:N')

        # Add columns to tables
        if parent_table not in table_columns:
            table_columns[parent_table] = []
        table_columns[parent_table].append(parent_pk)

        if child_table not in table_columns:
            table_columns[child_table] = []
        table_columns[child_table].append(child_fk)

        # Add nodes and edges
        G.add_node(parent_table)
        G.add_node(child_table)
        G.add_edge(parent_table, child_table, label=f'{parent_pk} -> {child_fk}\n{cardinality}')

    return G, table_columns

def draw_er_diagram(G, table_columns):
    pos = nx.spring_layout(G, k=1.5, iterations=50)  # Use a layout that spreads out nodes

    plt.figure(figsize=(8, 8))
    nx.draw(G, pos, with_labels=False, node_size=2500, node_color='lightblue', edge_color='gray', font_size=8, font_weight='bold', arrows=True)

    # Draw node labels (table names in bold)
    for node, (x, y) in pos.items():
        plt.text(x, y + 0.13, node, fontsize=7, fontweight='bold', ha='center', va='center')

    # Draw column names
    for node, columns in table_columns.items():
        x, y = pos[node]
        column_text = '\n'.join(columns)
        plt.text(x, y, column_text, fontsize=6, ha='center', va='center')

    # Draw edge labels
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6)
    st.subheader("Schematic Representation")
    with st.container(border=True, height= 350):
        st.pyplot(plt)
    img_bytes = BytesIO()
    plt.savefig(img_bytes, format='png')
    img_bytes.seek(0)
    return img_bytes

def cardinality(parent_df, child_df, parent_column, child_column):
    # Check uniqueness of parent primary key
    is_parent_unique = parent_df[parent_column].is_unique

    # Check uniqueness of child foreign key
    is_child_unique = child_df[child_column].is_unique

    # Determine cardinality
    if is_parent_unique and is_child_unique:
        return '1:1'
    elif is_parent_unique and not is_child_unique:
        return '1:N'
    elif not is_parent_unique and is_child_unique:
        return 'N:1'
    else:
        return 'N:N'

#st.title('AUTOMATED DATA CATALOGUE')
st.subheader('SELECT SOURCE')
selectcol11, selectcol12 = st.columns(2)
with selectcol11:
    select1=st.selectbox('SOURCE DB NAME',('DB_10001','Marcopolo_db'),key='dbname',index=None,placeholder='Select database name', on_change=clear_cache)
with selectcol12:
    select2=st.selectbox('SOURCE SCHEMA NAME',('DBO','CLIENT'),key='SCHname',index=None,placeholder='Select schema name', on_change=clear_cache)
if select1 =='DB_10001' and select2 is not None:
    with st.spinner("Loading Tables:"):
        conn1 = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};"
                               "Server=sql-ext-dev-uks-001.database.windows.net;"
                               "Database=sqldb-ext-dev-uks-001;"
                               "UID=dbadmin;"
                               "PWD=mYpa$$w0rD" )

        query0_1=f"select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='{select2}' ORDER BY TABLE_NAME ASC"
        st.session_state.tab_names_init=list(pd.read_sql_query(query0_1,con=conn1)['TABLE_NAME'])

    table_selector=st.multiselect('SOURCE TABLE NAME',st.session_state.tab_names_init,default=None,placeholder='Select table(s) for automated data cataloging', on_change= clear_cache)
    sample_selector=st.selectbox('SELECT SAMPLE SIZE',['100','10K','100K','1M','Full Table'],index=None,placeholder='Select sample size for the table(s)', on_change= clear_cache)

    discover= button("Discover", key='discover')

    if discover:
        if sample_selector=='100':
            count="top 100"
        elif sample_selector=='10K':
            count="top 10000"
        elif sample_selector=='100K':
            count="top 100000"
        elif sample_selector=='1M':
            count="top 1000000"
        else:
            count=""

        query1_1=f"select * from INFORMATION_SCHEMA.TABLES where TABLE_SCHEMA='{select2}' and TABLE_NAME in ("+(', '.join(f"'{table}'" for table in table_selector))+") ORDER BY TABLE_NAME ASC"
        st.session_state.tab_names=list(pd.read_sql_query(query1_1,con=conn1)['TABLE_NAME'])
        st.session_state.dataframes = {}
        st.session_state.col_names = []
        for tab in st.session_state.tab_names:
            query2_2= "select "+count+" * from ["+select2+"].["+tab+"]"
            st.session_state.dataframes[f'{tab}'] = pd.read_sql_query(query2_2,con=conn1)
            st.session_state.col_names = st.session_state.col_names + list(st.session_state.dataframes[f'{tab}'].columns)
        #st.session_state.data_load = "Yes"

        tab_names = st.session_state.tab_names
        dataframes = st.session_state.dataframes
        col_names = st.session_state.col_names
        metadata = MultiTableMetadata()
        metadata.detect_from_dataframes(
            data= st.session_state.dataframes
        )
        multi_python_dict = metadata.to_dict()

        st.markdown(f"System has ingested :orange[**{str(len(tab_names))} tables**] from the source. Please proceed with the discovery.")
        #st.subheader("DATA CATALOGUE")
        tab1, tab2= st.tabs(["Explain Tables", "Show Relationships"])
        def view_callback():
            st.session_state.tdet = False
        with tab1:
            #st.write(python_dict)
            st.session_state.table_list= pd.DataFrame(tab_names,columns=['TABLE NAME'])
            containter_length = (len(st.session_state.table_list) + 1)*35
            tab_names_shown= list(st.session_state.table_list['TABLE NAME'].values)
            tabs2= st.tabs(tab_names_shown)
            for i, tab in enumerate(tabs2):
                with tab:
                    with st.container(height= 400, border=True):
                        cole1,cole2=st.columns([1,1.5])
                        with cole1:
                            conn = pyodbc.connect("Driver={ODBC Driver 17 for SQL Server};"
                                                  "Server=sql-ext-dev-uks-001.database.windows.net;"
                                                  "Database=sqldb-ext-dev-uks-001;"
                                                  "UID=dbadmin;"
                                                  "PWD=mYpa$$w0rD" )
                            table_selector= tab_names_shown[i]
                            if table_selector is not None:
                                query2="select "+count+" * from [dbo].["+table_selector+"]"
                                #df = pd.read_sql_query(query2,con=conn)
                                df = st.session_state.dataframes[table_selector]
                                selected_df = pd.DataFrame()
                                for col in df.columns:
                                    # Filter non-null and non-blank values
                                    non_null_values = df[col][df[col] != ''].dropna().astype(str).str.strip()

                                    # Select up to 10 values (or fewer if less than 10 non-null values)
                                    selected_values = list(non_null_values[:10])
                                    selected_values = selected_values + [""] * (10 - len(selected_values))
                                    # Add selected values to the new dataframe
                                    selected_df[col] = selected_values
                                #st.dataframe(selected_df)
                                null_columns = [col for col in selected_df.columns if selected_df.apply(lambda x: x == '')[col].nunique() > 1]
                                null_mes= "**The Following columns have very few records(less than 10). You might exclude them (if they are redundant) for better table discovery:** \n\n"
                                for col in null_columns[:-1]:
                                    null_mes += f":orange[**{col}**]" + ', '
                                for collast in null_columns[-1:]:
                                    if len(null_columns)> 1:
                                        null_mes += '**and** ' + f":orange[**{collast}**]"
                                    else:
                                        null_mes += f":orange[**{collast}**]"

                                if len(null_columns) != 0:
                                    with st.expander("🛈 Potential redundant Columns Found in Terms of Data Completeness:", expanded= True):
                                        st.markdown(null_mes)
                                        inf_filter= st.multiselect('Select Incomplete and Insignificant Columns to exclude:', list(null_columns))
                                        run = st.button('Check', key= f"{tab_names_shown[i]}")
                                else:
                                    st.success("No redundant Columns Found in Terms of Data Completeness")
                                    inf_filter= None
                                    run = False

                                if inf_filter is not None:
                                    df.drop(columns=inf_filter, inplace=True)
                                    selected_df.drop(columns=inf_filter, inplace=True)

                                if run or len(null_columns) == 0:
                                    main_list=df.columns.to_list()
                                    sub_list=['ID','LOADID','FILE_NAME']
                                    if any(main_list[i:i+len(sub_list)] == sub_list for i in range(len(main_list) - len(sub_list) + 1)):
                                        df=df.drop(['ID','LOADID','FILE_NAME'],axis=1)
                                    conn.close()
                                    sin_metadata = SingleTableMetadata()
                                    sin_metadata.detect_from_dataframe(df)
                                    python_dict = sin_metadata.to_dict()
                                    if f'cont_{table_selector}' not in st.session_state:
                                        with st.spinner("Processing Table"):
                                            # Create a GenerativeModel instance
                                            genai_mod = genai.GenerativeModel(
                                                model_name='models/gemini-pro'
                                            )
                                            if 'primary_key' in python_dict:
                                                primary_key = python_dict['primary_key']
                                            else:
                                                primary_key = "Could Not be Identified"


                                            story = f""" Details of the table:
                                            table columns: {str(list(df.columns))}
                                            column datatypes: {str(df.dtypes.to_string())}
                                            table sample data: {selected_df.head(10).to_string()}
                                            """
                                            response = genai_mod.generate_content(textwrap.dedent("""
                                                You are a Data Migration expert. You can analyze and understand any table/data/ Please return a narration about the data. The narration should Include primary key name(if any) and a intellectual guess about the table schema. The data can be any kind of generic data. you have to guess the object name/class name/schema name etc. of that data. Don't add unnecessary details. Strictly stick to the informations provided only.
                                                Important: Please consider All fields are mandetorily during your analysis. Explain all fields precisely without unnecessary and irrelevant information. NO NEED TO PROVIDE THE SAMPLE DATA AGAIN.

                                                Here is the table details:

                                            """) + story + f"The Primary Key is:{primary_key}" ,
                                            safety_settings={
                                                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
                                                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                                            })
                                            st.session_state[f'cont_{table_selector}'] = response.text

                                    st.markdown(st.session_state[f'cont_{table_selector}'])
                        with cole2:
                            st.markdown("**DATA PREVIEW**")
                            st.dataframe(df, use_container_width= True)

        with tab2:
            metadata1 = MultiTableMetadata()
            metadata1.detect_from_dataframes(
                data= st.session_state.dataframes
            )
            multi_python_dict1 = metadata1.to_dict()
            rlist1=multi_python_dict1['relationships']
            rdf=pd.DataFrame(columns=['PARENT TABLE','CHILD TABLE','PARENT TABLE RELATIONSHIP COLUMN','CHILD TABLE RELATIONSHIP COLUMN','CARDINALITY'])
            for i in range(len(rlist1)):
                rlist=rlist1[i]
                nrow=pd.DataFrame({'PARENT TABLE':rlist['parent_table_name'],'CHILD TABLE':rlist['child_table_name'],'PARENT TABLE RELATIONSHIP COLUMN':rlist['parent_primary_key'],'CHILD TABLE RELATIONSHIP COLUMN':rlist['child_foreign_key']},index=[i])
                rdf=pd.concat([rdf,nrow],ignore_index=True)

            rdf['CARDINALITY'] = rdf.apply(
                lambda row: cardinality(
                    st.session_state.dataframes[str(row['PARENT TABLE'])],
                    st.session_state.dataframes[str(row['CHILD TABLE'])],
                    str(row['PARENT TABLE RELATIONSHIP COLUMN']),
                    str(row['CHILD TABLE RELATIONSHIP COLUMN'])),axis=1)


            if 'rdf' not in st.session_state:
                st.session_state.rdf = rdf

            edited_map_df = st.data_editor(
                st.session_state.rdf,
                column_config={
                    "PARENT TABLE": st.column_config.SelectboxColumn(
                        "Available Parent Table",
                        width="medium",
                        options=tab_names,
                        required=True,
                    ),
                    "CHILD TABLE": st.column_config.SelectboxColumn(
                        "Available Child Table",
                        width="medium",
                        options=tab_names,
                        required=True,
                    ),
                    "PARENT TABLE RELATIONSHIP COLUMN": st.column_config.SelectboxColumn(
                        "Available Parent Table Relationship Column",
                        width="medium",
                        options=col_names,
                        required=True,
                    ),
                    "CHILD TABLE RELATIONSHIP COLUMN": st.column_config.SelectboxColumn(
                        "Available Child Table Relationship Column",
                        width="medium",
                        options=col_names,
                        required=True,
                    ),
                    "CARDINALITY": st.column_config.SelectboxColumn(
                        "Cardinality",
                        width="medium",
                        options=['1:1','1:N','N:1','N:N'],
                        required=True,
                    )
                },
                hide_index=True,
                num_rows = 'dynamic',
                use_container_width = True
            )

            for i,row in edited_map_df.iterrows():
                pcolchecklist = st.session_state.dataframes[str(row['PARENT TABLE'])].columns
                ccolchecklist = st.session_state.dataframes[str(row['CHILD TABLE'])].columns
                pvals= list(st.session_state.dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)
                cvals= list(st.session_state.dataframes[str(row['CHILD TABLE'])][row['CHILD TABLE RELATIONSHIP COLUMN']].values)
                match = [val for val in pvals if val in cvals]
                #st.write(match)
                if row['PARENT TABLE RELATIONSHIP COLUMN'] not in pcolchecklist:
                    st.error(f"{row['PARENT TABLE RELATIONSHIP COLUMN']} does not belong to {row['PARENT TABLE']}")
                else:
                    pass
                if row['CHILD TABLE RELATIONSHIP COLUMN'] not in ccolchecklist:
                    st.error(f"{row['CHILD TABLE RELATIONSHIP COLUMN']} does not belong to {row['CHILD TABLE']}")
                else:
                    pass
                if (row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist):
                    pvals= list(st.session_state.dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)
                    cvals= list(st.session_state.dataframes[str(row['CHILD TABLE'])][row['CHILD TABLE RELATIONSHIP COLUMN']].values)
                    match = [val for val in pvals if val in cvals]
                    if match == []:
                        st.error(f"The Joining Condition Between column: {row['PARENT TABLE RELATIONSHIP COLUMN']} from Table: {row['PARENT TABLE']} and column: {row['CHILD TABLE RELATIONSHIP COLUMN']} from Table: {row['CHILD TABLE']} does not yield any record. ")
                if ((row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist)) and (match != []):
                    # primary_check = len(list(dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].values)) == dataframes[str(row['PARENT TABLE'])][row['PARENT TABLE RELATIONSHIP COLUMN']].nunique()
                    # if primary_check:
                    #     pass
                    # else:
                    #     st.error(f"The Column {row['PARENT TABLE RELATIONSHIP COLUMN']} from Table: {row['PARENT TABLE']} has duplicate records and hence can not be considered as Primary Key.")
                    pass

            add = st.button("Add Relationship", key='add')
            if add:
                if ((row['PARENT TABLE RELATIONSHIP COLUMN'] in pcolchecklist) and (row['CHILD TABLE RELATIONSHIP COLUMN'] in ccolchecklist)) and ((match != [])):
                    add_df = edited_map_df
                else:
                    add_df = st.session_state.rdf
            else:
                add_df = st.session_state.rdf

            add_df['CARDINALITY'] = add_df.apply(
                lambda row: cardinality(
                    st.session_state.dataframes[str(row['PARENT TABLE'])],
                    st.session_state.dataframes[str(row['CHILD TABLE'])],
                    str(row['PARENT TABLE RELATIONSHIP COLUMN']),
                    str(row['CHILD TABLE RELATIONSHIP COLUMN'])),axis=1)

            st.session_state.add_df = add_df
            edited_map_df = st.session_state.add_df

            rel_tabs = list(add_df['PARENT TABLE'].values) + list(add_df['CHILD TABLE'].values)
            unrel_tabs = [tab for tab in tab_names if tab not in rel_tabs]
            st.info(f"""Unrelated tables due to undetected pattern: {str(unrel_tabs).replace("[","").replace("]","")}""")

            G, table_columns = create_er_diagram(st.session_state.add_df)
            img_bytes= draw_er_diagram(G, table_columns)
            col21, col22= st.columns([1,8])
            with col21:
                if st.button("Regenerate"):
                    st.rerun()
            with col22:
                st.download_button(
                    label="Download ER Diagram",
                    data=img_bytes,
                    file_name="er_diagram.png",
                    mime="image/png"
                )
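
One caveat about the ODBC install block added in this commit: `subprocess.run` with an argument list does not invoke a shell, so the `"|"` tokens are passed to `curl` as literal arguments instead of creating a pipe, and `"ACCEPT_EULA=Y"` is treated as the program to execute rather than as an environment variable. A minimal sketch of an equivalent that actually pipes, assuming the same Debian/Ubuntu base image and root privileges:

import os
import subprocess

# Run each step through a real shell so the pipes and $(lsb_release -rs)
# substitution work; check=True raises CalledProcessError on failure
# instead of continuing silently.
for cmd in [
    "apt-get update",
    "apt-get install -y curl apt-transport-https",
    "curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -",
    "curl https://packages.microsoft.com/config/ubuntu/$(lsb_release -rs)/prod.list"
    " | tee /etc/apt/sources.list.d/mssql-release.list",
    "apt-get update",
]:
    subprocess.run(cmd, shell=True, check=True)

# ACCEPT_EULA must reach apt-get as an environment variable, not as argv[0].
subprocess.run(
    "apt-get install -y msodbcsql17 unixodbc-dev",
    shell=True,
    check=True,
    env={**os.environ, "ACCEPT_EULA": "Y"},
)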
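
Separately, in the relationship-validation loop, `match = [val for val in pvals if val in cvals]` rescans the child column once per parent value, which is O(n·m) on the sampled tables. Since the code only needs to know whether any overlap exists, a set intersection gives the same answer in roughly linear time. A small sketch using the same pandas Series the loop already builds (`pcol` and `ccol` are illustrative names for the relationship columns):

import pandas as pd

def keys_overlap(parent_col: pd.Series, child_col: pd.Series) -> bool:
    # isdisjoint short-circuits on the first shared value: ~O(n + m)
    # versus O(n * m) for the nested list-comprehension scan.
    return not set(parent_col.dropna()).isdisjoint(child_col.dropna())

# e.g. replacing the `match == []` test with:
# if not keys_overlap(parent_df[pcol], child_df[ccol]):
#     st.error("The joining condition does not yield any record.")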
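
Both versions of the file also hardcode the Gemini API key and the SQL Server credentials in the source. A common alternative is to supply them via Streamlit secrets or environment variables; a sketch assuming a GOOGLE_API_KEY variable is set in the deployment environment (the name is illustrative, not one the app defines):

import os
import google.generativeai as genai

# Key comes from the environment (or .streamlit/secrets.toml via st.secrets)
# rather than being committed to the repository.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])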