RamAnanth1 committed on
Commit
390d8ba
1 Parent(s): 42a009f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -4
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  import pandas as pd
3
  from realtabformer import REaLTabFormer
4
  from scipy.io import arff
 
5
 
6
  rtf_model = REaLTabFormer(
7
  model_type="tabular",
@@ -20,7 +21,57 @@ def generate_data(file, num_samples):
20
  samples = rtf_model.sample(n_samples=num_samples)
21
 
22
  return samples
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
 
 
24
  css = """
25
  .gradio-container {
26
  font-family: 'IBM Plex Sans', sans-serif;
@@ -106,7 +157,7 @@ with gr.Blocks(css = css) as demo:
106
  """)
107
  gr.HTML('''
108
  <p style="margin-bottom: 10px; font-size: 94%">
109
- This is an unofficial demo for REaLTabFormer that can be used to generate synthetic data from single tabular data using GPT. The demo is based on the <a href='https://github.com/avsolatorio/REaLTabFormer' style='text-decoration: underline;' target='_blank'> Github </a> implementation provided by the authors.
110
  </p>
111
  ''')
112
 
@@ -116,18 +167,27 @@ with gr.Blocks(css = css) as demo:
116
  # audio_input_r = gr.Audio(label = 'Record Audio Input',source="microphone",type="filepath")
117
  # transcribe_audio_r = gr.Button('Transcribe')
118
 
119
- with gr.Tab("Upload Data as File"):
120
- data_input_u = gr.File(label = 'Upload Data File', file_types=["text", ".json", ".csv", ".arff"])
121
  num_samples = gr.Slider(label="Number of Samples", minimum=5, maximum=100, value=5, step=10)
122
  generate_data_btn = gr.Button('Generate Synthetic Data')
123
 
 
 
 
 
 
 
 
124
  with gr.Row():
125
  #data_sample = gr.Dataframe(label = "Original Data")
126
  data_output = gr.Dataframe(label = "Synthetic Data")
127
-
 
128
 
129
 
130
  generate_data_btn.click(generate_data, inputs = [data_input_u,num_samples], outputs = [data_output])
 
131
  examples = gr.Examples(examples=[['diabetes.arff',5], ["titanic.csv", 15]],inputs = [data_input_u,num_samples], outputs = [data_output], cache_examples = True, fn = generate_data)
132
 
133
 
 
2
  import pandas as pd
3
  from realtabformer import REaLTabFormer
4
  from scipy.io import arff
5
+ import os
6
 
7
  rtf_model = REaLTabFormer(
8
  model_type="tabular",
 
21
  samples = rtf_model.sample(n_samples=num_samples)
22
 
23
  return samples
24
+
25
def generate_relational_data(parent_file, child_file, join_on):
    """Fit parent/child models on two uploaded CSVs and sample synthetic rows.

    The parent table is fitted with the module-level ``rtf_model`` and saved
    under ``rtf_parent/``; a relational ``REaLTabFormer`` is then fitted on
    the child table, conditioned on the saved parent model.

    Args:
        parent_file: Uploaded file object for the parent table; its ``.name``
            attribute is read as a CSV path.
        child_file: Uploaded file object for the child table; its ``.name``
            attribute is read as a CSV path.
        join_on: Name of the key column shared by both tables.

    Returns:
        A 3-tuple ``(parent_samples, child_samples, update)`` where the first
        two are the sampled frames and ``update`` is ``gr.update(visible=True)``,
        meant for a component that should be revealed once results exist.

    Raises:
        ValueError: If ``join_on`` is not a column of both tables.
        RuntimeError: If no saved parent model directory is found after saving.
    """
    # Imported locally so the function works even though the file's import
    # block does not bring ``Path`` into scope.
    from pathlib import Path

    parent_df = pd.read_csv(parent_file.name)
    child_df = pd.read_csv(child_file.name)

    # Validate with a real exception: ``assert`` is removed under ``-O`` and
    # produces an empty message for the user.
    missing = [
        label
        for label, frame in (("parent", parent_df), ("child", child_df))
        if join_on not in frame.columns
    ]
    if missing:
        raise ValueError(
            f"Join column {join_on!r} is missing from the "
            f"{' and '.join(missing)} dataset"
        )

    # The parent model is trained without the key column.
    rtf_model.fit(parent_df.drop(join_on, axis=1), num_bootstrap=100)

    pdir = Path("rtf_parent/")
    rtf_model.save(pdir)

    # Get the most recently saved parent model,
    # or specify some other saved model:
    # parent_model_path = pdir / "idXXX"
    saved_models = [p for p in pdir.glob("id*") if p.is_dir()]
    if not saved_models:
        raise RuntimeError(f"No saved parent model found under {pdir}")
    parent_model_path = max(saved_models, key=lambda p: p.stat().st_mtime)

    child_model = REaLTabFormer(
        model_type="relational",
        parent_realtabformer_path=parent_model_path,
        epochs=25,
        output_max_length=None,
        train_size=0.8)

    child_model.fit(
        df=child_df,
        in_df=parent_df,
        join_on=join_on,
        num_bootstrap=100)

    # Generate parent samples.
    parent_samples = rtf_model.sample(5)

    # Create the unique ids based on the index.
    parent_samples.index.name = join_on
    parent_samples = parent_samples.reset_index()

    # Generate the relational observations.
    child_samples = child_model.sample(
        input_unique_ids=parent_samples[join_on],
        input_df=parent_samples.drop(join_on, axis=1),
        gen_batch=5)

    return parent_samples, child_samples, gr.update(visible=True)
72
 
73
+
74
+
75
  css = """
76
  .gradio-container {
77
  font-family: 'IBM Plex Sans', sans-serif;
 
157
  """)
158
  gr.HTML('''
159
  <p style="margin-bottom: 10px; font-size: 94%">
160
+ This is an unofficial demo for REaLTabFormer, an approach that can be used to generate synthetic data from single tabular data using GPT. The demo is based on the <a href='https://github.com/avsolatorio/REaLTabFormer' style='text-decoration: underline;' target='_blank'> Github </a> implementation provided by the authors.
161
  </p>
162
  ''')
163
 
 
167
  # audio_input_r = gr.Audio(label = 'Record Audio Input',source="microphone",type="filepath")
168
  # transcribe_audio_r = gr.Button('Transcribe')
169
 
170
+ with gr.Tab("Upload Data as File: Tabular Data"):
171
+ data_input_u = gr.File(label = 'Upload Data File (Currently supports CSV and ARFF)', file_types=[".csv", ".arff"])
172
  num_samples = gr.Slider(label="Number of Samples", minimum=5, maximum=100, value=5, step=10)
173
  generate_data_btn = gr.Button('Generate Synthetic Data')
174
 
175
+ with gr.Tab("Upload Data as File: Relational Data"):
176
+ data_input_parent = gr.File(label = 'Upload Data File for Parent Dataset', file_types=[ ".csv"])
177
+ data_input_child = gr.File(label = 'Upload Data File for Child Dataset', file_types=[ ".csv"])
178
+ join_on = gr.Textbox(label = 'Column name to join on')
179
+
180
+ generate_data_btn_relational = gr.Button('Generate Synthetic Data')
181
+
182
  with gr.Row():
183
  #data_sample = gr.Dataframe(label = "Original Data")
184
  data_output = gr.Dataframe(label = "Synthetic Data")
185
+ with gr.Row(visible = False) as child_sample:
186
+ data_output_child = gr.Dataframe(label = "Synthetic Data for Child Dataset")
187
 
188
 
189
  generate_data_btn.click(generate_data, inputs = [data_input_u,num_samples], outputs = [data_output])
190
# generate_relational_data returns three values: the parent frame, the child
# frame, and a visibility update. The update must be routed to the initially
# hidden `child_sample` row, otherwise the child table is never shown.
generate_data_btn_relational.click(
    generate_relational_data,
    inputs = [data_input_parent, data_input_child, join_on],
    outputs = [data_output, data_output_child, child_sample])
191
  examples = gr.Examples(examples=[['diabetes.arff',5], ["titanic.csv", 15]],inputs = [data_input_u,num_samples], outputs = [data_output], cache_examples = True, fn = generate_data)
192
 
193