# Spaces status (captured with the page, not part of the program): Runtime error
import os
from pathlib import Path

import gradio as gr
import pandas as pd
from realtabformer import REaLTabFormer
from scipy.io import arff
# Settings for the module-level single-table ("tabular") model. The epoch
# count is deliberately small; the original author's note gives the library
# default as 200.
_TABULAR_MODEL_SETTINGS = dict(
    model_type="tabular",
    epochs=25,
    gradient_accumulation_steps=4,
)

# One model instance is created at import time and reused by every request.
rtf_model = REaLTabFormer(**_TABULAR_MODEL_SETTINGS)
def generate_data(file, num_samples):
    """Fit the shared tabular model on an uploaded file and sample from it.

    Args:
        file: The uploaded file. Either an object exposing its on-disk path
            as ``file.name`` or a plain path string. The path must end in
            ``.csv`` or ``.arff`` (case-insensitive).
        num_samples: Number of synthetic rows to generate. Coerced to
            ``int`` because UI sliders may deliver a float.

    Returns:
        The result of ``rtf_model.sample`` for ``num_samples`` rows.

    Raises:
        ValueError: If no file was supplied or its extension is unsupported.
    """
    if file is None:
        raise ValueError("No data file was provided.")

    # Accept both upload wrappers (with ``.name``) and bare path strings.
    path = getattr(file, "name", file)
    extension = os.path.splitext(path)[1].lower()

    if extension == ".arff":
        # Close the handle deterministically instead of leaking it.
        with open(path, "rt") as handle:
            records, _meta = arff.loadarff(handle)
        df = pd.DataFrame(records)
        # scipy returns nominal attributes as bytes; decode them so the
        # model is trained on the text values rather than on bytes objects.
        for column in df.select_dtypes(include="object").columns:
            df[column] = df[column].str.decode("utf-8")
    elif extension == ".csv":
        df = pd.read_csv(path)
    else:
        raise ValueError(
            f"Unsupported file type {extension!r}; expected .csv or .arff."
        )

    # Small bootstrap count to keep the demo fast; the original author's
    # note gives the library default as 500.
    rtf_model.fit(df, num_bootstrap=10)

    # Generate synthetic data.
    return rtf_model.sample(n_samples=int(num_samples))
def generate_relational_data(parent_file, child_file, join_on):
    """Train parent and child models on two CSVs and sample linked rows.

    The shared ``rtf_model`` is fitted on the parent table (without the join
    column) and saved under ``rtf_parent/``. A relational child model is then
    built on top of the most recently saved parent model and fitted on the
    child table. Finally five parent rows are sampled and child rows are
    generated for them.

    Args:
        parent_file: Uploaded parent CSV. Either an object exposing its path
            as ``.name`` or a plain path string.
        child_file: Uploaded child CSV, same conventions as ``parent_file``.
        join_on: Name of the key column present in both tables.

    Returns:
        A 3-tuple ``(parent_samples, child_samples, update)`` where
        ``update`` is ``gr.update(visible=True)``. Callers must therefore
        wire three output components.

    Raises:
        ValueError: If a file is missing, or ``join_on`` is not a column of
            both tables.
        FileNotFoundError: If no saved parent model directory can be found
            after saving.
    """
    if parent_file is None or child_file is None:
        raise ValueError("Both a parent and a child CSV file are required.")

    parent_df = pd.read_csv(getattr(parent_file, "name", parent_file))
    child_df = pd.read_csv(getattr(child_file, "name", child_file))

    # Validate with a real exception: ``assert`` is stripped under ``-O``.
    tables = (("parent", parent_df), ("child", child_df))
    missing_from = [
        label for label, frame in tables if join_on not in frame.columns
    ]
    if missing_from:
        raise ValueError(
            f"Join column {join_on!r} not found in: {', '.join(missing_from)}."
        )

    # The join key is an identifier, not a feature, so it is excluded from
    # the parent model's training data.
    rtf_model.fit(parent_df.drop(join_on, axis=1), num_bootstrap=100)
    pdir = Path("rtf_parent/")
    rtf_model.save(pdir)

    # Use the most recently saved parent model directory.
    saved_models = [p for p in pdir.glob("id*") if p.is_dir()]
    if not saved_models:
        raise FileNotFoundError(f"No saved parent model found under {pdir}.")
    parent_model_path = max(saved_models, key=os.path.getmtime)

    child_model = REaLTabFormer(
        model_type="relational",
        parent_realtabformer_path=parent_model_path,
        epochs=25,
        output_max_length=None,
        train_size=0.8,
    )
    child_model.fit(
        df=child_df,
        in_df=parent_df,
        join_on=join_on,
        num_bootstrap=10,
    )

    # Generate parent samples.
    parent_samples = rtf_model.sample(5)

    # Create the unique ids based on the index.
    parent_samples.index.name = join_on
    parent_samples = parent_samples.reset_index()

    # Generate the relational observations.
    child_samples = child_model.sample(
        input_unique_ids=parent_samples[join_on],
        input_df=parent_samples.drop(join_on, axis=1),
        gen_batch=5,
    )

    return parent_samples, child_samples, gr.update(visible=True)
# Custom stylesheet for the demo page. The ring-shadow ``calc()`` needs an
# explicit ``+`` operator; without it the declaration is invalid CSS and is
# dropped by the browser.
css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
    color: white;
    border-color: black;
    background: black;
}
input[type='range'] {
    accent-color: black;
}
.dark input[type='range'] {
    accent-color: #dfdfdf;
}
.container {
    max-width: 430px;
    margin: auto;
    padding-top: 1.5rem;
}
#gallery {
    min-height: 22rem;
    margin-bottom: 15px;
    margin-left: auto;
    margin-right: auto;
    border-bottom-right-radius: .5rem !important;
    border-bottom-left-radius: .5rem !important;
}
#gallery>div>.h-full {
    min-height: 20rem;
}
.details:hover {
    text-decoration: underline;
}
.gr-button {
    white-space: nowrap;
}
.gr-button:focus {
    border-color: rgb(147 197 253 / var(--tw-border-opacity));
    outline: none;
    box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
    --tw-border-opacity: 1;
    --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
    --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px + var(--tw-ring-offset-width)) var(--tw-ring-color);
    --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
    --tw-ring-opacity: .5;
}
#advanced-btn {
    font-size: .7rem !important;
    line-height: 19px;
    margin-top: 12px;
    margin-bottom: 12px;
    padding: 2px 8px;
    border-radius: 14px !important;
}
#advanced-options {
    display: none;
    margin-bottom: 20px;
}
.footer {
    margin-bottom: 45px;
    margin-top: 35px;
    text-align: center;
    border-bottom: 1px solid #e5e5e5;
}
.footer>p {
    font-size: .8rem;
    display: inline-block;
    padding: 0 10px;
    transform: translateY(10px);
    background: white;
}
.dark .footer {
    border-color: #303030;
}
.dark .footer>p {
    background: #0b0f19;
}
"""
# Page layout and event wiring for the demo.
with gr.Blocks(css=css) as demo:
    # The Markdown body is kept flush-left so the heading marker cannot be
    # read as an indented code block.
    gr.Markdown("""
## REaLTabFormer: Generating Realistic Relational and Tabular Data using Transformers
""")
    gr.HTML('''
    <p style="margin-bottom: 10px; font-size: 94%">
    This is an unofficial demo for REaLTabFormer, an approach that can be used to generate synthetic data from single tabular data using GPT. The demo is based on the <a href='https://github.com/avsolatorio/REaLTabFormer' style='text-decoration: underline;' target='_blank'> Github </a> implementation provided by the authors.
    </p>
    ''')
    gr.HTML('''
    <p align="center"><img src="REalTabFormer_Final_EQ.png" style="width:75%"/></p>
    ''')

    with gr.Column():
        # Tab 1: a single table uploaded as CSV or ARFF.
        with gr.Tab("Upload Data as File: Tabular Data"):
            data_input_u = gr.File(
                label='Upload Data File (Currently supports CSV and ARFF)',
                file_types=[".csv", ".arff"],
            )
            num_samples = gr.Slider(
                label="Number of Samples",
                minimum=5,
                maximum=100,
                value=5,
                step=10,
            )
            generate_data_btn = gr.Button('Generate Synthetic Data')

        # Tab 2: a parent/child pair of CSV tables linked by one column.
        with gr.Tab("Upload Data as File: Relational Data"):
            data_input_parent = gr.File(
                label='Upload Data File for Parent Dataset',
                file_types=[".csv"],
            )
            data_input_child = gr.File(
                label='Upload Data File for Child Dataset',
                file_types=[".csv"],
            )
            join_on = gr.Textbox(label='Column name to join on')
            generate_data_btn_relational = gr.Button('Generate Synthetic Data')

        with gr.Row():
            #data_sample = gr.Dataframe(label = "Original Data")
            data_output = gr.Dataframe(label="Synthetic Data")

        # Hidden until a relational run completes and reveals it.
        with gr.Row(visible=False) as child_sample:
            data_output_child = gr.Dataframe(
                label="Synthetic Data for Child Dataset"
            )

    generate_data_btn.click(
        generate_data,
        inputs=[data_input_u, num_samples],
        outputs=[data_output],
    )
    # generate_relational_data returns three values (parent frame, child
    # frame, visibility update), so three outputs are listed; the update is
    # routed to the hidden row to show the child table.
    generate_data_btn_relational.click(
        generate_relational_data,
        inputs=[data_input_parent, data_input_child, join_on],
        outputs=[data_output, data_output_child, child_sample],
    )
    examples = gr.Examples(
        examples=[['diabetes.arff', 5], ["titanic.csv", 15]],
        inputs=[data_input_u, num_samples],
        outputs=[data_output],
        cache_examples=True,
        fn=generate_data,
    )

demo.launch()