mervenoyan committed
Commit d2a61f1 · Parent: acaa4d9

simplified UI

Files changed (1): app.py (+26, -20)
app.py CHANGED
```diff
@@ -12,22 +12,23 @@ import dabl
 import re
 
 
-def analyze_datasets(dataset, dataset_name, token, column=None, pairwise="off"):
+def analyze_datasets(dataset, token, column=None, pairwise="off"):
     df = pd.read_csv(dataset.name)
     username = HfApi().whoami(token=token)["name"]
     if column is not None:
         analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise)
     else:
         analyze_report = sv.analyze(df, pairwise_analysis=pairwise)
-
+    dataset_name = dataset.name.split("/")[-1].strip(".csv")
     analyze_report.show_html('./index.html', open_browser=False)
-    repo_url = create_repo(f"{username}/{dataset_name}", repo_type = "space", token = token, space_sdk = "static", private=False)
 
-    upload_file(path_or_fileobj ="./index.html", path_in_repo = "./index.html", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
+    repo_url = create_repo(f"{username}/{dataset_name}-report", repo_type = "space", token = token, space_sdk = "static", private=False)
+
+    upload_file(path_or_fileobj ="./index.html", path_in_repo = "./index.html", repo_id =f"{username}/{dataset_name}-report", repo_type = "space", token=token)
     readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
     with open("README.md", "w+") as f:
         f.write(readme)
-    upload_file(path_or_fileobj ="./README.md", path_in_repo = "README.md", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
+    upload_file(path_or_fileobj ="./README.md", path_in_repo = "README.md", repo_id =f"{username}/{dataset_name}-report", repo_type = "space", token=token)
 
     return f"Your dataset report will be ready at {repo_url}"
```
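One caveat in the new name derivation: `str.strip(".csv")` strips a *character set* (`.`, `c`, `s`, `v`) from both ends, not the literal suffix, so a file like `iris.csv` comes out as `iri`. A minimal sketch of a safer derivation, assuming the rest of the function stays as committed (the helper name is hypothetical, not part of this commit):

```python
from pathlib import Path

def derive_dataset_name(path: str) -> str:
    # Path.stem drops only the final extension: "data/iris.csv" -> "iris".
    # str.strip(".csv") instead removes any of the characters '.', 'c', 's', 'v'
    # from both ends, mangling names such as "iris.csv" -> "iri".
    return Path(path).stem

assert "iris.csv".strip(".csv") == "iri"          # character-set stripping
assert derive_dataset_name("data/iris.csv") == "iris"
```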
 
```diff
@@ -45,9 +46,11 @@ def extract_estimator_config(model):
 def detect_training(df, column):
     if dabl.detect_types(df)["continuous"][column] or dabl.detect_types(df)["dirty_float"][column]:
         trainer = dabl.SimpleRegressor()
+        task = "regression"
     elif dabl.detect_types(df)["categorical"][column] or dabl.detect_types(df)["low_card_int"][column] or dabl.detect_types(df)["free_string"][column]:
         trainer = dabl.SimpleClassifier()
-    return trainer
+        task = "classification"
+    return trainer, task
 
 def edit_types(df):
     types = dabl.detect_types(df)
```
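`detect_training` now returns a `(trainer, task)` pair, but when the target column matches none of the checked types, neither variable is ever assigned and the `return` raises `UnboundLocalError`. A sketch of a defensive variant (the explicit `raise` and the single `detect_types` call are illustrative additions, not part of the commit):

```python
import dabl

def detect_training(df, column):
    types = dabl.detect_types(df)  # compute the type table once, not per branch
    if types["continuous"][column] or types["dirty_float"][column]:
        return dabl.SimpleRegressor(), "regression"
    if types["categorical"][column] or types["low_card_int"][column] or types["free_string"][column]:
        return dabl.SimpleClassifier(), "classification"
    # Fail with a clear message instead of an UnboundLocalError at the return.
    raise ValueError(f"Could not infer a task for target column {column!r}")
```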
```diff
@@ -61,10 +64,11 @@ def edit_types(df):
     df_clean = dabl.clean(df, type_hints=type_hints)
     return df_clean
 
-def train_baseline(dataset, dataset_name, token, column):
+def train_baseline(dataset, token, column):
     df = pd.read_csv(dataset.name)
+    dataset_name = dataset.name.split("/")[-1].strip(".csv")
     df_clean = edit_types(df)
-    fc = detect_training(df_clean, column)
+    fc, task = detect_training(df_clean, column)
     X = df_clean.drop(column, axis = 1)
     y = df_clean[column]
 
@@ -76,21 +80,25 @@ def train_baseline(dataset, dataset_name, token, column):
     print('Logging training')
     fc.fit(X, y)
     username = HfApi().whoami(token=token)["name"]
-    repo_url = create_repo(repo_id = f"{username}/{dataset_name}", token = token)
-
-    readme = f"---\nlicense: apache-2.0\nlibrary_name: sklearn\n---\n\n"
-    readme += f"## Baseline Model trained on {dataset_name} to predict {column}\n\n"
+    repo_url = create_repo(repo_id = f"{username}/{dataset_name}-{column}-{task}", token = token)
+    if task == "regression":
+        task_metadata = "tabular-regression"
+    else:
+        task_metadata = "tabular-classification"
+    readme = f"---\nlicense: apache-2.0\nlibrary_name: sklearn\ntags:\n- {task_metadata}\n- baseline-trainer\n---\n\n"
+    readme += f"## Baseline Model trained on {dataset_name} to apply {task} on {column}\n\n"
     readme+="**Metrics of the best model:**\n\n"
     for elem in str(fc.current_best_).split("\n"):
         readme+= f"{elem}\n\n"
     readme+= "\n\n**See model plot below:**\n\n"
     readme+= re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_)))
-    readme+= "\n\nThis model is trained with dabl library as a baseline, for better results, use [AutoTrain](https://huggingface.co/autotrain).\n\n"
+    readme+= "\n\n**Disclaimer:** This model is trained with dabl library as a baseline, for better results, use [AutoTrain](https://huggingface.co/autotrain).\n\n"
+    readme+= "**Logs of training** including the models tried in the process can be found in logs.txt"
     with open(f"{tmpdirname}/README.md", "w+") as f:
         f.write(readme)
     with open(f"{tmpdirname}/clf.pkl", mode="bw") as f:
         pickle.dump(fc, file=f)
-    upload_folder(repo_id =f"{username}/{dataset_name}", folder_path=tmpdirname, repo_type = "model", token=token, path_in_repo="./")
+    upload_folder(repo_id =f"{username}/{dataset_name}-{column}-{task}", folder_path=tmpdirname, repo_type = "model", token=token, path_in_repo="./")
 
     return f"Your model will be ready at {repo_url}"
```
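The `task` to card-tag branch above could equivalently be table-driven; a small illustrative restructuring (the committed code uses the `if`/`else` as shown):

```python
# Maps the detected task to the Hub model-card tag. A dict lookup raises
# KeyError on an unexpected task value, whereas the committed if/else silently
# falls back to "tabular-classification".
TASK_TO_TAG = {
    "regression": "tabular-regression",
    "classification": "tabular-classification",
}

task = "regression"                # e.g. the value detect_training returned
task_metadata = TASK_TO_TAG[task]  # -> "tabular-regression"
```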
 
```diff
@@ -107,10 +115,9 @@ with gr.Blocks() as demo:
     with gr.Column():
         title = gr.Markdown(""" ## Train a supervised baseline model""")
         description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
-        dataset = gr.File(label = "Dataset")
+        dataset = gr.File(label = "CSV Dataset")
         column = gr.Text(label = "Enter target variable:")
         pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
-        dataset_name = gr.Text(label = "Enter dataset name:")
         token = gr.Textbox(label = "Your Hugging Face Token")
         inference_run = gr.Button("Train")
         inference_progress = gr.StatusTracker(cover_container=True)
@@ -118,7 +125,7 @@ with gr.Blocks() as demo:
         outcome = gr.outputs.Textbox(label = "Progress")
         inference_run.click(
             train_baseline,
-            inputs=[dataset, dataset_name, token, column],
+            inputs=[dataset, token, column],
             outputs=outcome,
             status_tracker=inference_progress,
         )
@@ -127,18 +134,17 @@ with gr.Blocks() as demo:
     with gr.Column():
         title = gr.Markdown(""" ## Analyze Dataset """)
         description = gr.Markdown("Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity.")
-        dataset = gr.File(label = "Dataset")
+        dataset = gr.File(label = "CSV Dataset")
         column = gr.Text(label = "Compare dataset against a target variable (Optional)")
         pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
         token = gr.Textbox(label = "Your Hugging Face Token")
-        dataset_name = gr.Textbox(label = "Dataset Name")
         pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique repository name for your dataset report.")
         inference_run = gr.Button("Infer")
         inference_progress = gr.StatusTracker(cover_container=True)
         outcome = gr.outputs.Textbox()
         inference_run.click(
             analyze_datasets,
-            inputs=[dataset, dataset_name, token, column, pairwise],
+            inputs=[dataset, token, column, pairwise],
             outputs=outcome,
             status_tracker=inference_progress,
         )
```
 
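Taken together, every repository name is now derived from the uploaded file instead of a user-typed name. Purely illustrative values (hypothetical user and file, not from the commit):

```python
# A user "jane" uploads "adult.csv" and targets the "income" column;
# "adult.csv".strip(".csv") happens to yield "adult" intact, see the note above.
username, dataset_name, column, task = "jane", "adult", "income", "classification"

space_repo = f"{username}/{dataset_name}-report"            # "jane/adult-report"
model_repo = f"{username}/{dataset_name}-{column}-{task}"   # "jane/adult-income-classification"
```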