# easy-analysis / app.py
# merve (HF staff): "Update app.py" (commit c163a56)
import gradio as gr
import pandas as pd
from huggingface_hub.hf_api import create_repo, upload_file, HfApi
from huggingface_hub.repository import Repository
import subprocess
import os
import tempfile
import sweetviz as sv
def analyze_datasets(dataset, dataset_name, token, column=None, pairwise="off"):
    """Analyze an uploaded CSV with Sweetviz and publish the report as a static Space.

    Args:
        dataset: Uploaded file object; only its ``name`` attribute (the path
            of the CSV file) is read.
        dataset_name: Name of the Space created under the token owner's account.
        token: Hugging Face Hub token used for ``whoami``, repo creation and
            the uploads.
        column: Optional target column. ``None`` or a blank string means
            "no target feature".
        pairwise: Forwarded to Sweetviz as ``pairwise_analysis``. ``None``
            (e.g. no option picked in the UI) falls back to ``"off"``.

    Returns:
        A message containing the value returned by ``create_repo``.

    Raises:
        ValueError: If the report name is blank, no file was uploaded, or the
            requested target column is not present in the CSV.
    """
    # Validate cheap inputs before doing any file or network I/O.
    if not dataset_name or not dataset_name.strip():
        raise ValueError("Please provide a name for your dataset report.")
    dataset_name = dataset_name.strip()
    if dataset is None:
        raise ValueError("Please upload a CSV file to analyze.")

    # An untouched text box arrives as an empty string rather than None, so a
    # plain `is not None` test would forward "" to Sweetviz as the target.
    target = column.strip() if isinstance(column, str) else column
    if not target:
        target = None
    else:
        target = column
    pairwise = pairwise or "off"

    df = pd.read_csv(dataset.name)
    if target is not None and target not in df.columns:
        raise ValueError(f"Column {target!r} was not found in the dataset.")

    username = HfApi().whoami(token=token)["name"]
    repo_id = f"{username}/{dataset_name}"

    analyze_options = {"pairwise_analysis": pairwise}
    if target is not None:
        analyze_options["target_feat"] = target
    analyze_report = sv.analyze(df, **analyze_options)

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"

    # Work in a private temporary directory: writing ./index.html and
    # ./README.md in the shared working directory lets concurrent requests
    # overwrite each other's files before they are uploaded.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        analyze_report.show_html(report_path, open_browser=False)
        repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)
        upload_file(path_or_fileobj=report_path, path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)

        # The README contains an emoji; be explicit about the encoding so the
        # write does not depend on the platform's default locale.
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)
        upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"
def compare_column_values(dataset, dataset_name, token, column, category):
    """Compare rows where ``column`` equals ``category`` against all other rows.

    The uploaded CSV is split into two subsets with Sweetviz's
    ``compare_intra`` and the resulting report is published as a static Space.

    Args:
        dataset: Uploaded file object; only its ``name`` attribute (the path
            of the CSV file) is read.
        dataset_name: Name of the Space created under the token owner's account.
        token: Hugging Face Hub token used for ``whoami``, repo creation and
            the uploads.
        column: Name of the column whose values define the two subsets.
        category: Value of ``column`` selecting the first subset. It is
            compared as text because it is typed into a text box.

    Returns:
        A message containing the value returned by ``create_repo``.

    Raises:
        ValueError: If the report name, column or category is blank, no file
            was uploaded, the column is missing from the CSV, or the category
            matches none or all of the rows (no two subsets to compare).
    """
    # Validate cheap inputs before doing any file or network I/O.
    if not dataset_name or not dataset_name.strip():
        raise ValueError("Please provide a name for your dataset report.")
    dataset_name = dataset_name.strip()
    if not column or not str(column).strip():
        raise ValueError("Please enter the column to compare on.")
    if category is None or str(category) == "":
        raise ValueError("Please enter the category to compare.")
    if dataset is None:
        raise ValueError("Please upload a CSV file to analyze.")

    df = pd.read_csv(dataset.name)
    if column not in df.columns:
        raise ValueError(f"Column {column!r} was not found in the dataset.")

    # Compare as text: the category is always a string, so a direct `==`
    # against a numeric column would never match any row.
    category_label = str(category)
    in_category = df[column].astype(str) == category_label
    if not in_category.any():
        raise ValueError(f"No rows have {column!r} equal to {category_label!r}.")
    if in_category.all():
        raise ValueError(f"Every row has {column!r} equal to {category_label!r}; nothing to compare against.")

    username = HfApi().whoami(token=token)["name"]
    repo_id = f"{username}/{dataset_name}"

    # compare_intra expects a pair of display names (matching rows first, the
    # rest second); passing a single value makes it index into that value.
    subset_names = [category_label, f"Not {category_label}"]
    compare_report = sv.compare_intra(df, in_category, subset_names)

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"

    # Work in a private temporary directory: writing ./index.html and
    # ./README.md in the shared working directory lets concurrent requests
    # overwrite each other's files before they are uploaded.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        compare_report.show_html(report_path, open_browser=False)
        repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)
        upload_file(path_or_fileobj=report_path, path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)

        # The README contains an emoji; be explicit about the encoding so the
        # write does not depend on the platform's default locale.
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)
        upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"
def compare_dataset_splits(dataset, dataset_name, token, splits):
    """Randomly split an uploaded CSV in two and publish a Sweetviz comparison.

    A fraction ``splits`` of the rows is sampled as "Training Data"; the
    remaining rows become "Test Data". The comparison report is published as
    a static Space.

    Args:
        dataset: Uploaded file object; only its ``name`` attribute (the path
            of the CSV file) is read.
        dataset_name: Name of the Space created under the token owner's account.
        token: Hugging Face Hub token used for ``whoami``, repo creation and
            the uploads.
        splits: Fraction of rows to put in the training split, strictly
            between 0 and 1 (e.g. ``0.8``).

    Returns:
        A message containing the value returned by ``create_repo``.

    Raises:
        ValueError: If the report name is blank, no file was uploaded,
            ``splits`` is missing or not strictly between 0 and 1, or the
            dataset is too small to leave rows on both sides of the split.
    """
    # Validate cheap inputs before doing any file or network I/O.
    if not dataset_name or not dataset_name.strip():
        raise ValueError("Please provide a name for your dataset report.")
    dataset_name = dataset_name.strip()
    # A missing or out-of-range fraction cannot produce a meaningful
    # train/test split, so reject it up front. NaN also fails this test.
    if splits is None or not 0 < splits < 1:
        raise ValueError("Split ratio must be a fraction strictly between 0 and 1, e.g. 0.8.")
    if dataset is None:
        raise ValueError("Please upload a CSV file to analyze.")

    df = pd.read_csv(dataset.name)
    train = df.sample(frac=splits)
    # Everything that was not sampled into the training split.
    test = df.loc[df.index.difference(train.index)]
    if train.empty or test.empty:
        raise ValueError("The dataset is too small to split with this ratio.")

    username = HfApi().whoami(token=token)["name"]
    repo_id = f"{username}/{dataset_name}"

    compare_report = sv.compare([train, "Training Data"], [test, "Test Data"])

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"

    # Work in a private temporary directory: writing ./index.html and
    # ./README.md in the shared working directory lets concurrent requests
    # overwrite each other's files before they are uploaded.
    with tempfile.TemporaryDirectory() as workdir:
        report_path = os.path.join(workdir, "index.html")
        readme_path = os.path.join(workdir, "README.md")

        compare_report.show_html(report_path, open_browser=False)
        repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)
        upload_file(path_or_fileobj=report_path, path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)

        # The README contains an emoji; be explicit about the encoding so the
        # write does not depend on the platform's default locale.
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)
        upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"
# Build the Gradio UI: three tabs, each collecting a CSV upload, the options
# for one kind of analysis, a Hub token and a report name, and wiring its
# button to one of the analysis functions defined above.
# NOTE(review): the original indentation of this section was lost, so the
# nesting of the Row/Column containers below is a reconstruction — confirm it
# against the intended layout.
# NOTE(review): gr.StatusTracker, the status_tracker= click argument and
# gr.outputs.Textbox look tied to a specific Gradio release — confirm the
# pinned version before upgrading.
with gr.Blocks() as demo:
    main_title = gr.Markdown("""# Easy Analysis🪄🌟✨""")
    main_desc = gr.Markdown("""This app enables you to run three type of dataset analysis and pushes the interactive reports to your Hugging Face Hub profile as a Space. It uses SweetViz in the back.""")
    with gr.Tabs():
        # Tab 1: whole-dataset analysis, optionally against a target column.
        with gr.TabItem("Analyze") as analyze:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Analyze Dataset """)
                    description = gr.Markdown("Analyze a dataset or predictive variables against a target variable in a dataset (enter a column name to column section if you want to compare against target value). You can also do pairwise analysis, but it has quadratic complexity.")
                    dataset = gr.File(label = "Dataset")
                    column = gr.Text(label = "Compare dataset against a target variable (Optional)")
                    pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
                    # NOTE(review): the token box is created without any masking
                    # option, so the token is presumably shown in clear text.
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    dataset_name = gr.Textbox(label = "Dataset Name")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
                    inference_run = gr.Button("Infer")
                    inference_progress = gr.StatusTracker(cover_container=True)
                    outcome = gr.outputs.Textbox()
                    # Input order must match analyze_datasets' parameter order.
                    inference_run.click(
                        analyze_datasets,
                        inputs=[dataset, dataset_name, token, column, pairwise],
                        outputs=outcome,
                        status_tracker=inference_progress,
                    )
        # Tab 2: random train/test split comparison.
        # The component variable names (dataset, token, ...) are rebound in
        # each tab; every click handler is wired right after its own tab's
        # components are created, so it receives that tab's components.
        with gr.TabItem("Compare Splits") as compare_splits:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Compare Splits""")
                    description = gr.Markdown("Split a dataset and compare splits. You need to give a fraction, e.g. 0.8.")
                    dataset = gr.File(label = "Dataset")
                    split_ratio = gr.Number(label = "Split Ratios")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    dataset_name = gr.Textbox(label = "Dataset Name")
                    inference_run = gr.Button("Infer")
                    inference_progress = gr.StatusTracker(cover_container=True)
                    outcome = gr.outputs.Textbox()
                    # Input order must match compare_dataset_splits' parameter order.
                    inference_run.click(
                        compare_dataset_splits,
                        inputs=[dataset, dataset_name, token, split_ratio],
                        outputs=outcome,
                        status_tracker=inference_progress,
                    )
        # Tab 3: compare the rows of one category of a column against the rest.
        with gr.TabItem("Compare Subsets") as compare_subsets:
            with gr.Row():
                with gr.Column():
                    title = gr.Markdown(""" ## Compare Subsets""")
                    description = gr.Markdown("Compare subsets of a dataset, e.g. you can pick Age Group column and compare adult category against young.")
                    dataset = gr.File(label = "Dataset")
                    column = gr.Text(label = "Enter column:")
                    category = gr.Text(label = "Enter category:")
                    pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
                    token = gr.Textbox(label = "Your Hugging Face Token")
                    dataset_name = gr.Textbox(label = "Dataset Name")
                    inference_run = gr.Button("Run Analysis")
                    inference_progress = gr.StatusTracker(cover_container=True)
                    outcome = gr.outputs.Textbox()
                    # Input order must match compare_column_values' parameter order.
                    inference_run.click(
                        compare_column_values,
                        inputs=[dataset, dataset_name, token, column, category ],
                        outputs=outcome,
                        status_tracker=inference_progress,
                    )
# Start the app; runs at import time because there is no __main__ guard.
demo.launch(debug=True)