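# Non-code header text captured together with this file; kept as comments so
# that the module remains valid Python.
# MarkusStoll's picture
# navigate ready
# 5ee8932
# raw history blame
# No virus
# 2.36 kB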
import pickle
from renumics import spotlight
import os
import requests
import pandas as pd
from renumics import spotlight
from renumics.spotlight.analysis import DataIssue
def _rows_by_score(df: pd.DataFrame, mask: pd.Series, score_column: str) -> list:
    """Return index labels of the rows selected by *mask*, sorted by *score_column*.

    Args:
        df: Frame to select rows from.
        mask: Boolean selector applied as ``df[mask]``.
        score_column: Column used for the ascending sort of the selected rows.

    Returns:
        The index labels of the selected rows, in ascending score order.
    """
    return df[mask].sort_values(score_column).index.tolist()


def _build_issues(df: pd.DataFrame) -> "list[DataIssue]":
    """Build the three ``DataIssue`` entries passed to the Spotlight viewer.

    Reads the columns ``is_label_issue``/``label_score``, ``outlier_score`` and
    ``is_near_duplicate_issue``/``near_duplicate_score`` from *df*.

    Args:
        df: Dataset frame loaded from the cache file.

    Returns:
        ``[label_issue, outlier_issue, near_duplicate_issue]`` in that order.
    """
    label_issue = DataIssue(
        severity="medium",
        title="label-issue",
        rows=_rows_by_score(df, df["is_label_issue"], "label_score"),
        description="Label issue found by cleanlab - Review and correct if necessary",
    )
    outlier_issue = DataIssue(
        severity="medium",
        title="outlier-issue",
        # The 0.6 cut-off matches the threshold named in the description below.
        rows=_rows_by_score(df, df["outlier_score"] < 0.6, "outlier_score"),
        description="Outlier score < 0.6 - Review and remove or collect more data",
    )
    near_duplicate_issue = DataIssue(
        severity="medium",
        title="near-duplicate-issue",
        rows=_rows_by_score(
            df, df["is_near_duplicate_issue"], "near_duplicate_score"
        ),
        description="Near duplicate issue found by cleanlab - Review and remove if necessary",
    )
    return [label_issue, outlier_issue, near_duplicate_issue]


def main(cache_file: str = "dataset_cache.pkl") -> None:
    """Load the cached dataset and serve it in Spotlight until interrupted.

    If *cache_file* does not exist, a hint to run ``prepare.py`` is printed and
    the function returns without starting the viewer. Otherwise the viewer is
    started in an endless loop: whenever ``spotlight.show`` returns, the view
    is closed and a new one is opened.

    Args:
        cache_file: Path of the pickled dataset frame.
    """
    if not os.path.exists(cache_file):
        print(f"Dataset {cache_file} not found. Please run prepare.py first.")
        return

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load a cache file that comes from a trusted source.
    with open(cache_file, "rb") as file:
        df = pickle.load(file)
    print("Dataset loaded from cache.")

    issues = _build_issues(df)

    # Everything below is identical for every viewer start, so it is built
    # once here instead of on each loop iteration.
    dtypes = {
        "image": spotlight.Image,
        "image_full": spotlight.Image,
        "embedding": spotlight.Embedding,
        "embedding_reduced": spotlight.Embedding,
        "probabilities": spotlight.Embedding,
    }
    display_df = df.rename(
        columns={"fine_label_str": "label", "fine_label_prediction_str": "pred"}
    )

    # NOTE(review): presumably the loop keeps the viewer available after
    # `show` returns (e.g. once clients disconnect) - confirm against the
    # Spotlight `wait` semantics.
    while True:
        view = spotlight.show(
            display_df,
            dtype=dtypes,
            issues=issues,
            layout="layout.json",
            port=7860,
            host="0.0.0.0",
            allow_filebrowsing=False,
        )
        view.close()


if __name__ == "__main__":
    main()