MarkusStoll's picture
Duplicate from renumics/cifar10-cleanlab
c6a85a5
raw history blame
No virus
1.94 kB
import pickle
from renumics import spotlight
import os
import requests
import pandas as pd
from renumics import spotlight
from renumics.spotlight.analysis import DataIssue
# Rows whose "outlier_score" is strictly below this value are reported as outliers.
OUTLIER_SCORE_THRESHOLD = 0.6


def _issue_rows(df, mask, score_column):
    """Return the index labels of the rows selected by *mask*, ordered by score.

    Args:
        df: Data frame holding the dataset.
        mask: Boolean selector applied as ``df[mask]``.
        score_column: Column used to sort the selected rows in ascending order.

    Returns:
        The index labels of the selected rows as a plain list.
    """
    return df[mask].sort_values(score_column).index.tolist()


def _build_issues(df):
    """Create the three data issues (label, outlier, near-duplicate) for *df*.

    Reads the columns "is_label_issue", "label_score", "outlier_score",
    "is_near_duplicate_issue" and "near_duplicate_score".

    Returns:
        A list ``[label_issue, outlier_issue, near_duplicate_issue]``.
    """
    label_issue = DataIssue(
        severity="medium",
        title="label-issue",
        rows=_issue_rows(df, df["is_label_issue"], "label_score"),
        description="Label issue found by cleanlab",
    )
    outlier_issue = DataIssue(
        severity="medium",
        title="outlier-issue",
        rows=_issue_rows(
            df, df["outlier_score"] < OUTLIER_SCORE_THRESHOLD, "outlier_score"
        ),
        # Derived from the constant so the text cannot drift from the filter.
        description=f"Outlier score < {OUTLIER_SCORE_THRESHOLD}",
    )
    near_duplicate_issue = DataIssue(
        severity="medium",
        title="near-duplicate-issue",
        rows=_issue_rows(
            df, df["is_near_duplicate_issue"], "near_duplicate_score"
        ),
        description="Near duplicate issue found by cleanlab",
    )
    return [label_issue, outlier_issue, near_duplicate_issue]


def main():
    """Load the cached dataset and serve it with Spotlight.

    If the cache file does not exist, a hint is printed and the function
    returns without starting the viewer.
    """
    cache_file = "dataset_cache.pkl"
    if not os.path.exists(cache_file):
        print(f"Dataset {cache_file} not found. Please run prepare.py first.")
        return

    # Load dataset from cache.
    # SECURITY: pickle.load can execute arbitrary code from the file it reads.
    # This presumably only ever reads the cache written by prepare.py; do not
    # point it at files from untrusted sources.
    with open(cache_file, "rb") as file:
        df = pickle.load(file)
    print("Dataset loaded from cache.")

    # Issues are computed before the "full_image" column is dropped, exactly
    # as in the original statement order.
    issues = _build_issues(df)
    df = df.drop(["full_image"], axis=1)

    # Loop-invariant, so it is built once instead of on every restart.
    dtypes = {
        "image": spotlight.Image,
        "embedding": spotlight.Embedding,
        "embedding_reduced": spotlight.Embedding,
        "probabilities": spotlight.Embedding,
    }

    # NOTE(review): presumably show() blocks until the viewer stops; the loop
    # then closes it and starts a fresh one so the app stays up - confirm.
    while True:
        view = spotlight.show(
            df,
            dtype=dtypes,
            issues=issues,
            layout="layout.json",
            port=7860,
            host="0.0.0.0",
            allow_filebrowsing=False,
        )
        view.close()


if __name__ == "__main__":
    main()