"""Launch a Renumics Spotlight viewer on the cached, pre-analysed dataset.

The dataset is read from ``dataset_cache.pkl`` in the current working
directory. If the file does not exist, a hint to run ``prepare.py`` is
printed and nothing else happens.
"""

import os
import pickle

import pandas as pd
import requests
from renumics import spotlight
from renumics.spotlight.analysis import DataIssue

# Pickled DataFrame read by this script (relative to the working directory).
CACHE_FILE = "dataset_cache.pkl"

# Rows whose "outlier_score" is strictly below this value are reported as
# outliers. The issue description text below repeats the value literally, so
# keep both in sync when changing it.
OUTLIER_THRESHOLD = 0.6


def _sorted_rows(df: pd.DataFrame, mask, score_column: str) -> list:
    """Return the index labels of the rows selected by *mask*, ordered by
    ascending *score_column*."""
    return df[mask].sort_values(score_column).index.tolist()


def _build_issues(df: pd.DataFrame) -> list:
    """Build the label, outlier and near-duplicate issues shown in the viewer.

    Reads the columns ``is_label_issue``, ``label_score``, ``outlier_score``,
    ``is_near_duplicate_issue`` and ``near_duplicate_score`` of *df*. The two
    ``is_*`` columns are used directly as boolean row masks.
    """
    label_issue = DataIssue(
        severity="medium",
        title="label-issue",
        rows=_sorted_rows(df, df["is_label_issue"], "label_score"),
        description="Label issue found by cleanlab - Review and correct if necessary",
    )
    outlier_issue = DataIssue(
        severity="medium",
        title="outlier-issue",
        rows=_sorted_rows(
            df, df["outlier_score"] < OUTLIER_THRESHOLD, "outlier_score"
        ),
        description="Outlier score < 0.6 - Review and remove or collect more data",
    )
    near_duplicate_issue = DataIssue(
        severity="medium",
        title="near-duplicate-issue",
        rows=_sorted_rows(
            df, df["is_near_duplicate_issue"], "near_duplicate_score"
        ),
        description="Near duplicate issue found by cleanlab - Review and remove if necessary",
    )
    return [label_issue, outlier_issue, near_duplicate_issue]


def main() -> None:
    """Load the cached dataset and keep a Spotlight viewer running on it."""
    # Guard clause: without the cache there is nothing to show.
    if not os.path.exists(CACHE_FILE):
        print(f"Dataset {CACHE_FILE} not found. Please run prepare.py first.")
        return

    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only point this at a cache file produced by a trusted process.
    with open(CACHE_FILE, "rb") as file:
        df = pickle.load(file)
    print("Dataset loaded from cache.")

    issues = _build_issues(df)

    # Column name -> Spotlight data type; identical for every viewer start,
    # so it is built once instead of on every loop pass.
    dtypes = {
        "image": spotlight.Image,
        "image_full": spotlight.Image,
        "embedding": spotlight.Embedding,
        "embedding_reduced": spotlight.Embedding,
        "probabilities": spotlight.Embedding,
    }

    # Start the viewer again each time show() returns.
    # NOTE(review): presumably show() blocks while the viewer is being
    # served - confirm against the installed Spotlight version.
    while True:
        view = spotlight.show(
            # The rename stays inside the loop so that every viewer start
            # receives a freshly created frame.
            df.rename(
                columns={
                    "fine_label_str": "label",
                    "fine_label_prediction_str": "pred",
                }
            ),
            dtype=dtypes,
            issues=issues,
            layout="layout.json",
            port=7860,
            host="0.0.0.0",
            allow_filebrowsing=False,
        )
        view.close()


if __name__ == "__main__":
    main()