navigate-data-issues

Runtime error

App Files Files Community

navigate-data-issues / run.py

MarkusStoll

navigate ready

5ee8932 about 1 year ago

raw

history blame

2.36 kB

	import pickle
	from renumics import spotlight
	import os
	import requests
	import pandas as pd
	from renumics import spotlight
	from renumics.spotlight.analysis import DataIssue



	# Default location of the pickled DataFrame (the "not found" message points to prepare.py).
	CACHE_FILE = "dataset_cache.pkl"

	# Rows whose "outlier_score" is strictly below this value are reported as outliers.
	# The same constant feeds the issue description so the two cannot drift apart.
	OUTLIER_SCORE_THRESHOLD = 0.6


	def _rows_sorted_by_score(df: pd.DataFrame, mask: pd.Series, score_column: str) -> list:
	    """Return index labels of the rows selected by *mask*, ordered by ascending *score_column*."""
	    return df[mask].sort_values(score_column).index.tolist()


	def _build_issues(df: pd.DataFrame) -> list:
	    """Build the label, outlier and near-duplicate ``DataIssue`` entries for *df*.

	    Expects the columns ``is_label_issue``, ``label_score``, ``outlier_score``,
	    ``is_near_duplicate_issue`` and ``near_duplicate_score`` to be present;
	    a missing column raises ``KeyError``.
	    """
	    label_issue = DataIssue(
	        severity="medium",
	        title="label-issue",
	        rows=_rows_sorted_by_score(df, df["is_label_issue"], "label_score"),
	        description="Label issue found by cleanlab - Review and correct if necessary",
	    )
	    outlier_issue = DataIssue(
	        severity="medium",
	        title="outlier-issue",
	        rows=_rows_sorted_by_score(
	            df, df["outlier_score"] < OUTLIER_SCORE_THRESHOLD, "outlier_score"
	        ),
	        description=(
	            f"Outlier score < {OUTLIER_SCORE_THRESHOLD}"
	            " - Review and remove or collect more data"
	        ),
	    )
	    near_duplicate_issue = DataIssue(
	        severity="medium",
	        title="near-duplicate-issue",
	        rows=_rows_sorted_by_score(
	            df, df["is_near_duplicate_issue"], "near_duplicate_score"
	        ),
	        description="Near duplicate issue found by cleanlab - Review and remove if necessary",
	    )
	    return [label_issue, outlier_issue, near_duplicate_issue]


	def main(cache_file: str = CACHE_FILE) -> None:
	    """Load the cached dataset and serve it in Spotlight until the process is killed.

	    Args:
	        cache_file: Path of the pickled DataFrame to display.

	    If *cache_file* does not exist, a hint is printed and the function
	    returns without starting the viewer. Otherwise this function does not
	    return: the viewer is restarted in an endless loop.
	    """
	    if not os.path.exists(cache_file):
	        print(f"Dataset {cache_file} not found. Please run prepare.py first.")
	        return

	    # NOTE(security): unpickling can execute arbitrary code. Only load cache
	    # files that were produced locally by a trusted process.
	    with open(cache_file, "rb") as file:
	        df = pickle.load(file)
	    print("Dataset loaded from cache.")

	    # Everything below is invariant across viewer restarts, so it is built
	    # once instead of on every pass of the loop.
	    issues = _build_issues(df)
	    dtypes = {
	        "image": spotlight.Image,
	        "image_full": spotlight.Image,
	        "embedding": spotlight.Embedding,
	        "embedding_reduced": spotlight.Embedding,
	        "probabilities": spotlight.Embedding,
	    }
	    display_df = df.rename(
	        columns={"fine_label_str": "label", "fine_label_prediction_str": "pred"}
	    )

	    # Restart the viewer whenever show() returns so the app stays reachable.
	    # NOTE(review): show() presumably blocks in script mode until the viewer
	    # is no longer in use - confirm against the Spotlight docs.
	    while True:
	        view = spotlight.show(
	            display_df,
	            dtype=dtypes,
	            issues=issues,
	            layout="layout.json",
	            port=7860,
	            host="0.0.0.0",
	            allow_filebrowsing=False,
	        )
	        view.close()


	if __name__ == "__main__":
	    main()