Spaces:

michelecafagna26
/

High-Level-Dataset-explorer

Sleeping

Michele Cafagna

added purity and diversity scores

efe64cb over 1 year ago

No virus

3.09 kB

	import streamlit as st
	from datasets import load_dataset
	import numpy as np

	# Set the Streamlit page title.
	# NOTE(review): Streamlit's docs require set_page_config to run before any
	# other Streamlit command, so keep this as the first `st.*` call — confirm
	# against the pinned Streamlit version.
	st.set_page_config(page_title="High-Level dataset")

	# Annotation fields that can be shown for an image; main() offers them in a
	# multiselect and also uses the full list as the default selection.
	FIELDS = ["scene", "action", "rationale", "object"]
	# Question text rendered by write_obj() above a field's annotations when the
	# "Questions" option is enabled. There is deliberately no "object" entry:
	# write_obj() skips the lookup for that field.
	QS = {
	"scene": "Where is the picture taken?",
	"action": "What is the subject doing?",
	"rationale": "Why is the subject doing it?"
	}
	# Dataset splits offered by the sidebar selector in main().
	SPLITS = ["test", "train"]

	# Baseline subtracted from an item's purity score to compute the delta shown
	# by write_obj(). Presumably the dataset-wide average — provenance of the
	# value is not visible in this file.
	AVG_PURITY = 1.10

	# Baseline subtracted from an item's normalized diversity score to compute
	# the delta shown by write_obj(). Presumably the dataset-wide average on the
	# same normalized scale — TODO confirm.
	AVG_DIVERSITY = 0.872819
	# Bounds write_obj() uses to min-max normalize the raw diversity value.
	MIN_DIVERSITY = 0
	MAX_DIVERSITY = 100

	# NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
	# `st.cache_data` / `st.cache_resource`; confirm the pinned Streamlit version
	# before switching decorators.
	@st.cache
	def load_data(split):
	    """Load the HL dataset and build a COCO-id -> row-index lookup for one split.

	    Args:
	        split: Name of the split to index; must be a key of the object
	            returned by ``load_dataset`` (the app passes "test" or "train").

	    Returns:
	        A ``(dataset, coco2id)`` tuple. ``dataset`` is the full object
	        returned by ``load_dataset`` (all splits). ``coco2id`` maps the
	        integer parsed from each ``file_name`` in ``split`` to that
	        example's row index within the split.

	    Raises:
	        ValueError: If a file name is not an integer once the
	            ``COCO_train2014_`` and ``.jpg`` substrings are removed.
	    """
	    dataset = load_dataset("michelecafagna26/hl")

	    # Read the `file_name` column once instead of indexing the split row by
	    # row: `dataset[split][i]` materializes every column of the row
	    # (including the image) just to read a single string.
	    file_names = dataset[split]["file_name"]

	    coco2id = {
	        int(name.replace("COCO_train2014_", "").replace(".jpg", "")): row
	        for row, name in enumerate(file_names)
	    }

	    return dataset, coco2id


	def write_obj(dataset, img_id, options, split, list_type="num", show_questions=False,
	              show_conf=False):
	    """Render one dataset item: image, purity/diversity metrics and annotations.

	    Args:
	        dataset: Object indexable as ``dataset[split][img_id]`` that yields a
	            mapping with at least the keys ``image``, ``purity``,
	            ``diversity``, ``confidence`` and every name in ``options``.
	        img_id: Row index of the item inside ``dataset[split]``.
	        options: Iterable of field names whose annotations are displayed.
	        split: Name of the split the item is read from.
	        list_type: ``"num"`` numbers the annotations (1., 2., ...); any other
	            value is used verbatim as the prefix of every annotation.
	        show_questions: If true, print the question from ``QS`` above each
	            field other than ``"object"``.
	        show_conf: If true, show a confidence metric next to each annotation
	            of every field other than ``"object"``.

	    Returns:
	        None. Everything is written to the Streamlit page.
	    """
	    # Fetch the row once. Every `dataset[split][img_id]` access rebuilds the
	    # whole row (image included), and the original did so for each metric,
	    # each field and each confidence value.
	    item = dataset[split][img_id]

	    st.image(item['image'])

	    # Purity: mean over the per-key means; diversity: mean over the values.
	    purity = item['purity']
	    item_purity = np.mean([np.mean(purity[k]) for k in purity])
	    item_diversity = np.mean(list(item['diversity'].values()))

	    # Min-max normalize, then flip so that a larger raw value gives a
	    # smaller displayed score.
	    item_diversity = 1 - (item_diversity - MIN_DIVERSITY) / (MAX_DIVERSITY - MIN_DIVERSITY)

	    col1, col2 = st.columns(2)

	    col1.metric(label="Diversity score",
	                value=round(item_diversity, 2),
	                delta=round(item_diversity - AVG_DIVERSITY, 2),
	                help="Item's internal lexical diversity.\n Positive delta means higher then the average")

	    col2.metric(label="Purity score",
	                value=round(item_purity, 2),
	                delta=round(item_purity - AVG_PURITY, 2),
	                help="Item's internal semantic similarity.\n Positive delta means higher then the average")

	    for field in options:

	        st.markdown(f"## {field.capitalize()}")

	        # "object" has neither a question in QS nor confidence scores looked
	        # up here, so both extras are skipped for it.
	        has_extras = field != "object"

	        if show_questions and has_extras:
	            st.markdown(f" Question: _{QS[field]}_")

	        for n, annotation in enumerate(item[field]):

	            # One row per annotation: text on the left, confidence on the right.
	            text_col, conf_col = st.columns(2)

	            if list_type == "num":
	                text_col.markdown(f"{n + 1}. {annotation}")
	            else:
	                text_col.markdown(f"{list_type} {annotation}")

	            if show_conf and has_extras:
	                conf_col.metric(label="confidence score",
	                                value=item['confidence'][field][n])


	def main():
	    """Build the page: sidebar options, image picker, then the selected item."""
	    st.title('High-Level Dataset')

	    # Display options and split selection live in the sidebar.
	    show_questions = st.sidebar.checkbox('Questions')
	    show_conf = st.sidebar.checkbox('Confidence scores')
	    options = st.sidebar.multiselect('Choose the annotations', FIELDS, default=FIELDS)
	    split = st.sidebar.selectbox('Split', SPLITS)

	    dataset, coco2id = load_data(split)

	    # The image picker is placed on the main page and lists every COCO id
	    # known for the chosen split.
	    selected_coco_id = st.selectbox(
	        'Select an image',
	        list(coco2id),
	        help="write a key like: 7603",
	    )

	    # Translate the COCO id back to its row index before rendering.
	    row_index = coco2id[selected_coco_id]
	    write_obj(
	        dataset,
	        row_index,
	        options=options,
	        split=split,
	        list_type="num",
	        show_questions=show_questions,
	        show_conf=show_conf,
	    )


	# Run the app only when this file is executed as a script.
	if __name__ == "__main__":
	    main()