import gradio as gr
import os
import threading
import random
from datasets import load_dataset, Dataset, Features, Value, concatenate_datasets
from huggingface_hub import login
import json
import re
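
# Gradio annotation app: serves images from a local `images/` folder, collects
# Egyptian Arabic ("Masri") captions from annotators, and appends each caption
# as a new row to a Hugging Face dataset. Images whose ids appear in
# nearest_neighbors_with_captions.json form the dev split and are captioned
# twice; all other images form the train split and are captioned once.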

# reset = False

def check_word_count(caption):
    """Enable the submit button only when the caption has 3 or more words."""
    return gr.update(interactive=len(caption.split()) >= 3)

# Authenticate with Hugging Face
token = os.getenv("HUGGINGFACE_TOKEN")
if token:
    login(token=token)
else:
    print("HUGGINGFACE_TOKEN environment variable not set.")

dataset_name = "GeorgeIbrahim/EGYCOCO"  # Replace with your dataset name

# Load the precomputed nearest-neighbor captions
with open('nearest_neighbors_with_captions.json', 'r') as f:
    results = json.load(f)
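
# Assumed structure of nearest_neighbors_with_captions.json, inferred from the
# lookups in get_caption_for_image_id below (keys and captions here are
# illustrative, not taken from the actual file):
# {
#     "111367": {
#         "caption": "reference caption for this dev image",
#         "nearest_neighbors": [
#             {"image_id": "205289", "caption": "caption of a neighboring train image"},
#             ...
#         ]
#     },
#     ...
# }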

# Load or create the dataset
try:
    dataset = load_dataset(dataset_name, split="train")
    dataset = dataset.filter(lambda example: example["image_id"] != "COCO_val2014_000000111367.jpg")
    print("Loaded existing dataset:", dataset)
    print("Dataset features:", dataset.features)  # Check if 'split' is part of features

    # Check if the 'split' column exists; if not, add it
    if 'split' not in dataset.column_names:
        # Define the 'split' values based on `image_id`. `results` is keyed by
        # the zero-stripped numeric id, so extract it from the filename first
        # (this mirrors the extraction in save_annotation below).
        split_values = []
        for example in dataset:
            m = re.search(r'_(\d+)\.', example["image_id"])
            numeric_id = m.group(1).lstrip('0') if m else ""
            split_values.append("dev" if numeric_id in results else "train")
        # Add 'split' column to the dataset
        dataset = dataset.add_column("split", split_values)
        print("Added 'split' column to dataset.")
    else:
        print("'split' column already exists.")

    # Track the highest annotation count seen for each image
    annotation_counts = {}
    for example in dataset:
        image_id = example["image_id"]
        count = example["annotation_count"]
        if image_id not in annotation_counts or count > annotation_counts[image_id]:
            annotation_counts[image_id] = count
    print("Annotation counts:", annotation_counts)
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Create an empty dataset if it doesn't exist
    features = Features({
        'image_id': Value(dtype='string'),
        'caption': Value(dtype='string'),
        'annotation_count': Value(dtype='int32'),
        'split': Value(dtype='string')
    })
    dataset = Dataset.from_dict({'image_id': [], 'caption': [], 'annotation_count': [], 'split': []}, features=features)
    annotation_counts = {}
    dataset.push_to_hub(dataset_name)  # Push the empty dataset to Hugging Face

# Initialize or reset data as needed based on the `reset` flag
# if reset:
#     # Clear the annotation counts
#     annotation_counts = {}
#     shown_counts = {}  # If you are tracking shown counts separately for images
#     # Optionally, clear or reinitialize the dataset
#     features = Features({
#         'image_id': Value(dtype='string'),
#         'caption': Value(dtype='string'),
#         'annotation_count': Value(dtype='int32'),
#         'split': Value(dtype='string')
#     })
#     dataset = Dataset.from_dict({
#         'image_id': [],
#         'caption': [],
#         'annotation_count': [],
#         'split': []
#     }, features=features)
#     # Push the reset dataset to Hugging Face or perform other necessary actions
#     dataset.push_to_hub(dataset_name)
#     print("Data has been reset.")

image_folder = "images"
image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
len_files = len(image_files)
lock = threading.Lock()
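
# The lock serializes access to the shared annotation_counts dict and the
# global dataset, since Gradio may serve concurrent annotators on separate
# threads.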

def get_caption_for_image_id(image_path):
    """
    Retrieve the caption for a given image_id from the JSON data.
    """
    # Extract the numeric part of the image ID
    match = re.search(r'_(\d+)\.', image_path)
    if not match:
        print("Could not extract an image_id from:", image_path)  # Debugging line
        return None
    image_id = match.group(1).lstrip('0')  # Remove leading zeros
    print("Searching for image_id:", image_id)  # Debugging line
    # Check if image_id is a test image
    if image_id in results:
        print("Found caption in results:", results[image_id]["caption"])  # Debugging line
        return results[image_id]["caption"]
    # If image_id is not a test image, search in nearest neighbors
    for test_image_data in results.values():
        for neighbor in test_image_data["nearest_neighbors"]:
            if neighbor["image_id"] == image_id:
                print("Found caption in nearest neighbors:", neighbor["caption"])  # Debugging line
                return neighbor["caption"]
    # Return None if the image_id is not found
    print("Caption not found for image_id:", image_id)  # Debugging line
    return None
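
# Example (filename format as used in the dataset filter above):
#     get_caption_for_image_id("COCO_val2014_000000111367.jpg")
# returns the stored caption string, or None if the numeric id is unknown.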

# Function to get a random image that hasn't been fully annotated
def get_next_image(session_data):
    with lock:
        # Collect the images that still need annotations
        available_images = []
        for img in image_files:
            # Match and extract the image_id from the filename
            match = re.search(r'_(\d+)\.', img)
            if not match:
                continue
            image_id_2 = match.group(1).lstrip('0')  # Remove leading zeros
            count = annotation_counts.get(img, 0)
            # Dev images need two annotations; train images need one
            if (image_id_2 in results and count < 2) or (image_id_2 not in results and count == 0):
                available_images.append(img)
        # print("Available images:", available_images)  # Debugging line
        print(f"Remaining images: {len(available_images)} of {len_files}")
        # random.shuffle(available_images)
        # Assign a new random image only if the user doesn't already have one
        if session_data["current_image"] is None and available_images:
            session_data["current_image"] = random.choice(available_images)
        # print("Current image_id:", session_data["current_image"])  # Debugging line
        return os.path.join(image_folder, session_data["current_image"]) if session_data["current_image"] else None
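
# Note on the availability filter above: images whose numeric id appears in
# `results` (the dev split) are served until they have two captions; all other
# images (the train split) are served until they have one. Counts are keyed by
# filename in annotation_counts.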

# Function to save the annotation to the Hugging Face dataset and fetch the next image
def save_annotation(caption, session_data):
    global dataset, annotation_counts  # Declare globals at the start of the function
    if session_data["current_image"] is None:
        return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
    with lock:
        image_id = session_data["current_image"]
        match = re.search(r'_(\d+)\.', image_id)
        image_2 = match.group(1).lstrip('0') if match else ""
        split = "dev" if image_2 in results else "train"
        # Save the caption, or "skipped" if the user typed "skip"
        if caption.strip().lower() == "skip":
            caption = "skipped"
        # Get the current annotation count
        annotation_count = annotation_counts.get(image_id, 0)
        # Add the new annotation as a new row to the dataset
        new_data = Dataset.from_dict({
            "image_id": [image_id],
            "caption": [caption],
            "annotation_count": [annotation_count + 1],
            "split": [split]
        }, features=Features({
            'image_id': Value(dtype='string'),
            'caption': Value(dtype='string'),
            'annotation_count': Value(dtype='int32'),
            'split': Value(dtype='string')
        }))
        # Update the annotation count in the dictionary
        annotation_counts[image_id] = annotation_count + 1
        # Concatenate with the existing dataset, drop empty captions, and push to Hugging Face
        dataset = concatenate_datasets([dataset, new_data])
        dataset = dataset.filter(lambda example: example['caption'].strip() != "")
        dataset.push_to_hub(dataset_name)
        print("Pushed updated dataset")
        # # Clear the user's current image only once it has enough annotations
        # if (split == "train" and annotation_count > 1) or (split == "dev" and annotation_count > 2):
        session_data["current_image"] = None
    # Fetch the next image; get_next_image acquires the (non-reentrant) lock
    # itself, so it must be called after the with-block releases it
    next_image = get_next_image(session_data)
    if next_image:
        next_caption = get_caption_for_image_id(os.path.basename(next_image))  # Retrieve the caption for the new image
        print("Next image_id:", os.path.basename(next_image))  # Debugging line
        return gr.update(value=next_image), gr.update(value=""), gr.update(value=next_caption or "")
    else:
        return gr.update(visible=False), gr.update(value="All images have been annotated!"), gr.update(value="")
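
# Design note: every submission re-uploads the full concatenated dataset via
# push_to_hub. That keeps the Hub copy authoritative at all times, but it gets
# slower as the dataset grows; batching pushes would be a possible optimization.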

def initialize_interface(session_data):
    next_image = get_next_image(session_data)
    if next_image:
        next_caption = get_caption_for_image_id(os.path.basename(next_image))  # Retrieve the caption for the initial image
        print("Initial image_id:", os.path.basename(next_image))  # Debugging line
        return gr.update(value=next_image), gr.update(value=next_caption or "")
    else:
        return gr.update(visible=False), gr.update(value="All images have been annotated!")

# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Image Captioning Tool")
    gr.Markdown("Please provide your caption in Egyptian Arabic 'Masri'")
    session_data = gr.State({"current_image": None})  # Session-specific state
    with gr.Row():
        image = gr.Image()
        caption = gr.Textbox(placeholder="Enter caption here...")
        existing_caption = gr.Textbox(label="Existing Caption", interactive=False)  # Display the existing caption
    submit = gr.Button("Submit", interactive=False)  # Initially disabled
    # Enable/disable the submit button based on word count
    caption.change(fn=check_word_count, inputs=caption, outputs=submit)
    # Define actions for buttons
    submit.click(fn=save_annotation, inputs=[caption, session_data], outputs=[image, caption, existing_caption])
    # Load the initial image
    demo.load(fn=initialize_interface, inputs=session_data, outputs=[image, existing_caption])

demo.launch(share=True)
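
# Note: share=True matters for local runs (it requests a public gradio.live
# link); when hosted on Hugging Face Spaces the app is already publicly
# reachable, so the flag is effectively redundant there.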