# DL4NLP/dataset_preparation.py
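"""Prepare a multimodal (image + Q&A) chat dataset from Met Museum metadata.

The script downloads and resizes the artwork images referenced in the CSV,
turns each row's question/answer columns into chat-style message lists,
splits the samples into train/test/validation subsets, and saves each split
to disk with the Hugging Face `datasets` library.
"""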
import os
import shutil
from io import BytesIO

import pandas as pd
import requests
from PIL import Image
from datasets import Dataset
def modify_dataframe_and_extract_data(df):
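    """Convert DataFrame rows into chat-formatted samples.

    Up to four question/answer pairs per row become alternating
    user/assistant messages; the row's image (loaded from ``imagePath``)
    is attached to the first user turn.
    """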
    data_list = []
    for _, row in df.iterrows():
        messages = []
        for i in range(1, 5):
            user_question = row[f'Question{i}']
            user_answer = row[f'Answer{i}']
            # Skip empty cells; pd.notna() also guards against NaN, which is
            # truthy and would otherwise slip through a bare `if` check.
            if pd.notna(user_question) and user_question:
                message_content = [{'index': None, 'text': user_question, 'type': 'text'}]
                if i == 1:
                    # Attach the image placeholder to the first user turn only.
                    message_content.append({'index': 0, 'text': None, 'type': 'image'})
                messages.append({'content': message_content, 'role': 'user'})
            if pd.notna(user_answer) and user_answer:
                messages.append({'content': [{'index': None, 'text': user_answer, 'type': 'text'}],
                                 'role': 'assistant'})
        image = Image.open(row['imagePath'])
        data_list.append({'messages': messages, 'images': [image]})
    return {'messages': [data['messages'] for data in data_list],
            'images': [data['images'] for data in data_list]}
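# One sample produced above looks like this (illustrative values):
#   {'messages': [
#        {'content': [{'index': None, 'text': 'What is depicted?', 'type': 'text'},
#                     {'index': 0, 'text': None, 'type': 'image'}],
#         'role': 'user'},
#        {'content': [{'index': None, 'text': 'A marble bust.', 'type': 'text'}],
#         'role': 'assistant'},
#        ...],
#    'images': [<PIL.Image.Image>]}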
def download_and_resize_images(df, image_dir, target_size=(250, 250)):
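    """Download each row's ``primaryImageLink``, resize it, and save it locally.

    Returns a list of saved file paths aligned with the DataFrame rows; rows
    with a missing link or a failed download get ``None``.
    """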
    image_paths = []
    for _, row in df.iterrows():
        image_url = row['primaryImageLink']
        object_id = row['objectID']
        # pd.notna() guards against NaN links, which are truthy.
        if pd.notna(image_url) and image_url:
            # Name the file after the object ID
            filename = os.path.join(image_dir, f"{object_id}.jpg")
            # Download the image from the URL
            response = requests.get(image_url, timeout=30)
            if response.status_code == 200:
                # Open the image using PIL
                image = Image.open(BytesIO(response.content))
                # Resize the image and force RGB so it can be saved as
                # JPEG (source images may be RGBA or palette mode)
                image = image.resize(target_size)
                image = image.convert('RGB')
                # Save the resized image
                image.save(filename)
                image_paths.append(filename)
            else:
                print(f"Failed to download image from {image_url}")
                image_paths.append(None)
        else:
            image_paths.append(None)
    return image_paths
def split_data_dict(data_dict, train_ratio=0.7, test_ratio=0.2, val_ratio=0.1):
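    """Split the samples into train/test/validation dicts by the given ratios.

    With 250 samples and ratios 0.6/0.2/0.2, for example, the splits hold
    150/50/50 samples.
    """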
    # Compare with a tolerance: a strict `== 1.0` fails for sums such as
    # 0.6 + 0.2 + 0.2 because of floating-point rounding.
    assert abs(train_ratio + test_ratio + val_ratio - 1.0) < 1e-9, "Ratios must sum up to 1.0"
    total_samples = len(data_dict['messages'])
    train_size = int(total_samples * train_ratio)
    test_size = int(total_samples * test_ratio)
    train_data_dict = {
        'messages': data_dict['messages'][:train_size],
        'images': data_dict['images'][:train_size]
    }
    test_data_dict = {
        'messages': data_dict['messages'][train_size:train_size + test_size],
        'images': data_dict['images'][train_size:train_size + test_size]
    }
    # Validation takes the remainder, so the three slices never overlap
    # even when the ratios truncate unevenly.
    val_data_dict = {
        'messages': data_dict['messages'][train_size + test_size:],
        'images': data_dict['images'][train_size + test_size:]
    }
    return train_data_dict, test_data_dict, val_data_dict
def save_data_dict_as_arrow(data_dict, file_path):
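    """Convert the dict to a `datasets.Dataset` and save it to disk."""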
    # Convert the dictionary to a Dataset object
    dataset = Dataset.from_dict(data_dict)
    # Save the dataset to disk in Arrow format
    dataset.save_to_disk(file_path)
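# Note: despite the ".arrow" suffix used below, `save_to_disk()` writes a
# directory (Arrow data plus dataset metadata); `load_from_disk()` expects
# that same directory path.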
if __name__ == "__main__":
    # Example usage: build the sampled CSV from the full metadata dump.
    # df = pd.read_csv("/data/data_set_metmuseum.csv")
    # df1 = df[['objectID', 'primaryImageLink', 'Question1', 'Answer1', 'Question2', 'Answer2', 'Question3', 'Answer3', 'Question4', 'Answer4']]
    # df2 = df1.sample(frac=1)
    # df3 = df2.head(250)
    # df4 = df3.copy()
    df4 = pd.read_csv("sampled_data250.csv")
    paths = ['input_dataset', os.path.join('input_dataset', 'images'), 'output_dataset']
    for path in paths:
        os.makedirs(path, exist_ok=True)
    # Download and resize the images (uncomment on first run).
    image_dir = 'input_dataset/images'
    # image_paths = download_and_resize_images(df4, image_dir)
    # Update the DataFrame with the resized image paths:
    # df4['imagePath'] = image_paths
    # df4 = df4.drop(['primaryImageLink'], axis=1)
    # Build the chat-formatted samples (assumes the CSV already contains an
    # 'imagePath' column; otherwise run the download step above first).
    data_dict = modify_dataframe_and_extract_data(df4)
    # Split data_dict into train, test, and validation sets.
    train_data_dict, test_data_dict, val_data_dict = split_data_dict(
        data_dict, train_ratio=0.6, test_ratio=0.2, val_ratio=0.2)
    # Save each split as an Arrow dataset.
    save_data_dict_as_arrow(train_data_dict, os.path.join('output_dataset', 'train.arrow'))
    save_data_dict_as_arrow(test_data_dict, os.path.join('output_dataset', 'test.arrow'))
    save_data_dict_as_arrow(val_data_dict, os.path.join('output_dataset', 'val.arrow'))
    # Zip the input and output directories for easy download.
    shutil.make_archive("input_dataset", "zip", "input_dataset")
    shutil.make_archive("output_dataset", "zip", "output_dataset")
    # Read one split back from disk as a sanity check.
    test_data = Dataset.load_from_disk("output_dataset/test.arrow")
    print(test_data)
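    # Illustrative follow-up (field names follow the schema built above;
    # whether images decode back to PIL objects depends on the `datasets`
    # version's feature inference):
    # print(test_data[0]['messages'])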