# Build a VQA-style dataset from Met Museum CSV metadata: download and
# resize artwork images, pair each with its question/answer turns, and
# save train/test/val splits in Hugging Face Arrow format.
import math
import os
import shutil
from io import BytesIO

import pandas as pd
import requests
from PIL import Image

from datasets import Dataset
def modify_dataframe_and_extract_data(df):
    """Convert a Q&A dataframe into a ``{'messages': ..., 'images': ...}`` dict
    suitable for ``datasets.Dataset.from_dict``.

    Each row must provide ``Question1``..``Question4``, ``Answer1``..``Answer4``
    and an ``imagePath`` column pointing at a local image file. The image
    placeholder is attached only to the first user turn.

    Args:
        df: pandas DataFrame with the columns described above.

    Returns:
        dict with parallel lists: ``messages`` (chat turns per row) and
        ``images`` (one-element PIL image list per row).
    """
    data_list = []
    for _, row in df.iterrows():
        messages = []
        for i in range(1, 5):
            question = row[f'Question{i}']
            answer = row[f'Answer{i}']
            # pd.notna guards against NaN from empty CSV cells: NaN is truthy,
            # so a bare `if question:` would emit bogus turns for missing Q/As.
            if pd.notna(question) and question:
                content = [{'index': None, 'text': question, 'type': 'text'}]
                if i == 1:
                    # Only the first user message carries the image slot.
                    content.append({'index': 0, 'text': None, 'type': 'image'})
                messages.append({'content': content, 'role': 'user'})
            if pd.notna(answer) and answer:
                messages.append({'content': [{'index': None, 'text': answer, 'type': 'text'}], 'role': 'assistant'})
        image = Image.open(row['imagePath'])
        data_list.append({'messages': messages, 'images': [image]})
    return {
        'messages': [d['messages'] for d in data_list],
        'images': [d['images'] for d in data_list],
    }
def download_and_resize_images(df, image_dir, target_size=(250, 250)):
    """Download every row's ``primaryImageLink``, resize it and save it as
    ``<image_dir>/<objectID>.jpg``.

    Args:
        df: DataFrame with ``primaryImageLink`` and ``objectID`` columns.
        image_dir: existing directory to write the resized JPEGs into.
        target_size: (width, height) passed to ``PIL.Image.resize``.

    Returns:
        list aligned with ``df``'s rows: the saved file path, or ``None``
        when the row had no usable URL or the download failed.
    """
    image_paths = []
    for _, row in df.iterrows():
        image_url = row['primaryImageLink']
        object_id = row['objectID']
        # pd.notna: empty CSV cells arrive as NaN, which is truthy and would
        # otherwise be passed to requests.get as a bogus URL.
        if pd.notna(image_url) and image_url:
            filename = os.path.join(image_dir, f"{object_id}.jpg")
            try:
                # A timeout keeps one dead server from hanging the whole loop;
                # catching RequestException keeps one bad URL from killing it.
                response = requests.get(image_url, timeout=30)
            except requests.RequestException as exc:
                print(f"Failed to download image from {image_url}: {exc}")
                image_paths.append(None)
                continue
            if response.status_code == 200:
                # Context manager ensures the decoded image is closed promptly.
                with Image.open(BytesIO(response.content)) as image:
                    image.resize(target_size).save(filename)
                image_paths.append(filename)
            else:
                print(f"Failed to download image from {image_url}")
                image_paths.append(None)
        else:
            image_paths.append(None)
    return image_paths
def split_data_dict(data_dict, train_ratio=0.7, test_ratio=0.2, val_ratio=0.1):
    """Split a ``{'messages': [...], 'images': [...]}`` dict into train/test/val.

    Args:
        data_dict: dict with equal-length ``messages`` and ``images`` lists.
        train_ratio, test_ratio, val_ratio: fractions that must sum to 1.0.

    Returns:
        (train_dict, test_dict, val_dict), each with the same two keys.

    Raises:
        ValueError: if the ratios do not sum to 1.0.
    """
    # math.isclose, not ==: exact float equality rejects the DEFAULT ratios,
    # since 0.7 + 0.2 + 0.1 != 1.0 in binary floating point. A raised
    # ValueError also survives `python -O`, unlike the old assert.
    if not math.isclose(train_ratio + test_ratio + val_ratio, 1.0):
        raise ValueError("Ratios must sum up to 1.0")
    total_samples = len(data_dict['messages'])
    train_size = int(total_samples * train_ratio)
    test_size = int(total_samples * test_ratio)
    cut = train_size + test_size
    train_data_dict = {
        'messages': data_dict['messages'][:train_size],
        'images': data_dict['images'][:train_size],
    }
    test_data_dict = {
        'messages': data_dict['messages'][train_size:cut],
        'images': data_dict['images'][train_size:cut],
    }
    # Validation takes the remainder so every sample lands in exactly one
    # split. The old `[-val_size:]` could overlap the test slice from rounding
    # and, when val_size rounded down to 0, returned the ENTIRE list.
    val_data_dict = {
        'messages': data_dict['messages'][cut:],
        'images': data_dict['images'][cut:],
    }
    return train_data_dict, test_data_dict, val_data_dict
def save_data_dict_as_arrow(data_dict, file_path):
    """Persist a messages/images dict to *file_path* in HF Arrow format."""
    # Build the Dataset and write it to disk in a single chained call.
    Dataset.from_dict(data_dict).save_to_disk(file_path)
if __name__ == "__main__":
    # Example preprocessing that produced sampled_data250.csv (for reference):
    # df = pd.read_csv("/data/data_set_metmuseum.csv")
    # df1 = df[['objectID', 'primaryImageLink', 'Question1', 'Answer1', 'Question2', 'Answer2', 'Question3', 'Answer3', 'Question4', 'Answer4']]
    # df2 = df1.sample(frac=1)
    # df3 = df2.head(250)
    # df4 = df3.copy()
    df4 = pd.read_csv("sampled_data250.csv")

    # exist_ok=True replaces the check-then-create race of the old
    # `if not os.path.exists(...)` guard.
    for path in ('input_dataset', os.path.join('input_dataset', 'images'), 'output_dataset'):
        os.makedirs(path, exist_ok=True)

    # Download and resize the images, then attach local paths to the frame.
    image_dir = os.path.join('input_dataset', 'images')
    # image_paths = download_and_resize_images(df4, image_dir)
    # df4['imagePath'] = image_paths  # resized image path per row
    # df4 = df4.drop(['primaryImageLink'], axis=1)

    # Build the chat-format dict and split it 60/20/20.
    data_dict = modify_dataframe_and_extract_data(df4)
    train_data_dict, test_data_dict, val_data_dict = split_data_dict(
        data_dict, train_ratio=0.6, test_ratio=0.2, val_ratio=0.2)

    # Save each split as an Arrow dataset.
    save_data_dict_as_arrow(train_data_dict, os.path.join('output_dataset', 'train.arrow'))
    save_data_dict_as_arrow(test_data_dict, os.path.join('output_dataset', 'test.arrow'))
    save_data_dict_as_arrow(val_data_dict, os.path.join('output_dataset', 'val.arrow'))

    # Zip the same relative directories created above. The original archived
    # hard-coded Colab-only "/content/..." paths, which fail anywhere else.
    shutil.make_archive("input_dataset", "zip", "input_dataset")
    shutil.make_archive("output_dataset", "zip", "output_dataset")

    # Sanity check: read one split back from disk. print() replaces the bare
    # notebook-style expression, which is a no-op in a script.
    test_data = Dataset.load_from_disk(os.path.join("output_dataset", "test.arrow"))
    print(test_data)