# DL4NLP/dataset_preparation.py
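"""Prepare a multimodal (image + Q&A) chat dataset from Met Museum metadata.

The script downloads and resizes the artwork images referenced in the CSV,
turns each row's question/answer columns into chat-style message lists,
splits the samples into train/test/validation subsets, and saves each split
to disk with the Hugging Face `datasets` library.
"""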
import os
import shutil
from io import BytesIO

import pandas as pd
import requests
from PIL import Image
from datasets import Dataset
def modify_dataframe_and_extract_data(df):
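    """Convert DataFrame rows into chat-formatted samples.

    Up to four question/answer pairs per row become alternating
    user/assistant messages; the row's image (loaded from ``imagePath``)
    is attached to the first user turn.
    """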
    data_list = []
    for _, row in df.iterrows():
        messages = []
        for i in range(1, 5):
            user_question = row[f'Question{i}']
            user_answer = row[f'Answer{i}']
            # Skip empty cells; pd.notna() also guards against NaN, which is
            # truthy and would otherwise slip through a bare `if` check.
            if pd.notna(user_question) and user_question:
                message_content = [{'index': None, 'text': user_question, 'type': 'text'}]
                if i == 1:
                    # Attach the image placeholder to the first user turn only.
                    message_content.append({'index': 0, 'text': None, 'type': 'image'})
                messages.append({'content': message_content, 'role': 'user'})
            if pd.notna(user_answer) and user_answer:
                messages.append({'content': [{'index': None, 'text': user_answer, 'type': 'text'}],
                                 'role': 'assistant'})
        image = Image.open(row['imagePath'])
        data_list.append({'messages': messages, 'images': [image]})
    return {'messages': [data['messages'] for data in data_list],
            'images': [data['images'] for data in data_list]}
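# One sample produced above looks like this (illustrative values):
#   {'messages': [
#        {'content': [{'index': None, 'text': 'What is depicted?', 'type': 'text'},
#                     {'index': 0, 'text': None, 'type': 'image'}],
#         'role': 'user'},
#        {'content': [{'index': None, 'text': 'A marble bust.', 'type': 'text'}],
#         'role': 'assistant'},
#        ...],
#    'images': [<PIL.Image.Image>]}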
def download_and_resize_images(df, image_dir, target_size=(250, 250)):
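    """Download each row's ``primaryImageLink``, resize it, and save it locally.

    Returns a list of saved file paths aligned with the DataFrame rows; rows
    with a missing link or a failed download get ``None``.
    """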
    image_paths = []
    for _, row in df.iterrows():
        image_url = row['primaryImageLink']
        object_id = row['objectID']
        # pd.notna() guards against NaN links, which are truthy.
        if pd.notna(image_url) and image_url:
            # Name the file after the object ID
            filename = os.path.join(image_dir, f"{object_id}.jpg")
            # Download the image from the URL
            response = requests.get(image_url, timeout=30)
            if response.status_code == 200:
                # Open the image using PIL
                image = Image.open(BytesIO(response.content))
                # Resize the image and force RGB so it can be saved as
                # JPEG (source images may be RGBA or palette mode)
                image = image.resize(target_size)
                image = image.convert('RGB')
                # Save the resized image
                image.save(filename)
                image_paths.append(filename)
            else:
                print(f"Failed to download image from {image_url}")
                image_paths.append(None)
        else:
            image_paths.append(None)
    return image_paths
def split_data_dict(data_dict, train_ratio=0.7, test_ratio=0.2, val_ratio=0.1):
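    """Split the samples into train/test/validation dicts by the given ratios.

    With 250 samples and ratios 0.6/0.2/0.2, for example, the splits hold
    150/50/50 samples.
    """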
    # Compare with a tolerance: a strict `== 1.0` fails for sums such as
    # 0.6 + 0.2 + 0.2 because of floating-point rounding.
    assert abs(train_ratio + test_ratio + val_ratio - 1.0) < 1e-9, "Ratios must sum up to 1.0"
    total_samples = len(data_dict['messages'])
    train_size = int(total_samples * train_ratio)
    test_size = int(total_samples * test_ratio)
    train_data_dict = {
        'messages': data_dict['messages'][:train_size],
        'images': data_dict['images'][:train_size]
    }
    test_data_dict = {
        'messages': data_dict['messages'][train_size:train_size + test_size],
        'images': data_dict['images'][train_size:train_size + test_size]
    }
    # Validation takes the remainder, so the three slices never overlap
    # even when the ratios truncate unevenly.
    val_data_dict = {
        'messages': data_dict['messages'][train_size + test_size:],
        'images': data_dict['images'][train_size + test_size:]
    }
    return train_data_dict, test_data_dict, val_data_dict
def save_data_dict_as_arrow(data_dict, file_path):
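    """Convert the dict to a `datasets.Dataset` and save it to disk."""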
    # Convert the dictionary to a Dataset object
    dataset = Dataset.from_dict(data_dict)
    # Save the dataset to disk in Arrow format
    dataset.save_to_disk(file_path)
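# Note: despite the ".arrow" suffix used below, `save_to_disk()` writes a
# directory (Arrow data plus dataset metadata); `load_from_disk()` expects
# that same directory path.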
if __name__ == "__main__":
    # Example usage: build the sampled CSV from the full metadata dump.
    # df = pd.read_csv("/data/data_set_metmuseum.csv")
    # df1 = df[['objectID', 'primaryImageLink', 'Question1', 'Answer1', 'Question2', 'Answer2', 'Question3', 'Answer3', 'Question4', 'Answer4']]
    # df2 = df1.sample(frac=1)
    # df3 = df2.head(250)
    # df4 = df3.copy()
    df4 = pd.read_csv("sampled_data250.csv")
    paths = ['input_dataset', os.path.join('input_dataset', 'images'), 'output_dataset']
    for path in paths:
        os.makedirs(path, exist_ok=True)
    # Download and resize the images (uncomment on first run).
    image_dir = 'input_dataset/images'
    # image_paths = download_and_resize_images(df4, image_dir)
    # Update the DataFrame with the resized image paths:
    # df4['imagePath'] = image_paths
    # df4 = df4.drop(['primaryImageLink'], axis=1)
    # Build the chat-formatted samples (assumes the CSV already contains an
    # 'imagePath' column; otherwise run the download step above first).
    data_dict = modify_dataframe_and_extract_data(df4)
    # Split data_dict into train, test, and validation sets.
    train_data_dict, test_data_dict, val_data_dict = split_data_dict(
        data_dict, train_ratio=0.6, test_ratio=0.2, val_ratio=0.2)
    # Save each split as an Arrow dataset.
    save_data_dict_as_arrow(train_data_dict, os.path.join('output_dataset', 'train.arrow'))
    save_data_dict_as_arrow(test_data_dict, os.path.join('output_dataset', 'test.arrow'))
    save_data_dict_as_arrow(val_data_dict, os.path.join('output_dataset', 'val.arrow'))
    # Zip the input and output directories for easy download.
    shutil.make_archive("input_dataset", "zip", "input_dataset")
    shutil.make_archive("output_dataset", "zip", "output_dataset")
    # Read one split back from disk as a sanity check.
    test_data = Dataset.load_from_disk("output_dataset/test.arrow")
    print(test_data)
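    # Illustrative follow-up (field names follow the schema built above;
    # whether images decode back to PIL objects depends on the `datasets`
    # version's feature inference):
    # print(test_data[0]['messages'])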