import pandas as pd
import os
import requests
from PIL import Image
from io import BytesIO
from datasets import Dataset

def modify_dataframe_and_extract_data(df):
    data_list = []
    for _, row in df.iterrows():
        messages = []
        for i in range(1, 5):
            user_question = row[f'Question{i}']
            user_answer = row[f'Answer{i}']
            if user_question:
                message_content = [{'index': None, 'text': user_question, 'type': 'text'}]
                if i == 1:
                    message_content.append({'index': 0, 'text': None, 'type': 'image'})
                messages.append({'content': message_content, 'role': 'user'})
                if user_answer:
                    messages.append({'content': [{'index': None, 'text': user_answer, 'type': 'text'}], 'role': 'assistant'})
        image = Image.open(row['imagePath'])
        data_list.append({'messages': messages, 'images': [image]})
    return {'messages': [data['messages'] for data in data_list], 'images': [data['images'] for data in data_list]}


def download_and_resize_images(df, image_dir, target_size=(250, 250)):
    image_paths = []
    for index, row in df.iterrows():
        image_url = row['primaryImageLink']
        object_id = row['objectID']
        if image_url:
            # Extract filename from the URL
            filename = os.path.join(image_dir, f"{object_id}.jpg")
            # Download image from the URL
            response = requests.get(image_url)
            if response.status_code == 200:
                # Open the image using PIL
                image = Image.open(BytesIO(response.content))
                # Resize the image
                image = image.resize(target_size)
                # Save the resized image
                image.save(filename)
                image_paths.append(filename)
            else:
                print(f"Failed to download image from {image_url}")
                image_paths.append(None)
        else:
            image_paths.append(None)
    return image_paths

def split_data_dict(data_dict, train_ratio=0.7, test_ratio=0.2, val_ratio=0.1):
    assert train_ratio + test_ratio + val_ratio == 1.0, "Ratios must sum up to 1.0"
    
    total_samples = len(data_dict['messages'])
    train_size = int(total_samples * train_ratio)
    test_size = int(total_samples * test_ratio)
    val_size = int(total_samples * val_ratio)

    train_data_dict = {
        'messages': data_dict['messages'][:train_size],
        'images': data_dict['images'][:train_size]
    }
    test_data_dict = {
        'messages': data_dict['messages'][train_size:train_size + test_size],
        'images': data_dict['images'][train_size:train_size + test_size]
    }
    val_data_dict = {
        'messages': data_dict['messages'][-val_size:],
        'images': data_dict['images'][-val_size:]
    }

    return train_data_dict, test_data_dict, val_data_dict


def save_data_dict_as_arrow(data_dict, file_path):
    # Convert the dictionary to a Dataset object
    dataset = Dataset.from_dict(data_dict)
    
    # Save the dataset to an Arrow file
    dataset.save_to_disk(file_path)

if __name__ == "__main__":
    # Example usage:

    # df = pd.read_csv("/data/data_set_metmuseum.csv")
    # df1 = df[['objectID', 'primaryImageLink', 'Question1', 'Answer1', 'Question2', 'Answer2', 'Question3', 'Answer3', 'Question4', 'Answer4']]
    # df2 = df1.sample(frac=1)
    # df3 = df2.head(250)

    # df4 = df3.copy()

    df4 = pd.read_csv("sampled_data250.csv")
    paths = ['input_dataset', os.path.join('input_dataset', 'images'), 'output_dataset']
    for path in paths:
        if not os.path.exists(path):
            os.makedirs(path)

    # Call the function to download and resize images
    image_dir = 'input_dataset/images'
    # image_paths = download_and_resize_images(df4, image_dir)

    # Update the DataFrame with the resized image paths
    # new_df = df4.copy()  # Create a copy of the original DataFrame
    # df4['imagePath'] = image_paths  # Add a new column 'imagePath' containing the resized image paths
    # df4 = df4.drop(['primaryImageLink'], axis=1)

    # Call the function to modify the DataFrame and extract data
    data_dict = modify_dataframe_and_extract_data(df4)
    # split data_dict into train, test, valid
    train_data_dict, test_data_dict, val_data_dict = split_data_dict(data_dict, train_ratio=0.6, test_ratio=0.2, val_ratio=0.2)

    # save these as arrow dataset
    save_data_dict_as_arrow(train_data_dict, os.path.join('output_dataset', 'train.arrow'))
    save_data_dict_as_arrow(test_data_dict, os.path.join('output_dataset', 'test.arrow'))
    save_data_dict_as_arrow(val_data_dict, os.path.join('output_dataset', 'val.arrow'))

    # save to zip format
    import shutil
    shutil.make_archive("/content/input_dataset", "zip", "/content/input_dataset")
    shutil.make_archive("/content/output_dataset", "zip", "/content/output_dataset")
    
    # read arrow from disk
    test_data = Dataset.load_from_disk("output_dataset/test.arrow")
    test_data