| import os |
| import warnings |
| from io import BytesIO |
|
|
| import numpy as np |
| import pandas as pd |
| import requests |
| from PIL import Image |
| from sklearn.model_selection import train_test_split |
|
|
| |
| warnings.filterwarnings("ignore") |
|
|
|
|
def process_embeddings(df, col_name):
    """
    Expand a column of serialized embeddings into one column per dimension.

    Args:
    - df (pd.DataFrame): The DataFrame containing the embeddings column.
    - col_name (str): The name of the column containing the embeddings.
      Each cell is either the string form of a Python list
      (e.g. "[0.12, -0.5]") or an already materialised sequence.

    Returns:
    pd.DataFrame: A new DataFrame in which `col_name` has been replaced by
    the columns text_1 ... text_N (N = longest embedding). The input
    DataFrame is left untouched and the original index is preserved.

    Raises:
    - ValueError / SyntaxError: A string cell is not a valid Python literal.
    - TypeError: A cell is not a string and not iterable (e.g. NaN).

    Steps:
    1. Parse every cell of the specified column into a list.
    2. Build one new column per list element.
    3. Return the frame without the original embeddings column.

    Example:
    df_processed = process_embeddings(df, 'embeddings')
    """
    # Function-scope import so the block does not depend on a file-level change.
    import ast

    def _to_list(cell):
        # SECURITY: cells used to be passed to eval(), which executes any
        # expression found in the data. literal_eval only accepts literals.
        if isinstance(cell, str):
            cell = ast.literal_eval(cell)
        return list(cell)

    vectors = [_to_list(cell) for cell in df[col_name]]
    # default=0 keeps an empty frame from failing on max() of nothing.
    width = max((len(vector) for vector in vectors), default=0)

    # Example of the expanded frame:
    #    text_1  text_2  text_3
    # 0  -0.123   0.456   0.789
    # 1   0.321  -0.654   0.987
    embeddings_df = pd.DataFrame(
        vectors,
        columns=[f"text_{i + 1}" for i in range(width)],
        # Reuse the caller's index: concat(axis=1) aligns on index labels, so
        # a fresh 0..n-1 index would misalign rows of a filtered/sorted frame.
        index=df.index,
    )

    return pd.concat([df.drop(columns=[col_name]), embeddings_df], axis=1)
|
|
|
|
def rename_image_embeddings(df):
    """
    Prefix purely numeric column labels with "image_".

    A label such as "12" (or the integer 12) becomes "image_12"; every other
    label is kept as it is. The columns are renamed in place on `df`.

    Args:
    - df (pd.DataFrame): The DataFrame containing columns to be renamed.

    Returns:
    pd.DataFrame: The same DataFrame object, with renamed columns.

    Example:
    df_renamed = rename_image_embeddings(df)
    """

    def _renamed(label):
        if isinstance(label, str):
            # isdecimal() only accepts characters int() can parse; isdigit()
            # also accepts e.g. superscript digits, which make int() raise.
            return f"image_{int(label)}" if label.isdecimal() else label
        # Frames built in memory may carry integer labels, which have no
        # string methods. bool is excluded because it subclasses int.
        is_integer = isinstance(label, (int, np.integer)) and not isinstance(
            label, bool
        )
        if is_integer and label >= 0:
            return f"image_{int(label)}"
        return label

    df.columns = [_renamed(label) for label in df.columns]
    return df
|
|
|
|
def preprocess_data(
    text_data,
    image_data,
    text_id="image_id",
    image_id="ImageName",
    embeddings_col="embeddings",
):
    """
    Preprocess and merge text and image dataframes.

    Args:
    - text_data (pd.DataFrame): DataFrame containing text data.
    - image_data (pd.DataFrame): DataFrame containing image data.
    - text_id (str): Column name for text data identifier.
    - image_id (str): Column name for image data identifier.
    - embeddings_col (str): Column name for embeddings data.

    Returns:
    pd.DataFrame: Merged and preprocessed DataFrame. The two identifier
    columns are not part of the result.

    This function:
    Expands the text embeddings column and renames the image embedding columns.
    Drops rows whose identifier is missing.
    Reduces each text_id value to the part after its last "/".
    Merges the dataframes (inner join) on text_id == image_id.
    Drops the identifier columns.

    Example:
    merged_df = preprocess_data(text_df, image_df)
    """
    # The helpers may modify the frame they receive, so hand them copies; the
    # caller's frames stay reusable and this function can be re-run on them.
    # A shallow copy is enough for the image frame because only its column
    # labels are reassigned there.
    text_data = process_embeddings(text_data.copy(), embeddings_col)
    image_data = rename_image_embeddings(image_data.copy(deep=False))

    # Rows without an identifier can never be matched.
    image_data = image_data.dropna(subset=[image_id])
    text_data = text_data.dropna(subset=[text_id])

    # Keep only the last path component so the value is comparable with the
    # image identifier.
    text_data[text_id] = text_data[text_id].apply(lambda x: x.split("/")[-1])

    df = pd.merge(text_data, image_data, left_on=text_id, right_on=image_id)

    # The identifiers were only needed for the join.
    df.drop([image_id, text_id], axis=1, inplace=True)

    return df
|
|
|
|
class ImageDownloader:
    """
    Image downloader class to download images from URLs.

    Every image is resized to `image_size` and stored as
    "<image_dir>/<sku>.jpg".

    Args:
    - image_dir (str): Directory to save images. Created if it is missing.
    - image_size (tuple): Size (width, height) of the images to be saved.
    - overwrite (bool): Whether to download again when the file already exists.
    - timeout (float | tuple | None): Timeout in seconds passed to
      `requests.get`, so a stalled server cannot block the run forever.

    Methods:
    - download_images(df, print_every=1000): Download images from URLs in a DataFrame.
        Args:
        - df (pd.DataFrame): DataFrame with a "sku" and an "image" (URL) column.
        - print_every (int): Print progress every n images; 0 or None disables it.
        Returns:
        pd.DataFrame: The same DataFrame with an "image_path" column added
        (NaN where the download failed).

    Example:
    downloader = ImageDownloader()
    df = downloader.download_images(df)
    """

    def __init__(
        self,
        image_dir="data/images/",
        image_size=(224, 224),
        overwrite=False,
        timeout=30,
    ):
        self.image_dir = image_dir
        self.image_size = image_size
        self.overwrite = overwrite
        self.timeout = timeout

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.image_dir, exist_ok=True)

    def _fetch_and_save(self, image_url, image_path):
        """Download one image, resize it and write it to `image_path`."""
        response = requests.get(image_url, timeout=self.timeout)
        response.raise_for_status()
        with Image.open(BytesIO(response.content)) as img:
            # JPEG cannot store alpha or palette images (RGBA, LA, P, ...);
            # without this conversion save() raises for such sources.
            if img.mode not in ("RGB", "L", "CMYK"):
                img = img.convert("RGB")
            img = img.resize(self.image_size, Image.Resampling.LANCZOS)
            img.save(image_path)

    def download_images(self, df, print_every=1000):
        """
        Download the image of every row and record where it was stored.

        Args:
        - df (pd.DataFrame): DataFrame with a "sku" and an "image" (URL) column.
        - print_every (int): Print progress every n images; 0 or None disables it.

        Returns:
        pd.DataFrame: `df` itself, with an "image_path" column holding the
        local path of each image, or NaN when it could not be downloaded.
        """
        image_paths = []
        total = len(df)

        for position, (_, row) in enumerate(df.iterrows()):
            # The truthiness guard avoids a modulo by zero for print_every=0.
            if print_every and position % print_every == 0:
                print(f"Downloading image {position}/{total}")

            sku = row["sku"]
            image_url = row["image"]
            image_path = os.path.join(self.image_dir, f"{sku}.jpg")

            if os.path.exists(image_path) and not self.overwrite:
                print(f"Image {sku} is already in the path.")
                image_paths.append(image_path)
                continue

            try:
                self._fetch_and_save(image_url, image_path)
            except Exception as e:
                # Deliberately broad: one bad URL or corrupt file must not
                # abort the whole batch. The row is marked missing instead.
                print(f"Could not download image for SKU: {sku}. Error: {e}")
                image_paths.append(np.nan)
            else:
                image_paths.append(image_path)

        df["image_path"] = image_paths
        return df
|
|
|
|
def train_test_split_and_feature_extraction(df, test_size=0.3, random_state=42):
    """
    Split a DataFrame into train/test parts and list its feature and label columns.

    Args:
    - df (pd.DataFrame): DataFrame containing the data.

    Keyword Args:
    - test_size (float): Size of the test set, forwarded to train_test_split.
    - random_state (int): Random state, forwarded to train_test_split.

    Returns:
    pd.DataFrame: Train DataFrame.
    pd.DataFrame: Test DataFrame.
    list: Column names of `df` that start with "text_".
    list: Column names of `df` that start with "image_".
    list: Label column names; always ["class_id"].

    Example:
    train_df, test_df, text_columns, image_columns, label_columns = train_test_split_and_feature_extraction(df)
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    train_df, test_df = train_test_split(df, **split_options)

    # Embedding columns are recognised purely by their name prefix.
    # NOTE(review): any other column starting with "image_" is picked up as
    # well - confirm that none reaches this function.
    text_columns = []
    image_columns = []
    for name in df.columns:
        if name.startswith("text_"):
            text_columns.append(name)
        if name.startswith("image_"):
            image_columns.append(name)

    label_columns = ["class_id"]

    return train_df, test_df, text_columns, image_columns, label_columns
|
|