"""Streamlit app: convert a class-per-folder image dataset into a train/test layout.

The user uploads a zip archive laid out as ``root/<class>/<image>.jpg``.  The app
resizes every readable JPEG to 224x224, splits the images 80/20 with a fixed
random seed, writes them to ``output/train/<class>/`` and
``output/test/<class>/``, and offers the result as ``output_folder.zip``.
"""

import os
import shutil
import zipfile

import cv2
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.model_selection import train_test_split

# Working directories / artefacts, all relative to the current working directory.
default_folder = 'input_folder'
output_folder = 'output'
archive_base_name = 'output_folder'

IMAGE_SIZE = (224, 224)
# Matched case-insensitively so that e.g. "IMG_001.JPG" is not silently dropped.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg')


def load_images(root_dir):
    """Load and resize every JPEG found under ``root_dir/<dataset>/<class>/``.

    Args:
        root_dir: Directory the uploaded archive was extracted into.

    Returns:
        A tuple ``(images, labels, skipped)`` where ``images`` is an array of
        resized images, ``labels`` is an array holding the class-folder name
        of each image, and ``skipped`` counts JPEG-named files that OpenCV
        could not decode.
    """
    images = []
    labels = []
    skipped = 0
    # Sorted listings keep the seeded split reproducible regardless of the
    # order in which the filesystem happens to return directory entries.
    for dataset_name in sorted(os.listdir(root_dir)):
        dataset_dir = os.path.join(root_dir, dataset_name)
        # '__MACOSX' is metadata added by the macOS archiver; stray files at
        # this level (e.g. '.DS_Store') are not datasets either.
        if dataset_name == '__MACOSX' or not os.path.isdir(dataset_dir):
            continue
        for class_name in sorted(os.listdir(dataset_dir)):
            class_dir = os.path.join(dataset_dir, class_name)
            if not os.path.isdir(class_dir):
                continue
            for file_name in sorted(os.listdir(class_dir)):
                if not file_name.lower().endswith(IMAGE_EXTENSIONS):
                    continue
                img = cv2.imread(os.path.join(class_dir, file_name))
                if img is None:
                    # cv2.imread signals an unreadable file by returning None
                    # instead of raising; resizing None would crash.
                    skipped += 1
                    continue
                images.append(cv2.resize(img, IMAGE_SIZE))
                labels.append(class_name)
    return np.array(images), np.array(labels), skipped


def write_split(split_name, images, labels):
    """Write ``images`` to ``output/<split_name>/<label>/<index>.jpg``."""
    for index, (image, label) in enumerate(zip(images, labels)):
        target = os.path.join(output_folder, split_name, str(label), f'{index}.jpg')
        cv2.imwrite(target, image)


def transform_dataset(zip_file):
    """Extract ``zip_file``, build the train/test folder tree and zip it.

    Args:
        zip_file: File-like object containing the uploaded zip archive.

    Returns:
        A tuple ``(archive_path, skipped)``: the path of the created zip file
        and the number of undecodable images that were left out.

    Raises:
        zipfile.BadZipFile: If the upload is not a valid zip archive.
        ValueError: If fewer than two usable images were found, which is too
            few to form both a train and a test set.
    """
    # Start from clean directories so files from a previous upload cannot
    # leak into this run's dataset or archive.
    for stale_dir in (default_folder, output_folder):
        shutil.rmtree(stale_dir, ignore_errors=True)

    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(default_folder)

    X, y, skipped = load_images(default_folder)
    print(len(X))
    print(len(y))
    if len(X) < 2:
        raise ValueError(
            "Found fewer than two readable .jpg images. The zip file must "
            "contain root/<class>/<image>.jpg."
        )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, train_size=0.8, random_state=42
    )

    # Create every class folder in both splits, even if a split ends up with
    # no image of that class.
    for class_name in np.unique(y):
        for split_name in ('train', 'test'):
            os.makedirs(os.path.join(output_folder, split_name, class_name), exist_ok=True)

    write_split('train', X_train, y_train)
    write_split('test', X_test, y_test)

    return shutil.make_archive(archive_base_name, 'zip', output_folder), skipped


st.title("Automation for Data Preparation")
st.write("`- This app will automate data preparation to make it easier to handled by different ML libraries.`")
st.write(" ")
st.write("`- Online datasets are usually in the common format. However, different ML libraries require different formats. This app will help you to convert the common format to the converted format.`")

# draw a divider
st.write("---")

# Show the two layouts side by side.  Child items are indented two spaces per
# level so that Markdown renders them as a nested tree rather than a flat list.
col1, col2 = st.columns(2)

with col1:
    st.write("**Common Format:**")
    st.write("- root\n"
             "  - class1\n"
             "    - image1.jpg\n"
             "    - image2.jpg\n"
             "    - image3.jpg\n"
             "    - ...\n"
             "  - class2\n"
             "    - image1.jpg\n"
             "    - image2.jpg\n"
             "    - ...\n"
             "  - class3\n"
             "    - image1.jpg\n"
             "    - image2.jpg\n"
             "    - ...\n")

with col2:
    st.write("**Converted Format:**")
    st.write("- root\n"
             "  - train\n"
             "    - class1\n"
             "      - image1.jpg\n"
             "      - image2.jpg\n"
             "      - ...\n"
             "    - class2\n"
             "  - test\n"
             "    - class1\n"
             "      - image1.jpg\n"
             "      - image2.jpg\n"
             "      - ...\n"
             "    - class2\n"
             "      - image1.jpg\n")

st.write("---")

# input folder
st.write("Please upload a folder containing images in the default format as a zip file.")
input_zip_file = st.file_uploader("", type=["zip"])

# add some space
st.write(" ")
st.write(" ")

if st.button("Transform"):
    if input_zip_file is None:
        st.warning("Please upload a zip file before transforming.")
    else:
        try:
            archive_path, skipped_count = transform_dataset(input_zip_file)
        except zipfile.BadZipFile:
            st.error("The uploaded file is not a valid zip archive.")
        except ValueError as err:
            st.error(str(err))
        else:
            if skipped_count:
                st.warning(f"Skipped {skipped_count} image(s) that could not be read.")
            st.write("Transform the folder successfully.")
            # Offer the archive for download; nothing has been downloaded until
            # the user clicks the button, so no "download succeeded" message.
            with open(archive_path, 'rb') as archive_file:
                st.download_button(
                    label="Download Zip File",
                    data=archive_file.read(),
                    file_name='output_folder.zip',
                    mime='application/zip',
                )