Spaces:

samariddin
/

task1_v2

Sleeping

App Files Files Community

task1_v2 / scripts /create_dataset.py

samariddin

added

908e980 about 2 years ago

raw

history blame contribute delete

3.54 kB

	import os
	import shutil
	import zipfile
	import pandas as pd
	from tqdm import tqdm

	# Project root: the parent of the directory that contains this script.
	# abspath() is applied first because __file__ may be a relative path when the
	# script is launched directly (Python < 3.9); without it the double dirname()
	# can collapse to "" and every derived path would depend on the current
	# working directory.
	PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


	def extract_zipfile(zip_path, tmp_folder):
	    """Unpack every member of the archive at ``zip_path`` into ``tmp_folder``.

	    Args:
	        zip_path: Path of the zip archive to read.
	        tmp_folder: Directory that receives the extracted members.
	    """
	    archive = zipfile.ZipFile(zip_path, "r")
	    try:
	        archive.extractall(tmp_folder)
	    finally:
	        # Release the file handle even when extraction fails.
	        archive.close()


	def copy_files(folder_path, new_folder_path, file_extension, name_prefix, counter):
	    """Copy matching files into ``new_folder_path`` under sequential names.

	    Every regular file directly inside ``folder_path`` whose name ends with
	    ``file_extension`` is copied as ``{name_prefix}_{counter:06d}{ext}``, where
	    ``ext`` is the extension of the source file and ``counter`` increases by one
	    per copied file.

	    Args:
	        folder_path: Directory to scan (not recursive).
	        new_folder_path: Destination directory; it is not created here.
	        file_extension: Suffix a filename must end with (case-sensitive).
	        name_prefix: Prefix of the generated filenames.
	        counter: Number assigned to the first copied file.

	    Returns:
	        A ``(counter, num_files)`` tuple: the next unused counter value and the
	        number of files that were copied.
	    """
	    # Build the work list once so the progress-bar total, the copy loop and the
	    # returned count all agree. Sorting makes the numbering reproducible
	    # (os.listdir returns entries in arbitrary order), and the isfile() check
	    # skips directories whose name happens to end with the extension.
	    matching = sorted(
	        name
	        for name in os.listdir(folder_path)
	        if name.endswith(file_extension)
	        and os.path.isfile(os.path.join(folder_path, name))
	    )
	    num_files = len(matching)

	    with tqdm(total=num_files) as pbar:
	        for filename in matching:
	            # Generate a new filename for the file
	            ext = os.path.splitext(filename)[1]
	            new_filename = f"{name_prefix}_{counter:06d}{ext}"
	            counter += 1

	            # Copy the file to the new folder with the new name
	            src_path = os.path.join(folder_path, filename)
	            dst_path = os.path.join(new_folder_path, new_filename)
	            shutil.copy(src_path, dst_path)

	            # Advance only for files that were actually copied
	            pbar.update(1)
	    return counter, num_files


	def process_zipfiles(zip_path, images_path=None, labels_path=None):
	    """Unpack every ``.zip`` archive in ``zip_path`` and collect images and labels.

	    For each archive, the ``.PNG`` and ``.txt`` files found in its
	    ``obj_train_data`` folder are copied by ``copy_files`` into ``images_path``
	    and ``labels_path`` under ``frame_NNNNNN`` names whose numbering continues
	    across archives. An archive that fails is reported on stdout and skipped.

	    Args:
	        zip_path: Directory scanned (not recursively) for ``.zip`` files.
	        images_path: Destination directory for images; created if missing.
	            Required despite the ``None`` default (``os.makedirs`` raises
	            ``TypeError`` for ``None``).
	        labels_path: Destination directory for label files; same rules as
	            ``images_path``.

	    Returns:
	        A list with one dict per successfully processed archive, holding the
	        keys ``"Zip File"``, ``"Num Images"`` and ``"Num Labels"``.
	    """
	    # Local import: this is the only place in the script that needs tempfile.
	    import tempfile

	    # Create the output folders if they don't exist
	    os.makedirs(images_path, exist_ok=True)
	    os.makedirs(labels_path, exist_ok=True)

	    # Initialize counters for generating new filenames
	    img_counter = 0
	    label_counter = 0

	    # Statistical report rows, one per successfully processed archive
	    report_data = []

	    # Sorted so that the frame numbering is reproducible between runs
	    # (os.listdir returns entries in arbitrary order).
	    for zip_file in sorted(os.listdir(zip_path)):
	        if not zip_file.endswith(".zip"):
	            continue
	        print(f"Processing {zip_file}")

	        # A fresh scratch folder per archive, removed on exit even when the
	        # archive fails, so leftovers can never leak into the next archive.
	        # It is created in the working directory, where the extraction
	        # folder has always lived.
	        with tempfile.TemporaryDirectory(prefix="tmp_extracted_", dir=os.getcwd()) as tmp_folder:
	            try:
	                extract_zipfile(os.path.join(zip_path, zip_file), tmp_folder)
	                source_dir = os.path.join(tmp_folder, "obj_train_data")

	                # Copy image files to the images folder
	                img_counter, num_images = copy_files(source_dir, images_path, ".PNG", "frame", img_counter)

	                # Copy label files to the labels folder
	                label_counter, num_labels = copy_files(source_dir, labels_path, ".txt", "frame", label_counter)
	            except Exception as exc:
	                # Best effort: report the failing archive with its cause and
	                # continue. KeyboardInterrupt/SystemExit are not swallowed.
	                print(f"Error: {zip_file}: {exc!r}")
	            else:
	                if num_images != num_labels:
	                    print(f"Warning: {zip_file} has {num_images} images but {num_labels} labels")

	                # Add the statistical report data for the current zip file
	                report_data.append({"Zip File": zip_file,
	                                    "Num Images": num_images,
	                                    "Num Labels": num_labels})
	            finally:
	                # Images and labels are matched by frame number, so both
	                # counters must start the next archive at the same value even
	                # after a partial failure or unequal file counts.
	                img_counter = label_counter = max(img_counter, label_counter)

	    return report_data


	def save_report(report_data, report_file):
	    """Write the per-archive statistics to ``report_file`` as a CSV file.

	    Args:
	        report_data: List of row dicts (one per archive) passed to
	            ``pandas.DataFrame``.
	        report_file: Destination of the CSV output. When it is a filesystem
	            path, missing parent directories are created first.
	    """
	    # to_csv does not create directories, so make sure the target folder exists.
	    # Only paths are inspected; anything else to_csv accepts is passed through.
	    if isinstance(report_file, (str, os.PathLike)):
	        parent_dir = os.path.dirname(os.fspath(report_file))
	        if parent_dir:
	            os.makedirs(parent_dir, exist_ok=True)

	    # Create a pandas DataFrame from the report data list
	    report_df = pd.DataFrame(report_data)

	    # Save the report DataFrame to a CSV file, without the row index column
	    report_df.to_csv(report_file, index=False)
	    print("Report file has been saved")


	if __name__ == "__main__":
	    # Every input and output location is derived from the project root.
	    data_dir = os.path.join(PROJECT_DIR, 'data')
	    yolo_dir = os.path.join(data_dir, 'yolo_format')

	    # Source archives and the CSV that summarizes them
	    zip_path = os.path.join(data_dir, 'cvat_targetlangan')
	    report_file = os.path.join(PROJECT_DIR, 'report', 'file.csv')

	    # Output folders for the renamed images and labels
	    images_path = os.path.join(yolo_dir, 'images')
	    labels_path = os.path.join(yolo_dir, 'labels')

	    # Build the dataset, then persist the per-archive statistics.
	    save_report(process_zipfiles(zip_path, images_path, labels_path), report_file)