"""Export the prompts of the text-to-image-2M webdataset shards to a JSONL file.

Run as a script, this opens the configured shard(s) as a streaming dataset,
writes one JSON-encoded prompt per line to a local ``.jsonl`` file and then
prints the file back. The reader helpers only need the standard library.
"""

import json

# URL template of the webdataset tar shards; ``{i:06d}`` is the shard index.
base_url = "https://huggingface.co/datasets/jackyhate/text-to-image-2M/resolve/main/data_512_2M/data_{i:06d}.tar"
num_shards = 46  # Number of webdataset tar files


def download_data(base_url, num_shards):
    """Open the first ``num_shards`` shards as a streaming ``train`` split.

    Args:
        base_url: URL template containing an ``{i}`` placeholder that is
            filled with the shard index.
        num_shards: Number of shards to include, starting at index 0.

    Returns:
        The object returned by ``datasets.load_dataset`` with
        ``streaming=True``.
    """
    # Imported here rather than at module level so that the stdlib-only
    # helpers below stay importable when ``datasets`` is not installed.
    from datasets import load_dataset

    urls = [base_url.format(i=i) for i in range(num_shards)]
    return load_dataset(
        "webdataset",
        data_files={"train": urls},
        split="train",
        streaming=True,
    )


def extract_prompts(dataset, json_file_path):
    """Write the prompt of every dataset row to a JSONL file.

    Each output line is the JSON encoding of ``row['json']['prompt']``.

    Args:
        dataset: Iterable of rows supporting ``row['json']['prompt']``.
        json_file_path: Destination path; an existing file is overwritten.
    """
    # Use the function's own parameter (the previous body read a differently
    # named global) and write each prompt as it arrives instead of
    # accumulating every prompt in memory.
    with open(json_file_path, 'w', encoding='utf-8') as f:
        for row in dataset:
            f.write(json.dumps(row['json']['prompt']) + '\n')


def read_data(jsonl_file_path):
    """Print every JSON value stored in a JSONL file, one per line.

    Args:
        jsonl_file_path: Path of the JSONL file to read.
    """
    with open(jsonl_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # A blank line is not valid JSON; skip it instead of crashing.
                continue
            print(json.loads(line))


def load_prompts_from_jsonl(file_path):
    """Load every JSON value of a JSONL file into a list.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list with the decoded value of each non-blank line, in file order.
        For files written by ``extract_prompts`` each value is a prompt.
    """
    print('Loading prompts from:', file_path)
    prompts = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines such as a trailing empty line.
                continue
            # The decoded value is stored unchanged; no field is extracted.
            prompts.append(json.loads(line))
    print("Data loaded successfully.")
    return prompts


if __name__ == "__main__":
    jsonl_file_path = r"C:\Users\jov2bg\Desktop\PromptSearch\search_engine\models\prompts_data.jsonl"
    num_shards = 1  # Only process the first shard when run as a script.
    # Argument order must match download_data(base_url, num_shards).
    dataset = download_data(base_url, num_shards)
    extract_prompts(dataset, jsonl_file_path)
    read_data(jsonl_file_path)