# Build a labelled text dataset from 'Positives.txt' and 'Negatives.txt'.
#
# Steps:
#   1. Read one example per line from each input file.
#   2. Label, combine and shuffle the examples.
#   3. Split them 80% / 10% / 10% into train / validation / test.
#   4. Write every row, tagged with its split, to 'dataset_with_split.csv'.
#   5. Reload the CSV as a Hugging Face DatasetDict keyed by split name.

import csv
import random

import pandas as pd
from datasets import Dataset, DatasetDict

# Split labels are written to the CSV and later used to filter it, so both
# sides must use exactly the same strings.
TRAIN_SPLIT = 'train'
VALIDATION_SPLIT = 'validation'
TEST_SPLIT = 'test'


def _read_examples(path):
    """Return the stripped, non-blank lines of the UTF-8 text file at *path*."""
    with open(path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines would otherwise become empty-text examples that are
        # read back from the CSV as missing values.
        return [text for text in stripped_lines if text]


# Load positive examples from 'Positives.txt'
positive_examples = _read_examples('Positives.txt')

# Load negative examples from 'Negatives.txt'
negative_examples = _read_examples('Negatives.txt')

# Shuffle and combine positive and negative examples.
# NOTE: the shuffle is unseeded, so the split differs from run to run; call
# random.seed(...) beforehand if a reproducible split is needed.
all_examples = (
    [(example, 'POSITIVE') for example in positive_examples]
    + [(example, 'NEGATIVE') for example in negative_examples]
)
random.shuffle(all_examples)

# Convert to pandas DataFrame
df = pd.DataFrame(all_examples, columns=['text', 'label'])

# Split the dataset (80% train, 10% validation, remainder test)
train_size = int(0.8 * len(df))
val_size = int(0.1 * len(df))

# .assign() returns new frames, so adding the 'split' column never writes
# into a slice of df (which would trigger SettingWithCopyWarning).
train_examples = df.iloc[:train_size].assign(split=TRAIN_SPLIT)
val_examples = df.iloc[train_size:train_size + val_size].assign(
    split=VALIDATION_SPLIT
)
test_examples = df.iloc[train_size + val_size:].assign(split=TEST_SPLIT)

# Save the dataset to CSV format with 'split' column
with open('dataset_with_split.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['text', 'label', 'split'])  # Write header
    for split_frame in (train_examples, val_examples, test_examples):
        csvwriter.writerows(split_frame.values.tolist())

print("Dataset with 'split' column created successfully.")

# Load the dataset from the CSV file
dataset = Dataset.from_csv('dataset_with_split.csv')

# Create a DatasetDict object containing train, validation, and test datasets.
# Each filter compares against the same label that was written to the CSV;
# the validation filter previously looked for 'val' and so was always empty.
datasets = DatasetDict({
    'train': dataset.filter(lambda example: example['split'] == TRAIN_SPLIT),
    'validation': dataset.filter(
        lambda example: example['split'] == VALIDATION_SPLIT
    ),
    'test': dataset.filter(lambda example: example['split'] == TEST_SPLIT),
})

# Optional: define dataset metadata
dataset_info = {
    "name": "img_intents",
    "description": "A dataset of positive and negative examples",
    "citation": "Provide the citation or source of the dataset",
    "homepage": "Link to the dataset homepage",
}