|
import random |
|
from datasets import Dataset, DatasetDict |
|
import pandas as pd |
|
|
|
|
|
# Load the raw examples, one per line, stripping surrounding whitespace.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# default and matches the UTF-8 CSV this script writes later on.
with open('Positives.txt', 'r', encoding='utf-8') as file:
    positive_examples = [line.strip() for line in file]

with open('Negatives.txt', 'r', encoding='utf-8') as file:
    negative_examples = [line.strip() for line in file]
|
|
|
|
|
# Pair every text with its class label (positives first, then negatives),
# shuffle the combined list in place, and load it into a two-column frame.
all_examples = [
    (text, label)
    for label, texts in (
        ('POSITIVE', positive_examples),
        ('NEGATIVE', negative_examples),
    )
    for text in texts
]
random.shuffle(all_examples)

df = pd.DataFrame(data=all_examples, columns=['text', 'label'])
|
|
|
|
|
# 80% of the rows go to train, 10% to validation; whatever remains after
# those two (including rows lost to integer truncation) becomes the test set.
train_size, val_size = (int(fraction * len(df)) for fraction in (0.8, 0.1))

# Positional slices over the already-shuffled frame.
train_examples = df.iloc[:train_size]
val_examples = df.iloc[train_size:train_size + val_size]
test_examples = df.iloc[train_size + val_size:]
|
|
|
|
|
import csv |
|
|
|
# Tag every row with the split it belongs to. `assign` returns a new frame,
# so no column is written onto a slice of `df` (which is what triggers
# pandas' SettingWithCopyWarning).
train_examples = train_examples.assign(split='train')
val_examples = val_examples.assign(split='validation')
test_examples = test_examples.assign(split='test')

# Write all three splits into a single CSV: header first, then the train,
# validation and test rows in that order. newline='' lets the csv module
# control line endings itself.
with open('dataset_with_split.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['text', 'label', 'split'])
    csvwriter.writerows(train_examples.values.tolist())
    csvwriter.writerows(val_examples.values.tolist())
    csvwriter.writerows(test_examples.values.tolist())

print("Dataset with 'split' column created successfully.")
|
|
|
|
|
# Reload the combined CSV as a single Hugging Face Dataset.
dataset = Dataset.from_csv('dataset_with_split.csv')

# Partition the rows by the 'split' column. That column is populated with
# 'train', 'validation' and 'test', so each filter must compare against
# exactly those values; comparing against 'val' would leave the validation
# split empty.
datasets = DatasetDict({
    'train': dataset.filter(lambda example: example['split'] == 'train'),
    'validation': dataset.filter(lambda example: example['split'] == 'validation'),
    'test': dataset.filter(lambda example: example['split'] == 'test'),
})
|
|
|
|
|
# Descriptive metadata for the dataset. The citation and homepage values are
# still placeholder text.
dataset_info = dict(
    name="img_intents",
    description="A dataset of positive and negative examples",
    citation="Provide the citation or source of the dataset",
    homepage="Link to the dataset homepage",
)
|
|
|
|