# scripts/create_sinhala_test_set.py
import os

from datasets import load_dataset

# --- Configuration ---
DATA_DIR = "data/processed"
TEST_DIR = "data/test_sets"
DATASET_NAME = "Programmer-RD-AI/sinhala-english-singlish-translation"
NUM_TEST_LINES = 500
# ---

print("--- Creating a held-back test set for Sinhalese ---")

os.makedirs(TEST_DIR, exist_ok=True)

# Load the dataset from Hugging Face
dataset = load_dataset(DATASET_NAME, split='train')

# Split the dataset: everything except the last NUM_TEST_LINES examples stays
# in training; the final NUM_TEST_LINES examples become the held-back test set.
train_dataset = dataset.select(range(len(dataset) - NUM_TEST_LINES))
test_dataset = dataset.select(range(len(dataset) - NUM_TEST_LINES, len(dataset)))

# Write the new training files
with open(os.path.join(DATA_DIR, "sinhala.si"), "w", encoding="utf-8") as f_source, \
     open(os.path.join(DATA_DIR, "sinhala.en"), "w", encoding="utf-8") as f_target:
    for example in train_dataset:
        f_source.write(example['Sinhala'] + "\n")
        f_target.write(example['English'] + "\n")

# Write the new test files
with open(os.path.join(TEST_DIR, "test.si"), "w", encoding="utf-8") as f_source, \
     open(os.path.join(TEST_DIR, "test.en"), "w", encoding="utf-8") as f_target:
    for example in test_dataset:
        f_source.write(example['Sinhala'] + "\n")
        f_target.write(example['English'] + "\n")

print(f"Successfully created a test set with {NUM_TEST_LINES} lines for Sinhalese.")
print(f"The original training files in '{DATA_DIR}' have been updated.")