---
library_name: peft
base_model: unsloth/gemma-7b-bnb-4bit
---

Test parameters, produced using `split="test"` of the dataset.

Code used to create the dataset:

```python
import random

# NOTE(review): this template has two placeholders, but every call below
# passes three arguments (source text, target language, target text).
# str.format ignores surplus positional arguments, so as written the target
# text never reaches the output. Markup around the placeholders may have been
# lost when this card was rendered -- confirm against the training notebook
# before reusing this snippet.
alpaca_prompt = """{} {}"""

# `tokenizer` is not defined in this snippet; it must already exist in the
# session (presumably the tokenizer loaded together with the base model).
BOS_TOKEN = tokenizer.bos_token  # Not used by the formatting code below
EOS_TOKEN = "" + tokenizer.eos_token  # Must add EOS_TOKEN to every example


def _format_example(text_en, text_th, translate_to):
    """Render one example that translates into ``translate_to`` ('en' or 'th')."""
    if translate_to == 'th':
        source, target = text_en, text_th
    else:
        source, target = text_th, text_en
    return alpaca_prompt.format(source, translate_to, target) + EOS_TOKEN


def formatting_prompts_func(examples):
    """Merge consecutive sentence pairs into randomly sized training examples.

    Consecutive pairs of a batch are concatenated into groups. The first
    group of every call holds exactly one pair and translates to Thai; every
    later group holds 1-5 pairs and translates in a randomly chosen
    direction. Pairs inside a group are joined by 1-5 newlines, with the same
    separator used on the English and the Thai side so both stay aligned.

    Args:
        examples: Batch mapping whose "translation" entry is a list of dicts
            with the keys 'en' and 'th'.

    Returns:
        A dict ``{"text": [...]}`` with one formatted string per group. An
        empty batch yields an empty list.
    """
    texts = []
    text_en = ""
    text_th = ""
    translate_to = 'th'
    max_group_count = 1
    group_count = 0

    for translation in examples["translation"]:
        if group_count >= max_group_count:
            # Current group is full: emit it and draw the next group's
            # size and translation direction.
            texts.append(_format_example(text_en, text_th, translate_to))
            text_en = ""
            text_th = ""
            max_group_count = random.randint(1, 5)
            group_count = 0
            translate_to = random.choice(['en', 'th'])

        # Drawn for every pair (even the first of a group, where it is not
        # used) so the sequence of random calls stays the same as before.
        num_newlines = random.randint(1, 5)
        newlines = '\n' * num_newlines

        # Decide "first pair of the group" by count rather than by an empty
        # English string, so an empty 'en' entry cannot make the next pair
        # overwrite text that was already collected.
        if group_count == 0:
            text_en = translation['en']
            text_th = translation['th']
        else:
            text_en = text_en + newlines + translation['en']
            text_th = text_th + newlines + translation['th']
        group_count += 1

    # Emit the last, possibly partial, group -- but nothing for an empty batch.
    if group_count > 0:
        texts.append(_format_example(text_en, text_th, translate_to))

    return {"text": texts}


from datasets import load_dataset

dataset = load_dataset("scb_mt_enth_2020", 'enth', split="test")
# Grouping state is local to formatting_prompts_func, so groups never span
# two batches passed to it by map().
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=["translation", 'subdataset'],
)
dataset = dataset.train_test_split(test_size=0.1, shuffle=True)
dataset['train'][0:5]
```