Experiment with distilabel:
dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs", split="train", token=hf_token)
dataset = dataset.filter(lambda r: r["status"]!="tie" and r["chosen_score"]>5)
def chatml_format(example):
# Format system
if len(example['system']) > 0:
message = {"role": "system", "content": example['system']}
system = tokenizer.apply_chat_template([message], tokenize=False)
else:
system = ""
# Format instruction
message = {"role": "user", "content": example['input']}
prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)
# Format chosen answer
chosen = example['chosen'] + "<|im_end|>\n"
# Format rejected answer
rejected = example['rejected'] + "<|im_end|>\n"
return {
"prompt": system + prompt,
"chosen": chosen,
"rejected": rejected,
}
# Load dataset
#dataset = load_dataset("Intel/orca_dpo_pairs")['train']
# Save columns
original_columns = dataset.column_names
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
# Format dataset
dataset = dataset.map(
chatml_format,
remove_columns=original_columns
)
# Print sample
dataset[1]
- Downloads last month
- 896
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.