import os
import random
import re
import uuid

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm


def prepare_default_dataset_causal_language_modeling(path):
    """Prepare the OpenAssistant/oasst2 dataset and write parquet splits to `path`."""
    ds = load_dataset("OpenAssistant/oasst2")
    train = ds["train"].to_pandas()
    val = ds["validation"].to_pandas()
    df = pd.concat([train, val], axis=0).reset_index(drop=True)

    df_assistant = df[(df.role == "assistant")].copy()
    df_prompter = df[(df.role == "prompter")].copy()
    df_prompter = df_prompter.set_index("message_id")
    df_assistant["output"] = df_assistant["text"].values

    # Look up each assistant message's parent prompter message to recover the
    # instruction text and the conversation link (parent_id).
    inputs = []
    parent_ids = []
    for _, row in df_assistant.iterrows():
        prompt = df_prompter.loc[row.parent_id]
        inputs.append(prompt.text)
        parent_ids.append(prompt.parent_id)

    df_assistant["instruction"] = inputs
    df_assistant["parent_id"] = parent_ids

    df_assistant = df_assistant[
        ["instruction", "output", "message_id", "parent_id", "lang", "rank"]
    ].rename(columns={"message_id": "id"})

    # Four variants: filtered to top-ranked answers (rank == 0.0) and/or
    # English-only, plus the unfiltered multilingual, all-rank dump.
    df_assistant[(df_assistant["rank"] == 0.0) & (df_assistant["lang"] == "en")][
        ["instruction", "output", "id", "parent_id"]
    ].to_parquet(os.path.join(path, "train_full.pq"), index=False)

    df_assistant[df_assistant["lang"] == "en"][
        ["instruction", "output", "id", "parent_id"]
    ].to_parquet(os.path.join(path, "train_full_allrank.pq"), index=False)

    df_assistant[df_assistant["rank"] == 0.0][
        ["instruction", "output", "id", "parent_id"]
    ].to_parquet(os.path.join(path, "train_full_multilang.pq"), index=False)

    df_assistant[["instruction", "output", "id", "parent_id"]].to_parquet(
        os.path.join(path, "train_full_multilang_allrank.pq"), index=False
    )

    return df_assistant[(df_assistant["rank"] == 0.0) & (df_assistant["lang"] == "en")]
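

# A minimal usage sketch (illustrative, not part of the original module):
# `_demo_prepare_oasst2` is a hypothetical helper showing how the function
# above is typically called; the scratch directory is an assumption.
def _demo_prepare_oasst2():
    import tempfile

    out_dir = tempfile.mkdtemp()
    # Writes train_full.pq, train_full_allrank.pq, train_full_multilang.pq
    # and train_full_multilang_allrank.pq into out_dir, then returns the
    # English, top-ranked subset.
    df_en_top = prepare_default_dataset_causal_language_modeling(out_dir)
    print(df_en_top[["instruction", "output"]].head())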


def prepare_default_dataset_dpo_modeling() -> pd.DataFrame:
    """Load the Intel/orca_dpo_pairs train split as a DataFrame."""
    df = load_dataset("Intel/orca_dpo_pairs")["train"].to_pandas()
    return df
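

# Illustrative only (not part of the original module): at the time of writing,
# Intel/orca_dpo_pairs comes back with "system", "question", "chosen" and
# "rejected" columns, one preference pair per row; printing the schema is a
# quick sanity check.
def _demo_prepare_orca_dpo():
    df = prepare_default_dataset_dpo_modeling()
    print(df.columns.tolist())
    print(df.iloc[0]["chosen"][:100])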


def extract_anthropic_prompt(prompt_and_response):
    """Extract the Anthropic-style prompt from a prompt and response pair."""
    search_term = "\n\nAssistant:"
    search_term_idx = prompt_and_response.rfind(search_term)
    assert (
        search_term_idx != -1
    ), f"Prompt and response does not contain '{search_term}'"
    return prompt_and_response[: search_term_idx + len(search_term)]
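

# A small illustrative check (not part of the original module): the extracted
# prompt is everything up to and including the final "\n\nAssistant:" marker.
def _demo_extract_anthropic_prompt():
    text = "\n\nHuman: What is 2 + 2?\n\nAssistant: 4"
    assert extract_anthropic_prompt(text) == "\n\nHuman: What is 2 + 2?\n\nAssistant:"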


def _parse_row(prompt_and_response):
    """Split an hh-rlhf row into its shared prompt, chosen and rejected responses."""
    search_term = "\n\nAssistant:"
    search_term_idx = prompt_and_response["chosen"].rfind(search_term)
    assert (
        search_term_idx != -1
    ), f"Prompt and response does not contain '{search_term}'"
    prompt = prompt_and_response["chosen"][: search_term_idx + len(search_term)]
    chosen_response = prompt_and_response["chosen"][len(prompt) :]
    rejected_response = prompt_and_response["rejected"][len(prompt) :]
    return prompt, chosen_response, rejected_response
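

# Illustrative example (not part of the original module): chosen and rejected
# share the prompt prefix, so the row splits into one prompt and two
# competing completions.
def _demo_parse_row():
    row = {
        "chosen": "\n\nHuman: Name a color.\n\nAssistant: Blue.",
        "rejected": "\n\nHuman: Name a color.\n\nAssistant: Loud.",
    }
    prompt, chosen, rejected = _parse_row(row)
    assert prompt == "\n\nHuman: Name a color.\n\nAssistant:"
    assert chosen == " Blue." and rejected == " Loud."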


def _split_up_prompt(prompt):
    """Split an Anthropic-style prompt into (human, assistant) turn pairs."""
    human_texts = re.findall(
        r"\n\nHuman:(.*?)(?=(\n\nAssistant:|$))", prompt, flags=re.DOTALL
    )
    assistant_texts = re.findall(
        r"\n\nAssistant:(.*?)(?=(\n\nHuman:|$))", prompt, flags=re.DOTALL
    )
    # findall returns (text, lookahead) tuples because of the inner group;
    # keep only the captured text.
    human_texts = [text[0].strip() for text in human_texts]
    assistant_texts = [text[0].strip() for text in assistant_texts]
    assert len(human_texts) == len(assistant_texts), prompt
    dialogue = list(zip(human_texts, assistant_texts))
    return dialogue
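

# Illustrative example (not part of the original module): a two-turn prompt
# yields one (human, assistant) pair per turn; the final "\n\nAssistant:"
# turn is still open, so its assistant text is empty.
def _demo_split_up_prompt():
    prompt = "\n\nHuman: Hi.\n\nAssistant: Hello!\n\nHuman: How are you?\n\nAssistant:"
    assert _split_up_prompt(prompt) == [("Hi.", "Hello!"), ("How are you?", "")]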


def prepare_hh_dpo_modeling(split: str) -> pd.DataFrame:
    """
    Prepare the Anthropic/hh-rlhf dataset for DPO modeling.

    Adapted from
    https://github.com/eric-mitchell/direct-preference-optimization/blob/main/preference_datasets.py
    """
    dataset = load_dataset("Anthropic/hh-rlhf", split=split)

    # Seeded RNG so the generated message ids are reproducible across runs.
    rnd = random.Random()
    rnd.seed(123)

    dfs = []
    for row in tqdm(dataset):
        prompt, chosen_response, rejected_response = _parse_row(row)
        if len(rejected_response) == 0:
            # remove rejected answers that are empty
            continue

        # Flatten the multi-turn prompt into one row per turn, chained
        # together via parent_id.
        parent_uuid = None
        parsed_texts = []
        for human_text, assistant_text in _split_up_prompt(prompt):
            random_uuid = str(uuid.UUID(int=rnd.getrandbits(128), version=4))
            parsed_texts += [
                [human_text, assistant_text, random_uuid, parent_uuid, None, None]
            ]
            parent_uuid = random_uuid

        # Only the final turn carries the competing responses.
        parsed_texts[-1][-2] = chosen_response
        parsed_texts[-1][-1] = rejected_response
        df = pd.DataFrame(
            parsed_texts,
            columns=[
                "instruction",
                "output",
                "id",
                "parent_id",
                "chosen_response",
                "rejected_response",
            ],
        )
        dfs.append(df)
    df = pd.concat(dfs).reset_index(drop=True)

    # For non-final turns, both chosen and rejected fall back to the shared
    # assistant answer that already appears in the prompt.
    df["chosen_response"] = df["chosen_response"].fillna(df["output"])
    df["rejected_response"] = df["rejected_response"].fillna(df["output"])
    del df["output"]
    return df
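

# A minimal usage sketch (illustrative, not part of the original module):
# building the smaller "test" split and peeking at the flattened dialogue
# rows; each row carries the turn's instruction plus chosen/rejected
# responses, with earlier turns repeating the shared answer.
def _demo_prepare_hh_dpo():
    df = prepare_hh_dpo_modeling("test")
    print(df[["instruction", "chosen_response", "rejected_response"]].head())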