Update app.py
--- a/app.py
+++ b/app.py
@@ -6,7 +6,7 @@ from peft import LoraConfig
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline
 from trl import SFTTrainer, setup_chat_format
 
-# Fine-tune on NVidia
+# Fine-tune on NVidia 4xL4 (sleep after 10 hours)
 
 hf_profile = "bstraehle"
 
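The hunk above only changes a comment, but its context lines show the intended stack: quantized loading via BitsAndBytesConfig, plus LoRA (peft) and TRL's SFTTrainer. A minimal sketch of how those imports are typically wired together; the model id and every setting below are placeholders, not values from this commit:

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    base_model_id = "meta-llama/Meta-Llama-3-8B"  # placeholder; the Space passes its own id

    # Hypothetical 4-bit config; the Space's actual values are not shown in this diff.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=bnb_config,
        device_map="auto",  # shard across the 4xL4 GPUs mentioned in the comment
    )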
@@ -38,11 +38,42 @@ def prompt_model(model_id, system_prompt, user_prompt, schema):
 
 def fine_tune_model(base_model_id, dataset):
     #tokenizer = download_model(base_model_id)
-
+    prepare_dataset(dataset)
     train_model(base_model_id)
     #fine_tuned_model_id = upload_model(base_model_id, tokenizer)
     return "fine_tuned_model_id"
 
+def download_model(base_model_id):
+    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+    model = AutoModelForCausalLM.from_pretrained(base_model_id)
+    model.save_pretrained(base_model_id)
+    return tokenizer
+
+def create_conversation(sample):
+    return {
+        "messages": [
+            {"role": "system", "content": system_prompt.format(schema=sample["context"])},
+            {"role": "user", "content": sample["question"]},
+            {"role": "assistant", "content": sample["answer"]}
+        ]
+    }
+
+def prepare_dataset(dataset):
+    dataset = load_dataset(dataset, split="train")
+    dataset = dataset.shuffle().select(range(12500))
+
+    # Convert dataset to OAI messages
+    dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
+    # split dataset into 10,000 training samples and 2,500 test samples
+    dataset = dataset.train_test_split(test_size=2500/12500)
+
+    print(dataset["train"][345]["messages"])
+
+    # save datasets to disk
+    dataset["train"].to_json("train_dataset.json", orient="records")
+    dataset["test"].to_json("test_dataset.json", orient="records")
+    ###
+
 def train_model(model_id):
     print("111")
     dataset = load_dataset("json", data_files="train_dataset.json", split="train")
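The functions added in this hunk turn raw dataset rows into chat-format messages before training. For a row with the context/question/answer columns that create_conversation reads (the concrete sample below is illustrative, not taken from the commit), the mapping produces:

    # Illustrative text-to-SQL style sample; only the column names are dictated by the code.
    sample = {
        "context": "CREATE TABLE head (age INTEGER)",
        "question": "How many heads of the departments are older than 56?",
        "answer": "SELECT COUNT(*) FROM head WHERE age > 56",
    }
    print(create_conversation(sample))
    # {"messages": [
    #     {"role": "system", "content": <system_prompt with the CREATE TABLE schema filled in>},
    #     {"role": "user", "content": "How many heads of the departments are older than 56?"},
    #     {"role": "assistant", "content": "SELECT COUNT(*) FROM head WHERE age > 56"},
    # ]}

train_test_split(test_size=2500/12500) then holds out 2,500 of the 12,500 shuffled rows for evaluation, and the two splits are written to train_dataset.json and test_dataset.json, which train_model reads back from disk.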
@@ -124,38 +155,7 @@ def train_model(model_id):
     del model
     del trainer
     torch.cuda.empty_cache()
-
-def download_model(base_model_id):
-    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
-    model = AutoModelForCausalLM.from_pretrained(base_model_id)
-    model.save_pretrained(base_model_id)
-    return tokenizer
-
-def create_conversation(sample):
-    return {
-        "messages": [
-            {"role": "system", "content": system_prompt.format(schema=sample["context"])},
-            {"role": "user", "content": sample["question"]},
-            {"role": "assistant", "content": sample["answer"]}
-        ]
-    }
 
-def download_dataset(dataset):
-    dataset = load_dataset(dataset, split="train")
-    dataset = dataset.shuffle().select(range(12500))
-
-    # Convert dataset to OAI messages
-    dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
-    # split dataset into 10,000 training samples and 2,500 test samples
-    dataset = dataset.train_test_split(test_size=2500/12500)
-
-    print(dataset["train"][345]["messages"])
-
-    # save datasets to disk
-    dataset["train"].to_json("train_dataset.json", orient="records")
-    dataset["test"].to_json("test_dataset.json", orient="records")
-    ###
-
 def upload_model(base_model_id, tokenizer):
     fine_tuned_model_id = replace_hf_profile(base_model_id)
     login(token=os.environ["HF_TOKEN"])
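This hunk removes download_model, create_conversation, and download_dataset from the bottom of the file; the first two reappear verbatim above train_model, and download_dataset returns renamed as prepare_dataset. The trailing upload_model is cut off at the hunk boundary, so only its first two lines are visible. A sketch of how such a function commonly continues, assuming the standard push_to_hub API; the body below is a guess for illustration, not the Space's actual code:

    def upload_model(base_model_id, tokenizer):
        fine_tuned_model_id = replace_hf_profile(base_model_id)
        login(token=os.environ["HF_TOKEN"])
        # Assumed continuation: load the fine-tuned weights and publish them
        # under the renamed repo id, together with the tokenizer.
        model = AutoModelForCausalLM.from_pretrained(base_model_id)
        model.push_to_hub(fine_tuned_model_id)
        tokenizer.push_to_hub(fine_tuned_model_id)
        return fine_tuned_model_id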