bstraehle committed
Commit 613b540 · verified · Parent: 83710df

Update app.py

Files changed (1): app.py (+33, −33)
app.py CHANGED
@@ -6,7 +6,7 @@ from peft import LoraConfig
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline
 from trl import SFTTrainer, setup_chat_format
 
-# Fine-tune on NVidia A10G Large (sleep after 1 hour)
+# Fine-tune on NVidia 4xL4 (sleep after 10 hours)
 
 hf_profile = "bstraehle"
 
@@ -38,11 +38,42 @@ def prompt_model(model_id, system_prompt, user_prompt, schema):
 
 def fine_tune_model(base_model_id, dataset):
     #tokenizer = download_model(base_model_id)
-    download_dataset(dataset)
+    prepare_dataset(dataset)
     train_model(base_model_id)
     #fine_tuned_model_id = upload_model(base_model_id, tokenizer)
     return "fine_tuned_model_id"
 
+def download_model(base_model_id):
+    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+    model = AutoModelForCausalLM.from_pretrained(base_model_id)
+    model.save_pretrained(base_model_id)
+    return tokenizer
+
+def create_conversation(sample):
+    return {
+        "messages": [
+            {"role": "system", "content": system_prompt.format(schema=sample["context"])},
+            {"role": "user", "content": sample["question"]},
+            {"role": "assistant", "content": sample["answer"]}
+        ]
+    }
+
+def prepare_dataset(dataset):
+    dataset = load_dataset(dataset, split="train")
+    dataset = dataset.shuffle().select(range(12500))
+
+    # Convert dataset to OAI messages
+    dataset = dataset.map(create_conversation, remove_columns=dataset.features, batched=False)
+    # split dataset into 10,000 training samples and 2,500 test samples
+    dataset = dataset.train_test_split(test_size=2500/12500)
+
+    print(dataset["train"][345]["messages"])
+
+    # save datasets to disk
+    dataset["train"].to_json("train_dataset.json", orient="records")
+    dataset["test"].to_json("test_dataset.json", orient="records")
+    ###
+
 def train_model(model_id):
     print("111")
     dataset = load_dataset("json", data_files="train_dataset.json", split="train")
@@ -124,38 +155,7 @@ def train_model(model_id):
     del model
     del trainer
     torch.cuda.empty_cache()
-
-def download_model(base_model_id):
-    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
-    model = AutoModelForCausalLM.from_pretrained(base_model_id)
-    model.save_pretrained(base_model_id)
-    return tokenizer
-
-def create_conversation(sample):
-    return {
-        "messages": [
-            {"role": "system", "content": system_prompt.format(schema=sample["context"])},
-            {"role": "user", "content": sample["question"]},
-            {"role": "assistant", "content": sample["answer"]}
-        ]
-    }
 
-def download_dataset(dataset):
-    dataset = load_dataset(dataset, split="train")
-    dataset = dataset.shuffle().select(range(12500))
-
-    # Convert dataset to OAI messages
-    dataset = dataset.map(create_conversation, remove_columns=dataset.features, batched=False)
-    # split dataset into 10,000 training samples and 2,500 test samples
-    dataset = dataset.train_test_split(test_size=2500/12500)
-
-    print(dataset["train"][345]["messages"])
-
-    # save datasets to disk
-    dataset["train"].to_json("train_dataset.json", orient="records")
-    dataset["test"].to_json("test_dataset.json", orient="records")
-    ###
-
 def upload_model(base_model_id, tokenizer):
     fine_tuned_model_id = replace_hf_profile(base_model_id)
     login(token=os.environ["HF_TOKEN"])
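
For readers tracing the flow above: prepare_dataset serializes the shuffled, chat-formatted splits to local JSON, and train_model reloads the training split from that file rather than pulling it from the Hub again. A minimal sketch of that hand-off, assuming prepare_dataset has already run (the dataset id is whatever the caller passes to fine_tune_model; the file names match the code above):

from datasets import load_dataset

# prepare_dataset() leaves two files in the working directory:
#   train_dataset.json  -- 10,000 chat-formatted records
#   test_dataset.json   --  2,500 chat-formatted records
# train_model() then reads the training split back from local JSON:
train_ds = load_dataset("json", data_files="train_dataset.json", split="train")
print(train_ds[0]["messages"])  # [{'role': 'system', ...}, {'role': 'user', ...}, ...]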