Spaces:

bstraehle
/

sft

Running

App Files Files Community

bstraehle committed on Jul 13, 2024

Commit

7f9f34a

•

1 Parent(s): 7e05fe4

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -4

app.py CHANGED Viewed

@@ -18,8 +18,8 @@ system_prompt = "You are a text to SQL query translator. Given a question in Eng
 user_prompt = "What is the total trade value and average price for each trader and stock in the trade_history table?"
 schema = "CREATE TABLE trade_history (id INT, trader_id INT, stock VARCHAR(255), price DECIMAL(5,2), quantity INT, trade_time TIMESTAMP);"
-base_model_id = "google/gemma-2-9b-it" # "meta-llama/Meta-Llama-3-8B-Instruct"
-dataset = "b-mc2/sql-create-context"
 def prompt_model(model_id, system_prompt, user_prompt, schema):
     pipe = pipeline("text-generation",
@@ -64,12 +64,73 @@ def prompt_model(model_id, system_prompt, user_prompt, schema):
 #    print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")
 def fine_tune_model(base_model_id, dataset):
-    tokenizer = download_model(base_model_id)
     #prepare_dataset(dataset)
     #train_model(base_model_id)
-    fine_tuned_model_id = upload_model(base_model_id, tokenizer)
     return fine_tuned_model_id
 def download_model(base_model_id):
     tokenizer = AutoTokenizer.from_pretrained(base_model_id)
     model = AutoModelForCausalLM.from_pretrained(base_model_id)

+# Sample natural-language question (about the trade_history table) for the text-to-SQL prompt.
 user_prompt = "What is the total trade value and average price for each trader and stock in the trade_history table?"
+# CREATE TABLE statement for the trade_history table that the question above refers to.
 schema = "CREATE TABLE trade_history (id INT, trader_id INT, stock VARCHAR(255), price DECIMAL(5,2), quantity INT, trade_time TIMESTAMP);"
+# Model identifier; presumably the value passed to fine_tune_model — confirm against the caller.
+base_model_id = "microsoft/Phi-3-mini-4k-instruct"
+# Dataset identifier; presumably the value passed to fine_tune_model — confirm against the caller.
+dataset = "gretelai/synthetic_text_to_sql"
 def prompt_model(model_id, system_prompt, user_prompt, schema):
     pipe = pipeline("text-generation",
 #    print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")
 def fine_tune_model(base_model_id, dataset):
+    """Run the experimental ``test`` training routine on the given model and dataset.
+
+    The download and upload steps are currently commented out, so the only
+    work done here is the call to ``test(base_model_id, dataset)``.
+
+    NOTE(review): ``fine_tuned_model_id`` is no longer assigned inside this
+    function. Unless a module-level variable of that name exists, the
+    ``return`` below raises ``NameError`` — confirm and restore the
+    assignment (or return an explicit value) before relying on the result.
+    """
+    test(base_model_id, dataset)
+    ##tokenizer = download_model(base_model_id)
     #prepare_dataset(dataset)
     #train_model(base_model_id)
+    ##fine_tuned_model_id = upload_model(base_model_id, tokenizer)
     return fine_tuned_model_id
+def test(base_model_id, dataset):
+    """Fine-tune ``base_model_id`` on ``dataset`` with ``SFTTrainer`` and save the result.
+
+    Loads the model (in float16) and its tokenizer, loads the ``train`` split
+    of the dataset, renders every conversation into a single training string
+    stored in a ``"text"`` column, trains, and saves the model to
+    ``NEW_MODEL_NAME``.
+
+    Args:
+        base_model_id: Model identifier passed to ``from_pretrained``.
+        dataset: Dataset identifier passed to ``load_dataset``.
+
+    Returns:
+        None. The trained model is written to ``NEW_MODEL_NAME``.
+    """
+    print("111")
+    model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.float16)
+    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+    # Load the dataset for fine-tuning. A separate local name is used so the
+    # ``dataset`` argument (the identifier) is not rebound to the loaded object.
+    print("222")
+    train_dataset = load_dataset(dataset, split="train")
+    # Role prefixes written in front of every conversation turn.
+    role_prefix = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
+    # Define the formatting function for the prompts
+    def formatting_prompts_func(examples):
+        # NOTE(review): assumes each row has a "conversations" column holding
+        # a list of {"from": ..., "value": ...} turns — confirm that the
+        # chosen dataset actually provides this column.
+        texts = []
+        for convo in examples["conversations"]:
+            text = "".join(f"{role_prefix[turn['from']]} {turn['value']}\n" for turn in convo)
+            texts.append(f"{text}{tokenizer.eos_token}")
+        return {"text": texts}
+    # Apply the formatting function to the dataset
+    print("333")
+    train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
+    # Define the training arguments. No evaluation strategy is set: no
+    # evaluation dataset is supplied to the trainer, so step-wise evaluation
+    # could not run.
+    # NOTE(review): the model is loaded in float16 and fp16=True is also set;
+    # verify this combination trains without gradient-scaling errors.
+    print("444")
+    args = TrainingArguments(
+        per_device_train_batch_size=7,
+        gradient_accumulation_steps=4,
+        gradient_checkpointing=True,
+        learning_rate=1e-4,
+        fp16=True,
+        max_steps=-1,
+        num_train_epochs=3,
+        save_strategy="epoch",
+        logging_steps=10,
+        output_dir=NEW_MODEL_NAME,
+        optim="paged_adamw_32bit",
+        lr_scheduler_type="linear",
+    )
+    # Create the trainer. The dataset already carries the rendered "text"
+    # column, so no formatting_func is passed: the function above returns a
+    # dict for ``Dataset.map``, not the list of strings a formatting_func
+    # is expected to return.
+    print("555")
+    trainer = SFTTrainer(
+        model=model,
+        args=args,
+        train_dataset=train_dataset,
+        dataset_text_field="text",
+        max_seq_length=128,
+    )
+    # Start the training process
+    print("666")
+    trainer.train()
+    print("777")
+    trainer.save_model()
 def download_model(base_model_id):
     tokenizer = AutoTokenizer.from_pretrained(base_model_id)
     model = AutoModelForCausalLM.from_pretrained(base_model_id)