Commit
·
dfae4e0
1
Parent(s):
23690d0
First commit
Browse files- app.py +59 -0
- checkpoint_dir/ERA_V2_S30.ipynb +974 -0
- checkpoint_dir/README.md +202 -0
- checkpoint_dir/adapter_config.json +29 -0
- checkpoint_dir/adapter_model.safetensors +3 -0
- checkpoint_dir/all_results.json +13 -0
- checkpoint_dir/checkpoint-6000/optimizer.pt +3 -0
- checkpoint_dir/checkpoint-6000/phi_model/README.md +202 -0
- checkpoint_dir/checkpoint-6000/phi_model/adapter_config.json +29 -0
- checkpoint_dir/checkpoint-6000/phi_model/adapter_model.safetensors +3 -0
- checkpoint_dir/checkpoint-6000/projection_layer/pytorch_model.bin +3 -0
- checkpoint_dir/checkpoint-6000/rng_state.pth +3 -0
- checkpoint_dir/checkpoint-6000/scheduler.pt +3 -0
- checkpoint_dir/checkpoint-6000/trainer_state.json +453 -0
- checkpoint_dir/config.json +51 -0
- checkpoint_dir/eval_results.json +8 -0
- checkpoint_dir/image_projector.pth +3 -0
- checkpoint_dir/train_results.json +8 -0
- checkpoint_dir/trainer_state.json +462 -0
- model.py +27 -0
- requirements.txt +24 -0
app.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
2 |
+
from peft import PeftModel
|
3 |
+
import torch
|
4 |
+
import clip
|
5 |
+
from PIL import Image
|
6 |
+
import torch.nn as nn
|
7 |
+
from model import Projections
|
8 |
+
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
9 |
+
import gradio as gr
|
10 |
+
|
11 |
+
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Projection module from model.py, restored from the checkpoint-6000 weights.
# NOTE(review): 512 -> 3072 presumably maps CLIP features to the Phi-3 hidden
# size; confirm against model.py.
projections = Projections(512, 3072)
projections.load_state_dict(
    torch.load(
        'checkpoint_dir/checkpoint-6000/projection_layer/pytorch_model.bin',
        map_location=device,
    ),
    # Non-strict load: keys that do not line up with the module are tolerated
    # instead of raising.
    strict=False,
)
projections = projections.to(device).to(torch.bfloat16)

# Base language model; the fine-tuned adapter is applied on top of it below.
checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
model_kwargs = {
    'use_cache': False,
    'trust_remote_code': True,
    'attn_implementation': 'eager',
    'torch_dtype': torch.bfloat16,
    'device_map': None,
}
base_model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)

new_model = "checkpoint_dir/checkpoint-6000/phi_model"  # change to the path where your model is saved

# Attach the PEFT adapter, fold it into the base weights, then move the
# merged model to the selected device.
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload().to(device)

# Tokenizer of the base checkpoint, adjusted for this app.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, trust_remote_code=True)
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'
# Chat template over messages carrying 'from' (system / human / gpt) and
# 'value' fields. The literal is split at the same points as before; both
# splits fall inside Jinja tags, so the pieces join into the same template.
tokenizer.chat_template = (
    "{% for message in messages %}{% if message['from'] == 'system' %}{{'<|system|>' + message['value'] + '<|end|>'}}{% elif message['from'] =="
    "'human' %}{{'<|user|>' + message['value'] + '<|end|>'}}{% elif message['from'] == 'gpt' %}{{'<|assistant|>' + message['value'] +"
    "'<|end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}"
)

# CLIP image encoder and its preprocessing transform, placed on `device`.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

# Load Whisper model and processor
# NOTE(review): unlike the models above, the Whisper model is not moved to
# `device` here - confirm that is intended.
whisper_model_name = "openai/whisper-small"
whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
|
48 |
+
|
49 |
+
def infer(message, history):
    """Placeholder chat handler: reply with the names of the message's fields.

    Args:
        message: The incoming user message. Either a mapping (the example
            prompts in this file are dicts with a 'text' field) or a plain
            string, which is treated as a message with a single 'text' field.
        history: Previous turns of the conversation; currently unused.

    Returns:
        A comma-separated string of the field names found in ``message``
        (empty when the mapping has no fields).
    """
    # A plain string has no .keys(); normalise it to the dict shape so the
    # handler does not raise AttributeError on text-only input.
    if isinstance(message, str):
        message = {'text': message}
    # Return a real string rather than a dict_keys view, which is not a
    # displayable chat reply.
    return ", ".join(str(key) for key in message)
|
51 |
+
|
52 |
+
# Starter prompts offered in the chat UI; each entry is a dict with a single
# 'text' field.
examples = [
    {'text': prompt}
    for prompt in (
        "I am planning to buy a dog and a cat. Suggest some breeds that get along with each other",
        "Explain biased coin flip",
        "I want to buy a house. Suggest some factors to consider while making final decision",
    )
]

# Build the chat UI around `infer` and start serving it right away.
gr.ChatInterface(
    infer,
    chatbot=gr.Chatbot(height=600),
    textbox=gr.Textbox(
        placeholder="How can I help you today",
        container=False,
        scale=7,
    ),
    theme="soft",
    examples=examples,
    undo_btn=None,
    title="Phi-3 Multimodel Assistant",
).launch()
|
checkpoint_dir/ERA_V2_S30.ipynb
ADDED
@@ -0,0 +1,974 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {
|
7 |
+
"id": "C_YSfsRILGPG",
|
8 |
+
"tags": [],
|
9 |
+
"colab": {
|
10 |
+
"base_uri": "https://localhost:8080/"
|
11 |
+
},
|
12 |
+
"outputId": "107ed765-da2b-4d6e-e562-43c5573d8566"
|
13 |
+
},
|
14 |
+
"outputs": [
|
15 |
+
{
|
16 |
+
"output_type": "stream",
|
17 |
+
"name": "stdout",
|
18 |
+
"text": [
|
19 |
+
"fatal: destination path 'multi_model_phi_3' already exists and is not an empty directory.\n"
|
20 |
+
]
|
21 |
+
}
|
22 |
+
],
|
23 |
+
"source": [
|
24 |
+
"!git clone https://github.com/AkashDataScience/multi_model_phi_3"
|
25 |
+
]
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"cell_type": "code",
|
29 |
+
"execution_count": null,
|
30 |
+
"metadata": {
|
31 |
+
"id": "CBVAhJBULs5R",
|
32 |
+
"tags": [],
|
33 |
+
"colab": {
|
34 |
+
"base_uri": "https://localhost:8080/"
|
35 |
+
},
|
36 |
+
"outputId": "3843bc68-eac9-45aa-a061-b284ae3ddefd"
|
37 |
+
},
|
38 |
+
"outputs": [
|
39 |
+
{
|
40 |
+
"output_type": "stream",
|
41 |
+
"name": "stdout",
|
42 |
+
"text": [
|
43 |
+
"/content/multi_model_phi_3\n"
|
44 |
+
]
|
45 |
+
}
|
46 |
+
],
|
47 |
+
"source": [
|
48 |
+
"%cd multi_model_phi_3"
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"cell_type": "code",
|
53 |
+
"execution_count": null,
|
54 |
+
"metadata": {
|
55 |
+
"id": "75koL8tzLxKS",
|
56 |
+
"tags": [],
|
57 |
+
"colab": {
|
58 |
+
"base_uri": "https://localhost:8080/"
|
59 |
+
},
|
60 |
+
"outputId": "7c5217f4-e70b-4d6d-bda3-65e94878b0e5"
|
61 |
+
},
|
62 |
+
"outputs": [
|
63 |
+
{
|
64 |
+
"output_type": "stream",
|
65 |
+
"name": "stdout",
|
66 |
+
"text": [
|
67 |
+
"Collecting clip@ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1 (from -r requirements.txt (line 2))\n",
|
68 |
+
" Using cached clip-1.0-py3-none-any.whl\n",
|
69 |
+
"Requirement already satisfied: bitsandbytes==0.43.3 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 1)) (0.43.3)\n",
|
70 |
+
"Requirement already satisfied: colorama==0.4.6 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 3)) (0.4.6)\n",
|
71 |
+
"Requirement already satisfied: datasets==3.0.0 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 4)) (3.0.0)\n",
|
72 |
+
"Requirement already satisfied: dill==0.3.8 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 5)) (0.3.8)\n",
|
73 |
+
"Requirement already satisfied: multiprocess==0.70.16 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 6)) (0.70.16)\n",
|
74 |
+
"Requirement already satisfied: numpy==1.26.4 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 7)) (1.26.4)\n",
|
75 |
+
"Requirement already satisfied: pandas==2.2.2 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 8)) (2.2.2)\n",
|
76 |
+
"Requirement already satisfied: peft==0.12.0 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 9)) (0.12.0)\n",
|
77 |
+
"Requirement already satisfied: shtab==1.7.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 10)) (1.7.1)\n",
|
78 |
+
"Requirement already satisfied: tokenizers==0.19.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 11)) (0.19.1)\n",
|
79 |
+
"Requirement already satisfied: torch==2.4.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 12)) (2.4.1+cu121)\n",
|
80 |
+
"Requirement already satisfied: torchvision==0.19.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 13)) (0.19.1+cu121)\n",
|
81 |
+
"Requirement already satisfied: tqdm==4.66.5 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 14)) (4.66.5)\n",
|
82 |
+
"Requirement already satisfied: transformers==4.44.2 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 15)) (4.44.2)\n",
|
83 |
+
"Requirement already satisfied: treelib==1.7.0 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 16)) (1.7.0)\n",
|
84 |
+
"Requirement already satisfied: trl==0.10.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 17)) (0.10.1)\n",
|
85 |
+
"Requirement already satisfied: typing_extensions==4.12.2 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 18)) (4.12.2)\n",
|
86 |
+
"Requirement already satisfied: tyro==0.8.10 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 19)) (0.8.10)\n",
|
87 |
+
"Requirement already satisfied: tzdata==2024.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 20)) (2024.1)\n",
|
88 |
+
"Requirement already satisfied: urllib3==2.2.3 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 21)) (2.2.3)\n",
|
89 |
+
"Requirement already satisfied: wcwidth==0.2.13 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 22)) (0.2.13)\n",
|
90 |
+
"Requirement already satisfied: xxhash==3.5.0 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 23)) (3.5.0)\n",
|
91 |
+
"Requirement already satisfied: yarl==1.11.1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 24)) (1.11.1)\n",
|
92 |
+
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (3.16.1)\n",
|
93 |
+
"Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (16.1.0)\n",
|
94 |
+
"Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (2.32.3)\n",
|
95 |
+
"Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets==3.0.0->-r requirements.txt (line 4)) (2024.6.1)\n",
|
96 |
+
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (3.10.5)\n",
|
97 |
+
"Requirement already satisfied: huggingface-hub>=0.22.0 in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (0.24.7)\n",
|
98 |
+
"Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (24.1)\n",
|
99 |
+
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets==3.0.0->-r requirements.txt (line 4)) (6.0.2)\n",
|
100 |
+
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas==2.2.2->-r requirements.txt (line 8)) (2.8.2)\n",
|
101 |
+
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas==2.2.2->-r requirements.txt (line 8)) (2024.2)\n",
|
102 |
+
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft==0.12.0->-r requirements.txt (line 9)) (5.9.5)\n",
|
103 |
+
"Requirement already satisfied: accelerate>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.12.0->-r requirements.txt (line 9)) (0.34.2)\n",
|
104 |
+
"Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from peft==0.12.0->-r requirements.txt (line 9)) (0.4.5)\n",
|
105 |
+
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch==2.4.1->-r requirements.txt (line 12)) (1.13.3)\n",
|
106 |
+
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch==2.4.1->-r requirements.txt (line 12)) (3.3)\n",
|
107 |
+
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch==2.4.1->-r requirements.txt (line 12)) (3.1.4)\n",
|
108 |
+
"Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision==0.19.1->-r requirements.txt (line 13)) (10.4.0)\n",
|
109 |
+
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.44.2->-r requirements.txt (line 15)) (2024.9.11)\n",
|
110 |
+
"Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from treelib==1.7.0->-r requirements.txt (line 16)) (1.16.0)\n",
|
111 |
+
"Requirement already satisfied: docstring-parser>=0.16 in /usr/local/lib/python3.10/dist-packages (from tyro==0.8.10->-r requirements.txt (line 19)) (0.16)\n",
|
112 |
+
"Requirement already satisfied: rich>=11.1.0 in /usr/local/lib/python3.10/dist-packages (from tyro==0.8.10->-r requirements.txt (line 19)) (13.9.1)\n",
|
113 |
+
"Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.10/dist-packages (from yarl==1.11.1->-r requirements.txt (line 24)) (3.10)\n",
|
114 |
+
"Requirement already satisfied: multidict>=4.0 in /usr/local/lib/python3.10/dist-packages (from yarl==1.11.1->-r requirements.txt (line 24)) (6.1.0)\n",
|
115 |
+
"Requirement already satisfied: ftfy in /usr/local/lib/python3.10/dist-packages (from clip@ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1->-r requirements.txt (line 2)) (6.2.3)\n",
|
116 |
+
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==3.0.0->-r requirements.txt (line 4)) (2.4.3)\n",
|
117 |
+
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==3.0.0->-r requirements.txt (line 4)) (1.3.1)\n",
|
118 |
+
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==3.0.0->-r requirements.txt (line 4)) (24.2.0)\n",
|
119 |
+
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==3.0.0->-r requirements.txt (line 4)) (1.4.1)\n",
|
120 |
+
"Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==3.0.0->-r requirements.txt (line 4)) (4.0.3)\n",
|
121 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets==3.0.0->-r requirements.txt (line 4)) (3.3.2)\n",
|
122 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets==3.0.0->-r requirements.txt (line 4)) (2024.8.30)\n",
|
123 |
+
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=11.1.0->tyro==0.8.10->-r requirements.txt (line 19)) (3.0.0)\n",
|
124 |
+
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=11.1.0->tyro==0.8.10->-r requirements.txt (line 19)) (2.18.0)\n",
|
125 |
+
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch==2.4.1->-r requirements.txt (line 12)) (2.1.5)\n",
|
126 |
+
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch==2.4.1->-r requirements.txt (line 12)) (1.3.0)\n",
|
127 |
+
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=11.1.0->tyro==0.8.10->-r requirements.txt (line 19)) (0.1.2)\n"
|
128 |
+
]
|
129 |
+
}
|
130 |
+
],
|
131 |
+
"source": [
|
132 |
+
"!pip install -r requirements.txt"
|
133 |
+
]
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"cell_type": "code",
|
137 |
+
"execution_count": null,
|
138 |
+
"metadata": {
|
139 |
+
"id": "QauI2fQjWWTg",
|
140 |
+
"colab": {
|
141 |
+
"base_uri": "https://localhost:8080/"
|
142 |
+
},
|
143 |
+
"outputId": "fa8e0f93-d988-4108-93f7-15a7281a1b21"
|
144 |
+
},
|
145 |
+
"outputs": [
|
146 |
+
{
|
147 |
+
"output_type": "stream",
|
148 |
+
"name": "stdout",
|
149 |
+
"text": [
|
150 |
+
"/content/multi_model_phi_3/image_finetuning/finetuning\n"
|
151 |
+
]
|
152 |
+
}
|
153 |
+
],
|
154 |
+
"source": [
|
155 |
+
"%cd image_finetuning/finetuning"
|
156 |
+
]
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"cell_type": "code",
|
160 |
+
"source": [
|
161 |
+
"!wget -c https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json"
|
162 |
+
],
|
163 |
+
"metadata": {
|
164 |
+
"id": "koXJ8mCciYYn",
|
165 |
+
"colab": {
|
166 |
+
"base_uri": "https://localhost:8080/"
|
167 |
+
},
|
168 |
+
"outputId": "ad30c91c-59f7-48c3-c6c9-7cd824dbdaaf"
|
169 |
+
},
|
170 |
+
"execution_count": null,
|
171 |
+
"outputs": [
|
172 |
+
{
|
173 |
+
"output_type": "stream",
|
174 |
+
"name": "stdout",
|
175 |
+
"text": [
|
176 |
+
"--2024-10-10 15:16:24-- https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json\n",
|
177 |
+
"Resolving huggingface.co (huggingface.co)... 3.165.160.12, 3.165.160.59, 3.165.160.11, ...\n",
|
178 |
+
"Connecting to huggingface.co (huggingface.co)|3.165.160.12|:443... connected.\n",
|
179 |
+
"HTTP request sent, awaiting response... 302 Found\n",
|
180 |
+
"Location: https://cdn-lfs.hf.co/repos/4d/41/4d41ea1e2709f0e68e9e361e4218192b9620c5a3f2cb8055bc625942b6cd3039/6b68bc5ca2bfd8a71119af0e8454929668ccda6a334955ccc95d114fc8d082fa?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27llava_instruct_150k.json%3B+filename%3D%22llava_instruct_150k.json%22%3B&response-content-type=application%2Fjson&Expires=1728832584&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyODgzMjU4NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy80ZC80MS80ZDQxZWExZTI3MDlmMGU2OGU5ZTM2MWU0MjE4MTkyYjk2MjBjNWEzZjJjYjgwNTViYzYyNTk0MmI2Y2QzMDM5LzZiNjhiYzVjYTJiZmQ4YTcxMTE5YWYwZTg0NTQ5Mjk2NjhjY2RhNmEzMzQ5NTVjY2M5NWQxMTRmYzhkMDgyZmE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=hSNSHtz4qcHoAKL%7EBQFjBgq04GmcG2H-ajjYJrixr%7EHufuWwWQMy5AcuLKkDmolFgE8M82AnKQ08idN5ZvzJcgcoyt4QLWmrwLFRMnkORPQNFAoZk9FKvkthxfpdIdLtTZoPb6BqMg5l4SeggvOSC5q8%7EtfC5ASQMw%7ExqIqSGPTo9yIb-CfLXyE3Ceef8E7MIfW8s796ZpgilPx1zhl4cx8s2DyieL84KckvhYxf2Lc5MRBZnUdl0sUuvHBlC7SCr5lB2v-W1veTiqwur9fSpQ4uawD1BApft-zlSA84DnjssFWqhBa-T49X5-P2fGLmwAPcyVlUT17%7EvhHc-reAJg__&Key-Pair-Id=K3RPWS32NSSJCE [following]\n",
|
181 |
+
"--2024-10-10 15:16:25-- https://cdn-lfs.hf.co/repos/4d/41/4d41ea1e2709f0e68e9e361e4218192b9620c5a3f2cb8055bc625942b6cd3039/6b68bc5ca2bfd8a71119af0e8454929668ccda6a334955ccc95d114fc8d082fa?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27llava_instruct_150k.json%3B+filename%3D%22llava_instruct_150k.json%22%3B&response-content-type=application%2Fjson&Expires=1728832584&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyODgzMjU4NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy80ZC80MS80ZDQxZWExZTI3MDlmMGU2OGU5ZTM2MWU0MjE4MTkyYjk2MjBjNWEzZjJjYjgwNTViYzYyNTk0MmI2Y2QzMDM5LzZiNjhiYzVjYTJiZmQ4YTcxMTE5YWYwZTg0NTQ5Mjk2NjhjY2RhNmEzMzQ5NTVjY2M5NWQxMTRmYzhkMDgyZmE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=hSNSHtz4qcHoAKL%7EBQFjBgq04GmcG2H-ajjYJrixr%7EHufuWwWQMy5AcuLKkDmolFgE8M82AnKQ08idN5ZvzJcgcoyt4QLWmrwLFRMnkORPQNFAoZk9FKvkthxfpdIdLtTZoPb6BqMg5l4SeggvOSC5q8%7EtfC5ASQMw%7ExqIqSGPTo9yIb-CfLXyE3Ceef8E7MIfW8s796ZpgilPx1zhl4cx8s2DyieL84KckvhYxf2Lc5MRBZnUdl0sUuvHBlC7SCr5lB2v-W1veTiqwur9fSpQ4uawD1BApft-zlSA84DnjssFWqhBa-T49X5-P2fGLmwAPcyVlUT17%7EvhHc-reAJg__&Key-Pair-Id=K3RPWS32NSSJCE\n",
|
182 |
+
"Resolving cdn-lfs.hf.co (cdn-lfs.hf.co)... 18.172.170.21, 18.172.170.29, 18.172.170.5, ...\n",
|
183 |
+
"Connecting to cdn-lfs.hf.co (cdn-lfs.hf.co)|18.172.170.21|:443... connected.\n",
|
184 |
+
"HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n",
|
185 |
+
"\n",
|
186 |
+
" The file is already fully retrieved; nothing to do.\n",
|
187 |
+
"\n"
|
188 |
+
]
|
189 |
+
}
|
190 |
+
]
|
191 |
+
},
|
192 |
+
{
|
193 |
+
"cell_type": "code",
|
194 |
+
"execution_count": null,
|
195 |
+
"metadata": {
|
196 |
+
"id": "B325mAHNtJCB",
|
197 |
+
"tags": [],
|
198 |
+
"colab": {
|
199 |
+
"base_uri": "https://localhost:8080/",
|
200 |
+
"height": 1000,
|
201 |
+
"referenced_widgets": [
|
202 |
+
"f8108b8c120d4f49ac2b0fa38d6213c3",
|
203 |
+
"868fd8ed1fb5432ab6d6761b4e4ce17d",
|
204 |
+
"f75b8c1cb15a4dc9871b28a286dd3b82",
|
205 |
+
"a08098910db44feabc38e65bf4a55379",
|
206 |
+
"13be5acd97ab44babec61cedcf5b2a3a",
|
207 |
+
"239ab0a871684811ae7a3e16daa8991a",
|
208 |
+
"4ca7774f57454d74bd1b7c9445030038",
|
209 |
+
"557ecfce51574b8db9236bdc8d0bd555",
|
210 |
+
"bc0e09ab397f42879b8c874eb10e6a2b",
|
211 |
+
"8dec270eb8c649649f6b95ddde159a0f",
|
212 |
+
"f9d65533a8fd4310b1466713c22d8255"
|
213 |
+
]
|
214 |
+
},
|
215 |
+
"outputId": "1a2eb33f-62b2-4f96-d3f5-365869a0aa0b"
|
216 |
+
},
|
217 |
+
"outputs": [
|
218 |
+
{
|
219 |
+
"output_type": "stream",
|
220 |
+
"name": "stderr",
|
221 |
+
"text": [
|
222 |
+
"WARNING:__main__:Process rank: 0, device: cuda:0, n_gpu: 1 distributed training: True, 16-bits training: False\n",
|
223 |
+
"INFO:__main__:Training/evaluation parameters TrainingArguments(\n",
|
224 |
+
"_n_gpu=1,\n",
|
225 |
+
"accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},\n",
|
226 |
+
"adafactor=False,\n",
|
227 |
+
"adam_beta1=0.9,\n",
|
228 |
+
"adam_beta2=0.999,\n",
|
229 |
+
"adam_epsilon=1e-08,\n",
|
230 |
+
"auto_find_batch_size=False,\n",
|
231 |
+
"batch_eval_metrics=False,\n",
|
232 |
+
"bf16=True,\n",
|
233 |
+
"bf16_full_eval=False,\n",
|
234 |
+
"data_seed=None,\n",
|
235 |
+
"dataloader_drop_last=False,\n",
|
236 |
+
"dataloader_num_workers=0,\n",
|
237 |
+
"dataloader_persistent_workers=False,\n",
|
238 |
+
"dataloader_pin_memory=True,\n",
|
239 |
+
"dataloader_prefetch_factor=None,\n",
|
240 |
+
"ddp_backend=None,\n",
|
241 |
+
"ddp_broadcast_buffers=None,\n",
|
242 |
+
"ddp_bucket_cap_mb=None,\n",
|
243 |
+
"ddp_find_unused_parameters=None,\n",
|
244 |
+
"ddp_timeout=1800,\n",
|
245 |
+
"debug=[],\n",
|
246 |
+
"deepspeed=None,\n",
|
247 |
+
"disable_tqdm=False,\n",
|
248 |
+
"dispatch_batches=None,\n",
|
249 |
+
"do_eval=False,\n",
|
250 |
+
"do_predict=False,\n",
|
251 |
+
"do_train=False,\n",
|
252 |
+
"eval_accumulation_steps=None,\n",
|
253 |
+
"eval_delay=0,\n",
|
254 |
+
"eval_do_concat_batches=True,\n",
|
255 |
+
"eval_on_start=False,\n",
|
256 |
+
"eval_steps=None,\n",
|
257 |
+
"eval_strategy=no,\n",
|
258 |
+
"eval_use_gather_object=False,\n",
|
259 |
+
"evaluation_strategy=None,\n",
|
260 |
+
"fp16=False,\n",
|
261 |
+
"fp16_backend=auto,\n",
|
262 |
+
"fp16_full_eval=False,\n",
|
263 |
+
"fp16_opt_level=O1,\n",
|
264 |
+
"fsdp=[],\n",
|
265 |
+
"fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},\n",
|
266 |
+
"fsdp_min_num_params=0,\n",
|
267 |
+
"fsdp_transformer_layer_cls_to_wrap=None,\n",
|
268 |
+
"full_determinism=False,\n",
|
269 |
+
"gradient_accumulation_steps=1,\n",
|
270 |
+
"gradient_checkpointing=True,\n",
|
271 |
+
"gradient_checkpointing_kwargs={'use_reentrant': False},\n",
|
272 |
+
"greater_is_better=None,\n",
|
273 |
+
"group_by_length=False,\n",
|
274 |
+
"half_precision_backend=auto,\n",
|
275 |
+
"hub_always_push=False,\n",
|
276 |
+
"hub_model_id=None,\n",
|
277 |
+
"hub_private_repo=False,\n",
|
278 |
+
"hub_strategy=every_save,\n",
|
279 |
+
"hub_token=<HUB_TOKEN>,\n",
|
280 |
+
"ignore_data_skip=False,\n",
|
281 |
+
"include_inputs_for_metrics=False,\n",
|
282 |
+
"include_num_input_tokens_seen=False,\n",
|
283 |
+
"include_tokens_per_second=False,\n",
|
284 |
+
"jit_mode_eval=False,\n",
|
285 |
+
"label_names=None,\n",
|
286 |
+
"label_smoothing_factor=0.0,\n",
|
287 |
+
"learning_rate=5e-06,\n",
|
288 |
+
"length_column_name=length,\n",
|
289 |
+
"load_best_model_at_end=False,\n",
|
290 |
+
"local_rank=0,\n",
|
291 |
+
"log_level=info,\n",
|
292 |
+
"log_level_replica=warning,\n",
|
293 |
+
"log_on_each_node=True,\n",
|
294 |
+
"logging_dir=./checkpoint_dir/runs/Oct10_15-16-37_33ba61f47fc9,\n",
|
295 |
+
"logging_first_step=False,\n",
|
296 |
+
"logging_nan_inf_filter=True,\n",
|
297 |
+
"logging_steps=20,\n",
|
298 |
+
"logging_strategy=steps,\n",
|
299 |
+
"lr_scheduler_kwargs={},\n",
|
300 |
+
"lr_scheduler_type=cosine,\n",
|
301 |
+
"max_grad_norm=1.0,\n",
|
302 |
+
"max_steps=60,\n",
|
303 |
+
"metric_for_best_model=None,\n",
|
304 |
+
"mp_parameters=,\n",
|
305 |
+
"neftune_noise_alpha=None,\n",
|
306 |
+
"no_cuda=False,\n",
|
307 |
+
"num_train_epochs=1,\n",
|
308 |
+
"optim=adamw_torch,\n",
|
309 |
+
"optim_args=None,\n",
|
310 |
+
"optim_target_modules=None,\n",
|
311 |
+
"output_dir=./checkpoint_dir,\n",
|
312 |
+
"overwrite_output_dir=True,\n",
|
313 |
+
"past_index=-1,\n",
|
314 |
+
"per_device_eval_batch_size=4,\n",
|
315 |
+
"per_device_train_batch_size=4,\n",
|
316 |
+
"prediction_loss_only=False,\n",
|
317 |
+
"push_to_hub=False,\n",
|
318 |
+
"push_to_hub_model_id=None,\n",
|
319 |
+
"push_to_hub_organization=None,\n",
|
320 |
+
"push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
|
321 |
+
"ray_scope=last,\n",
|
322 |
+
"remove_unused_columns=False,\n",
|
323 |
+
"report_to=['tensorboard'],\n",
|
324 |
+
"restore_callback_states_from_checkpoint=False,\n",
|
325 |
+
"resume_from_checkpoint=None,\n",
|
326 |
+
"run_name=./checkpoint_dir,\n",
|
327 |
+
"save_on_each_node=False,\n",
|
328 |
+
"save_only_model=False,\n",
|
329 |
+
"save_safetensors=True,\n",
|
330 |
+
"save_steps=60,\n",
|
331 |
+
"save_strategy=steps,\n",
|
332 |
+
"save_total_limit=1,\n",
|
333 |
+
"seed=0,\n",
|
334 |
+
"skip_memory_metrics=True,\n",
|
335 |
+
"split_batches=None,\n",
|
336 |
+
"tf32=None,\n",
|
337 |
+
"torch_compile=False,\n",
|
338 |
+
"torch_compile_backend=None,\n",
|
339 |
+
"torch_compile_mode=None,\n",
|
340 |
+
"torch_empty_cache_steps=None,\n",
|
341 |
+
"torchdynamo=None,\n",
|
342 |
+
"tpu_metrics_debug=False,\n",
|
343 |
+
"tpu_num_cores=None,\n",
|
344 |
+
"use_cpu=False,\n",
|
345 |
+
"use_ipex=False,\n",
|
346 |
+
"use_legacy_prediction_loop=False,\n",
|
347 |
+
"use_mps_device=False,\n",
|
348 |
+
"warmup_ratio=0.2,\n",
|
349 |
+
"warmup_steps=0,\n",
|
350 |
+
"weight_decay=0.0,\n",
|
351 |
+
")\n",
|
352 |
+
"INFO:__main__:PEFT parameters LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules={'qkv_proj', 'o_proj'}, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))\n",
|
353 |
+
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
|
354 |
+
"The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
|
355 |
+
"To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
|
356 |
+
"You will be able to reuse this secret in all of your notebooks.\n",
|
357 |
+
"Please note that authentication is recommended but still optional to access public models or datasets.\n",
|
358 |
+
" warnings.warn(\n",
|
359 |
+
"[INFO|configuration_utils.py:733] 2024-10-10 15:16:39,682 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json\n",
|
360 |
+
"[INFO|configuration_utils.py:733] 2024-10-10 15:16:39,856 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json\n",
|
361 |
+
"[INFO|configuration_utils.py:800] 2024-10-10 15:16:39,858 >> Model config Phi3Config {\n",
|
362 |
+
" \"_name_or_path\": \"microsoft/phi-3-mini-4k-instruct\",\n",
|
363 |
+
" \"architectures\": [\n",
|
364 |
+
" \"Phi3ForCausalLM\"\n",
|
365 |
+
" ],\n",
|
366 |
+
" \"attention_bias\": false,\n",
|
367 |
+
" \"attention_dropout\": 0.0,\n",
|
368 |
+
" \"auto_map\": {\n",
|
369 |
+
" \"AutoConfig\": \"microsoft/phi-3-mini-4k-instruct--configuration_phi3.Phi3Config\",\n",
|
370 |
+
" \"AutoModelForCausalLM\": \"microsoft/phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM\"\n",
|
371 |
+
" },\n",
|
372 |
+
" \"bos_token_id\": 1,\n",
|
373 |
+
" \"embd_pdrop\": 0.0,\n",
|
374 |
+
" \"eos_token_id\": 32000,\n",
|
375 |
+
" \"hidden_act\": \"silu\",\n",
|
376 |
+
" \"hidden_size\": 3072,\n",
|
377 |
+
" \"initializer_range\": 0.02,\n",
|
378 |
+
" \"intermediate_size\": 8192,\n",
|
379 |
+
" \"max_position_embeddings\": 4096,\n",
|
380 |
+
" \"model_type\": \"phi3\",\n",
|
381 |
+
" \"num_attention_heads\": 32,\n",
|
382 |
+
" \"num_hidden_layers\": 32,\n",
|
383 |
+
" \"num_key_value_heads\": 32,\n",
|
384 |
+
" \"original_max_position_embeddings\": 4096,\n",
|
385 |
+
" \"pad_token_id\": 32000,\n",
|
386 |
+
" \"resid_pdrop\": 0.0,\n",
|
387 |
+
" \"rms_norm_eps\": 1e-05,\n",
|
388 |
+
" \"rope_scaling\": null,\n",
|
389 |
+
" \"rope_theta\": 10000.0,\n",
|
390 |
+
" \"sliding_window\": 2047,\n",
|
391 |
+
" \"tie_word_embeddings\": false,\n",
|
392 |
+
" \"torch_dtype\": \"bfloat16\",\n",
|
393 |
+
" \"transformers_version\": \"4.44.2\",\n",
|
394 |
+
" \"use_cache\": false,\n",
|
395 |
+
" \"vocab_size\": 32064\n",
|
396 |
+
"}\n",
|
397 |
+
"\n",
|
398 |
+
"WARNING:transformers_modules.microsoft.phi-3-mini-4k-instruct.0a67737cc96d2554230f90338b163bc6380a2a85.modeling_phi3:`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.\n",
|
399 |
+
"WARNING:transformers_modules.microsoft.phi-3-mini-4k-instruct.0a67737cc96d2554230f90338b163bc6380a2a85.modeling_phi3:Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.\n",
|
400 |
+
"[INFO|modeling_utils.py:3678] 2024-10-10 15:16:40,221 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/model.safetensors.index.json\n",
|
401 |
+
"[INFO|modeling_utils.py:1606] 2024-10-10 15:16:40,225 >> Instantiating Phi3ForCausalLM model under default dtype torch.bfloat16.\n",
|
402 |
+
"[INFO|configuration_utils.py:1038] 2024-10-10 15:16:40,228 >> Generate config GenerationConfig {\n",
|
403 |
+
" \"bos_token_id\": 1,\n",
|
404 |
+
" \"eos_token_id\": 32000,\n",
|
405 |
+
" \"pad_token_id\": 32000,\n",
|
406 |
+
" \"use_cache\": false\n",
|
407 |
+
"}\n",
|
408 |
+
"\n"
|
409 |
+
]
|
410 |
+
},
|
411 |
+
{
|
412 |
+
"output_type": "display_data",
|
413 |
+
"data": {
|
414 |
+
"text/plain": [
|
415 |
+
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
|
416 |
+
],
|
417 |
+
"application/vnd.jupyter.widget-view+json": {
|
418 |
+
"version_major": 2,
|
419 |
+
"version_minor": 0,
|
420 |
+
"model_id": "f8108b8c120d4f49ac2b0fa38d6213c3"
|
421 |
+
}
|
422 |
+
},
|
423 |
+
"metadata": {}
|
424 |
+
},
|
425 |
+
{
|
426 |
+
"output_type": "stream",
|
427 |
+
"name": "stderr",
|
428 |
+
"text": [
|
429 |
+
"[INFO|modeling_utils.py:4507] 2024-10-10 15:17:11,062 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.\n",
|
430 |
+
"\n",
|
431 |
+
"[INFO|modeling_utils.py:4515] 2024-10-10 15:17:11,070 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/phi-3-mini-4k-instruct.\n",
|
432 |
+
"If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.\n",
|
433 |
+
"[INFO|configuration_utils.py:993] 2024-10-10 15:17:11,251 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/generation_config.json\n",
|
434 |
+
"[INFO|configuration_utils.py:1038] 2024-10-10 15:17:11,253 >> Generate config GenerationConfig {\n",
|
435 |
+
" \"bos_token_id\": 1,\n",
|
436 |
+
" \"eos_token_id\": [\n",
|
437 |
+
" 32000,\n",
|
438 |
+
" 32001,\n",
|
439 |
+
" 32007\n",
|
440 |
+
" ],\n",
|
441 |
+
" \"pad_token_id\": 32000\n",
|
442 |
+
"}\n",
|
443 |
+
"\n",
|
444 |
+
"[INFO|tokenization_utils_base.py:2269] 2024-10-10 15:17:11,768 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/tokenizer.model\n",
|
445 |
+
"[INFO|tokenization_utils_base.py:2269] 2024-10-10 15:17:11,769 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/tokenizer.json\n",
|
446 |
+
"[INFO|tokenization_utils_base.py:2269] 2024-10-10 15:17:11,771 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/added_tokens.json\n",
|
447 |
+
"[INFO|tokenization_utils_base.py:2269] 2024-10-10 15:17:11,772 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/special_tokens_map.json\n",
|
448 |
+
"[INFO|tokenization_utils_base.py:2269] 2024-10-10 15:17:11,775 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/tokenizer_config.json\n",
|
449 |
+
"[INFO|tokenization_utils_base.py:2513] 2024-10-10 15:17:11,857 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
|
450 |
+
"/content/multi_model_phi_3/image_finetuning/finetuning/model.py:39: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
|
451 |
+
" self.projections.load_state_dict(torch.load(projection_path, map_location=device), strict=False)\n",
|
452 |
+
"Using custom data configuration default-559b28e319de0343\n",
|
453 |
+
"INFO:datasets.builder:Using custom data configuration default-559b28e319de0343\n",
|
454 |
+
"Loading Dataset Infos from /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/json\n",
|
455 |
+
"INFO:datasets.info:Loading Dataset Infos from /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/json\n",
|
456 |
+
"Overwrite dataset info from restored data version if exists.\n",
|
457 |
+
"INFO:datasets.builder:Overwrite dataset info from restored data version if exists.\n",
|
458 |
+
"Loading Dataset info from /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092\n",
|
459 |
+
"INFO:datasets.info:Loading Dataset info from /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092\n",
|
460 |
+
"Found cached dataset json (/root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)\n",
|
461 |
+
"INFO:datasets.builder:Found cached dataset json (/root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)\n",
|
462 |
+
"Loading Dataset info from /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092\n",
|
463 |
+
"INFO:datasets.info:Loading Dataset info from /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092\n",
|
464 |
+
"Process #0 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00000_of_00010.arrow\n",
|
465 |
+
"INFO:datasets.arrow_dataset:Process #0 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00000_of_00010.arrow\n",
|
466 |
+
"Process #1 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00001_of_00010.arrow\n",
|
467 |
+
"INFO:datasets.arrow_dataset:Process #1 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00001_of_00010.arrow\n",
|
468 |
+
"Process #2 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00002_of_00010.arrow\n",
|
469 |
+
"INFO:datasets.arrow_dataset:Process #2 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00002_of_00010.arrow\n",
|
470 |
+
"Process #3 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00003_of_00010.arrow\n",
|
471 |
+
"INFO:datasets.arrow_dataset:Process #3 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00003_of_00010.arrow\n",
|
472 |
+
"Process #4 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00004_of_00010.arrow\n",
|
473 |
+
"INFO:datasets.arrow_dataset:Process #4 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00004_of_00010.arrow\n",
|
474 |
+
"Process #5 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00005_of_00010.arrow\n",
|
475 |
+
"INFO:datasets.arrow_dataset:Process #5 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00005_of_00010.arrow\n",
|
476 |
+
"Process #6 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00006_of_00010.arrow\n",
|
477 |
+
"INFO:datasets.arrow_dataset:Process #6 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00006_of_00010.arrow\n",
|
478 |
+
"Process #7 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00007_of_00010.arrow\n",
|
479 |
+
"INFO:datasets.arrow_dataset:Process #7 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00007_of_00010.arrow\n",
|
480 |
+
"Process #8 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00008_of_00010.arrow\n",
|
481 |
+
"INFO:datasets.arrow_dataset:Process #8 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00008_of_00010.arrow\n",
|
482 |
+
"Process #9 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00009_of_00010.arrow\n",
|
483 |
+
"INFO:datasets.arrow_dataset:Process #9 will write at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_00009_of_00010.arrow\n",
|
484 |
+
"Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_*_of_00010.arrow\n",
|
485 |
+
"INFO:datasets.arrow_dataset:Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-559b28e319de0343/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-6111b26d09859c8f_*_of_00010.arrow\n",
|
486 |
+
"Concatenating 10 shards\n",
|
487 |
+
"INFO:datasets.arrow_dataset:Concatenating 10 shards\n",
|
488 |
+
"/content/multi_model_phi_3/image_finetuning/finetuning/dataset.py:10: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
|
489 |
+
" self.image_embeddings = torch.load('clip_embeddings.pt')\n",
|
490 |
+
"[WARNING|trainer.py:598] 2024-10-10 15:17:18,521 >> max_steps is given, it will override any value given in num_train_epochs\n",
|
491 |
+
"[INFO|trainer.py:648] 2024-10-10 15:17:18,522 >> Using auto half precision backend\n",
|
492 |
+
"[INFO|trainer.py:2134] 2024-10-10 15:17:19,563 >> ***** Running training *****\n",
|
493 |
+
"[INFO|trainer.py:2135] 2024-10-10 15:17:19,565 >> Num examples = 141,941\n",
|
494 |
+
"[INFO|trainer.py:2136] 2024-10-10 15:17:19,570 >> Num Epochs = 1\n",
|
495 |
+
"[INFO|trainer.py:2137] 2024-10-10 15:17:19,571 >> Instantaneous batch size per device = 4\n",
|
496 |
+
"[INFO|trainer.py:2140] 2024-10-10 15:17:19,573 >> Total train batch size (w. parallel, distributed & accumulation) = 4\n",
|
497 |
+
"[INFO|trainer.py:2141] 2024-10-10 15:17:19,574 >> Gradient Accumulation steps = 1\n",
|
498 |
+
"[INFO|trainer.py:2142] 2024-10-10 15:17:19,576 >> Total optimization steps = 60\n",
|
499 |
+
"[INFO|trainer.py:2143] 2024-10-10 15:17:19,580 >> Number of trainable parameters = 124,302,336\n",
|
500 |
+
"WARNING:transformers_modules.microsoft.phi-3-mini-4k-instruct.0a67737cc96d2554230f90338b163bc6380a2a85.modeling_phi3:You are not running the flash-attention implementation, expect numerical differences.\n",
|
501 |
+
"/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:1399: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.\n",
|
502 |
+
" with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context: # type: ignore[attr-defined]\n",
|
503 |
+
"[WARNING|modeling_utils.py:1264] 2024-10-10 15:17:29,290 >> Could not estimate the number of tokens of the input, floating-point operations will not be computed\n"
|
504 |
+
]
|
505 |
+
},
|
506 |
+
{
|
507 |
+
"output_type": "display_data",
|
508 |
+
"data": {
|
509 |
+
"text/plain": [
|
510 |
+
"<IPython.core.display.HTML object>"
|
511 |
+
],
|
512 |
+
"text/html": [
|
513 |
+
"\n",
|
514 |
+
" <div>\n",
|
515 |
+
" \n",
|
516 |
+
" <progress value='60' max='60' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
517 |
+
" [60/60 15:44, Epoch 0/1]\n",
|
518 |
+
" </div>\n",
|
519 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
520 |
+
" <thead>\n",
|
521 |
+
" <tr style=\"text-align: left;\">\n",
|
522 |
+
" <th>Step</th>\n",
|
523 |
+
" <th>Training Loss</th>\n",
|
524 |
+
" </tr>\n",
|
525 |
+
" </thead>\n",
|
526 |
+
" <tbody>\n",
|
527 |
+
" <tr>\n",
|
528 |
+
" <td>20</td>\n",
|
529 |
+
" <td>9.531900</td>\n",
|
530 |
+
" </tr>\n",
|
531 |
+
" <tr>\n",
|
532 |
+
" <td>40</td>\n",
|
533 |
+
" <td>10.267400</td>\n",
|
534 |
+
" </tr>\n",
|
535 |
+
" <tr>\n",
|
536 |
+
" <td>60</td>\n",
|
537 |
+
" <td>9.545700</td>\n",
|
538 |
+
" </tr>\n",
|
539 |
+
" </tbody>\n",
|
540 |
+
"</table><p>"
|
541 |
+
]
|
542 |
+
},
|
543 |
+
"metadata": {}
|
544 |
+
},
|
545 |
+
{
|
546 |
+
"output_type": "stream",
|
547 |
+
"name": "stderr",
|
548 |
+
"text": [
|
549 |
+
"[INFO|trainer.py:3503] 2024-10-10 15:32:51,845 >> Saving model checkpoint to ./checkpoint_dir/checkpoint-60\n",
|
550 |
+
"[INFO|configuration_utils.py:472] 2024-10-10 15:32:51,849 >> Configuration saved in ./checkpoint_dir/checkpoint-60/config.json\n",
|
551 |
+
"[INFO|modeling_utils.py:2799] 2024-10-10 15:33:11,817 >> Model weights saved in ./checkpoint_dir/checkpoint-60/model.safetensors\n",
|
552 |
+
"[INFO|tokenization_utils_base.py:2684] 2024-10-10 15:33:11,827 >> tokenizer config file saved in ./checkpoint_dir/checkpoint-60/tokenizer_config.json\n",
|
553 |
+
"[INFO|tokenization_utils_base.py:2693] 2024-10-10 15:33:11,830 >> Special tokens file saved in ./checkpoint_dir/checkpoint-60/special_tokens_map.json\n",
|
554 |
+
"[INFO|trainer.py:2394] 2024-10-10 15:33:13,911 >> \n",
|
555 |
+
"\n",
|
556 |
+
"Training completed. Do not forget to share your model on huggingface.co/models =)\n",
|
557 |
+
"\n",
|
558 |
+
"\n",
|
559 |
+
"[INFO|trainer.py:3819] 2024-10-10 15:33:13,928 >> \n",
|
560 |
+
"***** Running Evaluation *****\n",
|
561 |
+
"[INFO|trainer.py:3821] 2024-10-10 15:33:13,931 >> Num examples = 15771\n",
|
562 |
+
"[INFO|trainer.py:3824] 2024-10-10 15:33:13,933 >> Batch size = 4\n"
|
563 |
+
]
|
564 |
+
},
|
565 |
+
{
|
566 |
+
"output_type": "stream",
|
567 |
+
"name": "stdout",
|
568 |
+
"text": [
|
569 |
+
"***** train metrics *****\n",
|
570 |
+
" epoch = 0.0017\n",
|
571 |
+
" total_flos = 0GF\n",
|
572 |
+
" train_loss = 9.7817\n",
|
573 |
+
" train_runtime = 0:15:54.33\n",
|
574 |
+
" train_samples_per_second = 0.251\n",
|
575 |
+
" train_steps_per_second = 0.063\n"
|
576 |
+
]
|
577 |
+
},
|
578 |
+
{
|
579 |
+
"output_type": "display_data",
|
580 |
+
"data": {
|
581 |
+
"text/plain": [
|
582 |
+
"<IPython.core.display.HTML object>"
|
583 |
+
],
|
584 |
+
"text/html": [
|
585 |
+
"\n",
|
586 |
+
" <div>\n",
|
587 |
+
" \n",
|
588 |
+
" <progress value='1158' max='3943' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
589 |
+
" [1158/3943 1:48:54 < 4:22:10, 0.18 it/s]\n",
|
590 |
+
" </div>\n",
|
591 |
+
" "
|
592 |
+
]
|
593 |
+
},
|
594 |
+
"metadata": {}
|
595 |
+
}
|
596 |
+
],
|
597 |
+
"source": [
|
598 |
+
"%run finetune.py"
|
599 |
+
]
|
600 |
+
}
|
601 |
+
],
|
602 |
+
"metadata": {
|
603 |
+
"accelerator": "GPU",
|
604 |
+
"colab": {
|
605 |
+
"gpuType": "T4",
|
606 |
+
"provenance": []
|
607 |
+
},
|
608 |
+
"kernelspec": {
|
609 |
+
"display_name": "Python 3 (ipykernel)",
|
610 |
+
"language": "python",
|
611 |
+
"name": "python3"
|
612 |
+
},
|
613 |
+
"language_info": {
|
614 |
+
"codemirror_mode": {
|
615 |
+
"name": "ipython",
|
616 |
+
"version": 3
|
617 |
+
},
|
618 |
+
"file_extension": ".py",
|
619 |
+
"mimetype": "text/x-python",
|
620 |
+
"name": "python",
|
621 |
+
"nbconvert_exporter": "python",
|
622 |
+
"pygments_lexer": "ipython3",
|
623 |
+
"version": "3.10.13"
|
624 |
+
},
|
625 |
+
"widgets": {
|
626 |
+
"application/vnd.jupyter.widget-state+json": {
|
627 |
+
"f8108b8c120d4f49ac2b0fa38d6213c3": {
|
628 |
+
"model_module": "@jupyter-widgets/controls",
|
629 |
+
"model_name": "HBoxModel",
|
630 |
+
"model_module_version": "1.5.0",
|
631 |
+
"state": {
|
632 |
+
"_dom_classes": [],
|
633 |
+
"_model_module": "@jupyter-widgets/controls",
|
634 |
+
"_model_module_version": "1.5.0",
|
635 |
+
"_model_name": "HBoxModel",
|
636 |
+
"_view_count": null,
|
637 |
+
"_view_module": "@jupyter-widgets/controls",
|
638 |
+
"_view_module_version": "1.5.0",
|
639 |
+
"_view_name": "HBoxView",
|
640 |
+
"box_style": "",
|
641 |
+
"children": [
|
642 |
+
"IPY_MODEL_868fd8ed1fb5432ab6d6761b4e4ce17d",
|
643 |
+
"IPY_MODEL_f75b8c1cb15a4dc9871b28a286dd3b82",
|
644 |
+
"IPY_MODEL_a08098910db44feabc38e65bf4a55379"
|
645 |
+
],
|
646 |
+
"layout": "IPY_MODEL_13be5acd97ab44babec61cedcf5b2a3a"
|
647 |
+
}
|
648 |
+
},
|
649 |
+
"868fd8ed1fb5432ab6d6761b4e4ce17d": {
|
650 |
+
"model_module": "@jupyter-widgets/controls",
|
651 |
+
"model_name": "HTMLModel",
|
652 |
+
"model_module_version": "1.5.0",
|
653 |
+
"state": {
|
654 |
+
"_dom_classes": [],
|
655 |
+
"_model_module": "@jupyter-widgets/controls",
|
656 |
+
"_model_module_version": "1.5.0",
|
657 |
+
"_model_name": "HTMLModel",
|
658 |
+
"_view_count": null,
|
659 |
+
"_view_module": "@jupyter-widgets/controls",
|
660 |
+
"_view_module_version": "1.5.0",
|
661 |
+
"_view_name": "HTMLView",
|
662 |
+
"description": "",
|
663 |
+
"description_tooltip": null,
|
664 |
+
"layout": "IPY_MODEL_239ab0a871684811ae7a3e16daa8991a",
|
665 |
+
"placeholder": "",
|
666 |
+
"style": "IPY_MODEL_4ca7774f57454d74bd1b7c9445030038",
|
667 |
+
"value": "Loading checkpoint shards: 100%"
|
668 |
+
}
|
669 |
+
},
|
670 |
+
"f75b8c1cb15a4dc9871b28a286dd3b82": {
|
671 |
+
"model_module": "@jupyter-widgets/controls",
|
672 |
+
"model_name": "FloatProgressModel",
|
673 |
+
"model_module_version": "1.5.0",
|
674 |
+
"state": {
|
675 |
+
"_dom_classes": [],
|
676 |
+
"_model_module": "@jupyter-widgets/controls",
|
677 |
+
"_model_module_version": "1.5.0",
|
678 |
+
"_model_name": "FloatProgressModel",
|
679 |
+
"_view_count": null,
|
680 |
+
"_view_module": "@jupyter-widgets/controls",
|
681 |
+
"_view_module_version": "1.5.0",
|
682 |
+
"_view_name": "ProgressView",
|
683 |
+
"bar_style": "success",
|
684 |
+
"description": "",
|
685 |
+
"description_tooltip": null,
|
686 |
+
"layout": "IPY_MODEL_557ecfce51574b8db9236bdc8d0bd555",
|
687 |
+
"max": 2,
|
688 |
+
"min": 0,
|
689 |
+
"orientation": "horizontal",
|
690 |
+
"style": "IPY_MODEL_bc0e09ab397f42879b8c874eb10e6a2b",
|
691 |
+
"value": 2
|
692 |
+
}
|
693 |
+
},
|
694 |
+
"a08098910db44feabc38e65bf4a55379": {
|
695 |
+
"model_module": "@jupyter-widgets/controls",
|
696 |
+
"model_name": "HTMLModel",
|
697 |
+
"model_module_version": "1.5.0",
|
698 |
+
"state": {
|
699 |
+
"_dom_classes": [],
|
700 |
+
"_model_module": "@jupyter-widgets/controls",
|
701 |
+
"_model_module_version": "1.5.0",
|
702 |
+
"_model_name": "HTMLModel",
|
703 |
+
"_view_count": null,
|
704 |
+
"_view_module": "@jupyter-widgets/controls",
|
705 |
+
"_view_module_version": "1.5.0",
|
706 |
+
"_view_name": "HTMLView",
|
707 |
+
"description": "",
|
708 |
+
"description_tooltip": null,
|
709 |
+
"layout": "IPY_MODEL_8dec270eb8c649649f6b95ddde159a0f",
|
710 |
+
"placeholder": "",
|
711 |
+
"style": "IPY_MODEL_f9d65533a8fd4310b1466713c22d8255",
|
712 |
+
"value": " 2/2 [00:30<00:00, 14.75s/it]"
|
713 |
+
}
|
714 |
+
},
|
715 |
+
"13be5acd97ab44babec61cedcf5b2a3a": {
|
716 |
+
"model_module": "@jupyter-widgets/base",
|
717 |
+
"model_name": "LayoutModel",
|
718 |
+
"model_module_version": "1.2.0",
|
719 |
+
"state": {
|
720 |
+
"_model_module": "@jupyter-widgets/base",
|
721 |
+
"_model_module_version": "1.2.0",
|
722 |
+
"_model_name": "LayoutModel",
|
723 |
+
"_view_count": null,
|
724 |
+
"_view_module": "@jupyter-widgets/base",
|
725 |
+
"_view_module_version": "1.2.0",
|
726 |
+
"_view_name": "LayoutView",
|
727 |
+
"align_content": null,
|
728 |
+
"align_items": null,
|
729 |
+
"align_self": null,
|
730 |
+
"border": null,
|
731 |
+
"bottom": null,
|
732 |
+
"display": null,
|
733 |
+
"flex": null,
|
734 |
+
"flex_flow": null,
|
735 |
+
"grid_area": null,
|
736 |
+
"grid_auto_columns": null,
|
737 |
+
"grid_auto_flow": null,
|
738 |
+
"grid_auto_rows": null,
|
739 |
+
"grid_column": null,
|
740 |
+
"grid_gap": null,
|
741 |
+
"grid_row": null,
|
742 |
+
"grid_template_areas": null,
|
743 |
+
"grid_template_columns": null,
|
744 |
+
"grid_template_rows": null,
|
745 |
+
"height": null,
|
746 |
+
"justify_content": null,
|
747 |
+
"justify_items": null,
|
748 |
+
"left": null,
|
749 |
+
"margin": null,
|
750 |
+
"max_height": null,
|
751 |
+
"max_width": null,
|
752 |
+
"min_height": null,
|
753 |
+
"min_width": null,
|
754 |
+
"object_fit": null,
|
755 |
+
"object_position": null,
|
756 |
+
"order": null,
|
757 |
+
"overflow": null,
|
758 |
+
"overflow_x": null,
|
759 |
+
"overflow_y": null,
|
760 |
+
"padding": null,
|
761 |
+
"right": null,
|
762 |
+
"top": null,
|
763 |
+
"visibility": null,
|
764 |
+
"width": null
|
765 |
+
}
|
766 |
+
},
|
767 |
+
"239ab0a871684811ae7a3e16daa8991a": {
|
768 |
+
"model_module": "@jupyter-widgets/base",
|
769 |
+
"model_name": "LayoutModel",
|
770 |
+
"model_module_version": "1.2.0",
|
771 |
+
"state": {
|
772 |
+
"_model_module": "@jupyter-widgets/base",
|
773 |
+
"_model_module_version": "1.2.0",
|
774 |
+
"_model_name": "LayoutModel",
|
775 |
+
"_view_count": null,
|
776 |
+
"_view_module": "@jupyter-widgets/base",
|
777 |
+
"_view_module_version": "1.2.0",
|
778 |
+
"_view_name": "LayoutView",
|
779 |
+
"align_content": null,
|
780 |
+
"align_items": null,
|
781 |
+
"align_self": null,
|
782 |
+
"border": null,
|
783 |
+
"bottom": null,
|
784 |
+
"display": null,
|
785 |
+
"flex": null,
|
786 |
+
"flex_flow": null,
|
787 |
+
"grid_area": null,
|
788 |
+
"grid_auto_columns": null,
|
789 |
+
"grid_auto_flow": null,
|
790 |
+
"grid_auto_rows": null,
|
791 |
+
"grid_column": null,
|
792 |
+
"grid_gap": null,
|
793 |
+
"grid_row": null,
|
794 |
+
"grid_template_areas": null,
|
795 |
+
"grid_template_columns": null,
|
796 |
+
"grid_template_rows": null,
|
797 |
+
"height": null,
|
798 |
+
"justify_content": null,
|
799 |
+
"justify_items": null,
|
800 |
+
"left": null,
|
801 |
+
"margin": null,
|
802 |
+
"max_height": null,
|
803 |
+
"max_width": null,
|
804 |
+
"min_height": null,
|
805 |
+
"min_width": null,
|
806 |
+
"object_fit": null,
|
807 |
+
"object_position": null,
|
808 |
+
"order": null,
|
809 |
+
"overflow": null,
|
810 |
+
"overflow_x": null,
|
811 |
+
"overflow_y": null,
|
812 |
+
"padding": null,
|
813 |
+
"right": null,
|
814 |
+
"top": null,
|
815 |
+
"visibility": null,
|
816 |
+
"width": null
|
817 |
+
}
|
818 |
+
},
|
819 |
+
"4ca7774f57454d74bd1b7c9445030038": {
|
820 |
+
"model_module": "@jupyter-widgets/controls",
|
821 |
+
"model_name": "DescriptionStyleModel",
|
822 |
+
"model_module_version": "1.5.0",
|
823 |
+
"state": {
|
824 |
+
"_model_module": "@jupyter-widgets/controls",
|
825 |
+
"_model_module_version": "1.5.0",
|
826 |
+
"_model_name": "DescriptionStyleModel",
|
827 |
+
"_view_count": null,
|
828 |
+
"_view_module": "@jupyter-widgets/base",
|
829 |
+
"_view_module_version": "1.2.0",
|
830 |
+
"_view_name": "StyleView",
|
831 |
+
"description_width": ""
|
832 |
+
}
|
833 |
+
},
|
834 |
+
"557ecfce51574b8db9236bdc8d0bd555": {
|
835 |
+
"model_module": "@jupyter-widgets/base",
|
836 |
+
"model_name": "LayoutModel",
|
837 |
+
"model_module_version": "1.2.0",
|
838 |
+
"state": {
|
839 |
+
"_model_module": "@jupyter-widgets/base",
|
840 |
+
"_model_module_version": "1.2.0",
|
841 |
+
"_model_name": "LayoutModel",
|
842 |
+
"_view_count": null,
|
843 |
+
"_view_module": "@jupyter-widgets/base",
|
844 |
+
"_view_module_version": "1.2.0",
|
845 |
+
"_view_name": "LayoutView",
|
846 |
+
"align_content": null,
|
847 |
+
"align_items": null,
|
848 |
+
"align_self": null,
|
849 |
+
"border": null,
|
850 |
+
"bottom": null,
|
851 |
+
"display": null,
|
852 |
+
"flex": null,
|
853 |
+
"flex_flow": null,
|
854 |
+
"grid_area": null,
|
855 |
+
"grid_auto_columns": null,
|
856 |
+
"grid_auto_flow": null,
|
857 |
+
"grid_auto_rows": null,
|
858 |
+
"grid_column": null,
|
859 |
+
"grid_gap": null,
|
860 |
+
"grid_row": null,
|
861 |
+
"grid_template_areas": null,
|
862 |
+
"grid_template_columns": null,
|
863 |
+
"grid_template_rows": null,
|
864 |
+
"height": null,
|
865 |
+
"justify_content": null,
|
866 |
+
"justify_items": null,
|
867 |
+
"left": null,
|
868 |
+
"margin": null,
|
869 |
+
"max_height": null,
|
870 |
+
"max_width": null,
|
871 |
+
"min_height": null,
|
872 |
+
"min_width": null,
|
873 |
+
"object_fit": null,
|
874 |
+
"object_position": null,
|
875 |
+
"order": null,
|
876 |
+
"overflow": null,
|
877 |
+
"overflow_x": null,
|
878 |
+
"overflow_y": null,
|
879 |
+
"padding": null,
|
880 |
+
"right": null,
|
881 |
+
"top": null,
|
882 |
+
"visibility": null,
|
883 |
+
"width": null
|
884 |
+
}
|
885 |
+
},
|
886 |
+
"bc0e09ab397f42879b8c874eb10e6a2b": {
|
887 |
+
"model_module": "@jupyter-widgets/controls",
|
888 |
+
"model_name": "ProgressStyleModel",
|
889 |
+
"model_module_version": "1.5.0",
|
890 |
+
"state": {
|
891 |
+
"_model_module": "@jupyter-widgets/controls",
|
892 |
+
"_model_module_version": "1.5.0",
|
893 |
+
"_model_name": "ProgressStyleModel",
|
894 |
+
"_view_count": null,
|
895 |
+
"_view_module": "@jupyter-widgets/base",
|
896 |
+
"_view_module_version": "1.2.0",
|
897 |
+
"_view_name": "StyleView",
|
898 |
+
"bar_color": null,
|
899 |
+
"description_width": ""
|
900 |
+
}
|
901 |
+
},
|
902 |
+
"8dec270eb8c649649f6b95ddde159a0f": {
|
903 |
+
"model_module": "@jupyter-widgets/base",
|
904 |
+
"model_name": "LayoutModel",
|
905 |
+
"model_module_version": "1.2.0",
|
906 |
+
"state": {
|
907 |
+
"_model_module": "@jupyter-widgets/base",
|
908 |
+
"_model_module_version": "1.2.0",
|
909 |
+
"_model_name": "LayoutModel",
|
910 |
+
"_view_count": null,
|
911 |
+
"_view_module": "@jupyter-widgets/base",
|
912 |
+
"_view_module_version": "1.2.0",
|
913 |
+
"_view_name": "LayoutView",
|
914 |
+
"align_content": null,
|
915 |
+
"align_items": null,
|
916 |
+
"align_self": null,
|
917 |
+
"border": null,
|
918 |
+
"bottom": null,
|
919 |
+
"display": null,
|
920 |
+
"flex": null,
|
921 |
+
"flex_flow": null,
|
922 |
+
"grid_area": null,
|
923 |
+
"grid_auto_columns": null,
|
924 |
+
"grid_auto_flow": null,
|
925 |
+
"grid_auto_rows": null,
|
926 |
+
"grid_column": null,
|
927 |
+
"grid_gap": null,
|
928 |
+
"grid_row": null,
|
929 |
+
"grid_template_areas": null,
|
930 |
+
"grid_template_columns": null,
|
931 |
+
"grid_template_rows": null,
|
932 |
+
"height": null,
|
933 |
+
"justify_content": null,
|
934 |
+
"justify_items": null,
|
935 |
+
"left": null,
|
936 |
+
"margin": null,
|
937 |
+
"max_height": null,
|
938 |
+
"max_width": null,
|
939 |
+
"min_height": null,
|
940 |
+
"min_width": null,
|
941 |
+
"object_fit": null,
|
942 |
+
"object_position": null,
|
943 |
+
"order": null,
|
944 |
+
"overflow": null,
|
945 |
+
"overflow_x": null,
|
946 |
+
"overflow_y": null,
|
947 |
+
"padding": null,
|
948 |
+
"right": null,
|
949 |
+
"top": null,
|
950 |
+
"visibility": null,
|
951 |
+
"width": null
|
952 |
+
}
|
953 |
+
},
|
954 |
+
"f9d65533a8fd4310b1466713c22d8255": {
|
955 |
+
"model_module": "@jupyter-widgets/controls",
|
956 |
+
"model_name": "DescriptionStyleModel",
|
957 |
+
"model_module_version": "1.5.0",
|
958 |
+
"state": {
|
959 |
+
"_model_module": "@jupyter-widgets/controls",
|
960 |
+
"_model_module_version": "1.5.0",
|
961 |
+
"_model_name": "DescriptionStyleModel",
|
962 |
+
"_view_count": null,
|
963 |
+
"_view_module": "@jupyter-widgets/base",
|
964 |
+
"_view_module_version": "1.2.0",
|
965 |
+
"_view_name": "StyleView",
|
966 |
+
"description_width": ""
|
967 |
+
}
|
968 |
+
}
|
969 |
+
}
|
970 |
+
}
|
971 |
+
},
|
972 |
+
"nbformat": 4,
|
973 |
+
"nbformat_minor": 0
|
974 |
+
}
|
checkpoint_dir/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: microsoft/phi-3-mini-4k-instruct
|
3 |
+
library_name: peft
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.12.0
|
checkpoint_dir/adapter_config.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "microsoft/phi-3-mini-4k-instruct",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 32,
|
14 |
+
"lora_dropout": 0.05,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 16,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"o_proj",
|
24 |
+
"qkv_proj"
|
25 |
+
],
|
26 |
+
"task_type": "CAUSAL_LM",
|
27 |
+
"use_dora": false,
|
28 |
+
"use_rslora": false
|
29 |
+
}
|
checkpoint_dir/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:72ecb3d5d3e593ecf340dd7c5904c74c644cd04902f6705454c0f5223a399ddf
|
3 |
+
size 37766064
|
checkpoint_dir/all_results.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 0.6762849413886384,
|
3 |
+
"eval_loss": 7.323873043060303,
|
4 |
+
"eval_runtime": 386.842,
|
5 |
+
"eval_samples": 3154,
|
6 |
+
"eval_samples_per_second": 8.153,
|
7 |
+
"eval_steps_per_second": 0.512,
|
8 |
+
"total_flos": 0.0,
|
9 |
+
"train_loss": 2.13714133199056,
|
10 |
+
"train_runtime": 37217.3738,
|
11 |
+
"train_samples_per_second": 2.579,
|
12 |
+
"train_steps_per_second": 0.161
|
13 |
+
}
|
checkpoint_dir/checkpoint-6000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:52b6c0d5af1fb0efa5ee1862caaee15f6ba008a7c7ba60d9d841995e490ff27f
|
3 |
+
size 535089466
|
checkpoint_dir/checkpoint-6000/phi_model/README.md
ADDED
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: microsoft/phi-3-mini-4k-instruct
|
3 |
+
library_name: peft
|
4 |
+
---
|
5 |
+
|
6 |
+
# Model Card for Model ID
|
7 |
+
|
8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
## Model Details
|
13 |
+
|
14 |
+
### Model Description
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
- **Developed by:** [More Information Needed]
|
21 |
+
- **Funded by [optional]:** [More Information Needed]
|
22 |
+
- **Shared by [optional]:** [More Information Needed]
|
23 |
+
- **Model type:** [More Information Needed]
|
24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
25 |
+
- **License:** [More Information Needed]
|
26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
27 |
+
|
28 |
+
### Model Sources [optional]
|
29 |
+
|
30 |
+
<!-- Provide the basic links for the model. -->
|
31 |
+
|
32 |
+
- **Repository:** [More Information Needed]
|
33 |
+
- **Paper [optional]:** [More Information Needed]
|
34 |
+
- **Demo [optional]:** [More Information Needed]
|
35 |
+
|
36 |
+
## Uses
|
37 |
+
|
38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
39 |
+
|
40 |
+
### Direct Use
|
41 |
+
|
42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
43 |
+
|
44 |
+
[More Information Needed]
|
45 |
+
|
46 |
+
### Downstream Use [optional]
|
47 |
+
|
48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
49 |
+
|
50 |
+
[More Information Needed]
|
51 |
+
|
52 |
+
### Out-of-Scope Use
|
53 |
+
|
54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
55 |
+
|
56 |
+
[More Information Needed]
|
57 |
+
|
58 |
+
## Bias, Risks, and Limitations
|
59 |
+
|
60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
61 |
+
|
62 |
+
[More Information Needed]
|
63 |
+
|
64 |
+
### Recommendations
|
65 |
+
|
66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
67 |
+
|
68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
69 |
+
|
70 |
+
## How to Get Started with the Model
|
71 |
+
|
72 |
+
Use the code below to get started with the model.
|
73 |
+
|
74 |
+
[More Information Needed]
|
75 |
+
|
76 |
+
## Training Details
|
77 |
+
|
78 |
+
### Training Data
|
79 |
+
|
80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
81 |
+
|
82 |
+
[More Information Needed]
|
83 |
+
|
84 |
+
### Training Procedure
|
85 |
+
|
86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
87 |
+
|
88 |
+
#### Preprocessing [optional]
|
89 |
+
|
90 |
+
[More Information Needed]
|
91 |
+
|
92 |
+
|
93 |
+
#### Training Hyperparameters
|
94 |
+
|
95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
96 |
+
|
97 |
+
#### Speeds, Sizes, Times [optional]
|
98 |
+
|
99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
100 |
+
|
101 |
+
[More Information Needed]
|
102 |
+
|
103 |
+
## Evaluation
|
104 |
+
|
105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
106 |
+
|
107 |
+
### Testing Data, Factors & Metrics
|
108 |
+
|
109 |
+
#### Testing Data
|
110 |
+
|
111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
112 |
+
|
113 |
+
[More Information Needed]
|
114 |
+
|
115 |
+
#### Factors
|
116 |
+
|
117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
118 |
+
|
119 |
+
[More Information Needed]
|
120 |
+
|
121 |
+
#### Metrics
|
122 |
+
|
123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
124 |
+
|
125 |
+
[More Information Needed]
|
126 |
+
|
127 |
+
### Results
|
128 |
+
|
129 |
+
[More Information Needed]
|
130 |
+
|
131 |
+
#### Summary
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
## Model Examination [optional]
|
136 |
+
|
137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
138 |
+
|
139 |
+
[More Information Needed]
|
140 |
+
|
141 |
+
## Environmental Impact
|
142 |
+
|
143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
144 |
+
|
145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
146 |
+
|
147 |
+
- **Hardware Type:** [More Information Needed]
|
148 |
+
- **Hours used:** [More Information Needed]
|
149 |
+
- **Cloud Provider:** [More Information Needed]
|
150 |
+
- **Compute Region:** [More Information Needed]
|
151 |
+
- **Carbon Emitted:** [More Information Needed]
|
152 |
+
|
153 |
+
## Technical Specifications [optional]
|
154 |
+
|
155 |
+
### Model Architecture and Objective
|
156 |
+
|
157 |
+
[More Information Needed]
|
158 |
+
|
159 |
+
### Compute Infrastructure
|
160 |
+
|
161 |
+
[More Information Needed]
|
162 |
+
|
163 |
+
#### Hardware
|
164 |
+
|
165 |
+
[More Information Needed]
|
166 |
+
|
167 |
+
#### Software
|
168 |
+
|
169 |
+
[More Information Needed]
|
170 |
+
|
171 |
+
## Citation [optional]
|
172 |
+
|
173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
174 |
+
|
175 |
+
**BibTeX:**
|
176 |
+
|
177 |
+
[More Information Needed]
|
178 |
+
|
179 |
+
**APA:**
|
180 |
+
|
181 |
+
[More Information Needed]
|
182 |
+
|
183 |
+
## Glossary [optional]
|
184 |
+
|
185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
186 |
+
|
187 |
+
[More Information Needed]
|
188 |
+
|
189 |
+
## More Information [optional]
|
190 |
+
|
191 |
+
[More Information Needed]
|
192 |
+
|
193 |
+
## Model Card Authors [optional]
|
194 |
+
|
195 |
+
[More Information Needed]
|
196 |
+
|
197 |
+
## Model Card Contact
|
198 |
+
|
199 |
+
[More Information Needed]
|
200 |
+
### Framework versions
|
201 |
+
|
202 |
+
- PEFT 0.12.0
|
checkpoint_dir/checkpoint-6000/phi_model/adapter_config.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"alpha_pattern": {},
|
3 |
+
"auto_mapping": null,
|
4 |
+
"base_model_name_or_path": "microsoft/phi-3-mini-4k-instruct",
|
5 |
+
"bias": "none",
|
6 |
+
"fan_in_fan_out": false,
|
7 |
+
"inference_mode": true,
|
8 |
+
"init_lora_weights": true,
|
9 |
+
"layer_replication": null,
|
10 |
+
"layers_pattern": null,
|
11 |
+
"layers_to_transform": null,
|
12 |
+
"loftq_config": {},
|
13 |
+
"lora_alpha": 32,
|
14 |
+
"lora_dropout": 0.05,
|
15 |
+
"megatron_config": null,
|
16 |
+
"megatron_core": "megatron.core",
|
17 |
+
"modules_to_save": null,
|
18 |
+
"peft_type": "LORA",
|
19 |
+
"r": 16,
|
20 |
+
"rank_pattern": {},
|
21 |
+
"revision": null,
|
22 |
+
"target_modules": [
|
23 |
+
"o_proj",
|
24 |
+
"qkv_proj"
|
25 |
+
],
|
26 |
+
"task_type": "CAUSAL_LM",
|
27 |
+
"use_dora": false,
|
28 |
+
"use_rslora": false
|
29 |
+
}
|
checkpoint_dir/checkpoint-6000/phi_model/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:72ecb3d5d3e593ecf340dd7c5904c74c644cd04902f6705454c0f5223a399ddf
|
3 |
+
size 37766064
|
checkpoint_dir/checkpoint-6000/projection_layer/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16cce5f1f2b77da68f1e881a3cd1bff784aa48930991b1f0b52a3b81cc6f2923
|
3 |
+
size 229740738
|
checkpoint_dir/checkpoint-6000/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:50759f462f2cfd4149cdba36c5f0d942c8659cc7e9f4c6a09f5d75e2b1f5e160
|
3 |
+
size 14180
|
checkpoint_dir/checkpoint-6000/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1fca623b299b046cffde6dd6fc538bf592ad9cba4a26768597da5e62b84c1662
|
3 |
+
size 1064
|
checkpoint_dir/checkpoint-6000/trainer_state.json
ADDED
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.6762849413886384,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 6000,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.01127141568981064,
|
13 |
+
"grad_norm": 36.142295837402344,
|
14 |
+
"learning_rate": 4.1666666666666667e-07,
|
15 |
+
"loss": 12.6438,
|
16 |
+
"step": 100
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.02254283137962128,
|
20 |
+
"grad_norm": 30.892282485961914,
|
21 |
+
"learning_rate": 8.333333333333333e-07,
|
22 |
+
"loss": 12.4413,
|
23 |
+
"step": 200
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.03381424706943192,
|
27 |
+
"grad_norm": 45.94392395019531,
|
28 |
+
"learning_rate": 1.25e-06,
|
29 |
+
"loss": 11.4952,
|
30 |
+
"step": 300
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.04508566275924256,
|
34 |
+
"grad_norm": 19.69997787475586,
|
35 |
+
"learning_rate": 1.6666666666666667e-06,
|
36 |
+
"loss": 7.0421,
|
37 |
+
"step": 400
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.0563570784490532,
|
41 |
+
"grad_norm": 3.0256764888763428,
|
42 |
+
"learning_rate": 2.0833333333333334e-06,
|
43 |
+
"loss": 3.1071,
|
44 |
+
"step": 500
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.06762849413886383,
|
48 |
+
"grad_norm": 1.9536222219467163,
|
49 |
+
"learning_rate": 2.5e-06,
|
50 |
+
"loss": 2.7468,
|
51 |
+
"step": 600
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.07889990982867448,
|
55 |
+
"grad_norm": 3.0367178916931152,
|
56 |
+
"learning_rate": 2.916666666666667e-06,
|
57 |
+
"loss": 2.5461,
|
58 |
+
"step": 700
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.09017132551848513,
|
62 |
+
"grad_norm": 1.1410305500030518,
|
63 |
+
"learning_rate": 3.3333333333333333e-06,
|
64 |
+
"loss": 2.3206,
|
65 |
+
"step": 800
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.10144274120829576,
|
69 |
+
"grad_norm": 1.2108758687973022,
|
70 |
+
"learning_rate": 3.7500000000000005e-06,
|
71 |
+
"loss": 2.036,
|
72 |
+
"step": 900
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.1127141568981064,
|
76 |
+
"grad_norm": 1.0124415159225464,
|
77 |
+
"learning_rate": 4.166666666666667e-06,
|
78 |
+
"loss": 1.9927,
|
79 |
+
"step": 1000
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.12398557258791704,
|
83 |
+
"grad_norm": 1.2103397846221924,
|
84 |
+
"learning_rate": 4.583333333333333e-06,
|
85 |
+
"loss": 1.8255,
|
86 |
+
"step": 1100
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.13525698827772767,
|
90 |
+
"grad_norm": 1.2905486822128296,
|
91 |
+
"learning_rate": 5e-06,
|
92 |
+
"loss": 1.6822,
|
93 |
+
"step": 1200
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.14652840396753833,
|
97 |
+
"grad_norm": 0.9415493607521057,
|
98 |
+
"learning_rate": 4.994647308096509e-06,
|
99 |
+
"loss": 1.6783,
|
100 |
+
"step": 1300
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.15779981965734896,
|
104 |
+
"grad_norm": 0.9576979279518127,
|
105 |
+
"learning_rate": 4.978612153434527e-06,
|
106 |
+
"loss": 1.5967,
|
107 |
+
"step": 1400
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.1690712353471596,
|
111 |
+
"grad_norm": 1.068982481956482,
|
112 |
+
"learning_rate": 4.9519632010080765e-06,
|
113 |
+
"loss": 1.5821,
|
114 |
+
"step": 1500
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.18034265103697025,
|
118 |
+
"grad_norm": 0.7473943829536438,
|
119 |
+
"learning_rate": 4.914814565722671e-06,
|
120 |
+
"loss": 1.5587,
|
121 |
+
"step": 1600
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.19161406672678089,
|
125 |
+
"grad_norm": 0.9740411043167114,
|
126 |
+
"learning_rate": 4.867325323737765e-06,
|
127 |
+
"loss": 1.525,
|
128 |
+
"step": 1700
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.20288548241659152,
|
132 |
+
"grad_norm": 0.6997997164726257,
|
133 |
+
"learning_rate": 4.809698831278217e-06,
|
134 |
+
"loss": 1.49,
|
135 |
+
"step": 1800
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.21415689810640218,
|
139 |
+
"grad_norm": 0.6754641532897949,
|
140 |
+
"learning_rate": 4.742181853831721e-06,
|
141 |
+
"loss": 1.4597,
|
142 |
+
"step": 1900
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.2254283137962128,
|
146 |
+
"grad_norm": 1.0468783378601074,
|
147 |
+
"learning_rate": 4.665063509461098e-06,
|
148 |
+
"loss": 1.4539,
|
149 |
+
"step": 2000
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.23669972948602344,
|
153 |
+
"grad_norm": 0.9208471179008484,
|
154 |
+
"learning_rate": 4.578674030756364e-06,
|
155 |
+
"loss": 1.4459,
|
156 |
+
"step": 2100
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.24797114517583407,
|
160 |
+
"grad_norm": 0.9443785548210144,
|
161 |
+
"learning_rate": 4.4833833507280884e-06,
|
162 |
+
"loss": 1.4437,
|
163 |
+
"step": 2200
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.25924256086564473,
|
167 |
+
"grad_norm": 1.0956292152404785,
|
168 |
+
"learning_rate": 4.379599518697444e-06,
|
169 |
+
"loss": 1.417,
|
170 |
+
"step": 2300
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.27051397655545534,
|
174 |
+
"grad_norm": 0.8564383387565613,
|
175 |
+
"learning_rate": 4.267766952966369e-06,
|
176 |
+
"loss": 1.4138,
|
177 |
+
"step": 2400
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.281785392245266,
|
181 |
+
"grad_norm": 0.8340147733688354,
|
182 |
+
"learning_rate": 4.1483645377501726e-06,
|
183 |
+
"loss": 1.3637,
|
184 |
+
"step": 2500
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.29305680793507666,
|
188 |
+
"grad_norm": 1.1379494667053223,
|
189 |
+
"learning_rate": 4.021903572521802e-06,
|
190 |
+
"loss": 1.4079,
|
191 |
+
"step": 2600
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.30432822362488726,
|
195 |
+
"grad_norm": 0.8148013353347778,
|
196 |
+
"learning_rate": 3.888925582549006e-06,
|
197 |
+
"loss": 1.3729,
|
198 |
+
"step": 2700
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.3155996393146979,
|
202 |
+
"grad_norm": 0.9854215383529663,
|
203 |
+
"learning_rate": 3.7500000000000005e-06,
|
204 |
+
"loss": 1.3824,
|
205 |
+
"step": 2800
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.3268710550045086,
|
209 |
+
"grad_norm": 0.9190597534179688,
|
210 |
+
"learning_rate": 3.6057217255475034e-06,
|
211 |
+
"loss": 1.3879,
|
212 |
+
"step": 2900
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.3381424706943192,
|
216 |
+
"grad_norm": 1.1362121105194092,
|
217 |
+
"learning_rate": 3.4567085809127247e-06,
|
218 |
+
"loss": 1.3939,
|
219 |
+
"step": 3000
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 0.34941388638412985,
|
223 |
+
"grad_norm": 1.2068192958831787,
|
224 |
+
"learning_rate": 3.303598663257904e-06,
|
225 |
+
"loss": 1.3463,
|
226 |
+
"step": 3100
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 0.3606853020739405,
|
230 |
+
"grad_norm": 0.952643632888794,
|
231 |
+
"learning_rate": 3.147047612756302e-06,
|
232 |
+
"loss": 1.3741,
|
233 |
+
"step": 3200
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 0.3719567177637511,
|
237 |
+
"grad_norm": 0.8026754260063171,
|
238 |
+
"learning_rate": 2.9877258050403214e-06,
|
239 |
+
"loss": 1.3704,
|
240 |
+
"step": 3300
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"epoch": 0.38322813345356177,
|
244 |
+
"grad_norm": 0.8540117144584656,
|
245 |
+
"learning_rate": 2.82631548055013e-06,
|
246 |
+
"loss": 1.3904,
|
247 |
+
"step": 3400
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"epoch": 0.39449954914337243,
|
251 |
+
"grad_norm": 0.9906865954399109,
|
252 |
+
"learning_rate": 2.663507823075358e-06,
|
253 |
+
"loss": 1.3523,
|
254 |
+
"step": 3500
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"epoch": 0.40577096483318303,
|
258 |
+
"grad_norm": 0.8289706707000732,
|
259 |
+
"learning_rate": 2.5e-06,
|
260 |
+
"loss": 1.3519,
|
261 |
+
"step": 3600
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"epoch": 0.4170423805229937,
|
265 |
+
"grad_norm": 1.0827723741531372,
|
266 |
+
"learning_rate": 2.3364921769246423e-06,
|
267 |
+
"loss": 1.3475,
|
268 |
+
"step": 3700
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"epoch": 0.42831379621280435,
|
272 |
+
"grad_norm": 1.220688819885254,
|
273 |
+
"learning_rate": 2.173684519449872e-06,
|
274 |
+
"loss": 1.3243,
|
275 |
+
"step": 3800
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"epoch": 0.43958521190261496,
|
279 |
+
"grad_norm": 1.0109795331954956,
|
280 |
+
"learning_rate": 2.01227419495968e-06,
|
281 |
+
"loss": 1.3444,
|
282 |
+
"step": 3900
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"epoch": 0.4508566275924256,
|
286 |
+
"grad_norm": 1.041104793548584,
|
287 |
+
"learning_rate": 1.852952387243698e-06,
|
288 |
+
"loss": 1.3436,
|
289 |
+
"step": 4000
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"epoch": 0.4621280432822362,
|
293 |
+
"grad_norm": 0.7376370429992676,
|
294 |
+
"learning_rate": 1.6964013367420967e-06,
|
295 |
+
"loss": 1.3002,
|
296 |
+
"step": 4100
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"epoch": 0.4733994589720469,
|
300 |
+
"grad_norm": 0.8842127919197083,
|
301 |
+
"learning_rate": 1.5432914190872757e-06,
|
302 |
+
"loss": 1.3454,
|
303 |
+
"step": 4200
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"epoch": 0.48467087466185754,
|
307 |
+
"grad_norm": 1.0636272430419922,
|
308 |
+
"learning_rate": 1.3942782744524974e-06,
|
309 |
+
"loss": 1.3396,
|
310 |
+
"step": 4300
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"epoch": 0.49594229035166815,
|
314 |
+
"grad_norm": 1.2041317224502563,
|
315 |
+
"learning_rate": 1.2500000000000007e-06,
|
316 |
+
"loss": 1.335,
|
317 |
+
"step": 4400
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"epoch": 0.5072137060414789,
|
321 |
+
"grad_norm": 0.9379550218582153,
|
322 |
+
"learning_rate": 1.1110744174509952e-06,
|
323 |
+
"loss": 1.3213,
|
324 |
+
"step": 4500
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"epoch": 0.5184851217312895,
|
328 |
+
"grad_norm": 0.7874147891998291,
|
329 |
+
"learning_rate": 9.780964274781984e-07,
|
330 |
+
"loss": 1.3198,
|
331 |
+
"step": 4600
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"epoch": 0.5297565374211001,
|
335 |
+
"grad_norm": 0.7258532047271729,
|
336 |
+
"learning_rate": 8.516354622498279e-07,
|
337 |
+
"loss": 1.2681,
|
338 |
+
"step": 4700
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"epoch": 0.5410279531109107,
|
342 |
+
"grad_norm": 1.1035155057907104,
|
343 |
+
"learning_rate": 7.322330470336314e-07,
|
344 |
+
"loss": 1.3219,
|
345 |
+
"step": 4800
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"epoch": 0.5522993688007214,
|
349 |
+
"grad_norm": 0.7815728187561035,
|
350 |
+
"learning_rate": 6.204004813025569e-07,
|
351 |
+
"loss": 1.3381,
|
352 |
+
"step": 4900
|
353 |
+
},
|
354 |
+
{
|
355 |
+
"epoch": 0.563570784490532,
|
356 |
+
"grad_norm": 1.0812482833862305,
|
357 |
+
"learning_rate": 5.166166492719124e-07,
|
358 |
+
"loss": 1.3322,
|
359 |
+
"step": 5000
|
360 |
+
},
|
361 |
+
{
|
362 |
+
"epoch": 0.5748422001803426,
|
363 |
+
"grad_norm": 0.9642081260681152,
|
364 |
+
"learning_rate": 4.2132596924363666e-07,
|
365 |
+
"loss": 1.3218,
|
366 |
+
"step": 5100
|
367 |
+
},
|
368 |
+
{
|
369 |
+
"epoch": 0.5861136158701533,
|
370 |
+
"grad_norm": 0.8039354085922241,
|
371 |
+
"learning_rate": 3.3493649053890325e-07,
|
372 |
+
"loss": 1.3216,
|
373 |
+
"step": 5200
|
374 |
+
},
|
375 |
+
{
|
376 |
+
"epoch": 0.5973850315599639,
|
377 |
+
"grad_norm": 0.8643052577972412,
|
378 |
+
"learning_rate": 2.5781814616827936e-07,
|
379 |
+
"loss": 1.3333,
|
380 |
+
"step": 5300
|
381 |
+
},
|
382 |
+
{
|
383 |
+
"epoch": 0.6086564472497745,
|
384 |
+
"grad_norm": 1.4110788106918335,
|
385 |
+
"learning_rate": 1.9030116872178317e-07,
|
386 |
+
"loss": 1.3412,
|
387 |
+
"step": 5400
|
388 |
+
},
|
389 |
+
{
|
390 |
+
"epoch": 0.6199278629395852,
|
391 |
+
"grad_norm": 0.7792391180992126,
|
392 |
+
"learning_rate": 1.3267467626223606e-07,
|
393 |
+
"loss": 1.3069,
|
394 |
+
"step": 5500
|
395 |
+
},
|
396 |
+
{
|
397 |
+
"epoch": 0.6311992786293958,
|
398 |
+
"grad_norm": 1.0475589036941528,
|
399 |
+
"learning_rate": 8.518543427732951e-08,
|
400 |
+
"loss": 1.3187,
|
401 |
+
"step": 5600
|
402 |
+
},
|
403 |
+
{
|
404 |
+
"epoch": 0.6424706943192064,
|
405 |
+
"grad_norm": 0.9902795553207397,
|
406 |
+
"learning_rate": 4.8036798991923925e-08,
|
407 |
+
"loss": 1.328,
|
408 |
+
"step": 5700
|
409 |
+
},
|
410 |
+
{
|
411 |
+
"epoch": 0.6537421100090172,
|
412 |
+
"grad_norm": 0.7977623343467712,
|
413 |
+
"learning_rate": 2.1387846565474047e-08,
|
414 |
+
"loss": 1.3175,
|
415 |
+
"step": 5800
|
416 |
+
},
|
417 |
+
{
|
418 |
+
"epoch": 0.6650135256988278,
|
419 |
+
"grad_norm": 0.872138261795044,
|
420 |
+
"learning_rate": 5.352691903491303e-09,
|
421 |
+
"loss": 1.3116,
|
422 |
+
"step": 5900
|
423 |
+
},
|
424 |
+
{
|
425 |
+
"epoch": 0.6762849413886384,
|
426 |
+
"grad_norm": 0.8640491366386414,
|
427 |
+
"learning_rate": 0.0,
|
428 |
+
"loss": 1.3081,
|
429 |
+
"step": 6000
|
430 |
+
}
|
431 |
+
],
|
432 |
+
"logging_steps": 100,
|
433 |
+
"max_steps": 6000,
|
434 |
+
"num_input_tokens_seen": 0,
|
435 |
+
"num_train_epochs": 1,
|
436 |
+
"save_steps": 100,
|
437 |
+
"stateful_callbacks": {
|
438 |
+
"TrainerControl": {
|
439 |
+
"args": {
|
440 |
+
"should_epoch_stop": false,
|
441 |
+
"should_evaluate": false,
|
442 |
+
"should_log": false,
|
443 |
+
"should_save": true,
|
444 |
+
"should_training_stop": true
|
445 |
+
},
|
446 |
+
"attributes": {}
|
447 |
+
}
|
448 |
+
},
|
449 |
+
"total_flos": 0.0,
|
450 |
+
"train_batch_size": 16,
|
451 |
+
"trial_name": null,
|
452 |
+
"trial_params": null
|
453 |
+
}
|
checkpoint_dir/config.json
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "microsoft/phi-3-mini-4k-instruct",
|
3 |
+
"architectures": [
|
4 |
+
"Phi3ForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": false,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"auto_map": {
|
9 |
+
"AutoConfig": "microsoft/phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
|
10 |
+
"AutoModelForCausalLM": "microsoft/phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
|
11 |
+
},
|
12 |
+
"bos_token_id": 1,
|
13 |
+
"embd_pdrop": 0.0,
|
14 |
+
"eos_token_id": 32000,
|
15 |
+
"hidden_act": "silu",
|
16 |
+
"hidden_size": 3072,
|
17 |
+
"initializer_range": 0.02,
|
18 |
+
"intermediate_size": 8192,
|
19 |
+
"max_position_embeddings": 4096,
|
20 |
+
"model_type": "phi3",
|
21 |
+
"num_attention_heads": 32,
|
22 |
+
"num_hidden_layers": 32,
|
23 |
+
"num_key_value_heads": 32,
|
24 |
+
"original_max_position_embeddings": 4096,
|
25 |
+
"pad_token_id": 32000,
|
26 |
+
"quantization_config": {
|
27 |
+
"_load_in_4bit": true,
|
28 |
+
"_load_in_8bit": false,
|
29 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
30 |
+
"bnb_4bit_quant_storage": "uint8",
|
31 |
+
"bnb_4bit_quant_type": "nf4",
|
32 |
+
"bnb_4bit_use_double_quant": true,
|
33 |
+
"llm_int8_enable_fp32_cpu_offload": false,
|
34 |
+
"llm_int8_has_fp16_weight": false,
|
35 |
+
"llm_int8_skip_modules": null,
|
36 |
+
"llm_int8_threshold": 6.0,
|
37 |
+
"load_in_4bit": true,
|
38 |
+
"load_in_8bit": false,
|
39 |
+
"quant_method": "bitsandbytes"
|
40 |
+
},
|
41 |
+
"resid_pdrop": 0.0,
|
42 |
+
"rms_norm_eps": 1e-05,
|
43 |
+
"rope_scaling": null,
|
44 |
+
"rope_theta": 10000.0,
|
45 |
+
"sliding_window": 2047,
|
46 |
+
"tie_word_embeddings": false,
|
47 |
+
"torch_dtype": "bfloat16",
|
48 |
+
"transformers_version": "4.44.2",
|
49 |
+
"use_cache": false,
|
50 |
+
"vocab_size": 32064
|
51 |
+
}
|
checkpoint_dir/eval_results.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 0.6762849413886384,
|
3 |
+
"eval_loss": 7.323873043060303,
|
4 |
+
"eval_runtime": 386.842,
|
5 |
+
"eval_samples": 3154,
|
6 |
+
"eval_samples_per_second": 8.153,
|
7 |
+
"eval_steps_per_second": 0.512
|
8 |
+
}
|
checkpoint_dir/image_projector.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aecea00b5c6bae9364ea91f91668bdd520b9aa8596318d8fad8cbdc846442502
|
3 |
+
size 229740802
|
checkpoint_dir/train_results.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 0.6762849413886384,
|
3 |
+
"total_flos": 0.0,
|
4 |
+
"train_loss": 2.13714133199056,
|
5 |
+
"train_runtime": 37217.3738,
|
6 |
+
"train_samples_per_second": 2.579,
|
7 |
+
"train_steps_per_second": 0.161
|
8 |
+
}
|
checkpoint_dir/trainer_state.json
ADDED
@@ -0,0 +1,462 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": null,
|
3 |
+
"best_model_checkpoint": null,
|
4 |
+
"epoch": 0.6762849413886384,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 6000,
|
7 |
+
"is_hyper_param_search": false,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 0.01127141568981064,
|
13 |
+
"grad_norm": 36.142295837402344,
|
14 |
+
"learning_rate": 4.1666666666666667e-07,
|
15 |
+
"loss": 12.6438,
|
16 |
+
"step": 100
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 0.02254283137962128,
|
20 |
+
"grad_norm": 30.892282485961914,
|
21 |
+
"learning_rate": 8.333333333333333e-07,
|
22 |
+
"loss": 12.4413,
|
23 |
+
"step": 200
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"epoch": 0.03381424706943192,
|
27 |
+
"grad_norm": 45.94392395019531,
|
28 |
+
"learning_rate": 1.25e-06,
|
29 |
+
"loss": 11.4952,
|
30 |
+
"step": 300
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"epoch": 0.04508566275924256,
|
34 |
+
"grad_norm": 19.69997787475586,
|
35 |
+
"learning_rate": 1.6666666666666667e-06,
|
36 |
+
"loss": 7.0421,
|
37 |
+
"step": 400
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"epoch": 0.0563570784490532,
|
41 |
+
"grad_norm": 3.0256764888763428,
|
42 |
+
"learning_rate": 2.0833333333333334e-06,
|
43 |
+
"loss": 3.1071,
|
44 |
+
"step": 500
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"epoch": 0.06762849413886383,
|
48 |
+
"grad_norm": 1.9536222219467163,
|
49 |
+
"learning_rate": 2.5e-06,
|
50 |
+
"loss": 2.7468,
|
51 |
+
"step": 600
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"epoch": 0.07889990982867448,
|
55 |
+
"grad_norm": 3.0367178916931152,
|
56 |
+
"learning_rate": 2.916666666666667e-06,
|
57 |
+
"loss": 2.5461,
|
58 |
+
"step": 700
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"epoch": 0.09017132551848513,
|
62 |
+
"grad_norm": 1.1410305500030518,
|
63 |
+
"learning_rate": 3.3333333333333333e-06,
|
64 |
+
"loss": 2.3206,
|
65 |
+
"step": 800
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"epoch": 0.10144274120829576,
|
69 |
+
"grad_norm": 1.2108758687973022,
|
70 |
+
"learning_rate": 3.7500000000000005e-06,
|
71 |
+
"loss": 2.036,
|
72 |
+
"step": 900
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"epoch": 0.1127141568981064,
|
76 |
+
"grad_norm": 1.0124415159225464,
|
77 |
+
"learning_rate": 4.166666666666667e-06,
|
78 |
+
"loss": 1.9927,
|
79 |
+
"step": 1000
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"epoch": 0.12398557258791704,
|
83 |
+
"grad_norm": 1.2103397846221924,
|
84 |
+
"learning_rate": 4.583333333333333e-06,
|
85 |
+
"loss": 1.8255,
|
86 |
+
"step": 1100
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"epoch": 0.13525698827772767,
|
90 |
+
"grad_norm": 1.2905486822128296,
|
91 |
+
"learning_rate": 5e-06,
|
92 |
+
"loss": 1.6822,
|
93 |
+
"step": 1200
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"epoch": 0.14652840396753833,
|
97 |
+
"grad_norm": 0.9415493607521057,
|
98 |
+
"learning_rate": 4.994647308096509e-06,
|
99 |
+
"loss": 1.6783,
|
100 |
+
"step": 1300
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"epoch": 0.15779981965734896,
|
104 |
+
"grad_norm": 0.9576979279518127,
|
105 |
+
"learning_rate": 4.978612153434527e-06,
|
106 |
+
"loss": 1.5967,
|
107 |
+
"step": 1400
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"epoch": 0.1690712353471596,
|
111 |
+
"grad_norm": 1.068982481956482,
|
112 |
+
"learning_rate": 4.9519632010080765e-06,
|
113 |
+
"loss": 1.5821,
|
114 |
+
"step": 1500
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"epoch": 0.18034265103697025,
|
118 |
+
"grad_norm": 0.7473943829536438,
|
119 |
+
"learning_rate": 4.914814565722671e-06,
|
120 |
+
"loss": 1.5587,
|
121 |
+
"step": 1600
|
122 |
+
},
|
123 |
+
{
|
124 |
+
"epoch": 0.19161406672678089,
|
125 |
+
"grad_norm": 0.9740411043167114,
|
126 |
+
"learning_rate": 4.867325323737765e-06,
|
127 |
+
"loss": 1.525,
|
128 |
+
"step": 1700
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"epoch": 0.20288548241659152,
|
132 |
+
"grad_norm": 0.6997997164726257,
|
133 |
+
"learning_rate": 4.809698831278217e-06,
|
134 |
+
"loss": 1.49,
|
135 |
+
"step": 1800
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"epoch": 0.21415689810640218,
|
139 |
+
"grad_norm": 0.6754641532897949,
|
140 |
+
"learning_rate": 4.742181853831721e-06,
|
141 |
+
"loss": 1.4597,
|
142 |
+
"step": 1900
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"epoch": 0.2254283137962128,
|
146 |
+
"grad_norm": 1.0468783378601074,
|
147 |
+
"learning_rate": 4.665063509461098e-06,
|
148 |
+
"loss": 1.4539,
|
149 |
+
"step": 2000
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"epoch": 0.23669972948602344,
|
153 |
+
"grad_norm": 0.9208471179008484,
|
154 |
+
"learning_rate": 4.578674030756364e-06,
|
155 |
+
"loss": 1.4459,
|
156 |
+
"step": 2100
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"epoch": 0.24797114517583407,
|
160 |
+
"grad_norm": 0.9443785548210144,
|
161 |
+
"learning_rate": 4.4833833507280884e-06,
|
162 |
+
"loss": 1.4437,
|
163 |
+
"step": 2200
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"epoch": 0.25924256086564473,
|
167 |
+
"grad_norm": 1.0956292152404785,
|
168 |
+
"learning_rate": 4.379599518697444e-06,
|
169 |
+
"loss": 1.417,
|
170 |
+
"step": 2300
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"epoch": 0.27051397655545534,
|
174 |
+
"grad_norm": 0.8564383387565613,
|
175 |
+
"learning_rate": 4.267766952966369e-06,
|
176 |
+
"loss": 1.4138,
|
177 |
+
"step": 2400
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"epoch": 0.281785392245266,
|
181 |
+
"grad_norm": 0.8340147733688354,
|
182 |
+
"learning_rate": 4.1483645377501726e-06,
|
183 |
+
"loss": 1.3637,
|
184 |
+
"step": 2500
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"epoch": 0.29305680793507666,
|
188 |
+
"grad_norm": 1.1379494667053223,
|
189 |
+
"learning_rate": 4.021903572521802e-06,
|
190 |
+
"loss": 1.4079,
|
191 |
+
"step": 2600
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"epoch": 0.30432822362488726,
|
195 |
+
"grad_norm": 0.8148013353347778,
|
196 |
+
"learning_rate": 3.888925582549006e-06,
|
197 |
+
"loss": 1.3729,
|
198 |
+
"step": 2700
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"epoch": 0.3155996393146979,
|
202 |
+
"grad_norm": 0.9854215383529663,
|
203 |
+
"learning_rate": 3.7500000000000005e-06,
|
204 |
+
"loss": 1.3824,
|
205 |
+
"step": 2800
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"epoch": 0.3268710550045086,
|
209 |
+
"grad_norm": 0.9190597534179688,
|
210 |
+
"learning_rate": 3.6057217255475034e-06,
|
211 |
+
"loss": 1.3879,
|
212 |
+
"step": 2900
|
213 |
+
},
|
214 |
+
{
|
215 |
+
"epoch": 0.3381424706943192,
|
216 |
+
"grad_norm": 1.1362121105194092,
|
217 |
+
"learning_rate": 3.4567085809127247e-06,
|
218 |
+
"loss": 1.3939,
|
219 |
+
"step": 3000
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"epoch": 0.34941388638412985,
|
223 |
+
"grad_norm": 1.2068192958831787,
|
224 |
+
"learning_rate": 3.303598663257904e-06,
|
225 |
+
"loss": 1.3463,
|
226 |
+
"step": 3100
|
227 |
+
},
|
228 |
+
{
|
229 |
+
"epoch": 0.3606853020739405,
|
230 |
+
"grad_norm": 0.952643632888794,
|
231 |
+
"learning_rate": 3.147047612756302e-06,
|
232 |
+
"loss": 1.3741,
|
233 |
+
"step": 3200
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"epoch": 0.3719567177637511,
|
237 |
+
"grad_norm": 0.8026754260063171,
|
238 |
+
"learning_rate": 2.9877258050403214e-06,
|
239 |
+
"loss": 1.3704,
|
240 |
+
"step": 3300
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"epoch": 0.38322813345356177,
|
244 |
+
"grad_norm": 0.8540117144584656,
|
245 |
+
"learning_rate": 2.82631548055013e-06,
|
246 |
+
"loss": 1.3904,
|
247 |
+
"step": 3400
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"epoch": 0.39449954914337243,
|
251 |
+
"grad_norm": 0.9906865954399109,
|
252 |
+
"learning_rate": 2.663507823075358e-06,
|
253 |
+
"loss": 1.3523,
|
254 |
+
"step": 3500
|
255 |
+
},
|
256 |
+
{
|
257 |
+
"epoch": 0.40577096483318303,
|
258 |
+
"grad_norm": 0.8289706707000732,
|
259 |
+
"learning_rate": 2.5e-06,
|
260 |
+
"loss": 1.3519,
|
261 |
+
"step": 3600
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"epoch": 0.4170423805229937,
|
265 |
+
"grad_norm": 1.0827723741531372,
|
266 |
+
"learning_rate": 2.3364921769246423e-06,
|
267 |
+
"loss": 1.3475,
|
268 |
+
"step": 3700
|
269 |
+
},
|
270 |
+
{
|
271 |
+
"epoch": 0.42831379621280435,
|
272 |
+
"grad_norm": 1.220688819885254,
|
273 |
+
"learning_rate": 2.173684519449872e-06,
|
274 |
+
"loss": 1.3243,
|
275 |
+
"step": 3800
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"epoch": 0.43958521190261496,
|
279 |
+
"grad_norm": 1.0109795331954956,
|
280 |
+
"learning_rate": 2.01227419495968e-06,
|
281 |
+
"loss": 1.3444,
|
282 |
+
"step": 3900
|
283 |
+
},
|
284 |
+
{
|
285 |
+
"epoch": 0.4508566275924256,
|
286 |
+
"grad_norm": 1.041104793548584,
|
287 |
+
"learning_rate": 1.852952387243698e-06,
|
288 |
+
"loss": 1.3436,
|
289 |
+
"step": 4000
|
290 |
+
},
|
291 |
+
{
|
292 |
+
"epoch": 0.4621280432822362,
|
293 |
+
"grad_norm": 0.7376370429992676,
|
294 |
+
"learning_rate": 1.6964013367420967e-06,
|
295 |
+
"loss": 1.3002,
|
296 |
+
"step": 4100
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"epoch": 0.4733994589720469,
|
300 |
+
"grad_norm": 0.8842127919197083,
|
301 |
+
"learning_rate": 1.5432914190872757e-06,
|
302 |
+
"loss": 1.3454,
|
303 |
+
"step": 4200
|
304 |
+
},
|
305 |
+
{
|
306 |
+
"epoch": 0.48467087466185754,
|
307 |
+
"grad_norm": 1.0636272430419922,
|
308 |
+
"learning_rate": 1.3942782744524974e-06,
|
309 |
+
"loss": 1.3396,
|
310 |
+
"step": 4300
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"epoch": 0.49594229035166815,
|
314 |
+
"grad_norm": 1.2041317224502563,
|
315 |
+
"learning_rate": 1.2500000000000007e-06,
|
316 |
+
"loss": 1.335,
|
317 |
+
"step": 4400
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"epoch": 0.5072137060414789,
|
321 |
+
"grad_norm": 0.9379550218582153,
|
322 |
+
"learning_rate": 1.1110744174509952e-06,
|
323 |
+
"loss": 1.3213,
|
324 |
+
"step": 4500
|
325 |
+
},
|
326 |
+
{
|
327 |
+
"epoch": 0.5184851217312895,
|
328 |
+
"grad_norm": 0.7874147891998291,
|
329 |
+
"learning_rate": 9.780964274781984e-07,
|
330 |
+
"loss": 1.3198,
|
331 |
+
"step": 4600
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"epoch": 0.5297565374211001,
|
335 |
+
"grad_norm": 0.7258532047271729,
|
336 |
+
"learning_rate": 8.516354622498279e-07,
|
337 |
+
"loss": 1.2681,
|
338 |
+
"step": 4700
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"epoch": 0.5410279531109107,
|
342 |
+
"grad_norm": 1.1035155057907104,
|
343 |
+
"learning_rate": 7.322330470336314e-07,
|
344 |
+
"loss": 1.3219,
|
345 |
+
"step": 4800
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"epoch": 0.5522993688007214,
|
349 |
+
"grad_norm": 0.7815728187561035,
|
350 |
+
"learning_rate": 6.204004813025569e-07,
|
351 |
+
"loss": 1.3381,
|
352 |
+
"step": 4900
|
353 |
+
},
|
354 |
+
{
|
355 |
+
"epoch": 0.563570784490532,
|
356 |
+
"grad_norm": 1.0812482833862305,
|
357 |
+
"learning_rate": 5.166166492719124e-07,
|
358 |
+
"loss": 1.3322,
|
359 |
+
"step": 5000
|
360 |
+
},
|
361 |
+
{
|
362 |
+
"epoch": 0.5748422001803426,
|
363 |
+
"grad_norm": 0.9642081260681152,
|
364 |
+
"learning_rate": 4.2132596924363666e-07,
|
365 |
+
"loss": 1.3218,
|
366 |
+
"step": 5100
|
367 |
+
},
|
368 |
+
{
|
369 |
+
"epoch": 0.5861136158701533,
|
370 |
+
"grad_norm": 0.8039354085922241,
|
371 |
+
"learning_rate": 3.3493649053890325e-07,
|
372 |
+
"loss": 1.3216,
|
373 |
+
"step": 5200
|
374 |
+
},
|
375 |
+
{
|
376 |
+
"epoch": 0.5973850315599639,
|
377 |
+
"grad_norm": 0.8643052577972412,
|
378 |
+
"learning_rate": 2.5781814616827936e-07,
|
379 |
+
"loss": 1.3333,
|
380 |
+
"step": 5300
|
381 |
+
},
|
382 |
+
{
|
383 |
+
"epoch": 0.6086564472497745,
|
384 |
+
"grad_norm": 1.4110788106918335,
|
385 |
+
"learning_rate": 1.9030116872178317e-07,
|
386 |
+
"loss": 1.3412,
|
387 |
+
"step": 5400
|
388 |
+
},
|
389 |
+
{
|
390 |
+
"epoch": 0.6199278629395852,
|
391 |
+
"grad_norm": 0.7792391180992126,
|
392 |
+
"learning_rate": 1.3267467626223606e-07,
|
393 |
+
"loss": 1.3069,
|
394 |
+
"step": 5500
|
395 |
+
},
|
396 |
+
{
|
397 |
+
"epoch": 0.6311992786293958,
|
398 |
+
"grad_norm": 1.0475589036941528,
|
399 |
+
"learning_rate": 8.518543427732951e-08,
|
400 |
+
"loss": 1.3187,
|
401 |
+
"step": 5600
|
402 |
+
},
|
403 |
+
{
|
404 |
+
"epoch": 0.6424706943192064,
|
405 |
+
"grad_norm": 0.9902795553207397,
|
406 |
+
"learning_rate": 4.8036798991923925e-08,
|
407 |
+
"loss": 1.328,
|
408 |
+
"step": 5700
|
409 |
+
},
|
410 |
+
{
|
411 |
+
"epoch": 0.6537421100090172,
|
412 |
+
"grad_norm": 0.7977623343467712,
|
413 |
+
"learning_rate": 2.1387846565474047e-08,
|
414 |
+
"loss": 1.3175,
|
415 |
+
"step": 5800
|
416 |
+
},
|
417 |
+
{
|
418 |
+
"epoch": 0.6650135256988278,
|
419 |
+
"grad_norm": 0.872138261795044,
|
420 |
+
"learning_rate": 5.352691903491303e-09,
|
421 |
+
"loss": 1.3116,
|
422 |
+
"step": 5900
|
423 |
+
},
|
424 |
+
{
|
425 |
+
"epoch": 0.6762849413886384,
|
426 |
+
"grad_norm": 0.8640491366386414,
|
427 |
+
"learning_rate": 0.0,
|
428 |
+
"loss": 1.3081,
|
429 |
+
"step": 6000
|
430 |
+
},
|
431 |
+
{
|
432 |
+
"epoch": 0.6762849413886384,
|
433 |
+
"step": 6000,
|
434 |
+
"total_flos": 0.0,
|
435 |
+
"train_loss": 2.13714133199056,
|
436 |
+
"train_runtime": 37217.3738,
|
437 |
+
"train_samples_per_second": 2.579,
|
438 |
+
"train_steps_per_second": 0.161
|
439 |
+
}
|
440 |
+
],
|
441 |
+
"logging_steps": 100,
|
442 |
+
"max_steps": 6000,
|
443 |
+
"num_input_tokens_seen": 0,
|
444 |
+
"num_train_epochs": 1,
|
445 |
+
"save_steps": 100,
|
446 |
+
"stateful_callbacks": {
|
447 |
+
"TrainerControl": {
|
448 |
+
"args": {
|
449 |
+
"should_epoch_stop": false,
|
450 |
+
"should_evaluate": false,
|
451 |
+
"should_log": false,
|
452 |
+
"should_save": true,
|
453 |
+
"should_training_stop": true
|
454 |
+
},
|
455 |
+
"attributes": {}
|
456 |
+
}
|
457 |
+
},
|
458 |
+
"total_flos": 0.0,
|
459 |
+
"train_batch_size": 16,
|
460 |
+
"trial_name": null,
|
461 |
+
"trial_params": null
|
462 |
+
}
|
model.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch.nn as nn
|
2 |
+
|
3 |
+
class Projections(nn.Module):
    """Map fixed-size feature vectors into a wider embedding space.

    The input is first projected linearly from ``clip_embed`` to
    ``phi_embed`` features and layer-normalised once. It then passes through
    ``num_projection_layers`` residual MLP blocks, each computing
    ``Linear -> GELU -> Linear`` at width ``phi_embed`` and adding its own
    input back to the result.

    The attribute names ``output``, ``norm`` and ``projection_layers`` define
    the state-dict keys (e.g. ``projection_layers.0.0.weight``), so they must
    not be renamed if previously saved weights are to keep loading.

    Args:
        clip_embed: Size of the last dimension of the incoming features
            (``app.py`` constructs this module with 512).
        phi_embed: Size of the last dimension of the produced features
            (``app.py`` constructs this module with 3072).
        num_projection_layers: Number of residual MLP blocks applied after
            the initial projection and normalisation.
    """

    def __init__(self, clip_embed, phi_embed, num_projection_layers=6):
        super().__init__()

        # Submodules are created in the order output -> norm -> blocks so that
        # seeded parameter initialisation stays reproducible.
        self.output = nn.Linear(clip_embed, phi_embed)
        self.norm = nn.LayerNorm(phi_embed)
        self.projection_layers = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Linear(phi_embed, phi_embed),
                    nn.GELU(),
                    nn.Linear(phi_embed, phi_embed),
                )
                for _ in range(num_projection_layers)
            ]
        )

    def forward(self, x):
        """Project ``x`` and refine it through the residual blocks.

        Args:
            x: Tensor whose last dimension has size ``clip_embed``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``phi_embed``.
        """
        x = self.output(x)
        # Normalisation is applied only once, before the residual stack.
        x = self.norm(x)
        for layer in self.projection_layers:
            residual = x
            x = layer(x) + residual

        return x
|
requirements.txt
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bitsandbytes==0.43.3
|
2 |
+
clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
|
3 |
+
colorama==0.4.6
|
4 |
+
datasets==3.0.0
|
5 |
+
dill==0.3.8
|
6 |
+
multiprocess==0.70.16
|
7 |
+
numpy==1.26.4
|
8 |
+
pandas==2.2.2
|
9 |
+
peft==0.12.0
|
10 |
+
shtab==1.7.1
|
11 |
+
tokenizers==0.19.1
|
12 |
+
torch==2.4.1
|
13 |
+
torchvision==0.19.1
|
14 |
+
tqdm==4.66.5
|
15 |
+
transformers==4.44.2
|
16 |
+
treelib==1.7.0
|
17 |
+
trl==0.10.1
|
18 |
+
typing_extensions==4.12.2
|
19 |
+
tyro==0.8.10
|
20 |
+
tzdata==2024.1
|
21 |
+
urllib3==2.2.3
|
22 |
+
wcwidth==0.2.13
|
23 |
+
xxhash==3.5.0
|
24 |
+
yarl==1.11.1
|