In [1]:
import json
from pathlib import Path

params = {}
params_path = Path("/content/params.json")
if params_path.is_file():
    with params_path.open("r", encoding="UTF-8") as params_file:
        params = json.load(params_file)

params

{'hugging_face_hub_token': '' 'num_train_epochs': 1,
 'prompt_template': '## Instruction\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.\n\nOnly use the API reference to understand the syntax of the request.\n\n## Natural Language Query\n{nlcommand}\n\n## Schema\n{schema}\n\n## API reference\n{apiRef}\n\n## Answer\n{output}\n',
 'push_to_hub': 'substratusai/weaviate-gorilla-v4-schema-split',
 'save_steps': 5}

In [2]:
import transformers
from datasets import load_dataset
import torch

import sys
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = "/content/saved-model/"
trained_model_path = "/content/model"

tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Prompt before fine tuning

In [3]:
import os 

data = load_dataset("json", data_files="/content/data/*.json*")
data

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'nlcommand', 'apiRef', 'apiRefPath', 'schema', 'schemaPath'],
        num_rows: 1892
    })
})

In [4]:
default_prompt = """
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{prompt}
### Response:
{completion}
"""

prompt = params.get("prompt_template", default_prompt)
prompt.format_map(data["train"][0])

'## Instruction\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.\n\nOnly use the API reference to understand the syntax of the request.\n\n## Natural Language Query\n```text\nGet me the top 10 historical events related to \'World War II\', and show the event name, description, year, significant impact, and the names and populations of the involved countries.\n```\n\n## Schema\n{\n"classes": [\n{\n"class": "HistoricalEvent",\n"description": "Information about historical events",\n"vectorIndexType": "hnsw",\n"vectorizer": "text2vec-transformers",\n"properties": [\n{\n"name": "eventName",\n"dataType": ["text"],\n"description": "Name of the historical event"\n},\n{\n"name": "description",\n"dataType": ["text"],\n"description": "Detailed description of the event"\n},\n{\n"name": "year",\n"dataType": ["int"],\n"description": "Year the event occurred"\n},\n{\n"na

In [5]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

print(data)
data = data.map(lambda x: tokenizer(
    prompt.format_map(x), padding='max_length', truncation=True))

print("After tokenizing:", data)

Using pad_token, but it is not set yet.
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'nlcommand', 'apiRef', 'apiRefPath', 'schema', 'schemaPath'],
        num_rows: 1892
    })
})


Map:   0%|          | 0/1892 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


After tokenizing: DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'nlcommand', 'apiRef', 'apiRefPath', 'schema', 'schemaPath', 'input_ids', 'attention_mask'],
        num_rows: 1892
    })
})


In [6]:
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training

lora_config2 = LoraConfig(
 r=16,
 lora_alpha=32,
 # target modules should be unset so it can detect target_modules automatically
 # target_modules=["query_key_value"],
 lora_dropout=0.05,
 bias="none",
 task_type="CAUSAL_LM"
)

model = prepare_model_for_kbit_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config2)
model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 6,746,812,416 || trainable%: 0.12433438908285782


In [7]:
from utils import parse_training_args

training_args = parse_training_args(params)
training_args

TrainingArguments(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=4,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_

In [8]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

checkpoint_path = Path("/content/model/checkpoints")

# Only set resume_from_checkpoint True when directory exists and contains files
resume_from_checkpoint = checkpoint_path.is_dir() and any(checkpoint_path.iterdir())
if resume_from_checkpoint:
    print("Resuming from checkpoint:", list(checkpoint_path.rglob("")))
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

trainer.save_model(trained_model_path)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,1.2284
2,1.195
3,1.2358
4,1.1955
5,1.1818
6,1.1197
7,1.0286
8,0.9282
9,0.9435
10,0.938


In [9]:
! nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sun Sep  3 19:25:17 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA L4           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    32W /  72W |  17390MiB / 23034MiB |      0%      Default |
|                               |   

In [10]:
! ls -lash {trained_model_path}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 13G
 512 -rw-r--r-- 1 root 3003   93 Sep  3 19:25 README.md
 512 -rw-r--r-- 1 root 3003  444 Sep  3 19:25 adapter_config.json
 33M -rw-r--r-- 1 root 3003  33M Sep  3 19:25 adapter_model.bin
   0 drwxr-xr-x 1 root 3003    0 Sep  3 18:49 checkpoints
1.0K -rw-r--r-- 1 root 3003  631 Sep  3 08:34 config.json
9.3G -rw-r--r-- 1 root 3003 9.3G Sep  3 08:34 pytorch_model-00001-of-00002.bin
3.3G -rw-r--r-- 1 root 3003 3.3G Sep  3 08:37 pytorch_model-00002-of-00002.bin
 24K -rw-r--r-- 1 root 3003  24K Sep  3 08:38 pytorch_model.bin.index.json
 512 -rw-r--r-- 1 root 3003  438 Sep  3 08:38 special_tokens_map.json
1.8M -rw-r--r-- 1 root 3003 1.8M Sep  3 08:38 tokenizer.json
1.0K -rw-r--r-- 1 root 3003  7

In [11]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32001, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear(
     

In [12]:
model = model.merge_and_unload()

In [13]:
model.save_pretrained(trained_model_path)
tokenizer.save_pretrained(trained_model_path)


Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.


('/content/model/tokenizer_config.json',
 '/content/model/special_tokens_map.json',
 '/content/model/tokenizer.json')

In [14]:
! ls -lash {trained_model_path}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 26G
 512 -rw-r--r-- 1 root 3003   93 Sep  3 19:25 README.md
 512 -rw-r--r-- 1 root 3003  444 Sep  3 19:25 adapter_config.json
 33M -rw-r--r-- 1 root 3003  33M Sep  3 19:25 adapter_model.bin
   0 drwxr-xr-x 1 root 3003    0 Sep  3 18:49 checkpoints
1.0K -rw-r--r-- 1 root 3003  632 Sep  3 19:25 config.json
9.2G -rw-r--r-- 1 root 3003 9.2G Sep  3 19:25 pytorch_model-00001-of-00003.bin
9.3G -rw-r--r-- 1 root 3003 9.3G Sep  3 19:28 pytorch_model-00002-of-00003.bin
6.7G -rw-r--r-- 1 root 3003 6.7G Sep  3 19:30 pytorch_model-00003-of-00003.bin
 24K -rw-r--r-- 1 root 3003  24K Sep  3 19:32 pytorch_model.bin.index.json
 512 -rw-r--r-- 1 root 3003  438 Sep  3 19:32 special_tokens_map.json
1.8M -rw-r--

In [15]:
if params.get("push_to_hub"):
    model.push_to_hub(params["push_to_hub"])
    tokenizer.push_to_hub(params["push_to_hub"])


Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
