In [1]:
# #@title 🤗 AutoTrain LLM
# #@markdown In order to use this colab
# #@markdown - upload train.csv to a folder named `data/`
# #@markdown - train.csv must contain a `text` column
# #@markdown - choose a project name if you wish
# #@markdown - change model if you wish, you can use most of the text-generation models from Hugging Face Hub
# #@markdown - add huggingface information (token and repo_id) if you wish to push trained model to huggingface hub
# #@markdown - update hyperparameters if you wish
# #@markdown - click `Runtime > Run all` or run each cell individually

import os
# !pip install -U autotrain-advanced > install_logs.txt
# !autotrain setup > setup_logs.txt

In [2]:
# !pip install transformers --upgrade
# !pip install torch --upgrade
# !pip install tokenizers --upgrade

In [3]:
#@markdown ---
#@markdown #### Project Config
#@markdown Note: if you are using a restricted/private model, you need to enter your Hugging Face token in the next step.
project_name = 'my_autotrain_llm_sys_temp_meta_llama_chat' # @param {type:"string"}
model_name =    "meta-llama/Llama-2-7b-chat-hf"         # 'abhishek/llama-2-7b-hf-small-shards' # @param {type:"string"}

#@markdown ---
#@markdown #### Push to Hub?
#@markdown Use these only if you want to push your trained model to a private repo in your Hugging Face Account
#@markdown If you don't use these, the model will be saved in Google Colab and you are required to download it manually.
#@markdown Please provide your Hugging Face write token through the `HF_TOKEN` environment variable (you will be prompted for it if it is missing and push_to_hub is True).
#@markdown You can find your token here: https://huggingface.co/settings/tokens
push_to_hub = False # @param ["False", "True"] {type:"raw"}
# SECURITY: never store an access token in the notebook source. The token that
# used to be hard-coded here must be considered leaked and should be revoked.
# The token is now taken from the environment; an interactive prompt is used
# only when it is actually required (pushing to the Hub) and not yet available.
hf_token = os.environ.get("HF_TOKEN", "")
if push_to_hub and not hf_token:
    from getpass import getpass
    hf_token = getpass("Hugging Face write token: ")
repo_id = "hemantk089/llm_fine_tuning" #@param {type:"string"}

#@markdown ---
#@markdown #### Hyperparameters
learning_rate = 2e-4 # @param {type:"number"}
num_epochs = 200 #@param {type:"number"}
batch_size = 1 # @param {type:"slider", min:1, max:32, step:1}
block_size = 1024 # @param {type:"number"}
trainer = "sft" # @param ["default", "sft"] {type:"raw"}
warmup_ratio = 0.1 # @param {type:"number"}
weight_decay = 0.01 # @param {type:"number"}
gradient_accumulation = 4 # @param {type:"number"}
use_fp16 = True # @param ["False", "True"] {type:"raw"}
use_peft = True # @param ["False", "True"] {type:"raw"}
use_int4 = True # @param ["False", "True"] {type:"raw"}
lora_r = 16 #@param {type:"number"}
lora_alpha = 32 #@param {type:"number"}
lora_dropout = 0.05 #@param {type:"number"}

# Export every setting as an environment variable so that shell commands
# started from the notebook can read them. Environment values must be strings,
# hence the str() conversion.
_exported_settings = {
    "PROJECT_NAME": project_name,
    "MODEL_NAME": model_name,
    "PUSH_TO_HUB": push_to_hub,
    "REPO_ID": repo_id,
    "LEARNING_RATE": learning_rate,
    "NUM_EPOCHS": num_epochs,
    "BATCH_SIZE": batch_size,
    "BLOCK_SIZE": block_size,
    # `trainer` was previously configured above but never exported.
    "TRAINER": trainer,
    "WARMUP_RATIO": warmup_ratio,
    "WEIGHT_DECAY": weight_decay,
    "GRADIENT_ACCUMULATION": gradient_accumulation,
    "USE_FP16": use_fp16,
    "USE_PEFT": use_peft,
    "USE_INT4": use_int4,
    "LORA_R": lora_r,
    "LORA_ALPHA": lora_alpha,
    "LORA_DROPOUT": lora_dropout,
}
os.environ.update({key: str(value) for key, value in _exported_settings.items()})
# Only (re-)export the token when one is available, so an empty string never
# shadows credentials configured elsewhere.
if hf_token:
    os.environ["HF_TOKEN"] = hf_token



In [4]:
%%time
!autotrain llm \
--train \
--model ${MODEL_NAME} \
--project-name ${PROJECT_NAME} \
--data-path data/ \
--text-column text \
--lr ${LEARNING_RATE} \
--batch-size ${BATCH_SIZE} \
--epochs ${NUM_EPOCHS} \
--block-size ${BLOCK_SIZE} \
--warmup-ratio ${WARMUP_RATIO} \
--lora-r ${LORA_R} \
--lora-alpha ${LORA_ALPHA} \
--lora-dropout ${LORA_DROPOUT} \
--weight-decay ${WEIGHT_DECAY} \
--gradient-accumulation ${GRADIENT_ACCUMULATION} \
$( [[ "$USE_FP16" == "True" ]] && echo "--fp16" ) \
$( [[ "$USE_PEFT" == "True" ]] && echo "--use-peft" ) \
$( [[ "$USE_INT4" == "True" ]] && echo "--use-int4" ) \
$( [[ "$PUSH_TO_HUB" == "True" ]] && echo "--push-to-hub --token ${HF_TOKEN} --repo-id ${REPO_ID}" )

[2023-10-04 14:41:59,153] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
> [1mINFO    Running LLM[0m
> [1mINFO    Params: Namespace(add_eos_token=False, auto_find_batch_size=False, backend='default', block_size=1024, data_path='data/', deploy=False, evaluation_strategy='epoch', fp16=True, func=<function run_llm_command_factory at 0x1468523be1f0>, gradient_accumulation_steps=4, inference=False, learning_rate=0.0002, logging_steps=-1, lora_alpha=32, lora_dropout=0.05, lora_r=16, max_grad_norm=1.0, merge_adapter=False, model='meta-llama/Llama-2-7b-chat-hf', model_max_length=1024, num_train_epochs=200, optimizer='adamw_torch', project_name='my_autotrain_llm_sys_temp_meta_llama_chat', push_to_hub=False, repo_id=None, save_strategy='epoch', save_total_limit=1, scheduler='linear', seed=42, target_modules=None, text_column='text', token=None, train=True, train_batch_size=1, train_split='train', trainer='default', use_flash_attention_2=False, us

{'loss': 0.0051, 'learning_rate': 3e-05, 'epoch': 174.0}                        
{'loss': 0.0051, 'learning_rate': 2.8888888888888888e-05, 'epoch': 175.0}       
{'loss': 0.005, 'learning_rate': 2.777777777777778e-05, 'epoch': 176.0}         
{'loss': 0.0052, 'learning_rate': 2.6666666666666667e-05, 'epoch': 177.0}       
{'loss': 0.005, 'learning_rate': 2.5555555555555554e-05, 'epoch': 178.0}        
{'loss': 0.005, 'learning_rate': 2.4444444444444445e-05, 'epoch': 179.0}        
{'loss': 0.005, 'learning_rate': 2.3333333333333336e-05, 'epoch': 180.0}        
{'loss': 0.005, 'learning_rate': 2.2222222222222223e-05, 'epoch': 181.0}        
{'loss': 0.005, 'learning_rate': 2.111111111111111e-05, 'epoch': 182.0}         
{'loss': 0.005, 'learning_rate': 2e-05, 'epoch': 183.0}                         
{'loss': 0.0049, 'learning_rate': 1.888888888888889e-05, 'epoch': 184.0}        
{'loss': 0.0049, 'learning_rate': 1.777777777777778e-05, 'epoch': 185.0}        
{'loss': 0.005, 'learning_ra

In [5]:
# !mkdir data

In [6]:
# cd data

In [7]:
import pandas as pd
# Load the training CSV from the notebook-relative data/ folder into `df`.
df = pd.read_csv('./data/train.csv')

In [8]:
# Quick sanity check: print (rows, columns), then display the first five rows.
print(df.shape)
df.head()

(90, 3)


Unnamed: 0,Description,Relevances,text
0,Give the processor information,vendor names of processors,<s>[INST] <<SYS>> Write the BigFixRelevance fo...
1,find the speed of processor,speeds of processors,<s>[INST] <<SYS>> Write the BigFixRelevance fo...
2,find unique processor names,unique values of vendor names of processors,<s>[INST] <<SYS>> Write the BigFixRelevance fo...
3,find name of operating system,name of operating system,<s>[INST] <<SYS>> Write the BigFixRelevance fo...
4,find family names of processor,family names of processors,<s>[INST] <<SYS>> Write the BigFixRelevance fo...


In [14]:
from dataclasses import dataclass
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig


@dataclass
class TextGenerationInference:
    """Load a causal language model and generate text with it.

    The dataclass fields configure model loading and decoding. The model,
    tokenizer and a ``GenerationConfig`` built from the fields are created in
    ``__post_init__``, so constructing an instance loads the weights.

    ``chat`` applies the stored generation configuration as-is. With the
    default ``num_return_sequences=1`` it returns a single-element list; pass
    e.g. ``num_beams=5, num_return_sequences=5`` to the constructor to get
    five candidates per prompt.
    """

    # Directory or Hub id handed to ``from_pretrained`` for model and tokenizer.
    model_path: str = "my_autotrain_llm_sys_temp_meta_llama_chat"
    # Forwarded as ``load_in_4bit`` / ``load_in_8bit`` when loading the model.
    use_int4: Optional[bool] = False
    use_int8: Optional[bool] = False
    # Decoding parameters copied into the GenerationConfig.
    temperature: Optional[float] = 0.6
    top_k: Optional[int] = 50
    top_p: Optional[float] = 0.95
    repetition_penalty: Optional[float] = 1.0
    num_return_sequences: Optional[int] = 1
    num_beams: Optional[int] = 5
    max_new_tokens: Optional[int] = 1024
    do_sample: Optional[bool] = True
    # Folder passed as ``offload_folder`` to ``from_pretrained``. Appended as
    # the last field with the previously hard-coded value as its default, so
    # existing positional/keyword construction keeps working unchanged.
    offload_folder: Optional[str] = "/azusers/work/Hemant/data"   # "./data"

    def __post_init__(self):
        """Load model and tokenizer and build the generation configuration."""
        # device_map="auto" delegates weight placement to the loader; the
        # model is therefore never moved with .to() afterwards.
        # NOTE(review): trust_remote_code=True executes code shipped with the
        # checkpoint -- only load checkpoints you trust.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            load_in_4bit=self.use_int4,
            load_in_8bit=self.use_int8,
            torch_dtype=torch.float16,
            trust_remote_code=True,
            device_map="auto",
            offload_folder=self.offload_folder,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
        self.model.eval()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Fall back to the EOS id for padding when the tokenizer defines no
        # pad token, so that generation has an explicit pad id.
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.tokenizer.eos_token_id

        # Only max_new_tokens bounds the output length. The former
        # max_length=self.max_new_tokens was dropped: max_length is a separate
        # limit and setting both is contradictory.
        self.generation_config = GenerationConfig(
            temperature=self.temperature,
            top_k=self.top_k,
            top_p=self.top_p,
            repetition_penalty=self.repetition_penalty,
            num_return_sequences=self.num_return_sequences,
            num_beams=self.num_beams,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=pad_token_id,
            do_sample=self.do_sample,
            max_new_tokens=self.max_new_tokens,
        )

    def chat(self, prompt):
        """Generate completions for ``prompt``.

        Args:
            prompt: Text to feed to the model.

        Returns:
            A list with one decoded string (special tokens stripped) per
            sequence returned by ``generate``.
        """
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)

        # Hand over the whole configuration instead of cherry-picking two
        # values, so every configured decoding parameter takes effect and the
        # stored configuration is not mutated by a call.
        outputs = self.model.generate(**inputs, generation_config=self.generation_config)

        return [self.tokenizer.decode(output, skip_special_tokens=True) for output in outputs]


    

In [None]:
# %%time
# inference = TextGenerationInference()  # Create an instance with default settings

# prompt = """
# Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# Continue the story based on the given starting sentence.

# ### Input:
# Once upon a time,
# """

# response = inference.chat(prompt)
# print(response)

In [16]:
import torch
# Ask PyTorch to release cached GPU memory that it is no longer using.
torch.cuda.empty_cache()

In [15]:
%%time
inference = TextGenerationInference()  # Create an instance with default settings

# Prompt 1
# prompt = """
# Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# Write the relevance for the given input description.

# ### Input:
# Find out about the specified mapped drive
# """

# Prompt 2
# prompt = """
# ### Instruction:
# Write the relevance for the given input description.

# ### Input:
# Find out about the specified mapped drive.
# """

# Prompt 3
prompt = """
<s>[INST] <<SYS>> Write the BigFixRelevance for the following description: Give the processor information <</SYS>> [/INST]
"""

response = inference.chat(prompt)
print(response)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You shouldn't move a model when it is dispatched on multiple devices.


RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.

In [None]:
# Resolve the notebook-relative data folder to an absolute path and show it.
# Relies on `os` having been imported in an earlier cell.
relative_path = "./data"
absolute_path = os.path.abspath(relative_path)
print(absolute_path)